You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2023/06/08 08:48:57 UTC

[spark] branch master updated: [SPARK-43273][SQL] Support `lz4raw` compression codec for Parquet

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 4c7eff26962 [SPARK-43273][SQL] Support `lz4raw` compression codec for Parquet
4c7eff26962 is described below

commit 4c7eff269622fccd1ffdf5b7246d7bcdba2b8e96
Author: Yuming Wang <yu...@ebay.com>
AuthorDate: Thu Jun 8 01:48:39 2023 -0700

    [SPARK-43273][SQL] Support `lz4raw` compression codec for Parquet
    
    ### What changes were proposed in this pull request?
    
    Parquet 1.13.0 supports `LZ4_RAW` codec. Please see https://issues.apache.org/jira/browse/PARQUET-2196.
    
    This PR adds `lz4raw` to the supported list of `spark.sql.parquet.compression.codec`.
    
    ### Why are the changes needed?
    
    Support writing Parquet files with `lz4raw` compression codec.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Unit test and manual testing:
    ```scala
    spark.sql("set spark.sql.parquet.compression.codec=lz4raw")
    spark.range(10).write.parquet("/tmp/spark/lz4raw")
    spark.read.parquet("/tmp/spark/lz4raw").show(false)
    ```
    
    ```
    yumwang@LM-SHC-16508156 lz4raw % ll /tmp/spark/lz4raw
    total 16
    -rw-r--r-- 1 yumwang  wheel    0 Jun  8 12:10 _SUCCESS
    -rw-r--r-- 1 yumwang  wheel  487 Jun  8 12:10 part-00000-c6786f4d-b5a6-406d-96a1-37bf0ceeeac7-c000.lz4raw.parquet
    -rw-r--r-- 1 yumwang  wheel  489 Jun  8 12:10 part-00001-c6786f4d-b5a6-406d-96a1-37bf0ceeeac7-c000.lz4raw.parquet
    ```
    
    Closes #41507 from wangyum/SPARK-43273.
    
    Lead-authored-by: Yuming Wang <yu...@ebay.com>
    Co-authored-by: Yuming Wang <yu...@apache.org>
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
 .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala       | 5 +++--
 .../spark/sql/execution/datasources/parquet/ParquetOptions.scala     | 3 ++-
 .../spark/sql/execution/datasources/FileSourceCodecSuite.scala       | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 8d1e73cb86f..47b8474953b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -984,11 +984,12 @@ object SQLConf {
       "`parquet.compression` is specified in the table-specific options/properties, the " +
       "precedence would be `compression`, `parquet.compression`, " +
       "`spark.sql.parquet.compression.codec`. Acceptable values include: none, uncompressed, " +
-      "snappy, gzip, lzo, brotli, lz4, zstd.")
+      "snappy, gzip, lzo, brotli, lz4, lz4raw, zstd.")
     .version("1.1.1")
     .stringConf
     .transform(_.toLowerCase(Locale.ROOT))
-    .checkValues(Set("none", "uncompressed", "snappy", "gzip", "lzo", "lz4", "brotli", "zstd"))
+    .checkValues(
+      Set("none", "uncompressed", "snappy", "gzip", "lzo", "brotli", "lz4", "lz4raw", "zstd"))
     .createWithDefault("snappy")
 
   val PARQUET_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.parquet.filterPushdown")
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala
index d20edbde00b..023d2460959 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala
@@ -94,8 +94,9 @@ object ParquetOptions extends DataSourceOptions {
     "snappy" -> CompressionCodecName.SNAPPY,
     "gzip" -> CompressionCodecName.GZIP,
     "lzo" -> CompressionCodecName.LZO,
-    "lz4" -> CompressionCodecName.LZ4,
     "brotli" -> CompressionCodecName.BROTLI,
+    "lz4" -> CompressionCodecName.LZ4,
+    "lz4raw" -> CompressionCodecName.LZ4_RAW,
     "zstd" -> CompressionCodecName.ZSTD)
 
   def getParquetCompressionCodecName(name: String): String = {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceCodecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceCodecSuite.scala
index 8d0bae51521..09a348cd294 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceCodecSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceCodecSuite.scala
@@ -59,7 +59,7 @@ class ParquetCodecSuite extends FileSourceCodecSuite {
   // Exclude "brotli" because the com.github.rdblue:brotli-codec dependency is not available
   // on Maven Central.
   override protected def availableCodecs: Seq[String] = {
-    Seq("none", "uncompressed", "snappy", "gzip", "zstd", "lz4")
+    Seq("none", "uncompressed", "snappy", "gzip", "zstd", "lz4", "lz4raw")
   }
 }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org