You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2023/06/08 08:48:57 UTC
[spark] branch master updated: [SPARK-43273][SQL] Support `lz4raw` compression codec for Parquet
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 4c7eff26962 [SPARK-43273][SQL] Support `lz4raw` compression codec for Parquet
4c7eff26962 is described below
commit 4c7eff269622fccd1ffdf5b7246d7bcdba2b8e96
Author: Yuming Wang <yu...@ebay.com>
AuthorDate: Thu Jun 8 01:48:39 2023 -0700
[SPARK-43273][SQL] Support `lz4raw` compression codec for Parquet
### What changes were proposed in this pull request?
Parquet 1.13.0 supports `LZ4_RAW` codec. Please see https://issues.apache.org/jira/browse/PARQUET-2196.
This PR adds `lz4raw` to the supported list of `spark.sql.parquet.compression.codec`.
### Why are the changes needed?
Support writing Parquet files with `lz4raw` compression codec.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Unit test and manual testing:
```scala
spark.sql("set spark.sql.parquet.compression.codec=lz4raw")
spark.range(10).write.parquet("/tmp/spark/lz4raw")
spark.read.parquet("/tmp/spark/lz4raw").show(false)
```
```
yumwang@LM-SHC-16508156 lz4raw % ll /tmp/spark/lz4raw
total 16
-rw-r--r-- 1 yumwang wheel 0 Jun 8 12:10 _SUCCESS
-rw-r--r-- 1 yumwang wheel 487 Jun 8 12:10 part-00000-c6786f4d-b5a6-406d-96a1-37bf0ceeeac7-c000.lz4raw.parquet
-rw-r--r-- 1 yumwang wheel 489 Jun 8 12:10 part-00001-c6786f4d-b5a6-406d-96a1-37bf0ceeeac7-c000.lz4raw.parquet
```
Closes #41507 from wangyum/SPARK-43273.
Lead-authored-by: Yuming Wang <yu...@ebay.com>
Co-authored-by: Yuming Wang <yu...@apache.org>
Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
.../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 5 +++--
.../spark/sql/execution/datasources/parquet/ParquetOptions.scala | 3 ++-
.../spark/sql/execution/datasources/FileSourceCodecSuite.scala | 2 +-
3 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 8d1e73cb86f..47b8474953b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -984,11 +984,12 @@ object SQLConf {
"`parquet.compression` is specified in the table-specific options/properties, the " +
"precedence would be `compression`, `parquet.compression`, " +
"`spark.sql.parquet.compression.codec`. Acceptable values include: none, uncompressed, " +
- "snappy, gzip, lzo, brotli, lz4, zstd.")
+ "snappy, gzip, lzo, brotli, lz4, lz4raw, zstd.")
.version("1.1.1")
.stringConf
.transform(_.toLowerCase(Locale.ROOT))
- .checkValues(Set("none", "uncompressed", "snappy", "gzip", "lzo", "lz4", "brotli", "zstd"))
+ .checkValues(
+ Set("none", "uncompressed", "snappy", "gzip", "lzo", "brotli", "lz4", "lz4raw", "zstd"))
.createWithDefault("snappy")
val PARQUET_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.parquet.filterPushdown")
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala
index d20edbde00b..023d2460959 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala
@@ -94,8 +94,9 @@ object ParquetOptions extends DataSourceOptions {
"snappy" -> CompressionCodecName.SNAPPY,
"gzip" -> CompressionCodecName.GZIP,
"lzo" -> CompressionCodecName.LZO,
- "lz4" -> CompressionCodecName.LZ4,
"brotli" -> CompressionCodecName.BROTLI,
+ "lz4" -> CompressionCodecName.LZ4,
+ "lz4raw" -> CompressionCodecName.LZ4_RAW,
"zstd" -> CompressionCodecName.ZSTD)
def getParquetCompressionCodecName(name: String): String = {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceCodecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceCodecSuite.scala
index 8d0bae51521..09a348cd294 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceCodecSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceCodecSuite.scala
@@ -59,7 +59,7 @@ class ParquetCodecSuite extends FileSourceCodecSuite {
// Exclude "brotli" because the com.github.rdblue:brotli-codec dependency is not available
// on Maven Central.
override protected def availableCodecs: Seq[String] = {
- Seq("none", "uncompressed", "snappy", "gzip", "zstd", "lz4")
+ Seq("none", "uncompressed", "snappy", "gzip", "zstd", "lz4", "lz4raw")
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org