You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by yu...@apache.org on 2023/02/20 11:32:26 UTC
[spark] branch branch-3.3 updated: [SPARK-41741][SQL] Encode the string using the UTF_8 charset in ParquetFilters
This is an automated email from the ASF dual-hosted git repository.
yumwang pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.3 by this push:
new 8a490eaf2c4 [SPARK-41741][SQL] Encode the string using the UTF_8 charset in ParquetFilters
8a490eaf2c4 is described below
commit 8a490eaf2c48de413924560c869ab53a5de6e303
Author: Yuming Wang <yu...@ebay.com>
AuthorDate: Mon Feb 20 19:15:30 2023 +0800
[SPARK-41741][SQL] Encode the string using the UTF_8 charset in ParquetFilters
This PR makes `ParquetFilters` encode the string with the `UTF_8` charset (`v.getBytes(UTF_8)`) instead of the default charset (`v.getBytes`).
This fixes a data issue that occurs when the default charset is not `UTF_8`.
No user-facing change.
Tested manually.
Closes #40090 from wangyum/SPARK-41741.
Authored-by: Yuming Wang <yu...@ebay.com>
Signed-off-by: Yuming Wang <yu...@ebay.com>
(cherry picked from commit d5fa41efe2b1aa0aa41f558c1bef048b4632cf5c)
Signed-off-by: Yuming Wang <yu...@ebay.com>
---
.../spark/sql/execution/datasources/parquet/ParquetFilters.scala | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala
index e04019fa9a0..210f37d473a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.parquet
import java.lang.{Boolean => JBoolean, Double => JDouble, Float => JFloat, Long => JLong}
import java.math.{BigDecimal => JBigDecimal}
+import java.nio.charset.StandardCharsets.UTF_8
import java.sql.{Date, Timestamp}
import java.time.{Duration, Instant, LocalDate, Period}
import java.util.Locale
@@ -767,7 +768,7 @@ class ParquetFilters(
Option(prefix).map { v =>
FilterApi.userDefined(binaryColumn(nameToParquetField(name).fieldNames),
new UserDefinedPredicate[Binary] with Serializable {
- private val strToBinary = Binary.fromReusedByteArray(v.getBytes)
+ private val strToBinary = Binary.fromReusedByteArray(v.getBytes(UTF_8))
private val size = strToBinary.length
override def canDrop(statistics: Statistics[Binary]): Boolean = {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org