You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ya...@apache.org on 2023/06/13 03:44:30 UTC
[spark] branch branch-3.4 updated: [SPARK-32559][SQL] Fix the trim logic did't handle ASCII control characters correctly
This is an automated email from the ASF dual-hosted git repository.
yao pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push:
new e0103653cd3 [SPARK-32559][SQL] Fix the trim logic did't handle ASCII control characters correctly
e0103653cd3 is described below
commit e0103653cd3f130a917ba1d972bd03dfb8c3d2d8
Author: wangjunbo <wa...@qiyi.com>
AuthorDate: Tue Jun 13 11:43:54 2023 +0800
[SPARK-32559][SQL] Fix the trim logic did't handle ASCII control characters correctly
### What changes were proposed in this pull request?
The trim logic in Cast expression introduced in https://github.com/apache/spark/pull/29375 trim ASCII control characters unexpectly.
Before this patch
![image](https://github.com/apache/spark/assets/25627922/ca6a2fb1-2143-4264-84d1-70b6bb755ec7)
And hive
![image](https://github.com/apache/spark/assets/25627922/017aaa4a-133e-4396-9694-79f03f027bbe)
### Why are the changes needed?
The behavior described above doesn't consistent with the behavior of Hive
### Does this PR introduce _any_ user-facing change?
Yes
### How was this patch tested?
add ut
Closes #41535 from Kwafoor/trim_bugfix.
Lead-authored-by: wangjunbo <wa...@qiyi.com>
Co-authored-by: Junbo wang <10...@qq.com>
Signed-off-by: Kent Yao <ya...@apache.org>
(cherry picked from commit 80588e4ddfac1d2e2fdf4f9a7783c56be6a97cdd)
Signed-off-by: Kent Yao <ya...@apache.org>
---
.../org/apache/spark/unsafe/types/UTF8String.java | 20 ++++++++++++++------
.../apache/spark/unsafe/types/UTF8StringSuite.java | 4 ++++
.../spark/sql/catalyst/util/DateTimeUtilsSuite.scala | 4 ++++
3 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 6a3e9d313b5..891806e2c70 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -499,6 +499,14 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
return UTF8String.fromBytes(newBytes);
}
+ /**
+ * Determines if the specified character (Unicode code point) is white space or an ISO control
+ * character according to Java.
+ */
+ private boolean isWhitespaceOrISOControl(int codePoint) {
+ return Character.isWhitespace(codePoint) || Character.isISOControl(codePoint);
+ }
+
/**
* Trims space characters (ASCII 32) from both ends of this string.
*
@@ -535,14 +543,14 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
public UTF8String trimAll() {
int s = 0;
// skip all of the whitespaces in the left side
- while (s < this.numBytes && Character.isWhitespace(getByte(s))) s++;
+ while (s < this.numBytes && isWhitespaceOrISOControl(getByte(s))) s++;
if (s == this.numBytes) {
// Everything trimmed
return EMPTY_UTF8;
}
// skip all of the whitespaces in the right side
int e = this.numBytes - 1;
- while (e > s && Character.isWhitespace(getByte(e))) e--;
+ while (e > s && isWhitespaceOrISOControl(getByte(e))) e--;
if (s == 0 && e == numBytes - 1) {
// Nothing trimmed
return this;
@@ -1131,11 +1139,11 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
private boolean toLong(LongWrapper toLongResult, boolean allowDecimal) {
int offset = 0;
- while (offset < this.numBytes && Character.isWhitespace(getByte(offset))) offset++;
+ while (offset < this.numBytes && isWhitespaceOrISOControl(getByte(offset))) offset++;
if (offset == this.numBytes) return false;
int end = this.numBytes - 1;
- while (end > offset && Character.isWhitespace(getByte(end))) end--;
+ while (end > offset && isWhitespaceOrISOControl(getByte(end))) end--;
byte b = getByte(offset);
final boolean negative = b == '-';
@@ -1228,11 +1236,11 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
private boolean toInt(IntWrapper intWrapper, boolean allowDecimal) {
int offset = 0;
- while (offset < this.numBytes && Character.isWhitespace(getByte(offset))) offset++;
+ while (offset < this.numBytes && isWhitespaceOrISOControl(getByte(offset))) offset++;
if (offset == this.numBytes) return false;
int end = this.numBytes - 1;
- while (end > offset && Character.isWhitespace(getByte(end))) end--;
+ while (end > offset && isWhitespaceOrISOControl(getByte(end))) end--;
byte b = getByte(offset);
final boolean negative = b == '-';
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 8411d448dc9..47c6a987853 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -229,6 +229,10 @@ public class UTF8StringSuite {
assertEquals(fromString("1"), fromString("1").trim());
assertEquals(fromString("1"), fromString("1\t").trimAll());
+ assertEquals(fromString("1中文").toString(), fromString("1中文").trimAll().toString());
+ assertEquals(fromString("1"), fromString("1\u0003").trimAll());
+ assertEquals(fromString("1"), fromString("1\u007F").trimAll());
+
assertEquals(fromString("hello"), fromString(" hello ").trim());
assertEquals(fromString("hello "), fromString(" hello ").trimLeft());
assertEquals(fromString(" hello"), fromString(" hello ").trimRight());
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index fdc17eed77b..d12d01460f7 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -127,6 +127,10 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper {
stringToDate(UTF8String.fromString(s))
}
+ test("SPARK-32559: string to date trim Control Characters") {
+ assert(toDate("MIDDLE\u0003") === None)
+ }
+
test("string to date") {
assert(toDate("2015-01-28").get === days(2015, 1, 28))
assert(toDate("2015").get === days(2015, 1, 1))
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org