You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2020/08/09 19:16:15 UTC
[spark] branch branch-3.0 updated: [SPARK-32559][SQL][3.0] Fix the
trim logic in UTF8String.toInt/toLong didn't handle non-ASCII characters
correctly
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 9391705 [SPARK-32559][SQL][3.0] Fix the trim logic in UTF8String.toInt/toLong didn't handle non-ASCII characters correctly
9391705 is described below
commit 9391705958a8aa41360063e1a70968daf1a1a975
Author: wangguangxin.cn <wa...@gmail.com>
AuthorDate: Sun Aug 9 12:12:14 2020 -0700
[SPARK-32559][SQL][3.0] Fix the trim logic in UTF8String.toInt/toLong didn't handle non-ASCII characters correctly
### What changes were proposed in this pull request?
This is a backport of https://github.com/apache/spark/pull/29375
The trim logic in Cast expression introduced in https://github.com/apache/spark/pull/26622 trims non-ASCII characters unexpectedly.
Before this patch
![image](https://user-images.githubusercontent.com/1312321/89513154-caad9b80-d806-11ea-9ebe-17c9e7d1b5b3.png)
After this patch
![image](https://user-images.githubusercontent.com/1312321/89513196-d731f400-d806-11ea-959c-6a7dc29dcd49.png)
### Why are the changes needed?
The behavior described above doesn't make sense, and also isn't consistent with the behavior when casting a string to double/float, nor with the behavior of Hive.
### Does this PR introduce _any_ user-facing change?
Yes
### How was this patch tested?
Added more UT
Closes #29393 from WangGuangxin/cast-bugfix-branch-3.0.
Authored-by: wangguangxin.cn <wa...@gmail.com>
Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
.../org/apache/spark/unsafe/types/UTF8String.java | 12 +++----
.../src/test/resources/sql-tests/inputs/cast.sql | 5 +++
.../test/resources/sql-tests/inputs/datetime.sql | 2 ++
.../test/resources/sql-tests/inputs/interval.sql | 1 +
.../sql-tests/results/ansi/datetime.sql.out | 30 +++++++++++++++-
.../sql-tests/results/ansi/interval.sql.out | 16 ++++++++-
.../test/resources/sql-tests/results/cast.sql.out | 42 +++++++++++++++++++++-
.../sql-tests/results/datetime-legacy.sql.out | 30 +++++++++++++++-
.../resources/sql-tests/results/datetime.sql.out | 30 +++++++++++++++-
.../resources/sql-tests/results/interval.sql.out | 16 ++++++++-
10 files changed, 172 insertions(+), 12 deletions(-)
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 7205293..43bd797 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -575,14 +575,14 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
public UTF8String trimAll() {
int s = 0;
// skip all of the whitespaces (<=0x20) in the left side
- while (s < this.numBytes && getByte(s) <= ' ') s++;
+ while (s < this.numBytes && Character.isWhitespace(getByte(s))) s++;
if (s == this.numBytes) {
// Everything trimmed
return EMPTY_UTF8;
}
// skip all of the whitespaces (<=0x20) in the right side
int e = this.numBytes - 1;
- while (e > s && getByte(e) <= ' ') e--;
+ while (e > s && Character.isWhitespace(getByte(e))) e--;
if (s == 0 && e == numBytes - 1) {
// Nothing trimmed
return this;
@@ -1119,11 +1119,11 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
private boolean toLong(LongWrapper toLongResult, boolean allowDecimal) {
int offset = 0;
- while (offset < this.numBytes && getByte(offset) <= ' ') offset++;
+ while (offset < this.numBytes && Character.isWhitespace(getByte(offset))) offset++;
if (offset == this.numBytes) return false;
int end = this.numBytes - 1;
- while (end > offset && getByte(end) <= ' ') end--;
+ while (end > offset && Character.isWhitespace(getByte(end))) end--;
byte b = getByte(offset);
final boolean negative = b == '-';
@@ -1216,11 +1216,11 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
private boolean toInt(IntWrapper intWrapper, boolean allowDecimal) {
int offset = 0;
- while (offset < this.numBytes && getByte(offset) <= ' ') offset++;
+ while (offset < this.numBytes && Character.isWhitespace(getByte(offset))) offset++;
if (offset == this.numBytes) return false;
int end = this.numBytes - 1;
- while (end > offset && getByte(end) <= ' ') end--;
+ while (end > offset && Character.isWhitespace(getByte(end))) end--;
byte b = getByte(offset);
final boolean negative = b == '-';
diff --git a/sql/core/src/test/resources/sql-tests/inputs/cast.sql b/sql/core/src/test/resources/sql-tests/inputs/cast.sql
index 972ebdd..81c741a 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/cast.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/cast.sql
@@ -70,6 +70,11 @@ select cast(' 1' as bigint);
select cast(' 1' as float);
select cast(' 1 ' as DOUBLE);
select cast('1.0 ' as DEC);
+select cast('1中文' as tinyint);
+select cast('1中文' as smallint);
+select cast('1中文' as INT);
+select cast('中文1' as bigint);
+select cast('1中文' as bigint);
-- trim string before cast to boolean
select cast('\t\t true \n\r ' as boolean);
diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
index db54b55..ae5831c 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
@@ -33,6 +33,8 @@ select year('1500-01-01'), month('1500-01-01'), dayOfYear('1500-01-01');
select date '2019-01-01\t';
select timestamp '2019-01-01\t';
+select date '2020-01-01中文';
+select timestamp '2019-01-01中文';
-- time add/sub
select timestamp'2011-11-11 11:11:11' + interval '2' day;
diff --git a/sql/core/src/test/resources/sql-tests/inputs/interval.sql b/sql/core/src/test/resources/sql-tests/inputs/interval.sql
index a7e1afe..ebc39f5 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/interval.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/interval.sql
@@ -160,6 +160,7 @@ select interval '2-2\t' year to month;
select interval '-\t2-2\t' year to month;
select interval '\n0 12:34:46.789\t' day to second;
select interval '\n-\t10\t 12:34:46.789\t' day to second;
+select interval '中文 interval 1 day';
-- interval overflow if (ansi) exception else NULL
select -(a) from values (interval '-2147483648 months', interval '2147483647 months') t(a, b);
diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
index 3667404..ec007aa 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 106
+-- Number of queries: 108
-- !query
@@ -142,6 +142,34 @@ struct<TIMESTAMP '2019-01-01 00:00:00':timestamp>
-- !query
+select date '2020-01-01中文'
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 2020-01-01中文(line 1, pos 7)
+
+== SQL ==
+select date '2020-01-01中文'
+-------^^^
+
+
+-- !query
+select timestamp '2019-01-01中文'
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the TIMESTAMP value: 2019-01-01中文(line 1, pos 7)
+
+== SQL ==
+select timestamp '2019-01-01中文'
+-------^^^
+
+
+-- !query
select timestamp'2011-11-11 11:11:11' + interval '2' day
-- !query schema
struct<CAST(TIMESTAMP '2011-11-11 11:11:11' + INTERVAL '2 days' AS TIMESTAMP):timestamp>
diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out
index e83444d..6a6a1df 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 92
+-- Number of queries: 93
-- !query
@@ -876,6 +876,20 @@ select interval '\n-\t10\t 12:34:46.789\t' day to second
-- !query
+select interval '中文 interval 1 day'
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the INTERVAL value: 中文 interval 1 day(line 1, pos 7)
+
+== SQL ==
+select interval '中文 interval 1 day'
+-------^^^
+
+
+-- !query
select -(a) from values (interval '-2147483648 months', interval '2147483647 months') t(a, b)
-- !query schema
struct<>
diff --git a/sql/core/src/test/resources/sql-tests/results/cast.sql.out b/sql/core/src/test/resources/sql-tests/results/cast.sql.out
index 35b4c0e..d4872ca 100644
--- a/sql/core/src/test/resources/sql-tests/results/cast.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/cast.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 46
+-- Number of queries: 51
-- !query
@@ -354,6 +354,46 @@ struct<CAST(1.0 AS DECIMAL(10,0)):decimal(10,0)>
-- !query
+select cast('1中文' as tinyint)
+-- !query schema
+struct<CAST(1中文 AS TINYINT):tinyint>
+-- !query output
+NULL
+
+
+-- !query
+select cast('1中文' as smallint)
+-- !query schema
+struct<CAST(1中文 AS SMALLINT):smallint>
+-- !query output
+NULL
+
+
+-- !query
+select cast('1中文' as INT)
+-- !query schema
+struct<CAST(1中文 AS INT):int>
+-- !query output
+NULL
+
+
+-- !query
+select cast('中文1' as bigint)
+-- !query schema
+struct<CAST(中文1 AS BIGINT):bigint>
+-- !query output
+NULL
+
+
+-- !query
+select cast('1中文' as bigint)
+-- !query schema
+struct<CAST(1中文 AS BIGINT):bigint>
+-- !query output
+NULL
+
+
+-- !query
select cast('\t\t true \n\r ' as boolean)
-- !query schema
struct<CAST( true
diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out
index 1342c71..fbbdb5f 100644
--- a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 106
+-- Number of queries: 108
-- !query
@@ -116,6 +116,34 @@ struct<TIMESTAMP '2019-01-01 00:00:00':timestamp>
-- !query
+select date '2020-01-01中文'
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 2020-01-01中文(line 1, pos 7)
+
+== SQL ==
+select date '2020-01-01中文'
+-------^^^
+
+
+-- !query
+select timestamp '2019-01-01中文'
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the TIMESTAMP value: 2019-01-01中文(line 1, pos 7)
+
+== SQL ==
+select timestamp '2019-01-01中文'
+-------^^^
+
+
+-- !query
select timestamp'2011-11-11 11:11:11' + interval '2' day
-- !query schema
struct<CAST(TIMESTAMP '2011-11-11 11:11:11' + INTERVAL '2 days' AS TIMESTAMP):timestamp>
diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
index d3657e8..d7e960a 100755
--- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 106
+-- Number of queries: 108
-- !query
@@ -116,6 +116,34 @@ struct<TIMESTAMP '2019-01-01 00:00:00':timestamp>
-- !query
+select date '2020-01-01中文'
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 2020-01-01中文(line 1, pos 7)
+
+== SQL ==
+select date '2020-01-01中文'
+-------^^^
+
+
+-- !query
+select timestamp '2019-01-01中文'
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the TIMESTAMP value: 2019-01-01中文(line 1, pos 7)
+
+== SQL ==
+select timestamp '2019-01-01中文'
+-------^^^
+
+
+-- !query
select timestamp'2011-11-11 11:11:11' + interval '2' day
-- !query schema
struct<CAST(TIMESTAMP '2011-11-11 11:11:11' + INTERVAL '2 days' AS TIMESTAMP):timestamp>
diff --git a/sql/core/src/test/resources/sql-tests/results/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/interval.sql.out
index 4cdc669..63ed9b5 100644
--- a/sql/core/src/test/resources/sql-tests/results/interval.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/interval.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 92
+-- Number of queries: 93
-- !query
@@ -853,6 +853,20 @@ select interval '\n-\t10\t 12:34:46.789\t' day to second
-- !query
+select interval '中文 interval 1 day'
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the INTERVAL value: 中文 interval 1 day(line 1, pos 7)
+
+== SQL ==
+select interval '中文 interval 1 day'
+-------^^^
+
+
+-- !query
select -(a) from values (interval '-2147483648 months', interval '2147483647 months') t(a, b)
-- !query schema
struct<(- a):interval>
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org