You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2020/08/09 19:16:15 UTC

[spark] branch branch-3.0 updated: [SPARK-32559][SQL][3.0] Fix the trim logic in UTF8String.toInt/toLong didn't handle non-ASCII characters correctly

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 9391705  [SPARK-32559][SQL][3.0] Fix the trim logic in UTF8String.toInt/toLong didn't handle non-ASCII characters correctly
9391705 is described below

commit 9391705958a8aa41360063e1a70968daf1a1a975
Author: wangguangxin.cn <wa...@gmail.com>
AuthorDate: Sun Aug 9 12:12:14 2020 -0700

    [SPARK-32559][SQL][3.0] Fix the trim logic in UTF8String.toInt/toLong didn't handle non-ASCII characters correctly
    
    ### What changes were proposed in this pull request?
    
    This is a backport of https://github.com/apache/spark/pull/29375
    The trim logic in the Cast expression introduced in https://github.com/apache/spark/pull/26622 trims non-ASCII characters unexpectedly.
    
    Before this patch
    ![image](https://user-images.githubusercontent.com/1312321/89513154-caad9b80-d806-11ea-9ebe-17c9e7d1b5b3.png)
    
    After this patch
    ![image](https://user-images.githubusercontent.com/1312321/89513196-d731f400-d806-11ea-959c-6a7dc29dcd49.png)
    
    ### Why are the changes needed?
    The behavior described above doesn't make sense, and it is also inconsistent with the behavior when casting a string to double/float, as well as with the behavior of Hive.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes
    
    ### How was this patch tested?
    Added more UT
    
    Closes #29393 from WangGuangxin/cast-bugfix-branch-3.0.
    
    Authored-by: wangguangxin.cn <wa...@gmail.com>
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
 .../org/apache/spark/unsafe/types/UTF8String.java  | 12 +++----
 .../src/test/resources/sql-tests/inputs/cast.sql   |  5 +++
 .../test/resources/sql-tests/inputs/datetime.sql   |  2 ++
 .../test/resources/sql-tests/inputs/interval.sql   |  1 +
 .../sql-tests/results/ansi/datetime.sql.out        | 30 +++++++++++++++-
 .../sql-tests/results/ansi/interval.sql.out        | 16 ++++++++-
 .../test/resources/sql-tests/results/cast.sql.out  | 42 +++++++++++++++++++++-
 .../sql-tests/results/datetime-legacy.sql.out      | 30 +++++++++++++++-
 .../resources/sql-tests/results/datetime.sql.out   | 30 +++++++++++++++-
 .../resources/sql-tests/results/interval.sql.out   | 16 ++++++++-
 10 files changed, 172 insertions(+), 12 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 7205293..43bd797 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -575,14 +575,14 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
   public UTF8String trimAll() {
     int s = 0;
     // skip all of the whitespaces (<=0x20) in the left side
-    while (s < this.numBytes && getByte(s) <= ' ') s++;
+    while (s < this.numBytes && Character.isWhitespace(getByte(s))) s++;
     if (s == this.numBytes) {
       // Everything trimmed
       return EMPTY_UTF8;
     }
     // skip all of the whitespaces (<=0x20) in the right side
     int e = this.numBytes - 1;
-    while (e > s && getByte(e) <= ' ') e--;
+    while (e > s && Character.isWhitespace(getByte(e))) e--;
     if (s == 0 && e == numBytes - 1) {
       // Nothing trimmed
       return this;
@@ -1119,11 +1119,11 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
 
   private boolean toLong(LongWrapper toLongResult, boolean allowDecimal) {
     int offset = 0;
-    while (offset < this.numBytes && getByte(offset) <= ' ') offset++;
+    while (offset < this.numBytes && Character.isWhitespace(getByte(offset))) offset++;
     if (offset == this.numBytes) return false;
 
     int end = this.numBytes - 1;
-    while (end > offset && getByte(end) <= ' ') end--;
+    while (end > offset && Character.isWhitespace(getByte(end))) end--;
 
     byte b = getByte(offset);
     final boolean negative = b == '-';
@@ -1216,11 +1216,11 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
 
   private boolean toInt(IntWrapper intWrapper, boolean allowDecimal) {
     int offset = 0;
-    while (offset < this.numBytes && getByte(offset) <= ' ') offset++;
+    while (offset < this.numBytes && Character.isWhitespace(getByte(offset))) offset++;
     if (offset == this.numBytes) return false;
 
     int end = this.numBytes - 1;
-    while (end > offset && getByte(end) <= ' ') end--;
+    while (end > offset && Character.isWhitespace(getByte(end))) end--;
 
     byte b = getByte(offset);
     final boolean negative = b == '-';
diff --git a/sql/core/src/test/resources/sql-tests/inputs/cast.sql b/sql/core/src/test/resources/sql-tests/inputs/cast.sql
index 972ebdd..81c741a 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/cast.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/cast.sql
@@ -70,6 +70,11 @@ select cast(' 1' as bigint);
 select cast(' 1' as float);
 select cast(' 1 ' as DOUBLE);
 select cast('1.0 ' as DEC);
+select cast('1中文' as tinyint);
+select cast('1中文' as smallint);
+select cast('1中文' as INT);
+select cast('中文1' as bigint);
+select cast('1中文' as bigint);
 
 -- trim string before cast to boolean
 select cast('\t\t true \n\r ' as boolean);
diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
index db54b55..ae5831c 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
@@ -33,6 +33,8 @@ select year('1500-01-01'), month('1500-01-01'), dayOfYear('1500-01-01');
 
 select date '2019-01-01\t';
 select timestamp '2019-01-01\t';
+select date '2020-01-01中文';
+select timestamp '2019-01-01中文';
 
 -- time add/sub
 select timestamp'2011-11-11 11:11:11' + interval '2' day;
diff --git a/sql/core/src/test/resources/sql-tests/inputs/interval.sql b/sql/core/src/test/resources/sql-tests/inputs/interval.sql
index a7e1afe..ebc39f5 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/interval.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/interval.sql
@@ -160,6 +160,7 @@ select interval '2-2\t' year to month;
 select interval '-\t2-2\t' year to month;
 select interval '\n0 12:34:46.789\t' day to second;
 select interval '\n-\t10\t 12:34:46.789\t' day to second;
+select interval '中文 interval 1 day';
 
 -- interval overflow if (ansi) exception else NULL
 select -(a) from values (interval '-2147483648 months', interval '2147483647 months') t(a, b);
diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
index 3667404..ec007aa 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 106
+-- Number of queries: 108
 
 
 -- !query
@@ -142,6 +142,34 @@ struct<TIMESTAMP '2019-01-01 00:00:00':timestamp>
 
 
 -- !query
+select date '2020-01-01中文'
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 2020-01-01中文(line 1, pos 7)
+
+== SQL ==
+select date '2020-01-01中文'
+-------^^^
+
+
+-- !query
+select timestamp '2019-01-01中文'
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the TIMESTAMP value: 2019-01-01中文(line 1, pos 7)
+
+== SQL ==
+select timestamp '2019-01-01中文'
+-------^^^
+
+
+-- !query
 select timestamp'2011-11-11 11:11:11' + interval '2' day
 -- !query schema
 struct<CAST(TIMESTAMP '2011-11-11 11:11:11' + INTERVAL '2 days' AS TIMESTAMP):timestamp>
diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out
index e83444d..6a6a1df 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 92
+-- Number of queries: 93
 
 
 -- !query
@@ -876,6 +876,20 @@ select interval '\n-\t10\t 12:34:46.789\t' day to second
 
 
 -- !query
+select interval '中文 interval 1 day'
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the INTERVAL value: 中文 interval 1 day(line 1, pos 7)
+
+== SQL ==
+select interval '中文 interval 1 day'
+-------^^^
+
+
+-- !query
 select -(a) from values (interval '-2147483648 months', interval '2147483647 months') t(a, b)
 -- !query schema
 struct<>
diff --git a/sql/core/src/test/resources/sql-tests/results/cast.sql.out b/sql/core/src/test/resources/sql-tests/results/cast.sql.out
index 35b4c0e..d4872ca 100644
--- a/sql/core/src/test/resources/sql-tests/results/cast.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/cast.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 46
+-- Number of queries: 51
 
 
 -- !query
@@ -354,6 +354,46 @@ struct<CAST(1.0  AS DECIMAL(10,0)):decimal(10,0)>
 
 
 -- !query
+select cast('1中文' as tinyint)
+-- !query schema
+struct<CAST(1中文 AS TINYINT):tinyint>
+-- !query output
+NULL
+
+
+-- !query
+select cast('1中文' as smallint)
+-- !query schema
+struct<CAST(1中文 AS SMALLINT):smallint>
+-- !query output
+NULL
+
+
+-- !query
+select cast('1中文' as INT)
+-- !query schema
+struct<CAST(1中文 AS INT):int>
+-- !query output
+NULL
+
+
+-- !query
+select cast('中文1' as bigint)
+-- !query schema
+struct<CAST(中文1 AS BIGINT):bigint>
+-- !query output
+NULL
+
+
+-- !query
+select cast('1中文' as bigint)
+-- !query schema
+struct<CAST(1中文 AS BIGINT):bigint>
+-- !query output
+NULL
+
+
+-- !query
 select cast('\t\t true \n\r ' as boolean)
 -- !query schema
 struct<CAST(		 true 
diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out
index 1342c71..fbbdb5f 100644
--- a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 106
+-- Number of queries: 108
 
 
 -- !query
@@ -116,6 +116,34 @@ struct<TIMESTAMP '2019-01-01 00:00:00':timestamp>
 
 
 -- !query
+select date '2020-01-01中文'
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 2020-01-01中文(line 1, pos 7)
+
+== SQL ==
+select date '2020-01-01中文'
+-------^^^
+
+
+-- !query
+select timestamp '2019-01-01中文'
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the TIMESTAMP value: 2019-01-01中文(line 1, pos 7)
+
+== SQL ==
+select timestamp '2019-01-01中文'
+-------^^^
+
+
+-- !query
 select timestamp'2011-11-11 11:11:11' + interval '2' day
 -- !query schema
 struct<CAST(TIMESTAMP '2011-11-11 11:11:11' + INTERVAL '2 days' AS TIMESTAMP):timestamp>
diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
index d3657e8..d7e960a 100755
--- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 106
+-- Number of queries: 108
 
 
 -- !query
@@ -116,6 +116,34 @@ struct<TIMESTAMP '2019-01-01 00:00:00':timestamp>
 
 
 -- !query
+select date '2020-01-01中文'
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 2020-01-01中文(line 1, pos 7)
+
+== SQL ==
+select date '2020-01-01中文'
+-------^^^
+
+
+-- !query
+select timestamp '2019-01-01中文'
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the TIMESTAMP value: 2019-01-01中文(line 1, pos 7)
+
+== SQL ==
+select timestamp '2019-01-01中文'
+-------^^^
+
+
+-- !query
 select timestamp'2011-11-11 11:11:11' + interval '2' day
 -- !query schema
 struct<CAST(TIMESTAMP '2011-11-11 11:11:11' + INTERVAL '2 days' AS TIMESTAMP):timestamp>
diff --git a/sql/core/src/test/resources/sql-tests/results/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/interval.sql.out
index 4cdc669..63ed9b5 100644
--- a/sql/core/src/test/resources/sql-tests/results/interval.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/interval.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 92
+-- Number of queries: 93
 
 
 -- !query
@@ -853,6 +853,20 @@ select interval '\n-\t10\t 12:34:46.789\t' day to second
 
 
 -- !query
+select interval '中文 interval 1 day'
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the INTERVAL value: 中文 interval 1 day(line 1, pos 7)
+
+== SQL ==
+select interval '中文 interval 1 day'
+-------^^^
+
+
+-- !query
 select -(a) from values (interval '-2147483648 months', interval '2147483647 months') t(a, b)
 -- !query schema
 struct<(- a):interval>


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org