You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2022/08/25 04:19:25 UTC
[spark] branch branch-3.2 updated: [SPARK-40213][SQL] Support ASCII value conversion for Latin-1 characters
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push:
new 94f3b111ca9 [SPARK-40213][SQL] Support ASCII value conversion for Latin-1 characters
94f3b111ca9 is described below
commit 94f3b111ca9de500ab4277a0f6bc119e5de6398d
Author: Linhong Liu <li...@databricks.com>
AuthorDate: Thu Aug 25 13:18:45 2022 +0900
[SPARK-40213][SQL] Support ASCII value conversion for Latin-1 characters
### What changes were proposed in this pull request?
This PR proposes to support ASCII value conversion for Latin-1 Supplement characters.
### Why are the changes needed?
`ascii()` should be the inverse of `chr()`. But for Latin-1 characters, we get an incorrect ASCII value. For example:
```sql
select ascii('§') -- output: -62, expect: 167
select chr(167) -- output: '§'
```
### Does this PR introduce _any_ user-facing change?
Yes, fixes the incorrect ASCII conversion for Latin-1 Supplement characters
### How was this patch tested?
Unit tests (UT)
Closes #37651 from linhongliu-db/SPARK-40213.
Authored-by: Linhong Liu <li...@databricks.com>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
(cherry picked from commit c07852380471f02955d6d17cddb3150231daa71f)
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
.../sql/catalyst/expressions/stringExpressions.scala | 15 ++++++++-------
.../catalyst/expressions/StringExpressionsSuite.scala | 11 +++++++++++
.../src/test/resources/sql-tests/inputs/charvarchar.sql | 4 ++++
.../test/resources/sql-tests/results/charvarchar.sql.out | 16 ++++++++++++++++
4 files changed, 39 insertions(+), 7 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 80f05c2e99e..3ffa8390df1 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2109,9 +2109,10 @@ case class Ascii(child: Expression)
override def inputTypes: Seq[DataType] = Seq(StringType)
protected override def nullSafeEval(string: Any): Any = {
- val bytes = string.asInstanceOf[UTF8String].getBytes
- if (bytes.length > 0) {
- bytes(0).asInstanceOf[Int]
+ // only pick the first character to reduce the `toString` cost
+ val firstCharStr = string.asInstanceOf[UTF8String].substring(0, 1)
+ if (firstCharStr.numChars > 0) {
+ firstCharStr.toString.codePointAt(0)
} else {
0
}
@@ -2119,11 +2120,11 @@ case class Ascii(child: Expression)
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
nullSafeCodeGen(ctx, ev, (child) => {
- val bytes = ctx.freshName("bytes")
+ val firstCharStr = ctx.freshName("firstCharStr")
s"""
- byte[] $bytes = $child.getBytes();
- if ($bytes.length > 0) {
- ${ev.value} = (int) $bytes[0];
+ UTF8String $firstCharStr = $child.substring(0, 1);
+ if ($firstCharStr.numChars() > 0) {
+ ${ev.value} = $firstCharStr.toString().codePointAt(0);
} else {
${ev.value} = 0;
}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index d25240fec13..2c40d226236 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -301,6 +301,17 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
SubstringIndex(Literal("www||apache||org"), Literal( "||"), Literal(2)), "www||apache")
}
+ test("SPARK-40213: ascii for Latin-1 Supplement characters") {
+ // scalastyle:off
+ checkEvaluation(Ascii(Literal("¥")), 165, create_row("¥"))
+ checkEvaluation(Ascii(Literal("®")), 174, create_row("®"))
+ checkEvaluation(Ascii(Literal("©")), 169, create_row("©"))
+ // scalastyle:on
+ (128 until 256).foreach { c =>
+ checkEvaluation(Ascii(Chr(Literal(c.toLong))), c, create_row(c.toLong))
+ }
+ }
+
test("ascii for string") {
val a = 'a.string.at(0)
checkEvaluation(Ascii(Literal("efg")), 101, create_row("abdef"))
diff --git a/sql/core/src/test/resources/sql-tests/inputs/charvarchar.sql b/sql/core/src/test/resources/sql-tests/inputs/charvarchar.sql
index 098d09d9821..b62cbf64323 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/charvarchar.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/charvarchar.sql
@@ -113,3 +113,7 @@ drop table char_tbl1;
drop table char_tbl2;
drop table char_tbl3;
drop table char_tbl4;
+
+-- ascii value for Latin-1 Supplement characters
+select ascii('§'), ascii('÷'), ascii('×10');
+select chr(167), chr(247), chr(215);
diff --git a/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out b/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out
index 00180eec565..fd22918d1a1 100644
--- a/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out
@@ -1157,3 +1157,19 @@ drop table char_tbl4
struct<>
-- !query output
+
+
+-- !query
+select ascii('§'), ascii('÷'), ascii('×10')
+-- !query schema
+struct<ascii(§):int,ascii(÷):int,ascii(×10):int>
+-- !query output
+167 247 215
+
+
+-- !query
+select chr(167), chr(247), chr(215)
+-- !query schema
+struct<chr(167):string,chr(247):string,chr(215):string>
+-- !query output
+§ ÷ ×
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org