You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2022/08/25 04:19:13 UTC

[spark] branch branch-3.3 updated: [SPARK-40213][SQL] Support ASCII value conversion for Latin-1 characters

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
     new d9dc28075bc [SPARK-40213][SQL] Support ASCII value conversion for Latin-1 characters
d9dc28075bc is described below

commit d9dc28075bcf6e4c6756418ae872fc8db36867f2
Author: Linhong Liu <li...@databricks.com>
AuthorDate: Thu Aug 25 13:18:45 2022 +0900

    [SPARK-40213][SQL] Support ASCII value conversion for Latin-1 characters
    
    ### What changes were proposed in this pull request?
    This PR proposes to support ASCII value conversion for Latin-1 Supplement characters.
    
    ### Why are the changes needed?
    `ascii()` should be the inverse of `chr()`. But for a Latin-1 character, we get an incorrect ASCII value. For example:
    ```sql
    select ascii('§') -- output: -62, expect: 167
    select chr(167) -- output: '§'
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, fixes the incorrect ASCII conversion for Latin-1 Supplement characters
    
    ### How was this patch tested?
    UT
    
    Closes #37651 from linhongliu-db/SPARK-40213.
    
    Authored-by: Linhong Liu <li...@databricks.com>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
    (cherry picked from commit c07852380471f02955d6d17cddb3150231daa71f)
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 .../sql/catalyst/expressions/stringExpressions.scala     | 15 ++++++++-------
 .../catalyst/expressions/StringExpressionsSuite.scala    | 11 +++++++++++
 .../src/test/resources/sql-tests/inputs/charvarchar.sql  |  4 ++++
 .../test/resources/sql-tests/results/charvarchar.sql.out | 16 ++++++++++++++++
 4 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index c56a1dc47ae..00bd98a93e5 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2335,9 +2335,10 @@ case class Ascii(child: Expression)
   override def inputTypes: Seq[DataType] = Seq(StringType)
 
   protected override def nullSafeEval(string: Any): Any = {
-    val bytes = string.asInstanceOf[UTF8String].getBytes
-    if (bytes.length > 0) {
-      bytes(0).asInstanceOf[Int]
+    // only pick the first character to reduce the `toString` cost
+    val firstCharStr = string.asInstanceOf[UTF8String].substring(0, 1)
+    if (firstCharStr.numChars > 0) {
+      firstCharStr.toString.codePointAt(0)
     } else {
       0
     }
@@ -2345,11 +2346,11 @@ case class Ascii(child: Expression)
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     nullSafeCodeGen(ctx, ev, (child) => {
-      val bytes = ctx.freshName("bytes")
+      val firstCharStr = ctx.freshName("firstCharStr")
       s"""
-        byte[] $bytes = $child.getBytes();
-        if ($bytes.length > 0) {
-          ${ev.value} = (int) $bytes[0];
+        UTF8String $firstCharStr = $child.substring(0, 1);
+        if ($firstCharStr.numChars() > 0) {
+          ${ev.value} = $firstCharStr.toString().codePointAt(0);
         } else {
           ${ev.value} = 0;
         }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 655e9b744bf..45907917870 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -307,6 +307,17 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
       SubstringIndex(Literal("www||apache||org"), Literal( "||"), Literal(2)), "www||apache")
   }
 
+  test("SPARK-40213: ascii for Latin-1 Supplement characters") {
+    // scalastyle:off
+    checkEvaluation(Ascii(Literal("¥")), 165, create_row("¥"))
+    checkEvaluation(Ascii(Literal("®")), 174, create_row("®"))
+    checkEvaluation(Ascii(Literal("©")), 169, create_row("©"))
+    // scalastyle:on
+    (128 until 256).foreach { c =>
+      checkEvaluation(Ascii(Chr(Literal(c.toLong))), c, create_row(c.toLong))
+    }
+  }
+
   test("ascii for string") {
     val a = 'a.string.at(0)
     checkEvaluation(Ascii(Literal("efg")), 101, create_row("abdef"))
diff --git a/sql/core/src/test/resources/sql-tests/inputs/charvarchar.sql b/sql/core/src/test/resources/sql-tests/inputs/charvarchar.sql
index 098d09d9821..b62cbf64323 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/charvarchar.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/charvarchar.sql
@@ -113,3 +113,7 @@ drop table char_tbl1;
 drop table char_tbl2;
 drop table char_tbl3;
 drop table char_tbl4;
+
+-- ascii value for Latin-1 Supplement characters
+select ascii('§'), ascii('÷'), ascii('×10');
+select chr(167), chr(247), chr(215);
diff --git a/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out b/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out
index 6345702e00e..a1eb20da9f6 100644
--- a/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/charvarchar.sql.out
@@ -1154,3 +1154,19 @@ drop table char_tbl4
 struct<>
 -- !query output
 
+
+
+-- !query
+select ascii('§'), ascii('÷'), ascii('×10')
+-- !query schema
+struct<ascii(§):int,ascii(÷):int,ascii(×10):int>
+-- !query output
+167	247	215
+
+
+-- !query
+select chr(167), chr(247), chr(215)
+-- !query schema
+struct<chr(167):string,chr(247):string,chr(215):string>
+-- !query output
+§	÷	×


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org