You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2021/02/04 01:53:29 UTC
[spark] branch branch-3.0 updated: [SPARK-34318][SQL][3.0]
Dataset.colRegex should work with column names and qualifiers which contain
newlines
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 0694233 [SPARK-34318][SQL][3.0] Dataset.colRegex should work with column names and qualifiers which contain newlines
0694233 is described below
commit 06942331a7db1e6d5e6709ac7009c180c94cc7c0
Author: Kousuke Saruta <sa...@oss.nttdata.com>
AuthorDate: Thu Feb 4 10:53:02 2021 +0900
[SPARK-34318][SQL][3.0] Dataset.colRegex should work with column names and qualifiers which contain newlines
### What changes were proposed in this pull request?
Backport of #31426 for the record.
This PR fixes an issue that `Dataset.colRegex` doesn't work with column names or qualifiers which contain newlines.
In the current master, if column names or qualifiers passed to `colRegex` contain newlines, it throws an exception.
```
val df = Seq(1, 2, 3).toDF("test\n_column").as("test\n_table")
val col1 = df.colRegex("`tes.*\n.*mn`")
org.apache.spark.sql.AnalysisException: Cannot resolve column name "`tes.*
.*mn`" among (test
_column)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$resolveException(Dataset.scala:272)
at org.apache.spark.sql.Dataset.$anonfun$resolve$1(Dataset.scala:263)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.Dataset.resolve(Dataset.scala:263)
at org.apache.spark.sql.Dataset.colRegex(Dataset.scala:1407)
... 47 elided
val col2 = df.colRegex("test\n_table.`tes.*\n.*mn`")
org.apache.spark.sql.AnalysisException: Cannot resolve column name "test
_table.`tes.*
.*mn`" among (test
_column)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$resolveException(Dataset.scala:272)
at org.apache.spark.sql.Dataset.$anonfun$resolve$1(Dataset.scala:263)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.Dataset.resolve(Dataset.scala:263)
at org.apache.spark.sql.Dataset.colRegex(Dataset.scala:1407)
... 47 elided
```
### Why are the changes needed?
Column names and qualifiers can contain newlines but `colRegex` can't work with them, so it's a bug.
### Does this PR introduce _any_ user-facing change?
Yes. Users can pass column names and qualifiers even if they contain newlines.
### How was this patch tested?
New test.
Closes #31458 from sarutak/SPARK-34318-branch-3.0.
Authored-by: Kousuke Saruta <sa...@oss.nttdata.com>
Signed-off-by: HyukjinKwon <gu...@apache.org>
---
.../scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala | 4 ++--
.../src/test/scala/org/apache/spark/sql/DataFrameSuite.scala | 9 +++++++++
2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
index f2dab94..5e324e0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
@@ -192,10 +192,10 @@ object ParserUtils {
}
/** the column name pattern in quoted regex without qualifier */
- val escapedIdentifier = "`(.+)`".r
+ val escapedIdentifier = "`((?s).+)`".r
/** the column name pattern in quoted regex with qualifier */
- val qualifiedEscapedIdentifier = ("(.+)" + """.""" + "`(.+)`").r
+ val qualifiedEscapedIdentifier = ("((?s).+)" + """.""" + "`((?s).+)`").r
/** Some syntactic sugar which makes it easier to work with optional clauses for LogicalPlans. */
implicit class EnhancedLogicalPlan(val plan: LogicalPlan) extends AnyVal {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 7c410e8..66525d3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -2480,6 +2480,15 @@ class DataFrameSuite extends QueryTest
test("SPARK-32761: aggregating multiple distinct CONSTANT columns") {
checkAnswer(sql("select count(distinct 2), count(distinct 2,3)"), Row(1, 1))
}
+
+ test("SPARK-34318: colRegex should work with column names & qualifiers which contain newlines") {
+ val df = Seq(1, 2, 3).toDF("test\n_column").as("test\n_table")
+ val col1 = df.colRegex("`tes.*\n.*mn`")
+ checkAnswer(df.select(col1), Row(1) :: Row(2) :: Row(3) :: Nil)
+
+ val col2 = df.colRegex("test\n_table.`tes.*\n.*mn`")
+ checkAnswer(df.select(col2), Row(1) :: Row(2) :: Row(3) :: Nil)
+ }
}
case class GroupByKey(a: Int, b: Int)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org