You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2021/02/04 01:53:29 UTC
[spark] branch branch-3.0 updated: [SPARK-34318][SQL][3.0]
Dataset.colRegex should work with column names and qualifiers which contain
newlines
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 0694233 [SPARK-34318][SQL][3.0] Dataset.colRegex should work with column names and qualifiers which contain newlines
0694233 is described below
commit 06942331a7db1e6d5e6709ac7009c180c94cc7c0
Author: Kousuke Saruta <sa...@oss.nttdata.com>
AuthorDate: Thu Feb 4 10:53:02 2021 +0900
[SPARK-34318][SQL][3.0] Dataset.colRegex should work with column names and qualifiers which contain newlines
### What changes were proposed in this pull request?
Backport of #31426 for the record.
This PR fixes an issue that `Dataset.colRegex` doesn't work with column names or qualifiers which contain newlines.
In the current master, if column names or qualifiers passed to `colRegex` contain newlines, it throws an exception.
```
val df = Seq(1, 2, 3).toDF("test\n_column").as("test\n_table")
val col1 = df.colRegex("`tes.*\n.*mn`")
org.apache.spark.sql.AnalysisException: Cannot resolve column name "`tes.*
.*mn`" among (test
_column)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$resolveException(Dataset.scala:272)
at org.apache.spark.sql.Dataset.$anonfun$resolve$1(Dataset.scala:263)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.Dataset.resolve(Dataset.scala:263)
at org.apache.spark.sql.Dataset.colRegex(Dataset.scala:1407)
... 47 elided
val col2 = df.colRegex("test\n_table.`tes.*\n.*mn`")
org.apache.spark.sql.AnalysisException: Cannot resolve column name "test
_table.`tes.*
.*mn`" among (test
_column)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$resolveException(Dataset.scala:272)
at org.apache.spark.sql.Dataset.$anonfun$resolve$1(Dataset.scala:263)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.Dataset.resolve(Dataset.scala:263)
at org.apache.spark.sql.Dataset.colRegex(Dataset.scala:1407)
... 47 elided
```
### Why are the changes needed?
Column names and qualifiers can contain newlines but `colRegex` can't work with them, so it's a bug.
### Does this PR introduce _any_ user-facing change?
Yes. Users can pass column names and qualifiers even if they contain newlines.
### How was this patch tested?
New test.
Closes #31458 from sarutak/SPARK-34318-branch-3.0.
Authored-by: Kousuke Saruta <sa...@oss.nttdata.com>
Signed-off-by: HyukjinKwon <gu...@apache.org>
---
.../scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala | 4 ++--
.../src/test/scala/org/apache/spark/sql/DataFrameSuite.scala | 9 +++++++++
2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
index f2dab94..5e324e0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
@@ -192,10 +192,10 @@ object ParserUtils {
}
/** the column name pattern in quoted regex without qualifier */
- val escapedIdentifier = "`(.+)`".r
+ val escapedIdentifier = "`((?s).+)`".r
/** the column name pattern in quoted regex with qualifier */
- val qualifiedEscapedIdentifier = ("(.+)" + """.""" + "`(.+)`").r
+ val qualifiedEscapedIdentifier = ("((?s).+)" + """.""" + "`((?s).+)`").r
/** Some syntactic sugar which makes it easier to work with optional clauses for LogicalPlans. */
implicit class EnhancedLogicalPlan(val plan: LogicalPlan) extends AnyVal {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 7c410e8..66525d3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -2480,6 +2480,15 @@ class DataFrameSuite extends QueryTest
test("SPARK-32761: aggregating multiple distinct CONSTANT columns") {
checkAnswer(sql("select count(distinct 2), count(distinct 2,3)"), Row(1, 1))
}
+
+ test("SPARK-34318: colRegex should work with column names & qualifiers which contain newlines") {
+ val df = Seq(1, 2, 3).toDF("test\n_column").as("test\n_table")
+ val col1 = df.colRegex("`tes.*\n.*mn`")
+ checkAnswer(df.select(col1), Row(1) :: Row(2) :: Row(3) :: Nil)
+
+ val col2 = df.colRegex("test\n_table.`tes.*\n.*mn`")
+ checkAnswer(df.select(col2), Row(1) :: Row(2) :: Row(3) :: Nil)
+ }
}
case class GroupByKey(a: Int, b: Int)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org