You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2021/02/04 01:55:47 UTC
[spark] branch branch-3.1 updated: [SPARK-34318][SQL][3.1]
Dataset.colRegex should work with column names and qualifiers which contain
newlines
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 5f3b8b8 [SPARK-34318][SQL][3.1] Dataset.colRegex should work with column names and qualifiers which contain newlines
5f3b8b8 is described below
commit 5f3b8b825c4b0d9ae085221d1e648f26fb6a4599
Author: Kousuke Saruta <sa...@oss.nttdata.com>
AuthorDate: Thu Feb 4 10:55:17 2021 +0900
[SPARK-34318][SQL][3.1] Dataset.colRegex should work with column names and qualifiers which contain newlines
### What changes were proposed in this pull request?
Backport of #31426 for the record.
This PR fixes an issue that `Dataset.colRegex` doesn't work with column names or qualifiers which contain newlines.
In the current master, if column names or qualifiers passed to `colRegex` contain newlines, it throws an exception.
```
val df = Seq(1, 2, 3).toDF("test\n_column").as("test\n_table")
val col1 = df.colRegex("`tes.*\n.*mn`")
org.apache.spark.sql.AnalysisException: Cannot resolve column name "`tes.*
.*mn`" among (test
_column)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$resolveException(Dataset.scala:272)
at org.apache.spark.sql.Dataset.$anonfun$resolve$1(Dataset.scala:263)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.Dataset.resolve(Dataset.scala:263)
at org.apache.spark.sql.Dataset.colRegex(Dataset.scala:1407)
... 47 elided
val col2 = df.colRegex("test\n_table.`tes.*\n.*mn`")
org.apache.spark.sql.AnalysisException: Cannot resolve column name "test
_table.`tes.*
.*mn`" among (test
_column)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$resolveException(Dataset.scala:272)
at org.apache.spark.sql.Dataset.$anonfun$resolve$1(Dataset.scala:263)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.Dataset.resolve(Dataset.scala:263)
at org.apache.spark.sql.Dataset.colRegex(Dataset.scala:1407)
... 47 elided
```
### Why are the changes needed?
Column names and qualifiers can contain newlines but `colRegex` can't work with them, so it's a bug.
### Does this PR introduce _any_ user-facing change?
Yes. Users can pass column names and qualifiers even if they contain newlines.
### How was this patch tested?
New test.
Closes #31457 from sarutak/SPARK-34318-branch-3.1.
Authored-by: Kousuke Saruta <sa...@oss.nttdata.com>
Signed-off-by: HyukjinKwon <gu...@apache.org>
---
.../scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala | 4 ++--
.../src/test/scala/org/apache/spark/sql/DataFrameSuite.scala | 9 +++++++++
2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
index 1f32620..4976e2c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala
@@ -203,10 +203,10 @@ object ParserUtils {
}
/** the column name pattern in quoted regex without qualifier */
- val escapedIdentifier = "`(.+)`".r
+ val escapedIdentifier = "`((?s).+)`".r
/** the column name pattern in quoted regex with qualifier */
- val qualifiedEscapedIdentifier = ("(.+)" + """.""" + "`(.+)`").r
+ val qualifiedEscapedIdentifier = ("((?s).+)" + """.""" + "`((?s).+)`").r
/** Some syntactic sugar which makes it easier to work with optional clauses for LogicalPlans. */
implicit class EnhancedLogicalPlan(val plan: LogicalPlan) extends AnyVal {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index 4fecd62..8d95f83 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -2575,6 +2575,15 @@ class DataFrameSuite extends QueryTest
val df = l.join(r, $"col2" === $"col4", "LeftOuter")
checkAnswer(df, Row("2", "2"))
}
+
+ test("SPARK-34318: colRegex should work with column names & qualifiers which contain newlines") {
+ val df = Seq(1, 2, 3).toDF("test\n_column").as("test\n_table")
+ val col1 = df.colRegex("`tes.*\n.*mn`")
+ checkAnswer(df.select(col1), Row(1) :: Row(2) :: Row(3) :: Nil)
+
+ val col2 = df.colRegex("test\n_table.`tes.*\n.*mn`")
+ checkAnswer(df.select(col2), Row(1) :: Row(2) :: Row(3) :: Nil)
+ }
}
case class GroupByKey(a: Int, b: Int)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org