You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2020/02/19 12:35:37 UTC
[spark] branch branch-2.4 updated: [SPARK-30763][SQL][2.4] Fix
java.lang.IndexOutOfBoundsException No group 1 for regexp_extract
This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-2.4 by this push:
new c80b79f [SPARK-30763][SQL][2.4] Fix java.lang.IndexOutOfBoundsException No group 1 for regexp_extract
c80b79f is described below
commit c80b79f2d4f13d1db1414f7ce28d7d054105c498
Author: beliefer <be...@163.com>
AuthorDate: Wed Feb 19 20:34:53 2020 +0800
[SPARK-30763][SQL][2.4] Fix java.lang.IndexOutOfBoundsException No group 1 for regexp_extract
### What changes were proposed in this pull request?
This PR follows https://github.com/apache/spark/pull/27508 and backports that fix to Spark 2.4.
### Why are the changes needed?
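Fixes a bug where `regexp_extract` throws `java.lang.IndexOutOfBoundsException: No group 1` when the specified group index is greater than the number of groups in the regex.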
### Does this PR introduce any user-facing change?
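Yes. When the specified group index exceeds the regex group count, `regexp_extract` now throws `java.lang.IllegalArgumentException` with the message "Regex group count is N, but the specified group index is M" instead of `java.lang.IndexOutOfBoundsException`.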
### How was this patch tested?
New unit tests in `RegexpExpressionsSuite` and new SQL query tests in `regexp-functions.sql`.
Closes #27631 from beliefer/fix-2.4-regexp_extract-bug.
Authored-by: beliefer <be...@163.com>
Signed-off-by: Wenchen Fan <we...@databricks.com>
---
.../catalyst/expressions/regexpExpressions.scala | 15 ++++-
.../expressions/RegexpExpressionsSuite.scala | 12 ++++
.../sql-tests/inputs/regexp-functions.sql | 9 +++
.../sql-tests/results/regexp-functions.sql.out | 69 ++++++++++++++++++++++
4 files changed, 104 insertions(+), 1 deletion(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index e80543c..7086e4d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -366,6 +366,15 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio
}
}
+object RegExpExtract {
+ def checkGroupIndex(groupCount: Int, groupIndex: Int): Unit = {
+ if (groupCount < groupIndex) {
+ throw new IllegalArgumentException(
+ s"Regex group count is $groupCount, but the specified group index is $groupIndex")
+ }
+ }
+}
+
/**
* Extract a specific(idx) group identified by a Java regex.
*
@@ -397,7 +406,9 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
val m = pattern.matcher(s.toString)
if (m.find) {
val mr: MatchResult = m.toMatchResult
- val group = mr.group(r.asInstanceOf[Int])
+ val index = r.asInstanceOf[Int]
+ RegExpExtract.checkGroupIndex(mr.groupCount, index)
+ val group = mr.group(index)
if (group == null) { // Pattern matched, but not optional group
UTF8String.EMPTY_UTF8
} else {
@@ -415,6 +426,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
val classNamePattern = classOf[Pattern].getCanonicalName
+ val classNameRegExpExtract = classOf[RegExpExtract].getCanonicalName
val matcher = ctx.freshName("matcher")
val matchResult = ctx.freshName("matchResult")
@@ -438,6 +450,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
$termPattern.matcher($subject.toString());
if ($matcher.find()) {
java.util.regex.MatchResult $matchResult = $matcher.toMatchResult();
+ $classNameRegExpExtract.checkGroupIndex($matchResult.groupCount(), $idx);
if ($matchResult.group($idx) == null) {
${ev.value} = UTF8String.EMPTY_UTF8;
} else {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
index d532dc4..4c7a037 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
@@ -215,6 +215,18 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
val nonNullExpr = RegExpExtract(Literal("100-200"), Literal("(\\d+)-(\\d+)"), Literal(1))
checkEvaluation(nonNullExpr, "100", row1)
+
+ // invalid group index
+ val row8 = create_row("100-200", "(\\d+)-(\\d+)", 3)
+ val row9 = create_row("100-200", "(\\d+).*", 2)
+ val row10 = create_row("100-200", "\\d+", 1)
+
+ checkExceptionInExpression[IllegalArgumentException](
+ expr, row8, "Regex group count is 2, but the specified group index is 3")
+ checkExceptionInExpression[IllegalArgumentException](
+ expr, row9, "Regex group count is 1, but the specified group index is 2")
+ checkExceptionInExpression[IllegalArgumentException](
+ expr, row10, "Regex group count is 0, but the specified group index is 1")
}
test("SPLIT") {
diff --git a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql
new file mode 100644
index 0000000..c0827a3
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql
@@ -0,0 +1,9 @@
+-- regexp_extract
+SELECT regexp_extract('1a 2b 14m', '\\d+');
+SELECT regexp_extract('1a 2b 14m', '\\d+', 0);
+SELECT regexp_extract('1a 2b 14m', '\\d+', 1);
+SELECT regexp_extract('1a 2b 14m', '\\d+', 2);
+SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)');
+SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0);
+SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1);
+SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2);
diff --git a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out
new file mode 100644
index 0000000..f54f67f
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out
@@ -0,0 +1,69 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 8
+
+
+-- !query 0
+SELECT regexp_extract('1a 2b 14m', '\\d+')
+-- !query 0 schema
+struct<>
+-- !query 0 output
+java.lang.IllegalArgumentException
+Regex group count is 0, but the specified group index is 1
+
+
+-- !query 1
+SELECT regexp_extract('1a 2b 14m', '\\d+', 0)
+-- !query 1 schema
+struct<regexp_extract(1a 2b 14m, \d+, 0):string>
+-- !query 1 output
+1
+
+
+-- !query 2
+SELECT regexp_extract('1a 2b 14m', '\\d+', 1)
+-- !query 2 schema
+struct<>
+-- !query 2 output
+java.lang.IllegalArgumentException
+Regex group count is 0, but the specified group index is 1
+
+
+-- !query 3
+SELECT regexp_extract('1a 2b 14m', '\\d+', 2)
+-- !query 3 schema
+struct<>
+-- !query 3 output
+java.lang.IllegalArgumentException
+Regex group count is 0, but the specified group index is 2
+
+
+-- !query 4
+SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)')
+-- !query 4 schema
+struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 1):string>
+-- !query 4 output
+1
+
+
+-- !query 5
+SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0)
+-- !query 5 schema
+struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 0):string>
+-- !query 5 output
+1a
+
+
+-- !query 6
+SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1)
+-- !query 6 schema
+struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 1):string>
+-- !query 6 output
+1
+
+
+-- !query 7
+SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2)
+-- !query 7 schema
+struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 2):string>
+-- !query 7 output
+a
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org