You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2020/02/19 12:35:37 UTC

[spark] branch branch-2.4 updated: [SPARK-30763][SQL][2.4] Fix java.lang.IndexOutOfBoundsException No group 1 for regexp_extract

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-2.4 by this push:
     new c80b79f  [SPARK-30763][SQL][2.4] Fix java.lang.IndexOutOfBoundsException No group 1 for regexp_extract
c80b79f is described below

commit c80b79f2d4f13d1db1414f7ce28d7d054105c498
Author: beliefer <be...@163.com>
AuthorDate: Wed Feb 19 20:34:53 2020 +0800

    [SPARK-30763][SQL][2.4] Fix java.lang.IndexOutOfBoundsException No group 1 for regexp_extract
    
    ### What changes were proposed in this pull request?
    This PR follows https://github.com/apache/spark/pull/27508 and used to spark2.4.
    
    ### Why are the changes needed?
    Fix a bug `java.lang.IndexOutOfBoundsException No group 1`
    
    ### Does this PR introduce any user-facing change?
    Yes
    
    ### How was this patch tested?
    New UT.
    
    Closes #27631 from beliefer/fix-2.4-regexp_extract-bug.
    
    Authored-by: beliefer <be...@163.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../catalyst/expressions/regexpExpressions.scala   | 15 ++++-
 .../expressions/RegexpExpressionsSuite.scala       | 12 ++++
 .../sql-tests/inputs/regexp-functions.sql          |  9 +++
 .../sql-tests/results/regexp-functions.sql.out     | 69 ++++++++++++++++++++++
 4 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index e80543c..7086e4d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -366,6 +366,15 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio
   }
 }
 
+object RegExpExtract {
+  def checkGroupIndex(groupCount: Int, groupIndex: Int): Unit = {
+    if (groupCount < groupIndex) {
+      throw new IllegalArgumentException(
+        s"Regex group count is $groupCount, but the specified group index is $groupIndex")
+    }
+  }
+}
+
 /**
  * Extract a specific(idx) group identified by a Java regex.
  *
@@ -397,7 +406,9 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
     val m = pattern.matcher(s.toString)
     if (m.find) {
       val mr: MatchResult = m.toMatchResult
-      val group = mr.group(r.asInstanceOf[Int])
+      val index = r.asInstanceOf[Int]
+      RegExpExtract.checkGroupIndex(mr.groupCount, index)
+      val group = mr.group(index)
       if (group == null) { // Pattern matched, but not optional group
         UTF8String.EMPTY_UTF8
       } else {
@@ -415,6 +426,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
 
   override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     val classNamePattern = classOf[Pattern].getCanonicalName
+    val classNameRegExpExtract = classOf[RegExpExtract].getCanonicalName
     val matcher = ctx.freshName("matcher")
     val matchResult = ctx.freshName("matchResult")
 
@@ -438,6 +450,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
         $termPattern.matcher($subject.toString());
       if ($matcher.find()) {
         java.util.regex.MatchResult $matchResult = $matcher.toMatchResult();
+        $classNameRegExpExtract.checkGroupIndex($matchResult.groupCount(), $idx);
         if ($matchResult.group($idx) == null) {
           ${ev.value} = UTF8String.EMPTY_UTF8;
         } else {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
index d532dc4..4c7a037 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
@@ -215,6 +215,18 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
 
     val nonNullExpr = RegExpExtract(Literal("100-200"), Literal("(\\d+)-(\\d+)"), Literal(1))
     checkEvaluation(nonNullExpr, "100", row1)
+
+    // invalid group index
+    val row8 = create_row("100-200", "(\\d+)-(\\d+)", 3)
+    val row9 = create_row("100-200", "(\\d+).*", 2)
+    val row10 = create_row("100-200", "\\d+", 1)
+
+    checkExceptionInExpression[IllegalArgumentException](
+      expr, row8, "Regex group count is 2, but the specified group index is 3")
+    checkExceptionInExpression[IllegalArgumentException](
+      expr, row9, "Regex group count is 1, but the specified group index is 2")
+    checkExceptionInExpression[IllegalArgumentException](
+      expr, row10, "Regex group count is 0, but the specified group index is 1")
   }
 
   test("SPLIT") {
diff --git a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql
new file mode 100644
index 0000000..c0827a3
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql
@@ -0,0 +1,9 @@
+-- regexp_extract
+SELECT regexp_extract('1a 2b 14m', '\\d+');
+SELECT regexp_extract('1a 2b 14m', '\\d+', 0);
+SELECT regexp_extract('1a 2b 14m', '\\d+', 1);
+SELECT regexp_extract('1a 2b 14m', '\\d+', 2);
+SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)');
+SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0);
+SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1);
+SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2);
diff --git a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out
new file mode 100644
index 0000000..f54f67f
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out
@@ -0,0 +1,69 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 8
+
+
+-- !query 0
+SELECT regexp_extract('1a 2b 14m', '\\d+')
+-- !query 0 schema
+struct<>
+-- !query 0 output
+java.lang.IllegalArgumentException
+Regex group count is 0, but the specified group index is 1
+
+
+-- !query 1
+SELECT regexp_extract('1a 2b 14m', '\\d+', 0)
+-- !query 1 schema
+struct<regexp_extract(1a 2b 14m, \d+, 0):string>
+-- !query 1 output
+1
+
+
+-- !query 2
+SELECT regexp_extract('1a 2b 14m', '\\d+', 1)
+-- !query 2 schema
+struct<>
+-- !query 2 output
+java.lang.IllegalArgumentException
+Regex group count is 0, but the specified group index is 1
+
+
+-- !query 3
+SELECT regexp_extract('1a 2b 14m', '\\d+', 2)
+-- !query 3 schema
+struct<>
+-- !query 3 output
+java.lang.IllegalArgumentException
+Regex group count is 0, but the specified group index is 2
+
+
+-- !query 4
+SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)')
+-- !query 4 schema
+struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 1):string>
+-- !query 4 output
+1
+
+
+-- !query 5
+SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0)
+-- !query 5 schema
+struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 0):string>
+-- !query 5 output
+1a
+
+
+-- !query 6
+SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1)
+-- !query 6 schema
+struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 1):string>
+-- !query 6 output
+1
+
+
+-- !query 7
+SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2)
+-- !query 7 schema
+struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 2):string>
+-- !query 7 output
+a


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org