Posted to commits@spark.apache.org by ma...@apache.org on 2022/06/28 17:53:41 UTC

[spark] branch master updated: [SPARK-39618][SQL] Add the `REGEXP_COUNT` function

This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 458f8a7bd9c [SPARK-39618][SQL] Add the `REGEXP_COUNT` function
458f8a7bd9c is described below

commit 458f8a7bd9c94e249bc094f095090651cabbd535
Author: Max Gekk <ma...@gmail.com>
AuthorDate: Tue Jun 28 20:53:05 2022 +0300

    [SPARK-39618][SQL] Add the `REGEXP_COUNT` function
    
    ### What changes were proposed in this pull request?
    In the PR, I propose to add a new expression `RegExpCount` as a runtime-replaceable combination of `Size` and `RegExpExtractAll`, and to bind it to the function name `REGEXP_COUNT`. The `REGEXP_COUNT` function returns the number of times that a regular expression pattern is matched in a string. It takes two parameters:
    1. An expression that specifies the string in which the search is to take place.
    2. An expression that specifies the regular expression string that is the pattern for the search.
    
    When either of the input parameters is NULL, the function returns NULL as well.
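    
    For example (a small SQL sketch; the results below are taken from the tests added in this commit, and the last query shows the `Size`/`RegExpExtractAll` combination that the function is replaced with at runtime):
    ```
    SELECT regexp_count('Steven Jones and Stephen Smith are the best players', 'Ste(v|ph)en');
    -- 2
    SELECT regexp_count('abc', NULL);
    -- NULL
    -- Equivalent runtime replacement:
    SELECT size(regexp_extract_all('Steven Jones and Stephen Smith are the best players', 'Ste(v|ph)en', 0));
    -- 2
    ```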
    
    ### Why are the changes needed?
    To ease migration from other systems to Spark SQL and to achieve feature parity with them. For example, Redshift and Snowflake support the `REGEXP_COUNT` function; see:
    - https://docs.aws.amazon.com/redshift/latest/dg/REGEXP_COUNT.html
    - https://docs.snowflake.com/en/sql-reference/functions/regexp_count.html
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    By running new tests:
    ```
    $ build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z regexp-functions.sql"
    $ build/sbt "sql/testOnly *ExpressionsSchemaSuite"
    $ build/sbt "sql/test:testOnly org.apache.spark.sql.expressions.ExpressionInfoSuite"
    ```
    
    Closes #37004 from MaxGekk/regexp_count.
    
    Authored-by: Max Gekk <ma...@gmail.com>
    Signed-off-by: Max Gekk <ma...@gmail.com>
---
 .../sql/catalyst/analysis/FunctionRegistry.scala   |  1 +
 .../catalyst/expressions/regexpExpressions.scala   | 38 +++++++++++++++
 .../sql-functions/sql-expression-schema.md         |  1 +
 .../sql-tests/inputs/regexp-functions.sql          |  9 ++++
 .../sql-tests/results/regexp-functions.sql.out     | 56 ++++++++++++++++++++++
 5 files changed, 105 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 7f2366014a4..b236cf33af0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -585,6 +585,7 @@ object FunctionRegistry {
     expression[XPathLong]("xpath_long"),
     expression[XPathShort]("xpath_short"),
     expression[XPathString]("xpath_string"),
+    expression[RegExpCount]("regexp_count"),
 
     // datetime functions
     expression[AddMonths]("add_months"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index 01763f082d6..126e0b6dc1f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -980,3 +980,41 @@ case class RegExpExtractAll(subject: Expression, regexp: Expression, idx: Expres
       newFirst: Expression, newSecond: Expression, newThird: Expression): RegExpExtractAll =
     copy(subject = newFirst, regexp = newSecond, idx = newThird)
 }
+
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+  usage = """
+    _FUNC_(str, regexp) - Returns a count of the number of times that the regular expression pattern `regexp` is matched in the string `str`.
+  """,
+  arguments = """
+    Arguments:
+      * str - a string expression.
+      * regexp - a string representing a regular expression. The regex string should be a
+          Java regular expression.
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_('Steven Jones and Stephen Smith are the best players', 'Ste(v|ph)en');
+       2
+      > SELECT _FUNC_('abcdefghijklmnopqrstuvwxyz', '[a-z]{3}');
+       8
+  """,
+  since = "3.4.0",
+  group = "string_funcs")
+// scalastyle:on line.size.limit
+case class RegExpCount(left: Expression, right: Expression)
+  extends RuntimeReplaceable with ImplicitCastInputTypes {
+
+  override lazy val replacement: Expression =
+    Size(RegExpExtractAll(left, right, Literal(0)), legacySizeOfNull = false)
+
+  override def prettyName: String = "regexp_count"
+
+  override def children: Seq[Expression] = Seq(left, right)
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)
+
+  override protected def withNewChildrenInternal(
+      newChildren: IndexedSeq[Expression]): RegExpCount =
+    copy(left = newChildren(0), right = newChildren(1))
+}
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index 9aa6e49ee08..369594c902a 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -237,6 +237,7 @@
 | org.apache.spark.sql.catalyst.expressions.Rand | random | SELECT random() | struct<rand():double> |
 | org.apache.spark.sql.catalyst.expressions.Randn | randn | SELECT randn() | struct<randn():double> |
 | org.apache.spark.sql.catalyst.expressions.Rank | rank | SELECT a, b, rank(b) OVER (PARTITION BY a ORDER BY b) FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b) | struct<a:string,b:int,RANK() OVER (PARTITION BY a ORDER BY b ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW):int> |
+| org.apache.spark.sql.catalyst.expressions.RegExpCount | regexp_count | SELECT regexp_count('Steven Jones and Stephen Smith are the best players', 'Ste(v&#124;ph)en') | struct<regexp_count(Steven Jones and Stephen Smith are the best players, Ste(v&#124;ph)en):int> |
 | org.apache.spark.sql.catalyst.expressions.RegExpExtract | regexp_extract | SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1) | struct<regexp_extract(100-200, (\d+)-(\d+), 1):string> |
 | org.apache.spark.sql.catalyst.expressions.RegExpExtractAll | regexp_extract_all | SELECT regexp_extract_all('100-200, 300-400', '(\\d+)-(\\d+)', 1) | struct<regexp_extract_all(100-200, 300-400, (\d+)-(\d+), 1):array<string>> |
 | org.apache.spark.sql.catalyst.expressions.RegExpReplace | regexp_replace | SELECT regexp_replace('100-200', '(\\d+)', 'num') | struct<regexp_replace(100-200, (\d+), num, 1):string> |
diff --git a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql
index efe5c278730..a48cfe68489 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql
@@ -51,3 +51,12 @@ SELECT regexp('1a 2b 14m', '\\d+b');
 SELECT regexp('1a 2b 14m', '[a-z]+b');
 SELECT rlike('1a 2b 14m', '\\d+b');
 SELECT rlike('1a 2b 14m', '[a-z]+b');
+
+-- regexp_count
+SELECT regexp_count('1a 2b 14m', '\\d+');
+SELECT regexp_count('1a 2b 14m', 'mmm');
+SELECT regexp_count('the fox', 'FOX');
+SELECT regexp_count('the fox', '(?i)FOX');
+SELECT regexp_count('passwd7 plain A1234 a1234', '(?=[^ ]*[a-z])(?=[^ ]*[0-9])[^ ]+');
+SELECT regexp_count(null, 'abc');
+SELECT regexp_count('abc', null);
diff --git a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out
index 7f7b52febdd..a2eb2b2a148 100644
--- a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out
@@ -379,3 +379,59 @@ SELECT rlike('1a 2b 14m', '[a-z]+b')
 struct<RLIKE(1a 2b 14m, [a-z]+b):boolean>
 -- !query output
 false
+
+
+-- !query
+SELECT regexp_count('1a 2b 14m', '\\d+')
+-- !query schema
+struct<regexp_count(1a 2b 14m, \d+):int>
+-- !query output
+3
+
+
+-- !query
+SELECT regexp_count('1a 2b 14m', 'mmm')
+-- !query schema
+struct<regexp_count(1a 2b 14m, mmm):int>
+-- !query output
+0
+
+
+-- !query
+SELECT regexp_count('the fox', 'FOX')
+-- !query schema
+struct<regexp_count(the fox, FOX):int>
+-- !query output
+0
+
+
+-- !query
+SELECT regexp_count('the fox', '(?i)FOX')
+-- !query schema
+struct<regexp_count(the fox, (?i)FOX):int>
+-- !query output
+1
+
+
+-- !query
+SELECT regexp_count('passwd7 plain A1234 a1234', '(?=[^ ]*[a-z])(?=[^ ]*[0-9])[^ ]+')
+-- !query schema
+struct<regexp_count(passwd7 plain A1234 a1234, (?=[^ ]*[a-z])(?=[^ ]*[0-9])[^ ]+):int>
+-- !query output
+2
+
+
+-- !query
+SELECT regexp_count(null, 'abc')
+-- !query schema
+struct<regexp_count(NULL, abc):int>
+-- !query output
+NULL
+
+
+-- !query
+SELECT regexp_count('abc', null)
+-- !query schema
+struct<regexp_count(abc, NULL):int>
+-- !query output
+NULL

