Posted to commits@spark.apache.org by we...@apache.org on 2022/07/15 04:31:44 UTC

[spark] branch master updated: [SPARK-39741][SQL] Support url encode/decode as built-in function and tidy up url-related functions

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new e5c1b822016 [SPARK-39741][SQL] Support url encode/decode as built-in function and tidy up url-related functions
e5c1b822016 is described below

commit e5c1b822016600e77fabcdf145ecb3ba93c692b3
Author: Yikf <yi...@gmail.com>
AuthorDate: Fri Jul 15 12:31:23 2022 +0800

    [SPARK-39741][SQL] Support url encode/decode as built-in function and tidy up url-related functions
    
    ### What changes were proposed in this pull request?
    Currently, Spark does not support URL encode/decode as built-in functions, so users have to fall back on the `reflect` function instead. That is a hassle, and these functions are frequently needed.
    
    This PR aims at two points, as follows:
    - add `url_encode`/`url_decode` as built-in functions (see the usage sketch below).
    - tidy up the url-related functions into one Scala file (`urlExpressions.scala`).
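    
    A minimal usage sketch of the before/after, assuming a `SparkSession` named `spark` is in scope (expected outputs in comments):
    
    ```scala
    // Before: the reflection workaround via Spark's existing `reflect` built-in,
    // which invokes the static java.net.URLEncoder.encode(String, String) method.
    spark.sql(
      "SELECT reflect('java.net.URLEncoder', 'encode', 'https://spark.apache.org', 'UTF-8')"
    ).show(truncate = false) // https%3A%2F%2Fspark.apache.org
    
    // After: the new built-in functions added by this PR.
    spark.sql("SELECT url_encode('https://spark.apache.org')")
      .show(truncate = false) // https%3A%2F%2Fspark.apache.org
    spark.sql("SELECT url_decode('https%3A%2F%2Fspark.apache.org')")
      .show(truncate = false) // https://spark.apache.org
    ```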
    
    ### Why are the changes needed?
    
    URL encode/decode functions are broadly useful, and native support removes the need for the reflection workaround.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, this adds `url_encode` and `url_decode` as new built-in functions.
    
    ### How was this patch tested?
    
    Added new tests: `UrlFunctionsSuite` and the `url-functions.sql` golden-file tests; a minimal sketch of the coverage follows.
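    
    A minimal sketch of what the new coverage exercises, assuming a `QueryTest` with `SharedSparkSession` and `testImplicits` in scope, as in the suite added below:
    
    ```scala
    val df = Seq("https://spark.apache.org").toDF("url")
    // Encoding produces 'application/x-www-form-urlencoded' output.
    checkAnswer(df.selectExpr("url_encode(url)"), Row("https%3A%2F%2Fspark.apache.org"))
    // Round-trip check: url_decode(url_encode(s)) returns the original string.
    checkAnswer(df.selectExpr("url_decode(url_encode(url))"), Row("https://spark.apache.org"))
    ```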
    
    Closes #37113 from yikf/url.
    
    Authored-by: Yikf <yi...@gmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../sql/catalyst/expressions/ExpressionInfo.java   |   2 +-
 .../sql/catalyst/analysis/FunctionRegistry.scala   |   6 +-
 .../catalyst/expressions/stringExpressions.scala   | 177 -------------
 .../sql/catalyst/expressions/urlExpressions.scala  | 290 +++++++++++++++++++++
 .../sql-functions/sql-expression-schema.md         |   2 +
 .../resources/sql-tests/inputs/url-functions.sql   |  19 ++
 .../sql-tests/results/url-functions.sql.out        | 111 ++++++++
 .../apache/spark/sql/StringFunctionsSuite.scala    |  47 ----
 .../org/apache/spark/sql/UrlFunctionsSuite.scala   |  85 ++++++
 9 files changed, 513 insertions(+), 226 deletions(-)

diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java
index 9ed764a3485..be2b3dbe819 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java
@@ -45,7 +45,7 @@ public class ExpressionInfo {
             "collection_funcs", "predicate_funcs", "conditional_funcs", "conversion_funcs",
             "csv_funcs", "datetime_funcs", "generator_funcs", "hash_funcs", "json_funcs",
             "lambda_funcs", "map_funcs", "math_funcs", "misc_funcs", "string_funcs", "struct_funcs",
-            "window_funcs", "xml_funcs", "table_funcs"));
+            "window_funcs", "xml_funcs", "table_funcs", "url_funcs"));
 
     private static final Set<String> validSources =
             new HashSet<>(Arrays.asList("built-in", "hive", "python_udf", "scala_udf", "java_udf"));
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 088d85034db..d97b344d166 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -544,7 +544,6 @@ object FunctionRegistry {
     expressionBuilder("lpad", LPadExpressionBuilder),
     expression[StringTrimLeft]("ltrim"),
     expression[JsonTuple]("json_tuple"),
-    expression[ParseUrl]("parse_url"),
     expression[StringLocate]("position", true),
     expression[FormatString]("printf", true),
     expression[RegExpExtract]("regexp_extract"),
@@ -588,6 +587,11 @@ object FunctionRegistry {
     expression[RegExpSubStr]("regexp_substr"),
     expression[RegExpInStr]("regexp_instr"),
 
+    // url functions
+    expression[UrlEncode]("url_encode"),
+    expression[UrlDecode]("url_decode"),
+    expression[ParseUrl]("parse_url"),
+
     // datetime functions
     expression[AddMonths]("add_months"),
     expression[CurrentDate]("current_date"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index a4c5af582fa..bc24a12f083 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -17,11 +17,9 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import java.net.{URI, URISyntaxException}
 import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols}
 import java.util.{Base64 => JBase64}
 import java.util.{HashMap, Locale, Map => JMap}
-import java.util.regex.Pattern
 
 import scala.collection.mutable.ArrayBuffer
 
@@ -1626,181 +1624,6 @@ case class StringRPad(str: Expression, len: Expression, pad: Expression = Litera
     copy(str = newFirst, len = newSecond, pad = newThird)
 }
 
-object ParseUrl {
-  private val HOST = UTF8String.fromString("HOST")
-  private val PATH = UTF8String.fromString("PATH")
-  private val QUERY = UTF8String.fromString("QUERY")
-  private val REF = UTF8String.fromString("REF")
-  private val PROTOCOL = UTF8String.fromString("PROTOCOL")
-  private val FILE = UTF8String.fromString("FILE")
-  private val AUTHORITY = UTF8String.fromString("AUTHORITY")
-  private val USERINFO = UTF8String.fromString("USERINFO")
-  private val REGEXPREFIX = "(&|^)"
-  private val REGEXSUBFIX = "=([^&]*)"
-}
-
-/**
- * Extracts a part from a URL
- */
-@ExpressionDescription(
-  usage = "_FUNC_(url, partToExtract[, key]) - Extracts a part from a URL.",
-  examples = """
-    Examples:
-      > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'HOST');
-       spark.apache.org
-      > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY');
-       query=1
-      > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY', 'query');
-       1
-  """,
-  since = "2.0.0",
-  group = "string_funcs")
-case class ParseUrl(children: Seq[Expression], failOnError: Boolean = SQLConf.get.ansiEnabled)
-  extends Expression with ExpectsInputTypes with CodegenFallback {
-  def this(children: Seq[Expression]) = this(children, SQLConf.get.ansiEnabled)
-
-  override def nullable: Boolean = true
-  override def inputTypes: Seq[DataType] = Seq.fill(children.size)(StringType)
-  override def dataType: DataType = StringType
-  override def prettyName: String = "parse_url"
-
-  // If the url is a constant, cache the URL object so that we don't need to convert url
-  // from UTF8String to String to URL for every row.
-  @transient private lazy val cachedUrl = children(0) match {
-    case Literal(url: UTF8String, _) if url ne null => getUrl(url)
-    case _ => null
-  }
-
-  // If the key is a constant, cache the Pattern object so that we don't need to convert key
-  // from UTF8String to String to StringBuilder to String to Pattern for every row.
-  @transient private lazy val cachedPattern = children(2) match {
-    case Literal(key: UTF8String, _) if key ne null => getPattern(key)
-    case _ => null
-  }
-
-  // If the partToExtract is a constant, cache the Extract part function so that we don't need
-  // to check the partToExtract for every row.
-  @transient private lazy val cachedExtractPartFunc = children(1) match {
-    case Literal(part: UTF8String, _) => getExtractPartFunc(part)
-    case _ => null
-  }
-
-  import ParseUrl._
-
-  override def checkInputDataTypes(): TypeCheckResult = {
-    if (children.size > 3 || children.size < 2) {
-      TypeCheckResult.TypeCheckFailure(s"$prettyName function requires two or three arguments")
-    } else {
-      super[ExpectsInputTypes].checkInputDataTypes()
-    }
-  }
-
-  private def getPattern(key: UTF8String): Pattern = {
-    Pattern.compile(REGEXPREFIX + key.toString + REGEXSUBFIX)
-  }
-
-  private def getUrl(url: UTF8String): URI = {
-    try {
-      new URI(url.toString)
-    } catch {
-      case e: URISyntaxException if failOnError =>
-        throw QueryExecutionErrors.invalidUrlError(url, e)
-      case _: URISyntaxException => null
-    }
-  }
-
-  private def getExtractPartFunc(partToExtract: UTF8String): URI => String = {
-
-    // partToExtract match {
-    //   case HOST => _.toURL().getHost
-    //   case PATH => _.toURL().getPath
-    //   case QUERY => _.toURL().getQuery
-    //   case REF => _.toURL().getRef
-    //   case PROTOCOL => _.toURL().getProtocol
-    //   case FILE => _.toURL().getFile
-    //   case AUTHORITY => _.toURL().getAuthority
-    //   case USERINFO => _.toURL().getUserInfo
-    //   case _ => (url: URI) => null
-    // }
-
-    partToExtract match {
-      case HOST => _.getHost
-      case PATH => _.getRawPath
-      case QUERY => _.getRawQuery
-      case REF => _.getRawFragment
-      case PROTOCOL => _.getScheme
-      case FILE =>
-        (url: URI) =>
-          if (url.getRawQuery ne null) {
-            url.getRawPath + "?" + url.getRawQuery
-          } else {
-            url.getRawPath
-          }
-      case AUTHORITY => _.getRawAuthority
-      case USERINFO => _.getRawUserInfo
-      case _ => (url: URI) => null
-    }
-  }
-
-  private def extractValueFromQuery(query: UTF8String, pattern: Pattern): UTF8String = {
-    val m = pattern.matcher(query.toString)
-    if (m.find()) {
-      UTF8String.fromString(m.group(2))
-    } else {
-      null
-    }
-  }
-
-  private def extractFromUrl(url: URI, partToExtract: UTF8String): UTF8String = {
-    if (cachedExtractPartFunc ne null) {
-      UTF8String.fromString(cachedExtractPartFunc.apply(url))
-    } else {
-      UTF8String.fromString(getExtractPartFunc(partToExtract).apply(url))
-    }
-  }
-
-  private def parseUrlWithoutKey(url: UTF8String, partToExtract: UTF8String): UTF8String = {
-    if (cachedUrl ne null) {
-      extractFromUrl(cachedUrl, partToExtract)
-    } else {
-      val currentUrl = getUrl(url)
-      if (currentUrl ne null) {
-        extractFromUrl(currentUrl, partToExtract)
-      } else {
-        null
-      }
-    }
-  }
-
-  override def eval(input: InternalRow): Any = {
-    val evaluated = children.map{e => e.eval(input).asInstanceOf[UTF8String]}
-    if (evaluated.contains(null)) return null
-    if (evaluated.size == 2) {
-      parseUrlWithoutKey(evaluated(0), evaluated(1))
-    } else {
-      // 3-arg, i.e. QUERY with key
-      assert(evaluated.size == 3)
-      if (evaluated(1) != QUERY) {
-        return null
-      }
-
-      val query = parseUrlWithoutKey(evaluated(0), evaluated(1))
-      if (query eq null) {
-        return null
-      }
-
-      if (cachedPattern ne null) {
-        extractValueFromQuery(query, cachedPattern)
-      } else {
-        extractValueFromQuery(query, getPattern(evaluated(2)))
-      }
-    }
-  }
-
-  override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): ParseUrl =
-    copy(children = newChildren)
-}
-
 /**
  * Returns the input formatted according do printf-style format strings
  */
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
new file mode 100644
index 00000000000..174e60371af
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
@@ -0,0 +1,290 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import java.net.{URI, URISyntaxException, URLDecoder, URLEncoder}
+import java.util.regex.Pattern
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke
+import org.apache.spark.sql.catalyst.trees.UnaryLike
+import org.apache.spark.sql.errors.QueryExecutionErrors
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.{AbstractDataType, DataType, StringType}
+import org.apache.spark.unsafe.types.UTF8String
+
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+  usage = """
+    _FUNC_(str) - Translates a string into 'application/x-www-form-urlencoded' format using a specific encoding scheme.
+  """,
+  arguments = """
+    Arguments:
+      str - a string expression to be translated
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_('https://spark.apache.org');
+       https%3A%2F%2Fspark.apache.org
+  """,
+  since = "3.4.0",
+  group = "url_funcs")
+// scalastyle:on line.size.limit
+case class UrlEncode(child: Expression)
+  extends RuntimeReplaceable with UnaryLike[Expression] with ImplicitCastInputTypes {
+
+  override def replacement: Expression =
+    StaticInvoke(
+      UrlCodec.getClass,
+      StringType,
+      "encode",
+      Seq(child, Literal("UTF-8")),
+      Seq(StringType))
+
+  override protected def withNewChildInternal(newChild: Expression): Expression = {
+    copy(child = newChild)
+  }
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+
+  override def prettyName: String = "url_encode"
+}
+
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+  usage = """
+    _FUNC_(str) - Decodes a `str` in 'application/x-www-form-urlencoded' format using a specific encoding scheme.
+  """,
+  arguments = """
+    Arguments:
+      * str - a string expression to decode
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_('https%3A%2F%2Fspark.apache.org');
+       https://spark.apache.org
+  """,
+  since = "3.4.0",
+  group = "url_funcs")
+// scalastyle:on line.size.limit
+case class UrlDecode(child: Expression)
+  extends RuntimeReplaceable with UnaryLike[Expression] with ImplicitCastInputTypes {
+
+  override def replacement: Expression =
+    StaticInvoke(
+      UrlCodec.getClass,
+      StringType,
+      "decode",
+      Seq(child, Literal("UTF-8")),
+      Seq(StringType))
+
+  override protected def withNewChildInternal(newChild: Expression): Expression = {
+    copy(child = newChild)
+  }
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+
+  override def prettyName: String = "url_decode"
+}
+
+object UrlCodec {
+  def encode(src: UTF8String, enc: UTF8String): UTF8String = {
+    UTF8String.fromString(URLEncoder.encode(src.toString, enc.toString))
+  }
+
+  def decode(src: UTF8String, enc: UTF8String): UTF8String = {
+    UTF8String.fromString(URLDecoder.decode(src.toString, enc.toString))
+  }
+}
+
+object ParseUrl {
+  private val HOST = UTF8String.fromString("HOST")
+  private val PATH = UTF8String.fromString("PATH")
+  private val QUERY = UTF8String.fromString("QUERY")
+  private val REF = UTF8String.fromString("REF")
+  private val PROTOCOL = UTF8String.fromString("PROTOCOL")
+  private val FILE = UTF8String.fromString("FILE")
+  private val AUTHORITY = UTF8String.fromString("AUTHORITY")
+  private val USERINFO = UTF8String.fromString("USERINFO")
+  private val REGEXPREFIX = "(&|^)"
+  private val REGEXSUBFIX = "=([^&]*)"
+}
+
+/**
+ * Extracts a part from a URL
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(url, partToExtract[, key]) - Extracts a part from a URL.",
+  examples = """
+    Examples:
+      > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'HOST');
+       spark.apache.org
+      > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY');
+       query=1
+      > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY', 'query');
+       1
+  """,
+  since = "2.0.0",
+  group = "url_funcs")
+case class ParseUrl(children: Seq[Expression], failOnError: Boolean = SQLConf.get.ansiEnabled)
+  extends Expression with ExpectsInputTypes with CodegenFallback {
+  def this(children: Seq[Expression]) = this(children, SQLConf.get.ansiEnabled)
+
+  override def nullable: Boolean = true
+  override def inputTypes: Seq[DataType] = Seq.fill(children.size)(StringType)
+  override def dataType: DataType = StringType
+  override def prettyName: String = "parse_url"
+
+  // If the url is a constant, cache the URL object so that we don't need to convert url
+  // from UTF8String to String to URL for every row.
+  @transient private lazy val cachedUrl = children(0) match {
+    case Literal(url: UTF8String, _) if url ne null => getUrl(url)
+    case _ => null
+  }
+
+  // If the key is a constant, cache the Pattern object so that we don't need to convert key
+  // from UTF8String to String to StringBuilder to String to Pattern for every row.
+  @transient private lazy val cachedPattern = children(2) match {
+    case Literal(key: UTF8String, _) if key ne null => getPattern(key)
+    case _ => null
+  }
+
+  // If the partToExtract is a constant, cache the Extract part function so that we don't need
+  // to check the partToExtract for every row.
+  @transient private lazy val cachedExtractPartFunc = children(1) match {
+    case Literal(part: UTF8String, _) => getExtractPartFunc(part)
+    case _ => null
+  }
+
+  import ParseUrl._
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+    if (children.size > 3 || children.size < 2) {
+      TypeCheckResult.TypeCheckFailure(s"$prettyName function requires two or three arguments")
+    } else {
+      super[ExpectsInputTypes].checkInputDataTypes()
+    }
+  }
+
+  private def getPattern(key: UTF8String): Pattern = {
+    Pattern.compile(REGEXPREFIX + key.toString + REGEXSUBFIX)
+  }
+
+  private def getUrl(url: UTF8String): URI = {
+    try {
+      new URI(url.toString)
+    } catch {
+      case e: URISyntaxException if failOnError =>
+        throw QueryExecutionErrors.invalidUrlError(url, e)
+      case _: URISyntaxException => null
+    }
+  }
+
+  private def getExtractPartFunc(partToExtract: UTF8String): URI => String = {
+
+    // partToExtract match {
+    //   case HOST => _.toURL().getHost
+    //   case PATH => _.toURL().getPath
+    //   case QUERY => _.toURL().getQuery
+    //   case REF => _.toURL().getRef
+    //   case PROTOCOL => _.toURL().getProtocol
+    //   case FILE => _.toURL().getFile
+    //   case AUTHORITY => _.toURL().getAuthority
+    //   case USERINFO => _.toURL().getUserInfo
+    //   case _ => (url: URI) => null
+    // }
+
+    partToExtract match {
+      case HOST => _.getHost
+      case PATH => _.getRawPath
+      case QUERY => _.getRawQuery
+      case REF => _.getRawFragment
+      case PROTOCOL => _.getScheme
+      case FILE =>
+        (url: URI) =>
+          if (url.getRawQuery ne null) {
+            url.getRawPath + "?" + url.getRawQuery
+          } else {
+            url.getRawPath
+          }
+      case AUTHORITY => _.getRawAuthority
+      case USERINFO => _.getRawUserInfo
+      case _ => (url: URI) => null
+    }
+  }
+
+  private def extractValueFromQuery(query: UTF8String, pattern: Pattern): UTF8String = {
+    val m = pattern.matcher(query.toString)
+    if (m.find()) {
+      UTF8String.fromString(m.group(2))
+    } else {
+      null
+    }
+  }
+
+  private def extractFromUrl(url: URI, partToExtract: UTF8String): UTF8String = {
+    if (cachedExtractPartFunc ne null) {
+      UTF8String.fromString(cachedExtractPartFunc.apply(url))
+    } else {
+      UTF8String.fromString(getExtractPartFunc(partToExtract).apply(url))
+    }
+  }
+
+  private def parseUrlWithoutKey(url: UTF8String, partToExtract: UTF8String): UTF8String = {
+    if (cachedUrl ne null) {
+      extractFromUrl(cachedUrl, partToExtract)
+    } else {
+      val currentUrl = getUrl(url)
+      if (currentUrl ne null) {
+        extractFromUrl(currentUrl, partToExtract)
+      } else {
+        null
+      }
+    }
+  }
+
+  override def eval(input: InternalRow): Any = {
+    val evaluated = children.map{e => e.eval(input).asInstanceOf[UTF8String]}
+    if (evaluated.contains(null)) return null
+    if (evaluated.size == 2) {
+      parseUrlWithoutKey(evaluated(0), evaluated(1))
+    } else {
+      // 3-arg, i.e. QUERY with key
+      assert(evaluated.size == 3)
+      if (evaluated(1) != QUERY) {
+        return null
+      }
+
+      val query = parseUrlWithoutKey(evaluated(0), evaluated(1))
+      if (query eq null) {
+        return null
+      }
+
+      if (cachedPattern ne null) {
+        extractValueFromQuery(query, cachedPattern)
+      } else {
+        extractValueFromQuery(query, getPattern(evaluated(2)))
+      }
+    }
+  }
+
+  override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): ParseUrl =
+    copy(children = newChildren)
+}
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index 907c62b4ee0..78a9ce7c386 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -332,6 +332,8 @@
 | org.apache.spark.sql.catalyst.expressions.UnixTimestamp | unix_timestamp | SELECT unix_timestamp() | struct<unix_timestamp(current_timestamp(), yyyy-MM-dd HH:mm:ss):bigint> |
 | org.apache.spark.sql.catalyst.expressions.Upper | ucase | SELECT ucase('SparkSql') | struct<ucase(SparkSql):string> |
 | org.apache.spark.sql.catalyst.expressions.Upper | upper | SELECT upper('SparkSql') | struct<upper(SparkSql):string> |
+| org.apache.spark.sql.catalyst.expressions.UrlDecode | url_decode | SELECT url_decode('https%3A%2F%2Fspark.apache.org') | struct<url_decode(https%3A%2F%2Fspark.apache.org):string> |
+| org.apache.spark.sql.catalyst.expressions.UrlEncode | url_encode | SELECT url_encode('https://spark.apache.org') | struct<url_encode(https://spark.apache.org):string> |
 | org.apache.spark.sql.catalyst.expressions.Uuid | uuid | SELECT uuid() | struct<uuid():string> |
 | org.apache.spark.sql.catalyst.expressions.WeekDay | weekday | SELECT weekday('2009-07-30') | struct<weekday(2009-07-30):int> |
 | org.apache.spark.sql.catalyst.expressions.WeekOfYear | weekofyear | SELECT weekofyear('2008-02-20') | struct<weekofyear(2008-02-20):int> |
diff --git a/sql/core/src/test/resources/sql-tests/inputs/url-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/url-functions.sql
new file mode 100644
index 00000000000..9f8af7eac7e
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/inputs/url-functions.sql
@@ -0,0 +1,19 @@
+-- parse_url function
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'HOST');
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'PATH');
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'QUERY');
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'REF');
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'PROTOCOL');
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'FILE');
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'AUTHORITY');
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'USERINFO');
+
+-- url_encode function
+select url_encode('https://spark.apache.org');
+select url_encode('inva lid://user:pass@host/file\\;param?query\\;p2');
+select url_encode(null);
+
+-- url_decode function
+select url_decode('https%3A%2F%2Fspark.apache.org');
+select url_decode('inva lid://user:pass@host/file\\;param?query\\;p2');
+select url_decode(null);
\ No newline at end of file
diff --git a/sql/core/src/test/resources/sql-tests/results/url-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/url-functions.sql.out
new file mode 100644
index 00000000000..fc714bfc41b
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/results/url-functions.sql.out
@@ -0,0 +1,111 @@
+-- Automatically generated by SQLQueryTestSuite
+-- !query
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'HOST')
+-- !query schema
+struct<parse_url(http://userinfo@spark.apache.org/path?query=1#Ref, HOST):string>
+-- !query output
+spark.apache.org
+
+
+-- !query
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'PATH')
+-- !query schema
+struct<parse_url(http://userinfo@spark.apache.org/path?query=1#Ref, PATH):string>
+-- !query output
+/path
+
+
+-- !query
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'QUERY')
+-- !query schema
+struct<parse_url(http://userinfo@spark.apache.org/path?query=1#Ref, QUERY):string>
+-- !query output
+query=1
+
+
+-- !query
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'REF')
+-- !query schema
+struct<parse_url(http://userinfo@spark.apache.org/path?query=1#Ref, REF):string>
+-- !query output
+Ref
+
+
+-- !query
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'PROTOCOL')
+-- !query schema
+struct<parse_url(http://userinfo@spark.apache.org/path?query=1#Ref, PROTOCOL):string>
+-- !query output
+http
+
+
+-- !query
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'FILE')
+-- !query schema
+struct<parse_url(http://userinfo@spark.apache.org/path?query=1#Ref, FILE):string>
+-- !query output
+/path?query=1
+
+
+-- !query
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'AUTHORITY')
+-- !query schema
+struct<parse_url(http://userinfo@spark.apache.org/path?query=1#Ref, AUTHORITY):string>
+-- !query output
+userinfo@spark.apache.org
+
+
+-- !query
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'USERINFO')
+-- !query schema
+struct<parse_url(http://userinfo@spark.apache.org/path?query=1#Ref, USERINFO):string>
+-- !query output
+userinfo
+
+
+-- !query
+select url_encode('https://spark.apache.org')
+-- !query schema
+struct<url_encode(https://spark.apache.org):string>
+-- !query output
+https%3A%2F%2Fspark.apache.org
+
+
+-- !query
+select url_encode('inva lid://user:pass@host/file\\;param?query\\;p2')
+-- !query schema
+struct<url_encode(inva lid://user:pass@host/file\;param?query\;p2):string>
+-- !query output
+inva+lid%3A%2F%2Fuser%3Apass%40host%2Ffile%5C%3Bparam%3Fquery%5C%3Bp2
+
+
+-- !query
+select url_encode(null)
+-- !query schema
+struct<url_encode(NULL):string>
+-- !query output
+NULL
+
+
+-- !query
+select url_decode('https%3A%2F%2Fspark.apache.org')
+-- !query schema
+struct<url_decode(https%3A%2F%2Fspark.apache.org):string>
+-- !query output
+https://spark.apache.org
+
+
+-- !query
+select url_decode('inva lid://user:pass@host/file\\;param?query\\;p2')
+-- !query schema
+struct<url_decode(inva lid://user:pass@host/file\;param?query\;p2):string>
+-- !query output
+inva lid://user:pass@host/file\;param?query\;p2
+
+
+-- !query
+select url_decode(null)
+-- !query schema
+struct<url_decode(NULL):string>
+-- !query output
+NULL
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 2f118f236e2..d07be9c1971 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -346,53 +346,6 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
       Row("???hi", "hi???", "h", "h"))
   }
 
-  test("string parse_url function") {
-
-    def testUrl(url: String, expected: Row): Unit = {
-      checkAnswer(Seq[String]((url)).toDF("url").selectExpr(
-        "parse_url(url, 'HOST')", "parse_url(url, 'PATH')",
-        "parse_url(url, 'QUERY')", "parse_url(url, 'REF')",
-        "parse_url(url, 'PROTOCOL')", "parse_url(url, 'FILE')",
-        "parse_url(url, 'AUTHORITY')", "parse_url(url, 'USERINFO')",
-        "parse_url(url, 'QUERY', 'query')"), expected)
-    }
-
-    testUrl(
-      "http://userinfo@spark.apache.org/path?query=1#Ref",
-      Row("spark.apache.org", "/path", "query=1", "Ref",
-        "http", "/path?query=1", "userinfo@spark.apache.org", "userinfo", "1"))
-
-    testUrl(
-      "https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two",
-      Row("example.com", "/dir%20/pa%20th.HTML", "query=x%20y&q2=2", "Ref%20two",
-        "https", "/dir%20/pa%20th.HTML?query=x%20y&q2=2", "use%20r:pas%20s@example.com",
-        "use%20r:pas%20s", "x%20y"))
-
-    testUrl(
-      "http://user:pass@host",
-      Row("host", "", null, null, "http", "", "user:pass@host", "user:pass", null))
-
-    testUrl(
-      "http://user:pass@host/",
-      Row("host", "/", null, null, "http", "/", "user:pass@host", "user:pass", null))
-
-    testUrl(
-      "http://user:pass@host/?#",
-      Row("host", "/", "", "", "http", "/?", "user:pass@host", "user:pass", null))
-
-    testUrl(
-      "http://user:pass@host/file;param?query;p2",
-      Row("host", "/file;param", "query;p2", null, "http", "/file;param?query;p2",
-        "user:pass@host", "user:pass", null))
-
-    withSQLConf(SQLConf.ANSI_ENABLED.key -> "false") {
-      testUrl(
-        "inva lid://user:pass@host/file;param?query;p2",
-        Row(null, null, null, null, null, null, null, null, null))
-    }
-
-  }
-
   test("string repeat function") {
     val df = Seq(("hi", 2)).toDF("a", "b")
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala
new file mode 100644
index 00000000000..85f0d70df7b
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.test.SharedSparkSession
+
+class UrlFunctionsSuite extends QueryTest with SharedSparkSession {
+  import testImplicits._
+
+  test("url parse_url function") {
+
+    def testUrl(url: String, expected: Row): Unit = {
+      checkAnswer(Seq[String]((url)).toDF("url").selectExpr(
+        "parse_url(url, 'HOST')", "parse_url(url, 'PATH')",
+        "parse_url(url, 'QUERY')", "parse_url(url, 'REF')",
+        "parse_url(url, 'PROTOCOL')", "parse_url(url, 'FILE')",
+        "parse_url(url, 'AUTHORITY')", "parse_url(url, 'USERINFO')",
+        "parse_url(url, 'QUERY', 'query')"), expected)
+    }
+
+    testUrl(
+      "http://userinfo@spark.apache.org/path?query=1#Ref",
+      Row("spark.apache.org", "/path", "query=1", "Ref",
+        "http", "/path?query=1", "userinfo@spark.apache.org", "userinfo", "1"))
+
+    testUrl(
+      "https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two",
+      Row("example.com", "/dir%20/pa%20th.HTML", "query=x%20y&q2=2", "Ref%20two",
+        "https", "/dir%20/pa%20th.HTML?query=x%20y&q2=2", "use%20r:pas%20s@example.com",
+        "use%20r:pas%20s", "x%20y"))
+
+    testUrl(
+      "http://user:pass@host",
+      Row("host", "", null, null, "http", "", "user:pass@host", "user:pass", null))
+
+    testUrl(
+      "http://user:pass@host/",
+      Row("host", "/", null, null, "http", "/", "user:pass@host", "user:pass", null))
+
+    testUrl(
+      "http://user:pass@host/?#",
+      Row("host", "/", "", "", "http", "/?", "user:pass@host", "user:pass", null))
+
+    testUrl(
+      "http://user:pass@host/file;param?query;p2",
+      Row("host", "/file;param", "query;p2", null, "http", "/file;param?query;p2",
+        "user:pass@host", "user:pass", null))
+
+    withSQLConf(SQLConf.ANSI_ENABLED.key -> "false") {
+      testUrl(
+        "inva lid://user:pass@host/file;param?query;p2",
+        Row(null, null, null, null, null, null, null, null, null))
+    }
+  }
+
+  test("url encode/decode function") {
+    def testUrl(url: String, fn: String, expected: Row): Unit = {
+      checkAnswer(Seq[String]((url)).toDF("url")
+        .selectExpr(s"$fn(url)"), expected)
+    }
+
+    testUrl("https://spark.apache.org", "url_encode", Row("https%3A%2F%2Fspark.apache.org"))
+    testUrl("null", "url_encode", Row("null"))
+
+    testUrl("https%3A%2F%2Fspark.apache.org", "url_decode", Row("https://spark.apache.org"))
+    testUrl("null", "url_decode", Row("null"))
+  }
+
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org