Posted to commits@spark.apache.org by we...@apache.org on 2022/07/15 04:31:44 UTC
[spark] branch master updated: [SPARK-39741][SQL] Support url encode/decode as built-in function and tidy up url-related functions
This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new e5c1b822016 [SPARK-39741][SQL] Support url encode/decode as built-in function and tidy up url-related functions
e5c1b822016 is described below
commit e5c1b822016600e77fabcdf145ecb3ba93c692b3
Author: Yikf <yi...@gmail.com>
AuthorDate: Fri Jul 15 12:31:23 2022 +0800
[SPARK-39741][SQL] Support url encode/decode as built-in function and tidy up url-related functions
### What changes were proposed in this pull request?
Currently, Spark does not support URL encode/decode as built-in functions; users have to fall back on `reflect` instead, which is a hassle, and these functions are often useful.
This PR makes two changes:
- add url encode/decode built-in function support (see the usage sketch below).
- tidy up the url-related functions into one Scala file.
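For illustration, here is the workflow before and after this change (a rough sketch, assuming a running `SparkSession` named `spark`; `reflect` works here because `java.net.URLEncoder.encode` is a static method):

    // Workaround available before this PR: reflect into the JDK codec.
    spark.sql(
      "SELECT reflect('java.net.URLEncoder', 'encode', 'https://spark.apache.org', 'UTF-8')"
    ).show(false)
    // prints: https%3A%2F%2Fspark.apache.org

    // With this PR: the same result via the new built-ins.
    spark.sql("SELECT url_encode('https://spark.apache.org')").show(false)
    // prints: https%3A%2F%2Fspark.apache.org
    spark.sql("SELECT url_decode('https%3A%2F%2Fspark.apache.org')").show(false)
    // prints: https://spark.apache.org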
### Why are the changes needed?
URL encode/decode functions are commonly needed, and users should not have to go through `reflect` to get them.
### Does this PR introduce _any_ user-facing change?
Yes, this adds `url_encode` and `url_decode` as new built-in functions.
### How was this patch tested?
Added new tests: golden-file SQL tests (`url-functions.sql`) and a new `UrlFunctionsSuite`.
Closes #37113 from yikf/url.
Authored-by: Yikf <yi...@gmail.com>
Signed-off-by: Wenchen Fan <we...@databricks.com>
---
.../sql/catalyst/expressions/ExpressionInfo.java | 2 +-
.../sql/catalyst/analysis/FunctionRegistry.scala | 6 +-
.../catalyst/expressions/stringExpressions.scala | 177 -------------
.../sql/catalyst/expressions/urlExpressions.scala | 290 +++++++++++++++++++++
.../sql-functions/sql-expression-schema.md | 2 +
.../resources/sql-tests/inputs/url-functions.sql | 19 ++
.../sql-tests/results/url-functions.sql.out | 111 ++++++++
.../apache/spark/sql/StringFunctionsSuite.scala | 47 ----
.../org/apache/spark/sql/UrlFunctionsSuite.scala | 85 ++++++
9 files changed, 513 insertions(+), 226 deletions(-)
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java
index 9ed764a3485..be2b3dbe819 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionInfo.java
@@ -45,7 +45,7 @@ public class ExpressionInfo {
"collection_funcs", "predicate_funcs", "conditional_funcs", "conversion_funcs",
"csv_funcs", "datetime_funcs", "generator_funcs", "hash_funcs", "json_funcs",
"lambda_funcs", "map_funcs", "math_funcs", "misc_funcs", "string_funcs", "struct_funcs",
- "window_funcs", "xml_funcs", "table_funcs"));
+ "window_funcs", "xml_funcs", "table_funcs", "url_funcs"));
private static final Set<String> validSources =
new HashSet<>(Arrays.asList("built-in", "hive", "python_udf", "scala_udf", "java_udf"));
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 088d85034db..d97b344d166 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -544,7 +544,6 @@ object FunctionRegistry {
expressionBuilder("lpad", LPadExpressionBuilder),
expression[StringTrimLeft]("ltrim"),
expression[JsonTuple]("json_tuple"),
- expression[ParseUrl]("parse_url"),
expression[StringLocate]("position", true),
expression[FormatString]("printf", true),
expression[RegExpExtract]("regexp_extract"),
@@ -588,6 +587,11 @@ object FunctionRegistry {
expression[RegExpSubStr]("regexp_substr"),
expression[RegExpInStr]("regexp_instr"),
+ // url functions
+ expression[UrlEncode]("url_encode"),
+ expression[UrlDecode]("url_decode"),
+ expression[ParseUrl]("parse_url"),
+
// datetime functions
expression[AddMonths]("add_months"),
expression[CurrentDate]("current_date"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index a4c5af582fa..bc24a12f083 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -17,11 +17,9 @@
package org.apache.spark.sql.catalyst.expressions
-import java.net.{URI, URISyntaxException}
import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols}
import java.util.{Base64 => JBase64}
import java.util.{HashMap, Locale, Map => JMap}
-import java.util.regex.Pattern
import scala.collection.mutable.ArrayBuffer
@@ -1626,181 +1624,6 @@ case class StringRPad(str: Expression, len: Expression, pad: Expression = Litera
copy(str = newFirst, len = newSecond, pad = newThird)
}
-object ParseUrl {
- private val HOST = UTF8String.fromString("HOST")
- private val PATH = UTF8String.fromString("PATH")
- private val QUERY = UTF8String.fromString("QUERY")
- private val REF = UTF8String.fromString("REF")
- private val PROTOCOL = UTF8String.fromString("PROTOCOL")
- private val FILE = UTF8String.fromString("FILE")
- private val AUTHORITY = UTF8String.fromString("AUTHORITY")
- private val USERINFO = UTF8String.fromString("USERINFO")
- private val REGEXPREFIX = "(&|^)"
- private val REGEXSUBFIX = "=([^&]*)"
-}
-
-/**
- * Extracts a part from a URL
- */
-@ExpressionDescription(
- usage = "_FUNC_(url, partToExtract[, key]) - Extracts a part from a URL.",
- examples = """
- Examples:
- > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'HOST');
- spark.apache.org
- > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY');
- query=1
- > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY', 'query');
- 1
- """,
- since = "2.0.0",
- group = "string_funcs")
-case class ParseUrl(children: Seq[Expression], failOnError: Boolean = SQLConf.get.ansiEnabled)
- extends Expression with ExpectsInputTypes with CodegenFallback {
- def this(children: Seq[Expression]) = this(children, SQLConf.get.ansiEnabled)
-
- override def nullable: Boolean = true
- override def inputTypes: Seq[DataType] = Seq.fill(children.size)(StringType)
- override def dataType: DataType = StringType
- override def prettyName: String = "parse_url"
-
- // If the url is a constant, cache the URL object so that we don't need to convert url
- // from UTF8String to String to URL for every row.
- @transient private lazy val cachedUrl = children(0) match {
- case Literal(url: UTF8String, _) if url ne null => getUrl(url)
- case _ => null
- }
-
- // If the key is a constant, cache the Pattern object so that we don't need to convert key
- // from UTF8String to String to StringBuilder to String to Pattern for every row.
- @transient private lazy val cachedPattern = children(2) match {
- case Literal(key: UTF8String, _) if key ne null => getPattern(key)
- case _ => null
- }
-
- // If the partToExtract is a constant, cache the Extract part function so that we don't need
- // to check the partToExtract for every row.
- @transient private lazy val cachedExtractPartFunc = children(1) match {
- case Literal(part: UTF8String, _) => getExtractPartFunc(part)
- case _ => null
- }
-
- import ParseUrl._
-
- override def checkInputDataTypes(): TypeCheckResult = {
- if (children.size > 3 || children.size < 2) {
- TypeCheckResult.TypeCheckFailure(s"$prettyName function requires two or three arguments")
- } else {
- super[ExpectsInputTypes].checkInputDataTypes()
- }
- }
-
- private def getPattern(key: UTF8String): Pattern = {
- Pattern.compile(REGEXPREFIX + key.toString + REGEXSUBFIX)
- }
-
- private def getUrl(url: UTF8String): URI = {
- try {
- new URI(url.toString)
- } catch {
- case e: URISyntaxException if failOnError =>
- throw QueryExecutionErrors.invalidUrlError(url, e)
- case _: URISyntaxException => null
- }
- }
-
- private def getExtractPartFunc(partToExtract: UTF8String): URI => String = {
-
- // partToExtract match {
- // case HOST => _.toURL().getHost
- // case PATH => _.toURL().getPath
- // case QUERY => _.toURL().getQuery
- // case REF => _.toURL().getRef
- // case PROTOCOL => _.toURL().getProtocol
- // case FILE => _.toURL().getFile
- // case AUTHORITY => _.toURL().getAuthority
- // case USERINFO => _.toURL().getUserInfo
- // case _ => (url: URI) => null
- // }
-
- partToExtract match {
- case HOST => _.getHost
- case PATH => _.getRawPath
- case QUERY => _.getRawQuery
- case REF => _.getRawFragment
- case PROTOCOL => _.getScheme
- case FILE =>
- (url: URI) =>
- if (url.getRawQuery ne null) {
- url.getRawPath + "?" + url.getRawQuery
- } else {
- url.getRawPath
- }
- case AUTHORITY => _.getRawAuthority
- case USERINFO => _.getRawUserInfo
- case _ => (url: URI) => null
- }
- }
-
- private def extractValueFromQuery(query: UTF8String, pattern: Pattern): UTF8String = {
- val m = pattern.matcher(query.toString)
- if (m.find()) {
- UTF8String.fromString(m.group(2))
- } else {
- null
- }
- }
-
- private def extractFromUrl(url: URI, partToExtract: UTF8String): UTF8String = {
- if (cachedExtractPartFunc ne null) {
- UTF8String.fromString(cachedExtractPartFunc.apply(url))
- } else {
- UTF8String.fromString(getExtractPartFunc(partToExtract).apply(url))
- }
- }
-
- private def parseUrlWithoutKey(url: UTF8String, partToExtract: UTF8String): UTF8String = {
- if (cachedUrl ne null) {
- extractFromUrl(cachedUrl, partToExtract)
- } else {
- val currentUrl = getUrl(url)
- if (currentUrl ne null) {
- extractFromUrl(currentUrl, partToExtract)
- } else {
- null
- }
- }
- }
-
- override def eval(input: InternalRow): Any = {
- val evaluated = children.map{e => e.eval(input).asInstanceOf[UTF8String]}
- if (evaluated.contains(null)) return null
- if (evaluated.size == 2) {
- parseUrlWithoutKey(evaluated(0), evaluated(1))
- } else {
- // 3-arg, i.e. QUERY with key
- assert(evaluated.size == 3)
- if (evaluated(1) != QUERY) {
- return null
- }
-
- val query = parseUrlWithoutKey(evaluated(0), evaluated(1))
- if (query eq null) {
- return null
- }
-
- if (cachedPattern ne null) {
- extractValueFromQuery(query, cachedPattern)
- } else {
- extractValueFromQuery(query, getPattern(evaluated(2)))
- }
- }
- }
-
- override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): ParseUrl =
- copy(children = newChildren)
-}
-
/**
* Returns the input formatted according do printf-style format strings
*/
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
new file mode 100644
index 00000000000..174e60371af
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
@@ -0,0 +1,290 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import java.net.{URI, URISyntaxException, URLDecoder, URLEncoder}
+import java.util.regex.Pattern
+
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke
+import org.apache.spark.sql.catalyst.trees.UnaryLike
+import org.apache.spark.sql.errors.QueryExecutionErrors
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.{AbstractDataType, DataType, StringType}
+import org.apache.spark.unsafe.types.UTF8String
+
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = """
+ _FUNC_(str) - Translates a string into 'application/x-www-form-urlencoded' format using a specific encoding scheme.
+ """,
+ arguments = """
+ Arguments:
+ str - a string expression to be translated
+ """,
+ examples = """
+ Examples:
+ > SELECT _FUNC_('https://spark.apache.org');
+ https%3A%2F%2Fspark.apache.org
+ """,
+ since = "3.4.0",
+ group = "url_funcs")
+// scalastyle:on line.size.limit
+case class UrlEncode(child: Expression)
+ extends RuntimeReplaceable with UnaryLike[Expression] with ImplicitCastInputTypes {
+
+ override def replacement: Expression =
+ StaticInvoke(
+ UrlCodec.getClass,
+ StringType,
+ "encode",
+ Seq(child, Literal("UTF-8")),
+ Seq(StringType))
+
+ override protected def withNewChildInternal(newChild: Expression): Expression = {
+ copy(child = newChild)
+ }
+
+ override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+
+ override def prettyName: String = "url_encode"
+}
+
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = """
+ _FUNC_(str) - Decodes a `str` in 'application/x-www-form-urlencoded' format using a specific encoding scheme.
+ """,
+ arguments = """
+ Arguments:
+ * str - a string expression to decode
+ """,
+ examples = """
+ Examples:
+ > SELECT _FUNC_('https%3A%2F%2Fspark.apache.org');
+ https://spark.apache.org
+ """,
+ since = "3.4.0",
+ group = "url_funcs")
+// scalastyle:on line.size.limit
+case class UrlDecode(child: Expression)
+ extends RuntimeReplaceable with UnaryLike[Expression] with ImplicitCastInputTypes {
+
+ override def replacement: Expression =
+ StaticInvoke(
+ UrlCodec.getClass,
+ StringType,
+ "decode",
+ Seq(child, Literal("UTF-8")),
+ Seq(StringType))
+
+ override protected def withNewChildInternal(newChild: Expression): Expression = {
+ copy(child = newChild)
+ }
+
+ override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+
+ override def prettyName: String = "url_decode"
+}
+
+object UrlCodec {
+ def encode(src: UTF8String, enc: UTF8String): UTF8String = {
+ UTF8String.fromString(URLEncoder.encode(src.toString, enc.toString))
+ }
+
+ def decode(src: UTF8String, enc: UTF8String): UTF8String = {
+ UTF8String.fromString(URLDecoder.decode(src.toString, enc.toString))
+ }
+}
+
+object ParseUrl {
+ private val HOST = UTF8String.fromString("HOST")
+ private val PATH = UTF8String.fromString("PATH")
+ private val QUERY = UTF8String.fromString("QUERY")
+ private val REF = UTF8String.fromString("REF")
+ private val PROTOCOL = UTF8String.fromString("PROTOCOL")
+ private val FILE = UTF8String.fromString("FILE")
+ private val AUTHORITY = UTF8String.fromString("AUTHORITY")
+ private val USERINFO = UTF8String.fromString("USERINFO")
+ private val REGEXPREFIX = "(&|^)"
+ private val REGEXSUBFIX = "=([^&]*)"
+}
+
+/**
+ * Extracts a part from a URL
+ */
+@ExpressionDescription(
+ usage = "_FUNC_(url, partToExtract[, key]) - Extracts a part from a URL.",
+ examples = """
+ Examples:
+ > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'HOST');
+ spark.apache.org
+ > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY');
+ query=1
+ > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY', 'query');
+ 1
+ """,
+ since = "2.0.0",
+ group = "url_funcs")
+case class ParseUrl(children: Seq[Expression], failOnError: Boolean = SQLConf.get.ansiEnabled)
+ extends Expression with ExpectsInputTypes with CodegenFallback {
+ def this(children: Seq[Expression]) = this(children, SQLConf.get.ansiEnabled)
+
+ override def nullable: Boolean = true
+ override def inputTypes: Seq[DataType] = Seq.fill(children.size)(StringType)
+ override def dataType: DataType = StringType
+ override def prettyName: String = "parse_url"
+
+ // If the url is a constant, cache the URL object so that we don't need to convert url
+ // from UTF8String to String to URL for every row.
+ @transient private lazy val cachedUrl = children(0) match {
+ case Literal(url: UTF8String, _) if url ne null => getUrl(url)
+ case _ => null
+ }
+
+ // If the key is a constant, cache the Pattern object so that we don't need to convert key
+ // from UTF8String to String to StringBuilder to String to Pattern for every row.
+ @transient private lazy val cachedPattern = children(2) match {
+ case Literal(key: UTF8String, _) if key ne null => getPattern(key)
+ case _ => null
+ }
+
+ // If the partToExtract is a constant, cache the Extract part function so that we don't need
+ // to check the partToExtract for every row.
+ @transient private lazy val cachedExtractPartFunc = children(1) match {
+ case Literal(part: UTF8String, _) => getExtractPartFunc(part)
+ case _ => null
+ }
+
+ import ParseUrl._
+
+ override def checkInputDataTypes(): TypeCheckResult = {
+ if (children.size > 3 || children.size < 2) {
+ TypeCheckResult.TypeCheckFailure(s"$prettyName function requires two or three arguments")
+ } else {
+ super[ExpectsInputTypes].checkInputDataTypes()
+ }
+ }
+
+ private def getPattern(key: UTF8String): Pattern = {
+ Pattern.compile(REGEXPREFIX + key.toString + REGEXSUBFIX)
+ }
+
+ private def getUrl(url: UTF8String): URI = {
+ try {
+ new URI(url.toString)
+ } catch {
+ case e: URISyntaxException if failOnError =>
+ throw QueryExecutionErrors.invalidUrlError(url, e)
+ case _: URISyntaxException => null
+ }
+ }
+
+ private def getExtractPartFunc(partToExtract: UTF8String): URI => String = {
+
+ // partToExtract match {
+ // case HOST => _.toURL().getHost
+ // case PATH => _.toURL().getPath
+ // case QUERY => _.toURL().getQuery
+ // case REF => _.toURL().getRef
+ // case PROTOCOL => _.toURL().getProtocol
+ // case FILE => _.toURL().getFile
+ // case AUTHORITY => _.toURL().getAuthority
+ // case USERINFO => _.toURL().getUserInfo
+ // case _ => (url: URI) => null
+ // }
+
+ partToExtract match {
+ case HOST => _.getHost
+ case PATH => _.getRawPath
+ case QUERY => _.getRawQuery
+ case REF => _.getRawFragment
+ case PROTOCOL => _.getScheme
+ case FILE =>
+ (url: URI) =>
+ if (url.getRawQuery ne null) {
+ url.getRawPath + "?" + url.getRawQuery
+ } else {
+ url.getRawPath
+ }
+ case AUTHORITY => _.getRawAuthority
+ case USERINFO => _.getRawUserInfo
+ case _ => (url: URI) => null
+ }
+ }
+
+ private def extractValueFromQuery(query: UTF8String, pattern: Pattern): UTF8String = {
+ val m = pattern.matcher(query.toString)
+ if (m.find()) {
+ UTF8String.fromString(m.group(2))
+ } else {
+ null
+ }
+ }
+
+ private def extractFromUrl(url: URI, partToExtract: UTF8String): UTF8String = {
+ if (cachedExtractPartFunc ne null) {
+ UTF8String.fromString(cachedExtractPartFunc.apply(url))
+ } else {
+ UTF8String.fromString(getExtractPartFunc(partToExtract).apply(url))
+ }
+ }
+
+ private def parseUrlWithoutKey(url: UTF8String, partToExtract: UTF8String): UTF8String = {
+ if (cachedUrl ne null) {
+ extractFromUrl(cachedUrl, partToExtract)
+ } else {
+ val currentUrl = getUrl(url)
+ if (currentUrl ne null) {
+ extractFromUrl(currentUrl, partToExtract)
+ } else {
+ null
+ }
+ }
+ }
+
+ override def eval(input: InternalRow): Any = {
+ val evaluated = children.map{e => e.eval(input).asInstanceOf[UTF8String]}
+ if (evaluated.contains(null)) return null
+ if (evaluated.size == 2) {
+ parseUrlWithoutKey(evaluated(0), evaluated(1))
+ } else {
+ // 3-arg, i.e. QUERY with key
+ assert(evaluated.size == 3)
+ if (evaluated(1) != QUERY) {
+ return null
+ }
+
+ val query = parseUrlWithoutKey(evaluated(0), evaluated(1))
+ if (query eq null) {
+ return null
+ }
+
+ if (cachedPattern ne null) {
+ extractValueFromQuery(query, cachedPattern)
+ } else {
+ extractValueFromQuery(query, getPattern(evaluated(2)))
+ }
+ }
+ }
+
+ override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): ParseUrl =
+ copy(children = newChildren)
+}
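A note on the new file above: `UrlEncode` and `UrlDecode` are `RuntimeReplaceable`, so the optimizer rewrites them into a `StaticInvoke` of `UrlCodec`, which simply delegates to the JDK codecs. Those codecs implement 'application/x-www-form-urlencoded', where a space encodes as '+' rather than '%20'. A minimal standalone sketch of the runtime behavior (plain Scala against the JDK API, no Catalyst involved):

    import java.net.{URLDecoder, URLEncoder}

    object UrlCodecSketch {
      def main(args: Array[String]): Unit = {
        // ':' and '/' are percent-encoded, matching the ExpressionDescription examples.
        println(URLEncoder.encode("https://spark.apache.org", "UTF-8"))
        // -> https%3A%2F%2Fspark.apache.org

        // Form encoding maps a space to '+', not '%20'.
        println(URLEncoder.encode("inva lid", "UTF-8")) // -> inva+lid

        // Decoding reverses both '%XX' escapes and '+'.
        println(URLDecoder.decode("inva+lid", "UTF-8")) // -> inva lid
      }
    }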
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index 907c62b4ee0..78a9ce7c386 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -332,6 +332,8 @@
| org.apache.spark.sql.catalyst.expressions.UnixTimestamp | unix_timestamp | SELECT unix_timestamp() | struct<unix_timestamp(current_timestamp(), yyyy-MM-dd HH:mm:ss):bigint> |
| org.apache.spark.sql.catalyst.expressions.Upper | ucase | SELECT ucase('SparkSql') | struct<ucase(SparkSql):string> |
| org.apache.spark.sql.catalyst.expressions.Upper | upper | SELECT upper('SparkSql') | struct<upper(SparkSql):string> |
+| org.apache.spark.sql.catalyst.expressions.UrlDecode | url_decode | SELECT url_decode('https%3A%2F%2Fspark.apache.org') | struct<url_decode(https%3A%2F%2Fspark.apache.org):string> |
+| org.apache.spark.sql.catalyst.expressions.UrlEncode | url_encode | SELECT url_encode('https://spark.apache.org') | struct<url_encode(https://spark.apache.org):string> |
| org.apache.spark.sql.catalyst.expressions.Uuid | uuid | SELECT uuid() | struct<uuid():string> |
| org.apache.spark.sql.catalyst.expressions.WeekDay | weekday | SELECT weekday('2009-07-30') | struct<weekday(2009-07-30):int> |
| org.apache.spark.sql.catalyst.expressions.WeekOfYear | weekofyear | SELECT weekofyear('2008-02-20') | struct<weekofyear(2008-02-20):int> |
diff --git a/sql/core/src/test/resources/sql-tests/inputs/url-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/url-functions.sql
new file mode 100644
index 00000000000..9f8af7eac7e
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/inputs/url-functions.sql
@@ -0,0 +1,19 @@
+-- parse_url function
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'HOST');
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'PATH');
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'QUERY');
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'REF');
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'PROTOCOL');
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'FILE');
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'AUTHORITY');
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'USERINFO');
+
+-- url_encode function
+select url_encode('https://spark.apache.org');
+select url_encode('inva lid://user:pass@host/file\\;param?query\\;p2');
+select url_encode(null);
+
+-- url_decode function
+select url_decode('https%3A%2F%2Fspark.apache.org');
+select url_decode('inva lid://user:pass@host/file\\;param?query\\;p2');
+select url_decode(null);
\ No newline at end of file
diff --git a/sql/core/src/test/resources/sql-tests/results/url-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/url-functions.sql.out
new file mode 100644
index 00000000000..fc714bfc41b
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/results/url-functions.sql.out
@@ -0,0 +1,111 @@
+-- Automatically generated by SQLQueryTestSuite
+-- !query
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'HOST')
+-- !query schema
+struct<parse_url(http://userinfo@spark.apache.org/path?query=1#Ref, HOST):string>
+-- !query output
+spark.apache.org
+
+
+-- !query
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'PATH')
+-- !query schema
+struct<parse_url(http://userinfo@spark.apache.org/path?query=1#Ref, PATH):string>
+-- !query output
+/path
+
+
+-- !query
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'QUERY')
+-- !query schema
+struct<parse_url(http://userinfo@spark.apache.org/path?query=1#Ref, QUERY):string>
+-- !query output
+query=1
+
+
+-- !query
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'REF')
+-- !query schema
+struct<parse_url(http://userinfo@spark.apache.org/path?query=1#Ref, REF):string>
+-- !query output
+Ref
+
+
+-- !query
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'PROTOCOL')
+-- !query schema
+struct<parse_url(http://userinfo@spark.apache.org/path?query=1#Ref, PROTOCOL):string>
+-- !query output
+http
+
+
+-- !query
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'FILE')
+-- !query schema
+struct<parse_url(http://userinfo@spark.apache.org/path?query=1#Ref, FILE):string>
+-- !query output
+/path?query=1
+
+
+-- !query
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'AUTHORITY')
+-- !query schema
+struct<parse_url(http://userinfo@spark.apache.org/path?query=1#Ref, AUTHORITY):string>
+-- !query output
+userinfo@spark.apache.org
+
+
+-- !query
+select parse_url('http://userinfo@spark.apache.org/path?query=1#Ref', 'USERINFO')
+-- !query schema
+struct<parse_url(http://userinfo@spark.apache.org/path?query=1#Ref, USERINFO):string>
+-- !query output
+userinfo
+
+
+-- !query
+select url_encode('https://spark.apache.org')
+-- !query schema
+struct<url_encode(https://spark.apache.org):string>
+-- !query output
+https%3A%2F%2Fspark.apache.org
+
+
+-- !query
+select url_encode('inva lid://user:pass@host/file\\;param?query\\;p2')
+-- !query schema
+struct<url_encode(inva lid://user:pass@host/file\;param?query\;p2):string>
+-- !query output
+inva+lid%3A%2F%2Fuser%3Apass%40host%2Ffile%5C%3Bparam%3Fquery%5C%3Bp2
+
+
+-- !query
+select url_encode(null)
+-- !query schema
+struct<url_encode(NULL):string>
+-- !query output
+NULL
+
+
+-- !query
+select url_decode('https%3A%2F%2Fspark.apache.org')
+-- !query schema
+struct<url_decode(https%3A%2F%2Fspark.apache.org):string>
+-- !query output
+https://spark.apache.org
+
+
+-- !query
+select url_decode('inva lid://user:pass@host/file\\;param?query\\;p2')
+-- !query schema
+struct<url_decode(inva lid://user:pass@host/file\;param?query\;p2):string>
+-- !query output
+inva lid://user:pass@host/file\;param?query\;p2
+
+
+-- !query
+select url_decode(null)
+-- !query schema
+struct<url_decode(NULL):string>
+-- !query output
+NULL
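The golden results above also exercise the expected round-trip property: decoding an encoded string returns the original. A quick standalone check (a sketch using the same JDK calls the built-ins delegate to; the Scala literals spell `\\;` to produce a literal backslash-semicolon, mirroring the SQL inputs):

    import java.net.{URLDecoder, URLEncoder}

    val inputs = Seq(
      "https://spark.apache.org",
      "inva lid://user:pass@host/file\\;param?query\\;p2")

    inputs.foreach { s =>
      val encoded = URLEncoder.encode(s, "UTF-8")
      // decode(encode(s)) should give back s for any UTF-8 string.
      assert(URLDecoder.decode(encoded, "UTF-8") == s)
    }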
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 2f118f236e2..d07be9c1971 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -346,53 +346,6 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
Row("???hi", "hi???", "h", "h"))
}
- test("string parse_url function") {
-
- def testUrl(url: String, expected: Row): Unit = {
- checkAnswer(Seq[String]((url)).toDF("url").selectExpr(
- "parse_url(url, 'HOST')", "parse_url(url, 'PATH')",
- "parse_url(url, 'QUERY')", "parse_url(url, 'REF')",
- "parse_url(url, 'PROTOCOL')", "parse_url(url, 'FILE')",
- "parse_url(url, 'AUTHORITY')", "parse_url(url, 'USERINFO')",
- "parse_url(url, 'QUERY', 'query')"), expected)
- }
-
- testUrl(
- "http://userinfo@spark.apache.org/path?query=1#Ref",
- Row("spark.apache.org", "/path", "query=1", "Ref",
- "http", "/path?query=1", "userinfo@spark.apache.org", "userinfo", "1"))
-
- testUrl(
- "https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two",
- Row("example.com", "/dir%20/pa%20th.HTML", "query=x%20y&q2=2", "Ref%20two",
- "https", "/dir%20/pa%20th.HTML?query=x%20y&q2=2", "use%20r:pas%20s@example.com",
- "use%20r:pas%20s", "x%20y"))
-
- testUrl(
- "http://user:pass@host",
- Row("host", "", null, null, "http", "", "user:pass@host", "user:pass", null))
-
- testUrl(
- "http://user:pass@host/",
- Row("host", "/", null, null, "http", "/", "user:pass@host", "user:pass", null))
-
- testUrl(
- "http://user:pass@host/?#",
- Row("host", "/", "", "", "http", "/?", "user:pass@host", "user:pass", null))
-
- testUrl(
- "http://user:pass@host/file;param?query;p2",
- Row("host", "/file;param", "query;p2", null, "http", "/file;param?query;p2",
- "user:pass@host", "user:pass", null))
-
- withSQLConf(SQLConf.ANSI_ENABLED.key -> "false") {
- testUrl(
- "inva lid://user:pass@host/file;param?query;p2",
- Row(null, null, null, null, null, null, null, null, null))
- }
-
- }
-
test("string repeat function") {
val df = Seq(("hi", 2)).toDF("a", "b")
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala
new file mode 100644
index 00000000000..85f0d70df7b
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.test.SharedSparkSession
+
+class UrlFunctionsSuite extends QueryTest with SharedSparkSession {
+ import testImplicits._
+
+ test("url parse_url function") {
+
+ def testUrl(url: String, expected: Row): Unit = {
+ checkAnswer(Seq[String]((url)).toDF("url").selectExpr(
+ "parse_url(url, 'HOST')", "parse_url(url, 'PATH')",
+ "parse_url(url, 'QUERY')", "parse_url(url, 'REF')",
+ "parse_url(url, 'PROTOCOL')", "parse_url(url, 'FILE')",
+ "parse_url(url, 'AUTHORITY')", "parse_url(url, 'USERINFO')",
+ "parse_url(url, 'QUERY', 'query')"), expected)
+ }
+
+ testUrl(
+ "http://userinfo@spark.apache.org/path?query=1#Ref",
+ Row("spark.apache.org", "/path", "query=1", "Ref",
+ "http", "/path?query=1", "userinfo@spark.apache.org", "userinfo", "1"))
+
+ testUrl(
+ "https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two",
+ Row("example.com", "/dir%20/pa%20th.HTML", "query=x%20y&q2=2", "Ref%20two",
+ "https", "/dir%20/pa%20th.HTML?query=x%20y&q2=2", "use%20r:pas%20s@example.com",
+ "use%20r:pas%20s", "x%20y"))
+
+ testUrl(
+ "http://user:pass@host",
+ Row("host", "", null, null, "http", "", "user:pass@host", "user:pass", null))
+
+ testUrl(
+ "http://user:pass@host/",
+ Row("host", "/", null, null, "http", "/", "user:pass@host", "user:pass", null))
+
+ testUrl(
+ "http://user:pass@host/?#",
+ Row("host", "/", "", "", "http", "/?", "user:pass@host", "user:pass", null))
+
+ testUrl(
+ "http://user:pass@host/file;param?query;p2",
+ Row("host", "/file;param", "query;p2", null, "http", "/file;param?query;p2",
+ "user:pass@host", "user:pass", null))
+
+ withSQLConf(SQLConf.ANSI_ENABLED.key -> "false") {
+ testUrl(
+ "inva lid://user:pass@host/file;param?query;p2",
+ Row(null, null, null, null, null, null, null, null, null))
+ }
+ }
+
+ test("url encode/decode function") {
+ def testUrl(url: String, fn: String, expected: Row): Unit = {
+ checkAnswer(Seq[String]((url)).toDF("url")
+ .selectExpr(s"$fn(url)"), expected)
+ }
+
+ testUrl("https://spark.apache.org", "url_encode", Row("https%3A%2F%2Fspark.apache.org"))
+ testUrl("null", "url_encode", Row("null"))
+
+ testUrl("https%3A%2F%2Fspark.apache.org", "url_decode", Row("https://spark.apache.org"))
+ testUrl("null", "url_decode", Row("null"))
+ }
+
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org