You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by li...@apache.org on 2017/06/16 06:07:02 UTC
spark git commit: [SPARK-20749][SQL] Built-in SQL Function Support -
all variants of LEN[GTH]
Repository: spark
Updated Branches:
refs/heads/master 87ab0cec6 -> 7a3e5dc28
[SPARK-20749][SQL] Built-in SQL Function Support - all variants of LEN[GTH]
## What changes were proposed in this pull request?
This PR adds the built-in SQL functions `BIT_LENGTH()`, `CHAR_LENGTH()`, and `OCTET_LENGTH()`.
`BIT_LENGTH()` returns the bit length of the given string or binary expression.
`CHAR_LENGTH()` returns the character length of the given string expression, or the number of bytes of the given binary expression (i.e., it is equivalent to `LENGTH()`).
`OCTET_LENGTH()` returns the byte length of the given string or binary expression.
## How was this patch tested?
Added new test cases for these three functions to `StringExpressionsSuite` and to the `operators.sql` query tests.
Author: Kazuaki Ishizaki <is...@jp.ibm.com>
Closes #18046 from kiszk/SPARK-20749.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7a3e5dc2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7a3e5dc2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7a3e5dc2
Branch: refs/heads/master
Commit: 7a3e5dc28b67ac1630c5a578a27a5a5acf80aa51
Parents: 87ab0ce
Author: Kazuaki Ishizaki <is...@jp.ibm.com>
Authored: Thu Jun 15 23:06:58 2017 -0700
Committer: Xiao Li <ga...@gmail.com>
Committed: Thu Jun 15 23:06:58 2017 -0700
----------------------------------------------------------------------
.../catalyst/analysis/FunctionRegistry.scala | 3 +
.../expressions/stringExpressions.scala | 61 +++++++++++++++++++-
.../expressions/StringExpressionsSuite.scala | 20 +++++++
.../resources/sql-tests/inputs/operators.sql | 5 ++
.../sql-tests/results/operators.sql.out | 26 ++++++++-
5 files changed, 112 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/7a3e5dc2/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 8773281..e4e9918 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -305,6 +305,8 @@ object FunctionRegistry {
expression[Chr]("char"),
expression[Chr]("chr"),
expression[Base64]("base64"),
+ expression[BitLength]("bit_length"),
+ expression[Length]("char_length"),
expression[Concat]("concat"),
expression[ConcatWs]("concat_ws"),
expression[Decode]("decode"),
@@ -321,6 +323,7 @@ object FunctionRegistry {
expression[Levenshtein]("levenshtein"),
expression[Like]("like"),
expression[Lower]("lower"),
+ expression[OctetLength]("octet_length"),
expression[StringLocate]("locate"),
expression[StringLPad]("lpad"),
expression[StringTrimLeft]("ltrim"),
http://git-wip-us.apache.org/repos/asf/spark/blob/7a3e5dc2/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
old mode 100644
new mode 100755
index 717ada2..908fdb8
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -1199,15 +1199,18 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
}
/**
- * A function that return the length of the given string or binary expression.
+ * A function that returns the char length of the given string expression or
+ * number of bytes of the given binary expression.
*/
+// scalastyle:off line.size.limit
@ExpressionDescription(
- usage = "_FUNC_(expr) - Returns the length of `expr` or number of bytes in binary data.",
+ usage = "_FUNC_(expr) - Returns the character length of `expr` or number of bytes in binary data.",
extended = """
Examples:
> SELECT _FUNC_('Spark SQL');
9
""")
+// scalastyle:on line.size.limit
case class Length(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
override def dataType: DataType = IntegerType
override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType))
@@ -1226,6 +1229,60 @@ case class Length(child: Expression) extends UnaryExpression with ImplicitCastIn
}
/**
+ * A function that returns the length in bits of the given string or binary expression, i.e. its byte length multiplied by 8.
+ */
+@ExpressionDescription(
+ usage = "_FUNC_(expr) - Returns the bit length of `expr` or number of bits in binary data.",
+ extended = """
+ Examples:
+ > SELECT _FUNC_('Spark SQL');
+ 72
+ """)
+case class BitLength(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+ override def dataType: DataType = IntegerType // the result is always reported as an integer
+ override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType))
+
+ protected override def nullSafeEval(value: Any): Any = child.dataType match { // the two cases mirror inputTypes
+ case StringType => value.asInstanceOf[UTF8String].numBytes * 8 // byte count (not character count) times 8
+ case BinaryType => value.asInstanceOf[Array[Byte]].length * 8 // array length in bytes times 8
+ }
+
+ override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+ child.dataType match { // the snippets handed to defineCodeGen mirror the expressions in nullSafeEval
+ case StringType => defineCodeGen(ctx, ev, c => s"($c).numBytes() * 8")
+ case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length * 8")
+ }
+ }
+}
+
+/**
+ * A function that returns the length in bytes (not in characters) of the given string or binary expression.
+ */
+@ExpressionDescription(
+ usage = "_FUNC_(expr) - Returns the byte length of `expr` or number of bytes in binary data.",
+ extended = """
+ Examples:
+ > SELECT _FUNC_('Spark SQL');
+ 9
+ """)
+case class OctetLength(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+ override def dataType: DataType = IntegerType // the result is always reported as an integer
+ override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType))
+
+ protected override def nullSafeEval(value: Any): Any = child.dataType match { // the two cases mirror inputTypes
+ case StringType => value.asInstanceOf[UTF8String].numBytes // byte count of the string, not its character count
+ case BinaryType => value.asInstanceOf[Array[Byte]].length // array length in bytes
+ }
+
+ override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+ child.dataType match { // the snippets handed to defineCodeGen mirror the expressions in nullSafeEval
+ case StringType => defineCodeGen(ctx, ev, c => s"($c).numBytes()")
+ case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length")
+ }
+ }
+}
+
+/**
* A function that return the Levenshtein distance between the two given strings.
*/
@ExpressionDescription(
http://git-wip-us.apache.org/repos/asf/spark/blob/7a3e5dc2/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 4bdb43b..4f08031 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -558,20 +558,40 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
// scalastyle:off
// non ascii characters are not allowed in the source code, so we disable the scalastyle.
checkEvaluation(Length(Literal("a花花c")), 4, create_row(string))
+ checkEvaluation(OctetLength(Literal("a花花c")), 8, create_row(string))
+ checkEvaluation(BitLength(Literal("a花花c")), 8 * 8, create_row(string))
// scalastyle:on
checkEvaluation(Length(Literal(bytes)), 5, create_row(Array.empty[Byte]))
+ checkEvaluation(OctetLength(Literal(bytes)), 5, create_row(Array.empty[Byte]))
+ checkEvaluation(BitLength(Literal(bytes)), 5 * 8, create_row(Array.empty[Byte]))
checkEvaluation(Length(a), 5, create_row(string))
+ checkEvaluation(OctetLength(a), 5, create_row(string))
+ checkEvaluation(BitLength(a), 5 * 8, create_row(string))
checkEvaluation(Length(b), 5, create_row(bytes))
+ checkEvaluation(OctetLength(b), 5, create_row(bytes))
+ checkEvaluation(BitLength(b), 5 * 8, create_row(bytes))
checkEvaluation(Length(a), 0, create_row(""))
+ checkEvaluation(OctetLength(a), 0, create_row(""))
+ checkEvaluation(BitLength(a), 0, create_row(""))
checkEvaluation(Length(b), 0, create_row(Array.empty[Byte]))
+ checkEvaluation(OctetLength(b), 0, create_row(Array.empty[Byte]))
+ checkEvaluation(BitLength(b), 0, create_row(Array.empty[Byte]))
checkEvaluation(Length(a), null, create_row(null))
+ checkEvaluation(OctetLength(a), null, create_row(null))
+ checkEvaluation(BitLength(a), null, create_row(null))
checkEvaluation(Length(b), null, create_row(null))
+ checkEvaluation(OctetLength(b), null, create_row(null))
+ checkEvaluation(BitLength(b), null, create_row(null))
checkEvaluation(Length(Literal.create(null, StringType)), null, create_row(string))
+ checkEvaluation(OctetLength(Literal.create(null, StringType)), null, create_row(string))
+ checkEvaluation(BitLength(Literal.create(null, StringType)), null, create_row(string))
checkEvaluation(Length(Literal.create(null, BinaryType)), null, create_row(bytes))
+ checkEvaluation(OctetLength(Literal.create(null, BinaryType)), null, create_row(bytes))
+ checkEvaluation(BitLength(Literal.create(null, BinaryType)), null, create_row(bytes))
}
test("format_number / FormatNumber") {
http://git-wip-us.apache.org/repos/asf/spark/blob/7a3e5dc2/sql/core/src/test/resources/sql-tests/inputs/operators.sql
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/sql-tests/inputs/operators.sql b/sql/core/src/test/resources/sql-tests/inputs/operators.sql
index 3934620..a8de23e 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/operators.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/operators.sql
@@ -80,3 +80,8 @@ select 1 > 0.00001;
-- mod
select mod(7, 2), mod(7, 0), mod(0, 2), mod(7, null), mod(null, 2), mod(null, null);
+
+-- length
+select BIT_LENGTH('abc');
+select CHAR_LENGTH('abc');
+select OCTET_LENGTH('abc');
http://git-wip-us.apache.org/repos/asf/spark/blob/7a3e5dc2/sql/core/src/test/resources/sql-tests/results/operators.sql.out
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/sql-tests/results/operators.sql.out b/sql/core/src/test/resources/sql-tests/results/operators.sql.out
index 51ccf76..85ee10b 100644
--- a/sql/core/src/test/resources/sql-tests/results/operators.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/operators.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 51
+-- Number of queries: 54
-- !query 0
@@ -420,3 +420,27 @@ select mod(7, 2), mod(7, 0), mod(0, 2), mod(7, null), mod(null, 2), mod(null, nu
struct<(7 % 2):int,(7 % 0):int,(0 % 2):int,(7 % CAST(NULL AS INT)):int,(CAST(NULL AS INT) % 2):int,(CAST(NULL AS DOUBLE) % CAST(NULL AS DOUBLE)):double>
-- !query 50 output
1 NULL 0 NULL NULL NULL
+
+
+-- !query 51
+select BIT_LENGTH('abc')
+-- !query 51 schema
+struct<bitlength(abc):int>
+-- !query 51 output
+24
+
+
+-- !query 52
+select CHAR_LENGTH('abc')
+-- !query 52 schema
+struct<length(abc):int>
+-- !query 52 output
+3
+
+
+-- !query 53
+select OCTET_LENGTH('abc')
+-- !query 53 schema
+struct<octetlength(abc):int>
+-- !query 53 output
+3
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org