You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by li...@apache.org on 2017/06/16 06:07:02 UTC

spark git commit: [SPARK-20749][SQL] Built-in SQL Function Support - all variants of LEN[GTH]

Repository: spark
Updated Branches:
  refs/heads/master 87ab0cec6 -> 7a3e5dc28


[SPARK-20749][SQL] Built-in SQL Function Support - all variants of LEN[GTH]

## What changes were proposed in this pull request?

This PR adds the built-in SQL functions `BIT_LENGTH()`, `CHAR_LENGTH()`, and `OCTET_LENGTH()`.

`BIT_LENGTH()` returns the bit length of the given string or binary expression.
`CHAR_LENGTH()` returns the character length of the given string expression, or the number of bytes of the given binary expression (i.e., it is equivalent to `LENGTH()`).
`OCTET_LENGTH()` returns the byte length of the given string or binary expression.

## How was this patch tested?

Added new test cases for these three functions to the existing `StringExpressionsSuite` and to the `operators.sql` SQL query tests.

Author: Kazuaki Ishizaki <is...@jp.ibm.com>

Closes #18046 from kiszk/SPARK-20749.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7a3e5dc2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7a3e5dc2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7a3e5dc2

Branch: refs/heads/master
Commit: 7a3e5dc28b67ac1630c5a578a27a5a5acf80aa51
Parents: 87ab0ce
Author: Kazuaki Ishizaki <is...@jp.ibm.com>
Authored: Thu Jun 15 23:06:58 2017 -0700
Committer: Xiao Li <ga...@gmail.com>
Committed: Thu Jun 15 23:06:58 2017 -0700

----------------------------------------------------------------------
 .../catalyst/analysis/FunctionRegistry.scala    |  3 +
 .../expressions/stringExpressions.scala         | 61 +++++++++++++++++++-
 .../expressions/StringExpressionsSuite.scala    | 20 +++++++
 .../resources/sql-tests/inputs/operators.sql    |  5 ++
 .../sql-tests/results/operators.sql.out         | 26 ++++++++-
 5 files changed, 112 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/7a3e5dc2/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 8773281..e4e9918 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -305,6 +305,8 @@ object FunctionRegistry {
     expression[Chr]("char"),
     expression[Chr]("chr"),
     expression[Base64]("base64"),
+    expression[BitLength]("bit_length"),
+    expression[Length]("char_length"),
     expression[Concat]("concat"),
     expression[ConcatWs]("concat_ws"),
     expression[Decode]("decode"),
@@ -321,6 +323,7 @@ object FunctionRegistry {
     expression[Levenshtein]("levenshtein"),
     expression[Like]("like"),
     expression[Lower]("lower"),
+    expression[OctetLength]("octet_length"),
     expression[StringLocate]("locate"),
     expression[StringLPad]("lpad"),
     expression[StringTrimLeft]("ltrim"),

http://git-wip-us.apache.org/repos/asf/spark/blob/7a3e5dc2/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
old mode 100644
new mode 100755
index 717ada2..908fdb8
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -1199,15 +1199,18 @@ case class Substring(str: Expression, pos: Expression, len: Expression)
 }
 
 /**
- * A function that return the length of the given string or binary expression.
+ * A function that returns the char length of the given string expression or
+ * number of bytes of the given binary expression.
  */
+// scalastyle:off line.size.limit
 @ExpressionDescription(
-  usage = "_FUNC_(expr) - Returns the length of `expr` or number of bytes in binary data.",
+  usage = "_FUNC_(expr) - Returns the character length of `expr` or number of bytes in binary data.",
   extended = """
     Examples:
       > SELECT _FUNC_('Spark SQL');
        9
   """)
+// scalastyle:on line.size.limit
 case class Length(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
   override def dataType: DataType = IntegerType
   override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType))
@@ -1226,6 +1229,60 @@ case class Length(child: Expression) extends UnaryExpression with ImplicitCastIn
 }
 
 /**
+ * A function that returns the bit length of the given string or binary expression.
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(expr) - Returns the bit length of `expr` or number of bits in binary data.",
+  extended = """
+    Examples:
+      > SELECT _FUNC_('Spark SQL');
+       72
+  """)
+case class BitLength(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+  override def dataType: DataType = IntegerType
+  override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType))
+
+  protected override def nullSafeEval(value: Any): Any = child.dataType match {
+    case StringType => value.asInstanceOf[UTF8String].numBytes * 8
+    case BinaryType => value.asInstanceOf[Array[Byte]].length * 8
+  }
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    child.dataType match {
+      case StringType => defineCodeGen(ctx, ev, c => s"($c).numBytes() * 8")
+      case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length * 8")
+    }
+  }
+}
+
+/**
+ * A function that returns the byte length of the given string or binary expression.
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(expr) - Returns the byte length of `expr` or number of bytes in binary data.",
+  extended = """
+    Examples:
+      > SELECT _FUNC_('Spark SQL');
+       9
+  """)
+case class OctetLength(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+  override def dataType: DataType = IntegerType
+  override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType))
+
+  protected override def nullSafeEval(value: Any): Any = child.dataType match {
+    case StringType => value.asInstanceOf[UTF8String].numBytes
+    case BinaryType => value.asInstanceOf[Array[Byte]].length
+  }
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    child.dataType match {
+      case StringType => defineCodeGen(ctx, ev, c => s"($c).numBytes()")
+      case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length")
+    }
+  }
+}
+
+/**
  * A function that return the Levenshtein distance between the two given strings.
  */
 @ExpressionDescription(

http://git-wip-us.apache.org/repos/asf/spark/blob/7a3e5dc2/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index 4bdb43b..4f08031 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -558,20 +558,40 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     // scalastyle:off
     // non ascii characters are not allowed in the source code, so we disable the scalastyle.
     checkEvaluation(Length(Literal("a花花c")), 4, create_row(string))
+    checkEvaluation(OctetLength(Literal("a花花c")), 8, create_row(string))
+    checkEvaluation(BitLength(Literal("a花花c")), 8 * 8, create_row(string))
     // scalastyle:on
     checkEvaluation(Length(Literal(bytes)), 5, create_row(Array.empty[Byte]))
+    checkEvaluation(OctetLength(Literal(bytes)), 5, create_row(Array.empty[Byte]))
+    checkEvaluation(BitLength(Literal(bytes)), 5 * 8, create_row(Array.empty[Byte]))
 
     checkEvaluation(Length(a), 5, create_row(string))
+    checkEvaluation(OctetLength(a), 5, create_row(string))
+    checkEvaluation(BitLength(a), 5 * 8, create_row(string))
     checkEvaluation(Length(b), 5, create_row(bytes))
+    checkEvaluation(OctetLength(b), 5, create_row(bytes))
+    checkEvaluation(BitLength(b), 5 * 8, create_row(bytes))
 
     checkEvaluation(Length(a), 0, create_row(""))
+    checkEvaluation(OctetLength(a), 0, create_row(""))
+    checkEvaluation(BitLength(a), 0, create_row(""))
     checkEvaluation(Length(b), 0, create_row(Array.empty[Byte]))
+    checkEvaluation(OctetLength(b), 0, create_row(Array.empty[Byte]))
+    checkEvaluation(BitLength(b), 0, create_row(Array.empty[Byte]))
 
     checkEvaluation(Length(a), null, create_row(null))
+    checkEvaluation(OctetLength(a), null, create_row(null))
+    checkEvaluation(BitLength(a), null, create_row(null))
     checkEvaluation(Length(b), null, create_row(null))
+    checkEvaluation(OctetLength(b), null, create_row(null))
+    checkEvaluation(BitLength(b), null, create_row(null))
 
     checkEvaluation(Length(Literal.create(null, StringType)), null, create_row(string))
+    checkEvaluation(OctetLength(Literal.create(null, StringType)), null, create_row(string))
+    checkEvaluation(BitLength(Literal.create(null, StringType)), null, create_row(string))
     checkEvaluation(Length(Literal.create(null, BinaryType)), null, create_row(bytes))
+    checkEvaluation(OctetLength(Literal.create(null, BinaryType)), null, create_row(bytes))
+    checkEvaluation(BitLength(Literal.create(null, BinaryType)), null, create_row(bytes))
   }
 
   test("format_number / FormatNumber") {

http://git-wip-us.apache.org/repos/asf/spark/blob/7a3e5dc2/sql/core/src/test/resources/sql-tests/inputs/operators.sql
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/sql-tests/inputs/operators.sql b/sql/core/src/test/resources/sql-tests/inputs/operators.sql
index 3934620..a8de23e 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/operators.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/operators.sql
@@ -80,3 +80,8 @@ select 1 > 0.00001;
 
 -- mod
 select mod(7, 2), mod(7, 0), mod(0, 2), mod(7, null), mod(null, 2), mod(null, null);
+
+-- length
+select BIT_LENGTH('abc');
+select CHAR_LENGTH('abc');
+select OCTET_LENGTH('abc');

http://git-wip-us.apache.org/repos/asf/spark/blob/7a3e5dc2/sql/core/src/test/resources/sql-tests/results/operators.sql.out
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/sql-tests/results/operators.sql.out b/sql/core/src/test/resources/sql-tests/results/operators.sql.out
index 51ccf76..85ee10b 100644
--- a/sql/core/src/test/resources/sql-tests/results/operators.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/operators.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 51
+-- Number of queries: 54
 
 
 -- !query 0
@@ -420,3 +420,27 @@ select mod(7, 2), mod(7, 0), mod(0, 2), mod(7, null), mod(null, 2), mod(null, nu
 struct<(7 % 2):int,(7 % 0):int,(0 % 2):int,(7 % CAST(NULL AS INT)):int,(CAST(NULL AS INT) % 2):int,(CAST(NULL AS DOUBLE) % CAST(NULL AS DOUBLE)):double>
 -- !query 50 output
 1	NULL	0	NULL	NULL	NULL
+
+
+-- !query 51
+select BIT_LENGTH('abc')
+-- !query 51 schema
+struct<bitlength(abc):int>
+-- !query 51 output
+24
+
+
+-- !query 52
+select CHAR_LENGTH('abc')
+-- !query 52 schema
+struct<length(abc):int>
+-- !query 52 output
+3
+
+
+-- !query 53
+select OCTET_LENGTH('abc')
+-- !query 53 schema
+struct<octetlength(abc):int>
+-- !query 53 output
+3


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org