You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2015/06/19 19:49:41 UTC
spark git commit: [SPARK-8234][SQL] misc function: md5
Repository: spark
Updated Branches:
refs/heads/master fe08561e2 -> 0c32fc125
[SPARK-8234][SQL] misc function: md5
Author: Shilei <sh...@intel.com>
Closes #6779 from qiansl127/MD5 and squashes the following commits:
11fcdb2 [Shilei] Fix the indent
04bd27b [Shilei] Add codegen
da60eb3 [Shilei] Remove checkInputDataTypes function
9509ad0 [Shilei] Format code
12c61f4 [Shilei] Accept only BinaryType for Md5
1df0b5b [Shilei] format to scala type
60ccde1 [Shilei] Add more test case
b8c73b4 [Shilei] Rewrite the type check for Md5
c166167 [Shilei] Add md5 function
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0c32fc12
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0c32fc12
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0c32fc12
Branch: refs/heads/master
Commit: 0c32fc125c45e59f06cb55f3ba7da612d840ca86
Parents: fe08561
Author: Shilei <sh...@intel.com>
Authored: Fri Jun 19 10:49:27 2015 -0700
Committer: Reynold Xin <rx...@databricks.com>
Committed: Fri Jun 19 10:49:27 2015 -0700
----------------------------------------------------------------------
.../catalyst/analysis/FunctionRegistry.scala | 3 ++
.../spark/sql/catalyst/expressions/misc.scala | 50 ++++++++++++++++++++
.../expressions/MiscFunctionsSuite.scala | 32 +++++++++++++
.../scala/org/apache/spark/sql/functions.scala | 21 ++++++++
.../spark/sql/DataFrameFunctionsSuite.scala | 11 +++++
5 files changed, 117 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/0c32fc12/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 79273a7..5fb3369 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -133,6 +133,9 @@ object FunctionRegistry {
expression[ToDegrees]("degrees"),
expression[ToRadians]("radians"),
+ // misc functions
+ expression[Md5]("md5"),
+
// aggregate functions
expression[Average]("avg"),
expression[Count]("count"),
http://git-wip-us.apache.org/repos/asf/spark/blob/0c32fc12/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
new file mode 100644
index 0000000..4bee8cb
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.commons.codec.digest.DigestUtils
+import org.apache.spark.sql.catalyst.expressions.codegen._
+import org.apache.spark.sql.types.{BinaryType, StringType, DataType}
+import org.apache.spark.unsafe.types.UTF8String
+
+/**
+ * A function that calculates an MD5 128-bit checksum and returns it as a hex string.
+ * For input of type [[BinaryType]]; a null input evaluates to null.
+ */
+case class Md5(child: Expression)
+ extends UnaryExpression with ExpectsInputTypes {
+
+ override def dataType: DataType = StringType
+
+ override def expectedChildTypes: Seq[DataType] = Seq(BinaryType)
+
+ override def eval(input: InternalRow): Any = {
+ val value = child.eval(input)
+ if (value == null) {
+ null // null input yields a null result instead of a digest
+ } else {
+ UTF8String.fromString(DigestUtils.md5Hex(value.asInstanceOf[Array[Byte]]))
+ }
+ }
+
+ override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+ defineCodeGen(ctx, ev, c => // string form of the same calls eval makes, with c spliced in
+ "org.apache.spark.unsafe.types.UTF8String.fromString" +
+ s"(org.apache.commons.codec.digest.DigestUtils.md5Hex($c))")
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/0c32fc12/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala
new file mode 100644
index 0000000..48b8413
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscFunctionsSuite.scala
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.types.{StringType, BinaryType}
+
+class MiscFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
+
+ test("md5") { // NOTE(review): "ABC".getBytes below uses the platform default charset
+ checkEvaluation(Md5(Literal("ABC".getBytes)), "902fbdd2b1df0c4f70b4a5d23525e932")
+ checkEvaluation(Md5(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)),
+ "6ac1e56bc78f031059be7be854522c4c")
+ checkEvaluation(Md5(Literal.create(null, BinaryType)), null) // null in, null out
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/0c32fc12/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 40ae9f5..7e7a099 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -36,6 +36,7 @@ import org.apache.spark.util.Utils
* @groupname sort_funcs Sorting functions
* @groupname normal_funcs Non-aggregate functions
* @groupname math_funcs Math functions
+ * @groupname misc_funcs Misc functions
* @groupname window_funcs Window functions
* @groupname string_funcs String functions
* @groupname Ungrouped Support functions for DataFrames.
@@ -1377,6 +1378,26 @@ object functions {
def toRadians(columnName: String): Column = toRadians(Column(columnName))
//////////////////////////////////////////////////////////////////////////////////////////////
+ // Misc functions
+ //////////////////////////////////////////////////////////////////////////////////////////////
+
+ /**
+ * Calculates the MD5 digest of a binary column and returns it as a 32 character hex string.
+ *
+ * @group misc_funcs
+ * @since 1.5.0
+ */
+ def md5(e: Column): Column = Md5(e.expr)
+
+ /**
+ * Calculates the MD5 digest of the named column and returns it as a 32 character hex string.
+ *
+ * @group misc_funcs
+ * @since 1.5.0
+ */
+ def md5(columnName: String): Column = md5(Column(columnName))
+
+ //////////////////////////////////////////////////////////////////////////////////////////////
// String functions
//////////////////////////////////////////////////////////////////////////////////////////////
http://git-wip-us.apache.org/repos/asf/spark/blob/0c32fc12/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 70819fe..8b53b38 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -133,6 +133,17 @@ class DataFrameFunctionsSuite extends QueryTest {
Row("x", "y", null))
}
+ test("misc md5 function") {
+ val df = Seq(("ABC", Array[Byte](1, 2, 3, 4, 5, 6))).toDF("a", "b") // a: string, b: bytes
+ checkAnswer( // DataFrame API: exercises both the Column and the column-name overloads
+ df.select(md5($"a"), md5("b")),
+ Row("902fbdd2b1df0c4f70b4a5d23525e932", "6ac1e56bc78f031059be7be854522c4c"))
+
+ checkAnswer( // same function invoked by name through expression strings
+ df.selectExpr("md5(a)", "md5(b)"),
+ Row("902fbdd2b1df0c4f70b4a5d23525e932", "6ac1e56bc78f031059be7be854522c4c"))
+ }
+
test("string length function") {
checkAnswer(
nullStrings.select(strlen($"s"), strlen("s")),
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org