You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by am...@apache.org on 2020/05/27 21:10:38 UTC

[beam] branch master updated: [BEAM-10074] | implement hashing functions

This is an automated email from the ASF dual-hosted git repository.

amaliujia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git


The following commit(s) were added to refs/heads/master by this push:
     new fa028e6  [BEAM-10074] | implement hashing functions
     new 706a06c  Merge pull request #11817 from darshanj/BEAM-10074
fa028e6 is described below

commit fa028e68e47c2a47858783a5a5f7adc15569c654
Author: darshan jani <da...@gmail.com>
AuthorDate: Tue May 26 22:36:35 2020 +0800

    [BEAM-10074] | implement hashing functions
---
 .../sql/impl/udf/BuiltinHashFunctions.java         | 139 +++++++++++++++++++++
 .../beam/sdk/extensions/sql/BeamSqlDslBase.java    |   6 +
 .../udf/BeamSalUhfSpecialTypeAndValueTest.java     |  69 ++++++++++
 .../sql/impl/udf/BeamSqlUdfExpressionTest.java     |  41 ++++++
 4 files changed, 255 insertions(+)

diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/udf/BuiltinHashFunctions.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/udf/BuiltinHashFunctions.java
new file mode 100644
index 0000000..c3fc82b
--- /dev/null
+++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/udf/BuiltinHashFunctions.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.extensions.sql.impl.udf;
+
+import com.google.auto.service.AutoService;
+import org.apache.beam.sdk.schemas.Schema;
+import org.apache.beam.vendor.calcite.v1_20_0.org.apache.calcite.linq4j.function.Strict;
+
+/** Hash Functions. */
+@AutoService(BeamBuiltinFunctionProvider.class)
+public class BuiltinHashFunctions extends BeamBuiltinFunctionProvider {
+
+  /**
+   * MD5(X)
+   *
+   * <p>Calculates the MD5 digest and returns the value as a 16 element {@code byte[]}.
+   */
+  @UDF(
+      funcName = "MD5",
+      parameterArray = {Schema.TypeName.STRING},
+      returnType = Schema.TypeName.BYTES)
+  @Strict
+  public byte[] md5String(String str) {
+    return org.apache.commons.codec.digest.DigestUtils.md5(str);
+  }
+
+  /**
+   * MD5(X)
+   *
+   * <p>Calculates the MD5 digest and returns the value as a 16 element {@code byte[]}.
+   */
+  @UDF(
+      funcName = "MD5",
+      parameterArray = {Schema.TypeName.BYTES},
+      returnType = Schema.TypeName.BYTES)
+  @Strict
+  public byte[] md5Bytes(byte[] bytes) {
+    return org.apache.commons.codec.digest.DigestUtils.md5(bytes);
+  }
+
+  /**
+   * SHA1(X)
+   *
+   * <p>Calculates the SHA-1 digest and returns the value as a {@code byte[]}.
+   */
+  @UDF(
+      funcName = "SHA1",
+      parameterArray = {Schema.TypeName.STRING},
+      returnType = Schema.TypeName.BYTES)
+  @Strict
+  public byte[] sha1String(String str) {
+    return org.apache.commons.codec.digest.DigestUtils.sha1(str);
+  }
+
+  /**
+   * SHA1(X)
+   *
+   * <p>Calculates the SHA-1 digest and returns the value as a {@code byte[]}.
+   */
+  @UDF(
+      funcName = "SHA1",
+      parameterArray = {Schema.TypeName.BYTES},
+      returnType = Schema.TypeName.BYTES)
+  @Strict
+  public byte[] sha1Bytes(byte[] bytes) {
+    return org.apache.commons.codec.digest.DigestUtils.sha1(bytes);
+  }
+
+  /**
+   * SHA256(X)
+   *
+   * <p>Calculates the SHA-1 digest and returns the value as a {@code byte[]}.
+   */
+  @UDF(
+      funcName = "SHA256",
+      parameterArray = {Schema.TypeName.STRING},
+      returnType = Schema.TypeName.BYTES)
+  @Strict
+  public byte[] sha256String(String str) {
+    return org.apache.commons.codec.digest.DigestUtils.sha256(str);
+  }
+
+  /**
+   * SHA256(X)
+   *
+   * <p>Calculates the SHA-1 digest and returns the value as a {@code byte[]}.
+   */
+  @UDF(
+      funcName = "SHA256",
+      parameterArray = {Schema.TypeName.BYTES},
+      returnType = Schema.TypeName.BYTES)
+  @Strict
+  public byte[] sha256Bytes(byte[] bytes) {
+    return org.apache.commons.codec.digest.DigestUtils.sha256(bytes);
+  }
+
+  /**
+   * SHA512(X)
+   *
+   * <p>Calculates the SHA-1 digest and returns the value as a {@code byte[]}.
+   */
+  @UDF(
+      funcName = "SHA512",
+      parameterArray = {Schema.TypeName.STRING},
+      returnType = Schema.TypeName.BYTES)
+  @Strict
+  public byte[] sha512String(String str) {
+    return org.apache.commons.codec.digest.DigestUtils.sha512(str);
+  }
+
+  /**
+   * SHA512(X)
+   *
+   * <p>Calculates the SHA-1 digest and returns the value as a {@code byte[]}.
+   */
+  @UDF(
+      funcName = "SHA512",
+      parameterArray = {Schema.TypeName.BYTES},
+      returnType = Schema.TypeName.BYTES)
+  @Strict
+  public byte[] sha512Bytes(byte[] bytes) {
+    return org.apache.commons.codec.digest.DigestUtils.sha512(bytes);
+  }
+}
diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlDslBase.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlDslBase.java
index ad26d4a..4298c07 100644
--- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlDslBase.java
+++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamSqlDslBase.java
@@ -213,6 +213,12 @@ public class BeamSqlDslBase {
                 "TO_HEX",
                 "abcABC".getBytes(UTF_8),
                 "TO_HEX",
+                "abcABCжщфЖЩФ".getBytes(UTF_8),
+                "HashingFn",
+                "foobar".getBytes(UTF_8),
+                "HashingFn",
+                " ".getBytes(UTF_8),
+                "HashingFn",
                 "abcABCжщфЖЩФ".getBytes(UTF_8))
             .getRows();
 
diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/udf/BeamSalUhfSpecialTypeAndValueTest.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/udf/BeamSalUhfSpecialTypeAndValueTest.java
index 1370c62..db87eb7 100644
--- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/udf/BeamSalUhfSpecialTypeAndValueTest.java
+++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/udf/BeamSalUhfSpecialTypeAndValueTest.java
@@ -27,6 +27,7 @@ import org.apache.beam.sdk.schemas.Schema.FieldType;
 import org.apache.beam.sdk.testing.PAssert;
 import org.apache.beam.sdk.values.PCollection;
 import org.apache.beam.sdk.values.Row;
+import org.apache.commons.codec.digest.DigestUtils;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
@@ -167,4 +168,72 @@ public class BeamSalUhfSpecialTypeAndValueTest extends BeamSqlDslBase {
             resultRow8);
     pipeline.run().waitUntilFinish();
   }
+
+  @Test
+  public void testMd5() throws Exception {
+    Schema resultType = Schema.builder().addByteArrayField("field").build();
+    Row resultRow1 =
+        Row.withSchema(resultType).addValues(DigestUtils.md5("foobar".getBytes(UTF_8))).build();
+    Row resultRow2 =
+        Row.withSchema(resultType).addValues(DigestUtils.md5(" ".getBytes(UTF_8))).build();
+    Row resultRow3 =
+        Row.withSchema(resultType)
+            .addValues(DigestUtils.md5("abcABCжщфЖЩФ".getBytes(UTF_8)))
+            .build();
+    String sql = "SELECT MD5(f_bytes) FROM PCOLLECTION WHERE f_func = 'HashingFn'";
+    PCollection<Row> result = boundedInputBytes.apply("testUdf", SqlTransform.query(sql));
+    PAssert.that(result).containsInAnyOrder(resultRow1, resultRow2, resultRow3);
+    pipeline.run().waitUntilFinish();
+  }
+
+  @Test
+  public void testSHA1() throws Exception {
+    Schema resultType = Schema.builder().addByteArrayField("field").build();
+    Row resultRow1 =
+        Row.withSchema(resultType).addValues(DigestUtils.sha1("foobar".getBytes(UTF_8))).build();
+    Row resultRow2 =
+        Row.withSchema(resultType).addValues(DigestUtils.sha1(" ".getBytes(UTF_8))).build();
+    Row resultRow3 =
+        Row.withSchema(resultType)
+            .addValues(DigestUtils.sha1("abcABCжщфЖЩФ".getBytes(UTF_8)))
+            .build();
+    String sql = "SELECT SHA1(f_bytes) FROM PCOLLECTION WHERE f_func = 'HashingFn'";
+    PCollection<Row> result = boundedInputBytes.apply("testUdf", SqlTransform.query(sql));
+    PAssert.that(result).containsInAnyOrder(resultRow1, resultRow2, resultRow3);
+    pipeline.run().waitUntilFinish();
+  }
+
+  @Test
+  public void testSHA256() throws Exception {
+    Schema resultType = Schema.builder().addByteArrayField("field").build();
+    Row resultRow1 =
+        Row.withSchema(resultType).addValues(DigestUtils.sha256("foobar".getBytes(UTF_8))).build();
+    Row resultRow2 =
+        Row.withSchema(resultType).addValues(DigestUtils.sha256(" ".getBytes(UTF_8))).build();
+    Row resultRow3 =
+        Row.withSchema(resultType)
+            .addValues(DigestUtils.sha256("abcABCжщфЖЩФ".getBytes(UTF_8)))
+            .build();
+    String sql = "SELECT SHA256(f_bytes) FROM PCOLLECTION WHERE f_func = 'HashingFn'";
+    PCollection<Row> result = boundedInputBytes.apply("testUdf", SqlTransform.query(sql));
+    PAssert.that(result).containsInAnyOrder(resultRow1, resultRow2, resultRow3);
+    pipeline.run().waitUntilFinish();
+  }
+
+  @Test
+  public void testSHA512() throws Exception {
+    Schema resultType = Schema.builder().addByteArrayField("field").build();
+    Row resultRow1 =
+        Row.withSchema(resultType).addValues(DigestUtils.sha512("foobar".getBytes(UTF_8))).build();
+    Row resultRow2 =
+        Row.withSchema(resultType).addValues(DigestUtils.sha512(" ".getBytes(UTF_8))).build();
+    Row resultRow3 =
+        Row.withSchema(resultType)
+            .addValues(DigestUtils.sha512("abcABCжщфЖЩФ".getBytes(UTF_8)))
+            .build();
+    String sql = "SELECT SHA512(f_bytes) FROM PCOLLECTION WHERE f_func = 'HashingFn'";
+    PCollection<Row> result = boundedInputBytes.apply("testUdf", SqlTransform.query(sql));
+    PAssert.that(result).containsInAnyOrder(resultRow1, resultRow2, resultRow3);
+    pipeline.run().waitUntilFinish();
+  }
 }
diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/udf/BeamSqlUdfExpressionTest.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/udf/BeamSqlUdfExpressionTest.java
index 82ffccf..495f329 100644
--- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/udf/BeamSqlUdfExpressionTest.java
+++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/impl/udf/BeamSqlUdfExpressionTest.java
@@ -21,6 +21,7 @@ import static java.nio.charset.StandardCharsets.UTF_8;
 
 import org.apache.beam.sdk.extensions.sql.integrationtest.BeamSqlBuiltinFunctionsIntegrationTestBase;
 import org.apache.beam.sdk.schemas.Schema.TypeName;
+import org.apache.commons.codec.digest.DigestUtils;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
@@ -199,4 +200,44 @@ public class BeamSqlUdfExpressionTest extends BeamSqlBuiltinFunctionsIntegration
 
     checker.buildRunAndCheck();
   }
+
+  @Test
+  public void testMd5() throws Exception {
+    ExpressionChecker checker =
+        new ExpressionChecker()
+            .addExpr("MD5('foobar')", DigestUtils.md5("foobar"))
+            .addExpr("MD5('中文')", DigestUtils.md5("中文"))
+            .addExprWithNullExpectedValue("MD5(CAST(NULL AS VARCHAR(0)))", TypeName.BYTES);
+    checker.buildRunAndCheck();
+  }
+
+  @Test
+  public void testSHA1() throws Exception {
+    ExpressionChecker checker =
+        new ExpressionChecker()
+            .addExpr("SHA1('foobar')", DigestUtils.sha1("foobar"))
+            .addExpr("SHA1('中文')", DigestUtils.sha1("中文"))
+            .addExprWithNullExpectedValue("SHA1(CAST(NULL AS VARCHAR(0)))", TypeName.BYTES);
+    checker.buildRunAndCheck();
+  }
+
+  @Test
+  public void testSHA256() throws Exception {
+    ExpressionChecker checker =
+        new ExpressionChecker()
+            .addExpr("SHA256('foobar')", DigestUtils.sha256("foobar"))
+            .addExpr("SHA256('中文')", DigestUtils.sha256("中文"))
+            .addExprWithNullExpectedValue("SHA256(CAST(NULL AS VARCHAR(0)))", TypeName.BYTES);
+    checker.buildRunAndCheck();
+  }
+
+  @Test
+  public void testSHA512() throws Exception {
+    ExpressionChecker checker =
+        new ExpressionChecker()
+            .addExpr("SHA512('foobar')", DigestUtils.sha512("foobar"))
+            .addExpr("SHA512('中文')", DigestUtils.sha512("中文"))
+            .addExprWithNullExpectedValue("SHA512(CAST(NULL AS VARCHAR(0)))", TypeName.BYTES);
+    checker.buildRunAndCheck();
+  }
 }