You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ue...@apache.org on 2018/05/30 18:18:08 UTC

spark git commit: [SPARK-23901][SQL] Add masking functions

Repository: spark
Updated Branches:
  refs/heads/master ec6f971dc -> 1b36f1488


[SPARK-23901][SQL] Add masking functions

## What changes were proposed in this pull request?

This PR adds the masking functions as they are described in Hive's documentation: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-DataMaskingFunctions.
This means that only `string`s are accepted as input for the masking functions.

## How was this patch tested?

Added unit tests (UTs).

Author: Marco Gaido <ma...@gmail.com>

Closes #21246 from mgaido91/SPARK-23901.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1b36f148
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1b36f148
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1b36f148

Branch: refs/heads/master
Commit: 1b36f148891ac41ef36a40366f87dd5405cb3751
Parents: ec6f971
Author: Marco Gaido <ma...@gmail.com>
Authored: Wed May 30 11:18:04 2018 -0700
Committer: Takuya UESHIN <ue...@databricks.com>
Committed: Wed May 30 11:18:04 2018 -0700

----------------------------------------------------------------------
 .../expressions/MaskExpressionsUtils.java       |  80 +++
 .../catalyst/analysis/FunctionRegistry.scala    |   8 +
 .../catalyst/expressions/maskExpressions.scala  | 569 +++++++++++++++++++
 .../expressions/MaskExpressionsSuite.scala      | 236 ++++++++
 .../scala/org/apache/spark/sql/functions.scala  | 119 ++++
 .../spark/sql/DataFrameFunctionsSuite.scala     | 107 ++++
 6 files changed, 1119 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/1b36f148/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/MaskExpressionsUtils.java
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/MaskExpressionsUtils.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/MaskExpressionsUtils.java
new file mode 100644
index 0000000..0587990
--- /dev/null
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/MaskExpressionsUtils.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions;
+
+/**
+ * Contains all the Utils methods used in the masking expressions.
+ */
+public class MaskExpressionsUtils {
+  static final int UNMASKED_VAL = -1;  // sentinel: a replacement equal to this disables masking
+
+  /**
+   * Returns the masking character for {@code c}, or {@code c} itself if it should not be masked.
+   * @param c the character to transform
+   * @param maskedUpperChar the character to use instead of an uppercase letter
+   * @param maskedLowerChar the character to use instead of a lowercase letter
+   * @param maskedDigitChar the character to use instead of a digit
+   * @param maskedOtherChar the character to use instead of any other character
+   * @return the masking character for {@code c}, or {@code c} if its replacement is UNMASKED_VAL
+   */
+  public static int transformChar(
+      final int c,
+      int maskedUpperChar,
+      int maskedLowerChar,
+      int maskedDigitChar,
+      int maskedOtherChar) {
+    switch(Character.getType(c)) {  // dispatch on the category reported by Character.getType
+      case Character.UPPERCASE_LETTER:
+        if(maskedUpperChar != UNMASKED_VAL) {
+          return maskedUpperChar;
+        }
+        break;
+
+      case Character.LOWERCASE_LETTER:
+        if(maskedLowerChar != UNMASKED_VAL) {
+          return maskedLowerChar;
+        }
+        break;
+
+      case Character.DECIMAL_DIGIT_NUMBER:
+        if(maskedDigitChar != UNMASKED_VAL) {
+          return maskedDigitChar;
+        }
+        break;
+
+      default:
+        if(maskedOtherChar != UNMASKED_VAL) {
+          return maskedOtherChar;
+        }
+        break;
+    }
+
+    return c;  // the replacement for this category is UNMASKED_VAL: keep the input code point
+  }
+
+  /**
+   * Returns the replacement code point: the first code point of the user-specified {@code rep},
+   * or the {@code def} default when {@code rep} is null or empty.
+   */
+  public static int getReplacementChar(String rep, int def) {
+    if (rep != null && rep.length() > 0) {
+      return rep.codePointAt(0);
+    }
+    return def;
+  }
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/1b36f148/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 1134a88..23a4a44 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -432,6 +432,14 @@ object FunctionRegistry {
     expression[ArrayRepeat]("array_repeat"),
     CreateStruct.registryEntry,
 
+    // mask functions
+    expression[Mask]("mask"),
+    expression[MaskFirstN]("mask_first_n"),
+    expression[MaskLastN]("mask_last_n"),
+    expression[MaskShowFirstN]("mask_show_first_n"),
+    expression[MaskShowLastN]("mask_show_last_n"),
+    expression[MaskHash]("mask_hash"),
+
     // misc functions
     expression[AssertTrue]("assert_true"),
     expression[Crc32]("crc32"),

http://git-wip-us.apache.org/repos/asf/spark/blob/1b36f148/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala
new file mode 100644
index 0000000..276a572
--- /dev/null
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/maskExpressions.scala
@@ -0,0 +1,569 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.commons.codec.digest.DigestUtils
+
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.catalyst.expressions.MaskExpressionsUtils._
+import org.apache.spark.sql.catalyst.expressions.MaskLike._
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode}
+import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
+
+
+trait MaskLike {  // replacement characters and append helpers shared by the mask expressions
+  def upper: String  // replacement for uppercase letters; null or empty selects the default
+  def lower: String  // replacement for lowercase letters; null or empty selects the default
+  def digit: String  // replacement for digits; null or empty selects the default
+
+  protected lazy val upperReplacement: Int = getReplacementChar(upper, defaultMaskedUppercase)
+  protected lazy val lowerReplacement: Int = getReplacementChar(lower, defaultMaskedLowercase)
+  protected lazy val digitReplacement: Int = getReplacementChar(digit, defaultMaskedDigit)
+
+  protected val maskUtilsClassName: String = classOf[MaskExpressionsUtils].getName
+
+  def inputStringLengthCode(inputString: String, length: String): String = {
+    s"${CodeGenerator.JAVA_INT} $length = $inputString.codePointCount(0, $inputString.length());"
+  }
+
+  def appendMaskedToStringBuilderCode(  // emits Java masking numChars code points at offset
+      ctx: CodegenContext,
+      sb: String,
+      inputString: String,
+      offset: String,
+      numChars: String): String = {
+    val i = ctx.freshName("i")
+    val codePoint = ctx.freshName("codePoint")
+    s"""
+       |for (${CodeGenerator.JAVA_INT} $i = 0; $i < $numChars; $i++) {
+       |  ${CodeGenerator.JAVA_INT} $codePoint = $inputString.codePointAt($offset);
+       |  $sb.appendCodePoint($maskUtilsClassName.transformChar($codePoint,
+       |    $upperReplacement, $lowerReplacement,
+       |    $digitReplacement, $defaultMaskedOther));
+       |  $offset += Character.charCount($codePoint);
+       |}
+     """.stripMargin
+  }
+
+  def appendUnchangedToStringBuilderCode(  // emits Java copying numChars code points at offset
+      ctx: CodegenContext,
+      sb: String,
+      inputString: String,
+      offset: String,
+      numChars: String): String = {
+    val i = ctx.freshName("i")
+    val codePoint = ctx.freshName("codePoint")
+    s"""
+       |for (${CodeGenerator.JAVA_INT} $i = 0; $i < $numChars; $i++) {
+       |  ${CodeGenerator.JAVA_INT} $codePoint = $inputString.codePointAt($offset);
+       |  $sb.appendCodePoint($codePoint);
+       |  $offset += Character.charCount($codePoint);
+       |}
+     """.stripMargin
+  }
+
+  def appendMaskedToStringBuilder(  // masks numChars code points; returns the next offset
+      sb: java.lang.StringBuilder,
+      inputString: String,
+      startOffset: Int,
+      numChars: Int): Int = {
+    var offset = startOffset
+    (1 to numChars) foreach { _ =>
+      val codePoint = inputString.codePointAt(offset)
+      sb.appendCodePoint(transformChar(
+        codePoint,
+        upperReplacement,
+        lowerReplacement,
+        digitReplacement,
+        defaultMaskedOther))
+      offset += Character.charCount(codePoint)  // advance by the code point's UTF-16 width
+    }
+    offset
+  }
+
+  def appendUnchangedToStringBuilder(  // copies numChars code points; returns the next offset
+      sb: java.lang.StringBuilder,
+      inputString: String,
+      startOffset: Int,
+      numChars: Int): Int = {
+    var offset = startOffset
+    (1 to numChars) foreach { _ =>
+      val codePoint = inputString.codePointAt(offset)
+      sb.appendCodePoint(codePoint)
+      offset += Character.charCount(codePoint)  // advance by the code point's UTF-16 width
+    }
+    offset
+  }
+}
+
+trait MaskLikeWithN extends MaskLike {
+  def n: Int  // requested character count, as supplied by the caller
+  protected lazy val charCount: Int = if (n < 0) 0 else n  // negative n is treated as 0
+}
+
+/**
+ * Default values and literal-argument extraction helpers for the mask expressions.
+ */
+object MaskLike {
+  val defaultCharCount = 4
+  val defaultMaskedUppercase: Int = 'X'
+  val defaultMaskedLowercase: Int = 'x'
+  val defaultMaskedDigit: Int = 'n'
+  val defaultMaskedOther: Int = MaskExpressionsUtils.UNMASKED_VAL  // other chars stay unmasked
+
+  def extractCharCount(e: Expression): Int = e match {  // null literal => defaultCharCount
+    case Literal(i, IntegerType | NullType) =>
+      if (i == null) defaultCharCount else i.asInstanceOf[Int]
+    case Literal(_, dt) => throw new AnalysisException("Expected literal expression of type " +
+      s"${IntegerType.simpleString}, but got literal of ${dt.simpleString}")
+    case other => throw new AnalysisException(s"Expected literal expression, but got ${other.sql}")
+  }
+
+  def extractReplacement(e: Expression): String = e match {  // null literal => null
+    case Literal(s, StringType | NullType) => if (s == null) null else s.toString
+    case Literal(_, dt) => throw new AnalysisException("Expected literal expression of type " +
+      s"${StringType.simpleString}, but got literal of ${dt.simpleString}")
+    case other => throw new AnalysisException(s"Expected literal expression, but got ${other.sql}")
+  }
+}
+
+/**
+ * Masks the input string. Additional parameters can be set to change the masking chars for
+ * uppercase letters, lowercase letters and digits.
+ */
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+  usage = "_FUNC_(str[, upper[, lower[, digit]]]) - Masks str. By default, upper case letters are converted to \"X\", lower case letters are converted to \"x\" and numbers are converted to \"n\". You can override the characters used in the mask by supplying additional arguments: the second argument controls the mask character for upper case letters, the third argument for lower case letters and the fourth argument for numbers.",
+  examples = """
+    Examples:
+      > SELECT _FUNC_("abcd-EFGH-8765-4321", "U", "l", "#");
+       llll-UUUU-####-####
+  """, since = "2.4.0")
+// scalastyle:on line.size.limit
+case class Mask(child: Expression, upper: String, lower: String, digit: String)
+  extends UnaryExpression with ExpectsInputTypes with MaskLike {
+
+  def this(child: Expression) = this(child, null.asInstanceOf[String], null, null)
+
+  def this(child: Expression, upper: Expression) =
+    this(child, extractReplacement(upper), null, null)
+
+  def this(child: Expression, upper: Expression, lower: Expression) =
+    this(child, extractReplacement(upper), extractReplacement(lower), null)
+
+  def this(child: Expression, upper: Expression, lower: Expression, digit: Expression) =
+    this(child, extractReplacement(upper), extractReplacement(lower), extractReplacement(digit))
+
+  override def nullSafeEval(input: Any): Any = {
+    val str = input.asInstanceOf[UTF8String].toString
+    val length = str.codePointCount(0, str.length())  // length in code points, not UTF-16 units
+    val sb = new java.lang.StringBuilder(length)
+    appendMaskedToStringBuilder(sb, str, 0, length)  // mask every code point of the input
+    UTF8String.fromString(sb.toString)
+  }
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    nullSafeCodeGen(ctx, ev, (input: String) => {
+      val sb = ctx.freshName("sb")
+      val length = ctx.freshName("length")
+      val offset = ctx.freshName("offset")
+      val inputString = ctx.freshName("inputString")
+      s"""
+         |String $inputString = $input.toString();
+         |${inputStringLengthCode(inputString, length)}
+         |StringBuilder $sb = new StringBuilder($length);
+         |${CodeGenerator.JAVA_INT} $offset = 0;
+         |${appendMaskedToStringBuilderCode(ctx, sb, inputString, offset, length)}
+         |${ev.value} = UTF8String.fromString($sb.toString());
+       """.stripMargin
+    })
+  }
+
+  override def dataType: DataType = StringType
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+}
+
+/**
+ * Masks the first N chars of the input string. N defaults to 4. Additional parameters can be set
+ * to change the masking chars for uppercase letters, lowercase letters and digits.
+ */
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+  usage = "_FUNC_(str[, n[, upper[, lower[, digit]]]]) - Masks the first n values of str. By default, n is 4, upper case letters are converted to \"X\", lower case letters are converted to \"x\" and numbers are converted to \"n\". You can override the characters used in the mask by supplying additional arguments: the third argument controls the mask character for upper case letters, the fourth argument for lower case letters and the fifth argument for numbers.",
+  examples = """
+    Examples:
+      > SELECT _FUNC_("1234-5678-8765-4321", 4);
+       nnnn-5678-8765-4321
+  """, since = "2.4.0")
+// scalastyle:on line.size.limit
+case class MaskFirstN(
+    child: Expression,
+    n: Int,
+    upper: String,
+    lower: String,
+    digit: String)
+  extends UnaryExpression with ExpectsInputTypes with MaskLikeWithN {
+
+  def this(child: Expression) =
+    this(child, defaultCharCount, null, null, null)
+
+  def this(child: Expression, n: Expression) =
+    this(child, extractCharCount(n), null, null, null)
+
+  def this(child: Expression, n: Expression, upper: Expression) =
+    this(child, extractCharCount(n), extractReplacement(upper), null, null)
+
+  def this(child: Expression, n: Expression, upper: Expression, lower: Expression) =
+    this(child, extractCharCount(n), extractReplacement(upper), extractReplacement(lower), null)
+
+  def this(
+      child: Expression,
+      n: Expression,
+      upper: Expression,
+      lower: Expression,
+      digit: Expression) =
+    this(child,
+      extractCharCount(n),
+      extractReplacement(upper),
+      extractReplacement(lower),
+      extractReplacement(digit))
+
+  override def nullSafeEval(input: Any): Any = {
+    val str = input.asInstanceOf[UTF8String].toString
+    val length = str.codePointCount(0, str.length())  // length in code points, not UTF-16 units
+    val endOfMask = if (charCount > length) length else charCount  // clamp n to string length
+    val sb = new java.lang.StringBuilder(length)
+    val offset = appendMaskedToStringBuilder(sb, str, 0, endOfMask)
+    appendUnchangedToStringBuilder(sb, str, offset, length - endOfMask)
+    UTF8String.fromString(sb.toString)
+  }
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    nullSafeCodeGen(ctx, ev, (input: String) => {
+      val sb = ctx.freshName("sb")
+      val length = ctx.freshName("length")
+      val offset = ctx.freshName("offset")
+      val inputString = ctx.freshName("inputString")
+      val endOfMask = ctx.freshName("endOfMask")
+      s"""
+         |String $inputString = $input.toString();
+         |${inputStringLengthCode(inputString, length)}
+         |${CodeGenerator.JAVA_INT} $endOfMask = $charCount > $length ? $length : $charCount;
+         |${CodeGenerator.JAVA_INT} $offset = 0;
+         |StringBuilder $sb = new StringBuilder($length);
+         |${appendMaskedToStringBuilderCode(ctx, sb, inputString, offset, endOfMask)}
+         |${appendUnchangedToStringBuilderCode(
+              ctx, sb, inputString, offset, s"$length - $endOfMask")}
+         |${ev.value} = UTF8String.fromString($sb.toString());
+         |""".stripMargin
+    })
+  }
+
+  override def dataType: DataType = StringType
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+
+  override def prettyName: String = "mask_first_n"
+}
+
+/**
+ * Masks the last N chars of the input string. N defaults to 4. Additional parameters can be set
+ * to change the masking chars for uppercase letters, lowercase letters and digits.
+ */
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+  usage = "_FUNC_(str[, n[, upper[, lower[, digit]]]]) - Masks the last n values of str. By default, n is 4, upper case letters are converted to \"X\", lower case letters are converted to \"x\" and numbers are converted to \"n\". You can override the characters used in the mask by supplying additional arguments: the third argument controls the mask character for upper case letters, the fourth argument for lower case letters and the fifth argument for numbers.",
+  examples = """
+    Examples:
+      > SELECT _FUNC_("1234-5678-8765-4321", 4);
+       1234-5678-8765-nnnn
+  """, since = "2.4.0")
+// scalastyle:on line.size.limit
+case class MaskLastN(
+    child: Expression,
+    n: Int,
+    upper: String,
+    lower: String,
+    digit: String)
+  extends UnaryExpression with ExpectsInputTypes with MaskLikeWithN {
+
+  def this(child: Expression) =
+    this(child, defaultCharCount, null, null, null)
+
+  def this(child: Expression, n: Expression) =
+    this(child, extractCharCount(n), null, null, null)
+
+  def this(child: Expression, n: Expression, upper: Expression) =
+    this(child, extractCharCount(n), extractReplacement(upper), null, null)
+
+  def this(child: Expression, n: Expression, upper: Expression, lower: Expression) =
+    this(child, extractCharCount(n), extractReplacement(upper), extractReplacement(lower), null)
+
+  def this(
+      child: Expression,
+      n: Expression,
+      upper: Expression,
+      lower: Expression,
+      digit: Expression) =
+    this(child,
+      extractCharCount(n),
+      extractReplacement(upper),
+      extractReplacement(lower),
+      extractReplacement(digit))
+
+  override def nullSafeEval(input: Any): Any = {
+    val str = input.asInstanceOf[UTF8String].toString
+    val length = str.codePointCount(0, str.length())  // length in code points, not UTF-16 units
+    val startOfMask = if (charCount >= length) 0 else length - charCount  // first masked index
+    val sb = new java.lang.StringBuilder(length)
+    val offset = appendUnchangedToStringBuilder(sb, str, 0, startOfMask)
+    appendMaskedToStringBuilder(sb, str, offset, length - startOfMask)
+    UTF8String.fromString(sb.toString)
+  }
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    nullSafeCodeGen(ctx, ev, (input: String) => {
+      val sb = ctx.freshName("sb")
+      val length = ctx.freshName("length")
+      val offset = ctx.freshName("offset")
+      val inputString = ctx.freshName("inputString")
+      val startOfMask = ctx.freshName("startOfMask")
+      s"""
+         |String $inputString = $input.toString();
+         |${inputStringLengthCode(inputString, length)}
+         |${CodeGenerator.JAVA_INT} $startOfMask = $charCount >= $length ?
+         |  0 : $length - $charCount;
+         |${CodeGenerator.JAVA_INT} $offset = 0;
+         |StringBuilder $sb = new StringBuilder($length);
+         |${appendUnchangedToStringBuilderCode(ctx, sb, inputString, offset, startOfMask)}
+         |${appendMaskedToStringBuilderCode(
+              ctx, sb, inputString, offset, s"$length - $startOfMask")}
+         |${ev.value} = UTF8String.fromString($sb.toString());
+         |""".stripMargin
+    })
+  }
+
+  override def dataType: DataType = StringType
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+
+  override def prettyName: String = "mask_last_n"
+}
+
+/**
+ * Masks all but the first N chars of the input string. N defaults to 4. Additional parameters can
+ * be set to change the masking chars for uppercase letters, lowercase letters and digits.
+ */
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+  usage = "_FUNC_(str[, n[, upper[, lower[, digit]]]]) - Masks all but the first n values of str. By default, n is 4, upper case letters are converted to \"X\", lower case letters are converted to \"x\" and numbers are converted to \"n\". You can override the characters used in the mask by supplying additional arguments: the third argument controls the mask character for upper case letters, the fourth argument for lower case letters and the fifth argument for numbers.",
+  examples = """
+    Examples:
+      > SELECT _FUNC_("1234-5678-8765-4321", 4);
+       1234-nnnn-nnnn-nnnn
+  """, since = "2.4.0")
+// scalastyle:on line.size.limit
+case class MaskShowFirstN(
+    child: Expression,
+    n: Int,
+    upper: String,
+    lower: String,
+    digit: String)
+  extends UnaryExpression with ExpectsInputTypes with MaskLikeWithN {
+
+  def this(child: Expression) =
+    this(child, defaultCharCount, null, null, null)
+
+  def this(child: Expression, n: Expression) =
+    this(child, extractCharCount(n), null, null, null)
+
+  def this(child: Expression, n: Expression, upper: Expression) =
+    this(child, extractCharCount(n), extractReplacement(upper), null, null)
+
+  def this(child: Expression, n: Expression, upper: Expression, lower: Expression) =
+    this(child, extractCharCount(n), extractReplacement(upper), extractReplacement(lower), null)
+
+  def this(
+      child: Expression,
+      n: Expression,
+      upper: Expression,
+      lower: Expression,
+      digit: Expression) =
+    this(child,
+      extractCharCount(n),
+      extractReplacement(upper),
+      extractReplacement(lower),
+      extractReplacement(digit))
+
+  override def nullSafeEval(input: Any): Any = {
+    val str = input.asInstanceOf[UTF8String].toString
+    val length = str.codePointCount(0, str.length())  // length in code points, not UTF-16 units
+    val startOfMask = if (charCount > length) length else charCount  // clamp n to string length
+    val sb = new java.lang.StringBuilder(length)
+    val offset = appendUnchangedToStringBuilder(sb, str, 0, startOfMask)
+    appendMaskedToStringBuilder(sb, str, offset, length - startOfMask)
+    UTF8String.fromString(sb.toString)
+  }
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    nullSafeCodeGen(ctx, ev, (input: String) => {
+      val sb = ctx.freshName("sb")
+      val length = ctx.freshName("length")
+      val offset = ctx.freshName("offset")
+      val inputString = ctx.freshName("inputString")
+      val startOfMask = ctx.freshName("startOfMask")
+      s"""
+         |String $inputString = $input.toString();
+         |${inputStringLengthCode(inputString, length)}
+         |${CodeGenerator.JAVA_INT} $startOfMask = $charCount > $length ? $length : $charCount;
+         |${CodeGenerator.JAVA_INT} $offset = 0;
+         |StringBuilder $sb = new StringBuilder($length);
+         |${appendUnchangedToStringBuilderCode(ctx, sb, inputString, offset, startOfMask)}
+         |${appendMaskedToStringBuilderCode(
+              ctx, sb, inputString, offset, s"$length - $startOfMask")}
+         |${ev.value} = UTF8String.fromString($sb.toString());
+         |""".stripMargin
+    })
+  }
+
+  override def dataType: DataType = StringType
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+
+  override def prettyName: String = "mask_show_first_n"
+}
+
+/**
+ * Masks all but the last N chars of the input string. N defaults to 4. Additional parameters can
+ * be set to change the masking chars for uppercase letters, lowercase letters and digits.
+ */
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+  usage = "_FUNC_(str[, n[, upper[, lower[, digit]]]]) - Masks all but the last n values of str. By default, n is 4, upper case letters are converted to \"X\", lower case letters are converted to \"x\" and numbers are converted to \"n\". You can override the characters used in the mask by supplying additional arguments: the third argument controls the mask character for upper case letters, the fourth argument for lower case letters and the fifth argument for numbers.",
+  examples = """
+    Examples:
+      > SELECT _FUNC_("1234-5678-8765-4321", 4);
+       nnnn-nnnn-nnnn-4321
+  """, since = "2.4.0")
+// scalastyle:on line.size.limit
+case class MaskShowLastN(
+    child: Expression,
+    n: Int,
+    upper: String,
+    lower: String,
+    digit: String)
+  extends UnaryExpression with ExpectsInputTypes with MaskLikeWithN {
+
+  def this(child: Expression) =
+    this(child, defaultCharCount, null, null, null)
+
+  def this(child: Expression, n: Expression) =
+    this(child, extractCharCount(n), null, null, null)
+
+  def this(child: Expression, n: Expression, upper: Expression) =
+    this(child, extractCharCount(n), extractReplacement(upper), null, null)
+
+  def this(child: Expression, n: Expression, upper: Expression, lower: Expression) =
+    this(child, extractCharCount(n), extractReplacement(upper), extractReplacement(lower), null)
+
+  def this(
+      child: Expression,
+      n: Expression,
+      upper: Expression,
+      lower: Expression,
+      digit: Expression) =
+    this(child,
+      extractCharCount(n),
+      extractReplacement(upper),
+      extractReplacement(lower),
+      extractReplacement(digit))
+
+  override def nullSafeEval(input: Any): Any = {
+    val str = input.asInstanceOf[UTF8String].toString
+    val length = str.codePointCount(0, str.length())  // length in code points, not UTF-16 units
+    val endOfMask = if (charCount >= length) 0 else length - charCount  // masked prefix length
+    val sb = new java.lang.StringBuilder(length)
+    val offset = appendMaskedToStringBuilder(sb, str, 0, endOfMask)
+    appendUnchangedToStringBuilder(sb, str, offset, length - endOfMask)
+    UTF8String.fromString(sb.toString)
+  }
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    nullSafeCodeGen(ctx, ev, (input: String) => {
+      val sb = ctx.freshName("sb")
+      val length = ctx.freshName("length")
+      val offset = ctx.freshName("offset")
+      val inputString = ctx.freshName("inputString")
+      val endOfMask = ctx.freshName("endOfMask")
+      s"""
+         |String $inputString = $input.toString();
+         |${inputStringLengthCode(inputString, length)}
+         |${CodeGenerator.JAVA_INT} $endOfMask = $charCount >= $length ? 0 : $length - $charCount;
+         |${CodeGenerator.JAVA_INT} $offset = 0;
+         |StringBuilder $sb = new StringBuilder($length);
+         |${appendMaskedToStringBuilderCode(ctx, sb, inputString, offset, endOfMask)}
+         |${appendUnchangedToStringBuilderCode(
+              ctx, sb, inputString, offset, s"$length - $endOfMask")}
+         |${ev.value} = UTF8String.fromString($sb.toString());
+         |""".stripMargin
+    })
+  }
+
+  override def dataType: DataType = StringType
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+
+  override def prettyName: String = "mask_show_last_n"
+}
+
+/**
+ * Returns the MD5 digest of str as a hex string (computed with DigestUtils.md5Hex).
+ */
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+  usage = "_FUNC_(str) - Returns a hashed value based on str. The hash is consistent and can be used to join masked values together across tables.",
+  examples = """
+    Examples:
+      > SELECT _FUNC_("abcd-EFGH-8765-4321");
+       60c713f5ec6912229d2060df1c322776
+  """, since = "2.4.0")
+// scalastyle:on line.size.limit
+case class MaskHash(child: Expression)
+  extends UnaryExpression with ExpectsInputTypes {
+
+  override def nullSafeEval(input: Any): Any = {
+    UTF8String.fromString(DigestUtils.md5Hex(input.asInstanceOf[UTF8String].toString))
+  }
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    nullSafeCodeGen(ctx, ev, (input: String) => {
+      val digestUtilsClass = classOf[DigestUtils].getName.stripSuffix("$")
+      s"""
+         |${ev.value} = UTF8String.fromString($digestUtilsClass.md5Hex($input.toString()));
+         |""".stripMargin
+    })
+  }
+
+  override def dataType: DataType = StringType
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+
+  override def prettyName: String = "mask_hash"
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/1b36f148/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MaskExpressionsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MaskExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MaskExpressionsSuite.scala
new file mode 100644
index 0000000..4d69dc3
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MaskExpressionsSuite.scala
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.types.{IntegerType, StringType}
+
+class MaskExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
+
+  test("mask") {
+    val sample = Literal("abcd-EFGH-8765-4321")
+    val nullStr = Literal(null, StringType)
+
+    // Explicit replacement characters, then progressively fewer optional arguments.
+    checkEvaluation(Mask(sample, "U", "l", "#"), "llll-UUUU-####-####")
+    checkEvaluation(
+      new Mask(sample, Literal("U"), Literal("l"), Literal("#")), "llll-UUUU-####-####")
+    checkEvaluation(new Mask(sample, Literal("U"), Literal("l")), "llll-UUUU-nnnn-nnnn")
+    checkEvaluation(new Mask(sample, Literal("U")), "xxxx-UUUU-nnnn-nnnn")
+    checkEvaluation(new Mask(sample), "xxxx-XXXX-nnnn-nnnn")
+    // A null input yields null; null replacement arguments fall back to the defaults.
+    checkEvaluation(new Mask(nullStr), null)
+    checkEvaluation(Mask(sample, null, "l", "#"), "llll-XXXX-####-####")
+    checkEvaluation(new Mask(sample, nullStr, nullStr, nullStr), "xxxx-XXXX-nnnn-nnnn")
+    // Only the first character of a replacement is used; an empty one means the default.
+    checkEvaluation(new Mask(sample, Literal("Upper")), "xxxx-UUUU-nnnn-nnnn")
+    checkEvaluation(new Mask(Literal("")), "")
+    checkEvaluation(new Mask(sample, Literal("")), "xxxx-XXXX-nnnn-nnnn")
+    checkEvaluation(Mask(sample, "", "", ""), "xxxx-XXXX-nnnn-nnnn")
+    // scalastyle:off nonascii
+    checkEvaluation(Mask(Literal("Ul9U"), "\u2200", null, null), "\u2200xn\u2200")
+    checkEvaluation(new Mask(Literal("Hello World, こんにちは, 𠀋"), Literal("あ"), Literal("𡈽")),
+      "あ𡈽𡈽𡈽𡈽 あ𡈽𡈽𡈽𡈽, こんにちは, 𠀋")
+    // scalastyle:on nonascii
+    // A non-string replacement argument is rejected.
+    intercept[AnalysisException] {
+      checkEvaluation(new Mask(Literal(""), Literal(1)), "")
+    }
+  }
+
+  test("mask_first_n") {
+    val sample = Literal("abcd-EFGH-8765-4321")
+    val nullStr = Literal(null, StringType)
+
+    // Explicit replacement characters, then progressively fewer optional arguments.
+    checkEvaluation(MaskFirstN(Literal("aB3d-EFGH-8765"), 6, "U", "l", "#"), "lU#l-UFGH-8765")
+    checkEvaluation(
+      new MaskFirstN(sample, Literal(6), Literal("U"), Literal("l"), Literal("#")),
+      "llll-UFGH-8765-4321")
+    checkEvaluation(
+      new MaskFirstN(sample, Literal(6), Literal("U"), Literal("l")), "llll-UFGH-8765-4321")
+    checkEvaluation(new MaskFirstN(sample, Literal(6), Literal("U")), "xxxx-UFGH-8765-4321")
+    checkEvaluation(new MaskFirstN(sample, Literal(6)), "xxxx-XFGH-8765-4321")
+
+    // A string in place of the integer `n` is rejected.
+    intercept[AnalysisException] {
+      checkEvaluation(new MaskFirstN(sample, Literal("U")), "")
+    }
+
+    // With `n` omitted or null the first 4 characters are masked; a null input yields null.
+    checkEvaluation(new MaskFirstN(sample), "xxxx-EFGH-8765-4321")
+    checkEvaluation(new MaskFirstN(nullStr), null)
+    checkEvaluation(MaskFirstN(sample, 4, "U", "l", null), "llll-EFGH-8765-4321")
+    checkEvaluation(
+      new MaskFirstN(sample, Literal(null, IntegerType), nullStr, nullStr, nullStr),
+      "xxxx-EFGH-8765-4321")
+
+    // Only the first character of a replacement is used; an empty one means the default.
+    checkEvaluation(new MaskFirstN(sample, Literal(6), Literal("Upper")), "xxxx-UFGH-8765-4321")
+    checkEvaluation(new MaskFirstN(Literal("")), "")
+    checkEvaluation(new MaskFirstN(sample, Literal(4), Literal("")), "xxxx-EFGH-8765-4321")
+
+    // An `n` beyond the input length masks everything; a negative `n` masks nothing.
+    checkEvaluation(MaskFirstN(sample, 1000, "", "", ""), "xxxx-XXXX-nnnn-nnnn")
+    checkEvaluation(MaskFirstN(sample, -1, "", "", ""), "abcd-EFGH-8765-4321")
+    // scalastyle:off nonascii
+    checkEvaluation(MaskFirstN(Literal("Ul9U"), 2, "\u2200", null, null), "\u2200x9U")
+    checkEvaluation(new MaskFirstN(Literal("あ, 𠀋, Hello World"), Literal(10)),
+      "あ, 𠀋, Xxxxo World")
+    // scalastyle:on nonascii
+  }
+
+  test("mask_last_n") {
+    val shortInput = Literal("abcd-EFGH-8765")
+    val fullInput = Literal("abcd-EFGH-8765-4321")
+    val nullStr = Literal(null, StringType)
+
+    // Explicit replacement characters, then progressively fewer optional arguments.
+    checkEvaluation(MaskLastN(Literal("abcd-EFGH-aB3d"), 6, "U", "l", "#"), "abcd-EFGU-lU#l")
+    checkEvaluation(
+      new MaskLastN(shortInput, Literal(6), Literal("U"), Literal("l"), Literal("#")),
+      "abcd-EFGU-####")
+    checkEvaluation(
+      new MaskLastN(shortInput, Literal(6), Literal("U"), Literal("l")), "abcd-EFGU-nnnn")
+    checkEvaluation(new MaskLastN(shortInput, Literal(6), Literal("U")), "abcd-EFGU-nnnn")
+    checkEvaluation(new MaskLastN(shortInput, Literal(6)), "abcd-EFGX-nnnn")
+
+    // A string in place of the integer `n` is rejected.
+    intercept[AnalysisException] {
+      checkEvaluation(new MaskLastN(shortInput, Literal("U")), "")
+    }
+
+    // With `n` omitted or null the last 4 characters are masked; a null input yields null.
+    checkEvaluation(new MaskLastN(fullInput), "abcd-EFGH-8765-nnnn")
+    checkEvaluation(new MaskLastN(nullStr), null)
+    checkEvaluation(MaskLastN(fullInput, 4, "U", "l", null), "abcd-EFGH-8765-nnnn")
+    checkEvaluation(
+      new MaskLastN(fullInput, Literal(null, IntegerType), nullStr, nullStr, nullStr),
+      "abcd-EFGH-8765-nnnn")
+
+    // Only the first character of a replacement is used; an empty one means the default.
+    checkEvaluation(new MaskLastN(fullInput, Literal(12), Literal("Upper")), "abcd-EFUU-nnnn-nnnn")
+    checkEvaluation(new MaskLastN(Literal("")), "")
+    checkEvaluation(new MaskLastN(fullInput, Literal(16), Literal("")), "abcx-XXXX-nnnn-nnnn")
+
+    // An `n` beyond the input length masks everything; a negative `n` masks nothing.
+    checkEvaluation(MaskLastN(fullInput, 1000, "", "", ""), "xxxx-XXXX-nnnn-nnnn")
+    checkEvaluation(MaskLastN(fullInput, -1, "", "", ""), "abcd-EFGH-8765-4321")
+
+    // scalastyle:off nonascii
+    checkEvaluation(MaskLastN(Literal("Ul9U"), 2, "\u2200", null, null), "Uln\u2200")
+    checkEvaluation(new MaskLastN(Literal("あ, 𠀋, Hello World あ 𠀋"), Literal(10)),
+      "あ, 𠀋, Hello Xxxxx あ 𠀋")
+    // scalastyle:on nonascii
+  }
+
+  test("mask_show_first_n") {
+    val sample = Literal("abcd-EFGH-8765-4321")
+    val nullStr = Literal(null, StringType)
+    // Explicit replacement characters, then progressively fewer optional arguments.
+    checkEvaluation(MaskShowFirstN(Literal("abcd-EFGH-8765-aB3d"), 6, "U", "l", "#"),
+      "abcd-EUUU-####-lU#l")
+    checkEvaluation(
+      new MaskShowFirstN(sample, Literal(6), Literal("U"), Literal("l"), Literal("#")),
+      "abcd-EUUU-####-####")
+    checkEvaluation(
+      new MaskShowFirstN(sample, Literal(6), Literal("U"), Literal("l")), "abcd-EUUU-nnnn-nnnn")
+    checkEvaluation(new MaskShowFirstN(sample, Literal(6), Literal("U")), "abcd-EUUU-nnnn-nnnn")
+    checkEvaluation(new MaskShowFirstN(sample, Literal(6)), "abcd-EXXX-nnnn-nnnn")
+
+    // A string in place of the integer `n` is rejected.
+    intercept[AnalysisException] {
+      checkEvaluation(new MaskShowFirstN(sample, Literal("U")), "")
+    }
+
+    // With `n` omitted or null the first 4 characters stay visible; a null input yields null.
+    checkEvaluation(new MaskShowFirstN(sample), "abcd-XXXX-nnnn-nnnn")
+    checkEvaluation(new MaskShowFirstN(nullStr), null)
+    checkEvaluation(MaskShowFirstN(sample, 4, "U", "l", null), "abcd-UUUU-nnnn-nnnn")
+    checkEvaluation(
+      new MaskShowFirstN(sample, Literal(null, IntegerType), nullStr, nullStr, nullStr),
+      "abcd-XXXX-nnnn-nnnn")
+
+    // Only the first character of a replacement is used; an empty one means the default.
+    checkEvaluation(
+      new MaskShowFirstN(sample, Literal(6), Literal("Upper")), "abcd-EUUU-nnnn-nnnn")
+    checkEvaluation(new MaskShowFirstN(Literal("")), "")
+    checkEvaluation(new MaskShowFirstN(sample, Literal(4), Literal("")), "abcd-XXXX-nnnn-nnnn")
+
+    // An `n` beyond the input length shows everything; a negative `n` shows nothing.
+    checkEvaluation(MaskShowFirstN(sample, 1000, "", "", ""), "abcd-EFGH-8765-4321")
+    checkEvaluation(MaskShowFirstN(sample, -1, "", "", ""), "xxxx-XXXX-nnnn-nnnn")
+    // scalastyle:off nonascii
+    checkEvaluation(MaskShowFirstN(Literal("Ul9U"), 2, "\u2200", null, null), "Uln\u2200")
+    checkEvaluation(new MaskShowFirstN(Literal("あ, 𠀋, Hello World"), Literal(10)),
+      "あ, 𠀋, Hellx Xxxxx")
+    // scalastyle:on nonascii
+  }
+
+  test("mask_show_last_n") {
+    val sample = Literal("abcd-EFGH-8765-4321")
+    val nullStr = Literal(null, StringType)
+
+    // Explicit replacement characters, then progressively fewer optional arguments.
+    checkEvaluation(MaskShowLastN(Literal("aB3d-EFGH-8765"), 6, "U", "l", "#"), "lU#l-UUUH-8765")
+    checkEvaluation(
+      new MaskShowLastN(sample, Literal(6), Literal("U"), Literal("l"), Literal("#")),
+      "llll-UUUU-###5-4321")
+    checkEvaluation(
+      new MaskShowLastN(sample, Literal(6), Literal("U"), Literal("l")), "llll-UUUU-nnn5-4321")
+    checkEvaluation(new MaskShowLastN(sample, Literal(6), Literal("U")), "xxxx-UUUU-nnn5-4321")
+    checkEvaluation(new MaskShowLastN(sample, Literal(6)), "xxxx-XXXX-nnn5-4321")
+
+    // A string in place of the integer `n` is rejected.
+    intercept[AnalysisException] {
+      checkEvaluation(new MaskShowLastN(sample, Literal("U")), "")
+    }
+
+    // With `n` omitted or null the last 4 characters stay visible; a null input yields null.
+    checkEvaluation(new MaskShowLastN(sample), "xxxx-XXXX-nnnn-4321")
+    checkEvaluation(new MaskShowLastN(nullStr), null)
+    checkEvaluation(MaskShowLastN(sample, 4, "U", "l", null), "llll-UUUU-nnnn-4321")
+    checkEvaluation(
+      new MaskShowLastN(sample, Literal(null, IntegerType), nullStr, nullStr, nullStr),
+      "xxxx-XXXX-nnnn-4321")
+
+    // Only the first character of a replacement is used; an empty one means the default.
+    checkEvaluation(new MaskShowLastN(sample, Literal(6), Literal("Upper")), "xxxx-UUUU-nnn5-4321")
+    checkEvaluation(new MaskShowLastN(Literal("")), "")
+    checkEvaluation(new MaskShowLastN(sample, Literal(4), Literal("")), "xxxx-XXXX-nnnn-4321")
+
+    // An `n` beyond the input length shows everything; a negative `n` shows nothing.
+    checkEvaluation(MaskShowLastN(sample, 1000, "", "", ""), "abcd-EFGH-8765-4321")
+    checkEvaluation(MaskShowLastN(sample, -1, "", "", ""), "xxxx-XXXX-nnnn-nnnn")
+    // scalastyle:off nonascii
+    checkEvaluation(MaskShowLastN(Literal("Ul9U"), 2, "\u2200", null, null), "\u2200x9U")
+    checkEvaluation(new MaskShowLastN(Literal("あ, 𠀋, Hello World"), Literal(10)),
+      "あ, 𠀋, Xello World")
+    // scalastyle:on nonascii
+  }
+
+  test("mask_hash") {
+    // scalastyle:off nonascii
+    val md5ByInput = Seq("abcd-EFGH-8765-4321" -> "60c713f5ec6912229d2060df1c322776",
+      "" -> "d41d8cd98f00b204e9800998ecf8427e", "\u2200x9U" -> "f1243ef123d516b1f32a3a75309e5711")
+    // scalastyle:on nonascii
+    md5ByInput.foreach { case (str, md5) => checkEvaluation(MaskHash(Literal(str)), md5) }
+    checkEvaluation(MaskHash(Literal(null, StringType)), null)
+  }
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/1b36f148/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 5ab9cb3..443ba2a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -3499,6 +3499,125 @@ object functions {
    */
   def map_entries(e: Column): Column = withExpr { MapEntries(e.expr) }
 
+  //////////////////////////////////////////////////////////////////////////////////////////////
+  // Mask functions
+  //////////////////////////////////////////////////////////////////////////////////////////////
+  /**
+   * Returns the input with its letters and digits replaced by the default masking characters.
+   * @group mask_funcs
+   * @since 2.4.0
+   */
+  def mask(e: Column): Column = withExpr(new Mask(e.expr))
+
+  /**
+   * Returns the masked form of the input: upper case letters, lower case letters and digits are
+   * replaced by `upper`, `lower` and `digit` respectively.
+   * @group mask_funcs
+   * @since 2.4.0
+   */
+  def mask(e: Column, upper: String, lower: String, digit: String): Column = {
+    withExpr(Mask(e.expr, upper, lower, digit))
+  }
+
+  /**
+   * Masks the first `n` characters of the input with the default masking characters.
+   * @group mask_funcs
+   * @since 2.4.0
+   */
+  def mask_first_n(e: Column, n: Int): Column = withExpr(new MaskFirstN(e.expr, Literal(n)))
+
+  /**
+   * Masks the first `n` characters of the input, replacing upper case letters, lower case
+   * letters and digits with `upper`, `lower` and `digit` respectively.
+   * @group mask_funcs
+   * @since 2.4.0
+   */
+  def mask_first_n(
+      e: Column,
+      n: Int,
+      upper: String,
+      lower: String,
+      digit: String): Column = {
+    withExpr(MaskFirstN(e.expr, n, upper, lower, digit))
+  }
+
+  /**
+   * Masks the last `n` characters of the input with the default masking characters.
+   * @group mask_funcs
+   * @since 2.4.0
+   */
+  def mask_last_n(e: Column, n: Int): Column = withExpr(new MaskLastN(e.expr, Literal(n)))
+
+  /**
+   * Masks the last `n` characters of the input, replacing upper case letters, lower case
+   * letters and digits with `upper`, `lower` and `digit` respectively.
+   * @group mask_funcs
+   * @since 2.4.0
+   */
+  def mask_last_n(
+      e: Column,
+      n: Int,
+      upper: String,
+      lower: String,
+      digit: String): Column = {
+    withExpr(MaskLastN(e.expr, n, upper, lower, digit))
+  }
+
+  /**
+   * Masks all but the first `n` characters of the input with the default masking characters.
+   * @group mask_funcs
+   * @since 2.4.0
+   */
+  def mask_show_first_n(e: Column, n: Int): Column = {
+    withExpr(new MaskShowFirstN(e.expr, Literal(n)))
+  }
+
+  /**
+   * Masks all but the first `n` characters of the input, replacing upper case letters, lower case
+   * letters and digits with `upper`, `lower` and `digit` respectively.
+   * @group mask_funcs
+   * @since 2.4.0
+   */
+  def mask_show_first_n(
+      e: Column,
+      n: Int,
+      upper: String,
+      lower: String,
+      digit: String): Column = {
+    withExpr(MaskShowFirstN(e.expr, n, upper, lower, digit))
+  }
+
+  /**
+   * Masks all but the last `n` characters of the input with the default masking characters.
+   * @group mask_funcs
+   * @since 2.4.0
+   */
+  def mask_show_last_n(e: Column, n: Int): Column = {
+    withExpr(new MaskShowLastN(e.expr, Literal(n)))
+  }
+
+  /**
+   * Masks all but the last `n` characters of the input, replacing upper case letters, lower case
+   * letters and digits with `upper`, `lower` and `digit` respectively.
+   * @group mask_funcs
+   * @since 2.4.0
+   */
+  def mask_show_last_n(
+      e: Column,
+      n: Int,
+      upper: String,
+      lower: String,
+      digit: String): Column = {
+    withExpr(MaskShowLastN(e.expr, n, upper, lower, digit))
+  }
+
+  /**
+   * Returns the hex-encoded MD5 digest of the input string column.
+   * @group mask_funcs
+   * @since 2.4.0
+   */
+  def mask_hash(e: Column): Column = withExpr(MaskHash(e.expr))
+
   // scalastyle:off line.size.limit
   // scalastyle:off parameter.number
 

http://git-wip-us.apache.org/repos/asf/spark/blob/1b36f148/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 79e743d..cc8bad4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -276,6 +276,113 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext {
     )
   }
 
+  test("mask functions") {
+    val df = Seq("TestString-123", "", null).toDF("a")
+    // Expected single-column answer for `df`: the masked sample string, followed by the
+    // unchanged empty string and null.
+    def rows(masked: String): Seq[Row] = Seq(Row(masked), Row(""), Row(null))
+    // Expected rows of the five-column SQL projections for the empty and the null input.
+    val emptyRow = Row("", "", "", "", "")
+    val nullRow = Row(null, null, null, null, null)
+
+    // Column functions with the default replacement characters.
+    checkAnswer(df.select(mask($"a")), rows("XxxxXxxxxx-nnn"))
+    checkAnswer(df.select(mask_first_n($"a", 4)), rows("XxxxString-123"))
+    checkAnswer(df.select(mask_last_n($"a", 4)), rows("TestString-nnn"))
+    checkAnswer(df.select(mask_show_first_n($"a", 4)), rows("TestXxxxxx-nnn"))
+    checkAnswer(df.select(mask_show_last_n($"a", 4)), rows("XxxxXxxxxx-123"))
+    checkAnswer(
+      df.select(mask_hash($"a")),
+      Seq(Row("dd78d68ad1b23bde126812482dd70ac6"), Row("d41d8cd98f00b204e9800998ecf8427e"),
+        Row(null)))
+
+    // Column functions with explicit replacement characters.
+    checkAnswer(df.select(mask($"a", "U", "l", "#")), rows("UlllUlllll-###"))
+    checkAnswer(df.select(mask_first_n($"a", 4, "U", "l", "#")), rows("UlllString-123"))
+    checkAnswer(df.select(mask_last_n($"a", 4, "U", "l", "#")), rows("TestString-###"))
+    checkAnswer(df.select(mask_show_first_n($"a", 4, "U", "l", "#")), rows("TestUlllll-###"))
+    checkAnswer(df.select(mask_show_last_n($"a", 4, "U", "l", "#")), rows("UlllUlllll-123"))
+
+    // SQL `mask` with zero to three optional arguments; a column as replacement is rejected.
+    checkAnswer(
+      df.selectExpr("mask(a)", "mask(a, 'U')", "mask(a, 'U', 'l')", "mask(a, 'U', 'l', '#')"),
+      Seq(
+        Row("XxxxXxxxxx-nnn", "UxxxUxxxxx-nnn", "UlllUlllll-nnn", "UlllUlllll-###"),
+        Row("", "", "", ""),
+        Row(null, null, null, null)))
+    checkAnswer(sql("select mask(null)"), Row(null))
+    checkAnswer(sql("select mask('AAaa11', null, null, null)"), Row("XXxxnn"))
+    intercept[AnalysisException] {
+      checkAnswer(df.selectExpr("mask(a, a)"), rows("XxxxXxxxxx-nnn"))
+    }
+
+    // SQL `mask_first_n` with zero to four optional arguments; a column as `n` is rejected.
+    checkAnswer(
+      df.selectExpr(
+        "mask_first_n(a)",
+        "mask_first_n(a, 6)",
+        "mask_first_n(a, 6, 'U')",
+        "mask_first_n(a, 6, 'U', 'l')",
+        "mask_first_n(a, 6, 'U', 'l', '#')"),
+      Seq(Row("XxxxString-123", "XxxxXxring-123", "UxxxUxring-123", "UlllUlring-123",
+        "UlllUlring-123"), emptyRow, nullRow))
+    checkAnswer(sql("select mask_first_n(null)"), Row(null))
+    checkAnswer(sql("select mask_first_n('A1aA1a', null, null, null, null)"), Row("XnxX1a"))
+    intercept[AnalysisException] {
+      checkAnswer(spark.range(1).selectExpr("mask_first_n('A1aA1a', id)"), Row("XnxX1a"))
+    }
+
+    // SQL `mask_last_n` with zero to four optional arguments; a column as `n` is rejected.
+    checkAnswer(
+      df.selectExpr(
+        "mask_last_n(a)",
+        "mask_last_n(a, 6)",
+        "mask_last_n(a, 6, 'U')",
+        "mask_last_n(a, 6, 'U', 'l')",
+        "mask_last_n(a, 6, 'U', 'l', '#')"),
+      Seq(Row("TestString-nnn", "TestStrixx-nnn", "TestStrixx-nnn", "TestStrill-nnn",
+        "TestStrill-###"), emptyRow, nullRow))
+    checkAnswer(sql("select mask_last_n(null)"), Row(null))
+    checkAnswer(sql("select mask_last_n('A1aA1a', null, null, null, null)"), Row("A1xXnx"))
+    intercept[AnalysisException] {
+      checkAnswer(spark.range(1).selectExpr("mask_last_n('A1aA1a', id)"), Row("A1xXnx"))
+    }
+
+    // SQL `mask_show_first_n` with zero to four optional arguments; a column as `n` is rejected.
+    checkAnswer(
+      df.selectExpr(
+        "mask_show_first_n(a)",
+        "mask_show_first_n(a, 6)",
+        "mask_show_first_n(a, 6, 'U')",
+        "mask_show_first_n(a, 6, 'U', 'l')",
+        "mask_show_first_n(a, 6, 'U', 'l', '#')"),
+      Seq(Row("TestXxxxxx-nnn", "TestStxxxx-nnn", "TestStxxxx-nnn", "TestStllll-nnn",
+        "TestStllll-###"), emptyRow, nullRow))
+    checkAnswer(sql("select mask_show_first_n(null)"), Row(null))
+    checkAnswer(sql("select mask_show_first_n('A1aA1a', null, null, null, null)"), Row("A1aAnx"))
+    intercept[AnalysisException] {
+      checkAnswer(spark.range(1).selectExpr("mask_show_first_n('A1aA1a', id)"), Row("A1aAnx"))
+    }
+
+    // SQL `mask_show_last_n` with zero to four optional arguments; a column as `n` is rejected.
+    checkAnswer(
+      df.selectExpr(
+        "mask_show_last_n(a)",
+        "mask_show_last_n(a, 6)",
+        "mask_show_last_n(a, 6, 'U')",
+        "mask_show_last_n(a, 6, 'U', 'l')",
+        "mask_show_last_n(a, 6, 'U', 'l', '#')"),
+      Seq(Row("XxxxXxxxxx-123", "XxxxXxxxng-123", "UxxxUxxxng-123", "UlllUlllng-123",
+        "UlllUlllng-123"), emptyRow, nullRow))
+    checkAnswer(sql("select mask_show_last_n(null)"), Row(null))
+    checkAnswer(sql("select mask_show_last_n('A1aA1a', null, null, null, null)"), Row("XnaA1a"))
+    intercept[AnalysisException] {
+      checkAnswer(spark.range(1).selectExpr("mask_show_last_n('A1aA1a', id)"), Row("XnaA1a"))
+    }
+
+    checkAnswer(sql("select mask_hash(null)"), Row(null))
+  }
+
   test("sort_array/array_sort functions") {
     val df = Seq(
       (Array[Int](2, 1, 3), Array("b", "c", "a")),


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org