You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by va...@apache.org on 2018/05/18 23:25:38 UTC

spark git commit: [SPARK-23850][SQL][BRANCH-2.2] Add separate config for SQL options redaction.

Repository: spark
Updated Branches:
  refs/heads/branch-2.2 8c223b651 -> 6a55d8b03


[SPARK-23850][SQL][BRANCH-2.2] Add separate config for SQL options redaction.

The old code was relying on a core configuration and extended its
default value with patterns that also redacted wanted, non-sensitive
entries in the app's environment. Instead, add a SQL-specific config
for which options to redact, and apply both the core and SQL-specific
rules when redacting the options in the save command.

This is a little sub-optimal since it adds another config, but it
retains the current default behavior.

While there, I also fixed a typo and a couple of minor config API
usage issues in the related redaction option that SQL already had.

Tested with existing unit tests, plus checking the env page on
a shell UI.

(cherry picked from commit ed7ba7db8fa344ff182b72d23ae458e711f63432)

Author: Marcelo Vanzin <va...@cloudera.com>

Closes #21365 from vanzin/SPARK-23850-2.2.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6a55d8b0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6a55d8b0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6a55d8b0

Branch: refs/heads/branch-2.2
Commit: 6a55d8b03053e616dcacb79cd2c29a06d219dc32
Parents: 8c223b6
Author: Marcelo Vanzin <va...@cloudera.com>
Authored: Fri May 18 16:25:30 2018 -0700
Committer: Marcelo Vanzin <va...@cloudera.com>
Committed: Fri May 18 16:25:30 2018 -0700

----------------------------------------------------------------------
 .../apache/spark/internal/config/package.scala  |  2 +-
 .../scala/org/apache/spark/util/Utils.scala     | 11 ++++++++++
 .../org/apache/spark/sql/internal/SQLConf.scala | 21 ++++++++++++++++++++
 .../datasources/SaveIntoDataSourceCommand.scala |  8 ++++----
 .../SaveIntoDataSourceCommandSuite.scala        |  3 ---
 5 files changed, 37 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/6a55d8b0/core/src/main/scala/org/apache/spark/internal/config/package.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala
index db4c9f9..f65a9d7 100644
--- a/core/src/main/scala/org/apache/spark/internal/config/package.scala
+++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -247,7 +247,7 @@ package object config {
         "a property key or value, the value is redacted from the environment UI and various logs " +
         "like YARN and event logs.")
       .regexConf
-      .createWithDefault("(?i)secret|password|url|user|username".r)
+      .createWithDefault("(?i)secret|password".r)
 
   private[spark] val STRING_REDACTION_PATTERN =
     ConfigBuilder("spark.redaction.string.regex")

http://git-wip-us.apache.org/repos/asf/spark/blob/6a55d8b0/core/src/main/scala/org/apache/spark/util/Utils.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 6bcaf10..626b656 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -2625,6 +2625,17 @@ private[spark] object Utils extends Logging {
     }
   }
 
+  /**
+   * Redact the sensitive values in the given map. If a map key matches the redaction pattern then
+   * its value is replaced with a dummy text.
+   */
+  def redact(regex: Option[Regex], kvs: Seq[(String, String)]): Seq[(String, String)] = {
+    regex match {
+      case None => kvs
+      case Some(r) => redact(r, kvs)
+    }
+  }
+
   private def redact(redactionPattern: Regex, kvs: Seq[(String, String)]): Seq[(String, String)] = {
     // If the sensitive information regex matches with either the key or the value, redact the value
     // While the original intent was to only redact the value if the key matched with the regex,

http://git-wip-us.apache.org/repos/asf/spark/blob/6a55d8b0/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 4c29f8e..ebabd1a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -29,6 +29,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config._
 import org.apache.spark.network.util.ByteUnit
 import org.apache.spark.sql.catalyst.analysis.Resolver
+import org.apache.spark.util.Utils
 import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -819,6 +820,15 @@ object SQLConf {
       .intConf
       .createWithDefault(UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD.toInt)
 
+  val SQL_OPTIONS_REDACTION_PATTERN =
+    buildConf("spark.sql.redaction.options.regex")
+      .doc("Regex to decide which keys in a Spark SQL command's options map contain sensitive " +
+        "information. The values of options whose names that match this regex will be redacted " +
+        "in the explain output. This redaction is applied on top of the global redaction " +
+        s"configuration defined by ${SECRET_REDACTION_PATTERN.key}.")
+    .regexConf
+    .createWithDefault("(?i)url".r)
+
   object Deprecated {
     val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks"
   }
@@ -1182,6 +1192,17 @@ class SQLConf extends Serializable with Logging {
   }
 
   /**
+   * Redacts the given option map according to the description of SQL_OPTIONS_REDACTION_PATTERN.
+   */
+  def redactOptions(options: Map[String, String]): Map[String, String] = {
+    val regexes = Seq(
+      getConf(SQL_OPTIONS_REDACTION_PATTERN),
+      SECRET_REDACTION_PATTERN.readFrom(reader))
+
+    regexes.foldLeft(options.toSeq) { case (opts, r) => Utils.redact(Some(r), opts) }.toMap
+  }
+
+  /**
    * Return whether a given key is set in this [[SQLConf]].
    */
   def contains(key: String): Boolean = {

http://git-wip-us.apache.org/repos/asf/spark/blob/6a55d8b0/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommand.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommand.scala
index 53868d4..b92684c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommand.scala
@@ -17,12 +17,10 @@
 
 package org.apache.spark.sql.execution.datasources
 
-import org.apache.spark.SparkEnv
 import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}
 import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.command.RunnableCommand
-import org.apache.spark.util.Utils
 
 /**
  * Saves the results of `query` in to a data source.
@@ -53,7 +51,9 @@ case class SaveIntoDataSourceCommand(
   }
 
   override def simpleString: String = {
-    val redacted = Utils.redact(SparkEnv.get.conf, options.toSeq).toMap
-    s"SaveIntoDataSourceCommand ${provider}, ${partitionColumns}, ${redacted}, ${mode}"
+    val redacted = SparkSession.getActiveSession
+      .map(_.sessionState.conf.redactOptions(options))
+      .getOrElse(Map())
+    s"SaveIntoDataSourceCommand ${provider}, ${redacted}, ${mode}"
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/6a55d8b0/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala
index 6b9ddb1..cf340d0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala
@@ -23,9 +23,6 @@ import org.apache.spark.sql.test.SharedSQLContext
 
 class SaveIntoDataSourceCommandSuite extends SharedSQLContext {
 
-  override protected def sparkConf: SparkConf = super.sparkConf
-    .set("spark.redaction.regex", "(?i)password|url")
-
   test("simpleString is redacted") {
     val URL = "connection.url"
     val PASS = "123"


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org