You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2020/05/18 23:55:47 UTC

[spark] branch branch-2.4 updated: [SPARK-25694][SQL] Add a config for `URL.setURLStreamHandlerFactory`

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-2.4 by this push:
     new 19cb475  [SPARK-25694][SQL] Add a config for `URL.setURLStreamHandlerFactory`
19cb475 is described below

commit 19cb475682923a7dc1f2e425452e8865520e2b4d
Author: Zhou Jiang <zh...@apple.com>
AuthorDate: Mon Nov 18 05:44:00 2019 +0000

    [SPARK-25694][SQL] Add a config for `URL.setURLStreamHandlerFactory`
    
    Add a property `spark.sql.defaultUrlStreamHandlerFactory.enabled` to allow users to turn off the default registration of `org.apache.hadoop.fs.FsUrlStreamHandlerFactory`.
    
    [SPARK-25694](https://issues.apache.org/jira/browse/SPARK-25694) is a long-standing issue. Originally, [[SPARK-12868][SQL] Allow adding jars from hdfs](https://github.com/apache/spark/pull/17342) added this registration for better Hive support. However, it has a side effect when users run Apache Spark without `-Phive`: because the JVM allows the URL stream handler factory to be set only once, an error is thrown when users try to install another custom factory or use a 3rd-party library that tries to set one. This configuration will unblock those non-Hive users.
    
    Yes. This provides a new user-configurable property.
    By default, the behavior is unchanged.
    
    Manual testing.
    
    **BEFORE**
    ```
    $ build/sbt package
    $ bin/spark-shell
    scala> sql("show tables").show
    +--------+---------+-----------+
    |database|tableName|isTemporary|
    +--------+---------+-----------+
    +--------+---------+-----------+
    
    scala> java.net.URL.setURLStreamHandlerFactory(new org.apache.hadoop.fs.FsUrlStreamHandlerFactory())
    java.lang.Error: factory already defined
      at java.net.URL.setURLStreamHandlerFactory(URL.java:1134)
      ... 47 elided
    ```
    
    **AFTER**
    ```
    $ build/sbt package
    $ bin/spark-shell --conf spark.sql.defaultUrlStreamHandlerFactory.enabled=false
    scala> sql("show tables").show
    +--------+---------+-----------+
    |database|tableName|isTemporary|
    +--------+---------+-----------+
    +--------+---------+-----------+
    
    scala> java.net.URL.setURLStreamHandlerFactory(new org.apache.hadoop.fs.FsUrlStreamHandlerFactory())
    ```
    
    Closes #26530 from jiangzho/master.
    
    Lead-authored-by: Zhou Jiang <zh...@apple.com>
    Co-authored-by: Dongjoon Hyun <dh...@apple.com>
    Co-authored-by: zhou-jiang <zh...@apple.com>
    Signed-off-by: DB Tsai <d_...@apple.com>
    (cherry picked from commit ee3bd6d76887ccc4961fd520c5d03f7edd3742ac)
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
 .../apache/spark/sql/internal/SharedState.scala    | 26 +++++++++++++++----
 .../apache/spark/sql/internal/config/package.scala | 29 ++++++++++++++++++++++
 2 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
index 4d2be13..f94c9e4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala
@@ -33,6 +33,8 @@ import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.execution.CacheManager
 import org.apache.spark.sql.execution.ui.{SQLAppStatusListener, SQLAppStatusStore, SQLTab}
 import org.apache.spark.sql.internal.StaticSQLConf._
+import org.apache.spark.sql.internal.config.DEFAULT_URL_STREAM_HANDLER_FACTORY_ENABLED
+import org.apache.spark.sql.streaming.StreamingQuery
 import org.apache.spark.status.ElementTrackingStore
 import org.apache.spark.util.{MutableURLClassLoader, Utils}
 
@@ -42,6 +44,8 @@ import org.apache.spark.util.{MutableURLClassLoader, Utils}
  */
 private[sql] class SharedState(val sparkContext: SparkContext) extends Logging {
 
+  SharedState.setFsUrlStreamHandlerFactory(sparkContext.conf)
+
   // Load hive-site.xml into hadoopConf and determine the warehouse path we want to use, based on
   // the config from both hive and Spark SQL. Finally set the warehouse config value to sparkConf.
   val warehousePath: String = {
@@ -156,11 +160,23 @@ private[sql] class SharedState(val sparkContext: SparkContext) extends Logging {
 }
 
 object SharedState extends Logging {
-  try {
-    URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory())
-  } catch {
-    case e: Error =>
-      logWarning("URL.setURLStreamHandlerFactory failed to set FsUrlStreamHandlerFactory")
+  @volatile private var fsUrlStreamHandlerFactoryInitialized = false
+
+  private def setFsUrlStreamHandlerFactory(conf: SparkConf): Unit = {
+    if (!fsUrlStreamHandlerFactoryInitialized &&
+        conf.get(DEFAULT_URL_STREAM_HANDLER_FACTORY_ENABLED)) {
+      synchronized {
+        if (!fsUrlStreamHandlerFactoryInitialized) {
+          try {
+            URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory())
+            fsUrlStreamHandlerFactoryInitialized = true
+          } catch {
+            case NonFatal(_) =>
+              logWarning("URL.setURLStreamHandlerFactory failed to set FsUrlStreamHandlerFactory")
+          }
+        }
+      }
+    }
   }
 
   private val HIVE_EXTERNAL_CATALOG_CLASS_NAME = "org.apache.spark.sql.hive.HiveExternalCatalog"
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/config/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/config/package.scala
new file mode 100644
index 0000000..e26c4aa
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/config/package.scala
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.internal
+
+import org.apache.spark.internal.config.ConfigBuilder
+
+package object config {
+
+  private[spark] val DEFAULT_URL_STREAM_HANDLER_FACTORY_ENABLED =
+    ConfigBuilder("spark.sql.defaultUrlStreamHandlerFactory.enabled")
+      .doc("When true, set FsUrlStreamHandlerFactory to support ADD JAR against HDFS locations")
+      .booleanConf
+      .createWithDefault(true)
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org