Posted to commits@spark.apache.org by pw...@apache.org on 2014/07/16 23:10:21 UTC
git commit: SPARK-1097: Do not introduce deadlock while fixing concurrency bug
Repository: spark
Updated Branches:
refs/heads/master 7c8d12322 -> 8867cd0bc
SPARK-1097: Do not introduce deadlock while fixing concurrency bug
We recently added a lock on 'conf' in order to prevent concurrent JobConf creation. However, it turns out that this can introduce a deadlock, because Hadoop also synchronizes on Configuration objects when creating new Configurations (and it does so via a static REGISTRY that contains all created Configurations).
This fix forces all Spark initialization of Configuration objects to occur serially by using a static lock that we control, so the concurrency bug stays fixed without introducing the deadlock.
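
For illustration only (the object and method names below are made up, and the exact Hadoop-internal lock ordering is an assumption based on the description above, not Hadoop's actual code), here is a minimal Scala sketch of the lock-ordering inversion and of the fix, with plain Objects standing in for Hadoop's static REGISTRY and a shared Configuration:

// Minimal sketch of the lock-ordering hazard; `registry` and `conf` are stand-ins,
// not Hadoop's real classes. Two threads acquiring the same two locks in opposite
// order can each hold one lock while waiting forever for the other.
object LockOrderingSketch {
  private val registry = new Object() // stands in for Hadoop's static Configuration REGISTRY
  private val conf     = new Object() // stands in for a shared Configuration instance

  // Old Spark path: lock the conf itself, then construction touches the registry.
  def oldSparkCopy(): Unit = conf.synchronized {
    registry.synchronized { /* new Configuration(conf) registers itself */ }
  }

  // Hypothetical Hadoop-side path that takes the same locks in the opposite order.
  def otherHadoopPath(): Unit = registry.synchronized {
    conf.synchronized { /* reads properties of an existing conf */ }
  }

  // Fixed Spark path: one lock that Spark owns serializes all Configuration creation,
  // so Spark never holds a Hadoop-owned monitor while waiting on another one.
  private val configurationInstantiationLock = new Object()
  def newSparkCopy(): Unit = configurationInstantiationLock.synchronized {
    /* new Configuration(conf) -- Hadoop's internal locking proceeds unobstructed */
  }
}

The key design point is that the replacement lock is owned entirely by Spark, so no Hadoop code path can ever acquire it in a different order relative to Hadoop's own monitors.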
Author: Aaron Davidson <aa...@databricks.com>
Closes #1409 from aarondav/1054 and squashes the following commits:
7d1b769 [Aaron Davidson] SPARK-1097: Do not introduce deadlock while fixing concurrency bug
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8867cd0b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8867cd0b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8867cd0b
Branch: refs/heads/master
Commit: 8867cd0bc2961fefed84901b8b14e9676ae6ab18
Parents: 7c8d123
Author: Aaron Davidson <aa...@databricks.com>
Authored: Wed Jul 16 14:10:17 2014 -0700
Committer: Patrick Wendell <pw...@gmail.com>
Committed: Wed Jul 16 14:10:17 2014 -0700
----------------------------------------------------------------------
core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/8867cd0b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
index 0410285..e521612 100644
--- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
@@ -140,8 +140,8 @@ class HadoopRDD[K, V](
       // Create a JobConf that will be cached and used across this RDD's getJobConf() calls in the
       // local process. The local cache is accessed through HadoopRDD.putCachedMetadata().
       // The caching helps minimize GC, since a JobConf can contain ~10KB of temporary objects.
-      // synchronize to prevent ConcurrentModificationException (Spark-1097, Hadoop-10456)
-      conf.synchronized {
+      // Synchronize to prevent ConcurrentModificationException (Spark-1097, Hadoop-10456).
+      HadoopRDD.CONFIGURATION_INSTANTIATION_LOCK.synchronized {
         val newJobConf = new JobConf(conf)
         initLocalJobConfFuncOpt.map(f => f(newJobConf))
         HadoopRDD.putCachedMetadata(jobConfCacheKey, newJobConf)
@@ -246,6 +246,9 @@ class HadoopRDD[K, V](
 }
 
 private[spark] object HadoopRDD {
+  /** Constructing Configuration objects is not threadsafe, use this lock to serialize. */
+  val CONFIGURATION_INSTANTIATION_LOCK = new Object()
+
   /**
    * The three methods below are helpers for accessing the local map, a property of the SparkEnv of
    * the local process.
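
As a usage-level sketch of the committed pattern (JobConfFactory and copy are illustrative names, not Spark API; only the single Spark-owned lock mirrors the patch), every Spark-side JobConf copy would funnel through that one lock:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.JobConf

// Illustrative factory, not Spark code: all Configuration/JobConf construction goes
// through one Spark-owned lock, so copies happen serially across all Spark threads.
object JobConfFactory {
  private val CONFIGURATION_INSTANTIATION_LOCK = new Object()

  def copy(base: Configuration): JobConf = CONFIGURATION_INSTANTIATION_LOCK.synchronized {
    new JobConf(base) // copy construction never overlaps with another Spark thread's copy
  }
}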