You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by va...@apache.org on 2016/12/07 00:23:38 UTC
[01/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Repository: spark
Updated Branches:
refs/heads/master a8ced76f1 -> 81e5619ca
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala b/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala
deleted file mode 100644
index a58784f..0000000
--- a/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala
+++ /dev/null
@@ -1,372 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.network.yarn
-
-import java.io.{DataOutputStream, File, FileOutputStream, IOException}
-import java.nio.ByteBuffer
-import java.nio.file.Files
-import java.nio.file.attribute.PosixFilePermission._
-import java.util.EnumSet
-
-import scala.annotation.tailrec
-import scala.concurrent.duration._
-import scala.language.postfixOps
-
-import org.apache.hadoop.fs.Path
-import org.apache.hadoop.service.ServiceStateException
-import org.apache.hadoop.yarn.api.records.ApplicationId
-import org.apache.hadoop.yarn.conf.YarnConfiguration
-import org.apache.hadoop.yarn.server.api.{ApplicationInitializationContext, ApplicationTerminationContext}
-import org.scalatest.{BeforeAndAfterEach, Matchers}
-import org.scalatest.concurrent.Eventually._
-
-import org.apache.spark.SecurityManager
-import org.apache.spark.SparkFunSuite
-import org.apache.spark.network.shuffle.ShuffleTestAccessor
-import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo
-import org.apache.spark.util.Utils
-
-class YarnShuffleServiceSuite extends SparkFunSuite with Matchers with BeforeAndAfterEach {
- private[yarn] var yarnConfig: YarnConfiguration = null
- private[yarn] val SORT_MANAGER = "org.apache.spark.shuffle.sort.SortShuffleManager"
-
- override def beforeEach(): Unit = {
- super.beforeEach()
- yarnConfig = new YarnConfiguration()
- yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle")
- yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"),
- classOf[YarnShuffleService].getCanonicalName)
- yarnConfig.setInt("spark.shuffle.service.port", 0)
- yarnConfig.setBoolean(YarnShuffleService.STOP_ON_FAILURE_KEY, true)
- val localDir = Utils.createTempDir()
- yarnConfig.set(YarnConfiguration.NM_LOCAL_DIRS, localDir.getAbsolutePath)
- }
-
- var s1: YarnShuffleService = null
- var s2: YarnShuffleService = null
- var s3: YarnShuffleService = null
-
- override def afterEach(): Unit = {
- try {
- if (s1 != null) {
- s1.stop()
- s1 = null
- }
- if (s2 != null) {
- s2.stop()
- s2 = null
- }
- if (s3 != null) {
- s3.stop()
- s3 = null
- }
- } finally {
- super.afterEach()
- }
- }
-
- test("executor state kept across NM restart") {
- s1 = new YarnShuffleService
- // set auth to true to test the secrets recovery
- yarnConfig.setBoolean(SecurityManager.SPARK_AUTH_CONF, true)
- s1.init(yarnConfig)
- val app1Id = ApplicationId.newInstance(0, 1)
- val app1Data = makeAppInfo("user", app1Id)
- s1.initializeApplication(app1Data)
- val app2Id = ApplicationId.newInstance(0, 2)
- val app2Data = makeAppInfo("user", app2Id)
- s1.initializeApplication(app2Data)
-
- val execStateFile = s1.registeredExecutorFile
- execStateFile should not be (null)
- val secretsFile = s1.secretsFile
- secretsFile should not be (null)
- val shuffleInfo1 = new ExecutorShuffleInfo(Array("/foo", "/bar"), 3, SORT_MANAGER)
- val shuffleInfo2 = new ExecutorShuffleInfo(Array("/bippy"), 5, SORT_MANAGER)
-
- val blockHandler = s1.blockHandler
- val blockResolver = ShuffleTestAccessor.getBlockResolver(blockHandler)
- ShuffleTestAccessor.registeredExecutorFile(blockResolver) should be (execStateFile)
-
- blockResolver.registerExecutor(app1Id.toString, "exec-1", shuffleInfo1)
- blockResolver.registerExecutor(app2Id.toString, "exec-2", shuffleInfo2)
- ShuffleTestAccessor.getExecutorInfo(app1Id, "exec-1", blockResolver) should
- be (Some(shuffleInfo1))
- ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", blockResolver) should
- be (Some(shuffleInfo2))
-
- if (!execStateFile.exists()) {
- @tailrec def findExistingParent(file: File): File = {
- if (file == null) file
- else if (file.exists()) file
- else findExistingParent(file.getParentFile())
- }
- val existingParent = findExistingParent(execStateFile)
- assert(false, s"$execStateFile does not exist -- closest existing parent is $existingParent")
- }
- assert(execStateFile.exists(), s"$execStateFile did not exist")
-
- // now we pretend the shuffle service goes down, and comes back up
- s1.stop()
- s2 = new YarnShuffleService
- s2.init(yarnConfig)
- s2.secretsFile should be (secretsFile)
- s2.registeredExecutorFile should be (execStateFile)
-
- val handler2 = s2.blockHandler
- val resolver2 = ShuffleTestAccessor.getBlockResolver(handler2)
-
- // now we reinitialize only one of the apps, and expect yarn to tell us that app2 was stopped
- // during the restart
- s2.initializeApplication(app1Data)
- s2.stopApplication(new ApplicationTerminationContext(app2Id))
- ShuffleTestAccessor.getExecutorInfo(app1Id, "exec-1", resolver2) should be (Some(shuffleInfo1))
- ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver2) should be (None)
-
- // Act like the NM restarts one more time
- s2.stop()
- s3 = new YarnShuffleService
- s3.init(yarnConfig)
- s3.registeredExecutorFile should be (execStateFile)
- s3.secretsFile should be (secretsFile)
-
- val handler3 = s3.blockHandler
- val resolver3 = ShuffleTestAccessor.getBlockResolver(handler3)
-
- // app1 is still running
- s3.initializeApplication(app1Data)
- ShuffleTestAccessor.getExecutorInfo(app1Id, "exec-1", resolver3) should be (Some(shuffleInfo1))
- ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver3) should be (None)
- s3.stop()
- }
-
- test("removed applications should not be in registered executor file") {
- s1 = new YarnShuffleService
- yarnConfig.setBoolean(SecurityManager.SPARK_AUTH_CONF, false)
- s1.init(yarnConfig)
- val secretsFile = s1.secretsFile
- secretsFile should be (null)
- val app1Id = ApplicationId.newInstance(0, 1)
- val app1Data = makeAppInfo("user", app1Id)
- s1.initializeApplication(app1Data)
- val app2Id = ApplicationId.newInstance(0, 2)
- val app2Data = makeAppInfo("user", app2Id)
- s1.initializeApplication(app2Data)
-
- val execStateFile = s1.registeredExecutorFile
- execStateFile should not be (null)
- val shuffleInfo1 = new ExecutorShuffleInfo(Array("/foo", "/bar"), 3, SORT_MANAGER)
- val shuffleInfo2 = new ExecutorShuffleInfo(Array("/bippy"), 5, SORT_MANAGER)
-
- val blockHandler = s1.blockHandler
- val blockResolver = ShuffleTestAccessor.getBlockResolver(blockHandler)
- ShuffleTestAccessor.registeredExecutorFile(blockResolver) should be (execStateFile)
-
- blockResolver.registerExecutor(app1Id.toString, "exec-1", shuffleInfo1)
- blockResolver.registerExecutor(app2Id.toString, "exec-2", shuffleInfo2)
-
- val db = ShuffleTestAccessor.shuffleServiceLevelDB(blockResolver)
- ShuffleTestAccessor.reloadRegisteredExecutors(db) should not be empty
-
- s1.stopApplication(new ApplicationTerminationContext(app1Id))
- ShuffleTestAccessor.reloadRegisteredExecutors(db) should not be empty
- s1.stopApplication(new ApplicationTerminationContext(app2Id))
- ShuffleTestAccessor.reloadRegisteredExecutors(db) shouldBe empty
- }
-
- test("shuffle service should be robust to corrupt registered executor file") {
- s1 = new YarnShuffleService
- s1.init(yarnConfig)
- val app1Id = ApplicationId.newInstance(0, 1)
- val app1Data = makeAppInfo("user", app1Id)
- s1.initializeApplication(app1Data)
-
- val execStateFile = s1.registeredExecutorFile
- val shuffleInfo1 = new ExecutorShuffleInfo(Array("/foo", "/bar"), 3, SORT_MANAGER)
-
- val blockHandler = s1.blockHandler
- val blockResolver = ShuffleTestAccessor.getBlockResolver(blockHandler)
- ShuffleTestAccessor.registeredExecutorFile(blockResolver) should be (execStateFile)
-
- blockResolver.registerExecutor(app1Id.toString, "exec-1", shuffleInfo1)
-
- // now we pretend the shuffle service goes down, and comes back up. But we'll also
- // make a corrupt registeredExecutor File
- s1.stop()
-
- execStateFile.listFiles().foreach{_.delete()}
-
- val out = new DataOutputStream(new FileOutputStream(execStateFile + "/CURRENT"))
- out.writeInt(42)
- out.close()
-
- s2 = new YarnShuffleService
- s2.init(yarnConfig)
- s2.registeredExecutorFile should be (execStateFile)
-
- val handler2 = s2.blockHandler
- val resolver2 = ShuffleTestAccessor.getBlockResolver(handler2)
-
- // we re-initialize app1, but since the file was corrupt there is nothing we can do about it ...
- s2.initializeApplication(app1Data)
- // however, when we initialize a totally new app2, everything is still happy
- val app2Id = ApplicationId.newInstance(0, 2)
- val app2Data = makeAppInfo("user", app2Id)
- s2.initializeApplication(app2Data)
- val shuffleInfo2 = new ExecutorShuffleInfo(Array("/bippy"), 5, SORT_MANAGER)
- resolver2.registerExecutor(app2Id.toString, "exec-2", shuffleInfo2)
- ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver2) should be (Some(shuffleInfo2))
- s2.stop()
-
- // another stop & restart should be fine though (eg., we recover from previous corruption)
- s3 = new YarnShuffleService
- s3.init(yarnConfig)
- s3.registeredExecutorFile should be (execStateFile)
- val handler3 = s3.blockHandler
- val resolver3 = ShuffleTestAccessor.getBlockResolver(handler3)
-
- s3.initializeApplication(app2Data)
- ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver3) should be (Some(shuffleInfo2))
- s3.stop()
- }
-
- test("get correct recovery path") {
- // Test recovery path is set outside the shuffle service, this is to simulate NM recovery
- // enabled scenario, where recovery path will be set by yarn.
- s1 = new YarnShuffleService
- val recoveryPath = new Path(Utils.createTempDir().toURI)
- s1.setRecoveryPath(recoveryPath)
-
- s1.init(yarnConfig)
- s1._recoveryPath should be (recoveryPath)
- s1.stop()
-
- // Test recovery path is set inside the shuffle service, this will be happened when NM
- // recovery is not enabled or there's no NM recovery (Hadoop 2.5-).
- s2 = new YarnShuffleService
- s2.init(yarnConfig)
- s2._recoveryPath should be
- (new Path(yarnConfig.getTrimmedStrings("yarn.nodemanager.local-dirs")(0)))
- s2.stop()
- }
-
- test("moving recovery file from NM local dir to recovery path") {
- // This is to test when Hadoop is upgrade to 2.5+ and NM recovery is enabled, we should move
- // old recovery file to the new path to keep compatibility
-
- // Simulate s1 is running on old version of Hadoop in which recovery file is in the NM local
- // dir.
- s1 = new YarnShuffleService
- // set auth to true to test the secrets recovery
- yarnConfig.setBoolean(SecurityManager.SPARK_AUTH_CONF, true)
- s1.init(yarnConfig)
- val app1Id = ApplicationId.newInstance(0, 1)
- val app1Data = makeAppInfo("user", app1Id)
- s1.initializeApplication(app1Data)
- val app2Id = ApplicationId.newInstance(0, 2)
- val app2Data = makeAppInfo("user", app2Id)
- s1.initializeApplication(app2Data)
-
- assert(s1.secretManager.getSecretKey(app1Id.toString()) != null)
- assert(s1.secretManager.getSecretKey(app2Id.toString()) != null)
-
- val execStateFile = s1.registeredExecutorFile
- execStateFile should not be (null)
- val secretsFile = s1.secretsFile
- secretsFile should not be (null)
- val shuffleInfo1 = new ExecutorShuffleInfo(Array("/foo", "/bar"), 3, SORT_MANAGER)
- val shuffleInfo2 = new ExecutorShuffleInfo(Array("/bippy"), 5, SORT_MANAGER)
-
- val blockHandler = s1.blockHandler
- val blockResolver = ShuffleTestAccessor.getBlockResolver(blockHandler)
- ShuffleTestAccessor.registeredExecutorFile(blockResolver) should be (execStateFile)
-
- blockResolver.registerExecutor(app1Id.toString, "exec-1", shuffleInfo1)
- blockResolver.registerExecutor(app2Id.toString, "exec-2", shuffleInfo2)
- ShuffleTestAccessor.getExecutorInfo(app1Id, "exec-1", blockResolver) should
- be (Some(shuffleInfo1))
- ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", blockResolver) should
- be (Some(shuffleInfo2))
-
- assert(execStateFile.exists(), s"$execStateFile did not exist")
-
- s1.stop()
-
- // Simulate s2 is running on Hadoop 2.5+ with NM recovery is enabled.
- assert(execStateFile.exists())
- val recoveryPath = new Path(Utils.createTempDir().toURI)
- s2 = new YarnShuffleService
- s2.setRecoveryPath(recoveryPath)
- s2.init(yarnConfig)
-
- // Ensure that s2 has loaded known apps from the secrets db.
- assert(s2.secretManager.getSecretKey(app1Id.toString()) != null)
- assert(s2.secretManager.getSecretKey(app2Id.toString()) != null)
-
- val execStateFile2 = s2.registeredExecutorFile
- val secretsFile2 = s2.secretsFile
-
- recoveryPath.toString should be (new Path(execStateFile2.getParentFile.toURI).toString)
- recoveryPath.toString should be (new Path(secretsFile2.getParentFile.toURI).toString)
- eventually(timeout(10 seconds), interval(5 millis)) {
- assert(!execStateFile.exists())
- }
- eventually(timeout(10 seconds), interval(5 millis)) {
- assert(!secretsFile.exists())
- }
-
- val handler2 = s2.blockHandler
- val resolver2 = ShuffleTestAccessor.getBlockResolver(handler2)
-
- // now we reinitialize only one of the apps, and expect yarn to tell us that app2 was stopped
- // during the restart
- // Since recovery file is got from old path, so the previous state should be stored.
- s2.initializeApplication(app1Data)
- s2.stopApplication(new ApplicationTerminationContext(app2Id))
- ShuffleTestAccessor.getExecutorInfo(app1Id, "exec-1", resolver2) should be (Some(shuffleInfo1))
- ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver2) should be (None)
-
- s2.stop()
- }
-
- test("service throws error if cannot start") {
- // Set up a read-only local dir.
- val roDir = Utils.createTempDir()
- Files.setPosixFilePermissions(roDir.toPath(), EnumSet.of(OWNER_READ, OWNER_EXECUTE))
- yarnConfig.set(YarnConfiguration.NM_LOCAL_DIRS, roDir.getAbsolutePath())
-
- // Try to start the shuffle service, it should fail.
- val service = new YarnShuffleService()
-
- try {
- val error = intercept[ServiceStateException] {
- service.init(yarnConfig)
- }
- assert(error.getCause().isInstanceOf[IOException])
- } finally {
- service.stop()
- Files.setPosixFilePermissions(roDir.toPath(),
- EnumSet.of(OWNER_READ, OWNER_WRITE, OWNER_EXECUTE))
- }
- }
-
- private def makeAppInfo(user: String, appId: ApplicationId): ApplicationInitializationContext = {
- val secret = ByteBuffer.wrap(new Array[Byte](0))
- new ApplicationInitializationContext(user, appId, secret)
- }
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/network/yarn/YarnTestAccessor.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/network/yarn/YarnTestAccessor.scala b/yarn/src/test/scala/org/apache/spark/network/yarn/YarnTestAccessor.scala
deleted file mode 100644
index db322cd..0000000
--- a/yarn/src/test/scala/org/apache/spark/network/yarn/YarnTestAccessor.scala
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.network.yarn
-
-import java.io.File
-
-/**
- * just a cheat to get package-visible members in tests
- */
-object YarnTestAccessor {
- def getShuffleServicePort: Int = {
- YarnShuffleService.boundPort
- }
-
- def getShuffleServiceInstance: YarnShuffleService = {
- YarnShuffleService.instance
- }
-
- def getRegisteredExecutorFile(service: YarnShuffleService): File = {
- service.registeredExecutorFile
- }
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/scheduler/cluster/ExtensionServiceIntegrationSuite.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/scheduler/cluster/ExtensionServiceIntegrationSuite.scala b/yarn/src/test/scala/org/apache/spark/scheduler/cluster/ExtensionServiceIntegrationSuite.scala
deleted file mode 100644
index 6ea7984..0000000
--- a/yarn/src/test/scala/org/apache/spark/scheduler/cluster/ExtensionServiceIntegrationSuite.scala
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster
-
-import org.scalatest.BeforeAndAfter
-
-import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
-import org.apache.spark.deploy.yarn.config._
-import org.apache.spark.internal.Logging
-
-/**
- * Test the integration with [[SchedulerExtensionServices]]
- */
-class ExtensionServiceIntegrationSuite extends SparkFunSuite
- with LocalSparkContext with BeforeAndAfter
- with Logging {
-
- val applicationId = new StubApplicationId(0, 1111L)
- val attemptId = new StubApplicationAttemptId(applicationId, 1)
-
- /*
- * Setup phase creates the spark context
- */
- before {
- val sparkConf = new SparkConf()
- sparkConf.set(SCHEDULER_SERVICES, Seq(classOf[SimpleExtensionService].getName()))
- sparkConf.setMaster("local").setAppName("ExtensionServiceIntegrationSuite")
- sc = new SparkContext(sparkConf)
- }
-
- test("Instantiate") {
- val services = new SchedulerExtensionServices()
- assertResult(Nil, "non-nil service list") {
- services.getServices
- }
- services.start(SchedulerExtensionServiceBinding(sc, applicationId))
- services.stop()
- }
-
- test("Contains SimpleExtensionService Service") {
- val services = new SchedulerExtensionServices()
- try {
- services.start(SchedulerExtensionServiceBinding(sc, applicationId))
- val serviceList = services.getServices
- assert(serviceList.nonEmpty, "empty service list")
- val (service :: Nil) = serviceList
- val simpleService = service.asInstanceOf[SimpleExtensionService]
- assert(simpleService.started.get, "service not started")
- services.stop()
- assert(!simpleService.started.get, "service not stopped")
- } finally {
- services.stop()
- }
- }
-}
-
-
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/scheduler/cluster/SimpleExtensionService.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/scheduler/cluster/SimpleExtensionService.scala b/yarn/src/test/scala/org/apache/spark/scheduler/cluster/SimpleExtensionService.scala
deleted file mode 100644
index 9b8c98c..0000000
--- a/yarn/src/test/scala/org/apache/spark/scheduler/cluster/SimpleExtensionService.scala
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster
-
-import java.util.concurrent.atomic.AtomicBoolean
-
-private[spark] class SimpleExtensionService extends SchedulerExtensionService {
-
- /** started flag; set in the `start()` call, stopped in `stop()`. */
- val started = new AtomicBoolean(false)
-
- override def start(binding: SchedulerExtensionServiceBinding): Unit = {
- started.set(true)
- }
-
- override def stop(): Unit = {
- started.set(false)
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/scheduler/cluster/StubApplicationAttemptId.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/scheduler/cluster/StubApplicationAttemptId.scala b/yarn/src/test/scala/org/apache/spark/scheduler/cluster/StubApplicationAttemptId.scala
deleted file mode 100644
index 4b57b95..0000000
--- a/yarn/src/test/scala/org/apache/spark/scheduler/cluster/StubApplicationAttemptId.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster
-
-import org.apache.hadoop.yarn.api.records.{ApplicationAttemptId, ApplicationId}
-
-/**
- * A stub application ID; can be set in constructor and/or updated later.
- * @param applicationId application ID
- * @param attempt an attempt counter
- */
-class StubApplicationAttemptId(var applicationId: ApplicationId, var attempt: Int)
- extends ApplicationAttemptId {
-
- override def setApplicationId(appID: ApplicationId): Unit = {
- applicationId = appID
- }
-
- override def getAttemptId: Int = {
- attempt
- }
-
- override def setAttemptId(attemptId: Int): Unit = {
- attempt = attemptId
- }
-
- override def getApplicationId: ApplicationId = {
- applicationId
- }
-
- override def build(): Unit = {
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/scheduler/cluster/StubApplicationId.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/scheduler/cluster/StubApplicationId.scala b/yarn/src/test/scala/org/apache/spark/scheduler/cluster/StubApplicationId.scala
deleted file mode 100644
index bffa0e0..0000000
--- a/yarn/src/test/scala/org/apache/spark/scheduler/cluster/StubApplicationId.scala
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster
-
-import org.apache.hadoop.yarn.api.records.ApplicationId
-
-/**
- * Simple Testing Application Id; ID and cluster timestamp are set in constructor
- * and cannot be updated.
- * @param id app id
- * @param clusterTimestamp timestamp
- */
-private[spark] class StubApplicationId(id: Int, clusterTimestamp: Long) extends ApplicationId {
- override def getId: Int = {
- id
- }
-
- override def getClusterTimestamp: Long = {
- clusterTimestamp
- }
-
- override def setId(id: Int): Unit = {}
-
- override def setClusterTimestamp(clusterTimestamp: Long): Unit = {}
-
- override def build(): Unit = {}
-}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org
[12/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Posted by va...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtilsSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtilsSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtilsSuite.scala
new file mode 100644
index 0000000..ec47ab1
--- /dev/null
+++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtilsSuite.scala
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import scala.collection.JavaConverters._
+import scala.language.reflectiveCalls
+
+import org.apache.mesos.Protos.{Resource, Value}
+import org.mockito.Mockito._
+import org.scalatest._
+import org.scalatest.mock.MockitoSugar
+
+import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
+import org.apache.spark.internal.config._
+
+class MesosSchedulerUtilsSuite extends SparkFunSuite with Matchers with MockitoSugar {
+
+ // scalastyle:off structural.type
+ // this is the documented way of generating fixtures in scalatest
+ def fixture: Object {val sc: SparkContext; val sparkConf: SparkConf} = new {
+ val sparkConf = new SparkConf
+ val sc = mock[SparkContext]
+ when(sc.conf).thenReturn(sparkConf)
+ }
+
+ private def createTestPortResource(range: (Long, Long), role: Option[String] = None): Resource = {
+ val rangeValue = Value.Range.newBuilder()
+ rangeValue.setBegin(range._1)
+ rangeValue.setEnd(range._2)
+ val builder = Resource.newBuilder()
+ .setName("ports")
+ .setType(Value.Type.RANGES)
+ .setRanges(Value.Ranges.newBuilder().addRange(rangeValue))
+
+ role.foreach { r => builder.setRole(r) }
+ builder.build()
+ }
+
+ private def rangesResourcesToTuple(resources: List[Resource]): List[(Long, Long)] = {
+ resources.flatMap{resource => resource.getRanges.getRangeList
+ .asScala.map(range => (range.getBegin, range.getEnd))}
+ }
+
+ def arePortsEqual(array1: Array[(Long, Long)], array2: Array[(Long, Long)])
+ : Boolean = {
+ array1.sortBy(identity).deep == array2.sortBy(identity).deep
+ }
+
+ def arePortsEqual(array1: Array[Long], array2: Array[Long])
+ : Boolean = {
+ array1.sortBy(identity).deep == array2.sortBy(identity).deep
+ }
+
+ def getRangesFromResources(resources: List[Resource]): List[(Long, Long)] = {
+ resources.flatMap{ resource =>
+ resource.getRanges.getRangeList.asScala.toList.map{
+ range => (range.getBegin, range.getEnd)}}
+ }
+
+ val utils = new MesosSchedulerUtils { }
+ // scalastyle:on structural.type
+
+ test("use at-least minimum overhead") {
+ val f = fixture
+ when(f.sc.executorMemory).thenReturn(512)
+ utils.executorMemory(f.sc) shouldBe 896
+ }
+
+ test("use overhead if it is greater than minimum value") {
+ val f = fixture
+ when(f.sc.executorMemory).thenReturn(4096)
+ utils.executorMemory(f.sc) shouldBe 4505
+ }
+
+ test("use spark.mesos.executor.memoryOverhead (if set)") {
+ val f = fixture
+ when(f.sc.executorMemory).thenReturn(1024)
+ f.sparkConf.set("spark.mesos.executor.memoryOverhead", "512")
+ utils.executorMemory(f.sc) shouldBe 1536
+ }
+
+ test("parse a non-empty constraint string correctly") {
+ val expectedMap = Map(
+ "os" -> Set("centos7"),
+ "zone" -> Set("us-east-1a", "us-east-1b")
+ )
+ utils.parseConstraintString("os:centos7;zone:us-east-1a,us-east-1b") should be (expectedMap)
+ }
+
+ test("parse an empty constraint string correctly") {
+ utils.parseConstraintString("") shouldBe Map()
+ }
+
+ test("throw an exception when the input is malformed") {
+ an[IllegalArgumentException] should be thrownBy
+ utils.parseConstraintString("os;zone:us-east")
+ }
+
+ test("empty values for attributes' constraints matches all values") {
+ val constraintsStr = "os:"
+ val parsedConstraints = utils.parseConstraintString(constraintsStr)
+
+ parsedConstraints shouldBe Map("os" -> Set())
+
+ val zoneSet = Value.Set.newBuilder().addItem("us-east-1a").addItem("us-east-1b").build()
+ val noOsOffer = Map("zone" -> zoneSet)
+ val centosOffer = Map("os" -> Value.Text.newBuilder().setValue("centos").build())
+ val ubuntuOffer = Map("os" -> Value.Text.newBuilder().setValue("ubuntu").build())
+
+ utils.matchesAttributeRequirements(parsedConstraints, noOsOffer) shouldBe false
+ utils.matchesAttributeRequirements(parsedConstraints, centosOffer) shouldBe true
+ utils.matchesAttributeRequirements(parsedConstraints, ubuntuOffer) shouldBe true
+ }
+
+ test("subset match is performed for set attributes") {
+ val supersetConstraint = Map(
+ "os" -> Value.Text.newBuilder().setValue("ubuntu").build(),
+ "zone" -> Value.Set.newBuilder()
+ .addItem("us-east-1a")
+ .addItem("us-east-1b")
+ .addItem("us-east-1c")
+ .build())
+
+ val zoneConstraintStr = "os:;zone:us-east-1a,us-east-1c"
+ val parsedConstraints = utils.parseConstraintString(zoneConstraintStr)
+
+ utils.matchesAttributeRequirements(parsedConstraints, supersetConstraint) shouldBe true
+ }
+
+ test("less than equal match is performed on scalar attributes") {
+ val offerAttribs = Map("gpus" -> Value.Scalar.newBuilder().setValue(3).build())
+
+ val ltConstraint = utils.parseConstraintString("gpus:2")
+ val eqConstraint = utils.parseConstraintString("gpus:3")
+ val gtConstraint = utils.parseConstraintString("gpus:4")
+
+ utils.matchesAttributeRequirements(ltConstraint, offerAttribs) shouldBe true
+ utils.matchesAttributeRequirements(eqConstraint, offerAttribs) shouldBe true
+ utils.matchesAttributeRequirements(gtConstraint, offerAttribs) shouldBe false
+ }
+
+ test("contains match is performed for range attributes") {
+ val offerAttribs = Map("ports" -> Value.Range.newBuilder().setBegin(7000).setEnd(8000).build())
+ val ltConstraint = utils.parseConstraintString("ports:6000")
+ val eqConstraint = utils.parseConstraintString("ports:7500")
+ val gtConstraint = utils.parseConstraintString("ports:8002")
+ val multiConstraint = utils.parseConstraintString("ports:5000,7500,8300")
+
+ utils.matchesAttributeRequirements(ltConstraint, offerAttribs) shouldBe false
+ utils.matchesAttributeRequirements(eqConstraint, offerAttribs) shouldBe true
+ utils.matchesAttributeRequirements(gtConstraint, offerAttribs) shouldBe false
+ utils.matchesAttributeRequirements(multiConstraint, offerAttribs) shouldBe true
+ }
+
+ test("equality match is performed for text attributes") {
+ val offerAttribs = Map("os" -> Value.Text.newBuilder().setValue("centos7").build())
+
+ val trueConstraint = utils.parseConstraintString("os:centos7")
+ val falseConstraint = utils.parseConstraintString("os:ubuntu")
+
+ utils.matchesAttributeRequirements(trueConstraint, offerAttribs) shouldBe true
+ utils.matchesAttributeRequirements(falseConstraint, offerAttribs) shouldBe false
+ }
+
+ test("Port reservation is done correctly with user specified ports only") {
+ val conf = new SparkConf()
+ conf.set("spark.executor.port", "3000" )
+ conf.set(BLOCK_MANAGER_PORT, 4000)
+ val portResource = createTestPortResource((3000, 5000), Some("my_role"))
+
+ val (resourcesLeft, resourcesToBeUsed) = utils
+ .partitionPortResources(List(3000, 4000), List(portResource))
+ resourcesToBeUsed.length shouldBe 2
+
+ val portsToUse = getRangesFromResources(resourcesToBeUsed).map{r => r._1}.toArray
+
+ portsToUse.length shouldBe 2
+ arePortsEqual(portsToUse, Array(3000L, 4000L)) shouldBe true
+
+ val portRangesToBeUsed = rangesResourcesToTuple(resourcesToBeUsed)
+
+ val expectedUSed = Array((3000L, 3000L), (4000L, 4000L))
+
+ arePortsEqual(portRangesToBeUsed.toArray, expectedUSed) shouldBe true
+ }
+
+ test("Port reservation is done correctly with some user specified ports (spark.executor.port)") {
+ val conf = new SparkConf()
+ conf.set("spark.executor.port", "3100" )
+ val portResource = createTestPortResource((3000, 5000), Some("my_role"))
+
+ val (resourcesLeft, resourcesToBeUsed) = utils
+ .partitionPortResources(List(3100), List(portResource))
+
+ val portsToUse = getRangesFromResources(resourcesToBeUsed).map{r => r._1}
+
+ portsToUse.length shouldBe 1
+ portsToUse.contains(3100) shouldBe true
+ }
+
+ test("Port reservation is done correctly with all random ports") {
+ val conf = new SparkConf()
+ val portResource = createTestPortResource((3000L, 5000L), Some("my_role"))
+
+ val (resourcesLeft, resourcesToBeUsed) = utils
+ .partitionPortResources(List(), List(portResource))
+ val portsToUse = getRangesFromResources(resourcesToBeUsed).map{r => r._1}
+
+ portsToUse.isEmpty shouldBe true
+ }
+
+ test("Port reservation is done correctly with user specified ports only - multiple ranges") {
+ val conf = new SparkConf()
+ conf.set("spark.executor.port", "2100" )
+ conf.set("spark.blockManager.port", "4000")
+ val portResourceList = List(createTestPortResource((3000, 5000), Some("my_role")),
+ createTestPortResource((2000, 2500), Some("other_role")))
+ val (resourcesLeft, resourcesToBeUsed) = utils
+ .partitionPortResources(List(2100, 4000), portResourceList)
+ val portsToUse = getRangesFromResources(resourcesToBeUsed).map{r => r._1}
+
+ portsToUse.length shouldBe 2
+ val portsRangesLeft = rangesResourcesToTuple(resourcesLeft)
+ val portRangesToBeUsed = rangesResourcesToTuple(resourcesToBeUsed)
+
+ val expectedUsed = Array((2100L, 2100L), (4000L, 4000L))
+
+ arePortsEqual(portsToUse.toArray, Array(2100L, 4000L)) shouldBe true
+ arePortsEqual(portRangesToBeUsed.toArray, expectedUsed) shouldBe true
+ }
+
+ test("Port reservation is done correctly with all random ports - multiple ranges") {
+ val conf = new SparkConf()
+ val portResourceList = List(createTestPortResource((3000, 5000), Some("my_role")),
+ createTestPortResource((2000, 2500), Some("other_role")))
+ val (resourcesLeft, resourcesToBeUsed) = utils
+ .partitionPortResources(List(), portResourceList)
+ val portsToUse = getRangesFromResources(resourcesToBeUsed).map{r => r._1}
+ portsToUse.isEmpty shouldBe true
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchDataSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchDataSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchDataSuite.scala
new file mode 100644
index 0000000..5a81bb3
--- /dev/null
+++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchDataSuite.scala
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import java.nio.ByteBuffer
+
+import org.apache.spark.SparkFunSuite
+
+class MesosTaskLaunchDataSuite extends SparkFunSuite {
+ test("serialize and deserialize data must be same") {
+ val serializedTask = ByteBuffer.allocate(40)
+ (Range(100, 110).map(serializedTask.putInt(_)))
+ serializedTask.rewind
+ val attemptNumber = 100
+ val byteString = MesosTaskLaunchData(serializedTask, attemptNumber).toByteString
+ serializedTask.rewind
+ val mesosTaskLaunchData = MesosTaskLaunchData.fromByteString(byteString)
+ assert(mesosTaskLaunchData.attemptNumber == attemptNumber)
+ assert(mesosTaskLaunchData.serializedTask.equals(serializedTask))
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala
new file mode 100644
index 0000000..7ebb294
--- /dev/null
+++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import java.util.Collections
+
+import scala.collection.JavaConverters._
+
+import org.apache.mesos.Protos._
+import org.apache.mesos.Protos.Value.{Range => MesosRange, Ranges, Scalar}
+import org.apache.mesos.SchedulerDriver
+import org.mockito.{ArgumentCaptor, Matchers}
+import org.mockito.Mockito._
+
+object Utils {
+ def createOffer(
+ offerId: String,
+ slaveId: String,
+ mem: Int,
+ cpus: Int,
+ ports: Option[(Long, Long)] = None,
+ gpus: Int = 0): Offer = {
+ val builder = Offer.newBuilder()
+ builder.addResourcesBuilder()
+ .setName("mem")
+ .setType(Value.Type.SCALAR)
+ .setScalar(Scalar.newBuilder().setValue(mem))
+ builder.addResourcesBuilder()
+ .setName("cpus")
+ .setType(Value.Type.SCALAR)
+ .setScalar(Scalar.newBuilder().setValue(cpus))
+ ports.foreach { resourcePorts =>
+ builder.addResourcesBuilder()
+ .setName("ports")
+ .setType(Value.Type.RANGES)
+ .setRanges(Ranges.newBuilder().addRange(MesosRange.newBuilder()
+ .setBegin(resourcePorts._1).setEnd(resourcePorts._2).build()))
+ }
+ if (gpus > 0) {
+ builder.addResourcesBuilder()
+ .setName("gpus")
+ .setType(Value.Type.SCALAR)
+ .setScalar(Scalar.newBuilder().setValue(gpus))
+ }
+ builder.setId(createOfferId(offerId))
+ .setFrameworkId(FrameworkID.newBuilder()
+ .setValue("f1"))
+ .setSlaveId(SlaveID.newBuilder().setValue(slaveId))
+ .setHostname(s"host${slaveId}")
+ .build()
+ }
+
+ def verifyTaskLaunched(driver: SchedulerDriver, offerId: String): List[TaskInfo] = {
+ val captor = ArgumentCaptor.forClass(classOf[java.util.Collection[TaskInfo]])
+ verify(driver, times(1)).launchTasks(
+ Matchers.eq(Collections.singleton(createOfferId(offerId))),
+ captor.capture())
+ captor.getValue.asScala.toList
+ }
+
+ def createOfferId(offerId: String): OfferID = {
+ OfferID.newBuilder().setValue(offerId).build()
+ }
+
+ def createSlaveId(slaveId: String): SlaveID = {
+ SlaveID.newBuilder().setValue(slaveId).build()
+ }
+
+ def createExecutorId(executorId: String): ExecutorID = {
+ ExecutorID.newBuilder().setValue(executorId).build()
+ }
+
+ def createTaskId(taskId: String): TaskID = {
+ TaskID.newBuilder().setValue(taskId).build()
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/pom.xml
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml
new file mode 100644
index 0000000..04b51dc
--- /dev/null
+++ b/resource-managers/yarn/pom.xml
@@ -0,0 +1,215 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.spark</groupId>
+ <artifactId>spark-parent_2.11</artifactId>
+ <version>2.2.0-SNAPSHOT</version>
+ <relativePath>../../pom.xml</relativePath>
+ </parent>
+
+ <artifactId>spark-yarn_2.11</artifactId>
+ <packaging>jar</packaging>
+ <name>Spark Project YARN</name>
+ <properties>
+ <sbt.project.name>yarn</sbt.project.name>
+ <jersey-1.version>1.9</jersey-1.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.spark</groupId>
+ <artifactId>spark-core_${scala.binary.version}</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.spark</groupId>
+ <artifactId>spark-network-yarn_${scala.binary.version}</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.spark</groupId>
+ <artifactId>spark-core_${scala.binary.version}</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.spark</groupId>
+ <artifactId>spark-tags_${scala.binary.version}</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-yarn-api</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-yarn-common</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-yarn-server-web-proxy</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-yarn-client</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-client</artifactId>
+ </dependency>
+
+ <!-- Explicit listing of transitive deps that are shaded. Otherwise, odd compiler crashes. -->
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-server</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-plus</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-util</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-http</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-servlet</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-servlets</artifactId>
+ </dependency>
+ <!-- End of shaded deps. -->
+
+ <!--
+ SPARK-10059: Explicitly add JSP dependencies for tests since the MiniYARN cluster needs them.
+ -->
+ <dependency>
+ <groupId>org.eclipse.jetty.orbit</groupId>
+ <artifactId>javax.servlet.jsp</artifactId>
+ <version>2.2.0.v201112011158</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty.orbit</groupId>
+ <artifactId>javax.servlet.jsp.jstl</artifactId>
+ <version>1.2.0.v201105211821</version>
+ <scope>test</scope>
+ </dependency>
+
+ <!--
+ See SPARK-3710. hadoop-yarn-server-tests in Hadoop 2.2 fails to pull some needed
+ dependencies, so they need to be added manually for the tests to work.
+ -->
+
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-yarn-server-tests</artifactId>
+ <classifier>tests</classifier>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.mockito</groupId>
+ <artifactId>mockito-core</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.mortbay.jetty</groupId>
+ <artifactId>jetty</artifactId>
+ <version>6.1.26</version>
+ <exclusions>
+ <exclusion>
+ <groupId>org.mortbay.jetty</groupId>
+ <artifactId>servlet-api</artifactId>
+ </exclusion>
+ </exclusions>
+ <scope>test</scope>
+ </dependency>
+
+ <!--
+ Jersey 1 dependencies only required for YARN integration testing. Creating a YARN cluster
+ in the JVM requires starting a Jersey 1-based web application.
+ -->
+ <dependency>
+ <groupId>com.sun.jersey</groupId>
+ <artifactId>jersey-core</artifactId>
+ <scope>test</scope>
+ <version>${jersey-1.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.sun.jersey</groupId>
+ <artifactId>jersey-json</artifactId>
+ <scope>test</scope>
+ <version>${jersey-1.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.sun.jersey</groupId>
+ <artifactId>jersey-server</artifactId>
+ <scope>test</scope>
+ <version>${jersey-1.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.sun.jersey.contribs</groupId>
+ <artifactId>jersey-guice</artifactId>
+ <scope>test</scope>
+ <version>${jersey-1.version}</version>
+ </dependency>
+
+ <!--
+ Testing Hive reflection needs hive on the test classpath only.
+ It doesn't need the spark hive modules, so the -Phive flag is not checked.
+ -->
+ <dependency>
+ <groupId>${hive.group}</groupId>
+ <artifactId>hive-exec</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${hive.group}</groupId>
+ <artifactId>hive-metastore</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.thrift</groupId>
+ <artifactId>libthrift</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.thrift</groupId>
+ <artifactId>libfb303</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
+ <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
+ </build>
+
+</project>
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/resources/META-INF/services/org.apache.spark.deploy.yarn.security.ServiceCredentialProvider
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/resources/META-INF/services/org.apache.spark.deploy.yarn.security.ServiceCredentialProvider b/resource-managers/yarn/src/main/resources/META-INF/services/org.apache.spark.deploy.yarn.security.ServiceCredentialProvider
new file mode 100644
index 0000000..22ead56
--- /dev/null
+++ b/resource-managers/yarn/src/main/resources/META-INF/services/org.apache.spark.deploy.yarn.security.ServiceCredentialProvider
@@ -0,0 +1,3 @@
+org.apache.spark.deploy.yarn.security.HDFSCredentialProvider
+org.apache.spark.deploy.yarn.security.HBaseCredentialProvider
+org.apache.spark.deploy.yarn.security.HiveCredentialProvider
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager b/resource-managers/yarn/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager
new file mode 100644
index 0000000..6e8a1eb
--- /dev/null
+++ b/resource-managers/yarn/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager
@@ -0,0 +1 @@
+org.apache.spark.scheduler.cluster.YarnClusterManager
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
new file mode 100644
index 0000000..0378ef4
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
@@ -0,0 +1,791 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import java.io.{File, IOException}
+import java.lang.reflect.InvocationTargetException
+import java.net.{Socket, URI, URL}
+import java.util.concurrent.{TimeoutException, TimeUnit}
+
+import scala.collection.mutable.HashMap
+import scala.concurrent.Promise
+import scala.concurrent.duration.Duration
+import scala.util.control.NonFatal
+
+import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.hadoop.yarn.api._
+import org.apache.hadoop.yarn.api.records._
+import org.apache.hadoop.yarn.conf.YarnConfiguration
+import org.apache.hadoop.yarn.util.{ConverterUtils, Records}
+
+import org.apache.spark._
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.deploy.history.HistoryServer
+import org.apache.spark.deploy.yarn.config._
+import org.apache.spark.deploy.yarn.security.{AMCredentialRenewer, ConfigurableCredentialManager}
+import org.apache.spark.internal.Logging
+import org.apache.spark.internal.config._
+import org.apache.spark.rpc._
+import org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend, YarnSchedulerBackend}
+import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._
+import org.apache.spark.util._
+
+/**
+ * Common application master functionality for Spark on Yarn.
+ */
+private[spark] class ApplicationMaster(
+ args: ApplicationMasterArguments,
+ client: YarnRMClient)
+ extends Logging {
+
+ // TODO: Currently, task to container is computed once (TaskSetManager) - which need not be
+ // optimal as more containers are available. Might need to handle this better.
+
+ private val sparkConf = new SparkConf()
+ private val yarnConf: YarnConfiguration = SparkHadoopUtil.get.newConfiguration(sparkConf)
+ .asInstanceOf[YarnConfiguration]
+ private val isClusterMode = args.userClass != null
+
+ // Default to twice the number of executors (twice the maximum number of executors if dynamic
+ // allocation is enabled), with a minimum of 3.
+
+ private val maxNumExecutorFailures = {
+ val effectiveNumExecutors =
+ if (Utils.isDynamicAllocationEnabled(sparkConf)) {
+ sparkConf.get(DYN_ALLOCATION_MAX_EXECUTORS)
+ } else {
+ sparkConf.get(EXECUTOR_INSTANCES).getOrElse(0)
+ }
+ // By default, effectiveNumExecutors is Int.MaxValue if dynamic allocation is enabled. We need
+ // avoid the integer overflow here.
+ val defaultMaxNumExecutorFailures = math.max(3,
+ if (effectiveNumExecutors > Int.MaxValue / 2) Int.MaxValue else (2 * effectiveNumExecutors))
+
+ sparkConf.get(MAX_EXECUTOR_FAILURES).getOrElse(defaultMaxNumExecutorFailures)
+ }
+
+ @volatile private var exitCode = 0
+ @volatile private var unregistered = false
+ @volatile private var finished = false
+ @volatile private var finalStatus = getDefaultFinalStatus
+ @volatile private var finalMsg: String = ""
+ @volatile private var userClassThread: Thread = _
+
+ @volatile private var reporterThread: Thread = _
+ @volatile private var allocator: YarnAllocator = _
+
+ // Lock for controlling the allocator (heartbeat) thread.
+ private val allocatorLock = new Object()
+
+ // Steady state heartbeat interval. We want to be reasonably responsive without causing too many
+ // requests to RM.
+ private val heartbeatInterval = {
+ // Ensure that progress is sent before YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS elapses.
+ val expiryInterval = yarnConf.getInt(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, 120000)
+ math.max(0, math.min(expiryInterval / 2, sparkConf.get(RM_HEARTBEAT_INTERVAL)))
+ }
+
+ // Initial wait interval before allocator poll, to allow for quicker ramp up when executors are
+ // being requested.
+ private val initialAllocationInterval = math.min(heartbeatInterval,
+ sparkConf.get(INITIAL_HEARTBEAT_INTERVAL))
+
+ // Next wait interval before allocator poll.
+ private var nextAllocationInterval = initialAllocationInterval
+
+ private var rpcEnv: RpcEnv = null
+ private var amEndpoint: RpcEndpointRef = _
+
+ // In cluster mode, used to tell the AM when the user's SparkContext has been initialized.
+ private val sparkContextPromise = Promise[SparkContext]()
+
+ private var credentialRenewer: AMCredentialRenewer = _
+
+ // Load the list of localized files set by the client. This is used when launching executors,
+ // and is loaded here so that these configs don't pollute the Web UI's environment page in
+ // cluster mode.
+ private val localResources = {
+ logInfo("Preparing Local resources")
+ val resources = HashMap[String, LocalResource]()
+
+ def setupDistributedCache(
+ file: String,
+ rtype: LocalResourceType,
+ timestamp: String,
+ size: String,
+ vis: String): Unit = {
+ val uri = new URI(file)
+ val amJarRsrc = Records.newRecord(classOf[LocalResource])
+ amJarRsrc.setType(rtype)
+ amJarRsrc.setVisibility(LocalResourceVisibility.valueOf(vis))
+ amJarRsrc.setResource(ConverterUtils.getYarnUrlFromURI(uri))
+ amJarRsrc.setTimestamp(timestamp.toLong)
+ amJarRsrc.setSize(size.toLong)
+
+ val fileName = Option(uri.getFragment()).getOrElse(new Path(uri).getName())
+ resources(fileName) = amJarRsrc
+ }
+
+ val distFiles = sparkConf.get(CACHED_FILES)
+ val fileSizes = sparkConf.get(CACHED_FILES_SIZES)
+ val timeStamps = sparkConf.get(CACHED_FILES_TIMESTAMPS)
+ val visibilities = sparkConf.get(CACHED_FILES_VISIBILITIES)
+ val resTypes = sparkConf.get(CACHED_FILES_TYPES)
+
+ for (i <- 0 to distFiles.size - 1) {
+ val resType = LocalResourceType.valueOf(resTypes(i))
+ setupDistributedCache(distFiles(i), resType, timeStamps(i).toString, fileSizes(i).toString,
+ visibilities(i))
+ }
+
+ // Distribute the conf archive to executors.
+ sparkConf.get(CACHED_CONF_ARCHIVE).foreach { path =>
+ val uri = new URI(path)
+ val fs = FileSystem.get(uri, yarnConf)
+ val status = fs.getFileStatus(new Path(uri))
+ // SPARK-16080: Make sure to use the correct name for the destination when distributing the
+ // conf archive to executors.
+ val destUri = new URI(uri.getScheme(), uri.getRawSchemeSpecificPart(),
+ Client.LOCALIZED_CONF_DIR)
+ setupDistributedCache(destUri.toString(), LocalResourceType.ARCHIVE,
+ status.getModificationTime().toString, status.getLen.toString,
+ LocalResourceVisibility.PRIVATE.name())
+ }
+
+ // Clean up the configuration so it doesn't show up in the Web UI (since it's really noisy).
+ CACHE_CONFIGS.foreach { e =>
+ sparkConf.remove(e)
+ sys.props.remove(e.key)
+ }
+
+ resources.toMap
+ }
+
+ def getAttemptId(): ApplicationAttemptId = {
+ client.getAttemptId()
+ }
+
+ final def run(): Int = {
+ try {
+ val appAttemptId = client.getAttemptId()
+
+ var attemptID: Option[String] = None
+
+ if (isClusterMode) {
+ // Set the web ui port to be ephemeral for yarn so we don't conflict with
+ // other spark processes running on the same box
+ System.setProperty("spark.ui.port", "0")
+
+ // Set the master and deploy mode property to match the requested mode.
+ System.setProperty("spark.master", "yarn")
+ System.setProperty("spark.submit.deployMode", "cluster")
+
+ // Set this internal configuration if it is running on cluster mode, this
+ // configuration will be checked in SparkContext to avoid misuse of yarn cluster mode.
+ System.setProperty("spark.yarn.app.id", appAttemptId.getApplicationId().toString())
+
+ attemptID = Option(appAttemptId.getAttemptId.toString)
+ }
+
+ new CallerContext(
+ "APPMASTER", sparkConf.get(APP_CALLER_CONTEXT),
+ Option(appAttemptId.getApplicationId.toString), attemptID).setCurrentContext()
+
+ logInfo("ApplicationAttemptId: " + appAttemptId)
+
+ val fs = FileSystem.get(yarnConf)
+
+ // This shutdown hook should run *after* the SparkContext is shut down.
+ val priority = ShutdownHookManager.SPARK_CONTEXT_SHUTDOWN_PRIORITY - 1
+ ShutdownHookManager.addShutdownHook(priority) { () =>
+ val maxAppAttempts = client.getMaxRegAttempts(sparkConf, yarnConf)
+ val isLastAttempt = client.getAttemptId().getAttemptId() >= maxAppAttempts
+
+ if (!finished) {
+ // The default state of ApplicationMaster is failed if it is invoked by shut down hook.
+ // This behavior is different compared to 1.x version.
+ // If user application is exited ahead of time by calling System.exit(N), here mark
+ // this application as failed with EXIT_EARLY. For a good shutdown, user shouldn't call
+ // System.exit(0) to terminate the application.
+ finish(finalStatus,
+ ApplicationMaster.EXIT_EARLY,
+ "Shutdown hook called before final status was reported.")
+ }
+
+ if (!unregistered) {
+ // we only want to unregister if we don't want the RM to retry
+ if (finalStatus == FinalApplicationStatus.SUCCEEDED || isLastAttempt) {
+ unregister(finalStatus, finalMsg)
+ cleanupStagingDir(fs)
+ }
+ }
+ }
+
+ // Call this to force generation of secret so it gets populated into the
+ // Hadoop UGI. This has to happen before the startUserApplication which does a
+ // doAs in order for the credentials to be passed on to the executor containers.
+ val securityMgr = new SecurityManager(sparkConf)
+
+ // If the credentials file config is present, we must periodically renew tokens. So create
+ // a new AMDelegationTokenRenewer
+ if (sparkConf.contains(CREDENTIALS_FILE_PATH.key)) {
+ // If a principal and keytab have been set, use that to create new credentials for executors
+ // periodically
+ credentialRenewer =
+ new ConfigurableCredentialManager(sparkConf, yarnConf).credentialRenewer()
+ credentialRenewer.scheduleLoginFromKeytab()
+ }
+
+ if (isClusterMode) {
+ runDriver(securityMgr)
+ } else {
+ runExecutorLauncher(securityMgr)
+ }
+ } catch {
+ case e: Exception =>
+ // catch everything else if not specifically handled
+ logError("Uncaught exception: ", e)
+ finish(FinalApplicationStatus.FAILED,
+ ApplicationMaster.EXIT_UNCAUGHT_EXCEPTION,
+ "Uncaught exception: " + e)
+ }
+ exitCode
+ }
+
+ /**
+ * Set the default final application status for client mode to UNDEFINED to handle
+ * if YARN HA restarts the application so that it properly retries. Set the final
+ * status to SUCCEEDED in cluster mode to handle if the user calls System.exit
+ * from the application code.
+ */
+ final def getDefaultFinalStatus(): FinalApplicationStatus = {
+ if (isClusterMode) {
+ FinalApplicationStatus.FAILED
+ } else {
+ FinalApplicationStatus.UNDEFINED
+ }
+ }
+
+ /**
+ * unregister is used to completely unregister the application from the ResourceManager.
+ * This means the ResourceManager will not retry the application attempt on your behalf if
+ * a failure occurred.
+ */
+ final def unregister(status: FinalApplicationStatus, diagnostics: String = null): Unit = {
+ synchronized {
+ if (!unregistered) {
+ logInfo(s"Unregistering ApplicationMaster with $status" +
+ Option(diagnostics).map(msg => s" (diag message: $msg)").getOrElse(""))
+ unregistered = true
+ client.unregister(status, Option(diagnostics).getOrElse(""))
+ }
+ }
+ }
+
+ final def finish(status: FinalApplicationStatus, code: Int, msg: String = null): Unit = {
+ synchronized {
+ if (!finished) {
+ val inShutdown = ShutdownHookManager.inShutdown()
+ logInfo(s"Final app status: $status, exitCode: $code" +
+ Option(msg).map(msg => s", (reason: $msg)").getOrElse(""))
+ exitCode = code
+ finalStatus = status
+ finalMsg = msg
+ finished = true
+ if (!inShutdown && Thread.currentThread() != reporterThread && reporterThread != null) {
+ logDebug("shutting down reporter thread")
+ reporterThread.interrupt()
+ }
+ if (!inShutdown && Thread.currentThread() != userClassThread && userClassThread != null) {
+ logDebug("shutting down user thread")
+ userClassThread.interrupt()
+ }
+ if (!inShutdown && credentialRenewer != null) {
+ credentialRenewer.stop()
+ credentialRenewer = null
+ }
+ }
+ }
+ }
+
+ private def sparkContextInitialized(sc: SparkContext) = {
+ sparkContextPromise.success(sc)
+ }
+
+ private def registerAM(
+ _sparkConf: SparkConf,
+ _rpcEnv: RpcEnv,
+ driverRef: RpcEndpointRef,
+ uiAddress: String,
+ securityMgr: SecurityManager) = {
+ val appId = client.getAttemptId().getApplicationId().toString()
+ val attemptId = client.getAttemptId().getAttemptId().toString()
+ val historyAddress =
+ _sparkConf.get(HISTORY_SERVER_ADDRESS)
+ .map { text => SparkHadoopUtil.get.substituteHadoopVariables(text, yarnConf) }
+ .map { address => s"${address}${HistoryServer.UI_PATH_PREFIX}/${appId}/${attemptId}" }
+ .getOrElse("")
+
+ val driverUrl = RpcEndpointAddress(
+ _sparkConf.get("spark.driver.host"),
+ _sparkConf.get("spark.driver.port").toInt,
+ CoarseGrainedSchedulerBackend.ENDPOINT_NAME).toString
+
+ // Before we initialize the allocator, let's log the information about how executors will
+ // be run up front, to avoid printing this out for every single executor being launched.
+ // Use placeholders for information that changes such as executor IDs.
+ logInfo {
+ val executorMemory = sparkConf.get(EXECUTOR_MEMORY).toInt
+ val executorCores = sparkConf.get(EXECUTOR_CORES)
+ val dummyRunner = new ExecutorRunnable(None, yarnConf, sparkConf, driverUrl, "<executorId>",
+ "<hostname>", executorMemory, executorCores, appId, securityMgr, localResources)
+ dummyRunner.launchContextDebugInfo()
+ }
+
+ allocator = client.register(driverUrl,
+ driverRef,
+ yarnConf,
+ _sparkConf,
+ uiAddress,
+ historyAddress,
+ securityMgr,
+ localResources)
+
+ allocator.allocateResources()
+ reporterThread = launchReporterThread()
+ }
+
+ /**
+ * Create an [[RpcEndpoint]] that communicates with the driver.
+ *
+ * In cluster mode, the AM and the driver belong to same process
+ * so the AMEndpoint need not monitor lifecycle of the driver.
+ *
+ * @return A reference to the driver's RPC endpoint.
+ */
+ private def runAMEndpoint(
+ host: String,
+ port: String,
+ isClusterMode: Boolean): RpcEndpointRef = {
+ val driverEndpoint = rpcEnv.setupEndpointRef(
+ RpcAddress(host, port.toInt),
+ YarnSchedulerBackend.ENDPOINT_NAME)
+ amEndpoint =
+ rpcEnv.setupEndpoint("YarnAM", new AMEndpoint(rpcEnv, driverEndpoint, isClusterMode))
+ driverEndpoint
+ }
+
+ private def runDriver(securityMgr: SecurityManager): Unit = {
+ addAmIpFilter()
+ userClassThread = startUserApplication()
+
+ // This a bit hacky, but we need to wait until the spark.driver.port property has
+ // been set by the Thread executing the user class.
+ logInfo("Waiting for spark context initialization...")
+ val totalWaitTime = sparkConf.get(AM_MAX_WAIT_TIME)
+ try {
+ val sc = ThreadUtils.awaitResult(sparkContextPromise.future,
+ Duration(totalWaitTime, TimeUnit.MILLISECONDS))
+ if (sc != null) {
+ rpcEnv = sc.env.rpcEnv
+ val driverRef = runAMEndpoint(
+ sc.getConf.get("spark.driver.host"),
+ sc.getConf.get("spark.driver.port"),
+ isClusterMode = true)
+ registerAM(sc.getConf, rpcEnv, driverRef, sc.ui.map(_.webUrl).getOrElse(""),
+ securityMgr)
+ } else {
+ // Sanity check; should never happen in normal operation, since sc should only be null
+ // if the user app did not create a SparkContext.
+ if (!finished) {
+ throw new IllegalStateException("SparkContext is null but app is still running!")
+ }
+ }
+ userClassThread.join()
+ } catch {
+ case e: SparkException if e.getCause().isInstanceOf[TimeoutException] =>
+ logError(
+ s"SparkContext did not initialize after waiting for $totalWaitTime ms. " +
+ "Please check earlier log output for errors. Failing the application.")
+ finish(FinalApplicationStatus.FAILED,
+ ApplicationMaster.EXIT_SC_NOT_INITED,
+ "Timed out waiting for SparkContext.")
+ }
+ }
+
+ private def runExecutorLauncher(securityMgr: SecurityManager): Unit = {
+ val port = sparkConf.get(AM_PORT)
+ rpcEnv = RpcEnv.create("sparkYarnAM", Utils.localHostName, port, sparkConf, securityMgr,
+ clientMode = true)
+ val driverRef = waitForSparkDriver()
+ addAmIpFilter()
+ registerAM(sparkConf, rpcEnv, driverRef, sparkConf.get("spark.driver.appUIAddress", ""),
+ securityMgr)
+
+ // In client mode the actor will stop the reporter thread.
+ reporterThread.join()
+ }
+
+ private def launchReporterThread(): Thread = {
+ // The number of failures in a row until Reporter thread give up
+ val reporterMaxFailures = sparkConf.get(MAX_REPORTER_THREAD_FAILURES)
+
+ val t = new Thread {
+ override def run() {
+ var failureCount = 0
+ while (!finished) {
+ try {
+ if (allocator.getNumExecutorsFailed >= maxNumExecutorFailures) {
+ finish(FinalApplicationStatus.FAILED,
+ ApplicationMaster.EXIT_MAX_EXECUTOR_FAILURES,
+ s"Max number of executor failures ($maxNumExecutorFailures) reached")
+ } else {
+ logDebug("Sending progress")
+ allocator.allocateResources()
+ }
+ failureCount = 0
+ } catch {
+ case i: InterruptedException =>
+ case e: Throwable =>
+ failureCount += 1
+ // this exception was introduced in hadoop 2.4 and this code would not compile
+ // with earlier versions if we refer it directly.
+ if ("org.apache.hadoop.yarn.exceptions.ApplicationAttemptNotFoundException" ==
+ e.getClass().getName()) {
+ logError("Exception from Reporter thread.", e)
+ finish(FinalApplicationStatus.FAILED, ApplicationMaster.EXIT_REPORTER_FAILURE,
+ e.getMessage)
+ } else if (!NonFatal(e) || failureCount >= reporterMaxFailures) {
+ finish(FinalApplicationStatus.FAILED,
+ ApplicationMaster.EXIT_REPORTER_FAILURE, "Exception was thrown " +
+ s"$failureCount time(s) from Reporter thread.")
+ } else {
+ logWarning(s"Reporter thread fails $failureCount time(s) in a row.", e)
+ }
+ }
+ try {
+ val numPendingAllocate = allocator.getPendingAllocate.size
+ var sleepStart = 0L
+ var sleepInterval = 200L // ms
+ allocatorLock.synchronized {
+ sleepInterval =
+ if (numPendingAllocate > 0 || allocator.getNumPendingLossReasonRequests > 0) {
+ val currentAllocationInterval =
+ math.min(heartbeatInterval, nextAllocationInterval)
+ nextAllocationInterval = currentAllocationInterval * 2 // avoid overflow
+ currentAllocationInterval
+ } else {
+ nextAllocationInterval = initialAllocationInterval
+ heartbeatInterval
+ }
+ sleepStart = System.currentTimeMillis()
+ allocatorLock.wait(sleepInterval)
+ }
+ val sleepDuration = System.currentTimeMillis() - sleepStart
+ if (sleepDuration < sleepInterval) {
+ // log when sleep is interrupted
+ logDebug(s"Number of pending allocations is $numPendingAllocate. " +
+ s"Slept for $sleepDuration/$sleepInterval ms.")
+ // if sleep was less than the minimum interval, sleep for the rest of it
+ val toSleep = math.max(0, initialAllocationInterval - sleepDuration)
+ if (toSleep > 0) {
+ logDebug(s"Going back to sleep for $toSleep ms")
+ // use Thread.sleep instead of allocatorLock.wait. there is no need to be woken up
+ // by the methods that signal allocatorLock because this is just finishing the min
+ // sleep interval, which should happen even if this is signalled again.
+ Thread.sleep(toSleep)
+ }
+ } else {
+ logDebug(s"Number of pending allocations is $numPendingAllocate. " +
+ s"Slept for $sleepDuration/$sleepInterval.")
+ }
+ } catch {
+ case e: InterruptedException =>
+ }
+ }
+ }
+ }
+ // setting to daemon status, though this is usually not a good idea.
+ t.setDaemon(true)
+ t.setName("Reporter")
+ t.start()
+ logInfo(s"Started progress reporter thread with (heartbeat : $heartbeatInterval, " +
+ s"initial allocation : $initialAllocationInterval) intervals")
+ t
+ }
+
+ /**
+ * Clean up the staging directory.
+ */
+ private def cleanupStagingDir(fs: FileSystem) {
+ var stagingDirPath: Path = null
+ try {
+ val preserveFiles = sparkConf.get(PRESERVE_STAGING_FILES)
+ if (!preserveFiles) {
+ stagingDirPath = new Path(System.getenv("SPARK_YARN_STAGING_DIR"))
+ if (stagingDirPath == null) {
+ logError("Staging directory is null")
+ return
+ }
+ logInfo("Deleting staging directory " + stagingDirPath)
+ fs.delete(stagingDirPath, true)
+ }
+ } catch {
+ case ioe: IOException =>
+ logError("Failed to cleanup staging dir " + stagingDirPath, ioe)
+ }
+ }
+
+ private def waitForSparkDriver(): RpcEndpointRef = {
+ logInfo("Waiting for Spark driver to be reachable.")
+ var driverUp = false
+ val hostport = args.userArgs(0)
+ val (driverHost, driverPort) = Utils.parseHostPort(hostport)
+
+ // Spark driver should already be up since it launched us, but we don't want to
+ // wait forever, so wait 100 seconds max to match the cluster mode setting.
+ val totalWaitTimeMs = sparkConf.get(AM_MAX_WAIT_TIME)
+ val deadline = System.currentTimeMillis + totalWaitTimeMs
+
+ while (!driverUp && !finished && System.currentTimeMillis < deadline) {
+ try {
+ val socket = new Socket(driverHost, driverPort)
+ socket.close()
+ logInfo("Driver now available: %s:%s".format(driverHost, driverPort))
+ driverUp = true
+ } catch {
+ case e: Exception =>
+ logError("Failed to connect to driver at %s:%s, retrying ...".
+ format(driverHost, driverPort))
+ Thread.sleep(100L)
+ }
+ }
+
+ if (!driverUp) {
+ throw new SparkException("Failed to connect to driver!")
+ }
+
+ sparkConf.set("spark.driver.host", driverHost)
+ sparkConf.set("spark.driver.port", driverPort.toString)
+
+ runAMEndpoint(driverHost, driverPort.toString, isClusterMode = false)
+ }
+
+ /** Add the Yarn IP filter that is required for properly securing the UI. */
+ private def addAmIpFilter() = {
+ val proxyBase = System.getenv(ApplicationConstants.APPLICATION_WEB_PROXY_BASE_ENV)
+ val amFilter = "org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter"
+ val params = client.getAmIpFilterParams(yarnConf, proxyBase)
+ if (isClusterMode) {
+ System.setProperty("spark.ui.filters", amFilter)
+ params.foreach { case (k, v) => System.setProperty(s"spark.$amFilter.param.$k", v) }
+ } else {
+ amEndpoint.send(AddWebUIFilter(amFilter, params.toMap, proxyBase))
+ }
+ }
+
+ /**
+ * Start the user class, which contains the spark driver, in a separate Thread.
+ * If the main routine exits cleanly or exits with System.exit(N) for any N
+ * we assume it was successful, for all other cases we assume failure.
+ *
+ * Returns the user thread that was started.
+ */
+ private def startUserApplication(): Thread = {
+ logInfo("Starting the user application in a separate Thread")
+
+ val classpath = Client.getUserClasspath(sparkConf)
+ val urls = classpath.map { entry =>
+ new URL("file:" + new File(entry.getPath()).getAbsolutePath())
+ }
+ val userClassLoader =
+ if (Client.isUserClassPathFirst(sparkConf, isDriver = true)) {
+ new ChildFirstURLClassLoader(urls, Utils.getContextOrSparkClassLoader)
+ } else {
+ new MutableURLClassLoader(urls, Utils.getContextOrSparkClassLoader)
+ }
+
+ var userArgs = args.userArgs
+ if (args.primaryPyFile != null && args.primaryPyFile.endsWith(".py")) {
+ // When running pyspark, the app is run using PythonRunner. The second argument is the list
+ // of files to add to PYTHONPATH, which Client.scala already handles, so it's empty.
+ userArgs = Seq(args.primaryPyFile, "") ++ userArgs
+ }
+ if (args.primaryRFile != null && args.primaryRFile.endsWith(".R")) {
+ // TODO(davies): add R dependencies here
+ }
+ val mainMethod = userClassLoader.loadClass(args.userClass)
+ .getMethod("main", classOf[Array[String]])
+
+ val userThread = new Thread {
+ override def run() {
+ try {
+ mainMethod.invoke(null, userArgs.toArray)
+ finish(FinalApplicationStatus.SUCCEEDED, ApplicationMaster.EXIT_SUCCESS)
+ logDebug("Done running users class")
+ } catch {
+ case e: InvocationTargetException =>
+ e.getCause match {
+ case _: InterruptedException =>
+ // Reporter thread can interrupt to stop user class
+ case SparkUserAppException(exitCode) =>
+ val msg = s"User application exited with status $exitCode"
+ logError(msg)
+ finish(FinalApplicationStatus.FAILED, exitCode, msg)
+ case cause: Throwable =>
+ logError("User class threw exception: " + cause, cause)
+ finish(FinalApplicationStatus.FAILED,
+ ApplicationMaster.EXIT_EXCEPTION_USER_CLASS,
+ "User class threw exception: " + cause)
+ }
+ sparkContextPromise.tryFailure(e.getCause())
+ } finally {
+ // Notify the thread waiting for the SparkContext, in case the application did not
+ // instantiate one. This will do nothing when the user code instantiates a SparkContext
+ // (with the correct master), or when the user code throws an exception (due to the
+ // tryFailure above).
+ sparkContextPromise.trySuccess(null)
+ }
+ }
+ }
+ userThread.setContextClassLoader(userClassLoader)
+ userThread.setName("Driver")
+ userThread.start()
+ userThread
+ }
+
+ private def resetAllocatorInterval(): Unit = allocatorLock.synchronized {
+ nextAllocationInterval = initialAllocationInterval
+ allocatorLock.notifyAll()
+ }
+
+ /**
+ * An [[RpcEndpoint]] that communicates with the driver's scheduler backend.
+ */
+ private class AMEndpoint(
+ override val rpcEnv: RpcEnv, driver: RpcEndpointRef, isClusterMode: Boolean)
+ extends RpcEndpoint with Logging {
+
+ override def onStart(): Unit = {
+ driver.send(RegisterClusterManager(self))
+ }
+
+ override def receive: PartialFunction[Any, Unit] = {
+ case x: AddWebUIFilter =>
+ logInfo(s"Add WebUI Filter. $x")
+ driver.send(x)
+ }
+
+ override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
+ case RequestExecutors(requestedTotal, localityAwareTasks, hostToLocalTaskCount) =>
+ Option(allocator) match {
+ case Some(a) =>
+ if (a.requestTotalExecutorsWithPreferredLocalities(requestedTotal,
+ localityAwareTasks, hostToLocalTaskCount)) {
+ resetAllocatorInterval()
+ }
+ context.reply(true)
+
+ case None =>
+ logWarning("Container allocator is not ready to request executors yet.")
+ context.reply(false)
+ }
+
+ case KillExecutors(executorIds) =>
+ logInfo(s"Driver requested to kill executor(s) ${executorIds.mkString(", ")}.")
+ Option(allocator) match {
+ case Some(a) => executorIds.foreach(a.killExecutor)
+ case None => logWarning("Container allocator is not ready to kill executors yet.")
+ }
+ context.reply(true)
+
+ case GetExecutorLossReason(eid) =>
+ Option(allocator) match {
+ case Some(a) =>
+ a.enqueueGetLossReasonRequest(eid, context)
+ resetAllocatorInterval()
+ case None =>
+ logWarning("Container allocator is not ready to find executor loss reasons yet.")
+ }
+ }
+
+ override def onDisconnected(remoteAddress: RpcAddress): Unit = {
+ // In cluster mode, do not rely on the disassociated event to exit
+ // This avoids potentially reporting incorrect exit codes if the driver fails
+ if (!isClusterMode) {
+ logInfo(s"Driver terminated or disconnected! Shutting down. $remoteAddress")
+ finish(FinalApplicationStatus.SUCCEEDED, ApplicationMaster.EXIT_SUCCESS)
+ }
+ }
+ }
+
+}
+
+object ApplicationMaster extends Logging {
+
+ // exit codes for different causes, no reason behind the values
+ private val EXIT_SUCCESS = 0
+ private val EXIT_UNCAUGHT_EXCEPTION = 10
+ private val EXIT_MAX_EXECUTOR_FAILURES = 11
+ private val EXIT_REPORTER_FAILURE = 12
+ private val EXIT_SC_NOT_INITED = 13
+ private val EXIT_SECURITY = 14
+ private val EXIT_EXCEPTION_USER_CLASS = 15
+ private val EXIT_EARLY = 16
+
+ private var master: ApplicationMaster = _
+
+ def main(args: Array[String]): Unit = {
+ SignalUtils.registerLogger(log)
+ val amArgs = new ApplicationMasterArguments(args)
+
+ // Load the properties file with the Spark configuration and set entries as system properties,
+ // so that user code run inside the AM also has access to them.
+ // Note: we must do this before SparkHadoopUtil instantiated
+ if (amArgs.propertiesFile != null) {
+ Utils.getPropertiesFromFile(amArgs.propertiesFile).foreach { case (k, v) =>
+ sys.props(k) = v
+ }
+ }
+ SparkHadoopUtil.get.runAsSparkUser { () =>
+ master = new ApplicationMaster(amArgs, new YarnRMClient)
+ System.exit(master.run())
+ }
+ }
+
+ private[spark] def sparkContextInitialized(sc: SparkContext): Unit = {
+ master.sparkContextInitialized(sc)
+ }
+
+ private[spark] def getAttemptId(): ApplicationAttemptId = {
+ master.getAttemptId
+ }
+
+}
+
+/**
+ * This object does not provide any special functionality. It exists so that it's easy to tell
+ * apart the client-mode AM from the cluster-mode AM when using tools such as ps or jps.
+ */
+object ExecutorLauncher {
+
+ def main(args: Array[String]): Unit = {
+ ApplicationMaster.main(args)
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala
new file mode 100644
index 0000000..5cdec87
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import scala.collection.mutable.ArrayBuffer
+
+import org.apache.spark.util.{IntParam, MemoryParam}
+
+class ApplicationMasterArguments(val args: Array[String]) {
+ var userJar: String = null
+ var userClass: String = null
+ var primaryPyFile: String = null
+ var primaryRFile: String = null
+ var userArgs: Seq[String] = Nil
+ var propertiesFile: String = null
+
+ parseArgs(args.toList)
+
+ private def parseArgs(inputArgs: List[String]): Unit = {
+ val userArgsBuffer = new ArrayBuffer[String]()
+
+ var args = inputArgs
+
+ while (!args.isEmpty) {
+ // --num-workers, --worker-memory, and --worker-cores are deprecated since 1.0,
+ // the properties with executor in their names are preferred.
+ args match {
+ case ("--jar") :: value :: tail =>
+ userJar = value
+ args = tail
+
+ case ("--class") :: value :: tail =>
+ userClass = value
+ args = tail
+
+ case ("--primary-py-file") :: value :: tail =>
+ primaryPyFile = value
+ args = tail
+
+ case ("--primary-r-file") :: value :: tail =>
+ primaryRFile = value
+ args = tail
+
+ case ("--arg") :: value :: tail =>
+ userArgsBuffer += value
+ args = tail
+
+ case ("--properties-file") :: value :: tail =>
+ propertiesFile = value
+ args = tail
+
+ case _ =>
+ printUsageAndExit(1, args)
+ }
+ }
+
+ if (primaryPyFile != null && primaryRFile != null) {
+ // scalastyle:off println
+ System.err.println("Cannot have primary-py-file and primary-r-file at the same time")
+ // scalastyle:on println
+ System.exit(-1)
+ }
+
+ userArgs = userArgsBuffer.toList
+ }
+
+ def printUsageAndExit(exitCode: Int, unknownParam: Any = null) {
+ // scalastyle:off println
+ if (unknownParam != null) {
+ System.err.println("Unknown/unsupported param " + unknownParam)
+ }
+ System.err.println("""
+ |Usage: org.apache.spark.deploy.yarn.ApplicationMaster [options]
+ |Options:
+ | --jar JAR_PATH Path to your application's JAR file
+ | --class CLASS_NAME Name of your application's main class
+ | --primary-py-file A main Python file
+ | --primary-r-file A main R file
+ | --arg ARG Argument to be passed to your application's main class.
+ | Multiple invocations are possible, each will be passed in order.
+ | --properties-file FILE Path to a custom Spark properties file.
+ """.stripMargin)
+ // scalastyle:on println
+ System.exit(exitCode)
+ }
+}
+
+object ApplicationMasterArguments {
+ val DEFAULT_NUMBER_EXECUTORS = 2
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org
[08/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Posted by va...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
new file mode 100644
index 0000000..9c3b18e
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
@@ -0,0 +1,241 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import java.io.{File, FileOutputStream, OutputStreamWriter}
+import java.nio.charset.StandardCharsets
+import java.util.Properties
+import java.util.concurrent.TimeUnit
+
+import scala.collection.JavaConverters._
+import scala.concurrent.duration._
+import scala.language.postfixOps
+
+import com.google.common.io.Files
+import org.apache.commons.lang3.SerializationUtils
+import org.apache.hadoop.yarn.conf.YarnConfiguration
+import org.apache.hadoop.yarn.server.MiniYARNCluster
+import org.scalatest.{BeforeAndAfterAll, Matchers}
+import org.scalatest.concurrent.Eventually._
+
+import org.apache.spark._
+import org.apache.spark.deploy.yarn.config._
+import org.apache.spark.internal.Logging
+import org.apache.spark.launcher._
+import org.apache.spark.util.Utils
+
+abstract class BaseYarnClusterSuite
+ extends SparkFunSuite with BeforeAndAfterAll with Matchers with Logging {
+
+ // log4j configuration for the YARN containers, so that their output is collected
+ // by YARN instead of trying to overwrite unit-tests.log.
+ protected val LOG4J_CONF = """
+ |log4j.rootCategory=DEBUG, console
+ |log4j.appender.console=org.apache.log4j.ConsoleAppender
+ |log4j.appender.console.target=System.err
+ |log4j.appender.console.layout=org.apache.log4j.PatternLayout
+ |log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+ |log4j.logger.org.apache.hadoop=WARN
+ |log4j.logger.org.eclipse.jetty=WARN
+ |log4j.logger.org.mortbay=WARN
+ |log4j.logger.org.spark_project.jetty=WARN
+ """.stripMargin
+
+ private var yarnCluster: MiniYARNCluster = _
+ protected var tempDir: File = _
+ private var fakeSparkJar: File = _
+ protected var hadoopConfDir: File = _
+ private var logConfDir: File = _
+
+ var oldSystemProperties: Properties = null
+
+ def newYarnConfig(): YarnConfiguration
+
+ override def beforeAll() {
+ super.beforeAll()
+ oldSystemProperties = SerializationUtils.clone(System.getProperties)
+
+ tempDir = Utils.createTempDir()
+ logConfDir = new File(tempDir, "log4j")
+ logConfDir.mkdir()
+ System.setProperty("SPARK_YARN_MODE", "true")
+
+ val logConfFile = new File(logConfDir, "log4j.properties")
+ Files.write(LOG4J_CONF, logConfFile, StandardCharsets.UTF_8)
+
+ // Disable the disk utilization check to avoid the test hanging when people's disks are
+ // getting full.
+ val yarnConf = newYarnConfig()
+ yarnConf.set("yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage",
+ "100.0")
+
+ yarnCluster = new MiniYARNCluster(getClass().getName(), 1, 1, 1)
+ yarnCluster.init(yarnConf)
+ yarnCluster.start()
+
+ // There's a race in MiniYARNCluster in which start() may return before the RM has updated
+ // its address in the configuration. You can see this in the logs by noticing that when
+ // MiniYARNCluster prints the address, it still has port "0" assigned, although later the
+ // test works sometimes:
+ //
+ // INFO MiniYARNCluster: MiniYARN ResourceManager address: blah:0
+ //
+ // That log message prints the contents of the RM_ADDRESS config variable. If you check it
+ // later on, it looks something like this:
+ //
+ // INFO YarnClusterSuite: RM address in configuration is blah:42631
+ //
+ // This hack loops for a bit waiting for the port to change, and fails the test if it hasn't
+ // done so in a timely manner (defined to be 10 seconds).
+ val config = yarnCluster.getConfig()
+ val deadline = System.currentTimeMillis() + TimeUnit.SECONDS.toMillis(10)
+ while (config.get(YarnConfiguration.RM_ADDRESS).split(":")(1) == "0") {
+ if (System.currentTimeMillis() > deadline) {
+ throw new IllegalStateException("Timed out waiting for RM to come up.")
+ }
+ logDebug("RM address still not set in configuration, waiting...")
+ TimeUnit.MILLISECONDS.sleep(100)
+ }
+
+ logInfo(s"RM address in configuration is ${config.get(YarnConfiguration.RM_ADDRESS)}")
+
+ fakeSparkJar = File.createTempFile("sparkJar", null, tempDir)
+ hadoopConfDir = new File(tempDir, Client.LOCALIZED_CONF_DIR)
+ assert(hadoopConfDir.mkdir())
+ File.createTempFile("token", ".txt", hadoopConfDir)
+ }
+
+ override def afterAll() {
+ try {
+ yarnCluster.stop()
+ } finally {
+ System.setProperties(oldSystemProperties)
+ super.afterAll()
+ }
+ }
+
+ protected def runSpark(
+ clientMode: Boolean,
+ klass: String,
+ appArgs: Seq[String] = Nil,
+ sparkArgs: Seq[(String, String)] = Nil,
+ extraClassPath: Seq[String] = Nil,
+ extraJars: Seq[String] = Nil,
+ extraConf: Map[String, String] = Map(),
+ extraEnv: Map[String, String] = Map()): SparkAppHandle.State = {
+ val deployMode = if (clientMode) "client" else "cluster"
+ val propsFile = createConfFile(extraClassPath = extraClassPath, extraConf = extraConf)
+ val env = Map("YARN_CONF_DIR" -> hadoopConfDir.getAbsolutePath()) ++ extraEnv
+
+ val launcher = new SparkLauncher(env.asJava)
+ if (klass.endsWith(".py")) {
+ launcher.setAppResource(klass)
+ } else {
+ launcher.setMainClass(klass)
+ launcher.setAppResource(fakeSparkJar.getAbsolutePath())
+ }
+ launcher.setSparkHome(sys.props("spark.test.home"))
+ .setMaster("yarn")
+ .setDeployMode(deployMode)
+ .setConf("spark.executor.instances", "1")
+ .setPropertiesFile(propsFile)
+ .addAppArgs(appArgs.toArray: _*)
+
+ sparkArgs.foreach { case (name, value) =>
+ if (value != null) {
+ launcher.addSparkArg(name, value)
+ } else {
+ launcher.addSparkArg(name)
+ }
+ }
+ extraJars.foreach(launcher.addJar)
+
+ val handle = launcher.startApplication()
+ try {
+ eventually(timeout(2 minutes), interval(1 second)) {
+ assert(handle.getState().isFinal())
+ }
+ } finally {
+ handle.kill()
+ }
+
+ handle.getState()
+ }
+
+ /**
+ * This is a workaround for an issue with yarn-cluster mode: the Client class will not provide
+ * any sort of error when the job process finishes successfully, but the job itself fails. So
+ * the tests enforce that something is written to a file after everything is ok to indicate
+ * that the job succeeded.
+ */
+ protected def checkResult(finalState: SparkAppHandle.State, result: File): Unit = {
+ checkResult(finalState, result, "success")
+ }
+
+ protected def checkResult(
+ finalState: SparkAppHandle.State,
+ result: File,
+ expected: String): Unit = {
+ finalState should be (SparkAppHandle.State.FINISHED)
+ val resultString = Files.toString(result, StandardCharsets.UTF_8)
+ resultString should be (expected)
+ }
+
+ protected def mainClassName(klass: Class[_]): String = {
+ klass.getName().stripSuffix("$")
+ }
+
+ protected def createConfFile(
+ extraClassPath: Seq[String] = Nil,
+ extraConf: Map[String, String] = Map()): String = {
+ val props = new Properties()
+ props.put(SPARK_JARS.key, "local:" + fakeSparkJar.getAbsolutePath())
+
+ val testClasspath = new TestClasspathBuilder()
+ .buildClassPath(
+ logConfDir.getAbsolutePath() +
+ File.pathSeparator +
+ extraClassPath.mkString(File.pathSeparator))
+ .asScala
+ .mkString(File.pathSeparator)
+
+ props.put("spark.driver.extraClassPath", testClasspath)
+ props.put("spark.executor.extraClassPath", testClasspath)
+
+ // SPARK-4267: make sure java options are propagated correctly.
+ props.setProperty("spark.driver.extraJavaOptions", "-Dfoo=\"one two three\"")
+ props.setProperty("spark.executor.extraJavaOptions", "-Dfoo=\"one two three\"")
+
+ yarnCluster.getConfig().asScala.foreach { e =>
+ props.setProperty("spark.hadoop." + e.getKey(), e.getValue())
+ }
+ sys.props.foreach { case (k, v) =>
+ if (k.startsWith("spark.")) {
+ props.setProperty(k, v)
+ }
+ }
+ extraConf.foreach { case (k, v) => props.setProperty(k, v) }
+
+ val propsFile = File.createTempFile("spark", ".properties", tempDir)
+ val writer = new OutputStreamWriter(new FileOutputStream(propsFile), StandardCharsets.UTF_8)
+ props.store(writer, "Spark properties.")
+ writer.close()
+ propsFile.getAbsolutePath()
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala
new file mode 100644
index 0000000..b696e08
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import java.net.URI
+
+import scala.collection.mutable.HashMap
+import scala.collection.mutable.Map
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.FileStatus
+import org.apache.hadoop.fs.FileSystem
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.yarn.api.records.LocalResource
+import org.apache.hadoop.yarn.api.records.LocalResourceType
+import org.apache.hadoop.yarn.api.records.LocalResourceVisibility
+import org.apache.hadoop.yarn.util.ConverterUtils
+import org.mockito.Mockito.when
+import org.scalatest.mock.MockitoSugar
+
+import org.apache.spark.{SparkConf, SparkFunSuite}
+import org.apache.spark.deploy.yarn.config._
+
+class ClientDistributedCacheManagerSuite extends SparkFunSuite with MockitoSugar {
+
+ class MockClientDistributedCacheManager extends ClientDistributedCacheManager {
+ override def getVisibility(conf: Configuration, uri: URI, statCache: Map[URI, FileStatus]):
+ LocalResourceVisibility = {
+ LocalResourceVisibility.PRIVATE
+ }
+ }
+
+ test("test getFileStatus empty") {
+ val distMgr = new ClientDistributedCacheManager()
+ val fs = mock[FileSystem]
+ val uri = new URI("/tmp/testing")
+ when(fs.getFileStatus(new Path(uri))).thenReturn(new FileStatus())
+ val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus]()
+ val stat = distMgr.getFileStatus(fs, uri, statCache)
+ assert(stat.getPath() === null)
+ }
+
+ test("test getFileStatus cached") {
+ val distMgr = new ClientDistributedCacheManager()
+ val fs = mock[FileSystem]
+ val uri = new URI("/tmp/testing")
+ val realFileStatus = new FileStatus(10, false, 1, 1024, 10, 10, null, "testOwner",
+ null, new Path("/tmp/testing"))
+ when(fs.getFileStatus(new Path(uri))).thenReturn(new FileStatus())
+ val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus](uri -> realFileStatus)
+ val stat = distMgr.getFileStatus(fs, uri, statCache)
+ assert(stat.getPath().toString() === "/tmp/testing")
+ }
+
+ test("test addResource") {
+ val distMgr = new MockClientDistributedCacheManager()
+ val fs = mock[FileSystem]
+ val conf = new Configuration()
+ val destPath = new Path("file:///foo.invalid.com:8080/tmp/testing")
+ val localResources = HashMap[String, LocalResource]()
+ val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus]()
+ when(fs.getFileStatus(destPath)).thenReturn(new FileStatus())
+
+ distMgr.addResource(fs, conf, destPath, localResources, LocalResourceType.FILE, "link",
+ statCache, false)
+ val resource = localResources("link")
+ assert(resource.getVisibility() === LocalResourceVisibility.PRIVATE)
+ assert(ConverterUtils.getPathFromYarnURL(resource.getResource()) === destPath)
+ assert(resource.getTimestamp() === 0)
+ assert(resource.getSize() === 0)
+ assert(resource.getType() === LocalResourceType.FILE)
+
+ val sparkConf = new SparkConf(false)
+ distMgr.updateConfiguration(sparkConf)
+ assert(sparkConf.get(CACHED_FILES) === Seq("file:/foo.invalid.com:8080/tmp/testing#link"))
+ assert(sparkConf.get(CACHED_FILES_TIMESTAMPS) === Seq(0L))
+ assert(sparkConf.get(CACHED_FILES_SIZES) === Seq(0L))
+ assert(sparkConf.get(CACHED_FILES_VISIBILITIES) === Seq(LocalResourceVisibility.PRIVATE.name()))
+ assert(sparkConf.get(CACHED_FILES_TYPES) === Seq(LocalResourceType.FILE.name()))
+
+ // add another one and verify both there and order correct
+ val realFileStatus = new FileStatus(20, false, 1, 1024, 10, 30, null, "testOwner",
+ null, new Path("/tmp/testing2"))
+ val destPath2 = new Path("file:///foo.invalid.com:8080/tmp/testing2")
+ when(fs.getFileStatus(destPath2)).thenReturn(realFileStatus)
+ distMgr.addResource(fs, conf, destPath2, localResources, LocalResourceType.FILE, "link2",
+ statCache, false)
+ val resource2 = localResources("link2")
+ assert(resource2.getVisibility() === LocalResourceVisibility.PRIVATE)
+ assert(ConverterUtils.getPathFromYarnURL(resource2.getResource()) === destPath2)
+ assert(resource2.getTimestamp() === 10)
+ assert(resource2.getSize() === 20)
+ assert(resource2.getType() === LocalResourceType.FILE)
+
+ val sparkConf2 = new SparkConf(false)
+ distMgr.updateConfiguration(sparkConf2)
+
+ val files = sparkConf2.get(CACHED_FILES)
+ val sizes = sparkConf2.get(CACHED_FILES_SIZES)
+ val timestamps = sparkConf2.get(CACHED_FILES_TIMESTAMPS)
+ val visibilities = sparkConf2.get(CACHED_FILES_VISIBILITIES)
+
+ assert(files(0) === "file:/foo.invalid.com:8080/tmp/testing#link")
+ assert(timestamps(0) === 0)
+ assert(sizes(0) === 0)
+ assert(visibilities(0) === LocalResourceVisibility.PRIVATE.name())
+
+ assert(files(1) === "file:/foo.invalid.com:8080/tmp/testing2#link2")
+ assert(timestamps(1) === 10)
+ assert(sizes(1) === 20)
+ assert(visibilities(1) === LocalResourceVisibility.PRIVATE.name())
+ }
+
+ test("test addResource link null") {
+ val distMgr = new MockClientDistributedCacheManager()
+ val fs = mock[FileSystem]
+ val conf = new Configuration()
+ val destPath = new Path("file:///foo.invalid.com:8080/tmp/testing")
+ val localResources = HashMap[String, LocalResource]()
+ val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus]()
+ when(fs.getFileStatus(destPath)).thenReturn(new FileStatus())
+
+ intercept[Exception] {
+ distMgr.addResource(fs, conf, destPath, localResources, LocalResourceType.FILE, null,
+ statCache, false)
+ }
+ assert(localResources.get("link") === None)
+ assert(localResources.size === 0)
+ }
+
+ test("test addResource appmaster only") {
+ val distMgr = new MockClientDistributedCacheManager()
+ val fs = mock[FileSystem]
+ val conf = new Configuration()
+ val destPath = new Path("file:///foo.invalid.com:8080/tmp/testing")
+ val localResources = HashMap[String, LocalResource]()
+ val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus]()
+ val realFileStatus = new FileStatus(20, false, 1, 1024, 10, 30, null, "testOwner",
+ null, new Path("/tmp/testing"))
+ when(fs.getFileStatus(destPath)).thenReturn(realFileStatus)
+
+ distMgr.addResource(fs, conf, destPath, localResources, LocalResourceType.ARCHIVE, "link",
+ statCache, true)
+ val resource = localResources("link")
+ assert(resource.getVisibility() === LocalResourceVisibility.PRIVATE)
+ assert(ConverterUtils.getPathFromYarnURL(resource.getResource()) === destPath)
+ assert(resource.getTimestamp() === 10)
+ assert(resource.getSize() === 20)
+ assert(resource.getType() === LocalResourceType.ARCHIVE)
+
+ val sparkConf = new SparkConf(false)
+ distMgr.updateConfiguration(sparkConf)
+ assert(sparkConf.get(CACHED_FILES) === Nil)
+ assert(sparkConf.get(CACHED_FILES_TIMESTAMPS) === Nil)
+ assert(sparkConf.get(CACHED_FILES_SIZES) === Nil)
+ assert(sparkConf.get(CACHED_FILES_VISIBILITIES) === Nil)
+ assert(sparkConf.get(CACHED_FILES_TYPES) === Nil)
+ }
+
+ test("test addResource archive") {
+ val distMgr = new MockClientDistributedCacheManager()
+ val fs = mock[FileSystem]
+ val conf = new Configuration()
+ val destPath = new Path("file:///foo.invalid.com:8080/tmp/testing")
+ val localResources = HashMap[String, LocalResource]()
+ val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus]()
+ val realFileStatus = new FileStatus(20, false, 1, 1024, 10, 30, null, "testOwner",
+ null, new Path("/tmp/testing"))
+ when(fs.getFileStatus(destPath)).thenReturn(realFileStatus)
+
+ distMgr.addResource(fs, conf, destPath, localResources, LocalResourceType.ARCHIVE, "link",
+ statCache, false)
+ val resource = localResources("link")
+ assert(resource.getVisibility() === LocalResourceVisibility.PRIVATE)
+ assert(ConverterUtils.getPathFromYarnURL(resource.getResource()) === destPath)
+ assert(resource.getTimestamp() === 10)
+ assert(resource.getSize() === 20)
+ assert(resource.getType() === LocalResourceType.ARCHIVE)
+
+ val sparkConf = new SparkConf(false)
+ distMgr.updateConfiguration(sparkConf)
+ assert(sparkConf.get(CACHED_FILES) === Seq("file:/foo.invalid.com:8080/tmp/testing#link"))
+ assert(sparkConf.get(CACHED_FILES_SIZES) === Seq(20L))
+ assert(sparkConf.get(CACHED_FILES_TIMESTAMPS) === Seq(10L))
+ assert(sparkConf.get(CACHED_FILES_VISIBILITIES) === Seq(LocalResourceVisibility.PRIVATE.name()))
+ assert(sparkConf.get(CACHED_FILES_TYPES) === Seq(LocalResourceType.ARCHIVE.name()))
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
new file mode 100644
index 0000000..7deaf0a
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
@@ -0,0 +1,462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import java.io.{File, FileInputStream, FileOutputStream}
+import java.net.URI
+import java.util.Properties
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable.{HashMap => MutableHashMap}
+import scala.reflect.ClassTag
+import scala.util.Try
+
+import org.apache.commons.lang3.SerializationUtils
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.mapreduce.MRJobConfig
+import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
+import org.apache.hadoop.yarn.api.protocolrecords.GetNewApplicationResponse
+import org.apache.hadoop.yarn.api.records._
+import org.apache.hadoop.yarn.client.api.YarnClientApplication
+import org.apache.hadoop.yarn.conf.YarnConfiguration
+import org.apache.hadoop.yarn.util.Records
+import org.mockito.Matchers.{eq => meq, _}
+import org.mockito.Mockito._
+import org.scalatest.{BeforeAndAfterAll, Matchers}
+
+import org.apache.spark.{SparkConf, SparkFunSuite, TestUtils}
+import org.apache.spark.deploy.yarn.config._
+import org.apache.spark.util.{ResetSystemProperties, SparkConfWithEnv, Utils}
+
+class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll
+ with ResetSystemProperties {
+
+ import Client._
+
+ var oldSystemProperties: Properties = null
+
+ override def beforeAll(): Unit = {
+ super.beforeAll()
+ oldSystemProperties = SerializationUtils.clone(System.getProperties)
+ System.setProperty("SPARK_YARN_MODE", "true")
+ }
+
+ override def afterAll(): Unit = {
+ try {
+ System.setProperties(oldSystemProperties)
+ oldSystemProperties = null
+ } finally {
+ super.afterAll()
+ }
+ }
+
+ test("default Yarn application classpath") {
+ getDefaultYarnApplicationClasspath should be(Some(Fixtures.knownDefYarnAppCP))
+ }
+
+ test("default MR application classpath") {
+ getDefaultMRApplicationClasspath should be(Some(Fixtures.knownDefMRAppCP))
+ }
+
+ test("resultant classpath for an application that defines a classpath for YARN") {
+ withAppConf(Fixtures.mapYARNAppConf) { conf =>
+ val env = newEnv
+ populateHadoopClasspath(conf, env)
+ classpath(env) should be(
+ flatten(Fixtures.knownYARNAppCP, getDefaultMRApplicationClasspath))
+ }
+ }
+
+ test("resultant classpath for an application that defines a classpath for MR") {
+ withAppConf(Fixtures.mapMRAppConf) { conf =>
+ val env = newEnv
+ populateHadoopClasspath(conf, env)
+ classpath(env) should be(
+ flatten(getDefaultYarnApplicationClasspath, Fixtures.knownMRAppCP))
+ }
+ }
+
+ test("resultant classpath for an application that defines both classpaths, YARN and MR") {
+ withAppConf(Fixtures.mapAppConf) { conf =>
+ val env = newEnv
+ populateHadoopClasspath(conf, env)
+ classpath(env) should be(flatten(Fixtures.knownYARNAppCP, Fixtures.knownMRAppCP))
+ }
+ }
+
+ private val SPARK = "local:/sparkJar"
+ private val USER = "local:/userJar"
+ private val ADDED = "local:/addJar1,local:/addJar2,/addJar3"
+
+ private val PWD =
+ if (classOf[Environment].getMethods().exists(_.getName == "$$")) {
+ "{{PWD}}"
+ } else if (Utils.isWindows) {
+ "%PWD%"
+ } else {
+ Environment.PWD.$()
+ }
+
+ test("Local jar URIs") {
+ val conf = new Configuration()
+ val sparkConf = new SparkConf()
+ .set(SPARK_JARS, Seq(SPARK))
+ .set(USER_CLASS_PATH_FIRST, true)
+ .set("spark.yarn.dist.jars", ADDED)
+ val env = new MutableHashMap[String, String]()
+ val args = new ClientArguments(Array("--jar", USER))
+
+ populateClasspath(args, conf, sparkConf, env)
+
+ val cp = env("CLASSPATH").split(":|;|<CPS>")
+ s"$SPARK,$USER,$ADDED".split(",").foreach({ entry =>
+ val uri = new URI(entry)
+ if (LOCAL_SCHEME.equals(uri.getScheme())) {
+ cp should contain (uri.getPath())
+ } else {
+ cp should not contain (uri.getPath())
+ }
+ })
+ cp should contain(PWD)
+ cp should contain (s"$PWD${Path.SEPARATOR}${LOCALIZED_CONF_DIR}")
+ cp should not contain (APP_JAR)
+ }
+
+ test("Jar path propagation through SparkConf") {
+ val conf = new Configuration()
+ val sparkConf = new SparkConf()
+ .set(SPARK_JARS, Seq(SPARK))
+ .set("spark.yarn.dist.jars", ADDED)
+ val client = createClient(sparkConf, args = Array("--jar", USER))
+ doReturn(new Path("/")).when(client).copyFileToRemote(any(classOf[Path]),
+ any(classOf[Path]), anyShort(), anyBoolean(), any())
+
+ val tempDir = Utils.createTempDir()
+ try {
+ // Because we mocked "copyFileToRemote" above to avoid having to create fake local files,
+ // we need to create a fake config archive in the temp dir to avoid having
+ // prepareLocalResources throw an exception.
+ new FileOutputStream(new File(tempDir, LOCALIZED_CONF_ARCHIVE)).close()
+
+ client.prepareLocalResources(new Path(tempDir.getAbsolutePath()), Nil)
+ sparkConf.get(APP_JAR) should be (Some(USER))
+
+ // The non-local path should be propagated by name only, since it will end up in the app's
+ // staging dir.
+ val expected = ADDED.split(",")
+ .map(p => {
+ val uri = new URI(p)
+ if (LOCAL_SCHEME == uri.getScheme()) {
+ p
+ } else {
+ Option(uri.getFragment()).getOrElse(new File(p).getName())
+ }
+ })
+ .mkString(",")
+
+ sparkConf.get(SECONDARY_JARS) should be (Some(expected.split(",").toSeq))
+ } finally {
+ Utils.deleteRecursively(tempDir)
+ }
+ }
+
+ test("Cluster path translation") {
+ val conf = new Configuration()
+ val sparkConf = new SparkConf()
+ .set(SPARK_JARS, Seq("local:/localPath/spark.jar"))
+ .set(GATEWAY_ROOT_PATH, "/localPath")
+ .set(REPLACEMENT_ROOT_PATH, "/remotePath")
+
+ getClusterPath(sparkConf, "/localPath") should be ("/remotePath")
+ getClusterPath(sparkConf, "/localPath/1:/localPath/2") should be (
+ "/remotePath/1:/remotePath/2")
+
+ val env = new MutableHashMap[String, String]()
+ populateClasspath(null, conf, sparkConf, env, extraClassPath = Some("/localPath/my1.jar"))
+ val cp = classpath(env)
+ cp should contain ("/remotePath/spark.jar")
+ cp should contain ("/remotePath/my1.jar")
+ }
+
+ test("configuration and args propagate through createApplicationSubmissionContext") {
+ val conf = new Configuration()
+ // When parsing tags, duplicates and leading/trailing whitespace should be removed.
+ // Spaces between non-comma strings should be preserved as single tags. Empty strings may or
+ // may not be removed depending on the version of Hadoop being used.
+ val sparkConf = new SparkConf()
+ .set(APPLICATION_TAGS.key, ",tag1, dup,tag2 , ,multi word , dup")
+ .set(MAX_APP_ATTEMPTS, 42)
+ .set("spark.app.name", "foo-test-app")
+ .set(QUEUE_NAME, "staging-queue")
+ val args = new ClientArguments(Array())
+
+ val appContext = Records.newRecord(classOf[ApplicationSubmissionContext])
+ val getNewApplicationResponse = Records.newRecord(classOf[GetNewApplicationResponse])
+ val containerLaunchContext = Records.newRecord(classOf[ContainerLaunchContext])
+
+ val client = new Client(args, conf, sparkConf)
+ client.createApplicationSubmissionContext(
+ new YarnClientApplication(getNewApplicationResponse, appContext),
+ containerLaunchContext)
+
+ appContext.getApplicationName should be ("foo-test-app")
+ appContext.getQueue should be ("staging-queue")
+ appContext.getAMContainerSpec should be (containerLaunchContext)
+ appContext.getApplicationType should be ("SPARK")
+ appContext.getClass.getMethods.filter(_.getName.equals("getApplicationTags")).foreach{ method =>
+ val tags = method.invoke(appContext).asInstanceOf[java.util.Set[String]]
+ tags should contain allOf ("tag1", "dup", "tag2", "multi word")
+ tags.asScala.count(_.nonEmpty) should be (4)
+ }
+ appContext.getMaxAppAttempts should be (42)
+ }
+
+ test("spark.yarn.jars with multiple paths and globs") {
+ val libs = Utils.createTempDir()
+ val single = Utils.createTempDir()
+ val jar1 = TestUtils.createJarWithFiles(Map(), libs)
+ val jar2 = TestUtils.createJarWithFiles(Map(), libs)
+ val jar3 = TestUtils.createJarWithFiles(Map(), single)
+ val jar4 = TestUtils.createJarWithFiles(Map(), single)
+
+ val jarsConf = Seq(
+ s"${libs.getAbsolutePath()}/*",
+ jar3.getPath(),
+ s"local:${jar4.getPath()}",
+ s"local:${single.getAbsolutePath()}/*")
+
+ val sparkConf = new SparkConf().set(SPARK_JARS, jarsConf)
+ val client = createClient(sparkConf)
+
+ val tempDir = Utils.createTempDir()
+ client.prepareLocalResources(new Path(tempDir.getAbsolutePath()), Nil)
+
+ assert(sparkConf.get(SPARK_JARS) ===
+ Some(Seq(s"local:${jar4.getPath()}", s"local:${single.getAbsolutePath()}/*")))
+
+ verify(client).copyFileToRemote(any(classOf[Path]), meq(new Path(jar1.toURI())), anyShort(),
+ anyBoolean(), any())
+ verify(client).copyFileToRemote(any(classOf[Path]), meq(new Path(jar2.toURI())), anyShort(),
+ anyBoolean(), any())
+ verify(client).copyFileToRemote(any(classOf[Path]), meq(new Path(jar3.toURI())), anyShort(),
+ anyBoolean(), any())
+
+ val cp = classpath(client)
+ cp should contain (buildPath(PWD, LOCALIZED_LIB_DIR, "*"))
+ cp should not contain (jar3.getPath())
+ cp should contain (jar4.getPath())
+ cp should contain (buildPath(single.getAbsolutePath(), "*"))
+ }
+
+ test("distribute jars archive") {
+ val temp = Utils.createTempDir()
+ val archive = TestUtils.createJarWithFiles(Map(), temp)
+
+ val sparkConf = new SparkConf().set(SPARK_ARCHIVE, archive.getPath())
+ val client = createClient(sparkConf)
+ client.prepareLocalResources(new Path(temp.getAbsolutePath()), Nil)
+
+ verify(client).copyFileToRemote(any(classOf[Path]), meq(new Path(archive.toURI())), anyShort(),
+ anyBoolean(), any())
+ classpath(client) should contain (buildPath(PWD, LOCALIZED_LIB_DIR, "*"))
+
+ sparkConf.set(SPARK_ARCHIVE, LOCAL_SCHEME + ":" + archive.getPath())
+ intercept[IllegalArgumentException] {
+ client.prepareLocalResources(new Path(temp.getAbsolutePath()), Nil)
+ }
+ }
+
+ test("distribute archive multiple times") {
+ val libs = Utils.createTempDir()
+ // Create jars dir and RELEASE file to avoid IllegalStateException.
+ val jarsDir = new File(libs, "jars")
+ assert(jarsDir.mkdir())
+ new FileOutputStream(new File(libs, "RELEASE")).close()
+
+ val userLib1 = Utils.createTempDir()
+ val testJar = TestUtils.createJarWithFiles(Map(), userLib1)
+
+ // Case 1: FILES_TO_DISTRIBUTE and ARCHIVES_TO_DISTRIBUTE can't have duplicate files
+ val sparkConf = new SparkConfWithEnv(Map("SPARK_HOME" -> libs.getAbsolutePath))
+ .set(FILES_TO_DISTRIBUTE, Seq(testJar.getPath))
+ .set(ARCHIVES_TO_DISTRIBUTE, Seq(testJar.getPath))
+
+ val client = createClient(sparkConf)
+ val tempDir = Utils.createTempDir()
+ intercept[IllegalArgumentException] {
+ client.prepareLocalResources(new Path(tempDir.getAbsolutePath()), Nil)
+ }
+
+ // Case 2: FILES_TO_DISTRIBUTE can't have duplicate files.
+ val sparkConfFiles = new SparkConfWithEnv(Map("SPARK_HOME" -> libs.getAbsolutePath))
+ .set(FILES_TO_DISTRIBUTE, Seq(testJar.getPath, testJar.getPath))
+
+ val clientFiles = createClient(sparkConfFiles)
+ val tempDirForFiles = Utils.createTempDir()
+ intercept[IllegalArgumentException] {
+ clientFiles.prepareLocalResources(new Path(tempDirForFiles.getAbsolutePath()), Nil)
+ }
+
+ // Case 3: ARCHIVES_TO_DISTRIBUTE can't have duplicate files.
+ val sparkConfArchives = new SparkConfWithEnv(Map("SPARK_HOME" -> libs.getAbsolutePath))
+ .set(ARCHIVES_TO_DISTRIBUTE, Seq(testJar.getPath, testJar.getPath))
+
+ val clientArchives = createClient(sparkConfArchives)
+ val tempDirForArchives = Utils.createTempDir()
+ intercept[IllegalArgumentException] {
+ clientArchives.prepareLocalResources(new Path(tempDirForArchives.getAbsolutePath()), Nil)
+ }
+
+ // Case 4: FILES_TO_DISTRIBUTE can have unique file.
+ val sparkConfFilesUniq = new SparkConfWithEnv(Map("SPARK_HOME" -> libs.getAbsolutePath))
+ .set(FILES_TO_DISTRIBUTE, Seq(testJar.getPath))
+
+ val clientFilesUniq = createClient(sparkConfFilesUniq)
+ val tempDirForFilesUniq = Utils.createTempDir()
+ clientFilesUniq.prepareLocalResources(new Path(tempDirForFilesUniq.getAbsolutePath()), Nil)
+
+ // Case 5: ARCHIVES_TO_DISTRIBUTE can have unique file.
+ val sparkConfArchivesUniq = new SparkConfWithEnv(Map("SPARK_HOME" -> libs.getAbsolutePath))
+ .set(ARCHIVES_TO_DISTRIBUTE, Seq(testJar.getPath))
+
+ val clientArchivesUniq = createClient(sparkConfArchivesUniq)
+ val tempDirArchivesUniq = Utils.createTempDir()
+ clientArchivesUniq.prepareLocalResources(new Path(tempDirArchivesUniq.getAbsolutePath()), Nil)
+
+ }
+
+ test("distribute local spark jars") {
+ val temp = Utils.createTempDir()
+ val jarsDir = new File(temp, "jars")
+ assert(jarsDir.mkdir())
+ val jar = TestUtils.createJarWithFiles(Map(), jarsDir)
+ new FileOutputStream(new File(temp, "RELEASE")).close()
+
+ val sparkConf = new SparkConfWithEnv(Map("SPARK_HOME" -> temp.getAbsolutePath()))
+ val client = createClient(sparkConf)
+ client.prepareLocalResources(new Path(temp.getAbsolutePath()), Nil)
+ classpath(client) should contain (buildPath(PWD, LOCALIZED_LIB_DIR, "*"))
+ }
+
+ test("ignore same name jars") {
+ val libs = Utils.createTempDir()
+ val jarsDir = new File(libs, "jars")
+ assert(jarsDir.mkdir())
+ new FileOutputStream(new File(libs, "RELEASE")).close()
+ val userLib1 = Utils.createTempDir()
+ val userLib2 = Utils.createTempDir()
+
+ val jar1 = TestUtils.createJarWithFiles(Map(), jarsDir)
+ val jar2 = TestUtils.createJarWithFiles(Map(), userLib1)
+ // Copy jar2 to jar3 with same name
+ val jar3 = {
+ val target = new File(userLib2, new File(jar2.toURI).getName)
+ val input = new FileInputStream(jar2.getPath)
+ val output = new FileOutputStream(target)
+ Utils.copyStream(input, output, closeStreams = true)
+ target.toURI.toURL
+ }
+
+ val sparkConf = new SparkConfWithEnv(Map("SPARK_HOME" -> libs.getAbsolutePath))
+ .set(JARS_TO_DISTRIBUTE, Seq(jar2.getPath, jar3.getPath))
+
+ val client = createClient(sparkConf)
+ val tempDir = Utils.createTempDir()
+ client.prepareLocalResources(new Path(tempDir.getAbsolutePath()), Nil)
+
+ // Only jar2 will be added to SECONDARY_JARS, jar3 which has the same name with jar2 will be
+ // ignored.
+ sparkConf.get(SECONDARY_JARS) should be (Some(Seq(new File(jar2.toURI).getName)))
+ }
+
+ object Fixtures {
+
+ val knownDefYarnAppCP: Seq[String] =
+ getFieldValue[Array[String], Seq[String]](classOf[YarnConfiguration],
+ "DEFAULT_YARN_APPLICATION_CLASSPATH",
+ Seq[String]())(a => a.toSeq)
+
+
+ val knownDefMRAppCP: Seq[String] =
+ getFieldValue2[String, Array[String], Seq[String]](
+ classOf[MRJobConfig],
+ "DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH",
+ Seq[String]())(a => a.split(","))(a => a.toSeq)
+
+ val knownYARNAppCP = Some(Seq("/known/yarn/path"))
+
+ val knownMRAppCP = Some(Seq("/known/mr/path"))
+
+ val mapMRAppConf =
+ Map("mapreduce.application.classpath" -> knownMRAppCP.map(_.mkString(":")).get)
+
+ val mapYARNAppConf =
+ Map(YarnConfiguration.YARN_APPLICATION_CLASSPATH -> knownYARNAppCP.map(_.mkString(":")).get)
+
+ val mapAppConf = mapYARNAppConf ++ mapMRAppConf
+ }
+
+ def withAppConf(m: Map[String, String] = Map())(testCode: (Configuration) => Any) {
+ val conf = new Configuration
+ m.foreach { case (k, v) => conf.set(k, v, "ClientSpec") }
+ testCode(conf)
+ }
+
+ def newEnv: MutableHashMap[String, String] = MutableHashMap[String, String]()
+
+ def classpath(env: MutableHashMap[String, String]): Array[String] =
+ env(Environment.CLASSPATH.name).split(":|;|<CPS>")
+
+ def flatten(a: Option[Seq[String]], b: Option[Seq[String]]): Array[String] =
+ (a ++ b).flatten.toArray
+
+ def getFieldValue[A, B](clazz: Class[_], field: String, defaults: => B)(mapTo: A => B): B = {
+ Try(clazz.getField(field))
+ .map(_.get(null).asInstanceOf[A])
+ .toOption
+ .map(mapTo)
+ .getOrElse(defaults)
+ }
+
+ def getFieldValue2[A: ClassTag, A1: ClassTag, B](
+ clazz: Class[_],
+ field: String,
+ defaults: => B)(mapTo: A => B)(mapTo1: A1 => B): B = {
+ Try(clazz.getField(field)).map(_.get(null)).map {
+ case v: A => mapTo(v)
+ case v1: A1 => mapTo1(v1)
+ case _ => defaults
+ }.toOption.getOrElse(defaults)
+ }
+
+ private def createClient(
+ sparkConf: SparkConf,
+ conf: Configuration = new Configuration(),
+ args: Array[String] = Array()): Client = {
+ val clientArgs = new ClientArguments(args)
+ spy(new Client(clientArgs, conf, sparkConf))
+ }
+
+ private def classpath(client: Client): Array[String] = {
+ val env = new MutableHashMap[String, String]()
+ populateClasspath(null, client.hadoopConf, client.sparkConf, env)
+ classpath(env)
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ContainerPlacementStrategySuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ContainerPlacementStrategySuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ContainerPlacementStrategySuite.scala
new file mode 100644
index 0000000..afb4b69
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ContainerPlacementStrategySuite.scala
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
+import org.scalatest.{BeforeAndAfterEach, Matchers}
+
+import org.apache.spark.SparkFunSuite
+
+class ContainerPlacementStrategySuite extends SparkFunSuite with Matchers with BeforeAndAfterEach {
+
+ private val yarnAllocatorSuite = new YarnAllocatorSuite
+ import yarnAllocatorSuite._
+
+ def createContainerRequest(nodes: Array[String]): ContainerRequest =
+ new ContainerRequest(containerResource, nodes, null, YarnSparkHadoopUtil.RM_REQUEST_PRIORITY)
+
+ override def beforeEach() {
+ yarnAllocatorSuite.beforeEach()
+ }
+
+ override def afterEach() {
+ yarnAllocatorSuite.afterEach()
+ }
+
+ test("allocate locality preferred containers with enough resource and no matched existed " +
+ "containers") {
+ // 1. All the locations of current containers cannot satisfy the new requirements
+ // 2. Current requested container number can fully satisfy the pending tasks.
+
+ val handler = createAllocator(2)
+ handler.updateResourceRequests()
+ handler.handleAllocatedContainers(Array(createContainer("host1"), createContainer("host2")))
+
+ val localities = handler.containerPlacementStrategy.localityOfRequestedContainers(
+ 3, 15, Map("host3" -> 15, "host4" -> 15, "host5" -> 10),
+ handler.allocatedHostToContainersMap, Seq.empty)
+
+ assert(localities.map(_.nodes) === Array(
+ Array("host3", "host4", "host5"),
+ Array("host3", "host4", "host5"),
+ Array("host3", "host4")))
+ }
+
+ test("allocate locality preferred containers with enough resource and partially matched " +
+ "containers") {
+ // 1. Parts of current containers' locations can satisfy the new requirements
+ // 2. Current requested container number can fully satisfy the pending tasks.
+
+ val handler = createAllocator(3)
+ handler.updateResourceRequests()
+ handler.handleAllocatedContainers(Array(
+ createContainer("host1"),
+ createContainer("host1"),
+ createContainer("host2")
+ ))
+
+ val localities = handler.containerPlacementStrategy.localityOfRequestedContainers(
+ 3, 15, Map("host1" -> 15, "host2" -> 15, "host3" -> 10),
+ handler.allocatedHostToContainersMap, Seq.empty)
+
+ assert(localities.map(_.nodes) ===
+ Array(null, Array("host2", "host3"), Array("host2", "host3")))
+ }
+
+ test("allocate locality preferred containers with limited resource and partially matched " +
+ "containers") {
+ // 1. Parts of current containers' locations can satisfy the new requirements
+ // 2. Current requested container number cannot fully satisfy the pending tasks.
+
+ val handler = createAllocator(3)
+ handler.updateResourceRequests()
+ handler.handleAllocatedContainers(Array(
+ createContainer("host1"),
+ createContainer("host1"),
+ createContainer("host2")
+ ))
+
+ val localities = handler.containerPlacementStrategy.localityOfRequestedContainers(
+ 1, 15, Map("host1" -> 15, "host2" -> 15, "host3" -> 10),
+ handler.allocatedHostToContainersMap, Seq.empty)
+
+ assert(localities.map(_.nodes) === Array(Array("host2", "host3")))
+ }
+
+ test("allocate locality preferred containers with fully matched containers") {
+ // Current containers' locations can fully satisfy the new requirements
+
+ val handler = createAllocator(5)
+ handler.updateResourceRequests()
+ handler.handleAllocatedContainers(Array(
+ createContainer("host1"),
+ createContainer("host1"),
+ createContainer("host2"),
+ createContainer("host2"),
+ createContainer("host3")
+ ))
+
+ val localities = handler.containerPlacementStrategy.localityOfRequestedContainers(
+ 3, 15, Map("host1" -> 15, "host2" -> 15, "host3" -> 10),
+ handler.allocatedHostToContainersMap, Seq.empty)
+
+ assert(localities.map(_.nodes) === Array(null, null, null))
+ }
+
+ test("allocate containers with no locality preference") {
+ // Request new container without locality preference
+
+ val handler = createAllocator(2)
+ handler.updateResourceRequests()
+ handler.handleAllocatedContainers(Array(createContainer("host1"), createContainer("host2")))
+
+ val localities = handler.containerPlacementStrategy.localityOfRequestedContainers(
+ 1, 0, Map.empty, handler.allocatedHostToContainersMap, Seq.empty)
+
+ assert(localities.map(_.nodes) === Array(null))
+ }
+
+ test("allocate locality preferred containers by considering the localities of pending requests") {
+ val handler = createAllocator(3)
+ handler.updateResourceRequests()
+ handler.handleAllocatedContainers(Array(
+ createContainer("host1"),
+ createContainer("host1"),
+ createContainer("host2")
+ ))
+
+ val pendingAllocationRequests = Seq(
+ createContainerRequest(Array("host2", "host3")),
+ createContainerRequest(Array("host1", "host4")))
+
+ val localities = handler.containerPlacementStrategy.localityOfRequestedContainers(
+ 1, 15, Map("host1" -> 15, "host2" -> 15, "host3" -> 10),
+ handler.allocatedHostToContainersMap, pendingAllocationRequests)
+
+ assert(localities.map(_.nodes) === Array(Array("host3")))
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
new file mode 100644
index 0000000..994dc75
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
@@ -0,0 +1,344 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import java.util.{Arrays, List => JList}
+
+import org.apache.hadoop.fs.CommonConfigurationKeysPublic
+import org.apache.hadoop.net.DNSToSwitchMapping
+import org.apache.hadoop.yarn.api.records._
+import org.apache.hadoop.yarn.client.api.AMRMClient
+import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
+import org.apache.hadoop.yarn.conf.YarnConfiguration
+import org.mockito.Mockito._
+import org.scalatest.{BeforeAndAfterEach, Matchers}
+
+import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite}
+import org.apache.spark.deploy.yarn.config._
+import org.apache.spark.deploy.yarn.YarnAllocator._
+import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._
+import org.apache.spark.rpc.RpcEndpointRef
+import org.apache.spark.scheduler.SplitInfo
+import org.apache.spark.util.ManualClock
+
+class MockResolver extends DNSToSwitchMapping {
+
+ override def resolve(names: JList[String]): JList[String] = {
+ if (names.size > 0 && names.get(0) == "host3") Arrays.asList("/rack2")
+ else Arrays.asList("/rack1")
+ }
+
+ override def reloadCachedMappings() {}
+
+ def reloadCachedMappings(names: JList[String]) {}
+}
+
+class YarnAllocatorSuite extends SparkFunSuite with Matchers with BeforeAndAfterEach {
+ val conf = new YarnConfiguration()
+ conf.setClass(
+ CommonConfigurationKeysPublic.NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY,
+ classOf[MockResolver], classOf[DNSToSwitchMapping])
+
+ val sparkConf = new SparkConf()
+ sparkConf.set("spark.driver.host", "localhost")
+ sparkConf.set("spark.driver.port", "4040")
+ sparkConf.set(SPARK_JARS, Seq("notarealjar.jar"))
+ sparkConf.set("spark.yarn.launchContainers", "false")
+
+ val appAttemptId = ApplicationAttemptId.newInstance(ApplicationId.newInstance(0, 0), 0)
+
+ // Resource returned by YARN. YARN can give larger containers than requested, so give 6 cores
+ // instead of the 5 requested and 3 GB instead of the 2 requested.
+ val containerResource = Resource.newInstance(3072, 6)
+
+ var rmClient: AMRMClient[ContainerRequest] = _
+
+ var containerNum = 0
+
+ override def beforeEach() {
+ super.beforeEach()
+ rmClient = AMRMClient.createAMRMClient()
+ rmClient.init(conf)
+ rmClient.start()
+ }
+
+ override def afterEach() {
+ try {
+ rmClient.stop()
+ } finally {
+ super.afterEach()
+ }
+ }
+
+ class MockSplitInfo(host: String) extends SplitInfo(null, host, null, 1, null) {
+ override def hashCode(): Int = 0
+ override def equals(other: Any): Boolean = false
+ }
+
+ def createAllocator(maxExecutors: Int = 5): YarnAllocator = {
+ val args = Array(
+ "--jar", "somejar.jar",
+ "--class", "SomeClass")
+ val sparkConfClone = sparkConf.clone()
+ sparkConfClone
+ .set("spark.executor.instances", maxExecutors.toString)
+ .set("spark.executor.cores", "5")
+ .set("spark.executor.memory", "2048")
+ new YarnAllocator(
+ "not used",
+ mock(classOf[RpcEndpointRef]),
+ conf,
+ sparkConfClone,
+ rmClient,
+ appAttemptId,
+ new SecurityManager(sparkConf),
+ Map())
+ }
+
+ def createContainer(host: String): Container = {
+ // When YARN 2.6+ is required, avoid deprecation by using version with long second arg
+ val containerId = ContainerId.newInstance(appAttemptId, containerNum)
+ containerNum += 1
+ val nodeId = NodeId.newInstance(host, 1000)
+ Container.newInstance(containerId, nodeId, "", containerResource, RM_REQUEST_PRIORITY, null)
+ }
+
+ test("single container allocated") {
+ // request a single container and receive it
+ val handler = createAllocator(1)
+ handler.updateResourceRequests()
+ handler.getNumExecutorsRunning should be (0)
+ handler.getPendingAllocate.size should be (1)
+
+ val container = createContainer("host1")
+ handler.handleAllocatedContainers(Array(container))
+
+ handler.getNumExecutorsRunning should be (1)
+ handler.allocatedContainerToHostMap.get(container.getId).get should be ("host1")
+ handler.allocatedHostToContainersMap.get("host1").get should contain (container.getId)
+
+ val size = rmClient.getMatchingRequests(container.getPriority, "host1", containerResource).size
+ size should be (0)
+ }
+
+ test("container should not be created if requested number if met") {
+ // request a single container and receive it
+ val handler = createAllocator(1)
+ handler.updateResourceRequests()
+ handler.getNumExecutorsRunning should be (0)
+ handler.getPendingAllocate.size should be (1)
+
+ val container = createContainer("host1")
+ handler.handleAllocatedContainers(Array(container))
+
+ handler.getNumExecutorsRunning should be (1)
+ handler.allocatedContainerToHostMap.get(container.getId).get should be ("host1")
+ handler.allocatedHostToContainersMap.get("host1").get should contain (container.getId)
+
+ val container2 = createContainer("host2")
+ handler.handleAllocatedContainers(Array(container2))
+ handler.getNumExecutorsRunning should be (1)
+ }
+
+ test("some containers allocated") {
+ // request a few containers and receive some of them
+ val handler = createAllocator(4)
+ handler.updateResourceRequests()
+ handler.getNumExecutorsRunning should be (0)
+ handler.getPendingAllocate.size should be (4)
+
+ val container1 = createContainer("host1")
+ val container2 = createContainer("host1")
+ val container3 = createContainer("host2")
+ handler.handleAllocatedContainers(Array(container1, container2, container3))
+
+ handler.getNumExecutorsRunning should be (3)
+ handler.allocatedContainerToHostMap.get(container1.getId).get should be ("host1")
+ handler.allocatedContainerToHostMap.get(container2.getId).get should be ("host1")
+ handler.allocatedContainerToHostMap.get(container3.getId).get should be ("host2")
+ handler.allocatedHostToContainersMap.get("host1").get should contain (container1.getId)
+ handler.allocatedHostToContainersMap.get("host1").get should contain (container2.getId)
+ handler.allocatedHostToContainersMap.get("host2").get should contain (container3.getId)
+ }
+
+ test("receive more containers than requested") {
+ val handler = createAllocator(2)
+ handler.updateResourceRequests()
+ handler.getNumExecutorsRunning should be (0)
+ handler.getPendingAllocate.size should be (2)
+
+ val container1 = createContainer("host1")
+ val container2 = createContainer("host2")
+ val container3 = createContainer("host4")
+ handler.handleAllocatedContainers(Array(container1, container2, container3))
+
+ handler.getNumExecutorsRunning should be (2)
+ handler.allocatedContainerToHostMap.get(container1.getId).get should be ("host1")
+ handler.allocatedContainerToHostMap.get(container2.getId).get should be ("host2")
+ handler.allocatedContainerToHostMap.contains(container3.getId) should be (false)
+ handler.allocatedHostToContainersMap.get("host1").get should contain (container1.getId)
+ handler.allocatedHostToContainersMap.get("host2").get should contain (container2.getId)
+ handler.allocatedHostToContainersMap.contains("host4") should be (false)
+ }
+
+ test("decrease total requested executors") {
+ val handler = createAllocator(4)
+ handler.updateResourceRequests()
+ handler.getNumExecutorsRunning should be (0)
+ handler.getPendingAllocate.size should be (4)
+
+ handler.requestTotalExecutorsWithPreferredLocalities(3, 0, Map.empty)
+ handler.updateResourceRequests()
+ handler.getPendingAllocate.size should be (3)
+
+ val container = createContainer("host1")
+ handler.handleAllocatedContainers(Array(container))
+
+ handler.getNumExecutorsRunning should be (1)
+ handler.allocatedContainerToHostMap.get(container.getId).get should be ("host1")
+ handler.allocatedHostToContainersMap.get("host1").get should contain (container.getId)
+
+ handler.requestTotalExecutorsWithPreferredLocalities(2, 0, Map.empty)
+ handler.updateResourceRequests()
+ handler.getPendingAllocate.size should be (1)
+ }
+
+ test("decrease total requested executors to less than currently running") {
+ val handler = createAllocator(4)
+ handler.updateResourceRequests()
+ handler.getNumExecutorsRunning should be (0)
+ handler.getPendingAllocate.size should be (4)
+
+ handler.requestTotalExecutorsWithPreferredLocalities(3, 0, Map.empty)
+ handler.updateResourceRequests()
+ handler.getPendingAllocate.size should be (3)
+
+ val container1 = createContainer("host1")
+ val container2 = createContainer("host2")
+ handler.handleAllocatedContainers(Array(container1, container2))
+
+ handler.getNumExecutorsRunning should be (2)
+
+ handler.requestTotalExecutorsWithPreferredLocalities(1, 0, Map.empty)
+ handler.updateResourceRequests()
+ handler.getPendingAllocate.size should be (0)
+ handler.getNumExecutorsRunning should be (2)
+ }
+
+ test("kill executors") {
+ val handler = createAllocator(4)
+ handler.updateResourceRequests()
+ handler.getNumExecutorsRunning should be (0)
+ handler.getPendingAllocate.size should be (4)
+
+ val container1 = createContainer("host1")
+ val container2 = createContainer("host2")
+ handler.handleAllocatedContainers(Array(container1, container2))
+
+ handler.requestTotalExecutorsWithPreferredLocalities(1, 0, Map.empty)
+ handler.executorIdToContainer.keys.foreach { id => handler.killExecutor(id ) }
+
+ val statuses = Seq(container1, container2).map { c =>
+ ContainerStatus.newInstance(c.getId(), ContainerState.COMPLETE, "Finished", 0)
+ }
+ handler.updateResourceRequests()
+ handler.processCompletedContainers(statuses.toSeq)
+ handler.getNumExecutorsRunning should be (0)
+ handler.getPendingAllocate.size should be (1)
+ }
+
+ test("lost executor removed from backend") {
+ val handler = createAllocator(4)
+ handler.updateResourceRequests()
+ handler.getNumExecutorsRunning should be (0)
+ handler.getPendingAllocate.size should be (4)
+
+ val container1 = createContainer("host1")
+ val container2 = createContainer("host2")
+ handler.handleAllocatedContainers(Array(container1, container2))
+
+ handler.requestTotalExecutorsWithPreferredLocalities(2, 0, Map())
+
+ val statuses = Seq(container1, container2).map { c =>
+ ContainerStatus.newInstance(c.getId(), ContainerState.COMPLETE, "Failed", -1)
+ }
+ handler.updateResourceRequests()
+ handler.processCompletedContainers(statuses.toSeq)
+ handler.updateResourceRequests()
+ handler.getNumExecutorsRunning should be (0)
+ handler.getPendingAllocate.size should be (2)
+ handler.getNumExecutorsFailed should be (2)
+ handler.getNumUnexpectedContainerRelease should be (2)
+ }
+
+ test("memory exceeded diagnostic regexes") {
+ val diagnostics =
+ "Container [pid=12465,containerID=container_1412887393566_0003_01_000002] is running " +
+ "beyond physical memory limits. Current usage: 2.1 MB of 2 GB physical memory used; " +
+ "5.8 GB of 4.2 GB virtual memory used. Killing container."
+ val vmemMsg = memLimitExceededLogMessage(diagnostics, VMEM_EXCEEDED_PATTERN)
+ val pmemMsg = memLimitExceededLogMessage(diagnostics, PMEM_EXCEEDED_PATTERN)
+ assert(vmemMsg.contains("5.8 GB of 4.2 GB virtual memory used."))
+ assert(pmemMsg.contains("2.1 MB of 2 GB physical memory used."))
+ }
+
+ test("window based failure executor counting") {
+ sparkConf.set("spark.yarn.executor.failuresValidityInterval", "100s")
+ val handler = createAllocator(4)
+ val clock = new ManualClock(0L)
+ handler.setClock(clock)
+
+ handler.updateResourceRequests()
+ handler.getNumExecutorsRunning should be (0)
+ handler.getPendingAllocate.size should be (4)
+
+ val containers = Seq(
+ createContainer("host1"),
+ createContainer("host2"),
+ createContainer("host3"),
+ createContainer("host4")
+ )
+ handler.handleAllocatedContainers(containers)
+
+ val failedStatuses = containers.map { c =>
+ ContainerStatus.newInstance(c.getId, ContainerState.COMPLETE, "Failed", -1)
+ }
+
+ handler.getNumExecutorsFailed should be (0)
+
+ clock.advance(100 * 1000L)
+ handler.processCompletedContainers(failedStatuses.slice(0, 1))
+ handler.getNumExecutorsFailed should be (1)
+
+ clock.advance(101 * 1000L)
+ handler.getNumExecutorsFailed should be (0)
+
+ handler.processCompletedContainers(failedStatuses.slice(1, 3))
+ handler.getNumExecutorsFailed should be (2)
+
+ clock.advance(50 * 1000L)
+ handler.processCompletedContainers(failedStatuses.slice(3, 4))
+ handler.getNumExecutorsFailed should be (3)
+
+ clock.advance(51 * 1000L)
+ handler.getNumExecutorsFailed should be (1)
+
+ clock.advance(50 * 1000L)
+ handler.getNumExecutorsFailed should be (0)
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
new file mode 100644
index 0000000..99fb58a
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
@@ -0,0 +1,493 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import java.io.File
+import java.net.URL
+import java.nio.charset.StandardCharsets
+import java.util.{HashMap => JHashMap}
+
+import scala.collection.mutable
+import scala.concurrent.duration._
+import scala.language.postfixOps
+
+import com.google.common.io.{ByteStreams, Files}
+import org.apache.hadoop.yarn.conf.YarnConfiguration
+import org.scalatest.Matchers
+import org.scalatest.concurrent.Eventually._
+
+import org.apache.spark._
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.deploy.yarn.config._
+import org.apache.spark.internal.Logging
+import org.apache.spark.launcher._
+import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationStart,
+ SparkListenerExecutorAdded}
+import org.apache.spark.scheduler.cluster.ExecutorInfo
+import org.apache.spark.tags.ExtendedYarnTest
+import org.apache.spark.util.Utils
+
+/**
+ * Integration tests for YARN; these tests use a mini Yarn cluster to run Spark-on-YARN
+ * applications, and require the Spark assembly to be built before they can be successfully
+ * run.
+ */
+@ExtendedYarnTest
+class YarnClusterSuite extends BaseYarnClusterSuite {
+
+ override def newYarnConfig(): YarnConfiguration = new YarnConfiguration()
+
+ private val TEST_PYFILE = """
+ |import mod1, mod2
+ |import sys
+ |from operator import add
+ |
+ |from pyspark import SparkConf , SparkContext
+ |if __name__ == "__main__":
+ | if len(sys.argv) != 2:
+ | print >> sys.stderr, "Usage: test.py [result file]"
+ | exit(-1)
+ | sc = SparkContext(conf=SparkConf())
+ | status = open(sys.argv[1],'w')
+ | result = "failure"
+ | rdd = sc.parallelize(range(10)).map(lambda x: x * mod1.func() * mod2.func())
+ | cnt = rdd.count()
+ | if cnt == 10:
+ | result = "success"
+ | status.write(result)
+ | status.close()
+ | sc.stop()
+ """.stripMargin
+
+ private val TEST_PYMODULE = """
+ |def func():
+ | return 42
+ """.stripMargin
+
+ test("run Spark in yarn-client mode") {
+ testBasicYarnApp(true)
+ }
+
+ test("run Spark in yarn-cluster mode") {
+ testBasicYarnApp(false)
+ }
+
+ test("run Spark in yarn-client mode with different configurations") {
+ testBasicYarnApp(true,
+ Map(
+ "spark.driver.memory" -> "512m",
+ "spark.executor.cores" -> "1",
+ "spark.executor.memory" -> "512m",
+ "spark.executor.instances" -> "2"
+ ))
+ }
+
+ test("run Spark in yarn-cluster mode with different configurations") {
+ testBasicYarnApp(false,
+ Map(
+ "spark.driver.memory" -> "512m",
+ "spark.driver.cores" -> "1",
+ "spark.executor.cores" -> "1",
+ "spark.executor.memory" -> "512m",
+ "spark.executor.instances" -> "2"
+ ))
+ }
+
+ test("run Spark in yarn-cluster mode with using SparkHadoopUtil.conf") {
+ testYarnAppUseSparkHadoopUtilConf()
+ }
+
+ test("run Spark in yarn-client mode with additional jar") {
+ testWithAddJar(true)
+ }
+
+ test("run Spark in yarn-cluster mode with additional jar") {
+ testWithAddJar(false)
+ }
+
+ test("run Spark in yarn-cluster mode unsuccessfully") {
+ // Don't provide arguments so the driver will fail.
+ val finalState = runSpark(false, mainClassName(YarnClusterDriver.getClass))
+ finalState should be (SparkAppHandle.State.FAILED)
+ }
+
+ test("run Spark in yarn-cluster mode failure after sc initialized") {
+ val finalState = runSpark(false, mainClassName(YarnClusterDriverWithFailure.getClass))
+ finalState should be (SparkAppHandle.State.FAILED)
+ }
+
+ test("run Python application in yarn-client mode") {
+ testPySpark(true)
+ }
+
+ test("run Python application in yarn-cluster mode") {
+ testPySpark(false)
+ }
+
+ test("run Python application in yarn-cluster mode using " +
+ " spark.yarn.appMasterEnv to override local envvar") {
+ testPySpark(
+ clientMode = false,
+ extraConf = Map(
+ "spark.yarn.appMasterEnv.PYSPARK_DRIVER_PYTHON"
+ -> sys.env.getOrElse("PYSPARK_DRIVER_PYTHON", "python"),
+ "spark.yarn.appMasterEnv.PYSPARK_PYTHON"
+ -> sys.env.getOrElse("PYSPARK_PYTHON", "python")),
+ extraEnv = Map(
+ "PYSPARK_DRIVER_PYTHON" -> "not python",
+ "PYSPARK_PYTHON" -> "not python"))
+ }
+
+ test("user class path first in client mode") {
+ testUseClassPathFirst(true)
+ }
+
+ test("user class path first in cluster mode") {
+ testUseClassPathFirst(false)
+ }
+
+ test("monitor app using launcher library") {
+ val env = new JHashMap[String, String]()
+ env.put("YARN_CONF_DIR", hadoopConfDir.getAbsolutePath())
+
+ val propsFile = createConfFile()
+ val handle = new SparkLauncher(env)
+ .setSparkHome(sys.props("spark.test.home"))
+ .setConf("spark.ui.enabled", "false")
+ .setPropertiesFile(propsFile)
+ .setMaster("yarn")
+ .setDeployMode("client")
+ .setAppResource(SparkLauncher.NO_RESOURCE)
+ .setMainClass(mainClassName(YarnLauncherTestApp.getClass))
+ .startApplication()
+
+ try {
+ eventually(timeout(30 seconds), interval(100 millis)) {
+ handle.getState() should be (SparkAppHandle.State.RUNNING)
+ }
+
+ handle.getAppId() should not be (null)
+ handle.getAppId() should startWith ("application_")
+ handle.stop()
+
+ eventually(timeout(30 seconds), interval(100 millis)) {
+ handle.getState() should be (SparkAppHandle.State.KILLED)
+ }
+ } finally {
+ handle.kill()
+ }
+ }
+
+ test("timeout to get SparkContext in cluster mode triggers failure") {
+ val timeout = 2000
+ val finalState = runSpark(false, mainClassName(SparkContextTimeoutApp.getClass),
+ appArgs = Seq((timeout * 4).toString),
+ extraConf = Map(AM_MAX_WAIT_TIME.key -> timeout.toString))
+ finalState should be (SparkAppHandle.State.FAILED)
+ }
+
+ private def testBasicYarnApp(clientMode: Boolean, conf: Map[String, String] = Map()): Unit = {
+ val result = File.createTempFile("result", null, tempDir)
+ val finalState = runSpark(clientMode, mainClassName(YarnClusterDriver.getClass),
+ appArgs = Seq(result.getAbsolutePath()),
+ extraConf = conf)
+ checkResult(finalState, result)
+ }
+
+ private def testYarnAppUseSparkHadoopUtilConf(): Unit = {
+ val result = File.createTempFile("result", null, tempDir)
+ val finalState = runSpark(false,
+ mainClassName(YarnClusterDriverUseSparkHadoopUtilConf.getClass),
+ appArgs = Seq("key=value", result.getAbsolutePath()),
+ extraConf = Map("spark.hadoop.key" -> "value"))
+ checkResult(finalState, result)
+ }
+
+ private def testWithAddJar(clientMode: Boolean): Unit = {
+ val originalJar = TestUtils.createJarWithFiles(Map("test.resource" -> "ORIGINAL"), tempDir)
+ val driverResult = File.createTempFile("driver", null, tempDir)
+ val executorResult = File.createTempFile("executor", null, tempDir)
+ val finalState = runSpark(clientMode, mainClassName(YarnClasspathTest.getClass),
+ appArgs = Seq(driverResult.getAbsolutePath(), executorResult.getAbsolutePath()),
+ extraClassPath = Seq(originalJar.getPath()),
+ extraJars = Seq("local:" + originalJar.getPath()))
+ checkResult(finalState, driverResult, "ORIGINAL")
+ checkResult(finalState, executorResult, "ORIGINAL")
+ }
+
+ private def testPySpark(
+ clientMode: Boolean,
+ extraConf: Map[String, String] = Map(),
+ extraEnv: Map[String, String] = Map()): Unit = {
+ val primaryPyFile = new File(tempDir, "test.py")
+ Files.write(TEST_PYFILE, primaryPyFile, StandardCharsets.UTF_8)
+
+ // When running tests, let's not assume the user has built the assembly module, which also
+ // creates the pyspark archive. Instead, let's use PYSPARK_ARCHIVES_PATH to point at the
+ // needed locations.
+ val sparkHome = sys.props("spark.test.home")
+ val pythonPath = Seq(
+ s"$sparkHome/python/lib/py4j-0.10.4-src.zip",
+ s"$sparkHome/python")
+ val extraEnvVars = Map(
+ "PYSPARK_ARCHIVES_PATH" -> pythonPath.map("local:" + _).mkString(File.pathSeparator),
+ "PYTHONPATH" -> pythonPath.mkString(File.pathSeparator)) ++ extraEnv
+
+ val moduleDir =
+ if (clientMode) {
+ // In client-mode, .py files added with --py-files are not visible in the driver.
+ // This is something that the launcher library would have to handle.
+ tempDir
+ } else {
+ val subdir = new File(tempDir, "pyModules")
+ subdir.mkdir()
+ subdir
+ }
+ val pyModule = new File(moduleDir, "mod1.py")
+ Files.write(TEST_PYMODULE, pyModule, StandardCharsets.UTF_8)
+
+ val mod2Archive = TestUtils.createJarWithFiles(Map("mod2.py" -> TEST_PYMODULE), moduleDir)
+ val pyFiles = Seq(pyModule.getAbsolutePath(), mod2Archive.getPath()).mkString(",")
+ val result = File.createTempFile("result", null, tempDir)
+
+ val finalState = runSpark(clientMode, primaryPyFile.getAbsolutePath(),
+ sparkArgs = Seq("--py-files" -> pyFiles),
+ appArgs = Seq(result.getAbsolutePath()),
+ extraEnv = extraEnvVars,
+ extraConf = extraConf)
+ checkResult(finalState, result)
+ }
+
+ private def testUseClassPathFirst(clientMode: Boolean): Unit = {
+ // Create a jar file that contains a different version of "test.resource".
+ val originalJar = TestUtils.createJarWithFiles(Map("test.resource" -> "ORIGINAL"), tempDir)
+ val userJar = TestUtils.createJarWithFiles(Map("test.resource" -> "OVERRIDDEN"), tempDir)
+ val driverResult = File.createTempFile("driver", null, tempDir)
+ val executorResult = File.createTempFile("executor", null, tempDir)
+ val finalState = runSpark(clientMode, mainClassName(YarnClasspathTest.getClass),
+ appArgs = Seq(driverResult.getAbsolutePath(), executorResult.getAbsolutePath()),
+ extraClassPath = Seq(originalJar.getPath()),
+ extraJars = Seq("local:" + userJar.getPath()),
+ extraConf = Map(
+ "spark.driver.userClassPathFirst" -> "true",
+ "spark.executor.userClassPathFirst" -> "true"))
+ checkResult(finalState, driverResult, "OVERRIDDEN")
+ checkResult(finalState, executorResult, "OVERRIDDEN")
+ }
+
+}
+
+private[spark] class SaveExecutorInfo extends SparkListener {
+ val addedExecutorInfos = mutable.Map[String, ExecutorInfo]()
+ var driverLogs: Option[collection.Map[String, String]] = None
+
+ override def onExecutorAdded(executor: SparkListenerExecutorAdded) {
+ addedExecutorInfos(executor.executorId) = executor.executorInfo
+ }
+
+ override def onApplicationStart(appStart: SparkListenerApplicationStart): Unit = {
+ driverLogs = appStart.driverLogs
+ }
+}
+
+private object YarnClusterDriverWithFailure extends Logging with Matchers {
+ def main(args: Array[String]): Unit = {
+ val sc = new SparkContext(new SparkConf()
+ .set("spark.extraListeners", classOf[SaveExecutorInfo].getName)
+ .setAppName("yarn test with failure"))
+
+ throw new Exception("exception after sc initialized")
+ }
+}
+
+private object YarnClusterDriverUseSparkHadoopUtilConf extends Logging with Matchers {
+ def main(args: Array[String]): Unit = {
+ if (args.length != 2) {
+ // scalastyle:off println
+ System.err.println(
+ s"""
+ |Invalid command line: ${args.mkString(" ")}
+ |
+ |Usage: YarnClusterDriverUseSparkHadoopUtilConf [hadoopConfKey=value] [result file]
+ """.stripMargin)
+ // scalastyle:on println
+ System.exit(1)
+ }
+
+ val sc = new SparkContext(new SparkConf()
+ .set("spark.extraListeners", classOf[SaveExecutorInfo].getName)
+ .setAppName("yarn test using SparkHadoopUtil's conf"))
+
+ val kv = args(0).split("=")
+ val status = new File(args(1))
+ var result = "failure"
+ try {
+ SparkHadoopUtil.get.conf.get(kv(0)) should be (kv(1))
+ result = "success"
+ } finally {
+ Files.write(result, status, StandardCharsets.UTF_8)
+ sc.stop()
+ }
+ }
+}
+
+private object YarnClusterDriver extends Logging with Matchers {
+
+ val WAIT_TIMEOUT_MILLIS = 10000
+
+ def main(args: Array[String]): Unit = {
+ if (args.length != 1) {
+ // scalastyle:off println
+ System.err.println(
+ s"""
+ |Invalid command line: ${args.mkString(" ")}
+ |
+ |Usage: YarnClusterDriver [result file]
+ """.stripMargin)
+ // scalastyle:on println
+ System.exit(1)
+ }
+
+ val sc = new SparkContext(new SparkConf()
+ .set("spark.extraListeners", classOf[SaveExecutorInfo].getName)
+ .setAppName("yarn \"test app\" 'with quotes' and \\back\\slashes and $dollarSigns"))
+ val conf = sc.getConf
+ val status = new File(args(0))
+ var result = "failure"
+ try {
+ val data = sc.parallelize(1 to 4, 4).collect().toSet
+ sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
+ data should be (Set(1, 2, 3, 4))
+ result = "success"
+
+ // Verify that the config archive is correctly placed in the classpath of all containers.
+ val confFile = "/" + Client.SPARK_CONF_FILE
+ assert(getClass().getResource(confFile) != null)
+ val configFromExecutors = sc.parallelize(1 to 4, 4)
+ .map { _ => Option(getClass().getResource(confFile)).map(_.toString).orNull }
+ .collect()
+ assert(configFromExecutors.find(_ == null) === None)
+ } finally {
+ Files.write(result, status, StandardCharsets.UTF_8)
+ sc.stop()
+ }
+
+ // verify log urls are present
+ val listeners = sc.listenerBus.findListenersByClass[SaveExecutorInfo]
+ assert(listeners.size === 1)
+ val listener = listeners(0)
+ val executorInfos = listener.addedExecutorInfos.values
+ assert(executorInfos.nonEmpty)
+ executorInfos.foreach { info =>
+ assert(info.logUrlMap.nonEmpty)
+ }
+
+ // If we are running in yarn-cluster mode, verify that driver logs links and present and are
+ // in the expected format.
+ if (conf.get("spark.submit.deployMode") == "cluster") {
+ assert(listener.driverLogs.nonEmpty)
+ val driverLogs = listener.driverLogs.get
+ assert(driverLogs.size === 2)
+ assert(driverLogs.contains("stderr"))
+ assert(driverLogs.contains("stdout"))
+ val urlStr = driverLogs("stderr")
+ // Ensure that this is a valid URL, else this will throw an exception
+ new URL(urlStr)
+ val containerId = YarnSparkHadoopUtil.get.getContainerId
+ val user = Utils.getCurrentUserName()
+ assert(urlStr.endsWith(s"/node/containerlogs/$containerId/$user/stderr?start=-4096"))
+ }
+ }
+
+}
+
+private object YarnClasspathTest extends Logging {
+ def error(m: String, ex: Throwable = null): Unit = {
+ logError(m, ex)
+ // scalastyle:off println
+ System.out.println(m)
+ if (ex != null) {
+ ex.printStackTrace(System.out)
+ }
+ // scalastyle:on println
+ }
+
+ def main(args: Array[String]): Unit = {
+ if (args.length != 2) {
+ error(
+ s"""
+ |Invalid command line: ${args.mkString(" ")}
+ |
+ |Usage: YarnClasspathTest [driver result file] [executor result file]
+ """.stripMargin)
+ // scalastyle:on println
+ }
+
+ readResource(args(0))
+ val sc = new SparkContext(new SparkConf())
+ try {
+ sc.parallelize(Seq(1)).foreach { x => readResource(args(1)) }
+ } finally {
+ sc.stop()
+ }
+ }
+
+ private def readResource(resultPath: String): Unit = {
+ var result = "failure"
+ try {
+ val ccl = Thread.currentThread().getContextClassLoader()
+ val resource = ccl.getResourceAsStream("test.resource")
+ val bytes = ByteStreams.toByteArray(resource)
+ result = new String(bytes, 0, bytes.length, StandardCharsets.UTF_8)
+ } catch {
+ case t: Throwable =>
+ error(s"loading test.resource to $resultPath", t)
+ } finally {
+ Files.write(result, new File(resultPath), StandardCharsets.UTF_8)
+ }
+ }
+
+}
+
+private object YarnLauncherTestApp {
+
+ def main(args: Array[String]): Unit = {
+ // Do not stop the application; the test will stop it using the launcher lib. Just run a task
+ // that will prevent the process from exiting.
+ val sc = new SparkContext(new SparkConf())
+ sc.parallelize(Seq(1)).foreach { i =>
+ this.synchronized {
+ wait()
+ }
+ }
+ }
+
+}
+
+/**
+ * Used to test code in the AM that detects the SparkContext instance. Expects a single argument
+ * with the duration to sleep for, in ms.
+ */
+private object SparkContextTimeoutApp {
+
+ def main(args: Array[String]): Unit = {
+ val Array(sleepTime) = args
+ Thread.sleep(java.lang.Long.parseLong(sleepTime))
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala
new file mode 100644
index 0000000..950ebd9
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala
@@ -0,0 +1,112 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.spark.deploy.yarn
+
+import java.io.File
+import java.nio.charset.StandardCharsets
+
+import com.google.common.io.Files
+import org.apache.commons.io.FileUtils
+import org.apache.hadoop.yarn.conf.YarnConfiguration
+import org.scalatest.Matchers
+
+import org.apache.spark._
+import org.apache.spark.internal.Logging
+import org.apache.spark.network.shuffle.ShuffleTestAccessor
+import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor}
+import org.apache.spark.tags.ExtendedYarnTest
+
+/**
+ * Integration test for the external shuffle service with a yarn mini-cluster
+ */
+@ExtendedYarnTest
+class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite {
+
+ override def newYarnConfig(): YarnConfiguration = {
+ val yarnConfig = new YarnConfiguration()
+ yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle")
+ yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"),
+ classOf[YarnShuffleService].getCanonicalName)
+ yarnConfig.set("spark.shuffle.service.port", "0")
+ yarnConfig
+ }
+
+ test("external shuffle service") {
+ val shuffleServicePort = YarnTestAccessor.getShuffleServicePort
+ val shuffleService = YarnTestAccessor.getShuffleServiceInstance
+
+ val registeredExecFile = YarnTestAccessor.getRegisteredExecutorFile(shuffleService)
+
+ logInfo("Shuffle service port = " + shuffleServicePort)
+ val result = File.createTempFile("result", null, tempDir)
+ val finalState = runSpark(
+ false,
+ mainClassName(YarnExternalShuffleDriver.getClass),
+ appArgs = Seq(result.getAbsolutePath(), registeredExecFile.getAbsolutePath),
+ extraConf = Map(
+ "spark.shuffle.service.enabled" -> "true",
+ "spark.shuffle.service.port" -> shuffleServicePort.toString
+ )
+ )
+ checkResult(finalState, result)
+ assert(YarnTestAccessor.getRegisteredExecutorFile(shuffleService).exists())
+ }
+}
+
+private object YarnExternalShuffleDriver extends Logging with Matchers {
+
+ val WAIT_TIMEOUT_MILLIS = 10000
+
+ def main(args: Array[String]): Unit = {
+ if (args.length != 2) {
+ // scalastyle:off println
+ System.err.println(
+ s"""
+ |Invalid command line: ${args.mkString(" ")}
+ |
+ |Usage: ExternalShuffleDriver [result file] [registered exec file]
+ """.stripMargin)
+ // scalastyle:on println
+ System.exit(1)
+ }
+
+ val sc = new SparkContext(new SparkConf()
+ .setAppName("External Shuffle Test"))
+ val conf = sc.getConf
+ val status = new File(args(0))
+ val registeredExecFile = new File(args(1))
+ logInfo("shuffle service executor file = " + registeredExecFile)
+ var result = "failure"
+ val execStateCopy = new File(registeredExecFile.getAbsolutePath + "_dup")
+ try {
+ val data = sc.parallelize(0 until 100, 10).map { x => (x % 10) -> x }.reduceByKey{ _ + _ }.
+ collect().toSet
+ sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
+ data should be ((0 until 10).map{x => x -> (x * 10 + 450)}.toSet)
+ result = "success"
+ // only one process can open a leveldb file at a time, so we copy the files
+ FileUtils.copyDirectory(registeredExecFile, execStateCopy)
+ assert(!ShuffleTestAccessor.reloadRegisteredExecutors(execStateCopy).isEmpty)
+ } finally {
+ sc.stop()
+ FileUtils.deleteDirectory(execStateCopy)
+ Files.write(result, status, StandardCharsets.UTF_8)
+ }
+ }
+
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org
[17/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Posted by va...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala
deleted file mode 100644
index f384290..0000000
--- a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala
+++ /dev/null
@@ -1,740 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import java.io.File
-import java.util.{Collections, Date, List => JList}
-
-import scala.collection.JavaConverters._
-import scala.collection.mutable
-import scala.collection.mutable.ArrayBuffer
-
-import org.apache.mesos.{Scheduler, SchedulerDriver}
-import org.apache.mesos.Protos.{TaskState => MesosTaskState, _}
-import org.apache.mesos.Protos.Environment.Variable
-import org.apache.mesos.Protos.TaskStatus.Reason
-
-import org.apache.spark.{SecurityManager, SparkConf, SparkException, TaskState}
-import org.apache.spark.deploy.mesos.MesosDriverDescription
-import org.apache.spark.deploy.rest.{CreateSubmissionResponse, KillSubmissionResponse, SubmissionStatusResponse}
-import org.apache.spark.metrics.MetricsSystem
-import org.apache.spark.util.Utils
-
-/**
- * Tracks the current state of a Mesos Task that runs a Spark driver.
- * @param driverDescription Submitted driver description from
- * [[org.apache.spark.deploy.rest.mesos.MesosRestServer]]
- * @param taskId Mesos TaskID generated for the task
- * @param slaveId Slave ID that the task is assigned to
- * @param mesosTaskStatus The last known task status update.
- * @param startDate The date the task was launched
- * @param finishDate The date the task finished
- * @param frameworkId Mesos framework ID the task registers with
- */
-private[spark] class MesosClusterSubmissionState(
- val driverDescription: MesosDriverDescription,
- val taskId: TaskID,
- val slaveId: SlaveID,
- var mesosTaskStatus: Option[TaskStatus],
- var startDate: Date,
- var finishDate: Option[Date],
- val frameworkId: String)
- extends Serializable {
-
- def copy(): MesosClusterSubmissionState = {
- new MesosClusterSubmissionState(
- driverDescription, taskId, slaveId, mesosTaskStatus, startDate, finishDate, frameworkId)
- }
-}
-
-/**
- * Tracks the retry state of a driver, which includes the next time it should be scheduled
- * and necessary information to do exponential backoff.
- * This class is not thread-safe, and we expect the caller to handle synchronizing state.
- *
- * @param lastFailureStatus Last Task status when it failed.
- * @param retries Number of times it has been retried.
- * @param nextRetry Time at which it should be retried next
- * @param waitTime The amount of time driver is scheduled to wait until next retry.
- */
-private[spark] class MesosClusterRetryState(
- val lastFailureStatus: TaskStatus,
- val retries: Int,
- val nextRetry: Date,
- val waitTime: Int) extends Serializable {
- def copy(): MesosClusterRetryState =
- new MesosClusterRetryState(lastFailureStatus, retries, nextRetry, waitTime)
-}
-
-/**
- * The full state of the cluster scheduler, currently being used for displaying
- * information on the UI.
- *
- * @param frameworkId Mesos Framework id for the cluster scheduler.
- * @param masterUrl The Mesos master url
- * @param queuedDrivers All drivers queued to be launched
- * @param launchedDrivers All launched or running drivers
- * @param finishedDrivers All terminated drivers
- * @param pendingRetryDrivers All drivers pending to be retried
- */
-private[spark] class MesosClusterSchedulerState(
- val frameworkId: String,
- val masterUrl: Option[String],
- val queuedDrivers: Iterable[MesosDriverDescription],
- val launchedDrivers: Iterable[MesosClusterSubmissionState],
- val finishedDrivers: Iterable[MesosClusterSubmissionState],
- val pendingRetryDrivers: Iterable[MesosDriverDescription])
-
-/**
- * The full state of a Mesos driver, that is being used to display driver information on the UI.
- */
-private[spark] class MesosDriverState(
- val state: String,
- val description: MesosDriverDescription,
- val submissionState: Option[MesosClusterSubmissionState] = None)
-
-/**
- * A Mesos scheduler that is responsible for launching submitted Spark drivers in cluster mode
- * as Mesos tasks in a Mesos cluster.
- * All drivers are launched asynchronously by the framework, which will eventually be launched
- * by one of the slaves in the cluster. The results of the driver will be stored in slave's task
- * sandbox which is accessible by visiting the Mesos UI.
- * This scheduler supports recovery by persisting all its state and performs task reconciliation
- * on recover, which gets all the latest state for all the drivers from Mesos master.
- */
-private[spark] class MesosClusterScheduler(
- engineFactory: MesosClusterPersistenceEngineFactory,
- conf: SparkConf)
- extends Scheduler with MesosSchedulerUtils {
- var frameworkUrl: String = _
- private val metricsSystem =
- MetricsSystem.createMetricsSystem("mesos_cluster", conf, new SecurityManager(conf))
- private val master = conf.get("spark.master")
- private val appName = conf.get("spark.app.name")
- private val queuedCapacity = conf.getInt("spark.mesos.maxDrivers", 200)
- private val retainedDrivers = conf.getInt("spark.mesos.retainedDrivers", 200)
- private val maxRetryWaitTime = conf.getInt("spark.mesos.cluster.retry.wait.max", 60) // 1 minute
- private val useFetchCache = conf.getBoolean("spark.mesos.fetchCache.enable", false)
- private val schedulerState = engineFactory.createEngine("scheduler")
- private val stateLock = new Object()
- private val finishedDrivers =
- new mutable.ArrayBuffer[MesosClusterSubmissionState](retainedDrivers)
- private var frameworkId: String = null
- // Holds all the launched drivers and current launch state, keyed by driver id.
- private val launchedDrivers = new mutable.HashMap[String, MesosClusterSubmissionState]()
- // Holds a map of driver id to expected slave id that is passed to Mesos for reconciliation.
- // All drivers that are loaded after failover are added here, as we need get the latest
- // state of the tasks from Mesos.
- private val pendingRecover = new mutable.HashMap[String, SlaveID]()
- // Stores all the submitted drivers that hasn't been launched.
- private val queuedDrivers = new ArrayBuffer[MesosDriverDescription]()
- // All supervised drivers that are waiting to retry after termination.
- private val pendingRetryDrivers = new ArrayBuffer[MesosDriverDescription]()
- private val queuedDriversState = engineFactory.createEngine("driverQueue")
- private val launchedDriversState = engineFactory.createEngine("launchedDrivers")
- private val pendingRetryDriversState = engineFactory.createEngine("retryList")
- // Flag to mark if the scheduler is ready to be called, which is until the scheduler
- // is registered with Mesos master.
- @volatile protected var ready = false
- private var masterInfo: Option[MasterInfo] = None
-
- def submitDriver(desc: MesosDriverDescription): CreateSubmissionResponse = {
- val c = new CreateSubmissionResponse
- if (!ready) {
- c.success = false
- c.message = "Scheduler is not ready to take requests"
- return c
- }
-
- stateLock.synchronized {
- if (isQueueFull()) {
- c.success = false
- c.message = "Already reached maximum submission size"
- return c
- }
- c.submissionId = desc.submissionId
- queuedDriversState.persist(desc.submissionId, desc)
- queuedDrivers += desc
- c.success = true
- }
- c
- }
-
- def killDriver(submissionId: String): KillSubmissionResponse = {
- val k = new KillSubmissionResponse
- if (!ready) {
- k.success = false
- k.message = "Scheduler is not ready to take requests"
- return k
- }
- k.submissionId = submissionId
- stateLock.synchronized {
- // We look for the requested driver in the following places:
- // 1. Check if submission is running or launched.
- // 2. Check if it's still queued.
- // 3. Check if it's in the retry list.
- // 4. Check if it has already completed.
- if (launchedDrivers.contains(submissionId)) {
- val task = launchedDrivers(submissionId)
- mesosDriver.killTask(task.taskId)
- k.success = true
- k.message = "Killing running driver"
- } else if (removeFromQueuedDrivers(submissionId)) {
- k.success = true
- k.message = "Removed driver while it's still pending"
- } else if (removeFromPendingRetryDrivers(submissionId)) {
- k.success = true
- k.message = "Removed driver while it's being retried"
- } else if (finishedDrivers.exists(_.driverDescription.submissionId.equals(submissionId))) {
- k.success = false
- k.message = "Driver already terminated"
- } else {
- k.success = false
- k.message = "Cannot find driver"
- }
- }
- k
- }
-
- def getDriverStatus(submissionId: String): SubmissionStatusResponse = {
- val s = new SubmissionStatusResponse
- if (!ready) {
- s.success = false
- s.message = "Scheduler is not ready to take requests"
- return s
- }
- s.submissionId = submissionId
- stateLock.synchronized {
- if (queuedDrivers.exists(_.submissionId.equals(submissionId))) {
- s.success = true
- s.driverState = "QUEUED"
- } else if (launchedDrivers.contains(submissionId)) {
- s.success = true
- s.driverState = "RUNNING"
- launchedDrivers(submissionId).mesosTaskStatus.foreach(state => s.message = state.toString)
- } else if (finishedDrivers.exists(_.driverDescription.submissionId.equals(submissionId))) {
- s.success = true
- s.driverState = "FINISHED"
- finishedDrivers
- .find(d => d.driverDescription.submissionId.equals(submissionId)).get.mesosTaskStatus
- .foreach(state => s.message = state.toString)
- } else if (pendingRetryDrivers.exists(_.submissionId.equals(submissionId))) {
- val status = pendingRetryDrivers.find(_.submissionId.equals(submissionId))
- .get.retryState.get.lastFailureStatus
- s.success = true
- s.driverState = "RETRYING"
- s.message = status.toString
- } else {
- s.success = false
- s.driverState = "NOT_FOUND"
- }
- }
- s
- }
-
- /**
- * Gets the driver state to be displayed on the Web UI.
- */
- def getDriverState(submissionId: String): Option[MesosDriverState] = {
- stateLock.synchronized {
- queuedDrivers.find(_.submissionId.equals(submissionId))
- .map(d => new MesosDriverState("QUEUED", d))
- .orElse(launchedDrivers.get(submissionId)
- .map(d => new MesosDriverState("RUNNING", d.driverDescription, Some(d))))
- .orElse(finishedDrivers.find(_.driverDescription.submissionId.equals(submissionId))
- .map(d => new MesosDriverState("FINISHED", d.driverDescription, Some(d))))
- .orElse(pendingRetryDrivers.find(_.submissionId.equals(submissionId))
- .map(d => new MesosDriverState("RETRYING", d)))
- }
- }
-
- private def isQueueFull(): Boolean = launchedDrivers.size >= queuedCapacity
-
- /**
- * Recover scheduler state that is persisted.
- * We still need to do task reconciliation to be up to date of the latest task states
- * as it might have changed while the scheduler is failing over.
- */
- private def recoverState(): Unit = {
- stateLock.synchronized {
- launchedDriversState.fetchAll[MesosClusterSubmissionState]().foreach { state =>
- launchedDrivers(state.taskId.getValue) = state
- pendingRecover(state.taskId.getValue) = state.slaveId
- }
- queuedDriversState.fetchAll[MesosDriverDescription]().foreach(d => queuedDrivers += d)
- // There is potential timing issue where a queued driver might have been launched
- // but the scheduler shuts down before the queued driver was able to be removed
- // from the queue. We try to mitigate this issue by walking through all queued drivers
- // and remove if they're already launched.
- queuedDrivers
- .filter(d => launchedDrivers.contains(d.submissionId))
- .foreach(d => removeFromQueuedDrivers(d.submissionId))
- pendingRetryDriversState.fetchAll[MesosDriverDescription]()
- .foreach(s => pendingRetryDrivers += s)
- // TODO: Consider storing finished drivers so we can show them on the UI after
- // failover. For now we clear the history on each recovery.
- finishedDrivers.clear()
- }
- }
-
- /**
- * Starts the cluster scheduler and wait until the scheduler is registered.
- * This also marks the scheduler to be ready for requests.
- */
- def start(): Unit = {
- // TODO: Implement leader election to make sure only one framework running in the cluster.
- val fwId = schedulerState.fetch[String]("frameworkId")
- fwId.foreach { id =>
- frameworkId = id
- }
- recoverState()
- metricsSystem.registerSource(new MesosClusterSchedulerSource(this))
- metricsSystem.start()
- val driver = createSchedulerDriver(
- master,
- MesosClusterScheduler.this,
- Utils.getCurrentUserName(),
- appName,
- conf,
- Some(frameworkUrl),
- Some(true),
- Some(Integer.MAX_VALUE),
- fwId)
-
- startScheduler(driver)
- ready = true
- }
-
- def stop(): Unit = {
- ready = false
- metricsSystem.report()
- metricsSystem.stop()
- mesosDriver.stop(true)
- }
-
- override def registered(
- driver: SchedulerDriver,
- newFrameworkId: FrameworkID,
- masterInfo: MasterInfo): Unit = {
- logInfo("Registered as framework ID " + newFrameworkId.getValue)
- if (newFrameworkId.getValue != frameworkId) {
- frameworkId = newFrameworkId.getValue
- schedulerState.persist("frameworkId", frameworkId)
- }
- markRegistered()
-
- stateLock.synchronized {
- this.masterInfo = Some(masterInfo)
- if (!pendingRecover.isEmpty) {
- // Start task reconciliation if we need to recover.
- val statuses = pendingRecover.collect {
- case (taskId, slaveId) =>
- val newStatus = TaskStatus.newBuilder()
- .setTaskId(TaskID.newBuilder().setValue(taskId).build())
- .setSlaveId(slaveId)
- .setState(MesosTaskState.TASK_STAGING)
- .build()
- launchedDrivers.get(taskId).map(_.mesosTaskStatus.getOrElse(newStatus))
- .getOrElse(newStatus)
- }
- // TODO: Page the status updates to avoid trying to reconcile
- // a large amount of tasks at once.
- driver.reconcileTasks(statuses.toSeq.asJava)
- }
- }
- }
-
- private def getDriverExecutorURI(desc: MesosDriverDescription): Option[String] = {
- desc.conf.getOption("spark.executor.uri")
- .orElse(desc.command.environment.get("SPARK_EXECUTOR_URI"))
- }
-
- private def getDriverFrameworkID(desc: MesosDriverDescription): String = {
- s"${frameworkId}-${desc.submissionId}"
- }
-
- private def adjust[A, B](m: collection.Map[A, B], k: A, default: B)(f: B => B) = {
- m.updated(k, f(m.getOrElse(k, default)))
- }
-
- private def getDriverEnvironment(desc: MesosDriverDescription): Environment = {
- // TODO(mgummelt): Don't do this here. This should be passed as a --conf
- val commandEnv = adjust(desc.command.environment, "SPARK_SUBMIT_OPTS", "")(
- v => s"$v -Dspark.mesos.driver.frameworkId=${getDriverFrameworkID(desc)}"
- )
-
- val env = desc.conf.getAllWithPrefix("spark.mesos.driverEnv.") ++ commandEnv
-
- val envBuilder = Environment.newBuilder()
- env.foreach { case (k, v) =>
- envBuilder.addVariables(Variable.newBuilder().setName(k).setValue(v))
- }
- envBuilder.build()
- }
-
- private def getDriverUris(desc: MesosDriverDescription): List[CommandInfo.URI] = {
- val confUris = List(conf.getOption("spark.mesos.uris"),
- desc.conf.getOption("spark.mesos.uris"),
- desc.conf.getOption("spark.submit.pyFiles")).flatMap(
- _.map(_.split(",").map(_.trim))
- ).flatten
-
- val jarUrl = desc.jarUrl.stripPrefix("file:").stripPrefix("local:")
-
- ((jarUrl :: confUris) ++ getDriverExecutorURI(desc).toList).map(uri =>
- CommandInfo.URI.newBuilder().setValue(uri.trim()).setCache(useFetchCache).build())
- }
-
- private def getDriverCommandValue(desc: MesosDriverDescription): String = {
- val dockerDefined = desc.conf.contains("spark.mesos.executor.docker.image")
- val executorUri = getDriverExecutorURI(desc)
- // Gets the path to run spark-submit, and the path to the Mesos sandbox.
- val (executable, sandboxPath) = if (dockerDefined) {
- // Application jar is automatically downloaded in the mounted sandbox by Mesos,
- // and the path to the mounted volume is stored in $MESOS_SANDBOX env variable.
- ("./bin/spark-submit", "$MESOS_SANDBOX")
- } else if (executorUri.isDefined) {
- val folderBasename = executorUri.get.split('/').last.split('.').head
-
- val entries = conf.getOption("spark.executor.extraLibraryPath")
- .map(path => Seq(path) ++ desc.command.libraryPathEntries)
- .getOrElse(desc.command.libraryPathEntries)
-
- val prefixEnv = if (!entries.isEmpty) Utils.libraryPathEnvPrefix(entries) else ""
-
- val cmdExecutable = s"cd $folderBasename*; $prefixEnv bin/spark-submit"
- // Sandbox path points to the parent folder as we chdir into the folderBasename.
- (cmdExecutable, "..")
- } else {
- val executorSparkHome = desc.conf.getOption("spark.mesos.executor.home")
- .orElse(conf.getOption("spark.home"))
- .orElse(Option(System.getenv("SPARK_HOME")))
- .getOrElse {
- throw new SparkException("Executor Spark home `spark.mesos.executor.home` is not set!")
- }
- val cmdExecutable = new File(executorSparkHome, "./bin/spark-submit").getPath
- // Sandbox points to the current directory by default with Mesos.
- (cmdExecutable, ".")
- }
- val cmdOptions = generateCmdOption(desc, sandboxPath).mkString(" ")
- val primaryResource = new File(sandboxPath, desc.jarUrl.split("/").last).toString()
- val appArguments = desc.command.arguments.mkString(" ")
-
- s"$executable $cmdOptions $primaryResource $appArguments"
- }
-
- private def buildDriverCommand(desc: MesosDriverDescription): CommandInfo = {
- val builder = CommandInfo.newBuilder()
- builder.setValue(getDriverCommandValue(desc))
- builder.setEnvironment(getDriverEnvironment(desc))
- builder.addAllUris(getDriverUris(desc).asJava)
- builder.build()
- }
-
- private def generateCmdOption(desc: MesosDriverDescription, sandboxPath: String): Seq[String] = {
- var options = Seq(
- "--name", desc.conf.get("spark.app.name"),
- "--master", s"mesos://${conf.get("spark.master")}",
- "--driver-cores", desc.cores.toString,
- "--driver-memory", s"${desc.mem}M")
-
- // Assume empty main class means we're running python
- if (!desc.command.mainClass.equals("")) {
- options ++= Seq("--class", desc.command.mainClass)
- }
-
- desc.conf.getOption("spark.executor.memory").foreach { v =>
- options ++= Seq("--executor-memory", v)
- }
- desc.conf.getOption("spark.cores.max").foreach { v =>
- options ++= Seq("--total-executor-cores", v)
- }
- desc.conf.getOption("spark.submit.pyFiles").foreach { pyFiles =>
- val formattedFiles = pyFiles.split(",")
- .map { path => new File(sandboxPath, path.split("/").last).toString() }
- .mkString(",")
- options ++= Seq("--py-files", formattedFiles)
- }
-
- // --conf
- val replicatedOptionsBlacklist = Set(
- "spark.jars", // Avoids duplicate classes in classpath
- "spark.submit.deployMode", // this would be set to `cluster`, but we need client
- "spark.master" // this contains the address of the dispatcher, not master
- )
- val defaultConf = conf.getAllWithPrefix("spark.mesos.dispatcher.driverDefault.").toMap
- val driverConf = desc.conf.getAll
- .filter { case (key, _) => !replicatedOptionsBlacklist.contains(key) }
- .toMap
- (defaultConf ++ driverConf).foreach { case (key, value) =>
- options ++= Seq("--conf", s""""$key=${shellEscape(value)}"""".stripMargin) }
-
- options
- }
-
- /**
- * Escape args for Unix-like shells, unless already quoted by the user.
- * Based on: http://www.gnu.org/software/bash/manual/html_node/Double-Quotes.html
- * and http://www.grymoire.com/Unix/Quote.html
- *
- * @param value argument
- * @return escaped argument
- */
- private[scheduler] def shellEscape(value: String): String = {
- val WrappedInQuotes = """^(".+"|'.+')$""".r
- val ShellSpecialChars = (""".*([ '<>&|\?\*;!#\\(\)"$`]).*""").r
- value match {
- case WrappedInQuotes(c) => value // The user quoted his args, don't touch it!
- case ShellSpecialChars(c) => "\"" + value.replaceAll("""(["`\$\\])""", """\\$1""") + "\""
- case _: String => value // Don't touch harmless strings
- }
- }
-
- private class ResourceOffer(
- val offerId: OfferID,
- val slaveId: SlaveID,
- var resources: JList[Resource]) {
- override def toString(): String = {
- s"Offer id: ${offerId}, resources: ${resources}"
- }
- }
-
- private def createTaskInfo(desc: MesosDriverDescription, offer: ResourceOffer): TaskInfo = {
- val taskId = TaskID.newBuilder().setValue(desc.submissionId).build()
-
- val (remainingResources, cpuResourcesToUse) =
- partitionResources(offer.resources, "cpus", desc.cores)
- val (finalResources, memResourcesToUse) =
- partitionResources(remainingResources.asJava, "mem", desc.mem)
- offer.resources = finalResources.asJava
-
- val appName = desc.conf.get("spark.app.name")
- val taskInfo = TaskInfo.newBuilder()
- .setTaskId(taskId)
- .setName(s"Driver for ${appName}")
- .setSlaveId(offer.slaveId)
- .setCommand(buildDriverCommand(desc))
- .addAllResources(cpuResourcesToUse.asJava)
- .addAllResources(memResourcesToUse.asJava)
- taskInfo.setContainer(MesosSchedulerBackendUtil.containerInfo(desc.conf))
- taskInfo.build
- }
-
- /**
- * This method takes all the possible candidates and attempt to schedule them with Mesos offers.
- * Every time a new task is scheduled, the afterLaunchCallback is called to perform post scheduled
- * logic on each task.
- */
- private def scheduleTasks(
- candidates: Seq[MesosDriverDescription],
- afterLaunchCallback: (String) => Boolean,
- currentOffers: List[ResourceOffer],
- tasks: mutable.HashMap[OfferID, ArrayBuffer[TaskInfo]]): Unit = {
- for (submission <- candidates) {
- val driverCpu = submission.cores
- val driverMem = submission.mem
- logTrace(s"Finding offer to launch driver with cpu: $driverCpu, mem: $driverMem")
- val offerOption = currentOffers.find { o =>
- getResource(o.resources, "cpus") >= driverCpu &&
- getResource(o.resources, "mem") >= driverMem
- }
- if (offerOption.isEmpty) {
- logDebug(s"Unable to find offer to launch driver id: ${submission.submissionId}, " +
- s"cpu: $driverCpu, mem: $driverMem")
- } else {
- val offer = offerOption.get
- val queuedTasks = tasks.getOrElseUpdate(offer.offerId, new ArrayBuffer[TaskInfo])
- val task = createTaskInfo(submission, offer)
- queuedTasks += task
- logTrace(s"Using offer ${offer.offerId.getValue} to launch driver " +
- submission.submissionId)
- val newState = new MesosClusterSubmissionState(submission, task.getTaskId, offer.slaveId,
- None, new Date(), None, getDriverFrameworkID(submission))
- launchedDrivers(submission.submissionId) = newState
- launchedDriversState.persist(submission.submissionId, newState)
- afterLaunchCallback(submission.submissionId)
- }
- }
- }
-
- override def resourceOffers(driver: SchedulerDriver, offers: JList[Offer]): Unit = {
- logTrace(s"Received offers from Mesos: \n${offers.asScala.mkString("\n")}")
- val tasks = new mutable.HashMap[OfferID, ArrayBuffer[TaskInfo]]()
- val currentTime = new Date()
-
- val currentOffers = offers.asScala.map {
- o => new ResourceOffer(o.getId, o.getSlaveId, o.getResourcesList)
- }.toList
-
- stateLock.synchronized {
- // We first schedule all the supervised drivers that are ready to retry.
- // This list will be empty if none of the drivers are marked as supervise.
- val driversToRetry = pendingRetryDrivers.filter { d =>
- d.retryState.get.nextRetry.before(currentTime)
- }
-
- scheduleTasks(
- copyBuffer(driversToRetry),
- removeFromPendingRetryDrivers,
- currentOffers,
- tasks)
-
- // Then we walk through the queued drivers and try to schedule them.
- scheduleTasks(
- copyBuffer(queuedDrivers),
- removeFromQueuedDrivers,
- currentOffers,
- tasks)
- }
- tasks.foreach { case (offerId, taskInfos) =>
- driver.launchTasks(Collections.singleton(offerId), taskInfos.asJava)
- }
-
- for (o <- currentOffers if !tasks.contains(o.offerId)) {
- driver.declineOffer(o.offerId)
- }
- }
-
- private def copyBuffer(
- buffer: ArrayBuffer[MesosDriverDescription]): ArrayBuffer[MesosDriverDescription] = {
- val newBuffer = new ArrayBuffer[MesosDriverDescription](buffer.size)
- buffer.copyToBuffer(newBuffer)
- newBuffer
- }
-
- def getSchedulerState(): MesosClusterSchedulerState = {
- stateLock.synchronized {
- new MesosClusterSchedulerState(
- frameworkId,
- masterInfo.map(m => s"http://${m.getIp}:${m.getPort}"),
- copyBuffer(queuedDrivers),
- launchedDrivers.values.map(_.copy()).toList,
- finishedDrivers.map(_.copy()).toList,
- copyBuffer(pendingRetryDrivers))
- }
- }
-
- override def offerRescinded(driver: SchedulerDriver, offerId: OfferID): Unit = {}
- override def disconnected(driver: SchedulerDriver): Unit = {}
- override def reregistered(driver: SchedulerDriver, masterInfo: MasterInfo): Unit = {
- logInfo(s"Framework re-registered with master ${masterInfo.getId}")
- }
- override def slaveLost(driver: SchedulerDriver, slaveId: SlaveID): Unit = {}
- override def error(driver: SchedulerDriver, error: String): Unit = {
- logError("Error received: " + error)
- markErr()
- }
-
- /**
- * Check if the task state is a recoverable state that we can relaunch the task.
- * Task state like TASK_ERROR are not relaunchable state since it wasn't able
- * to be validated by Mesos.
- */
- private def shouldRelaunch(state: MesosTaskState): Boolean = {
- state == MesosTaskState.TASK_FAILED ||
- state == MesosTaskState.TASK_KILLED ||
- state == MesosTaskState.TASK_LOST
- }
-
- override def statusUpdate(driver: SchedulerDriver, status: TaskStatus): Unit = {
- val taskId = status.getTaskId.getValue
- stateLock.synchronized {
- if (launchedDrivers.contains(taskId)) {
- if (status.getReason == Reason.REASON_RECONCILIATION &&
- !pendingRecover.contains(taskId)) {
- // Task has already received update and no longer requires reconciliation.
- return
- }
- val state = launchedDrivers(taskId)
- // Check if the driver is supervise enabled and can be relaunched.
- if (state.driverDescription.supervise && shouldRelaunch(status.getState)) {
- removeFromLaunchedDrivers(taskId)
- state.finishDate = Some(new Date())
- val retryState: Option[MesosClusterRetryState] = state.driverDescription.retryState
- val (retries, waitTimeSec) = retryState
- .map { rs => (rs.retries + 1, Math.min(maxRetryWaitTime, rs.waitTime * 2)) }
- .getOrElse{ (1, 1) }
- val nextRetry = new Date(new Date().getTime + waitTimeSec * 1000L)
-
- val newDriverDescription = state.driverDescription.copy(
- retryState = Some(new MesosClusterRetryState(status, retries, nextRetry, waitTimeSec)))
- pendingRetryDrivers += newDriverDescription
- pendingRetryDriversState.persist(taskId, newDriverDescription)
- } else if (TaskState.isFinished(mesosToTaskState(status.getState))) {
- removeFromLaunchedDrivers(taskId)
- state.finishDate = Some(new Date())
- if (finishedDrivers.size >= retainedDrivers) {
- val toRemove = math.max(retainedDrivers / 10, 1)
- finishedDrivers.trimStart(toRemove)
- }
- finishedDrivers += state
- }
- state.mesosTaskStatus = Option(status)
- } else {
- logError(s"Unable to find driver $taskId in status update")
- }
- }
- }
-
- override def frameworkMessage(
- driver: SchedulerDriver,
- executorId: ExecutorID,
- slaveId: SlaveID,
- message: Array[Byte]): Unit = {}
-
- override def executorLost(
- driver: SchedulerDriver,
- executorId: ExecutorID,
- slaveId: SlaveID,
- status: Int): Unit = {}
-
- private def removeFromQueuedDrivers(id: String): Boolean = {
- val index = queuedDrivers.indexWhere(_.submissionId.equals(id))
- if (index != -1) {
- queuedDrivers.remove(index)
- queuedDriversState.expunge(id)
- true
- } else {
- false
- }
- }
-
- private def removeFromLaunchedDrivers(id: String): Boolean = {
- if (launchedDrivers.remove(id).isDefined) {
- launchedDriversState.expunge(id)
- true
- } else {
- false
- }
- }
-
- private def removeFromPendingRetryDrivers(id: String): Boolean = {
- val index = pendingRetryDrivers.indexWhere(_.submissionId.equals(id))
- if (index != -1) {
- pendingRetryDrivers.remove(index)
- pendingRetryDriversState.expunge(id)
- true
- } else {
- false
- }
- }
-
- def getQueuedDriversSize: Int = queuedDrivers.size
- def getLaunchedDriversSize: Int = launchedDrivers.size
- def getPendingRetryDriversSize: Int = pendingRetryDrivers.size
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSource.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSource.scala b/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSource.scala
deleted file mode 100644
index 1fe9497..0000000
--- a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSource.scala
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import com.codahale.metrics.{Gauge, MetricRegistry}
-
-import org.apache.spark.metrics.source.Source
-
-private[mesos] class MesosClusterSchedulerSource(scheduler: MesosClusterScheduler)
- extends Source {
- override def sourceName: String = "mesos_cluster"
- override def metricRegistry: MetricRegistry = new MetricRegistry()
-
- metricRegistry.register(MetricRegistry.name("waitingDrivers"), new Gauge[Int] {
- override def getValue: Int = scheduler.getQueuedDriversSize
- })
-
- metricRegistry.register(MetricRegistry.name("launchedDrivers"), new Gauge[Int] {
- override def getValue: Int = scheduler.getLaunchedDriversSize
- })
-
- metricRegistry.register(MetricRegistry.name("retryDrivers"), new Gauge[Int] {
- override def getValue: Int = scheduler.getPendingRetryDriversSize
- })
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala
deleted file mode 100644
index 3258b09..0000000
--- a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala
+++ /dev/null
@@ -1,668 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import java.io.File
-import java.util.{Collections, List => JList}
-import java.util.concurrent.locks.ReentrantLock
-
-import scala.collection.JavaConverters._
-import scala.collection.mutable
-import scala.concurrent.Future
-
-import org.apache.mesos.Protos.{TaskInfo => MesosTaskInfo, _}
-
-import org.apache.spark.{SecurityManager, SparkContext, SparkException, TaskState}
-import org.apache.spark.network.netty.SparkTransportConf
-import org.apache.spark.network.shuffle.mesos.MesosExternalShuffleClient
-import org.apache.spark.rpc.RpcEndpointAddress
-import org.apache.spark.scheduler.{SlaveLost, TaskSchedulerImpl}
-import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend
-import org.apache.spark.util.Utils
-
-/**
- * A SchedulerBackend that runs tasks on Mesos, but uses "coarse-grained" tasks, where it holds
- * onto each Mesos node for the duration of the Spark job instead of relinquishing cores whenever
- * a task is done. It launches Spark tasks within the coarse-grained Mesos tasks using the
- * CoarseGrainedSchedulerBackend mechanism. This class is useful for lower and more predictable
- * latency.
- *
- * Unfortunately this has a bit of duplication from [[MesosFineGrainedSchedulerBackend]],
- * but it seems hard to remove this.
- */
-private[spark] class MesosCoarseGrainedSchedulerBackend(
- scheduler: TaskSchedulerImpl,
- sc: SparkContext,
- master: String,
- securityManager: SecurityManager)
- extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv)
- with org.apache.mesos.Scheduler
- with MesosSchedulerUtils {
-
- val MAX_SLAVE_FAILURES = 2 // Blacklist a slave after this many failures
-
- // Maximum number of cores to acquire (TODO: we'll need more flexible controls here)
- val maxCores = conf.get("spark.cores.max", Int.MaxValue.toString).toInt
-
- val useFetcherCache = conf.getBoolean("spark.mesos.fetcherCache.enable", false)
-
- val maxGpus = conf.getInt("spark.mesos.gpus.max", 0)
-
- private[this] val shutdownTimeoutMS =
- conf.getTimeAsMs("spark.mesos.coarse.shutdownTimeout", "10s")
- .ensuring(_ >= 0, "spark.mesos.coarse.shutdownTimeout must be >= 0")
-
- // Synchronization protected by stateLock
- private[this] var stopCalled: Boolean = false
-
- // If shuffle service is enabled, the Spark driver will register with the shuffle service.
- // This is for cleaning up shuffle files reliably.
- private val shuffleServiceEnabled = conf.getBoolean("spark.shuffle.service.enabled", false)
-
- // Cores we have acquired with each Mesos task ID
- val coresByTaskId = new mutable.HashMap[String, Int]
- val gpusByTaskId = new mutable.HashMap[String, Int]
- var totalCoresAcquired = 0
- var totalGpusAcquired = 0
-
- // SlaveID -> Slave
- // This map accumulates entries for the duration of the job. Slaves are never deleted, because
- // we need to maintain e.g. failure state and connection state.
- private val slaves = new mutable.HashMap[String, Slave]
-
- /**
- * The total number of executors we aim to have. Undefined when not using dynamic allocation.
- * Initially set to 0 when using dynamic allocation, the executor allocation manager will send
- * the real initial limit later.
- */
- private var executorLimitOption: Option[Int] = {
- if (Utils.isDynamicAllocationEnabled(conf)) {
- Some(0)
- } else {
- None
- }
- }
-
- /**
- * Return the current executor limit, which may be [[Int.MaxValue]]
- * before properly initialized.
- */
- private[mesos] def executorLimit: Int = executorLimitOption.getOrElse(Int.MaxValue)
-
- // private lock object protecting mutable state above. Using the intrinsic lock
- // may lead to deadlocks since the superclass might also try to lock
- private val stateLock = new ReentrantLock
-
- val extraCoresPerExecutor = conf.getInt("spark.mesos.extra.cores", 0)
-
- // Offer constraints
- private val slaveOfferConstraints =
- parseConstraintString(sc.conf.get("spark.mesos.constraints", ""))
-
- // Reject offers with mismatched constraints in seconds
- private val rejectOfferDurationForUnmetConstraints =
- getRejectOfferDurationForUnmetConstraints(sc)
-
- // Reject offers when we reached the maximum number of cores for this framework
- private val rejectOfferDurationForReachedMaxCores =
- getRejectOfferDurationForReachedMaxCores(sc)
-
- // A client for talking to the external shuffle service
- private val mesosExternalShuffleClient: Option[MesosExternalShuffleClient] = {
- if (shuffleServiceEnabled) {
- Some(getShuffleClient())
- } else {
- None
- }
- }
-
- // This method is factored out for testability
- protected def getShuffleClient(): MesosExternalShuffleClient = {
- new MesosExternalShuffleClient(
- SparkTransportConf.fromSparkConf(conf, "shuffle"),
- securityManager,
- securityManager.isAuthenticationEnabled(),
- securityManager.isSaslEncryptionEnabled())
- }
-
- var nextMesosTaskId = 0
-
- @volatile var appId: String = _
-
- def newMesosTaskId(): String = {
- val id = nextMesosTaskId
- nextMesosTaskId += 1
- id.toString
- }
-
- override def start() {
- super.start()
- val driver = createSchedulerDriver(
- master,
- MesosCoarseGrainedSchedulerBackend.this,
- sc.sparkUser,
- sc.appName,
- sc.conf,
- sc.conf.getOption("spark.mesos.driver.webui.url").orElse(sc.ui.map(_.webUrl)),
- None,
- None,
- sc.conf.getOption("spark.mesos.driver.frameworkId")
- )
-
- unsetFrameworkID(sc)
- startScheduler(driver)
- }
-
- def createCommand(offer: Offer, numCores: Int, taskId: String): CommandInfo = {
- val environment = Environment.newBuilder()
- val extraClassPath = conf.getOption("spark.executor.extraClassPath")
- extraClassPath.foreach { cp =>
- environment.addVariables(
- Environment.Variable.newBuilder().setName("SPARK_CLASSPATH").setValue(cp).build())
- }
- val extraJavaOpts = conf.get("spark.executor.extraJavaOptions", "")
-
- // Set the environment variable through a command prefix
- // to append to the existing value of the variable
- val prefixEnv = conf.getOption("spark.executor.extraLibraryPath").map { p =>
- Utils.libraryPathEnvPrefix(Seq(p))
- }.getOrElse("")
-
- environment.addVariables(
- Environment.Variable.newBuilder()
- .setName("SPARK_EXECUTOR_OPTS")
- .setValue(extraJavaOpts)
- .build())
-
- sc.executorEnvs.foreach { case (key, value) =>
- environment.addVariables(Environment.Variable.newBuilder()
- .setName(key)
- .setValue(value)
- .build())
- }
- val command = CommandInfo.newBuilder()
- .setEnvironment(environment)
-
- val uri = conf.getOption("spark.executor.uri")
- .orElse(Option(System.getenv("SPARK_EXECUTOR_URI")))
-
- if (uri.isEmpty) {
- val executorSparkHome = conf.getOption("spark.mesos.executor.home")
- .orElse(sc.getSparkHome())
- .getOrElse {
- throw new SparkException("Executor Spark home `spark.mesos.executor.home` is not set!")
- }
- val runScript = new File(executorSparkHome, "./bin/spark-class").getPath
- command.setValue(
- "%s \"%s\" org.apache.spark.executor.CoarseGrainedExecutorBackend"
- .format(prefixEnv, runScript) +
- s" --driver-url $driverURL" +
- s" --executor-id $taskId" +
- s" --hostname ${executorHostname(offer)}" +
- s" --cores $numCores" +
- s" --app-id $appId")
- } else {
- // Grab everything to the first '.'. We'll use that and '*' to
- // glob the directory "correctly".
- val basename = uri.get.split('/').last.split('.').head
- command.setValue(
- s"cd $basename*; $prefixEnv " +
- "./bin/spark-class org.apache.spark.executor.CoarseGrainedExecutorBackend" +
- s" --driver-url $driverURL" +
- s" --executor-id $taskId" +
- s" --hostname ${executorHostname(offer)}" +
- s" --cores $numCores" +
- s" --app-id $appId")
- command.addUris(CommandInfo.URI.newBuilder().setValue(uri.get).setCache(useFetcherCache))
- }
-
- conf.getOption("spark.mesos.uris").foreach(setupUris(_, command, useFetcherCache))
-
- command.build()
- }
-
- protected def driverURL: String = {
- if (conf.contains("spark.testing")) {
- "driverURL"
- } else {
- RpcEndpointAddress(
- conf.get("spark.driver.host"),
- conf.get("spark.driver.port").toInt,
- CoarseGrainedSchedulerBackend.ENDPOINT_NAME).toString
- }
- }
-
- override def offerRescinded(d: org.apache.mesos.SchedulerDriver, o: OfferID) {}
-
- override def registered(
- d: org.apache.mesos.SchedulerDriver, frameworkId: FrameworkID, masterInfo: MasterInfo) {
- appId = frameworkId.getValue
- mesosExternalShuffleClient.foreach(_.init(appId))
- markRegistered()
- }
-
- override def sufficientResourcesRegistered(): Boolean = {
- totalCoresAcquired >= maxCores * minRegisteredRatio
- }
-
- override def disconnected(d: org.apache.mesos.SchedulerDriver) {}
-
- override def reregistered(d: org.apache.mesos.SchedulerDriver, masterInfo: MasterInfo) {}
-
- /**
- * Method called by Mesos to offer resources on slaves. We respond by launching an executor,
- * unless we've already launched more than we wanted to.
- */
- override def resourceOffers(d: org.apache.mesos.SchedulerDriver, offers: JList[Offer]) {
- stateLock.synchronized {
- if (stopCalled) {
- logDebug("Ignoring offers during shutdown")
- // Driver should simply return a stopped status on race
- // condition between this.stop() and completing here
- offers.asScala.map(_.getId).foreach(d.declineOffer)
- return
- }
-
- logDebug(s"Received ${offers.size} resource offers.")
-
- val (matchedOffers, unmatchedOffers) = offers.asScala.partition { offer =>
- val offerAttributes = toAttributeMap(offer.getAttributesList)
- matchesAttributeRequirements(slaveOfferConstraints, offerAttributes)
- }
-
- declineUnmatchedOffers(d, unmatchedOffers)
- handleMatchedOffers(d, matchedOffers)
- }
- }
-
- private def declineUnmatchedOffers(
- d: org.apache.mesos.SchedulerDriver, offers: mutable.Buffer[Offer]): Unit = {
- offers.foreach { offer =>
- declineOffer(d, offer, Some("unmet constraints"),
- Some(rejectOfferDurationForUnmetConstraints))
- }
- }
-
- private def declineOffer(
- d: org.apache.mesos.SchedulerDriver,
- offer: Offer,
- reason: Option[String] = None,
- refuseSeconds: Option[Long] = None): Unit = {
-
- val id = offer.getId.getValue
- val offerAttributes = toAttributeMap(offer.getAttributesList)
- val mem = getResource(offer.getResourcesList, "mem")
- val cpus = getResource(offer.getResourcesList, "cpus")
- val ports = getRangeResource(offer.getResourcesList, "ports")
-
- logDebug(s"Declining offer: $id with attributes: $offerAttributes mem: $mem" +
- s" cpu: $cpus port: $ports for $refuseSeconds seconds" +
- reason.map(r => s" (reason: $r)").getOrElse(""))
-
- refuseSeconds match {
- case Some(seconds) =>
- val filters = Filters.newBuilder().setRefuseSeconds(seconds).build()
- d.declineOffer(offer.getId, filters)
- case _ => d.declineOffer(offer.getId)
- }
- }
-
- /**
- * Launches executors on accepted offers, and declines unused offers. Executors are launched
- * round-robin on offers.
- *
- * @param d SchedulerDriver
- * @param offers Mesos offers that match attribute constraints
- */
- private def handleMatchedOffers(
- d: org.apache.mesos.SchedulerDriver, offers: mutable.Buffer[Offer]): Unit = {
- val tasks = buildMesosTasks(offers)
- for (offer <- offers) {
- val offerAttributes = toAttributeMap(offer.getAttributesList)
- val offerMem = getResource(offer.getResourcesList, "mem")
- val offerCpus = getResource(offer.getResourcesList, "cpus")
- val offerPorts = getRangeResource(offer.getResourcesList, "ports")
- val id = offer.getId.getValue
-
- if (tasks.contains(offer.getId)) { // accept
- val offerTasks = tasks(offer.getId)
-
- logDebug(s"Accepting offer: $id with attributes: $offerAttributes " +
- s"mem: $offerMem cpu: $offerCpus ports: $offerPorts." +
- s" Launching ${offerTasks.size} Mesos tasks.")
-
- for (task <- offerTasks) {
- val taskId = task.getTaskId
- val mem = getResource(task.getResourcesList, "mem")
- val cpus = getResource(task.getResourcesList, "cpus")
- val ports = getRangeResource(task.getResourcesList, "ports").mkString(",")
-
- logDebug(s"Launching Mesos task: ${taskId.getValue} with mem: $mem cpu: $cpus" +
- s" ports: $ports")
- }
-
- d.launchTasks(
- Collections.singleton(offer.getId),
- offerTasks.asJava)
- } else if (totalCoresAcquired >= maxCores) {
- // Reject an offer for a configurable amount of time to avoid starving other frameworks
- declineOffer(d, offer, Some("reached spark.cores.max"),
- Some(rejectOfferDurationForReachedMaxCores))
- } else {
- declineOffer(d, offer)
- }
- }
- }
-
- /**
- * Returns a map from OfferIDs to the tasks to launch on those offers. In order to maximize
- * per-task memory and IO, tasks are round-robin assigned to offers.
- *
- * @param offers Mesos offers that match attribute constraints
- * @return A map from OfferID to a list of Mesos tasks to launch on that offer
- */
- private def buildMesosTasks(offers: mutable.Buffer[Offer]): Map[OfferID, List[MesosTaskInfo]] = {
- // offerID -> tasks
- val tasks = new mutable.HashMap[OfferID, List[MesosTaskInfo]].withDefaultValue(Nil)
-
- // offerID -> resources
- val remainingResources = mutable.Map(offers.map(offer =>
- (offer.getId.getValue, offer.getResourcesList)): _*)
-
- var launchTasks = true
-
- // TODO(mgummelt): combine offers for a single slave
- //
- // round-robin create executors on the available offers
- while (launchTasks) {
- launchTasks = false
-
- for (offer <- offers) {
- val slaveId = offer.getSlaveId.getValue
- val offerId = offer.getId.getValue
- val resources = remainingResources(offerId)
-
- if (canLaunchTask(slaveId, resources)) {
- // Create a task
- launchTasks = true
- val taskId = newMesosTaskId()
- val offerCPUs = getResource(resources, "cpus").toInt
- val taskGPUs = Math.min(
- Math.max(0, maxGpus - totalGpusAcquired), getResource(resources, "gpus").toInt)
-
- val taskCPUs = executorCores(offerCPUs)
- val taskMemory = executorMemory(sc)
-
- slaves.getOrElseUpdate(slaveId, new Slave(offer.getHostname)).taskIDs.add(taskId)
-
- val (resourcesLeft, resourcesToUse) =
- partitionTaskResources(resources, taskCPUs, taskMemory, taskGPUs)
-
- val taskBuilder = MesosTaskInfo.newBuilder()
- .setTaskId(TaskID.newBuilder().setValue(taskId.toString).build())
- .setSlaveId(offer.getSlaveId)
- .setCommand(createCommand(offer, taskCPUs + extraCoresPerExecutor, taskId))
- .setName("Task " + taskId)
- taskBuilder.addAllResources(resourcesToUse.asJava)
- taskBuilder.setContainer(MesosSchedulerBackendUtil.containerInfo(sc.conf))
-
- tasks(offer.getId) ::= taskBuilder.build()
- remainingResources(offerId) = resourcesLeft.asJava
- totalCoresAcquired += taskCPUs
- coresByTaskId(taskId) = taskCPUs
- if (taskGPUs > 0) {
- totalGpusAcquired += taskGPUs
- gpusByTaskId(taskId) = taskGPUs
- }
- }
- }
- }
- tasks.toMap
- }
-
- /** Extracts task needed resources from a list of available resources. */
- private def partitionTaskResources(
- resources: JList[Resource],
- taskCPUs: Int,
- taskMemory: Int,
- taskGPUs: Int)
- : (List[Resource], List[Resource]) = {
-
- // partition cpus & mem
- val (afterCPUResources, cpuResourcesToUse) = partitionResources(resources, "cpus", taskCPUs)
- val (afterMemResources, memResourcesToUse) =
- partitionResources(afterCPUResources.asJava, "mem", taskMemory)
- val (afterGPUResources, gpuResourcesToUse) =
- partitionResources(afterMemResources.asJava, "gpus", taskGPUs)
-
- // If user specifies port numbers in SparkConfig then consecutive tasks will not be launched
- // on the same host. This essentially means one executor per host.
- // TODO: handle network isolator case
- val (nonPortResources, portResourcesToUse) =
- partitionPortResources(nonZeroPortValuesFromConfig(sc.conf), afterGPUResources)
-
- (nonPortResources,
- cpuResourcesToUse ++ memResourcesToUse ++ portResourcesToUse ++ gpuResourcesToUse)
- }
-
- private def canLaunchTask(slaveId: String, resources: JList[Resource]): Boolean = {
- val offerMem = getResource(resources, "mem")
- val offerCPUs = getResource(resources, "cpus").toInt
- val cpus = executorCores(offerCPUs)
- val mem = executorMemory(sc)
- val ports = getRangeResource(resources, "ports")
- val meetsPortRequirements = checkPorts(sc.conf, ports)
-
- cpus > 0 &&
- cpus <= offerCPUs &&
- cpus + totalCoresAcquired <= maxCores &&
- mem <= offerMem &&
- numExecutors() < executorLimit &&
- slaves.get(slaveId).map(_.taskFailures).getOrElse(0) < MAX_SLAVE_FAILURES &&
- meetsPortRequirements
- }
-
- private def executorCores(offerCPUs: Int): Int = {
- sc.conf.getInt("spark.executor.cores",
- math.min(offerCPUs, maxCores - totalCoresAcquired))
- }
-
- override def statusUpdate(d: org.apache.mesos.SchedulerDriver, status: TaskStatus) {
- val taskId = status.getTaskId.getValue
- val slaveId = status.getSlaveId.getValue
- val state = mesosToTaskState(status.getState)
-
- logInfo(s"Mesos task $taskId is now ${status.getState}")
-
- stateLock.synchronized {
- val slave = slaves(slaveId)
-
- // If the shuffle service is enabled, have the driver register with each one of the
- // shuffle services. This allows the shuffle services to clean up state associated with
- // this application when the driver exits. There is currently not a great way to detect
- // this through Mesos, since the shuffle services are set up independently.
- if (state.equals(TaskState.RUNNING) &&
- shuffleServiceEnabled &&
- !slave.shuffleRegistered) {
- assume(mesosExternalShuffleClient.isDefined,
- "External shuffle client was not instantiated even though shuffle service is enabled.")
- // TODO: Remove this and allow the MesosExternalShuffleService to detect
- // framework termination when new Mesos Framework HTTP API is available.
- val externalShufflePort = conf.getInt("spark.shuffle.service.port", 7337)
-
- logDebug(s"Connecting to shuffle service on slave $slaveId, " +
- s"host ${slave.hostname}, port $externalShufflePort for app ${conf.getAppId}")
-
- mesosExternalShuffleClient.get
- .registerDriverWithShuffleService(
- slave.hostname,
- externalShufflePort,
- sc.conf.getTimeAsMs("spark.storage.blockManagerSlaveTimeoutMs",
- s"${sc.conf.getTimeAsMs("spark.network.timeout", "120s")}ms"),
- sc.conf.getTimeAsMs("spark.executor.heartbeatInterval", "10s"))
- slave.shuffleRegistered = true
- }
-
- if (TaskState.isFinished(state)) {
- // Remove the cores we have remembered for this task, if it's in the hashmap
- for (cores <- coresByTaskId.get(taskId)) {
- totalCoresAcquired -= cores
- coresByTaskId -= taskId
- }
- // Also remove the gpus we have remembered for this task, if it's in the hashmap
- for (gpus <- gpusByTaskId.get(taskId)) {
- totalGpusAcquired -= gpus
- gpusByTaskId -= taskId
- }
- // If it was a failure, mark the slave as failed for blacklisting purposes
- if (TaskState.isFailed(state)) {
- slave.taskFailures += 1
-
- if (slave.taskFailures >= MAX_SLAVE_FAILURES) {
- logInfo(s"Blacklisting Mesos slave $slaveId due to too many failures; " +
- "is Spark installed on it?")
- }
- }
- executorTerminated(d, slaveId, taskId, s"Executor finished with state $state")
- // In case we'd rejected everything before but have now lost a node
- d.reviveOffers()
- }
- }
- }
-
- override def error(d: org.apache.mesos.SchedulerDriver, message: String) {
- logError(s"Mesos error: $message")
- scheduler.error(message)
- }
-
- override def stop() {
- // Make sure we're not launching tasks during shutdown
- stateLock.synchronized {
- if (stopCalled) {
- logWarning("Stop called multiple times, ignoring")
- return
- }
- stopCalled = true
- super.stop()
- }
-
- // Wait for executors to report done, or else mesosDriver.stop() will forcefully kill them.
- // See SPARK-12330
- val startTime = System.nanoTime()
-
- // slaveIdsWithExecutors has no memory barrier, so this is eventually consistent
- while (numExecutors() > 0 &&
- System.nanoTime() - startTime < shutdownTimeoutMS * 1000L * 1000L) {
- Thread.sleep(100)
- }
-
- if (numExecutors() > 0) {
- logWarning(s"Timed out waiting for ${numExecutors()} remaining executors "
- + s"to terminate within $shutdownTimeoutMS ms. This may leave temporary files "
- + "on the mesos nodes.")
- }
-
- // Close the mesos external shuffle client if used
- mesosExternalShuffleClient.foreach(_.close())
-
- if (mesosDriver != null) {
- mesosDriver.stop()
- }
- }
-
- override def frameworkMessage(
- d: org.apache.mesos.SchedulerDriver, e: ExecutorID, s: SlaveID, b: Array[Byte]) {}
-
- /**
- * Called when a slave is lost or a Mesos task finished. Updates local view on
- * what tasks are running. It also notifies the driver that an executor was removed.
- */
- private def executorTerminated(
- d: org.apache.mesos.SchedulerDriver,
- slaveId: String,
- taskId: String,
- reason: String): Unit = {
- stateLock.synchronized {
- // Do not call removeExecutor() after this scheduler backend was stopped because
- // removeExecutor() internally will send a message to the driver endpoint but
- // the driver endpoint is not available now, otherwise an exception will be thrown.
- if (!stopCalled) {
- removeExecutor(taskId, SlaveLost(reason))
- }
- slaves(slaveId).taskIDs.remove(taskId)
- }
- }
-
- override def slaveLost(d: org.apache.mesos.SchedulerDriver, slaveId: SlaveID): Unit = {
- logInfo(s"Mesos slave lost: ${slaveId.getValue}")
- }
-
- override def executorLost(
- d: org.apache.mesos.SchedulerDriver, e: ExecutorID, s: SlaveID, status: Int): Unit = {
- logInfo("Mesos executor lost: %s".format(e.getValue))
- }
-
- override def applicationId(): String =
- Option(appId).getOrElse {
- logWarning("Application ID is not initialized yet.")
- super.applicationId
- }
-
- override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = Future.successful {
- // We don't truly know if we can fulfill the full amount of executors
- // since at coarse grain it depends on the amount of slaves available.
- logInfo("Capping the total amount of executors to " + requestedTotal)
- executorLimitOption = Some(requestedTotal)
- true
- }
-
- override def doKillExecutors(executorIds: Seq[String]): Future[Boolean] = Future.successful {
- if (mesosDriver == null) {
- logWarning("Asked to kill executors before the Mesos driver was started.")
- false
- } else {
- for (executorId <- executorIds) {
- val taskId = TaskID.newBuilder().setValue(executorId).build()
- mesosDriver.killTask(taskId)
- }
- // no need to adjust `executorLimitOption` since the AllocationManager already communicated
- // the desired limit through a call to `doRequestTotalExecutors`.
- // See [[o.a.s.scheduler.cluster.CoarseGrainedSchedulerBackend.killExecutors]]
- true
- }
- }
-
- private def numExecutors(): Int = {
- slaves.values.map(_.taskIDs.size).sum
- }
-
- private def executorHostname(offer: Offer): String = {
- if (sc.conf.getOption("spark.mesos.network.name").isDefined) {
- // The agent's IP is not visible in a CNI container, so we bind to 0.0.0.0
- "0.0.0.0"
- } else {
- offer.getHostname
- }
- }
-}
-
-private class Slave(val hostname: String) {
- val taskIDs = new mutable.HashSet[String]()
- var taskFailures = 0
- var shuffleRegistered = false
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala b/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala
deleted file mode 100644
index 779ffb5..0000000
--- a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala
+++ /dev/null
@@ -1,444 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import java.io.File
-import java.util.{ArrayList => JArrayList, Collections, List => JList}
-
-import scala.collection.JavaConverters._
-import scala.collection.mutable.{HashMap, HashSet}
-
-import org.apache.mesos.Protos.{ExecutorInfo => MesosExecutorInfo, TaskInfo => MesosTaskInfo, _}
-import org.apache.mesos.protobuf.ByteString
-
-import org.apache.spark.{SparkContext, SparkException, TaskState}
-import org.apache.spark.executor.MesosExecutorBackend
-import org.apache.spark.scheduler._
-import org.apache.spark.scheduler.cluster.ExecutorInfo
-import org.apache.spark.util.Utils
-
-/**
- * A SchedulerBackend for running fine-grained tasks on Mesos. Each Spark task is mapped to a
- * separate Mesos task, allowing multiple applications to share cluster nodes both in space (tasks
- * from multiple apps can run on different cores) and in time (a core can switch ownership).
- */
-private[spark] class MesosFineGrainedSchedulerBackend(
- scheduler: TaskSchedulerImpl,
- sc: SparkContext,
- master: String)
- extends SchedulerBackend
- with org.apache.mesos.Scheduler
- with MesosSchedulerUtils {
-
- // Stores the slave ids that has launched a Mesos executor.
- val slaveIdToExecutorInfo = new HashMap[String, MesosExecutorInfo]
- val taskIdToSlaveId = new HashMap[Long, String]
-
- // An ExecutorInfo for our tasks
- var execArgs: Array[Byte] = null
-
- var classLoader: ClassLoader = null
-
- // The listener bus to publish executor added/removed events.
- val listenerBus = sc.listenerBus
-
- private[mesos] val mesosExecutorCores = sc.conf.getDouble("spark.mesos.mesosExecutor.cores", 1)
-
- // Offer constraints
- private[this] val slaveOfferConstraints =
- parseConstraintString(sc.conf.get("spark.mesos.constraints", ""))
-
- // reject offers with mismatched constraints in seconds
- private val rejectOfferDurationForUnmetConstraints =
- getRejectOfferDurationForUnmetConstraints(sc)
-
- @volatile var appId: String = _
-
- override def start() {
- classLoader = Thread.currentThread.getContextClassLoader
- val driver = createSchedulerDriver(
- master,
- MesosFineGrainedSchedulerBackend.this,
- sc.sparkUser,
- sc.appName,
- sc.conf,
- sc.conf.getOption("spark.mesos.driver.webui.url").orElse(sc.ui.map(_.webUrl)),
- Option.empty,
- Option.empty,
- sc.conf.getOption("spark.mesos.driver.frameworkId")
- )
-
- unsetFrameworkID(sc)
- startScheduler(driver)
- }
-
- /**
- * Creates a MesosExecutorInfo that is used to launch a Mesos executor.
- * @param availableResources Available resources that is offered by Mesos
- * @param execId The executor id to assign to this new executor.
- * @return A tuple of the new mesos executor info and the remaining available resources.
- */
- def createExecutorInfo(
- availableResources: JList[Resource],
- execId: String): (MesosExecutorInfo, JList[Resource]) = {
- val executorSparkHome = sc.conf.getOption("spark.mesos.executor.home")
- .orElse(sc.getSparkHome()) // Fall back to driver Spark home for backward compatibility
- .getOrElse {
- throw new SparkException("Executor Spark home `spark.mesos.executor.home` is not set!")
- }
- val environment = Environment.newBuilder()
- sc.conf.getOption("spark.executor.extraClassPath").foreach { cp =>
- environment.addVariables(
- Environment.Variable.newBuilder().setName("SPARK_CLASSPATH").setValue(cp).build())
- }
- val extraJavaOpts = sc.conf.getOption("spark.executor.extraJavaOptions").getOrElse("")
-
- val prefixEnv = sc.conf.getOption("spark.executor.extraLibraryPath").map { p =>
- Utils.libraryPathEnvPrefix(Seq(p))
- }.getOrElse("")
-
- environment.addVariables(
- Environment.Variable.newBuilder()
- .setName("SPARK_EXECUTOR_OPTS")
- .setValue(extraJavaOpts)
- .build())
- sc.executorEnvs.foreach { case (key, value) =>
- environment.addVariables(Environment.Variable.newBuilder()
- .setName(key)
- .setValue(value)
- .build())
- }
- val command = CommandInfo.newBuilder()
- .setEnvironment(environment)
- val uri = sc.conf.getOption("spark.executor.uri")
- .orElse(Option(System.getenv("SPARK_EXECUTOR_URI")))
-
- val executorBackendName = classOf[MesosExecutorBackend].getName
- if (uri.isEmpty) {
- val executorPath = new File(executorSparkHome, "/bin/spark-class").getPath
- command.setValue(s"$prefixEnv $executorPath $executorBackendName")
- } else {
- // Grab everything to the first '.'. We'll use that and '*' to
- // glob the directory "correctly".
- val basename = uri.get.split('/').last.split('.').head
- command.setValue(s"cd ${basename}*; $prefixEnv ./bin/spark-class $executorBackendName")
- command.addUris(CommandInfo.URI.newBuilder().setValue(uri.get))
- }
- val builder = MesosExecutorInfo.newBuilder()
- val (resourcesAfterCpu, usedCpuResources) =
- partitionResources(availableResources, "cpus", mesosExecutorCores)
- val (resourcesAfterMem, usedMemResources) =
- partitionResources(resourcesAfterCpu.asJava, "mem", executorMemory(sc))
-
- builder.addAllResources(usedCpuResources.asJava)
- builder.addAllResources(usedMemResources.asJava)
-
- sc.conf.getOption("spark.mesos.uris").foreach(setupUris(_, command))
-
- val executorInfo = builder
- .setExecutorId(ExecutorID.newBuilder().setValue(execId).build())
- .setCommand(command)
- .setData(ByteString.copyFrom(createExecArg()))
-
- executorInfo.setContainer(MesosSchedulerBackendUtil.containerInfo(sc.conf))
- (executorInfo.build(), resourcesAfterMem.asJava)
- }
-
- /**
- * Create and serialize the executor argument to pass to Mesos. Our executor arg is an array
- * containing all the spark.* system properties in the form of (String, String) pairs.
- */
- private def createExecArg(): Array[Byte] = {
- if (execArgs == null) {
- val props = new HashMap[String, String]
- for ((key, value) <- sc.conf.getAll) {
- props(key) = value
- }
- // Serialize the map as an array of (String, String) pairs
- execArgs = Utils.serialize(props.toArray)
- }
- execArgs
- }
-
- override def offerRescinded(d: org.apache.mesos.SchedulerDriver, o: OfferID) {}
-
- override def registered(
- d: org.apache.mesos.SchedulerDriver, frameworkId: FrameworkID, masterInfo: MasterInfo) {
- inClassLoader() {
- appId = frameworkId.getValue
- logInfo("Registered as framework ID " + appId)
- markRegistered()
- }
- }
-
- private def inClassLoader()(fun: => Unit) = {
- val oldClassLoader = Thread.currentThread.getContextClassLoader
- Thread.currentThread.setContextClassLoader(classLoader)
- try {
- fun
- } finally {
- Thread.currentThread.setContextClassLoader(oldClassLoader)
- }
- }
-
- override def disconnected(d: org.apache.mesos.SchedulerDriver) {}
-
- override def reregistered(d: org.apache.mesos.SchedulerDriver, masterInfo: MasterInfo) {}
-
- private def getTasksSummary(tasks: JArrayList[MesosTaskInfo]): String = {
- val builder = new StringBuilder
- tasks.asScala.foreach { t =>
- builder.append("Task id: ").append(t.getTaskId.getValue).append("\n")
- .append("Slave id: ").append(t.getSlaveId.getValue).append("\n")
- .append("Task resources: ").append(t.getResourcesList).append("\n")
- .append("Executor resources: ").append(t.getExecutor.getResourcesList)
- .append("---------------------------------------------\n")
- }
- builder.toString()
- }
-
- /**
- * Method called by Mesos to offer resources on slaves. We respond by asking our active task sets
- * for tasks in order of priority. We fill each node with tasks in a round-robin manner so that
- * tasks are balanced across the cluster.
- */
- override def resourceOffers(d: org.apache.mesos.SchedulerDriver, offers: JList[Offer]) {
- inClassLoader() {
- // Fail first on offers with unmet constraints
- val (offersMatchingConstraints, offersNotMatchingConstraints) =
- offers.asScala.partition { o =>
- val offerAttributes = toAttributeMap(o.getAttributesList)
- val meetsConstraints =
- matchesAttributeRequirements(slaveOfferConstraints, offerAttributes)
-
- // add some debug messaging
- if (!meetsConstraints) {
- val id = o.getId.getValue
- logDebug(s"Declining offer: $id with attributes: $offerAttributes")
- }
-
- meetsConstraints
- }
-
- // These offers do not meet constraints. We don't need to see them again.
- // Decline the offer for a long period of time.
- offersNotMatchingConstraints.foreach { o =>
- d.declineOffer(o.getId, Filters.newBuilder()
- .setRefuseSeconds(rejectOfferDurationForUnmetConstraints).build())
- }
-
- // Of the matching constraints, see which ones give us enough memory and cores
- val (usableOffers, unUsableOffers) = offersMatchingConstraints.partition { o =>
- val mem = getResource(o.getResourcesList, "mem")
- val cpus = getResource(o.getResourcesList, "cpus")
- val slaveId = o.getSlaveId.getValue
- val offerAttributes = toAttributeMap(o.getAttributesList)
-
- // check offers for
- // 1. Memory requirements
- // 2. CPU requirements - need at least 1 for executor, 1 for task
- val meetsMemoryRequirements = mem >= executorMemory(sc)
- val meetsCPURequirements = cpus >= (mesosExecutorCores + scheduler.CPUS_PER_TASK)
- val meetsRequirements =
- (meetsMemoryRequirements && meetsCPURequirements) ||
- (slaveIdToExecutorInfo.contains(slaveId) && cpus >= scheduler.CPUS_PER_TASK)
- val debugstr = if (meetsRequirements) "Accepting" else "Declining"
- logDebug(s"$debugstr offer: ${o.getId.getValue} with attributes: "
- + s"$offerAttributes mem: $mem cpu: $cpus")
-
- meetsRequirements
- }
-
- // Decline offers we ruled out immediately
- unUsableOffers.foreach(o => d.declineOffer(o.getId))
-
- val workerOffers = usableOffers.map { o =>
- val cpus = if (slaveIdToExecutorInfo.contains(o.getSlaveId.getValue)) {
- getResource(o.getResourcesList, "cpus").toInt
- } else {
- // If the Mesos executor has not been started on this slave yet, set aside a few
- // cores for the Mesos executor by offering fewer cores to the Spark executor
- (getResource(o.getResourcesList, "cpus") - mesosExecutorCores).toInt
- }
- new WorkerOffer(
- o.getSlaveId.getValue,
- o.getHostname,
- cpus)
- }.toIndexedSeq
-
- val slaveIdToOffer = usableOffers.map(o => o.getSlaveId.getValue -> o).toMap
- val slaveIdToWorkerOffer = workerOffers.map(o => o.executorId -> o).toMap
- val slaveIdToResources = new HashMap[String, JList[Resource]]()
- usableOffers.foreach { o =>
- slaveIdToResources(o.getSlaveId.getValue) = o.getResourcesList
- }
-
- val mesosTasks = new HashMap[String, JArrayList[MesosTaskInfo]]
-
- val slavesIdsOfAcceptedOffers = HashSet[String]()
-
- // Call into the TaskSchedulerImpl
- val acceptedOffers = scheduler.resourceOffers(workerOffers).filter(!_.isEmpty)
- acceptedOffers
- .foreach { offer =>
- offer.foreach { taskDesc =>
- val slaveId = taskDesc.executorId
- slavesIdsOfAcceptedOffers += slaveId
- taskIdToSlaveId(taskDesc.taskId) = slaveId
- val (mesosTask, remainingResources) = createMesosTask(
- taskDesc,
- slaveIdToResources(slaveId),
- slaveId)
- mesosTasks.getOrElseUpdate(slaveId, new JArrayList[MesosTaskInfo])
- .add(mesosTask)
- slaveIdToResources(slaveId) = remainingResources
- }
- }
-
- // Reply to the offers
- val filters = Filters.newBuilder().setRefuseSeconds(1).build() // TODO: lower timeout?
-
- mesosTasks.foreach { case (slaveId, tasks) =>
- slaveIdToWorkerOffer.get(slaveId).foreach(o =>
- listenerBus.post(SparkListenerExecutorAdded(System.currentTimeMillis(), slaveId,
- // TODO: Add support for log urls for Mesos
- new ExecutorInfo(o.host, o.cores, Map.empty)))
- )
- logTrace(s"Launching Mesos tasks on slave '$slaveId', tasks:\n${getTasksSummary(tasks)}")
- d.launchTasks(Collections.singleton(slaveIdToOffer(slaveId).getId), tasks, filters)
- }
-
- // Decline offers that weren't used
- // NOTE: This logic assumes that we only get a single offer for each host in a given batch
- for (o <- usableOffers if !slavesIdsOfAcceptedOffers.contains(o.getSlaveId.getValue)) {
- d.declineOffer(o.getId)
- }
- }
- }
-
- /** Turn a Spark TaskDescription into a Mesos task and also resources unused by the task */
- def createMesosTask(
- task: TaskDescription,
- resources: JList[Resource],
- slaveId: String): (MesosTaskInfo, JList[Resource]) = {
- val taskId = TaskID.newBuilder().setValue(task.taskId.toString).build()
- val (executorInfo, remainingResources) = if (slaveIdToExecutorInfo.contains(slaveId)) {
- (slaveIdToExecutorInfo(slaveId), resources)
- } else {
- createExecutorInfo(resources, slaveId)
- }
- slaveIdToExecutorInfo(slaveId) = executorInfo
- val (finalResources, cpuResources) =
- partitionResources(remainingResources, "cpus", scheduler.CPUS_PER_TASK)
- val taskInfo = MesosTaskInfo.newBuilder()
- .setTaskId(taskId)
- .setSlaveId(SlaveID.newBuilder().setValue(slaveId).build())
- .setExecutor(executorInfo)
- .setName(task.name)
- .addAllResources(cpuResources.asJava)
- .setData(MesosTaskLaunchData(task.serializedTask, task.attemptNumber).toByteString)
- .build()
- (taskInfo, finalResources.asJava)
- }
-
- override def statusUpdate(d: org.apache.mesos.SchedulerDriver, status: TaskStatus) {
- inClassLoader() {
- val tid = status.getTaskId.getValue.toLong
- val state = mesosToTaskState(status.getState)
- synchronized {
- if (TaskState.isFailed(mesosToTaskState(status.getState))
- && taskIdToSlaveId.contains(tid)) {
- // We lost the executor on this slave, so remember that it's gone
- removeExecutor(taskIdToSlaveId(tid), "Lost executor")
- }
- if (TaskState.isFinished(state)) {
- taskIdToSlaveId.remove(tid)
- }
- }
- scheduler.statusUpdate(tid, state, status.getData.asReadOnlyByteBuffer)
- }
- }
-
- override def error(d: org.apache.mesos.SchedulerDriver, message: String) {
- inClassLoader() {
- logError("Mesos error: " + message)
- markErr()
- scheduler.error(message)
- }
- }
-
- override def stop() {
- if (mesosDriver != null) {
- mesosDriver.stop()
- }
- }
-
- override def reviveOffers() {
- mesosDriver.reviveOffers()
- }
-
- override def frameworkMessage(
- d: org.apache.mesos.SchedulerDriver, e: ExecutorID, s: SlaveID, b: Array[Byte]) {}
-
- /**
- * Remove executor associated with slaveId in a thread safe manner.
- */
- private def removeExecutor(slaveId: String, reason: String) = {
- synchronized {
- listenerBus.post(SparkListenerExecutorRemoved(System.currentTimeMillis(), slaveId, reason))
- slaveIdToExecutorInfo -= slaveId
- }
- }
-
- private def recordSlaveLost(
- d: org.apache.mesos.SchedulerDriver, slaveId: SlaveID, reason: ExecutorLossReason) {
- inClassLoader() {
- logInfo("Mesos slave lost: " + slaveId.getValue)
- removeExecutor(slaveId.getValue, reason.toString)
- scheduler.executorLost(slaveId.getValue, reason)
- }
- }
-
- override def slaveLost(d: org.apache.mesos.SchedulerDriver, slaveId: SlaveID) {
- recordSlaveLost(d, slaveId, SlaveLost())
- }
-
- override def executorLost(
- d: org.apache.mesos.SchedulerDriver, executorId: ExecutorID, slaveId: SlaveID, status: Int) {
- logInfo("Executor lost: %s, marking slave %s as lost".format(executorId.getValue,
- slaveId.getValue))
- recordSlaveLost(d, slaveId, ExecutorExited(status, exitCausedByApp = true))
- }
-
- override def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = {
- mesosDriver.killTask(
- TaskID.newBuilder()
- .setValue(taskId.toString).build()
- )
- }
-
- // TODO: query Mesos for number of cores
- override def defaultParallelism(): Int = sc.conf.getInt("spark.default.parallelism", 8)
-
- override def applicationId(): String =
- Option(appId).getOrElse {
- logWarning("Application ID is not initialized yet.")
- super.applicationId
- }
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala b/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala
deleted file mode 100644
index a2adb22..0000000
--- a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import org.apache.mesos.Protos.{ContainerInfo, Image, NetworkInfo, Volume}
-import org.apache.mesos.Protos.ContainerInfo.{DockerInfo, MesosInfo}
-
-import org.apache.spark.{SparkConf, SparkException}
-import org.apache.spark.internal.Logging
-
-/**
- * A collection of utility functions which can be used by both the
- * MesosSchedulerBackend and the [[MesosFineGrainedSchedulerBackend]].
- */
-private[mesos] object MesosSchedulerBackendUtil extends Logging {
- /**
- * Parse a comma-delimited list of volume specs, each of which
- * takes the form [host-dir:]container-dir[:rw|:ro].
- */
- def parseVolumesSpec(volumes: String): List[Volume] = {
- volumes.split(",").map(_.split(":")).flatMap { spec =>
- val vol: Volume.Builder = Volume
- .newBuilder()
- .setMode(Volume.Mode.RW)
- spec match {
- case Array(container_path) =>
- Some(vol.setContainerPath(container_path))
- case Array(container_path, "rw") =>
- Some(vol.setContainerPath(container_path))
- case Array(container_path, "ro") =>
- Some(vol.setContainerPath(container_path)
- .setMode(Volume.Mode.RO))
- case Array(host_path, container_path) =>
- Some(vol.setContainerPath(container_path)
- .setHostPath(host_path))
- case Array(host_path, container_path, "rw") =>
- Some(vol.setContainerPath(container_path)
- .setHostPath(host_path))
- case Array(host_path, container_path, "ro") =>
- Some(vol.setContainerPath(container_path)
- .setHostPath(host_path)
- .setMode(Volume.Mode.RO))
- case spec =>
- logWarning(s"Unable to parse volume specs: $volumes. "
- + "Expected form: \"[host-dir:]container-dir[:rw|:ro](, ...)\"")
- None
- }
- }
- .map { _.build() }
- .toList
- }
-
- /**
- * Parse a comma-delimited list of port mapping specs, each of which
- * takes the form host_port:container_port[:udp|:tcp]
- *
- * Note:
- * the docker form is [ip:]host_port:container_port, but the DockerInfo
- * message has no field for 'ip', and instead has a 'protocol' field.
- * Docker itself only appears to support TCP, so this alternative form
- * anticipates the expansion of the docker form to allow for a protocol
- * and leaves open the chance for mesos to begin to accept an 'ip' field
- */
- def parsePortMappingsSpec(portmaps: String): List[DockerInfo.PortMapping] = {
- portmaps.split(",").map(_.split(":")).flatMap { spec: Array[String] =>
- val portmap: DockerInfo.PortMapping.Builder = DockerInfo.PortMapping
- .newBuilder()
- .setProtocol("tcp")
- spec match {
- case Array(host_port, container_port) =>
- Some(portmap.setHostPort(host_port.toInt)
- .setContainerPort(container_port.toInt))
- case Array(host_port, container_port, protocol) =>
- Some(portmap.setHostPort(host_port.toInt)
- .setContainerPort(container_port.toInt)
- .setProtocol(protocol))
- case spec =>
- logWarning(s"Unable to parse port mapping specs: $portmaps. "
- + "Expected form: \"host_port:container_port[:udp|:tcp](, ...)\"")
- None
- }
- }
- .map { _.build() }
- .toList
- }
-
- def containerInfo(conf: SparkConf): ContainerInfo = {
- val containerType = if (conf.contains("spark.mesos.executor.docker.image") &&
- conf.get("spark.mesos.containerizer", "docker") == "docker") {
- ContainerInfo.Type.DOCKER
- } else {
- ContainerInfo.Type.MESOS
- }
-
- val containerInfo = ContainerInfo.newBuilder()
- .setType(containerType)
-
- conf.getOption("spark.mesos.executor.docker.image").map { image =>
- val forcePullImage = conf
- .getOption("spark.mesos.executor.docker.forcePullImage")
- .exists(_.equals("true"))
-
- val portMaps = conf
- .getOption("spark.mesos.executor.docker.portmaps")
- .map(parsePortMappingsSpec)
- .getOrElse(List.empty)
-
- if (containerType == ContainerInfo.Type.DOCKER) {
- containerInfo.setDocker(dockerInfo(image, forcePullImage, portMaps))
- } else {
- containerInfo.setMesos(mesosInfo(image, forcePullImage))
- }
-
- val volumes = conf
- .getOption("spark.mesos.executor.docker.volumes")
- .map(parseVolumesSpec)
-
- volumes.foreach(_.foreach(containerInfo.addVolumes(_)))
- }
-
- conf.getOption("spark.mesos.network.name").map { name =>
- val info = NetworkInfo.newBuilder().setName(name).build()
- containerInfo.addNetworkInfos(info)
- }
-
- containerInfo.build()
- }
-
- private def dockerInfo(
- image: String,
- forcePullImage: Boolean,
- portMaps: List[ContainerInfo.DockerInfo.PortMapping]): DockerInfo = {
- val dockerBuilder = ContainerInfo.DockerInfo.newBuilder()
- .setImage(image)
- .setForcePullImage(forcePullImage)
- portMaps.foreach(dockerBuilder.addPortMappings(_))
-
- dockerBuilder.build
- }
-
- private def mesosInfo(image: String, forcePullImage: Boolean): MesosInfo = {
- val imageProto = Image.newBuilder()
- .setType(Image.Type.DOCKER)
- .setDocker(Image.Docker.newBuilder().setName(image))
- .setCached(!forcePullImage)
- ContainerInfo.MesosInfo.newBuilder()
- .setImage(imageProto)
- .build
- }
-}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org
[15/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Posted by va...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtilsSuite.scala
----------------------------------------------------------------------
diff --git a/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtilsSuite.scala b/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtilsSuite.scala
deleted file mode 100644
index ec47ab1..0000000
--- a/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtilsSuite.scala
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import scala.collection.JavaConverters._
-import scala.language.reflectiveCalls
-
-import org.apache.mesos.Protos.{Resource, Value}
-import org.mockito.Mockito._
-import org.scalatest._
-import org.scalatest.mock.MockitoSugar
-
-import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite}
-import org.apache.spark.internal.config._
-
-class MesosSchedulerUtilsSuite extends SparkFunSuite with Matchers with MockitoSugar {
-
- // scalastyle:off structural.type
- // this is the documented way of generating fixtures in scalatest
- def fixture: Object {val sc: SparkContext; val sparkConf: SparkConf} = new {
- val sparkConf = new SparkConf
- val sc = mock[SparkContext]
- when(sc.conf).thenReturn(sparkConf)
- }
-
- private def createTestPortResource(range: (Long, Long), role: Option[String] = None): Resource = {
- val rangeValue = Value.Range.newBuilder()
- rangeValue.setBegin(range._1)
- rangeValue.setEnd(range._2)
- val builder = Resource.newBuilder()
- .setName("ports")
- .setType(Value.Type.RANGES)
- .setRanges(Value.Ranges.newBuilder().addRange(rangeValue))
-
- role.foreach { r => builder.setRole(r) }
- builder.build()
- }
-
- private def rangesResourcesToTuple(resources: List[Resource]): List[(Long, Long)] = {
- resources.flatMap{resource => resource.getRanges.getRangeList
- .asScala.map(range => (range.getBegin, range.getEnd))}
- }
-
- def arePortsEqual(array1: Array[(Long, Long)], array2: Array[(Long, Long)])
- : Boolean = {
- array1.sortBy(identity).deep == array2.sortBy(identity).deep
- }
-
- def arePortsEqual(array1: Array[Long], array2: Array[Long])
- : Boolean = {
- array1.sortBy(identity).deep == array2.sortBy(identity).deep
- }
-
- def getRangesFromResources(resources: List[Resource]): List[(Long, Long)] = {
- resources.flatMap{ resource =>
- resource.getRanges.getRangeList.asScala.toList.map{
- range => (range.getBegin, range.getEnd)}}
- }
-
- val utils = new MesosSchedulerUtils { }
- // scalastyle:on structural.type
-
- test("use at-least minimum overhead") {
- val f = fixture
- when(f.sc.executorMemory).thenReturn(512)
- utils.executorMemory(f.sc) shouldBe 896
- }
-
- test("use overhead if it is greater than minimum value") {
- val f = fixture
- when(f.sc.executorMemory).thenReturn(4096)
- utils.executorMemory(f.sc) shouldBe 4505
- }
-
- test("use spark.mesos.executor.memoryOverhead (if set)") {
- val f = fixture
- when(f.sc.executorMemory).thenReturn(1024)
- f.sparkConf.set("spark.mesos.executor.memoryOverhead", "512")
- utils.executorMemory(f.sc) shouldBe 1536
- }
-
- test("parse a non-empty constraint string correctly") {
- val expectedMap = Map(
- "os" -> Set("centos7"),
- "zone" -> Set("us-east-1a", "us-east-1b")
- )
- utils.parseConstraintString("os:centos7;zone:us-east-1a,us-east-1b") should be (expectedMap)
- }
-
- test("parse an empty constraint string correctly") {
- utils.parseConstraintString("") shouldBe Map()
- }
-
- test("throw an exception when the input is malformed") {
- an[IllegalArgumentException] should be thrownBy
- utils.parseConstraintString("os;zone:us-east")
- }
-
- test("empty values for attributes' constraints matches all values") {
- val constraintsStr = "os:"
- val parsedConstraints = utils.parseConstraintString(constraintsStr)
-
- parsedConstraints shouldBe Map("os" -> Set())
-
- val zoneSet = Value.Set.newBuilder().addItem("us-east-1a").addItem("us-east-1b").build()
- val noOsOffer = Map("zone" -> zoneSet)
- val centosOffer = Map("os" -> Value.Text.newBuilder().setValue("centos").build())
- val ubuntuOffer = Map("os" -> Value.Text.newBuilder().setValue("ubuntu").build())
-
- utils.matchesAttributeRequirements(parsedConstraints, noOsOffer) shouldBe false
- utils.matchesAttributeRequirements(parsedConstraints, centosOffer) shouldBe true
- utils.matchesAttributeRequirements(parsedConstraints, ubuntuOffer) shouldBe true
- }
-
- test("subset match is performed for set attributes") {
- val supersetConstraint = Map(
- "os" -> Value.Text.newBuilder().setValue("ubuntu").build(),
- "zone" -> Value.Set.newBuilder()
- .addItem("us-east-1a")
- .addItem("us-east-1b")
- .addItem("us-east-1c")
- .build())
-
- val zoneConstraintStr = "os:;zone:us-east-1a,us-east-1c"
- val parsedConstraints = utils.parseConstraintString(zoneConstraintStr)
-
- utils.matchesAttributeRequirements(parsedConstraints, supersetConstraint) shouldBe true
- }
-
- test("less than equal match is performed on scalar attributes") {
- val offerAttribs = Map("gpus" -> Value.Scalar.newBuilder().setValue(3).build())
-
- val ltConstraint = utils.parseConstraintString("gpus:2")
- val eqConstraint = utils.parseConstraintString("gpus:3")
- val gtConstraint = utils.parseConstraintString("gpus:4")
-
- utils.matchesAttributeRequirements(ltConstraint, offerAttribs) shouldBe true
- utils.matchesAttributeRequirements(eqConstraint, offerAttribs) shouldBe true
- utils.matchesAttributeRequirements(gtConstraint, offerAttribs) shouldBe false
- }
-
- test("contains match is performed for range attributes") {
- val offerAttribs = Map("ports" -> Value.Range.newBuilder().setBegin(7000).setEnd(8000).build())
- val ltConstraint = utils.parseConstraintString("ports:6000")
- val eqConstraint = utils.parseConstraintString("ports:7500")
- val gtConstraint = utils.parseConstraintString("ports:8002")
- val multiConstraint = utils.parseConstraintString("ports:5000,7500,8300")
-
- utils.matchesAttributeRequirements(ltConstraint, offerAttribs) shouldBe false
- utils.matchesAttributeRequirements(eqConstraint, offerAttribs) shouldBe true
- utils.matchesAttributeRequirements(gtConstraint, offerAttribs) shouldBe false
- utils.matchesAttributeRequirements(multiConstraint, offerAttribs) shouldBe true
- }
-
- test("equality match is performed for text attributes") {
- val offerAttribs = Map("os" -> Value.Text.newBuilder().setValue("centos7").build())
-
- val trueConstraint = utils.parseConstraintString("os:centos7")
- val falseConstraint = utils.parseConstraintString("os:ubuntu")
-
- utils.matchesAttributeRequirements(trueConstraint, offerAttribs) shouldBe true
- utils.matchesAttributeRequirements(falseConstraint, offerAttribs) shouldBe false
- }
-
- test("Port reservation is done correctly with user specified ports only") {
- val conf = new SparkConf()
- conf.set("spark.executor.port", "3000" )
- conf.set(BLOCK_MANAGER_PORT, 4000)
- val portResource = createTestPortResource((3000, 5000), Some("my_role"))
-
- val (resourcesLeft, resourcesToBeUsed) = utils
- .partitionPortResources(List(3000, 4000), List(portResource))
- resourcesToBeUsed.length shouldBe 2
-
- val portsToUse = getRangesFromResources(resourcesToBeUsed).map{r => r._1}.toArray
-
- portsToUse.length shouldBe 2
- arePortsEqual(portsToUse, Array(3000L, 4000L)) shouldBe true
-
- val portRangesToBeUsed = rangesResourcesToTuple(resourcesToBeUsed)
-
- val expectedUSed = Array((3000L, 3000L), (4000L, 4000L))
-
- arePortsEqual(portRangesToBeUsed.toArray, expectedUSed) shouldBe true
- }
-
- test("Port reservation is done correctly with some user specified ports (spark.executor.port)") {
- val conf = new SparkConf()
- conf.set("spark.executor.port", "3100" )
- val portResource = createTestPortResource((3000, 5000), Some("my_role"))
-
- val (resourcesLeft, resourcesToBeUsed) = utils
- .partitionPortResources(List(3100), List(portResource))
-
- val portsToUse = getRangesFromResources(resourcesToBeUsed).map{r => r._1}
-
- portsToUse.length shouldBe 1
- portsToUse.contains(3100) shouldBe true
- }
-
- test("Port reservation is done correctly with all random ports") {
- val conf = new SparkConf()
- val portResource = createTestPortResource((3000L, 5000L), Some("my_role"))
-
- val (resourcesLeft, resourcesToBeUsed) = utils
- .partitionPortResources(List(), List(portResource))
- val portsToUse = getRangesFromResources(resourcesToBeUsed).map{r => r._1}
-
- portsToUse.isEmpty shouldBe true
- }
-
- test("Port reservation is done correctly with user specified ports only - multiple ranges") {
- val conf = new SparkConf()
- conf.set("spark.executor.port", "2100" )
- conf.set("spark.blockManager.port", "4000")
- val portResourceList = List(createTestPortResource((3000, 5000), Some("my_role")),
- createTestPortResource((2000, 2500), Some("other_role")))
- val (resourcesLeft, resourcesToBeUsed) = utils
- .partitionPortResources(List(2100, 4000), portResourceList)
- val portsToUse = getRangesFromResources(resourcesToBeUsed).map{r => r._1}
-
- portsToUse.length shouldBe 2
- val portsRangesLeft = rangesResourcesToTuple(resourcesLeft)
- val portRangesToBeUsed = rangesResourcesToTuple(resourcesToBeUsed)
-
- val expectedUsed = Array((2100L, 2100L), (4000L, 4000L))
-
- arePortsEqual(portsToUse.toArray, Array(2100L, 4000L)) shouldBe true
- arePortsEqual(portRangesToBeUsed.toArray, expectedUsed) shouldBe true
- }
-
- test("Port reservation is done correctly with all random ports - multiple ranges") {
- val conf = new SparkConf()
- val portResourceList = List(createTestPortResource((3000, 5000), Some("my_role")),
- createTestPortResource((2000, 2500), Some("other_role")))
- val (resourcesLeft, resourcesToBeUsed) = utils
- .partitionPortResources(List(), portResourceList)
- val portsToUse = getRangesFromResources(resourcesToBeUsed).map{r => r._1}
- portsToUse.isEmpty shouldBe true
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchDataSuite.scala
----------------------------------------------------------------------
diff --git a/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchDataSuite.scala b/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchDataSuite.scala
deleted file mode 100644
index 5a81bb3..0000000
--- a/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchDataSuite.scala
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import java.nio.ByteBuffer
-
-import org.apache.spark.SparkFunSuite
-
-class MesosTaskLaunchDataSuite extends SparkFunSuite {
- test("serialize and deserialize data must be same") {
- val serializedTask = ByteBuffer.allocate(40)
- (Range(100, 110).map(serializedTask.putInt(_)))
- serializedTask.rewind
- val attemptNumber = 100
- val byteString = MesosTaskLaunchData(serializedTask, attemptNumber).toByteString
- serializedTask.rewind
- val mesosTaskLaunchData = MesosTaskLaunchData.fromByteString(byteString)
- assert(mesosTaskLaunchData.attemptNumber == attemptNumber)
- assert(mesosTaskLaunchData.serializedTask.equals(serializedTask))
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala
----------------------------------------------------------------------
diff --git a/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala b/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala
deleted file mode 100644
index 7ebb294..0000000
--- a/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/Utils.scala
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import java.util.Collections
-
-import scala.collection.JavaConverters._
-
-import org.apache.mesos.Protos._
-import org.apache.mesos.Protos.Value.{Range => MesosRange, Ranges, Scalar}
-import org.apache.mesos.SchedulerDriver
-import org.mockito.{ArgumentCaptor, Matchers}
-import org.mockito.Mockito._
-
-object Utils {
- def createOffer(
- offerId: String,
- slaveId: String,
- mem: Int,
- cpus: Int,
- ports: Option[(Long, Long)] = None,
- gpus: Int = 0): Offer = {
- val builder = Offer.newBuilder()
- builder.addResourcesBuilder()
- .setName("mem")
- .setType(Value.Type.SCALAR)
- .setScalar(Scalar.newBuilder().setValue(mem))
- builder.addResourcesBuilder()
- .setName("cpus")
- .setType(Value.Type.SCALAR)
- .setScalar(Scalar.newBuilder().setValue(cpus))
- ports.foreach { resourcePorts =>
- builder.addResourcesBuilder()
- .setName("ports")
- .setType(Value.Type.RANGES)
- .setRanges(Ranges.newBuilder().addRange(MesosRange.newBuilder()
- .setBegin(resourcePorts._1).setEnd(resourcePorts._2).build()))
- }
- if (gpus > 0) {
- builder.addResourcesBuilder()
- .setName("gpus")
- .setType(Value.Type.SCALAR)
- .setScalar(Scalar.newBuilder().setValue(gpus))
- }
- builder.setId(createOfferId(offerId))
- .setFrameworkId(FrameworkID.newBuilder()
- .setValue("f1"))
- .setSlaveId(SlaveID.newBuilder().setValue(slaveId))
- .setHostname(s"host${slaveId}")
- .build()
- }
-
- def verifyTaskLaunched(driver: SchedulerDriver, offerId: String): List[TaskInfo] = {
- val captor = ArgumentCaptor.forClass(classOf[java.util.Collection[TaskInfo]])
- verify(driver, times(1)).launchTasks(
- Matchers.eq(Collections.singleton(createOfferId(offerId))),
- captor.capture())
- captor.getValue.asScala.toList
- }
-
- def createOfferId(offerId: String): OfferID = {
- OfferID.newBuilder().setValue(offerId).build()
- }
-
- def createSlaveId(slaveId: String): SlaveID = {
- SlaveID.newBuilder().setValue(slaveId).build()
- }
-
- def createExecutorId(executorId: String): ExecutorID = {
- ExecutorID.newBuilder().setValue(executorId).build()
- }
-
- def createTaskId(taskId: String): TaskID = {
- TaskID.newBuilder().setValue(taskId).build()
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 2f61d33..5e7dc15 100644
--- a/pom.xml
+++ b/pom.xml
@@ -2580,7 +2580,7 @@
<profile>
<id>yarn</id>
<modules>
- <module>yarn</module>
+ <module>resource-managers/yarn</module>
<module>common/network-yarn</module>
</modules>
</profile>
@@ -2588,7 +2588,7 @@
<profile>
<id>mesos</id>
<modules>
- <module>mesos</module>
+ <module>resource-managers/mesos</module>
</modules>
</profile>
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/pom.xml
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml
new file mode 100644
index 0000000..c0a8f9a
--- /dev/null
+++ b/resource-managers/mesos/pom.xml
@@ -0,0 +1,109 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.spark</groupId>
+ <artifactId>spark-parent_2.11</artifactId>
+ <version>2.2.0-SNAPSHOT</version>
+ <relativePath>../../pom.xml</relativePath>
+ </parent>
+
+ <artifactId>spark-mesos_2.11</artifactId>
+ <packaging>jar</packaging>
+ <name>Spark Project Mesos</name>
+ <properties>
+ <sbt.project.name>mesos</sbt.project.name>
+ <mesos.version>1.0.0</mesos.version>
+ <mesos.classifier>shaded-protobuf</mesos.classifier>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.spark</groupId>
+ <artifactId>spark-core_${scala.binary.version}</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.spark</groupId>
+ <artifactId>spark-core_${scala.binary.version}</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.mesos</groupId>
+ <artifactId>mesos</artifactId>
+ <version>${mesos.version}</version>
+ <classifier>${mesos.classifier}</classifier>
+ <exclusions>
+ <exclusion>
+ <groupId>com.google.protobuf</groupId>
+ <artifactId>protobuf-java</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+
+ <dependency>
+ <groupId>org.mockito</groupId>
+ <artifactId>mockito-core</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <!-- Explicitly depend on shaded dependencies from the parent, since shaded deps aren't transitive -->
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-server</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-plus</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-util</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-http</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-servlet</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-servlets</artifactId>
+ </dependency>
+ <!-- End of shaded deps. -->
+
+ </dependencies>
+
+
+ <build>
+ <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
+ <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
+ </build>
+
+</project>
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager b/resource-managers/mesos/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager
new file mode 100644
index 0000000..12b6d5b
--- /dev/null
+++ b/resource-managers/mesos/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager
@@ -0,0 +1 @@
+org.apache.spark.scheduler.cluster.mesos.MesosClusterManager
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala
new file mode 100644
index 0000000..792ade8
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.mesos
+
+import java.util.concurrent.CountDownLatch
+
+import org.apache.spark.{SecurityManager, SparkConf}
+import org.apache.spark.deploy.mesos.config._
+import org.apache.spark.deploy.mesos.ui.MesosClusterUI
+import org.apache.spark.deploy.rest.mesos.MesosRestServer
+import org.apache.spark.internal.Logging
+import org.apache.spark.scheduler.cluster.mesos._
+import org.apache.spark.util.{CommandLineUtils, ShutdownHookManager, Utils}
+
+/*
+ * A dispatcher that is responsible for managing and launching drivers, and is intended to be
+ * used for Mesos cluster mode. The dispatcher is a long-running process started by the user in
+ * the cluster independently of Spark applications.
+ * It contains a [[MesosRestServer]] that listens for requests to submit drivers and a
+ * [[MesosClusterScheduler]] that processes these requests by negotiating with the Mesos master
+ * for resources.
+ *
+ * A typical new driver lifecycle is the following:
+ * - Driver submitted via spark-submit talking to the [[MesosRestServer]]
+ * - [[MesosRestServer]] queues the driver request to [[MesosClusterScheduler]]
+ * - [[MesosClusterScheduler]] gets resource offers and launches the drivers that are in queue
+ *
+ * This dispatcher supports both Mesos fine-grain or coarse-grain mode as the mode is configurable
+ * per driver launched.
+ * This class is needed since Mesos doesn't manage frameworks, so the dispatcher acts as
+ * a daemon to launch drivers as Mesos frameworks upon request. The dispatcher is also started and
+ * stopped by sbin/start-mesos-dispatcher and sbin/stop-mesos-dispatcher respectively.
+ */
+private[mesos] class MesosClusterDispatcher(
+ args: MesosClusterDispatcherArguments,
+ conf: SparkConf)
+ extends Logging {
+
+ private val publicAddress = Option(conf.getenv("SPARK_PUBLIC_DNS")).getOrElse(args.host)
+ private val recoveryMode = conf.get(RECOVERY_MODE).toUpperCase()
+ logInfo("Recovery mode in Mesos dispatcher set to: " + recoveryMode)
+
+ private val engineFactory = recoveryMode match {
+ case "NONE" => new BlackHoleMesosClusterPersistenceEngineFactory
+ case "ZOOKEEPER" => new ZookeeperMesosClusterPersistenceEngineFactory(conf)
+ case _ => throw new IllegalArgumentException("Unsupported recovery mode: " + recoveryMode)
+ }
+
+ private val scheduler = new MesosClusterScheduler(engineFactory, conf)
+
+ private val server = new MesosRestServer(args.host, args.port, conf, scheduler)
+ private val webUi = new MesosClusterUI(
+ new SecurityManager(conf),
+ args.webUiPort,
+ conf,
+ publicAddress,
+ scheduler)
+
+ private val shutdownLatch = new CountDownLatch(1)
+
+ def start(): Unit = {
+ webUi.bind()
+ scheduler.frameworkUrl = conf.get(DISPATCHER_WEBUI_URL).getOrElse(webUi.activeWebUiUrl)
+ scheduler.start()
+ server.start()
+ }
+
+ def awaitShutdown(): Unit = {
+ shutdownLatch.await()
+ }
+
+ def stop(): Unit = {
+ webUi.stop()
+ server.stop()
+ scheduler.stop()
+ shutdownLatch.countDown()
+ }
+}
+
+private[mesos] object MesosClusterDispatcher
+ extends Logging
+ with CommandLineUtils {
+
+ override def main(args: Array[String]) {
+ Utils.initDaemon(log)
+ val conf = new SparkConf
+ val dispatcherArgs = new MesosClusterDispatcherArguments(args, conf)
+ conf.setMaster(dispatcherArgs.masterUrl)
+ conf.setAppName(dispatcherArgs.name)
+ dispatcherArgs.zookeeperUrl.foreach { z =>
+ conf.set(RECOVERY_MODE, "ZOOKEEPER")
+ conf.set(ZOOKEEPER_URL, z)
+ }
+ val dispatcher = new MesosClusterDispatcher(dispatcherArgs, conf)
+ dispatcher.start()
+ logDebug("Adding shutdown hook") // force eager creation of logger
+ ShutdownHookManager.addShutdownHook { () =>
+ logInfo("Shutdown hook is shutting down dispatcher")
+ dispatcher.stop()
+ dispatcher.awaitShutdown()
+ }
+ dispatcher.awaitShutdown()
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArguments.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArguments.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArguments.scala
new file mode 100644
index 0000000..ef08502
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArguments.scala
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.mesos
+
+import scala.annotation.tailrec
+import scala.collection.mutable
+
+import org.apache.spark.util.{IntParam, Utils}
+import org.apache.spark.SparkConf
+
+private[mesos] class MesosClusterDispatcherArguments(args: Array[String], conf: SparkConf) {
+ var host: String = Utils.localHostName()
+ var port: Int = 7077
+ var name: String = "Spark Cluster"
+ var webUiPort: Int = 8081
+ var verbose: Boolean = false
+ var masterUrl: String = _
+ var zookeeperUrl: Option[String] = None
+ var propertiesFile: String = _
+ val confProperties: mutable.HashMap[String, String] =
+ new mutable.HashMap[String, String]()
+
+ parse(args.toList)
+
+ // scalastyle:on println
+ propertiesFile = Utils.loadDefaultSparkProperties(conf, propertiesFile)
+ Utils.updateSparkConfigFromProperties(conf, confProperties)
+
+ // scalastyle:off println
+ if (verbose) {
+ MesosClusterDispatcher.printStream.println(s"Using host: $host")
+ MesosClusterDispatcher.printStream.println(s"Using port: $port")
+ MesosClusterDispatcher.printStream.println(s"Using webUiPort: $webUiPort")
+ MesosClusterDispatcher.printStream.println(s"Framework Name: $name")
+
+ Option(propertiesFile).foreach { file =>
+ MesosClusterDispatcher.printStream.println(s"Using properties file: $file")
+ }
+
+ MesosClusterDispatcher.printStream.println(s"Spark Config properties set:")
+ conf.getAll.foreach(println)
+ }
+
+ @tailrec
+ private def parse(args: List[String]): Unit = args match {
+ case ("--host" | "-h") :: value :: tail =>
+ Utils.checkHost(value, "Please use hostname " + value)
+ host = value
+ parse(tail)
+
+ case ("--port" | "-p") :: IntParam(value) :: tail =>
+ port = value
+ parse(tail)
+
+ case ("--webui-port") :: IntParam(value) :: tail =>
+ webUiPort = value
+ parse(tail)
+
+ case ("--zk" | "-z") :: value :: tail =>
+ zookeeperUrl = Some(value)
+ parse(tail)
+
+ case ("--master" | "-m") :: value :: tail =>
+ if (!value.startsWith("mesos://")) {
+ // scalastyle:off println
+ MesosClusterDispatcher.printStream
+ .println("Cluster dispatcher only supports mesos (uri begins with mesos://)")
+ // scalastyle:on println
+ MesosClusterDispatcher.exitFn(1)
+ }
+ masterUrl = value.stripPrefix("mesos://")
+ parse(tail)
+
+ case ("--name") :: value :: tail =>
+ name = value
+ parse(tail)
+
+ case ("--properties-file") :: value :: tail =>
+ propertiesFile = value
+ parse(tail)
+
+ case ("--conf") :: value :: tail =>
+ val pair = MesosClusterDispatcher.
+ parseSparkConfProperty(value)
+ confProperties(pair._1) = pair._2
+ parse(tail)
+
+ case ("--help") :: tail =>
+ printUsageAndExit(0)
+
+ case ("--verbose") :: tail =>
+ verbose = true
+ parse(tail)
+
+ case Nil =>
+ if (Option(masterUrl).isEmpty) {
+ // scalastyle:off println
+ MesosClusterDispatcher.printStream.println("--master is required")
+ // scalastyle:on println
+ printUsageAndExit(1)
+ }
+
+ case value =>
+ // scalastyle:off println
+ MesosClusterDispatcher.printStream.println(s"Unrecognized option: '${value.head}'")
+ // scalastyle:on println
+ printUsageAndExit(1)
+ }
+
+ private def printUsageAndExit(exitCode: Int): Unit = {
+ val outStream = MesosClusterDispatcher.printStream
+
+ // scalastyle:off println
+ outStream.println(
+ "Usage: MesosClusterDispatcher [options]\n" +
+ "\n" +
+ "Options:\n" +
+ " -h HOST, --host HOST Hostname to listen on\n" +
+ " --help Show this help message and exit.\n" +
+ " --verbose, Print additional debug output.\n" +
+ " -p PORT, --port PORT Port to listen on (default: 7077)\n" +
+ " --webui-port WEBUI_PORT WebUI Port to listen on (default: 8081)\n" +
+ " --name NAME Framework name to show in Mesos UI\n" +
+ " -m --master MASTER URI for connecting to Mesos master\n" +
+ " -z --zk ZOOKEEPER Comma delimited URLs for connecting to \n" +
+ " Zookeeper for persistence\n" +
+ " --properties-file FILE Path to a custom Spark properties file.\n" +
+ " Default is conf/spark-defaults.conf \n" +
+ " --conf PROP=VALUE Arbitrary Spark configuration property.\n" +
+ " Takes precedence over defined properties in properties-file.")
+ // scalastyle:on println
+ MesosClusterDispatcher.exitFn(exitCode)
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosDriverDescription.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosDriverDescription.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosDriverDescription.scala
new file mode 100644
index 0000000..d4c7022
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosDriverDescription.scala
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.mesos
+
+import java.util.Date
+
+import org.apache.spark.SparkConf
+import org.apache.spark.deploy.Command
+import org.apache.spark.scheduler.cluster.mesos.MesosClusterRetryState
+
+/**
+ * Describes a Spark driver that is submitted from the
+ * [[org.apache.spark.deploy.rest.mesos.MesosRestServer]], to be launched by
+ * [[org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler]].
+ * @param jarUrl URL to the application jar
+ * @param mem Amount of memory for the driver
+ * @param cores Number of cores for the driver
+ * @param supervise Supervise the driver for long running app
+ * @param command The command to launch the driver.
+ * @param schedulerProperties Extra properties to pass the Mesos scheduler
+ */
+private[spark] class MesosDriverDescription(
+ val name: String,
+ val jarUrl: String,
+ val mem: Int,
+ val cores: Double,
+ val supervise: Boolean,
+ val command: Command,
+ schedulerProperties: Map[String, String],
+ val submissionId: String,
+ val submissionDate: Date,
+ val retryState: Option[MesosClusterRetryState] = None)
+ extends Serializable {
+
+ val conf = new SparkConf(false)
+ schedulerProperties.foreach {case (k, v) => conf.set(k, v)}
+
+ def copy(
+ name: String = name,
+ jarUrl: String = jarUrl,
+ mem: Int = mem,
+ cores: Double = cores,
+ supervise: Boolean = supervise,
+ command: Command = command,
+ schedulerProperties: SparkConf = conf,
+ submissionId: String = submissionId,
+ submissionDate: Date = submissionDate,
+ retryState: Option[MesosClusterRetryState] = retryState): MesosDriverDescription = {
+
+ new MesosDriverDescription(name, jarUrl, mem, cores, supervise, command, conf.getAll.toMap,
+ submissionId, submissionDate, retryState)
+ }
+
+ override def toString: String = s"MesosDriverDescription (${command.mainClass})"
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosExternalShuffleService.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosExternalShuffleService.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosExternalShuffleService.scala
new file mode 100644
index 0000000..859aa83
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosExternalShuffleService.scala
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.mesos
+
+import java.nio.ByteBuffer
+import java.util.concurrent.{ConcurrentHashMap, TimeUnit}
+
+import scala.collection.JavaConverters._
+
+import org.apache.spark.{SecurityManager, SparkConf}
+import org.apache.spark.deploy.ExternalShuffleService
+import org.apache.spark.deploy.mesos.config._
+import org.apache.spark.internal.Logging
+import org.apache.spark.network.client.{RpcResponseCallback, TransportClient}
+import org.apache.spark.network.shuffle.ExternalShuffleBlockHandler
+import org.apache.spark.network.shuffle.protocol.BlockTransferMessage
+import org.apache.spark.network.shuffle.protocol.mesos.{RegisterDriver, ShuffleServiceHeartbeat}
+import org.apache.spark.network.util.TransportConf
+import org.apache.spark.util.ThreadUtils
+
+/**
+ * An RPC endpoint that receives registration requests from Spark drivers running on Mesos.
+ * It detects driver termination and calls the cleanup callback to [[ExternalShuffleService]].
+ */
+private[mesos] class MesosExternalShuffleBlockHandler(
+ transportConf: TransportConf,
+ cleanerIntervalS: Long)
+ extends ExternalShuffleBlockHandler(transportConf, null) with Logging {
+
+ ThreadUtils.newDaemonSingleThreadScheduledExecutor("shuffle-cleaner-watcher")
+ .scheduleAtFixedRate(new CleanerThread(), 0, cleanerIntervalS, TimeUnit.SECONDS)
+
+ // Stores a map of app id to app state (timeout value and last heartbeat)
+ private val connectedApps = new ConcurrentHashMap[String, AppState]()
+
+ protected override def handleMessage(
+ message: BlockTransferMessage,
+ client: TransportClient,
+ callback: RpcResponseCallback): Unit = {
+ message match {
+ case RegisterDriverParam(appId, appState) =>
+ val address = client.getSocketAddress
+ val timeout = appState.heartbeatTimeout
+ logInfo(s"Received registration request from app $appId (remote address $address, " +
+ s"heartbeat timeout $timeout ms).")
+ if (connectedApps.containsKey(appId)) {
+ logWarning(s"Received a registration request from app $appId, but it was already " +
+ s"registered")
+ }
+ connectedApps.put(appId, appState)
+ callback.onSuccess(ByteBuffer.allocate(0))
+ case Heartbeat(appId) =>
+ val address = client.getSocketAddress
+ Option(connectedApps.get(appId)) match {
+ case Some(existingAppState) =>
+ logTrace(s"Received ShuffleServiceHeartbeat from app '$appId' (remote " +
+ s"address $address).")
+ existingAppState.lastHeartbeat = System.nanoTime()
+ case None =>
+ logWarning(s"Received ShuffleServiceHeartbeat from an unknown app (remote " +
+ s"address $address, appId '$appId').")
+ }
+ case _ => super.handleMessage(message, client, callback)
+ }
+ }
+
+ /** An extractor object for matching [[RegisterDriver]] message. */
+ private object RegisterDriverParam {
+ def unapply(r: RegisterDriver): Option[(String, AppState)] =
+ Some((r.getAppId, new AppState(r.getHeartbeatTimeoutMs, System.nanoTime())))
+ }
+
+ private object Heartbeat {
+ def unapply(h: ShuffleServiceHeartbeat): Option[String] = Some(h.getAppId)
+ }
+
+ private class AppState(val heartbeatTimeout: Long, @volatile var lastHeartbeat: Long)
+
+ private class CleanerThread extends Runnable {
+ override def run(): Unit = {
+ val now = System.nanoTime()
+ connectedApps.asScala.foreach { case (appId, appState) =>
+ if (now - appState.lastHeartbeat > appState.heartbeatTimeout * 1000 * 1000) {
+ logInfo(s"Application $appId timed out. Removing shuffle files.")
+ connectedApps.remove(appId)
+ applicationRemoved(appId, true)
+ }
+ }
+ }
+ }
+}
+
+/**
+ * A wrapper of [[ExternalShuffleService]] that provides an additional endpoint for drivers
+ * to associate with. This allows the shuffle service to detect when a driver is terminated
+ * and can clean up the associated shuffle files.
+ */
+private[mesos] class MesosExternalShuffleService(conf: SparkConf, securityManager: SecurityManager)
+ extends ExternalShuffleService(conf, securityManager) {
+
+ protected override def newShuffleBlockHandler(
+ conf: TransportConf): ExternalShuffleBlockHandler = {
+ val cleanerIntervalS = this.conf.get(SHUFFLE_CLEANER_INTERVAL_S)
+ new MesosExternalShuffleBlockHandler(conf, cleanerIntervalS)
+ }
+}
+
+private[spark] object MesosExternalShuffleService extends Logging {
+
+ def main(args: Array[String]): Unit = {
+ ExternalShuffleService.main(args,
+ (conf: SparkConf, sm: SecurityManager) => new MesosExternalShuffleService(conf, sm))
+ }
+}
+
+
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala
new file mode 100644
index 0000000..19e2533
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.mesos
+
+import java.util.concurrent.TimeUnit
+
+import org.apache.spark.internal.config.ConfigBuilder
+
+package object config {
+
+ /* Common app configuration. */
+
+ private[spark] val SHUFFLE_CLEANER_INTERVAL_S =
+ ConfigBuilder("spark.shuffle.cleaner.interval")
+ .timeConf(TimeUnit.SECONDS)
+ .createWithDefaultString("30s")
+
+ private[spark] val RECOVERY_MODE =
+ ConfigBuilder("spark.deploy.recoveryMode")
+ .stringConf
+ .createWithDefault("NONE")
+
+ private[spark] val DISPATCHER_WEBUI_URL =
+ ConfigBuilder("spark.mesos.dispatcher.webui.url")
+ .doc("Set the Spark Mesos dispatcher webui_url for interacting with the " +
+ "framework. If unset it will point to Spark's internal web UI.")
+ .stringConf
+ .createOptional
+
+ private[spark] val ZOOKEEPER_URL =
+ ConfigBuilder("spark.deploy.zookeeper.url")
+ .doc("When `spark.deploy.recoveryMode` is set to ZOOKEEPER, this " +
+ "configuration is used to set the zookeeper URL to connect to.")
+ .stringConf
+ .createOptional
+
+ private[spark] val HISTORY_SERVER_URL =
+ ConfigBuilder("spark.mesos.dispatcher.historyServer.url")
+ .doc("Set the URL of the history server. The dispatcher will then " +
+ "link each driver to its entry in the history server.")
+ .stringConf
+ .createOptional
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/DriverPage.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/DriverPage.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/DriverPage.scala
new file mode 100644
index 0000000..cd98110
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/DriverPage.scala
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.mesos.ui
+
+import javax.servlet.http.HttpServletRequest
+
+import scala.xml.Node
+
+import org.apache.spark.deploy.Command
+import org.apache.spark.deploy.mesos.MesosDriverDescription
+import org.apache.spark.scheduler.cluster.mesos.{MesosClusterRetryState, MesosClusterSubmissionState}
+import org.apache.spark.ui.{UIUtils, WebUIPage}
+
+private[ui] class DriverPage(parent: MesosClusterUI) extends WebUIPage("driver") {
+
+ override def render(request: HttpServletRequest): Seq[Node] = {
+ val driverId = request.getParameter("id")
+ require(driverId != null && driverId.nonEmpty, "Missing id parameter")
+
+ val state = parent.scheduler.getDriverState(driverId)
+ if (state.isEmpty) {
+ val content =
+ <div>
+ <p>Cannot find driver {driverId}</p>
+ </div>
+ return UIUtils.basicSparkPage(content, s"Details for Job $driverId")
+ }
+ val driverState = state.get
+ val driverHeaders = Seq("Driver property", "Value")
+ val schedulerHeaders = Seq("Scheduler property", "Value")
+ val commandEnvHeaders = Seq("Command environment variable", "Value")
+ val launchedHeaders = Seq("Launched property", "Value")
+ val commandHeaders = Seq("Command property", "Value")
+ val retryHeaders = Seq("Last failed status", "Next retry time", "Retry count")
+ val driverDescription = Iterable.apply(driverState.description)
+ val submissionState = Iterable.apply(driverState.submissionState)
+ val command = Iterable.apply(driverState.description.command)
+ val schedulerProperties = Iterable.apply(driverState.description.conf.getAll.toMap)
+ val commandEnv = Iterable.apply(driverState.description.command.environment)
+ val driverTable =
+ UIUtils.listingTable(driverHeaders, driverRow, driverDescription)
+ val commandTable =
+ UIUtils.listingTable(commandHeaders, commandRow, command)
+ val commandEnvTable =
+ UIUtils.listingTable(commandEnvHeaders, propertiesRow, commandEnv)
+ val schedulerTable =
+ UIUtils.listingTable(schedulerHeaders, propertiesRow, schedulerProperties)
+ val launchedTable =
+ UIUtils.listingTable(launchedHeaders, launchedRow, submissionState)
+ val retryTable =
+ UIUtils.listingTable(
+ retryHeaders, retryRow, Iterable.apply(driverState.description.retryState))
+ val content =
+ <p>Driver state information for driver id {driverId}</p>
+ <a href={UIUtils.prependBaseUri("/")}>Back to Drivers</a>
+ <div class="row-fluid">
+ <div class="span12">
+ <h4>Driver state: {driverState.state}</h4>
+ <h4>Driver properties</h4>
+ {driverTable}
+ <h4>Driver command</h4>
+ {commandTable}
+ <h4>Driver command environment</h4>
+ {commandEnvTable}
+ <h4>Scheduler properties</h4>
+ {schedulerTable}
+ <h4>Launched state</h4>
+ {launchedTable}
+ <h4>Retry state</h4>
+ {retryTable}
+ </div>
+ </div>;
+
+ UIUtils.basicSparkPage(content, s"Details for Job $driverId")
+ }
+
+ private def launchedRow(submissionState: Option[MesosClusterSubmissionState]): Seq[Node] = {
+ submissionState.map { state =>
+ <tr>
+ <td>Mesos Slave ID</td>
+ <td>{state.slaveId.getValue}</td>
+ </tr>
+ <tr>
+ <td>Mesos Task ID</td>
+ <td>{state.taskId.getValue}</td>
+ </tr>
+ <tr>
+ <td>Launch Time</td>
+ <td>{state.startDate}</td>
+ </tr>
+ <tr>
+ <td>Finish Time</td>
+ <td>{state.finishDate.map(_.toString).getOrElse("")}</td>
+ </tr>
+ <tr>
+ <td>Last Task Status</td>
+ <td>{state.mesosTaskStatus.map(_.toString).getOrElse("")}</td>
+ </tr>
+ }.getOrElse(Seq[Node]())
+ }
+
+ private def propertiesRow(properties: collection.Map[String, String]): Seq[Node] = {
+ properties.map { case (k, v) =>
+ <tr>
+ <td>{k}</td><td>{v}</td>
+ </tr>
+ }.toSeq
+ }
+
+ private def commandRow(command: Command): Seq[Node] = {
+ <tr>
+ <td>Main class</td><td>{command.mainClass}</td>
+ </tr>
+ <tr>
+ <td>Arguments</td><td>{command.arguments.mkString(" ")}</td>
+ </tr>
+ <tr>
+ <td>Class path entries</td><td>{command.classPathEntries.mkString(" ")}</td>
+ </tr>
+ <tr>
+ <td>Java options</td><td>{command.javaOpts.mkString((" "))}</td>
+ </tr>
+ <tr>
+ <td>Library path entries</td><td>{command.libraryPathEntries.mkString((" "))}</td>
+ </tr>
+ }
+
+ private def driverRow(driver: MesosDriverDescription): Seq[Node] = {
+ <tr>
+ <td>Name</td><td>{driver.name}</td>
+ </tr>
+ <tr>
+ <td>Id</td><td>{driver.submissionId}</td>
+ </tr>
+ <tr>
+ <td>Cores</td><td>{driver.cores}</td>
+ </tr>
+ <tr>
+ <td>Memory</td><td>{driver.mem}</td>
+ </tr>
+ <tr>
+ <td>Submitted</td><td>{driver.submissionDate}</td>
+ </tr>
+ <tr>
+ <td>Supervise</td><td>{driver.supervise}</td>
+ </tr>
+ }
+
+ private def retryRow(retryState: Option[MesosClusterRetryState]): Seq[Node] = {
+ retryState.map { state =>
+ <tr>
+ <td>
+ {state.lastFailureStatus}
+ </td>
+ <td>
+ {state.nextRetry}
+ </td>
+ <td>
+ {state.retries}
+ </td>
+ </tr>
+ }.getOrElse(Seq[Node]())
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterPage.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterPage.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterPage.scala
new file mode 100644
index 0000000..13ba7d3
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterPage.scala
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.mesos.ui
+
+import javax.servlet.http.HttpServletRequest
+
+import scala.xml.Node
+
+import org.apache.mesos.Protos.TaskStatus
+
+import org.apache.spark.deploy.mesos.config._
+import org.apache.spark.deploy.mesos.MesosDriverDescription
+import org.apache.spark.scheduler.cluster.mesos.MesosClusterSubmissionState
+import org.apache.spark.ui.{UIUtils, WebUIPage}
+
+private[mesos] class MesosClusterPage(parent: MesosClusterUI) extends WebUIPage("") {
+ private val historyServerURL = parent.conf.get(HISTORY_SERVER_URL)
+
+ def render(request: HttpServletRequest): Seq[Node] = {
+ val state = parent.scheduler.getSchedulerState()
+
+ val driverHeader = Seq("Driver ID")
+ val historyHeader = historyServerURL.map(url => Seq("History")).getOrElse(Nil)
+ val submissionHeader = Seq("Submit Date", "Main Class", "Driver Resources")
+
+ val queuedHeaders = driverHeader ++ submissionHeader
+ val driverHeaders = driverHeader ++ historyHeader ++ submissionHeader ++
+ Seq("Start Date", "Mesos Slave ID", "State")
+ val retryHeaders = Seq("Driver ID", "Submit Date", "Description") ++
+ Seq("Last Failed Status", "Next Retry Time", "Attempt Count")
+ val queuedTable = UIUtils.listingTable(queuedHeaders, queuedRow, state.queuedDrivers)
+ val launchedTable = UIUtils.listingTable(driverHeaders, driverRow, state.launchedDrivers)
+ val finishedTable = UIUtils.listingTable(driverHeaders, driverRow, state.finishedDrivers)
+ val retryTable = UIUtils.listingTable(retryHeaders, retryRow, state.pendingRetryDrivers)
+ val content =
+ <p>Mesos Framework ID: {state.frameworkId}</p>
+ <div class="row-fluid">
+ <div class="span12">
+ <h4>Queued Drivers:</h4>
+ {queuedTable}
+ <h4>Launched Drivers:</h4>
+ {launchedTable}
+ <h4>Finished Drivers:</h4>
+ {finishedTable}
+ <h4>Supervise drivers waiting for retry:</h4>
+ {retryTable}
+ </div>
+ </div>;
+ UIUtils.basicSparkPage(content, "Spark Drivers for Mesos cluster")
+ }
+
+ private def queuedRow(submission: MesosDriverDescription): Seq[Node] = {
+ val id = submission.submissionId
+ <tr>
+ <td><a href={s"driver?id=$id"}>{id}</a></td>
+ <td>{submission.submissionDate}</td>
+ <td>{submission.command.mainClass}</td>
+ <td>cpus: {submission.cores}, mem: {submission.mem}</td>
+ </tr>
+ }
+
+ private def driverRow(state: MesosClusterSubmissionState): Seq[Node] = {
+ val id = state.driverDescription.submissionId
+
+ val historyCol = if (historyServerURL.isDefined) {
+ <td>
+ <a href={s"${historyServerURL.get}/history/${state.frameworkId}"}>
+ {state.frameworkId}
+ </a>
+ </td>
+ } else Nil
+
+ <tr>
+ <td><a href={s"driver?id=$id"}>{id}</a></td>
+ {historyCol}
+ <td>{state.driverDescription.submissionDate}</td>
+ <td>{state.driverDescription.command.mainClass}</td>
+ <td>cpus: {state.driverDescription.cores}, mem: {state.driverDescription.mem}</td>
+ <td>{state.startDate}</td>
+ <td>{state.slaveId.getValue}</td>
+ <td>{stateString(state.mesosTaskStatus)}</td>
+ </tr>
+ }
+
+ private def retryRow(submission: MesosDriverDescription): Seq[Node] = {
+ val id = submission.submissionId
+ <tr>
+ <td><a href={s"driver?id=$id"}>{id}</a></td>
+ <td>{submission.submissionDate}</td>
+ <td>{submission.command.mainClass}</td>
+ <td>{submission.retryState.get.lastFailureStatus}</td>
+ <td>{submission.retryState.get.nextRetry}</td>
+ <td>{submission.retryState.get.retries}</td>
+ </tr>
+ }
+
+ private def stateString(status: Option[TaskStatus]): String = {
+ if (status.isEmpty) {
+ return ""
+ }
+ val sb = new StringBuilder
+ val s = status.get
+ sb.append(s"State: ${s.getState}")
+ if (status.get.hasMessage) {
+ sb.append(s", Message: ${s.getMessage}")
+ }
+ if (status.get.hasHealthy) {
+ sb.append(s", Healthy: ${s.getHealthy}")
+ }
+ if (status.get.hasSource) {
+ sb.append(s", Source: ${s.getSource}")
+ }
+ if (status.get.hasReason) {
+ sb.append(s", Reason: ${s.getReason}")
+ }
+ if (status.get.hasTimestamp) {
+ sb.append(s", Time: ${s.getTimestamp}")
+ }
+ sb.toString()
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterUI.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterUI.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterUI.scala
new file mode 100644
index 0000000..6049789
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterUI.scala
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.mesos.ui
+
+import org.apache.spark.{SecurityManager, SparkConf}
+import org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler
+import org.apache.spark.ui.{SparkUI, WebUI}
+import org.apache.spark.ui.JettyUtils._
+
+/**
+ * UI that displays driver results from the [[org.apache.spark.deploy.mesos.MesosClusterDispatcher]]
+ */
+private[spark] class MesosClusterUI(
+ securityManager: SecurityManager,
+ port: Int,
+ val conf: SparkConf,
+ dispatcherPublicAddress: String,
+ val scheduler: MesosClusterScheduler)
+ extends WebUI(securityManager, securityManager.getSSLOptions("mesos"), port, conf) {
+
+ initialize()
+
+ def activeWebUiUrl: String = "http://" + dispatcherPublicAddress + ":" + boundPort
+
+ override def initialize() {
+ attachPage(new MesosClusterPage(this))
+ attachPage(new DriverPage(this))
+ attachHandler(createStaticHandler(MesosClusterUI.STATIC_RESOURCE_DIR, "/static"))
+ }
+}
+
+private object MesosClusterUI {
+ val STATIC_RESOURCE_DIR = SparkUI.STATIC_RESOURCE_DIR
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala
new file mode 100644
index 0000000..ff60b88
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.rest.mesos
+
+import java.io.File
+import java.text.SimpleDateFormat
+import java.util.{Date, Locale}
+import java.util.concurrent.atomic.AtomicLong
+import javax.servlet.http.HttpServletResponse
+
+import org.apache.spark.{SPARK_VERSION => sparkVersion, SparkConf}
+import org.apache.spark.deploy.Command
+import org.apache.spark.deploy.mesos.MesosDriverDescription
+import org.apache.spark.deploy.rest._
+import org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler
+import org.apache.spark.util.Utils
+
+/**
+ * A server that responds to requests submitted by the [[RestSubmissionClient]].
+ * All requests are forwarded to
+ * [[org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler]].
+ * This is intended to be used in Mesos cluster mode only.
+ * For more details about the REST submission please refer to [[RestSubmissionServer]] javadocs.
+ */
+private[spark] class MesosRestServer(
+ host: String,
+ requestedPort: Int,
+ masterConf: SparkConf,
+ scheduler: MesosClusterScheduler)
+ extends RestSubmissionServer(host, requestedPort, masterConf) {
+
+ protected override val submitRequestServlet =
+ new MesosSubmitRequestServlet(scheduler, masterConf)
+ protected override val killRequestServlet =
+ new MesosKillRequestServlet(scheduler, masterConf)
+ protected override val statusRequestServlet =
+ new MesosStatusRequestServlet(scheduler, masterConf)
+}
+
+private[mesos] class MesosSubmitRequestServlet(
+ scheduler: MesosClusterScheduler,
+ conf: SparkConf)
+ extends SubmitRequestServlet {
+
+ private val DEFAULT_SUPERVISE = false
+ private val DEFAULT_MEMORY = Utils.DEFAULT_DRIVER_MEM_MB // mb
+ private val DEFAULT_CORES = 1.0
+
+ private val nextDriverNumber = new AtomicLong(0)
+ // For application IDs
+ private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US)
+ private def newDriverId(submitDate: Date): String =
+ f"driver-${createDateFormat.format(submitDate)}-${nextDriverNumber.incrementAndGet()}%04d"
+
+ /**
+ * Build a driver description from the fields specified in the submit request.
+ *
+ * This involves constructing a command that launches a mesos framework for the job.
+ * This does not currently consider fields used by python applications since python
+ * is not supported in mesos cluster mode yet.
+ */
+ private def buildDriverDescription(request: CreateSubmissionRequest): MesosDriverDescription = {
+ // Required fields, including the main class because python is not yet supported
+ val appResource = Option(request.appResource).getOrElse {
+ throw new SubmitRestMissingFieldException("Application jar is missing.")
+ }
+ val mainClass = Option(request.mainClass).getOrElse {
+ throw new SubmitRestMissingFieldException("Main class is missing.")
+ }
+
+ // Optional fields
+ val sparkProperties = request.sparkProperties
+ val driverExtraJavaOptions = sparkProperties.get("spark.driver.extraJavaOptions")
+ val driverExtraClassPath = sparkProperties.get("spark.driver.extraClassPath")
+ val driverExtraLibraryPath = sparkProperties.get("spark.driver.extraLibraryPath")
+ val superviseDriver = sparkProperties.get("spark.driver.supervise")
+ val driverMemory = sparkProperties.get("spark.driver.memory")
+ val driverCores = sparkProperties.get("spark.driver.cores")
+ val appArgs = request.appArgs
+ val environmentVariables = request.environmentVariables
+ val name = request.sparkProperties.getOrElse("spark.app.name", mainClass)
+
+ // Construct driver description
+ val conf = new SparkConf(false).setAll(sparkProperties)
+ val extraClassPath = driverExtraClassPath.toSeq.flatMap(_.split(File.pathSeparator))
+ val extraLibraryPath = driverExtraLibraryPath.toSeq.flatMap(_.split(File.pathSeparator))
+ val extraJavaOpts = driverExtraJavaOptions.map(Utils.splitCommandString).getOrElse(Seq.empty)
+ val sparkJavaOpts = Utils.sparkJavaOpts(conf)
+ val javaOpts = sparkJavaOpts ++ extraJavaOpts
+ val command = new Command(
+ mainClass, appArgs, environmentVariables, extraClassPath, extraLibraryPath, javaOpts)
+ val actualSuperviseDriver = superviseDriver.map(_.toBoolean).getOrElse(DEFAULT_SUPERVISE)
+ val actualDriverMemory = driverMemory.map(Utils.memoryStringToMb).getOrElse(DEFAULT_MEMORY)
+ val actualDriverCores = driverCores.map(_.toDouble).getOrElse(DEFAULT_CORES)
+ val submitDate = new Date()
+ val submissionId = newDriverId(submitDate)
+
+ new MesosDriverDescription(
+ name, appResource, actualDriverMemory, actualDriverCores, actualSuperviseDriver,
+ command, request.sparkProperties, submissionId, submitDate)
+ }
+
+ protected override def handleSubmit(
+ requestMessageJson: String,
+ requestMessage: SubmitRestProtocolMessage,
+ responseServlet: HttpServletResponse): SubmitRestProtocolResponse = {
+ requestMessage match {
+ case submitRequest: CreateSubmissionRequest =>
+ val driverDescription = buildDriverDescription(submitRequest)
+ val s = scheduler.submitDriver(driverDescription)
+ s.serverSparkVersion = sparkVersion
+ val unknownFields = findUnknownFields(requestMessageJson, requestMessage)
+ if (unknownFields.nonEmpty) {
+ // If there are fields that the server does not know about, warn the client
+ s.unknownFields = unknownFields
+ }
+ s
+ case unexpected =>
+ responseServlet.setStatus(HttpServletResponse.SC_BAD_REQUEST)
+ handleError(s"Received message of unexpected type ${unexpected.messageType}.")
+ }
+ }
+}
+
+private[mesos] class MesosKillRequestServlet(scheduler: MesosClusterScheduler, conf: SparkConf)
+ extends KillRequestServlet {
+ protected override def handleKill(submissionId: String): KillSubmissionResponse = {
+ val k = scheduler.killDriver(submissionId)
+ k.serverSparkVersion = sparkVersion
+ k
+ }
+}
+
+private[mesos] class MesosStatusRequestServlet(scheduler: MesosClusterScheduler, conf: SparkConf)
+ extends StatusRequestServlet {
+ protected override def handleStatus(submissionId: String): SubmissionStatusResponse = {
+ val d = scheduler.getDriverStatus(submissionId)
+ d.serverSparkVersion = sparkVersion
+ d
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala
new file mode 100644
index 0000000..ee9149c
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.executor
+
+import java.nio.ByteBuffer
+
+import scala.collection.JavaConverters._
+
+import org.apache.mesos.{Executor => MesosExecutor, ExecutorDriver, MesosExecutorDriver}
+import org.apache.mesos.Protos.{TaskStatus => MesosTaskStatus, _}
+import org.apache.mesos.protobuf.ByteString
+
+import org.apache.spark.{SparkConf, SparkEnv, TaskState}
+import org.apache.spark.TaskState
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.internal.Logging
+import org.apache.spark.scheduler.cluster.mesos.{MesosSchedulerUtils, MesosTaskLaunchData}
+import org.apache.spark.util.Utils
+
+private[spark] class MesosExecutorBackend
+ extends MesosExecutor
+ with MesosSchedulerUtils // TODO: fix
+ with ExecutorBackend
+ with Logging {
+
+ var executor: Executor = null
+ var driver: ExecutorDriver = null
+
+ override def statusUpdate(taskId: Long, state: TaskState.TaskState, data: ByteBuffer) {
+ val mesosTaskId = TaskID.newBuilder().setValue(taskId.toString).build()
+ driver.sendStatusUpdate(MesosTaskStatus.newBuilder()
+ .setTaskId(mesosTaskId)
+ .setState(taskStateToMesos(state))
+ .setData(ByteString.copyFrom(data))
+ .build())
+ }
+
+ override def registered(
+ driver: ExecutorDriver,
+ executorInfo: ExecutorInfo,
+ frameworkInfo: FrameworkInfo,
+ slaveInfo: SlaveInfo) {
+
+ // Get num cores for this task from ExecutorInfo, created in MesosSchedulerBackend.
+ val cpusPerTask = executorInfo.getResourcesList.asScala
+ .find(_.getName == "cpus")
+ .map(_.getScalar.getValue.toInt)
+ .getOrElse(0)
+ val executorId = executorInfo.getExecutorId.getValue
+
+ logInfo(s"Registered with Mesos as executor ID $executorId with $cpusPerTask cpus")
+ this.driver = driver
+ // Set a context class loader to be picked up by the serializer. Without this call
+ // the serializer would default to the null class loader, and fail to find Spark classes
+ // See SPARK-10986.
+ Thread.currentThread().setContextClassLoader(this.getClass.getClassLoader)
+
+ val properties = Utils.deserialize[Array[(String, String)]](executorInfo.getData.toByteArray) ++
+ Seq[(String, String)](("spark.app.id", frameworkInfo.getId.getValue))
+ val conf = new SparkConf(loadDefaults = true).setAll(properties)
+ val port = conf.getInt("spark.executor.port", 0)
+ val env = SparkEnv.createExecutorEnv(
+ conf, executorId, slaveInfo.getHostname, port, cpusPerTask, None, isLocal = false)
+
+ executor = new Executor(
+ executorId,
+ slaveInfo.getHostname,
+ env)
+ }
+
+ override def launchTask(d: ExecutorDriver, taskInfo: TaskInfo) {
+ val taskId = taskInfo.getTaskId.getValue.toLong
+ val taskData = MesosTaskLaunchData.fromByteString(taskInfo.getData)
+ if (executor == null) {
+ logError("Received launchTask but executor was null")
+ } else {
+ SparkHadoopUtil.get.runAsSparkUser { () =>
+ executor.launchTask(this, taskId = taskId, attemptNumber = taskData.attemptNumber,
+ taskInfo.getName, taskData.serializedTask)
+ }
+ }
+ }
+
+ override def error(d: ExecutorDriver, message: String) {
+ logError("Error from Mesos: " + message)
+ }
+
+ override def killTask(d: ExecutorDriver, t: TaskID) {
+ if (executor == null) {
+ logError("Received KillTask but executor was null")
+ } else {
+ // TODO: Determine the 'interruptOnCancel' property set for the given job.
+ executor.killTask(t.getValue.toLong, interruptThread = false)
+ }
+ }
+
+ override def reregistered(d: ExecutorDriver, p2: SlaveInfo) {}
+
+ override def disconnected(d: ExecutorDriver) {}
+
+ override def frameworkMessage(d: ExecutorDriver, data: Array[Byte]) {}
+
+ override def shutdown(d: ExecutorDriver) {}
+}
+
+/**
+ * Entry point for Mesos executor.
+ */
+private[spark] object MesosExecutorBackend extends Logging {
+ def main(args: Array[String]) {
+ Utils.initDaemon(log)
+ // Create a new Executor and start it running
+ val runner = new MesosExecutorBackend()
+ new MesosExecutorDriver(runner).run()
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterManager.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterManager.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterManager.scala
new file mode 100644
index 0000000..ed29b34
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterManager.scala
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import org.apache.spark.{SparkContext, SparkException}
+import org.apache.spark.internal.config._
+import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl}
+
+/**
+ * Cluster Manager for creation of Yarn scheduler and backend
+ */
+private[spark] class MesosClusterManager extends ExternalClusterManager {
+ private val MESOS_REGEX = """mesos://(.*)""".r
+
+ override def canCreate(masterURL: String): Boolean = {
+ masterURL.startsWith("mesos")
+ }
+
+ override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = {
+ new TaskSchedulerImpl(sc)
+ }
+
+ override def createSchedulerBackend(sc: SparkContext,
+ masterURL: String,
+ scheduler: TaskScheduler): SchedulerBackend = {
+ require(!sc.conf.get(IO_ENCRYPTION_ENABLED),
+ "I/O encryption is currently not supported in Mesos.")
+
+ val mesosUrl = MESOS_REGEX.findFirstMatchIn(masterURL).get.group(1)
+ val coarse = sc.conf.getBoolean("spark.mesos.coarse", defaultValue = true)
+ if (coarse) {
+ new MesosCoarseGrainedSchedulerBackend(
+ scheduler.asInstanceOf[TaskSchedulerImpl],
+ sc,
+ mesosUrl,
+ sc.env.securityManager)
+ } else {
+ new MesosFineGrainedSchedulerBackend(
+ scheduler.asInstanceOf[TaskSchedulerImpl],
+ sc,
+ mesosUrl)
+ }
+ }
+
+ override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
+ scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend)
+ }
+}
+
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterPersistenceEngine.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterPersistenceEngine.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterPersistenceEngine.scala
new file mode 100644
index 0000000..61ab3e8
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterPersistenceEngine.scala
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import scala.collection.JavaConverters._
+
+import org.apache.curator.framework.CuratorFramework
+import org.apache.zookeeper.CreateMode
+import org.apache.zookeeper.KeeperException.NoNodeException
+
+import org.apache.spark.SparkConf
+import org.apache.spark.deploy.SparkCuratorUtil
+import org.apache.spark.internal.Logging
+import org.apache.spark.util.Utils
+
+/**
+ * Persistence engine factory that is responsible for creating new persistence engines
+ * to store Mesos cluster mode state.
+ */
+private[spark] abstract class MesosClusterPersistenceEngineFactory(conf: SparkConf) {
+ def createEngine(path: String): MesosClusterPersistenceEngine
+}
+
+/**
+ * Mesos cluster persistence engine is responsible for persisting Mesos cluster mode
+ * specific state, so that on failover all the state can be recovered and the scheduler
+ * can resume managing the drivers.
+ */
+private[spark] trait MesosClusterPersistenceEngine {
+ def persist(name: String, obj: Object): Unit
+ def expunge(name: String): Unit
+ def fetch[T](name: String): Option[T]
+ def fetchAll[T](): Iterable[T]
+}
+
+/**
+ * Zookeeper backed persistence engine factory.
+ * All Zk engines created from this factory shares the same Zookeeper client, so
+ * all of them reuses the same connection pool.
+ */
+private[spark] class ZookeeperMesosClusterPersistenceEngineFactory(conf: SparkConf)
+ extends MesosClusterPersistenceEngineFactory(conf) with Logging {
+
+ lazy val zk = SparkCuratorUtil.newClient(conf)
+
+ def createEngine(path: String): MesosClusterPersistenceEngine = {
+ new ZookeeperMesosClusterPersistenceEngine(path, zk, conf)
+ }
+}
+
+/**
+ * Black hole persistence engine factory that creates black hole
+ * persistence engines, which stores nothing.
+ */
+private[spark] class BlackHoleMesosClusterPersistenceEngineFactory
+ extends MesosClusterPersistenceEngineFactory(null) {
+ def createEngine(path: String): MesosClusterPersistenceEngine = {
+ new BlackHoleMesosClusterPersistenceEngine
+ }
+}
+
+/**
+ * Black hole persistence engine that stores nothing.
+ */
+private[spark] class BlackHoleMesosClusterPersistenceEngine extends MesosClusterPersistenceEngine {
+ override def persist(name: String, obj: Object): Unit = {}
+ override def fetch[T](name: String): Option[T] = None
+ override def expunge(name: String): Unit = {}
+ override def fetchAll[T](): Iterable[T] = Iterable.empty[T]
+}
+
+/**
+ * Zookeeper based Mesos cluster persistence engine, that stores cluster mode state
+ * into Zookeeper. Each engine object is operating under one folder in Zookeeper, but
+ * reuses a shared Zookeeper client.
+ */
+private[spark] class ZookeeperMesosClusterPersistenceEngine(
+ baseDir: String,
+ zk: CuratorFramework,
+ conf: SparkConf)
+ extends MesosClusterPersistenceEngine with Logging {
+ private val WORKING_DIR =
+ conf.get("spark.deploy.zookeeper.dir", "/spark_mesos_dispatcher") + "/" + baseDir
+
+ SparkCuratorUtil.mkdir(zk, WORKING_DIR)
+
+ def path(name: String): String = {
+ WORKING_DIR + "/" + name
+ }
+
+ override def expunge(name: String): Unit = {
+ zk.delete().forPath(path(name))
+ }
+
+ override def persist(name: String, obj: Object): Unit = {
+ val serialized = Utils.serialize(obj)
+ val zkPath = path(name)
+ zk.create().withMode(CreateMode.PERSISTENT).forPath(zkPath, serialized)
+ }
+
+ override def fetch[T](name: String): Option[T] = {
+ val zkPath = path(name)
+
+ try {
+ val fileData = zk.getData().forPath(zkPath)
+ Some(Utils.deserialize[T](fileData))
+ } catch {
+ case e: NoNodeException => None
+ case e: Exception =>
+ logWarning("Exception while reading persisted file, deleting", e)
+ zk.delete().forPath(zkPath)
+ None
+ }
+ }
+
+ override def fetchAll[T](): Iterable[T] = {
+ zk.getChildren.forPath(WORKING_DIR).asScala.flatMap(fetch[T])
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org
[14/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Posted by va...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala
new file mode 100644
index 0000000..f384290
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala
@@ -0,0 +1,740 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import java.io.File
+import java.util.{Collections, Date, List => JList}
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+
+import org.apache.mesos.{Scheduler, SchedulerDriver}
+import org.apache.mesos.Protos.{TaskState => MesosTaskState, _}
+import org.apache.mesos.Protos.Environment.Variable
+import org.apache.mesos.Protos.TaskStatus.Reason
+
+import org.apache.spark.{SecurityManager, SparkConf, SparkException, TaskState}
+import org.apache.spark.deploy.mesos.MesosDriverDescription
+import org.apache.spark.deploy.rest.{CreateSubmissionResponse, KillSubmissionResponse, SubmissionStatusResponse}
+import org.apache.spark.metrics.MetricsSystem
+import org.apache.spark.util.Utils
+
+/**
+ * Tracks the current state of a Mesos Task that runs a Spark driver.
+ * @param driverDescription Submitted driver description from
+ * [[org.apache.spark.deploy.rest.mesos.MesosRestServer]]
+ * @param taskId Mesos TaskID generated for the task
+ * @param slaveId Slave ID that the task is assigned to
+ * @param mesosTaskStatus The last known task status update.
+ * @param startDate The date the task was launched
+ * @param finishDate The date the task finished
+ * @param frameworkId Mesos framework ID the task registers with
+ */
+private[spark] class MesosClusterSubmissionState(
+ val driverDescription: MesosDriverDescription,
+ val taskId: TaskID,
+ val slaveId: SlaveID,
+ var mesosTaskStatus: Option[TaskStatus],
+ var startDate: Date,
+ var finishDate: Option[Date],
+ val frameworkId: String)
+ extends Serializable {
+
+ def copy(): MesosClusterSubmissionState = {
+ new MesosClusterSubmissionState(
+ driverDescription, taskId, slaveId, mesosTaskStatus, startDate, finishDate, frameworkId)
+ }
+}
+
+/**
+ * Tracks the retry state of a driver, which includes the next time it should be scheduled
+ * and necessary information to do exponential backoff.
+ * This class is not thread-safe, and we expect the caller to handle synchronizing state.
+ *
+ * @param lastFailureStatus Last Task status when it failed.
+ * @param retries Number of times it has been retried.
+ * @param nextRetry Time at which it should be retried next
+ * @param waitTime The amount of time driver is scheduled to wait until next retry.
+ */
+private[spark] class MesosClusterRetryState(
+ val lastFailureStatus: TaskStatus,
+ val retries: Int,
+ val nextRetry: Date,
+ val waitTime: Int) extends Serializable {
+ def copy(): MesosClusterRetryState =
+ new MesosClusterRetryState(lastFailureStatus, retries, nextRetry, waitTime)
+}
+
+/**
+ * The full state of the cluster scheduler, currently being used for displaying
+ * information on the UI.
+ *
+ * @param frameworkId Mesos Framework id for the cluster scheduler.
+ * @param masterUrl The Mesos master url
+ * @param queuedDrivers All drivers queued to be launched
+ * @param launchedDrivers All launched or running drivers
+ * @param finishedDrivers All terminated drivers
+ * @param pendingRetryDrivers All drivers pending to be retried
+ */
+private[spark] class MesosClusterSchedulerState(
+ val frameworkId: String,
+ val masterUrl: Option[String],
+ val queuedDrivers: Iterable[MesosDriverDescription],
+ val launchedDrivers: Iterable[MesosClusterSubmissionState],
+ val finishedDrivers: Iterable[MesosClusterSubmissionState],
+ val pendingRetryDrivers: Iterable[MesosDriverDescription])
+
+/**
+ * The full state of a Mesos driver, that is being used to display driver information on the UI.
+ */
+private[spark] class MesosDriverState(
+ val state: String,
+ val description: MesosDriverDescription,
+ val submissionState: Option[MesosClusterSubmissionState] = None)
+
+/**
+ * A Mesos scheduler that is responsible for launching submitted Spark drivers in cluster mode
+ * as Mesos tasks in a Mesos cluster.
+ * All drivers are launched asynchronously by the framework, which will eventually be launched
+ * by one of the slaves in the cluster. The results of the driver will be stored in slave's task
+ * sandbox which is accessible by visiting the Mesos UI.
+ * This scheduler supports recovery by persisting all its state and performs task reconciliation
+ * on recover, which gets all the latest state for all the drivers from Mesos master.
+ */
+private[spark] class MesosClusterScheduler(
+ engineFactory: MesosClusterPersistenceEngineFactory,
+ conf: SparkConf)
+ extends Scheduler with MesosSchedulerUtils {
+ var frameworkUrl: String = _
+ private val metricsSystem =
+ MetricsSystem.createMetricsSystem("mesos_cluster", conf, new SecurityManager(conf))
+ private val master = conf.get("spark.master")
+ private val appName = conf.get("spark.app.name")
+ private val queuedCapacity = conf.getInt("spark.mesos.maxDrivers", 200)
+ private val retainedDrivers = conf.getInt("spark.mesos.retainedDrivers", 200)
+ private val maxRetryWaitTime = conf.getInt("spark.mesos.cluster.retry.wait.max", 60) // 1 minute
+ private val useFetchCache = conf.getBoolean("spark.mesos.fetchCache.enable", false)
+ private val schedulerState = engineFactory.createEngine("scheduler")
+ private val stateLock = new Object()
+ private val finishedDrivers =
+ new mutable.ArrayBuffer[MesosClusterSubmissionState](retainedDrivers)
+ private var frameworkId: String = null
+ // Holds all the launched drivers and current launch state, keyed by driver id.
+ private val launchedDrivers = new mutable.HashMap[String, MesosClusterSubmissionState]()
+ // Holds a map of driver id to expected slave id that is passed to Mesos for reconciliation.
+ // All drivers that are loaded after failover are added here, as we need get the latest
+ // state of the tasks from Mesos.
+ private val pendingRecover = new mutable.HashMap[String, SlaveID]()
+ // Stores all the submitted drivers that hasn't been launched.
+ private val queuedDrivers = new ArrayBuffer[MesosDriverDescription]()
+ // All supervised drivers that are waiting to retry after termination.
+ private val pendingRetryDrivers = new ArrayBuffer[MesosDriverDescription]()
+ private val queuedDriversState = engineFactory.createEngine("driverQueue")
+ private val launchedDriversState = engineFactory.createEngine("launchedDrivers")
+ private val pendingRetryDriversState = engineFactory.createEngine("retryList")
+ // Flag to mark if the scheduler is ready to be called, which is until the scheduler
+ // is registered with Mesos master.
+ @volatile protected var ready = false
+ private var masterInfo: Option[MasterInfo] = None
+
+ def submitDriver(desc: MesosDriverDescription): CreateSubmissionResponse = {
+ val c = new CreateSubmissionResponse
+ if (!ready) {
+ c.success = false
+ c.message = "Scheduler is not ready to take requests"
+ return c
+ }
+
+ stateLock.synchronized {
+ if (isQueueFull()) {
+ c.success = false
+ c.message = "Already reached maximum submission size"
+ return c
+ }
+ c.submissionId = desc.submissionId
+ queuedDriversState.persist(desc.submissionId, desc)
+ queuedDrivers += desc
+ c.success = true
+ }
+ c
+ }
+
+ def killDriver(submissionId: String): KillSubmissionResponse = {
+ val k = new KillSubmissionResponse
+ if (!ready) {
+ k.success = false
+ k.message = "Scheduler is not ready to take requests"
+ return k
+ }
+ k.submissionId = submissionId
+ stateLock.synchronized {
+ // We look for the requested driver in the following places:
+ // 1. Check if submission is running or launched.
+ // 2. Check if it's still queued.
+ // 3. Check if it's in the retry list.
+ // 4. Check if it has already completed.
+ if (launchedDrivers.contains(submissionId)) {
+ val task = launchedDrivers(submissionId)
+ mesosDriver.killTask(task.taskId)
+ k.success = true
+ k.message = "Killing running driver"
+ } else if (removeFromQueuedDrivers(submissionId)) {
+ k.success = true
+ k.message = "Removed driver while it's still pending"
+ } else if (removeFromPendingRetryDrivers(submissionId)) {
+ k.success = true
+ k.message = "Removed driver while it's being retried"
+ } else if (finishedDrivers.exists(_.driverDescription.submissionId.equals(submissionId))) {
+ k.success = false
+ k.message = "Driver already terminated"
+ } else {
+ k.success = false
+ k.message = "Cannot find driver"
+ }
+ }
+ k
+ }
+
+ def getDriverStatus(submissionId: String): SubmissionStatusResponse = {
+ val s = new SubmissionStatusResponse
+ if (!ready) {
+ s.success = false
+ s.message = "Scheduler is not ready to take requests"
+ return s
+ }
+ s.submissionId = submissionId
+ stateLock.synchronized {
+ if (queuedDrivers.exists(_.submissionId.equals(submissionId))) {
+ s.success = true
+ s.driverState = "QUEUED"
+ } else if (launchedDrivers.contains(submissionId)) {
+ s.success = true
+ s.driverState = "RUNNING"
+ launchedDrivers(submissionId).mesosTaskStatus.foreach(state => s.message = state.toString)
+ } else if (finishedDrivers.exists(_.driverDescription.submissionId.equals(submissionId))) {
+ s.success = true
+ s.driverState = "FINISHED"
+ finishedDrivers
+ .find(d => d.driverDescription.submissionId.equals(submissionId)).get.mesosTaskStatus
+ .foreach(state => s.message = state.toString)
+ } else if (pendingRetryDrivers.exists(_.submissionId.equals(submissionId))) {
+ val status = pendingRetryDrivers.find(_.submissionId.equals(submissionId))
+ .get.retryState.get.lastFailureStatus
+ s.success = true
+ s.driverState = "RETRYING"
+ s.message = status.toString
+ } else {
+ s.success = false
+ s.driverState = "NOT_FOUND"
+ }
+ }
+ s
+ }
+
+ /**
+ * Gets the driver state to be displayed on the Web UI.
+ */
+ def getDriverState(submissionId: String): Option[MesosDriverState] = {
+ stateLock.synchronized {
+ queuedDrivers.find(_.submissionId.equals(submissionId))
+ .map(d => new MesosDriverState("QUEUED", d))
+ .orElse(launchedDrivers.get(submissionId)
+ .map(d => new MesosDriverState("RUNNING", d.driverDescription, Some(d))))
+ .orElse(finishedDrivers.find(_.driverDescription.submissionId.equals(submissionId))
+ .map(d => new MesosDriverState("FINISHED", d.driverDescription, Some(d))))
+ .orElse(pendingRetryDrivers.find(_.submissionId.equals(submissionId))
+ .map(d => new MesosDriverState("RETRYING", d)))
+ }
+ }
+
+ private def isQueueFull(): Boolean = launchedDrivers.size >= queuedCapacity
+
+ /**
+ * Recover scheduler state that is persisted.
+ * We still need to do task reconciliation to be up to date of the latest task states
+ * as it might have changed while the scheduler is failing over.
+ */
+ private def recoverState(): Unit = {
+ stateLock.synchronized {
+ launchedDriversState.fetchAll[MesosClusterSubmissionState]().foreach { state =>
+ launchedDrivers(state.taskId.getValue) = state
+ pendingRecover(state.taskId.getValue) = state.slaveId
+ }
+ queuedDriversState.fetchAll[MesosDriverDescription]().foreach(d => queuedDrivers += d)
+ // There is potential timing issue where a queued driver might have been launched
+ // but the scheduler shuts down before the queued driver was able to be removed
+ // from the queue. We try to mitigate this issue by walking through all queued drivers
+ // and remove if they're already launched.
+ queuedDrivers
+ .filter(d => launchedDrivers.contains(d.submissionId))
+ .foreach(d => removeFromQueuedDrivers(d.submissionId))
+ pendingRetryDriversState.fetchAll[MesosDriverDescription]()
+ .foreach(s => pendingRetryDrivers += s)
+ // TODO: Consider storing finished drivers so we can show them on the UI after
+ // failover. For now we clear the history on each recovery.
+ finishedDrivers.clear()
+ }
+ }
+
+ /**
+ * Starts the cluster scheduler and wait until the scheduler is registered.
+ * This also marks the scheduler to be ready for requests.
+ */
+ def start(): Unit = {
+ // TODO: Implement leader election to make sure only one framework running in the cluster.
+ val fwId = schedulerState.fetch[String]("frameworkId")
+ fwId.foreach { id =>
+ frameworkId = id
+ }
+ recoverState()
+ metricsSystem.registerSource(new MesosClusterSchedulerSource(this))
+ metricsSystem.start()
+ val driver = createSchedulerDriver(
+ master,
+ MesosClusterScheduler.this,
+ Utils.getCurrentUserName(),
+ appName,
+ conf,
+ Some(frameworkUrl),
+ Some(true),
+ Some(Integer.MAX_VALUE),
+ fwId)
+
+ startScheduler(driver)
+ ready = true
+ }
+
+ def stop(): Unit = {
+ ready = false
+ metricsSystem.report()
+ metricsSystem.stop()
+ mesosDriver.stop(true)
+ }
+
+ override def registered(
+ driver: SchedulerDriver,
+ newFrameworkId: FrameworkID,
+ masterInfo: MasterInfo): Unit = {
+ logInfo("Registered as framework ID " + newFrameworkId.getValue)
+ if (newFrameworkId.getValue != frameworkId) {
+ frameworkId = newFrameworkId.getValue
+ schedulerState.persist("frameworkId", frameworkId)
+ }
+ markRegistered()
+
+ stateLock.synchronized {
+ this.masterInfo = Some(masterInfo)
+ if (!pendingRecover.isEmpty) {
+ // Start task reconciliation if we need to recover.
+ val statuses = pendingRecover.collect {
+ case (taskId, slaveId) =>
+ val newStatus = TaskStatus.newBuilder()
+ .setTaskId(TaskID.newBuilder().setValue(taskId).build())
+ .setSlaveId(slaveId)
+ .setState(MesosTaskState.TASK_STAGING)
+ .build()
+ launchedDrivers.get(taskId).map(_.mesosTaskStatus.getOrElse(newStatus))
+ .getOrElse(newStatus)
+ }
+ // TODO: Page the status updates to avoid trying to reconcile
+ // a large amount of tasks at once.
+ driver.reconcileTasks(statuses.toSeq.asJava)
+ }
+ }
+ }
+
+ private def getDriverExecutorURI(desc: MesosDriverDescription): Option[String] = {
+ desc.conf.getOption("spark.executor.uri")
+ .orElse(desc.command.environment.get("SPARK_EXECUTOR_URI"))
+ }
+
+ private def getDriverFrameworkID(desc: MesosDriverDescription): String = {
+ s"${frameworkId}-${desc.submissionId}"
+ }
+
+ private def adjust[A, B](m: collection.Map[A, B], k: A, default: B)(f: B => B) = {
+ m.updated(k, f(m.getOrElse(k, default)))
+ }
+
+ private def getDriverEnvironment(desc: MesosDriverDescription): Environment = {
+ // TODO(mgummelt): Don't do this here. This should be passed as a --conf
+ val commandEnv = adjust(desc.command.environment, "SPARK_SUBMIT_OPTS", "")(
+ v => s"$v -Dspark.mesos.driver.frameworkId=${getDriverFrameworkID(desc)}"
+ )
+
+ val env = desc.conf.getAllWithPrefix("spark.mesos.driverEnv.") ++ commandEnv
+
+ val envBuilder = Environment.newBuilder()
+ env.foreach { case (k, v) =>
+ envBuilder.addVariables(Variable.newBuilder().setName(k).setValue(v))
+ }
+ envBuilder.build()
+ }
+
+ private def getDriverUris(desc: MesosDriverDescription): List[CommandInfo.URI] = {
+ val confUris = List(conf.getOption("spark.mesos.uris"),
+ desc.conf.getOption("spark.mesos.uris"),
+ desc.conf.getOption("spark.submit.pyFiles")).flatMap(
+ _.map(_.split(",").map(_.trim))
+ ).flatten
+
+ val jarUrl = desc.jarUrl.stripPrefix("file:").stripPrefix("local:")
+
+ ((jarUrl :: confUris) ++ getDriverExecutorURI(desc).toList).map(uri =>
+ CommandInfo.URI.newBuilder().setValue(uri.trim()).setCache(useFetchCache).build())
+ }
+
+ private def getDriverCommandValue(desc: MesosDriverDescription): String = {
+ val dockerDefined = desc.conf.contains("spark.mesos.executor.docker.image")
+ val executorUri = getDriverExecutorURI(desc)
+ // Gets the path to run spark-submit, and the path to the Mesos sandbox.
+ val (executable, sandboxPath) = if (dockerDefined) {
+ // Application jar is automatically downloaded in the mounted sandbox by Mesos,
+ // and the path to the mounted volume is stored in $MESOS_SANDBOX env variable.
+ ("./bin/spark-submit", "$MESOS_SANDBOX")
+ } else if (executorUri.isDefined) {
+ val folderBasename = executorUri.get.split('/').last.split('.').head
+
+ val entries = conf.getOption("spark.executor.extraLibraryPath")
+ .map(path => Seq(path) ++ desc.command.libraryPathEntries)
+ .getOrElse(desc.command.libraryPathEntries)
+
+ val prefixEnv = if (!entries.isEmpty) Utils.libraryPathEnvPrefix(entries) else ""
+
+ val cmdExecutable = s"cd $folderBasename*; $prefixEnv bin/spark-submit"
+ // Sandbox path points to the parent folder as we chdir into the folderBasename.
+ (cmdExecutable, "..")
+ } else {
+ val executorSparkHome = desc.conf.getOption("spark.mesos.executor.home")
+ .orElse(conf.getOption("spark.home"))
+ .orElse(Option(System.getenv("SPARK_HOME")))
+ .getOrElse {
+ throw new SparkException("Executor Spark home `spark.mesos.executor.home` is not set!")
+ }
+ val cmdExecutable = new File(executorSparkHome, "./bin/spark-submit").getPath
+ // Sandbox points to the current directory by default with Mesos.
+ (cmdExecutable, ".")
+ }
+ val cmdOptions = generateCmdOption(desc, sandboxPath).mkString(" ")
+ val primaryResource = new File(sandboxPath, desc.jarUrl.split("/").last).toString()
+ val appArguments = desc.command.arguments.mkString(" ")
+
+ s"$executable $cmdOptions $primaryResource $appArguments"
+ }
+
+ private def buildDriverCommand(desc: MesosDriverDescription): CommandInfo = {
+ val builder = CommandInfo.newBuilder()
+ builder.setValue(getDriverCommandValue(desc))
+ builder.setEnvironment(getDriverEnvironment(desc))
+ builder.addAllUris(getDriverUris(desc).asJava)
+ builder.build()
+ }
+
+ private def generateCmdOption(desc: MesosDriverDescription, sandboxPath: String): Seq[String] = {
+ var options = Seq(
+ "--name", desc.conf.get("spark.app.name"),
+ "--master", s"mesos://${conf.get("spark.master")}",
+ "--driver-cores", desc.cores.toString,
+ "--driver-memory", s"${desc.mem}M")
+
+ // Assume empty main class means we're running python
+ if (!desc.command.mainClass.equals("")) {
+ options ++= Seq("--class", desc.command.mainClass)
+ }
+
+ desc.conf.getOption("spark.executor.memory").foreach { v =>
+ options ++= Seq("--executor-memory", v)
+ }
+ desc.conf.getOption("spark.cores.max").foreach { v =>
+ options ++= Seq("--total-executor-cores", v)
+ }
+ desc.conf.getOption("spark.submit.pyFiles").foreach { pyFiles =>
+ val formattedFiles = pyFiles.split(",")
+ .map { path => new File(sandboxPath, path.split("/").last).toString() }
+ .mkString(",")
+ options ++= Seq("--py-files", formattedFiles)
+ }
+
+ // --conf
+ val replicatedOptionsBlacklist = Set(
+ "spark.jars", // Avoids duplicate classes in classpath
+ "spark.submit.deployMode", // this would be set to `cluster`, but we need client
+ "spark.master" // this contains the address of the dispatcher, not master
+ )
+ val defaultConf = conf.getAllWithPrefix("spark.mesos.dispatcher.driverDefault.").toMap
+ val driverConf = desc.conf.getAll
+ .filter { case (key, _) => !replicatedOptionsBlacklist.contains(key) }
+ .toMap
+ (defaultConf ++ driverConf).foreach { case (key, value) =>
+ options ++= Seq("--conf", s""""$key=${shellEscape(value)}"""".stripMargin) }
+
+ options
+ }
+
+ /**
+ * Escape args for Unix-like shells, unless already quoted by the user.
+ * Based on: http://www.gnu.org/software/bash/manual/html_node/Double-Quotes.html
+ * and http://www.grymoire.com/Unix/Quote.html
+ *
+ * @param value argument
+ * @return escaped argument
+ */
+ private[scheduler] def shellEscape(value: String): String = {
+ val WrappedInQuotes = """^(".+"|'.+')$""".r
+ val ShellSpecialChars = (""".*([ '<>&|\?\*;!#\\(\)"$`]).*""").r
+ value match {
+ case WrappedInQuotes(c) => value // The user quoted his args, don't touch it!
+ case ShellSpecialChars(c) => "\"" + value.replaceAll("""(["`\$\\])""", """\\$1""") + "\""
+ case _: String => value // Don't touch harmless strings
+ }
+ }
+
+ private class ResourceOffer(
+ val offerId: OfferID,
+ val slaveId: SlaveID,
+ var resources: JList[Resource]) {
+ override def toString(): String = {
+ s"Offer id: ${offerId}, resources: ${resources}"
+ }
+ }
+
+ private def createTaskInfo(desc: MesosDriverDescription, offer: ResourceOffer): TaskInfo = {
+ val taskId = TaskID.newBuilder().setValue(desc.submissionId).build()
+
+ val (remainingResources, cpuResourcesToUse) =
+ partitionResources(offer.resources, "cpus", desc.cores)
+ val (finalResources, memResourcesToUse) =
+ partitionResources(remainingResources.asJava, "mem", desc.mem)
+ offer.resources = finalResources.asJava
+
+ val appName = desc.conf.get("spark.app.name")
+ val taskInfo = TaskInfo.newBuilder()
+ .setTaskId(taskId)
+ .setName(s"Driver for ${appName}")
+ .setSlaveId(offer.slaveId)
+ .setCommand(buildDriverCommand(desc))
+ .addAllResources(cpuResourcesToUse.asJava)
+ .addAllResources(memResourcesToUse.asJava)
+ taskInfo.setContainer(MesosSchedulerBackendUtil.containerInfo(desc.conf))
+ taskInfo.build
+ }
+
+ /**
+ * This method takes all the possible candidates and attempt to schedule them with Mesos offers.
+ * Every time a new task is scheduled, the afterLaunchCallback is called to perform post scheduled
+ * logic on each task.
+ */
+ private def scheduleTasks(
+ candidates: Seq[MesosDriverDescription],
+ afterLaunchCallback: (String) => Boolean,
+ currentOffers: List[ResourceOffer],
+ tasks: mutable.HashMap[OfferID, ArrayBuffer[TaskInfo]]): Unit = {
+ for (submission <- candidates) {
+ val driverCpu = submission.cores
+ val driverMem = submission.mem
+ logTrace(s"Finding offer to launch driver with cpu: $driverCpu, mem: $driverMem")
+ val offerOption = currentOffers.find { o =>
+ getResource(o.resources, "cpus") >= driverCpu &&
+ getResource(o.resources, "mem") >= driverMem
+ }
+ if (offerOption.isEmpty) {
+ logDebug(s"Unable to find offer to launch driver id: ${submission.submissionId}, " +
+ s"cpu: $driverCpu, mem: $driverMem")
+ } else {
+ val offer = offerOption.get
+ val queuedTasks = tasks.getOrElseUpdate(offer.offerId, new ArrayBuffer[TaskInfo])
+ val task = createTaskInfo(submission, offer)
+ queuedTasks += task
+ logTrace(s"Using offer ${offer.offerId.getValue} to launch driver " +
+ submission.submissionId)
+ val newState = new MesosClusterSubmissionState(submission, task.getTaskId, offer.slaveId,
+ None, new Date(), None, getDriverFrameworkID(submission))
+ launchedDrivers(submission.submissionId) = newState
+ launchedDriversState.persist(submission.submissionId, newState)
+ afterLaunchCallback(submission.submissionId)
+ }
+ }
+ }
+
+ override def resourceOffers(driver: SchedulerDriver, offers: JList[Offer]): Unit = {
+ logTrace(s"Received offers from Mesos: \n${offers.asScala.mkString("\n")}")
+ val tasks = new mutable.HashMap[OfferID, ArrayBuffer[TaskInfo]]()
+ val currentTime = new Date()
+
+ val currentOffers = offers.asScala.map {
+ o => new ResourceOffer(o.getId, o.getSlaveId, o.getResourcesList)
+ }.toList
+
+ stateLock.synchronized {
+ // We first schedule all the supervised drivers that are ready to retry.
+ // This list will be empty if none of the drivers are marked as supervise.
+ val driversToRetry = pendingRetryDrivers.filter { d =>
+ d.retryState.get.nextRetry.before(currentTime)
+ }
+
+ scheduleTasks(
+ copyBuffer(driversToRetry),
+ removeFromPendingRetryDrivers,
+ currentOffers,
+ tasks)
+
+ // Then we walk through the queued drivers and try to schedule them.
+ scheduleTasks(
+ copyBuffer(queuedDrivers),
+ removeFromQueuedDrivers,
+ currentOffers,
+ tasks)
+ }
+ tasks.foreach { case (offerId, taskInfos) =>
+ driver.launchTasks(Collections.singleton(offerId), taskInfos.asJava)
+ }
+
+ for (o <- currentOffers if !tasks.contains(o.offerId)) {
+ driver.declineOffer(o.offerId)
+ }
+ }
+
+ private def copyBuffer(
+ buffer: ArrayBuffer[MesosDriverDescription]): ArrayBuffer[MesosDriverDescription] = {
+ val newBuffer = new ArrayBuffer[MesosDriverDescription](buffer.size)
+ buffer.copyToBuffer(newBuffer)
+ newBuffer
+ }
+
+ def getSchedulerState(): MesosClusterSchedulerState = {
+ stateLock.synchronized {
+ new MesosClusterSchedulerState(
+ frameworkId,
+ masterInfo.map(m => s"http://${m.getIp}:${m.getPort}"),
+ copyBuffer(queuedDrivers),
+ launchedDrivers.values.map(_.copy()).toList,
+ finishedDrivers.map(_.copy()).toList,
+ copyBuffer(pendingRetryDrivers))
+ }
+ }
+
+ override def offerRescinded(driver: SchedulerDriver, offerId: OfferID): Unit = {}
+ override def disconnected(driver: SchedulerDriver): Unit = {}
+ override def reregistered(driver: SchedulerDriver, masterInfo: MasterInfo): Unit = {
+ logInfo(s"Framework re-registered with master ${masterInfo.getId}")
+ }
+ override def slaveLost(driver: SchedulerDriver, slaveId: SlaveID): Unit = {}
+ override def error(driver: SchedulerDriver, error: String): Unit = {
+ logError("Error received: " + error)
+ markErr()
+ }
+
+ /**
+ * Check if the task state is a recoverable state that we can relaunch the task.
+ * Task state like TASK_ERROR are not relaunchable state since it wasn't able
+ * to be validated by Mesos.
+ */
+ private def shouldRelaunch(state: MesosTaskState): Boolean = {
+ state == MesosTaskState.TASK_FAILED ||
+ state == MesosTaskState.TASK_KILLED ||
+ state == MesosTaskState.TASK_LOST
+ }
+
+ override def statusUpdate(driver: SchedulerDriver, status: TaskStatus): Unit = {
+ val taskId = status.getTaskId.getValue
+ stateLock.synchronized {
+ if (launchedDrivers.contains(taskId)) {
+ if (status.getReason == Reason.REASON_RECONCILIATION &&
+ !pendingRecover.contains(taskId)) {
+ // Task has already received update and no longer requires reconciliation.
+ return
+ }
+ val state = launchedDrivers(taskId)
+ // Check if the driver is supervise enabled and can be relaunched.
+ if (state.driverDescription.supervise && shouldRelaunch(status.getState)) {
+ removeFromLaunchedDrivers(taskId)
+ state.finishDate = Some(new Date())
+ val retryState: Option[MesosClusterRetryState] = state.driverDescription.retryState
+ val (retries, waitTimeSec) = retryState
+ .map { rs => (rs.retries + 1, Math.min(maxRetryWaitTime, rs.waitTime * 2)) }
+ .getOrElse{ (1, 1) }
+ val nextRetry = new Date(new Date().getTime + waitTimeSec * 1000L)
+
+ val newDriverDescription = state.driverDescription.copy(
+ retryState = Some(new MesosClusterRetryState(status, retries, nextRetry, waitTimeSec)))
+ pendingRetryDrivers += newDriverDescription
+ pendingRetryDriversState.persist(taskId, newDriverDescription)
+ } else if (TaskState.isFinished(mesosToTaskState(status.getState))) {
+ removeFromLaunchedDrivers(taskId)
+ state.finishDate = Some(new Date())
+ if (finishedDrivers.size >= retainedDrivers) {
+ val toRemove = math.max(retainedDrivers / 10, 1)
+ finishedDrivers.trimStart(toRemove)
+ }
+ finishedDrivers += state
+ }
+ state.mesosTaskStatus = Option(status)
+ } else {
+ logError(s"Unable to find driver $taskId in status update")
+ }
+ }
+ }
+
+ override def frameworkMessage(
+ driver: SchedulerDriver,
+ executorId: ExecutorID,
+ slaveId: SlaveID,
+ message: Array[Byte]): Unit = {}
+
+ override def executorLost(
+ driver: SchedulerDriver,
+ executorId: ExecutorID,
+ slaveId: SlaveID,
+ status: Int): Unit = {}
+
+ private def removeFromQueuedDrivers(id: String): Boolean = {
+ val index = queuedDrivers.indexWhere(_.submissionId.equals(id))
+ if (index != -1) {
+ queuedDrivers.remove(index)
+ queuedDriversState.expunge(id)
+ true
+ } else {
+ false
+ }
+ }
+
+ private def removeFromLaunchedDrivers(id: String): Boolean = {
+ if (launchedDrivers.remove(id).isDefined) {
+ launchedDriversState.expunge(id)
+ true
+ } else {
+ false
+ }
+ }
+
+ private def removeFromPendingRetryDrivers(id: String): Boolean = {
+ val index = pendingRetryDrivers.indexWhere(_.submissionId.equals(id))
+ if (index != -1) {
+ pendingRetryDrivers.remove(index)
+ pendingRetryDriversState.expunge(id)
+ true
+ } else {
+ false
+ }
+ }
+
+ def getQueuedDriversSize: Int = queuedDrivers.size
+ def getLaunchedDriversSize: Int = launchedDrivers.size
+ def getPendingRetryDriversSize: Int = pendingRetryDrivers.size
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSource.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSource.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSource.scala
new file mode 100644
index 0000000..1fe9497
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSource.scala
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import com.codahale.metrics.{Gauge, MetricRegistry}
+
+import org.apache.spark.metrics.source.Source
+
+private[mesos] class MesosClusterSchedulerSource(scheduler: MesosClusterScheduler)
+ extends Source {
+ override def sourceName: String = "mesos_cluster"
+ override def metricRegistry: MetricRegistry = new MetricRegistry()
+
+ metricRegistry.register(MetricRegistry.name("waitingDrivers"), new Gauge[Int] {
+ override def getValue: Int = scheduler.getQueuedDriversSize
+ })
+
+ metricRegistry.register(MetricRegistry.name("launchedDrivers"), new Gauge[Int] {
+ override def getValue: Int = scheduler.getLaunchedDriversSize
+ })
+
+ metricRegistry.register(MetricRegistry.name("retryDrivers"), new Gauge[Int] {
+ override def getValue: Int = scheduler.getPendingRetryDriversSize
+ })
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala
new file mode 100644
index 0000000..3258b09
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala
@@ -0,0 +1,668 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import java.io.File
+import java.util.{Collections, List => JList}
+import java.util.concurrent.locks.ReentrantLock
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+import scala.concurrent.Future
+
+import org.apache.mesos.Protos.{TaskInfo => MesosTaskInfo, _}
+
+import org.apache.spark.{SecurityManager, SparkContext, SparkException, TaskState}
+import org.apache.spark.network.netty.SparkTransportConf
+import org.apache.spark.network.shuffle.mesos.MesosExternalShuffleClient
+import org.apache.spark.rpc.RpcEndpointAddress
+import org.apache.spark.scheduler.{SlaveLost, TaskSchedulerImpl}
+import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend
+import org.apache.spark.util.Utils
+
+/**
+ * A SchedulerBackend that runs tasks on Mesos, but uses "coarse-grained" tasks, where it holds
+ * onto each Mesos node for the duration of the Spark job instead of relinquishing cores whenever
+ * a task is done. It launches Spark tasks within the coarse-grained Mesos tasks using the
+ * CoarseGrainedSchedulerBackend mechanism. This class is useful for lower and more predictable
+ * latency.
+ *
+ * Unfortunately this has a bit of duplication from [[MesosFineGrainedSchedulerBackend]],
+ * but it seems hard to remove this.
+ */
+private[spark] class MesosCoarseGrainedSchedulerBackend(
+ scheduler: TaskSchedulerImpl,
+ sc: SparkContext,
+ master: String,
+ securityManager: SecurityManager)
+ extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv)
+ with org.apache.mesos.Scheduler
+ with MesosSchedulerUtils {
+
+ val MAX_SLAVE_FAILURES = 2 // Blacklist a slave after this many failures
+
+ // Maximum number of cores to acquire (TODO: we'll need more flexible controls here)
+ val maxCores = conf.get("spark.cores.max", Int.MaxValue.toString).toInt
+
+ val useFetcherCache = conf.getBoolean("spark.mesos.fetcherCache.enable", false)
+
+ val maxGpus = conf.getInt("spark.mesos.gpus.max", 0)
+
+ private[this] val shutdownTimeoutMS =
+ conf.getTimeAsMs("spark.mesos.coarse.shutdownTimeout", "10s")
+ .ensuring(_ >= 0, "spark.mesos.coarse.shutdownTimeout must be >= 0")
+
+ // Synchronization protected by stateLock
+ private[this] var stopCalled: Boolean = false
+
+ // If shuffle service is enabled, the Spark driver will register with the shuffle service.
+ // This is for cleaning up shuffle files reliably.
+ private val shuffleServiceEnabled = conf.getBoolean("spark.shuffle.service.enabled", false)
+
+ // Cores we have acquired with each Mesos task ID
+ val coresByTaskId = new mutable.HashMap[String, Int]
+ val gpusByTaskId = new mutable.HashMap[String, Int]
+ var totalCoresAcquired = 0
+ var totalGpusAcquired = 0
+
+ // SlaveID -> Slave
+ // This map accumulates entries for the duration of the job. Slaves are never deleted, because
+ // we need to maintain e.g. failure state and connection state.
+ private val slaves = new mutable.HashMap[String, Slave]
+
+ /**
+ * The total number of executors we aim to have. Undefined when not using dynamic allocation.
+ * Initially set to 0 when using dynamic allocation, the executor allocation manager will send
+ * the real initial limit later.
+ */
+ private var executorLimitOption: Option[Int] = {
+ if (Utils.isDynamicAllocationEnabled(conf)) {
+ Some(0)
+ } else {
+ None
+ }
+ }
+
+ /**
+ * Return the current executor limit, which may be [[Int.MaxValue]]
+ * before properly initialized.
+ */
+ private[mesos] def executorLimit: Int = executorLimitOption.getOrElse(Int.MaxValue)
+
+ // private lock object protecting mutable state above. Using the intrinsic lock
+ // may lead to deadlocks since the superclass might also try to lock
+ private val stateLock = new ReentrantLock
+
+ val extraCoresPerExecutor = conf.getInt("spark.mesos.extra.cores", 0)
+
+ // Offer constraints
+ private val slaveOfferConstraints =
+ parseConstraintString(sc.conf.get("spark.mesos.constraints", ""))
+
+ // Reject offers with mismatched constraints in seconds
+ private val rejectOfferDurationForUnmetConstraints =
+ getRejectOfferDurationForUnmetConstraints(sc)
+
+ // Reject offers when we reached the maximum number of cores for this framework
+ private val rejectOfferDurationForReachedMaxCores =
+ getRejectOfferDurationForReachedMaxCores(sc)
+
+ // A client for talking to the external shuffle service
+ private val mesosExternalShuffleClient: Option[MesosExternalShuffleClient] = {
+ if (shuffleServiceEnabled) {
+ Some(getShuffleClient())
+ } else {
+ None
+ }
+ }
+
+ // This method is factored out for testability
+ protected def getShuffleClient(): MesosExternalShuffleClient = {
+ new MesosExternalShuffleClient(
+ SparkTransportConf.fromSparkConf(conf, "shuffle"),
+ securityManager,
+ securityManager.isAuthenticationEnabled(),
+ securityManager.isSaslEncryptionEnabled())
+ }
+
+ var nextMesosTaskId = 0
+
+ @volatile var appId: String = _
+
+ def newMesosTaskId(): String = {
+ val id = nextMesosTaskId
+ nextMesosTaskId += 1
+ id.toString
+ }
+
+ override def start() {
+ super.start()
+ val driver = createSchedulerDriver(
+ master,
+ MesosCoarseGrainedSchedulerBackend.this,
+ sc.sparkUser,
+ sc.appName,
+ sc.conf,
+ sc.conf.getOption("spark.mesos.driver.webui.url").orElse(sc.ui.map(_.webUrl)),
+ None,
+ None,
+ sc.conf.getOption("spark.mesos.driver.frameworkId")
+ )
+
+ unsetFrameworkID(sc)
+ startScheduler(driver)
+ }
+
+ def createCommand(offer: Offer, numCores: Int, taskId: String): CommandInfo = {
+ val environment = Environment.newBuilder()
+ val extraClassPath = conf.getOption("spark.executor.extraClassPath")
+ extraClassPath.foreach { cp =>
+ environment.addVariables(
+ Environment.Variable.newBuilder().setName("SPARK_CLASSPATH").setValue(cp).build())
+ }
+ val extraJavaOpts = conf.get("spark.executor.extraJavaOptions", "")
+
+ // Set the environment variable through a command prefix
+ // to append to the existing value of the variable
+ val prefixEnv = conf.getOption("spark.executor.extraLibraryPath").map { p =>
+ Utils.libraryPathEnvPrefix(Seq(p))
+ }.getOrElse("")
+
+ environment.addVariables(
+ Environment.Variable.newBuilder()
+ .setName("SPARK_EXECUTOR_OPTS")
+ .setValue(extraJavaOpts)
+ .build())
+
+ sc.executorEnvs.foreach { case (key, value) =>
+ environment.addVariables(Environment.Variable.newBuilder()
+ .setName(key)
+ .setValue(value)
+ .build())
+ }
+ val command = CommandInfo.newBuilder()
+ .setEnvironment(environment)
+
+ val uri = conf.getOption("spark.executor.uri")
+ .orElse(Option(System.getenv("SPARK_EXECUTOR_URI")))
+
+ if (uri.isEmpty) {
+ val executorSparkHome = conf.getOption("spark.mesos.executor.home")
+ .orElse(sc.getSparkHome())
+ .getOrElse {
+ throw new SparkException("Executor Spark home `spark.mesos.executor.home` is not set!")
+ }
+ val runScript = new File(executorSparkHome, "./bin/spark-class").getPath
+ command.setValue(
+ "%s \"%s\" org.apache.spark.executor.CoarseGrainedExecutorBackend"
+ .format(prefixEnv, runScript) +
+ s" --driver-url $driverURL" +
+ s" --executor-id $taskId" +
+ s" --hostname ${executorHostname(offer)}" +
+ s" --cores $numCores" +
+ s" --app-id $appId")
+ } else {
+ // Grab everything to the first '.'. We'll use that and '*' to
+ // glob the directory "correctly".
+ val basename = uri.get.split('/').last.split('.').head
+ command.setValue(
+ s"cd $basename*; $prefixEnv " +
+ "./bin/spark-class org.apache.spark.executor.CoarseGrainedExecutorBackend" +
+ s" --driver-url $driverURL" +
+ s" --executor-id $taskId" +
+ s" --hostname ${executorHostname(offer)}" +
+ s" --cores $numCores" +
+ s" --app-id $appId")
+ command.addUris(CommandInfo.URI.newBuilder().setValue(uri.get).setCache(useFetcherCache))
+ }
+
+ conf.getOption("spark.mesos.uris").foreach(setupUris(_, command, useFetcherCache))
+
+ command.build()
+ }
+
+ protected def driverURL: String = {
+ if (conf.contains("spark.testing")) {
+ "driverURL"
+ } else {
+ RpcEndpointAddress(
+ conf.get("spark.driver.host"),
+ conf.get("spark.driver.port").toInt,
+ CoarseGrainedSchedulerBackend.ENDPOINT_NAME).toString
+ }
+ }
+
+ override def offerRescinded(d: org.apache.mesos.SchedulerDriver, o: OfferID) {}
+
+ override def registered(
+ d: org.apache.mesos.SchedulerDriver, frameworkId: FrameworkID, masterInfo: MasterInfo) {
+ appId = frameworkId.getValue
+ mesosExternalShuffleClient.foreach(_.init(appId))
+ markRegistered()
+ }
+
+ override def sufficientResourcesRegistered(): Boolean = {
+ totalCoresAcquired >= maxCores * minRegisteredRatio
+ }
+
+ override def disconnected(d: org.apache.mesos.SchedulerDriver) {}
+
+ override def reregistered(d: org.apache.mesos.SchedulerDriver, masterInfo: MasterInfo) {}
+
+ /**
+ * Method called by Mesos to offer resources on slaves. We respond by launching an executor,
+ * unless we've already launched more than we wanted to.
+ */
+ override def resourceOffers(d: org.apache.mesos.SchedulerDriver, offers: JList[Offer]) {
+ stateLock.synchronized {
+ if (stopCalled) {
+ logDebug("Ignoring offers during shutdown")
+ // Driver should simply return a stopped status on race
+ // condition between this.stop() and completing here
+ offers.asScala.map(_.getId).foreach(d.declineOffer)
+ return
+ }
+
+ logDebug(s"Received ${offers.size} resource offers.")
+
+ val (matchedOffers, unmatchedOffers) = offers.asScala.partition { offer =>
+ val offerAttributes = toAttributeMap(offer.getAttributesList)
+ matchesAttributeRequirements(slaveOfferConstraints, offerAttributes)
+ }
+
+ declineUnmatchedOffers(d, unmatchedOffers)
+ handleMatchedOffers(d, matchedOffers)
+ }
+ }
+
+ private def declineUnmatchedOffers(
+ d: org.apache.mesos.SchedulerDriver, offers: mutable.Buffer[Offer]): Unit = {
+ offers.foreach { offer =>
+ declineOffer(d, offer, Some("unmet constraints"),
+ Some(rejectOfferDurationForUnmetConstraints))
+ }
+ }
+
+ private def declineOffer(
+ d: org.apache.mesos.SchedulerDriver,
+ offer: Offer,
+ reason: Option[String] = None,
+ refuseSeconds: Option[Long] = None): Unit = {
+
+ val id = offer.getId.getValue
+ val offerAttributes = toAttributeMap(offer.getAttributesList)
+ val mem = getResource(offer.getResourcesList, "mem")
+ val cpus = getResource(offer.getResourcesList, "cpus")
+ val ports = getRangeResource(offer.getResourcesList, "ports")
+
+ logDebug(s"Declining offer: $id with attributes: $offerAttributes mem: $mem" +
+ s" cpu: $cpus port: $ports for $refuseSeconds seconds" +
+ reason.map(r => s" (reason: $r)").getOrElse(""))
+
+ refuseSeconds match {
+ case Some(seconds) =>
+ val filters = Filters.newBuilder().setRefuseSeconds(seconds).build()
+ d.declineOffer(offer.getId, filters)
+ case _ => d.declineOffer(offer.getId)
+ }
+ }
+
+ /**
+ * Launches executors on accepted offers, and declines unused offers. Executors are launched
+ * round-robin on offers.
+ *
+ * @param d SchedulerDriver
+ * @param offers Mesos offers that match attribute constraints
+ */
+ private def handleMatchedOffers(
+ d: org.apache.mesos.SchedulerDriver, offers: mutable.Buffer[Offer]): Unit = {
+ val tasks = buildMesosTasks(offers)
+ for (offer <- offers) {
+ val offerAttributes = toAttributeMap(offer.getAttributesList)
+ val offerMem = getResource(offer.getResourcesList, "mem")
+ val offerCpus = getResource(offer.getResourcesList, "cpus")
+ val offerPorts = getRangeResource(offer.getResourcesList, "ports")
+ val id = offer.getId.getValue
+
+ if (tasks.contains(offer.getId)) { // accept
+ val offerTasks = tasks(offer.getId)
+
+ logDebug(s"Accepting offer: $id with attributes: $offerAttributes " +
+ s"mem: $offerMem cpu: $offerCpus ports: $offerPorts." +
+ s" Launching ${offerTasks.size} Mesos tasks.")
+
+ for (task <- offerTasks) {
+ val taskId = task.getTaskId
+ val mem = getResource(task.getResourcesList, "mem")
+ val cpus = getResource(task.getResourcesList, "cpus")
+ val ports = getRangeResource(task.getResourcesList, "ports").mkString(",")
+
+ logDebug(s"Launching Mesos task: ${taskId.getValue} with mem: $mem cpu: $cpus" +
+ s" ports: $ports")
+ }
+
+ d.launchTasks(
+ Collections.singleton(offer.getId),
+ offerTasks.asJava)
+ } else if (totalCoresAcquired >= maxCores) {
+ // Reject an offer for a configurable amount of time to avoid starving other frameworks
+ declineOffer(d, offer, Some("reached spark.cores.max"),
+ Some(rejectOfferDurationForReachedMaxCores))
+ } else {
+ declineOffer(d, offer)
+ }
+ }
+ }
+
+ /**
+ * Returns a map from OfferIDs to the tasks to launch on those offers. In order to maximize
+ * per-task memory and IO, tasks are round-robin assigned to offers.
+ *
+ * @param offers Mesos offers that match attribute constraints
+ * @return A map from OfferID to a list of Mesos tasks to launch on that offer
+ */
+ private def buildMesosTasks(offers: mutable.Buffer[Offer]): Map[OfferID, List[MesosTaskInfo]] = {
+ // offerID -> tasks
+ val tasks = new mutable.HashMap[OfferID, List[MesosTaskInfo]].withDefaultValue(Nil)
+
+ // offerID -> resources
+ val remainingResources = mutable.Map(offers.map(offer =>
+ (offer.getId.getValue, offer.getResourcesList)): _*)
+
+ var launchTasks = true
+
+ // TODO(mgummelt): combine offers for a single slave
+ //
+ // round-robin create executors on the available offers
+ while (launchTasks) {
+ launchTasks = false
+
+ for (offer <- offers) {
+ val slaveId = offer.getSlaveId.getValue
+ val offerId = offer.getId.getValue
+ val resources = remainingResources(offerId)
+
+ if (canLaunchTask(slaveId, resources)) {
+ // Create a task
+ launchTasks = true
+ val taskId = newMesosTaskId()
+ val offerCPUs = getResource(resources, "cpus").toInt
+ val taskGPUs = Math.min(
+ Math.max(0, maxGpus - totalGpusAcquired), getResource(resources, "gpus").toInt)
+
+ val taskCPUs = executorCores(offerCPUs)
+ val taskMemory = executorMemory(sc)
+
+ slaves.getOrElseUpdate(slaveId, new Slave(offer.getHostname)).taskIDs.add(taskId)
+
+ val (resourcesLeft, resourcesToUse) =
+ partitionTaskResources(resources, taskCPUs, taskMemory, taskGPUs)
+
+ val taskBuilder = MesosTaskInfo.newBuilder()
+ .setTaskId(TaskID.newBuilder().setValue(taskId.toString).build())
+ .setSlaveId(offer.getSlaveId)
+ .setCommand(createCommand(offer, taskCPUs + extraCoresPerExecutor, taskId))
+ .setName("Task " + taskId)
+ taskBuilder.addAllResources(resourcesToUse.asJava)
+ taskBuilder.setContainer(MesosSchedulerBackendUtil.containerInfo(sc.conf))
+
+ tasks(offer.getId) ::= taskBuilder.build()
+ remainingResources(offerId) = resourcesLeft.asJava
+ totalCoresAcquired += taskCPUs
+ coresByTaskId(taskId) = taskCPUs
+ if (taskGPUs > 0) {
+ totalGpusAcquired += taskGPUs
+ gpusByTaskId(taskId) = taskGPUs
+ }
+ }
+ }
+ }
+ tasks.toMap
+ }
+
+ /** Extracts task needed resources from a list of available resources. */
+ private def partitionTaskResources(
+ resources: JList[Resource],
+ taskCPUs: Int,
+ taskMemory: Int,
+ taskGPUs: Int)
+ : (List[Resource], List[Resource]) = {
+
+ // partition cpus & mem
+ val (afterCPUResources, cpuResourcesToUse) = partitionResources(resources, "cpus", taskCPUs)
+ val (afterMemResources, memResourcesToUse) =
+ partitionResources(afterCPUResources.asJava, "mem", taskMemory)
+ val (afterGPUResources, gpuResourcesToUse) =
+ partitionResources(afterMemResources.asJava, "gpus", taskGPUs)
+
+ // If user specifies port numbers in SparkConfig then consecutive tasks will not be launched
+ // on the same host. This essentially means one executor per host.
+ // TODO: handle network isolator case
+ val (nonPortResources, portResourcesToUse) =
+ partitionPortResources(nonZeroPortValuesFromConfig(sc.conf), afterGPUResources)
+
+ (nonPortResources,
+ cpuResourcesToUse ++ memResourcesToUse ++ portResourcesToUse ++ gpuResourcesToUse)
+ }
+
+ private def canLaunchTask(slaveId: String, resources: JList[Resource]): Boolean = {
+ val offerMem = getResource(resources, "mem")
+ val offerCPUs = getResource(resources, "cpus").toInt
+ val cpus = executorCores(offerCPUs)
+ val mem = executorMemory(sc)
+ val ports = getRangeResource(resources, "ports")
+ val meetsPortRequirements = checkPorts(sc.conf, ports)
+
+ cpus > 0 &&
+ cpus <= offerCPUs &&
+ cpus + totalCoresAcquired <= maxCores &&
+ mem <= offerMem &&
+ numExecutors() < executorLimit &&
+ slaves.get(slaveId).map(_.taskFailures).getOrElse(0) < MAX_SLAVE_FAILURES &&
+ meetsPortRequirements
+ }
+
+ private def executorCores(offerCPUs: Int): Int = {
+ sc.conf.getInt("spark.executor.cores",
+ math.min(offerCPUs, maxCores - totalCoresAcquired))
+ }
+
+ override def statusUpdate(d: org.apache.mesos.SchedulerDriver, status: TaskStatus) {
+ val taskId = status.getTaskId.getValue
+ val slaveId = status.getSlaveId.getValue
+ val state = mesosToTaskState(status.getState)
+
+ logInfo(s"Mesos task $taskId is now ${status.getState}")
+
+ stateLock.synchronized {
+ val slave = slaves(slaveId)
+
+ // If the shuffle service is enabled, have the driver register with each one of the
+ // shuffle services. This allows the shuffle services to clean up state associated with
+ // this application when the driver exits. There is currently not a great way to detect
+ // this through Mesos, since the shuffle services are set up independently.
+ if (state.equals(TaskState.RUNNING) &&
+ shuffleServiceEnabled &&
+ !slave.shuffleRegistered) {
+ assume(mesosExternalShuffleClient.isDefined,
+ "External shuffle client was not instantiated even though shuffle service is enabled.")
+ // TODO: Remove this and allow the MesosExternalShuffleService to detect
+ // framework termination when new Mesos Framework HTTP API is available.
+ val externalShufflePort = conf.getInt("spark.shuffle.service.port", 7337)
+
+ logDebug(s"Connecting to shuffle service on slave $slaveId, " +
+ s"host ${slave.hostname}, port $externalShufflePort for app ${conf.getAppId}")
+
+ mesosExternalShuffleClient.get
+ .registerDriverWithShuffleService(
+ slave.hostname,
+ externalShufflePort,
+ sc.conf.getTimeAsMs("spark.storage.blockManagerSlaveTimeoutMs",
+ s"${sc.conf.getTimeAsMs("spark.network.timeout", "120s")}ms"),
+ sc.conf.getTimeAsMs("spark.executor.heartbeatInterval", "10s"))
+ slave.shuffleRegistered = true
+ }
+
+ if (TaskState.isFinished(state)) {
+ // Remove the cores we have remembered for this task, if it's in the hashmap
+ for (cores <- coresByTaskId.get(taskId)) {
+ totalCoresAcquired -= cores
+ coresByTaskId -= taskId
+ }
+ // Also remove the gpus we have remembered for this task, if it's in the hashmap
+ for (gpus <- gpusByTaskId.get(taskId)) {
+ totalGpusAcquired -= gpus
+ gpusByTaskId -= taskId
+ }
+ // If it was a failure, mark the slave as failed for blacklisting purposes
+ if (TaskState.isFailed(state)) {
+ slave.taskFailures += 1
+
+ if (slave.taskFailures >= MAX_SLAVE_FAILURES) {
+ logInfo(s"Blacklisting Mesos slave $slaveId due to too many failures; " +
+ "is Spark installed on it?")
+ }
+ }
+ executorTerminated(d, slaveId, taskId, s"Executor finished with state $state")
+ // In case we'd rejected everything before but have now lost a node
+ d.reviveOffers()
+ }
+ }
+ }
+
+ override def error(d: org.apache.mesos.SchedulerDriver, message: String) {
+ logError(s"Mesos error: $message")
+ scheduler.error(message)
+ }
+
+ override def stop() {
+ // Make sure we're not launching tasks during shutdown
+ stateLock.synchronized {
+ if (stopCalled) {
+ logWarning("Stop called multiple times, ignoring")
+ return
+ }
+ stopCalled = true
+ super.stop()
+ }
+
+ // Wait for executors to report done, or else mesosDriver.stop() will forcefully kill them.
+ // See SPARK-12330
+ val startTime = System.nanoTime()
+
+ // slaveIdsWithExecutors has no memory barrier, so this is eventually consistent
+ while (numExecutors() > 0 &&
+ System.nanoTime() - startTime < shutdownTimeoutMS * 1000L * 1000L) {
+ Thread.sleep(100)
+ }
+
+ if (numExecutors() > 0) {
+ logWarning(s"Timed out waiting for ${numExecutors()} remaining executors "
+ + s"to terminate within $shutdownTimeoutMS ms. This may leave temporary files "
+ + "on the mesos nodes.")
+ }
+
+ // Close the mesos external shuffle client if used
+ mesosExternalShuffleClient.foreach(_.close())
+
+ if (mesosDriver != null) {
+ mesosDriver.stop()
+ }
+ }
+
+ override def frameworkMessage(
+ d: org.apache.mesos.SchedulerDriver, e: ExecutorID, s: SlaveID, b: Array[Byte]) {}
+
+ /**
+ * Called when a slave is lost or a Mesos task finished. Updates local view on
+ * what tasks are running. It also notifies the driver that an executor was removed.
+ */
+ private def executorTerminated(
+ d: org.apache.mesos.SchedulerDriver,
+ slaveId: String,
+ taskId: String,
+ reason: String): Unit = {
+ stateLock.synchronized {
+ // Do not call removeExecutor() after this scheduler backend was stopped because
+ // removeExecutor() internally will send a message to the driver endpoint but
+ // the driver endpoint is not available now, otherwise an exception will be thrown.
+ if (!stopCalled) {
+ removeExecutor(taskId, SlaveLost(reason))
+ }
+ slaves(slaveId).taskIDs.remove(taskId)
+ }
+ }
+
+ override def slaveLost(d: org.apache.mesos.SchedulerDriver, slaveId: SlaveID): Unit = {
+ logInfo(s"Mesos slave lost: ${slaveId.getValue}")
+ }
+
+ override def executorLost(
+ d: org.apache.mesos.SchedulerDriver, e: ExecutorID, s: SlaveID, status: Int): Unit = {
+ logInfo("Mesos executor lost: %s".format(e.getValue))
+ }
+
+ override def applicationId(): String =
+ Option(appId).getOrElse {
+ logWarning("Application ID is not initialized yet.")
+ super.applicationId
+ }
+
+ override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = Future.successful {
+ // We don't truly know if we can fulfill the full amount of executors
+ // since at coarse grain it depends on the amount of slaves available.
+ logInfo("Capping the total amount of executors to " + requestedTotal)
+ executorLimitOption = Some(requestedTotal)
+ true
+ }
+
+ override def doKillExecutors(executorIds: Seq[String]): Future[Boolean] = Future.successful {
+ if (mesosDriver == null) {
+ logWarning("Asked to kill executors before the Mesos driver was started.")
+ false
+ } else {
+ for (executorId <- executorIds) {
+ val taskId = TaskID.newBuilder().setValue(executorId).build()
+ mesosDriver.killTask(taskId)
+ }
+ // no need to adjust `executorLimitOption` since the AllocationManager already communicated
+ // the desired limit through a call to `doRequestTotalExecutors`.
+ // See [[o.a.s.scheduler.cluster.CoarseGrainedSchedulerBackend.killExecutors]]
+ true
+ }
+ }
+
+ private def numExecutors(): Int = {
+ slaves.values.map(_.taskIDs.size).sum
+ }
+
+ private def executorHostname(offer: Offer): String = {
+ if (sc.conf.getOption("spark.mesos.network.name").isDefined) {
+ // The agent's IP is not visible in a CNI container, so we bind to 0.0.0.0
+ "0.0.0.0"
+ } else {
+ offer.getHostname
+ }
+ }
+}
+
+private class Slave(val hostname: String) {
+ val taskIDs = new mutable.HashSet[String]()
+ var taskFailures = 0
+ var shuffleRegistered = false
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala
new file mode 100644
index 0000000..779ffb5
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackend.scala
@@ -0,0 +1,444 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import java.io.File
+import java.util.{ArrayList => JArrayList, Collections, List => JList}
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable.{HashMap, HashSet}
+
+import org.apache.mesos.Protos.{ExecutorInfo => MesosExecutorInfo, TaskInfo => MesosTaskInfo, _}
+import org.apache.mesos.protobuf.ByteString
+
+import org.apache.spark.{SparkContext, SparkException, TaskState}
+import org.apache.spark.executor.MesosExecutorBackend
+import org.apache.spark.scheduler._
+import org.apache.spark.scheduler.cluster.ExecutorInfo
+import org.apache.spark.util.Utils
+
+/**
+ * A SchedulerBackend for running fine-grained tasks on Mesos. Each Spark task is mapped to a
+ * separate Mesos task, allowing multiple applications to share cluster nodes both in space (tasks
+ * from multiple apps can run on different cores) and in time (a core can switch ownership).
+ */
+private[spark] class MesosFineGrainedSchedulerBackend(
+ scheduler: TaskSchedulerImpl,
+ sc: SparkContext,
+ master: String)
+ extends SchedulerBackend
+ with org.apache.mesos.Scheduler
+ with MesosSchedulerUtils {
+
+ // Stores the slave ids that has launched a Mesos executor.
+ val slaveIdToExecutorInfo = new HashMap[String, MesosExecutorInfo]
+ val taskIdToSlaveId = new HashMap[Long, String]
+
+ // An ExecutorInfo for our tasks
+ var execArgs: Array[Byte] = null
+
+ var classLoader: ClassLoader = null
+
+ // The listener bus to publish executor added/removed events.
+ val listenerBus = sc.listenerBus
+
+ private[mesos] val mesosExecutorCores = sc.conf.getDouble("spark.mesos.mesosExecutor.cores", 1)
+
+ // Offer constraints
+ private[this] val slaveOfferConstraints =
+ parseConstraintString(sc.conf.get("spark.mesos.constraints", ""))
+
+ // reject offers with mismatched constraints in seconds
+ private val rejectOfferDurationForUnmetConstraints =
+ getRejectOfferDurationForUnmetConstraints(sc)
+
+ @volatile var appId: String = _
+
+ override def start() {
+ classLoader = Thread.currentThread.getContextClassLoader
+ val driver = createSchedulerDriver(
+ master,
+ MesosFineGrainedSchedulerBackend.this,
+ sc.sparkUser,
+ sc.appName,
+ sc.conf,
+ sc.conf.getOption("spark.mesos.driver.webui.url").orElse(sc.ui.map(_.webUrl)),
+ Option.empty,
+ Option.empty,
+ sc.conf.getOption("spark.mesos.driver.frameworkId")
+ )
+
+ unsetFrameworkID(sc)
+ startScheduler(driver)
+ }
+
+ /**
+ * Creates a MesosExecutorInfo that is used to launch a Mesos executor.
+ * @param availableResources Available resources that is offered by Mesos
+ * @param execId The executor id to assign to this new executor.
+ * @return A tuple of the new mesos executor info and the remaining available resources.
+ */
+ def createExecutorInfo(
+ availableResources: JList[Resource],
+ execId: String): (MesosExecutorInfo, JList[Resource]) = {
+ val executorSparkHome = sc.conf.getOption("spark.mesos.executor.home")
+ .orElse(sc.getSparkHome()) // Fall back to driver Spark home for backward compatibility
+ .getOrElse {
+ throw new SparkException("Executor Spark home `spark.mesos.executor.home` is not set!")
+ }
+ val environment = Environment.newBuilder()
+ sc.conf.getOption("spark.executor.extraClassPath").foreach { cp =>
+ environment.addVariables(
+ Environment.Variable.newBuilder().setName("SPARK_CLASSPATH").setValue(cp).build())
+ }
+ val extraJavaOpts = sc.conf.getOption("spark.executor.extraJavaOptions").getOrElse("")
+
+ val prefixEnv = sc.conf.getOption("spark.executor.extraLibraryPath").map { p =>
+ Utils.libraryPathEnvPrefix(Seq(p))
+ }.getOrElse("")
+
+ environment.addVariables(
+ Environment.Variable.newBuilder()
+ .setName("SPARK_EXECUTOR_OPTS")
+ .setValue(extraJavaOpts)
+ .build())
+ sc.executorEnvs.foreach { case (key, value) =>
+ environment.addVariables(Environment.Variable.newBuilder()
+ .setName(key)
+ .setValue(value)
+ .build())
+ }
+ val command = CommandInfo.newBuilder()
+ .setEnvironment(environment)
+ val uri = sc.conf.getOption("spark.executor.uri")
+ .orElse(Option(System.getenv("SPARK_EXECUTOR_URI")))
+
+ val executorBackendName = classOf[MesosExecutorBackend].getName
+ if (uri.isEmpty) {
+ val executorPath = new File(executorSparkHome, "/bin/spark-class").getPath
+ command.setValue(s"$prefixEnv $executorPath $executorBackendName")
+ } else {
+ // Grab everything to the first '.'. We'll use that and '*' to
+ // glob the directory "correctly".
+ val basename = uri.get.split('/').last.split('.').head
+ command.setValue(s"cd ${basename}*; $prefixEnv ./bin/spark-class $executorBackendName")
+ command.addUris(CommandInfo.URI.newBuilder().setValue(uri.get))
+ }
+ val builder = MesosExecutorInfo.newBuilder()
+ val (resourcesAfterCpu, usedCpuResources) =
+ partitionResources(availableResources, "cpus", mesosExecutorCores)
+ val (resourcesAfterMem, usedMemResources) =
+ partitionResources(resourcesAfterCpu.asJava, "mem", executorMemory(sc))
+
+ builder.addAllResources(usedCpuResources.asJava)
+ builder.addAllResources(usedMemResources.asJava)
+
+ sc.conf.getOption("spark.mesos.uris").foreach(setupUris(_, command))
+
+ val executorInfo = builder
+ .setExecutorId(ExecutorID.newBuilder().setValue(execId).build())
+ .setCommand(command)
+ .setData(ByteString.copyFrom(createExecArg()))
+
+ executorInfo.setContainer(MesosSchedulerBackendUtil.containerInfo(sc.conf))
+ (executorInfo.build(), resourcesAfterMem.asJava)
+ }
+
+ /**
+ * Create and serialize the executor argument to pass to Mesos. Our executor arg is an array
+ * containing all the spark.* system properties in the form of (String, String) pairs.
+ */
+ private def createExecArg(): Array[Byte] = {
+ if (execArgs == null) {
+ val props = new HashMap[String, String]
+ for ((key, value) <- sc.conf.getAll) {
+ props(key) = value
+ }
+ // Serialize the map as an array of (String, String) pairs
+ execArgs = Utils.serialize(props.toArray)
+ }
+ execArgs
+ }
+
+ override def offerRescinded(d: org.apache.mesos.SchedulerDriver, o: OfferID) {}
+
+ override def registered(
+ d: org.apache.mesos.SchedulerDriver, frameworkId: FrameworkID, masterInfo: MasterInfo) {
+ inClassLoader() {
+ appId = frameworkId.getValue
+ logInfo("Registered as framework ID " + appId)
+ markRegistered()
+ }
+ }
+
+ private def inClassLoader()(fun: => Unit) = {
+ val oldClassLoader = Thread.currentThread.getContextClassLoader
+ Thread.currentThread.setContextClassLoader(classLoader)
+ try {
+ fun
+ } finally {
+ Thread.currentThread.setContextClassLoader(oldClassLoader)
+ }
+ }
+
+ override def disconnected(d: org.apache.mesos.SchedulerDriver) {}
+
+ override def reregistered(d: org.apache.mesos.SchedulerDriver, masterInfo: MasterInfo) {}
+
+ private def getTasksSummary(tasks: JArrayList[MesosTaskInfo]): String = {
+ val builder = new StringBuilder
+ tasks.asScala.foreach { t =>
+ builder.append("Task id: ").append(t.getTaskId.getValue).append("\n")
+ .append("Slave id: ").append(t.getSlaveId.getValue).append("\n")
+ .append("Task resources: ").append(t.getResourcesList).append("\n")
+ .append("Executor resources: ").append(t.getExecutor.getResourcesList)
+ .append("---------------------------------------------\n")
+ }
+ builder.toString()
+ }
+
+ /**
+ * Method called by Mesos to offer resources on slaves. We respond by asking our active task sets
+ * for tasks in order of priority. We fill each node with tasks in a round-robin manner so that
+ * tasks are balanced across the cluster.
+ */
+ override def resourceOffers(d: org.apache.mesos.SchedulerDriver, offers: JList[Offer]) {
+ inClassLoader() {
+ // Fail first on offers with unmet constraints
+ val (offersMatchingConstraints, offersNotMatchingConstraints) =
+ offers.asScala.partition { o =>
+ val offerAttributes = toAttributeMap(o.getAttributesList)
+ val meetsConstraints =
+ matchesAttributeRequirements(slaveOfferConstraints, offerAttributes)
+
+ // add some debug messaging
+ if (!meetsConstraints) {
+ val id = o.getId.getValue
+ logDebug(s"Declining offer: $id with attributes: $offerAttributes")
+ }
+
+ meetsConstraints
+ }
+
+ // These offers do not meet constraints. We don't need to see them again.
+ // Decline the offer for a long period of time.
+ offersNotMatchingConstraints.foreach { o =>
+ d.declineOffer(o.getId, Filters.newBuilder()
+ .setRefuseSeconds(rejectOfferDurationForUnmetConstraints).build())
+ }
+
+ // Of the matching constraints, see which ones give us enough memory and cores
+ val (usableOffers, unUsableOffers) = offersMatchingConstraints.partition { o =>
+ val mem = getResource(o.getResourcesList, "mem")
+ val cpus = getResource(o.getResourcesList, "cpus")
+ val slaveId = o.getSlaveId.getValue
+ val offerAttributes = toAttributeMap(o.getAttributesList)
+
+ // check offers for
+ // 1. Memory requirements
+ // 2. CPU requirements - need at least 1 for executor, 1 for task
+ val meetsMemoryRequirements = mem >= executorMemory(sc)
+ val meetsCPURequirements = cpus >= (mesosExecutorCores + scheduler.CPUS_PER_TASK)
+ val meetsRequirements =
+ (meetsMemoryRequirements && meetsCPURequirements) ||
+ (slaveIdToExecutorInfo.contains(slaveId) && cpus >= scheduler.CPUS_PER_TASK)
+ val debugstr = if (meetsRequirements) "Accepting" else "Declining"
+ logDebug(s"$debugstr offer: ${o.getId.getValue} with attributes: "
+ + s"$offerAttributes mem: $mem cpu: $cpus")
+
+ meetsRequirements
+ }
+
+ // Decline offers we ruled out immediately
+ unUsableOffers.foreach(o => d.declineOffer(o.getId))
+
+ val workerOffers = usableOffers.map { o =>
+ val cpus = if (slaveIdToExecutorInfo.contains(o.getSlaveId.getValue)) {
+ getResource(o.getResourcesList, "cpus").toInt
+ } else {
+ // If the Mesos executor has not been started on this slave yet, set aside a few
+ // cores for the Mesos executor by offering fewer cores to the Spark executor
+ (getResource(o.getResourcesList, "cpus") - mesosExecutorCores).toInt
+ }
+ new WorkerOffer(
+ o.getSlaveId.getValue,
+ o.getHostname,
+ cpus)
+ }.toIndexedSeq
+
+ val slaveIdToOffer = usableOffers.map(o => o.getSlaveId.getValue -> o).toMap
+ val slaveIdToWorkerOffer = workerOffers.map(o => o.executorId -> o).toMap
+ val slaveIdToResources = new HashMap[String, JList[Resource]]()
+ usableOffers.foreach { o =>
+ slaveIdToResources(o.getSlaveId.getValue) = o.getResourcesList
+ }
+
+ val mesosTasks = new HashMap[String, JArrayList[MesosTaskInfo]]
+
+ val slavesIdsOfAcceptedOffers = HashSet[String]()
+
+ // Call into the TaskSchedulerImpl
+ val acceptedOffers = scheduler.resourceOffers(workerOffers).filter(!_.isEmpty)
+ acceptedOffers
+ .foreach { offer =>
+ offer.foreach { taskDesc =>
+ val slaveId = taskDesc.executorId
+ slavesIdsOfAcceptedOffers += slaveId
+ taskIdToSlaveId(taskDesc.taskId) = slaveId
+ val (mesosTask, remainingResources) = createMesosTask(
+ taskDesc,
+ slaveIdToResources(slaveId),
+ slaveId)
+ mesosTasks.getOrElseUpdate(slaveId, new JArrayList[MesosTaskInfo])
+ .add(mesosTask)
+ slaveIdToResources(slaveId) = remainingResources
+ }
+ }
+
+ // Reply to the offers
+ val filters = Filters.newBuilder().setRefuseSeconds(1).build() // TODO: lower timeout?
+
+ mesosTasks.foreach { case (slaveId, tasks) =>
+ slaveIdToWorkerOffer.get(slaveId).foreach(o =>
+ listenerBus.post(SparkListenerExecutorAdded(System.currentTimeMillis(), slaveId,
+ // TODO: Add support for log urls for Mesos
+ new ExecutorInfo(o.host, o.cores, Map.empty)))
+ )
+ logTrace(s"Launching Mesos tasks on slave '$slaveId', tasks:\n${getTasksSummary(tasks)}")
+ d.launchTasks(Collections.singleton(slaveIdToOffer(slaveId).getId), tasks, filters)
+ }
+
+ // Decline offers that weren't used
+ // NOTE: This logic assumes that we only get a single offer for each host in a given batch
+ for (o <- usableOffers if !slavesIdsOfAcceptedOffers.contains(o.getSlaveId.getValue)) {
+ d.declineOffer(o.getId)
+ }
+ }
+ }
+
+ /** Turn a Spark TaskDescription into a Mesos task and also resources unused by the task */
+ def createMesosTask(
+ task: TaskDescription,
+ resources: JList[Resource],
+ slaveId: String): (MesosTaskInfo, JList[Resource]) = {
+ val taskId = TaskID.newBuilder().setValue(task.taskId.toString).build()
+ val (executorInfo, remainingResources) = if (slaveIdToExecutorInfo.contains(slaveId)) {
+ (slaveIdToExecutorInfo(slaveId), resources)
+ } else {
+ createExecutorInfo(resources, slaveId)
+ }
+ slaveIdToExecutorInfo(slaveId) = executorInfo
+ val (finalResources, cpuResources) =
+ partitionResources(remainingResources, "cpus", scheduler.CPUS_PER_TASK)
+ val taskInfo = MesosTaskInfo.newBuilder()
+ .setTaskId(taskId)
+ .setSlaveId(SlaveID.newBuilder().setValue(slaveId).build())
+ .setExecutor(executorInfo)
+ .setName(task.name)
+ .addAllResources(cpuResources.asJava)
+ .setData(MesosTaskLaunchData(task.serializedTask, task.attemptNumber).toByteString)
+ .build()
+ (taskInfo, finalResources.asJava)
+ }
+
+ override def statusUpdate(d: org.apache.mesos.SchedulerDriver, status: TaskStatus) {
+ inClassLoader() {
+ val tid = status.getTaskId.getValue.toLong
+ val state = mesosToTaskState(status.getState)
+ synchronized {
+ if (TaskState.isFailed(mesosToTaskState(status.getState))
+ && taskIdToSlaveId.contains(tid)) {
+ // We lost the executor on this slave, so remember that it's gone
+ removeExecutor(taskIdToSlaveId(tid), "Lost executor")
+ }
+ if (TaskState.isFinished(state)) {
+ taskIdToSlaveId.remove(tid)
+ }
+ }
+ scheduler.statusUpdate(tid, state, status.getData.asReadOnlyByteBuffer)
+ }
+ }
+
+ override def error(d: org.apache.mesos.SchedulerDriver, message: String) {
+ inClassLoader() {
+ logError("Mesos error: " + message)
+ markErr()
+ scheduler.error(message)
+ }
+ }
+
+ override def stop() {
+ if (mesosDriver != null) {
+ mesosDriver.stop()
+ }
+ }
+
+ override def reviveOffers() {
+ mesosDriver.reviveOffers()
+ }
+
+ override def frameworkMessage(
+ d: org.apache.mesos.SchedulerDriver, e: ExecutorID, s: SlaveID, b: Array[Byte]) {}
+
+ /**
+ * Remove executor associated with slaveId in a thread safe manner.
+ */
+ private def removeExecutor(slaveId: String, reason: String) = {
+ synchronized {
+ listenerBus.post(SparkListenerExecutorRemoved(System.currentTimeMillis(), slaveId, reason))
+ slaveIdToExecutorInfo -= slaveId
+ }
+ }
+
+ private def recordSlaveLost(
+ d: org.apache.mesos.SchedulerDriver, slaveId: SlaveID, reason: ExecutorLossReason) {
+ inClassLoader() {
+ logInfo("Mesos slave lost: " + slaveId.getValue)
+ removeExecutor(slaveId.getValue, reason.toString)
+ scheduler.executorLost(slaveId.getValue, reason)
+ }
+ }
+
+ override def slaveLost(d: org.apache.mesos.SchedulerDriver, slaveId: SlaveID) {
+ recordSlaveLost(d, slaveId, SlaveLost())
+ }
+
+ override def executorLost(
+ d: org.apache.mesos.SchedulerDriver, executorId: ExecutorID, slaveId: SlaveID, status: Int) {
+ logInfo("Executor lost: %s, marking slave %s as lost".format(executorId.getValue,
+ slaveId.getValue))
+ recordSlaveLost(d, slaveId, ExecutorExited(status, exitCausedByApp = true))
+ }
+
+ override def killTask(taskId: Long, executorId: String, interruptThread: Boolean): Unit = {
+ mesosDriver.killTask(
+ TaskID.newBuilder()
+ .setValue(taskId.toString).build()
+ )
+ }
+
+ // TODO: query Mesos for number of cores
+ override def defaultParallelism(): Int = sc.conf.getInt("spark.default.parallelism", 8)
+
+ override def applicationId(): String =
+ Option(appId).getOrElse {
+ logWarning("Application ID is not initialized yet.")
+ super.applicationId
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala
new file mode 100644
index 0000000..a2adb22
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import org.apache.mesos.Protos.{ContainerInfo, Image, NetworkInfo, Volume}
+import org.apache.mesos.Protos.ContainerInfo.{DockerInfo, MesosInfo}
+
+import org.apache.spark.{SparkConf, SparkException}
+import org.apache.spark.internal.Logging
+
+/**
+ * A collection of utility functions which can be used by both the
+ * MesosSchedulerBackend and the [[MesosFineGrainedSchedulerBackend]].
+ */
+private[mesos] object MesosSchedulerBackendUtil extends Logging {
+ /**
+ * Parse a comma-delimited list of volume specs, each of which
+ * takes the form [host-dir:]container-dir[:rw|:ro].
+ */
+ def parseVolumesSpec(volumes: String): List[Volume] = {
+ volumes.split(",").map(_.split(":")).flatMap { spec =>
+ val vol: Volume.Builder = Volume
+ .newBuilder()
+ .setMode(Volume.Mode.RW)
+ spec match {
+ case Array(container_path) =>
+ Some(vol.setContainerPath(container_path))
+ case Array(container_path, "rw") =>
+ Some(vol.setContainerPath(container_path))
+ case Array(container_path, "ro") =>
+ Some(vol.setContainerPath(container_path)
+ .setMode(Volume.Mode.RO))
+ case Array(host_path, container_path) =>
+ Some(vol.setContainerPath(container_path)
+ .setHostPath(host_path))
+ case Array(host_path, container_path, "rw") =>
+ Some(vol.setContainerPath(container_path)
+ .setHostPath(host_path))
+ case Array(host_path, container_path, "ro") =>
+ Some(vol.setContainerPath(container_path)
+ .setHostPath(host_path)
+ .setMode(Volume.Mode.RO))
+ case spec =>
+ logWarning(s"Unable to parse volume specs: $volumes. "
+ + "Expected form: \"[host-dir:]container-dir[:rw|:ro](, ...)\"")
+ None
+ }
+ }
+ .map { _.build() }
+ .toList
+ }
+
+ /**
+ * Parse a comma-delimited list of port mapping specs, each of which
+ * takes the form host_port:container_port[:udp|:tcp]
+ *
+ * Note:
+ * the docker form is [ip:]host_port:container_port, but the DockerInfo
+ * message has no field for 'ip', and instead has a 'protocol' field.
+ * Docker itself only appears to support TCP, so this alternative form
+ * anticipates the expansion of the docker form to allow for a protocol
+ * and leaves open the chance for mesos to begin to accept an 'ip' field
+ */
+ def parsePortMappingsSpec(portmaps: String): List[DockerInfo.PortMapping] = {
+ portmaps.split(",").map(_.split(":")).flatMap { spec: Array[String] =>
+ val portmap: DockerInfo.PortMapping.Builder = DockerInfo.PortMapping
+ .newBuilder()
+ .setProtocol("tcp")
+ spec match {
+ case Array(host_port, container_port) =>
+ Some(portmap.setHostPort(host_port.toInt)
+ .setContainerPort(container_port.toInt))
+ case Array(host_port, container_port, protocol) =>
+ Some(portmap.setHostPort(host_port.toInt)
+ .setContainerPort(container_port.toInt)
+ .setProtocol(protocol))
+ case spec =>
+ logWarning(s"Unable to parse port mapping specs: $portmaps. "
+ + "Expected form: \"host_port:container_port[:udp|:tcp](, ...)\"")
+ None
+ }
+ }
+ .map { _.build() }
+ .toList
+ }
+
+ def containerInfo(conf: SparkConf): ContainerInfo = {
+ val containerType = if (conf.contains("spark.mesos.executor.docker.image") &&
+ conf.get("spark.mesos.containerizer", "docker") == "docker") {
+ ContainerInfo.Type.DOCKER
+ } else {
+ ContainerInfo.Type.MESOS
+ }
+
+ val containerInfo = ContainerInfo.newBuilder()
+ .setType(containerType)
+
+ conf.getOption("spark.mesos.executor.docker.image").map { image =>
+ val forcePullImage = conf
+ .getOption("spark.mesos.executor.docker.forcePullImage")
+ .exists(_.equals("true"))
+
+ val portMaps = conf
+ .getOption("spark.mesos.executor.docker.portmaps")
+ .map(parsePortMappingsSpec)
+ .getOrElse(List.empty)
+
+ if (containerType == ContainerInfo.Type.DOCKER) {
+ containerInfo.setDocker(dockerInfo(image, forcePullImage, portMaps))
+ } else {
+ containerInfo.setMesos(mesosInfo(image, forcePullImage))
+ }
+
+ val volumes = conf
+ .getOption("spark.mesos.executor.docker.volumes")
+ .map(parseVolumesSpec)
+
+ volumes.foreach(_.foreach(containerInfo.addVolumes(_)))
+ }
+
+ conf.getOption("spark.mesos.network.name").map { name =>
+ val info = NetworkInfo.newBuilder().setName(name).build()
+ containerInfo.addNetworkInfos(info)
+ }
+
+ containerInfo.build()
+ }
+
+ private def dockerInfo(
+ image: String,
+ forcePullImage: Boolean,
+ portMaps: List[ContainerInfo.DockerInfo.PortMapping]): DockerInfo = {
+ val dockerBuilder = ContainerInfo.DockerInfo.newBuilder()
+ .setImage(image)
+ .setForcePullImage(forcePullImage)
+ portMaps.foreach(dockerBuilder.addPortMappings(_))
+
+ dockerBuilder.build
+ }
+
+ private def mesosInfo(image: String, forcePullImage: Boolean): MesosInfo = {
+ val imageProto = Image.newBuilder()
+ .setType(Image.Type.DOCKER)
+ .setDocker(Image.Docker.newBuilder().setName(image))
+ .setCached(!forcePullImage)
+ ContainerInfo.MesosInfo.newBuilder()
+ .setImage(imageProto)
+ .build
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org
[10/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Posted by va...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala
new file mode 100644
index 0000000..8772e26
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala
@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import scala.collection.mutable.{ArrayBuffer, HashMap, Set}
+import scala.collection.JavaConverters._
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.yarn.api.records.{ContainerId, Resource}
+import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
+import org.apache.hadoop.yarn.util.RackResolver
+
+import org.apache.spark.SparkConf
+import org.apache.spark.internal.config._
+
+private[yarn] case class ContainerLocalityPreferences(nodes: Array[String], racks: Array[String])
+
+/**
+ * This strategy is calculating the optimal locality preferences of YARN containers by considering
+ * the node ratio of pending tasks, number of required cores/containers and and locality of current
+ * existing and pending allocated containers. The target of this algorithm is to maximize the number
+ * of tasks that would run locally.
+ *
+ * Consider a situation in which we have 20 tasks that require (host1, host2, host3)
+ * and 10 tasks that require (host1, host2, host4), besides each container has 2 cores
+ * and cpus per task is 1, so the required container number is 15,
+ * and host ratio is (host1: 30, host2: 30, host3: 20, host4: 10).
+ *
+ * 1. If requested container number (18) is more than the required container number (15):
+ *
+ * requests for 5 containers with nodes: (host1, host2, host3, host4)
+ * requests for 5 containers with nodes: (host1, host2, host3)
+ * requests for 5 containers with nodes: (host1, host2)
+ * requests for 3 containers with no locality preferences.
+ *
+ * The placement ratio is 3 : 3 : 2 : 1, and set the additional containers with no locality
+ * preferences.
+ *
+ * 2. If requested container number (10) is less than or equal to the required container number
+ * (15):
+ *
+ * requests for 4 containers with nodes: (host1, host2, host3, host4)
+ * requests for 3 containers with nodes: (host1, host2, host3)
+ * requests for 3 containers with nodes: (host1, host2)
+ *
+ * The placement ratio is 10 : 10 : 7 : 4, close to expected ratio (3 : 3 : 2 : 1)
+ *
+ * 3. If containers exist but none of them can match the requested localities,
+ * follow the method of 1 and 2.
+ *
+ * 4. If containers exist and some of them can match the requested localities.
+ * For example if we have 1 containers on each node (host1: 1, host2: 1: host3: 1, host4: 1),
+ * and the expected containers on each node would be (host1: 5, host2: 5, host3: 4, host4: 2),
+ * so the newly requested containers on each node would be updated to (host1: 4, host2: 4,
+ * host3: 3, host4: 1), 12 containers by total.
+ *
+ * 4.1 If requested container number (18) is more than newly required containers (12). Follow
+ * method 1 with updated ratio 4 : 4 : 3 : 1.
+ *
+ * 4.2 If request container number (10) is more than newly required containers (12). Follow
+ * method 2 with updated ratio 4 : 4 : 3 : 1.
+ *
+ * 5. If containers exist and existing localities can fully cover the requested localities.
+ * For example if we have 5 containers on each node (host1: 5, host2: 5, host3: 5, host4: 5),
+ * which could cover the current requested localities. This algorithm will allocate all the
+ * requested containers with no localities.
+ */
+private[yarn] class LocalityPreferredContainerPlacementStrategy(
+ val sparkConf: SparkConf,
+ val yarnConf: Configuration,
+ val resource: Resource) {
+
+ /**
+ * Calculate each container's node locality and rack locality
+ * @param numContainer number of containers to calculate
+ * @param numLocalityAwareTasks number of locality required tasks
+ * @param hostToLocalTaskCount a map to store the preferred hostname and possible task
+ * numbers running on it, used as hints for container allocation
+ * @param allocatedHostToContainersMap host to allocated containers map, used to calculate the
+ * expected locality preference by considering the existing
+ * containers
+ * @param localityMatchedPendingAllocations A sequence of pending container request which
+ * matches the localities of current required tasks.
+ * @return node localities and rack localities, each locality is an array of string,
+ * the length of localities is the same as number of containers
+ */
+ def localityOfRequestedContainers(
+ numContainer: Int,
+ numLocalityAwareTasks: Int,
+ hostToLocalTaskCount: Map[String, Int],
+ allocatedHostToContainersMap: HashMap[String, Set[ContainerId]],
+ localityMatchedPendingAllocations: Seq[ContainerRequest]
+ ): Array[ContainerLocalityPreferences] = {
+ val updatedHostToContainerCount = expectedHostToContainerCount(
+ numLocalityAwareTasks, hostToLocalTaskCount, allocatedHostToContainersMap,
+ localityMatchedPendingAllocations)
+ val updatedLocalityAwareContainerNum = updatedHostToContainerCount.values.sum
+
+ // The number of containers to allocate, divided into two groups, one with preferred locality,
+ // and the other without locality preference.
+ val requiredLocalityFreeContainerNum =
+ math.max(0, numContainer - updatedLocalityAwareContainerNum)
+ val requiredLocalityAwareContainerNum = numContainer - requiredLocalityFreeContainerNum
+
+ val containerLocalityPreferences = ArrayBuffer[ContainerLocalityPreferences]()
+ if (requiredLocalityFreeContainerNum > 0) {
+ for (i <- 0 until requiredLocalityFreeContainerNum) {
+ containerLocalityPreferences += ContainerLocalityPreferences(
+ null.asInstanceOf[Array[String]], null.asInstanceOf[Array[String]])
+ }
+ }
+
+ if (requiredLocalityAwareContainerNum > 0) {
+ val largestRatio = updatedHostToContainerCount.values.max
+ // Round the ratio of preferred locality to the number of locality required container
+ // number, which is used for locality preferred host calculating.
+ var preferredLocalityRatio = updatedHostToContainerCount.mapValues { ratio =>
+ val adjustedRatio = ratio.toDouble * requiredLocalityAwareContainerNum / largestRatio
+ adjustedRatio.ceil.toInt
+ }
+
+ for (i <- 0 until requiredLocalityAwareContainerNum) {
+ // Only filter out the ratio which is larger than 0, which means the current host can
+ // still be allocated with new container request.
+ val hosts = preferredLocalityRatio.filter(_._2 > 0).keys.toArray
+ val racks = hosts.map { h =>
+ RackResolver.resolve(yarnConf, h).getNetworkLocation
+ }.toSet
+ containerLocalityPreferences += ContainerLocalityPreferences(hosts, racks.toArray)
+
+ // Minus 1 each time when the host is used. When the current ratio is 0,
+ // which means all the required ratio is satisfied, this host will not be allocated again.
+ preferredLocalityRatio = preferredLocalityRatio.mapValues(_ - 1)
+ }
+ }
+
+ containerLocalityPreferences.toArray
+ }
+
+ /**
+ * Calculate the number of executors need to satisfy the given number of pending tasks.
+ */
+ private def numExecutorsPending(numTasksPending: Int): Int = {
+ val coresPerExecutor = resource.getVirtualCores
+ (numTasksPending * sparkConf.get(CPUS_PER_TASK) + coresPerExecutor - 1) / coresPerExecutor
+ }
+
+ /**
+ * Calculate the expected host to number of containers by considering with allocated containers.
+ * @param localityAwareTasks number of locality aware tasks
+ * @param hostToLocalTaskCount a map to store the preferred hostname and possible task
+ * numbers running on it, used as hints for container allocation
+ * @param allocatedHostToContainersMap host to allocated containers map, used to calculate the
+ * expected locality preference by considering the existing
+ * containers
+ * @param localityMatchedPendingAllocations A sequence of pending container request which
+ * matches the localities of current required tasks.
+ * @return a map with hostname as key and required number of containers on this host as value
+ */
+ private def expectedHostToContainerCount(
+ localityAwareTasks: Int,
+ hostToLocalTaskCount: Map[String, Int],
+ allocatedHostToContainersMap: HashMap[String, Set[ContainerId]],
+ localityMatchedPendingAllocations: Seq[ContainerRequest]
+ ): Map[String, Int] = {
+ val totalLocalTaskNum = hostToLocalTaskCount.values.sum
+ val pendingHostToContainersMap = pendingHostToContainerCount(localityMatchedPendingAllocations)
+
+ hostToLocalTaskCount.map { case (host, count) =>
+ val expectedCount =
+ count.toDouble * numExecutorsPending(localityAwareTasks) / totalLocalTaskNum
+ // Take the locality of pending containers into consideration
+ val existedCount = allocatedHostToContainersMap.get(host).map(_.size).getOrElse(0) +
+ pendingHostToContainersMap.getOrElse(host, 0.0)
+
+ // If existing container can not fully satisfy the expected number of container,
+ // the required container number is expected count minus existed count. Otherwise the
+ // required container number is 0.
+ (host, math.max(0, (expectedCount - existedCount).ceil.toInt))
+ }
+ }
+
+ /**
+ * According to the locality ratio and number of container requests, calculate the host to
+ * possible number of containers for pending allocated containers.
+ *
+ * If current locality ratio of hosts is: Host1 : Host2 : Host3 = 20 : 20 : 10,
+ * and pending container requests is 3, so the possible number of containers on
+ * Host1 : Host2 : Host3 will be 1.2 : 1.2 : 0.6.
+ * @param localityMatchedPendingAllocations A sequence of pending container request which
+ * matches the localities of current required tasks.
+ * @return a Map with hostname as key and possible number of containers on this host as value
+ */
+ private def pendingHostToContainerCount(
+ localityMatchedPendingAllocations: Seq[ContainerRequest]): Map[String, Double] = {
+ val pendingHostToContainerCount = new HashMap[String, Int]()
+ localityMatchedPendingAllocations.foreach { cr =>
+ cr.getNodes.asScala.foreach { n =>
+ val count = pendingHostToContainerCount.getOrElse(n, 0) + 1
+ pendingHostToContainerCount(n) = count
+ }
+ }
+
+ val possibleTotalContainerNum = pendingHostToContainerCount.values.sum
+ val localityMatchedPendingNum = localityMatchedPendingAllocations.size.toDouble
+ pendingHostToContainerCount.mapValues(_ * localityMatchedPendingNum / possibleTotalContainerNum)
+ .toMap
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
new file mode 100644
index 0000000..0b66d1c
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
@@ -0,0 +1,727 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import java.util.Collections
+import java.util.concurrent._
+import java.util.regex.Pattern
+
+import scala.collection.mutable
+import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, Queue}
+import scala.collection.JavaConverters._
+import scala.util.control.NonFatal
+
+import org.apache.hadoop.yarn.api.records._
+import org.apache.hadoop.yarn.client.api.AMRMClient
+import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
+import org.apache.hadoop.yarn.conf.YarnConfiguration
+import org.apache.hadoop.yarn.util.RackResolver
+import org.apache.log4j.{Level, Logger}
+
+import org.apache.spark.{SecurityManager, SparkConf, SparkException}
+import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._
+import org.apache.spark.deploy.yarn.config._
+import org.apache.spark.internal.Logging
+import org.apache.spark.internal.config._
+import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef}
+import org.apache.spark.scheduler.{ExecutorExited, ExecutorLossReason}
+import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RemoveExecutor
+import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RetrieveLastAllocatedExecutorId
+import org.apache.spark.util.{Clock, SystemClock, ThreadUtils}
+
+/**
+ * YarnAllocator is charged with requesting containers from the YARN ResourceManager and deciding
+ * what to do with containers when YARN fulfills these requests.
+ *
+ * This class makes use of YARN's AMRMClient APIs. We interact with the AMRMClient in three ways:
+ * * Making our resource needs known, which updates local bookkeeping about containers requested.
+ * * Calling "allocate", which syncs our local container requests with the RM, and returns any
+ * containers that YARN has granted to us. This also functions as a heartbeat.
+ * * Processing the containers granted to us to possibly launch executors inside of them.
+ *
+ * The public methods of this class are thread-safe. All methods that mutate state are
+ * synchronized.
+ */
+private[yarn] class YarnAllocator(
+ driverUrl: String,
+ driverRef: RpcEndpointRef,
+ conf: YarnConfiguration,
+ sparkConf: SparkConf,
+ amClient: AMRMClient[ContainerRequest],
+ appAttemptId: ApplicationAttemptId,
+ securityMgr: SecurityManager,
+ localResources: Map[String, LocalResource])
+ extends Logging {
+
+ import YarnAllocator._
+
+ // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
+ if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
+ Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
+ }
+
+ // Visible for testing.
+ val allocatedHostToContainersMap = new HashMap[String, collection.mutable.Set[ContainerId]]
+ val allocatedContainerToHostMap = new HashMap[ContainerId, String]
+
+ // Containers that we no longer care about. We've either already told the RM to release them or
+ // will on the next heartbeat. Containers get removed from this map after the RM tells us they've
+ // completed.
+ private val releasedContainers = Collections.newSetFromMap[ContainerId](
+ new ConcurrentHashMap[ContainerId, java.lang.Boolean])
+
+ @volatile private var numExecutorsRunning = 0
+
+ /**
+ * Used to generate a unique ID per executor
+ *
+ * Init `executorIdCounter`. when AM restart, `executorIdCounter` will reset to 0. Then
+ * the id of new executor will start from 1, this will conflict with the executor has
+ * already created before. So, we should initialize the `executorIdCounter` by getting
+ * the max executorId from driver.
+ *
+ * And this situation of executorId conflict is just in yarn client mode, so this is an issue
+ * in yarn client mode. For more details, can check in jira.
+ *
+ * @see SPARK-12864
+ */
+ private var executorIdCounter: Int =
+ driverRef.askWithRetry[Int](RetrieveLastAllocatedExecutorId)
+
+ // Queue to store the timestamp of failed executors
+ private val failedExecutorsTimeStamps = new Queue[Long]()
+
+ private var clock: Clock = new SystemClock
+
+ private val executorFailuresValidityInterval =
+ sparkConf.get(EXECUTOR_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS).getOrElse(-1L)
+
+ @volatile private var targetNumExecutors =
+ YarnSparkHadoopUtil.getInitialTargetExecutorNumber(sparkConf)
+
+ // Executor loss reason requests that are pending - maps from executor ID for inquiry to a
+ // list of requesters that should be responded to once we find out why the given executor
+ // was lost.
+ private val pendingLossReasonRequests = new HashMap[String, mutable.Buffer[RpcCallContext]]
+
+ // Maintain loss reasons for already released executors, it will be added when executor loss
+ // reason is got from AM-RM call, and be removed after querying this loss reason.
+ private val releasedExecutorLossReasons = new HashMap[String, ExecutorLossReason]
+
+ // Keep track of which container is running which executor to remove the executors later
+ // Visible for testing.
+ private[yarn] val executorIdToContainer = new HashMap[String, Container]
+
+ private var numUnexpectedContainerRelease = 0L
+ private val containerIdToExecutorId = new HashMap[ContainerId, String]
+
+ // Executor memory in MB.
+ protected val executorMemory = sparkConf.get(EXECUTOR_MEMORY).toInt
+ // Additional memory overhead.
+ protected val memoryOverhead: Int = sparkConf.get(EXECUTOR_MEMORY_OVERHEAD).getOrElse(
+ math.max((MEMORY_OVERHEAD_FACTOR * executorMemory).toInt, MEMORY_OVERHEAD_MIN)).toInt
+ // Number of cores per executor.
+ protected val executorCores = sparkConf.get(EXECUTOR_CORES)
+ // Resource capability requested for each executors
+ private[yarn] val resource = Resource.newInstance(executorMemory + memoryOverhead, executorCores)
+
+ private val launcherPool = ThreadUtils.newDaemonCachedThreadPool(
+ "ContainerLauncher", sparkConf.get(CONTAINER_LAUNCH_MAX_THREADS))
+
+ // For testing
+ private val launchContainers = sparkConf.getBoolean("spark.yarn.launchContainers", true)
+
+ private val labelExpression = sparkConf.get(EXECUTOR_NODE_LABEL_EXPRESSION)
+
+ // ContainerRequest constructor that can take a node label expression. We grab it through
+ // reflection because it's only available in later versions of YARN.
+ private val nodeLabelConstructor = labelExpression.flatMap { expr =>
+ try {
+ Some(classOf[ContainerRequest].getConstructor(classOf[Resource],
+ classOf[Array[String]], classOf[Array[String]], classOf[Priority], classOf[Boolean],
+ classOf[String]))
+ } catch {
+ case e: NoSuchMethodException =>
+ logWarning(s"Node label expression $expr will be ignored because YARN version on" +
+ " classpath does not support it.")
+ None
+ }
+ }
+
+ // A map to store preferred hostname and possible task numbers running on it.
+ private var hostToLocalTaskCounts: Map[String, Int] = Map.empty
+
+ // Number of tasks that have locality preferences in active stages
+ private var numLocalityAwareTasks: Int = 0
+
+ // A container placement strategy based on pending tasks' locality preference
+ private[yarn] val containerPlacementStrategy =
+ new LocalityPreferredContainerPlacementStrategy(sparkConf, conf, resource)
+
+ /**
+ * Use a different clock for YarnAllocator. This is mainly used for testing.
+ */
+ def setClock(newClock: Clock): Unit = {
+ clock = newClock
+ }
+
+ def getNumExecutorsRunning: Int = numExecutorsRunning
+
+ def getNumExecutorsFailed: Int = synchronized {
+ val endTime = clock.getTimeMillis()
+
+ while (executorFailuresValidityInterval > 0
+ && failedExecutorsTimeStamps.nonEmpty
+ && failedExecutorsTimeStamps.head < endTime - executorFailuresValidityInterval) {
+ failedExecutorsTimeStamps.dequeue()
+ }
+
+ failedExecutorsTimeStamps.size
+ }
+
+ /**
+ * A sequence of pending container requests that have not yet been fulfilled.
+ */
+ def getPendingAllocate: Seq[ContainerRequest] = getPendingAtLocation(ANY_HOST)
+
+ /**
+ * A sequence of pending container requests at the given location that have not yet been
+ * fulfilled.
+ */
+ private def getPendingAtLocation(location: String): Seq[ContainerRequest] = {
+ amClient.getMatchingRequests(RM_REQUEST_PRIORITY, location, resource).asScala
+ .flatMap(_.asScala)
+ .toSeq
+ }
+
+ /**
+ * Request as many executors from the ResourceManager as needed to reach the desired total. If
+ * the requested total is smaller than the current number of running executors, no executors will
+ * be killed.
+ * @param requestedTotal total number of containers requested
+ * @param localityAwareTasks number of locality aware tasks to be used as container placement hint
+ * @param hostToLocalTaskCount a map of preferred hostname to possible task counts to be used as
+ * container placement hint.
+ * @return Whether the new requested total is different than the old value.
+ */
+ def requestTotalExecutorsWithPreferredLocalities(
+ requestedTotal: Int,
+ localityAwareTasks: Int,
+ hostToLocalTaskCount: Map[String, Int]): Boolean = synchronized {
+ this.numLocalityAwareTasks = localityAwareTasks
+ this.hostToLocalTaskCounts = hostToLocalTaskCount
+
+ if (requestedTotal != targetNumExecutors) {
+ logInfo(s"Driver requested a total number of $requestedTotal executor(s).")
+ targetNumExecutors = requestedTotal
+ true
+ } else {
+ false
+ }
+ }
+
+ /**
+ * Request that the ResourceManager release the container running the specified executor.
+ */
+ def killExecutor(executorId: String): Unit = synchronized {
+ if (executorIdToContainer.contains(executorId)) {
+ val container = executorIdToContainer.get(executorId).get
+ internalReleaseContainer(container)
+ numExecutorsRunning -= 1
+ } else {
+ logWarning(s"Attempted to kill unknown executor $executorId!")
+ }
+ }
+
+ /**
+ * Request resources such that, if YARN gives us all we ask for, we'll have a number of containers
+ * equal to maxExecutors.
+ *
+ * Deal with any containers YARN has granted to us by possibly launching executors in them.
+ *
+ * This must be synchronized because variables read in this method are mutated by other methods.
+ */
+ def allocateResources(): Unit = synchronized {
+ updateResourceRequests()
+
+ val progressIndicator = 0.1f
+ // Poll the ResourceManager. This doubles as a heartbeat if there are no pending container
+ // requests.
+ val allocateResponse = amClient.allocate(progressIndicator)
+
+ val allocatedContainers = allocateResponse.getAllocatedContainers()
+
+ if (allocatedContainers.size > 0) {
+ logDebug("Allocated containers: %d. Current executor count: %d. Cluster resources: %s."
+ .format(
+ allocatedContainers.size,
+ numExecutorsRunning,
+ allocateResponse.getAvailableResources))
+
+ handleAllocatedContainers(allocatedContainers.asScala)
+ }
+
+ val completedContainers = allocateResponse.getCompletedContainersStatuses()
+ if (completedContainers.size > 0) {
+ logDebug("Completed %d containers".format(completedContainers.size))
+ processCompletedContainers(completedContainers.asScala)
+ logDebug("Finished processing %d completed containers. Current running executor count: %d."
+ .format(completedContainers.size, numExecutorsRunning))
+ }
+ }
+
+ /**
+ * Update the set of container requests that we will sync with the RM based on the number of
+ * executors we have currently running and our target number of executors.
+ *
+ * Visible for testing.
+ */
+ def updateResourceRequests(): Unit = {
+ val pendingAllocate = getPendingAllocate
+ val numPendingAllocate = pendingAllocate.size
+ val missing = targetNumExecutors - numPendingAllocate - numExecutorsRunning
+
+ if (missing > 0) {
+ logInfo(s"Will request $missing executor container(s), each with " +
+ s"${resource.getVirtualCores} core(s) and " +
+ s"${resource.getMemory} MB memory (including $memoryOverhead MB of overhead)")
+
+ // Split the pending container request into three groups: locality matched list, locality
+ // unmatched list and non-locality list. Take the locality matched container request into
+ // consideration of container placement, treat as allocated containers.
+ // For locality unmatched and locality free container requests, cancel these container
+ // requests, since required locality preference has been changed, recalculating using
+ // container placement strategy.
+ val (localRequests, staleRequests, anyHostRequests) = splitPendingAllocationsByLocality(
+ hostToLocalTaskCounts, pendingAllocate)
+
+ // cancel "stale" requests for locations that are no longer needed
+ staleRequests.foreach { stale =>
+ amClient.removeContainerRequest(stale)
+ }
+ val cancelledContainers = staleRequests.size
+ if (cancelledContainers > 0) {
+ logInfo(s"Canceled $cancelledContainers container request(s) (locality no longer needed)")
+ }
+
+ // consider the number of new containers and cancelled stale containers available
+ val availableContainers = missing + cancelledContainers
+
+ // to maximize locality, include requests with no locality preference that can be cancelled
+ val potentialContainers = availableContainers + anyHostRequests.size
+
+ val containerLocalityPreferences = containerPlacementStrategy.localityOfRequestedContainers(
+ potentialContainers, numLocalityAwareTasks, hostToLocalTaskCounts,
+ allocatedHostToContainersMap, localRequests)
+
+ val newLocalityRequests = new mutable.ArrayBuffer[ContainerRequest]
+ containerLocalityPreferences.foreach {
+ case ContainerLocalityPreferences(nodes, racks) if nodes != null =>
+ newLocalityRequests += createContainerRequest(resource, nodes, racks)
+ case _ =>
+ }
+
+ if (availableContainers >= newLocalityRequests.size) {
+ // more containers are available than needed for locality, fill in requests for any host
+ for (i <- 0 until (availableContainers - newLocalityRequests.size)) {
+ newLocalityRequests += createContainerRequest(resource, null, null)
+ }
+ } else {
+ val numToCancel = newLocalityRequests.size - availableContainers
+ // cancel some requests without locality preferences to schedule more local containers
+ anyHostRequests.slice(0, numToCancel).foreach { nonLocal =>
+ amClient.removeContainerRequest(nonLocal)
+ }
+ if (numToCancel > 0) {
+ logInfo(s"Canceled $numToCancel unlocalized container requests to resubmit with locality")
+ }
+ }
+
+ newLocalityRequests.foreach { request =>
+ amClient.addContainerRequest(request)
+ }
+
+ if (log.isInfoEnabled()) {
+ val (localized, anyHost) = newLocalityRequests.partition(_.getNodes() != null)
+ if (anyHost.nonEmpty) {
+ logInfo(s"Submitted ${anyHost.size} unlocalized container requests.")
+ }
+ localized.foreach { request =>
+ logInfo(s"Submitted container request for host ${hostStr(request)}.")
+ }
+ }
+ } else if (numPendingAllocate > 0 && missing < 0) {
+ val numToCancel = math.min(numPendingAllocate, -missing)
+ logInfo(s"Canceling requests for $numToCancel executor container(s) to have a new desired " +
+ s"total $targetNumExecutors executors.")
+
+ val matchingRequests = amClient.getMatchingRequests(RM_REQUEST_PRIORITY, ANY_HOST, resource)
+ if (!matchingRequests.isEmpty) {
+ matchingRequests.iterator().next().asScala
+ .take(numToCancel).foreach(amClient.removeContainerRequest)
+ } else {
+ logWarning("Expected to find pending requests, but found none.")
+ }
+ }
+ }
+
+ private def hostStr(request: ContainerRequest): String = {
+ Option(request.getNodes) match {
+ case Some(nodes) => nodes.asScala.mkString(",")
+ case None => "Any"
+ }
+ }
+
+ /**
+ * Creates a container request, handling the reflection required to use YARN features that were
+ * added in recent versions.
+ */
+ private def createContainerRequest(
+ resource: Resource,
+ nodes: Array[String],
+ racks: Array[String]): ContainerRequest = {
+ nodeLabelConstructor.map { constructor =>
+ constructor.newInstance(resource, nodes, racks, RM_REQUEST_PRIORITY, true: java.lang.Boolean,
+ labelExpression.orNull)
+ }.getOrElse(new ContainerRequest(resource, nodes, racks, RM_REQUEST_PRIORITY))
+ }
+
+ /**
+ * Handle containers granted by the RM by launching executors on them.
+ *
+ * Due to the way the YARN allocation protocol works, certain healthy race conditions can result
+ * in YARN granting containers that we no longer need. In this case, we release them.
+ *
+ * Visible for testing.
+ */
+ def handleAllocatedContainers(allocatedContainers: Seq[Container]): Unit = {
+ val containersToUse = new ArrayBuffer[Container](allocatedContainers.size)
+
+ // Match incoming requests by host
+ val remainingAfterHostMatches = new ArrayBuffer[Container]
+ for (allocatedContainer <- allocatedContainers) {
+ matchContainerToRequest(allocatedContainer, allocatedContainer.getNodeId.getHost,
+ containersToUse, remainingAfterHostMatches)
+ }
+
+ // Match remaining by rack
+ val remainingAfterRackMatches = new ArrayBuffer[Container]
+ for (allocatedContainer <- remainingAfterHostMatches) {
+ val rack = RackResolver.resolve(conf, allocatedContainer.getNodeId.getHost).getNetworkLocation
+ matchContainerToRequest(allocatedContainer, rack, containersToUse,
+ remainingAfterRackMatches)
+ }
+
+ // Assign remaining that are neither node-local nor rack-local
+ val remainingAfterOffRackMatches = new ArrayBuffer[Container]
+ for (allocatedContainer <- remainingAfterRackMatches) {
+ matchContainerToRequest(allocatedContainer, ANY_HOST, containersToUse,
+ remainingAfterOffRackMatches)
+ }
+
+ if (!remainingAfterOffRackMatches.isEmpty) {
+ logDebug(s"Releasing ${remainingAfterOffRackMatches.size} unneeded containers that were " +
+ s"allocated to us")
+ for (container <- remainingAfterOffRackMatches) {
+ internalReleaseContainer(container)
+ }
+ }
+
+ runAllocatedContainers(containersToUse)
+
+ logInfo("Received %d containers from YARN, launching executors on %d of them."
+ .format(allocatedContainers.size, containersToUse.size))
+ }
+
+ /**
+ * Looks for requests for the given location that match the given container allocation. If it
+ * finds one, removes the request so that it won't be submitted again. Places the container into
+ * containersToUse or remaining.
+ *
+ * @param allocatedContainer container that was given to us by YARN
+ * @param location resource name, either a node, rack, or *
+ * @param containersToUse list of containers that will be used
+ * @param remaining list of containers that will not be used
+ */
+ private def matchContainerToRequest(
+ allocatedContainer: Container,
+ location: String,
+ containersToUse: ArrayBuffer[Container],
+ remaining: ArrayBuffer[Container]): Unit = {
+ // SPARK-6050: certain Yarn configurations return a virtual core count that doesn't match the
+ // request; for example, capacity scheduler + DefaultResourceCalculator. So match on requested
+ // memory, but use the asked vcore count for matching, effectively disabling matching on vcore
+ // count.
+ val matchingResource = Resource.newInstance(allocatedContainer.getResource.getMemory,
+ resource.getVirtualCores)
+ val matchingRequests = amClient.getMatchingRequests(allocatedContainer.getPriority, location,
+ matchingResource)
+
+ // Match the allocation to a request
+ if (!matchingRequests.isEmpty) {
+ val containerRequest = matchingRequests.get(0).iterator.next
+ amClient.removeContainerRequest(containerRequest)
+ containersToUse += allocatedContainer
+ } else {
+ remaining += allocatedContainer
+ }
+ }
+
+ /**
+ * Launches executors in the allocated containers.
+ */
+ private def runAllocatedContainers(containersToUse: ArrayBuffer[Container]): Unit = {
+ for (container <- containersToUse) {
+ executorIdCounter += 1
+ val executorHostname = container.getNodeId.getHost
+ val containerId = container.getId
+ val executorId = executorIdCounter.toString
+ assert(container.getResource.getMemory >= resource.getMemory)
+ logInfo(s"Launching container $containerId on host $executorHostname")
+
+ def updateInternalState(): Unit = synchronized {
+ numExecutorsRunning += 1
+ executorIdToContainer(executorId) = container
+ containerIdToExecutorId(container.getId) = executorId
+
+ val containerSet = allocatedHostToContainersMap.getOrElseUpdate(executorHostname,
+ new HashSet[ContainerId])
+ containerSet += containerId
+ allocatedContainerToHostMap.put(containerId, executorHostname)
+ }
+
+ if (numExecutorsRunning < targetNumExecutors) {
+ if (launchContainers) {
+ launcherPool.execute(new Runnable {
+ override def run(): Unit = {
+ try {
+ new ExecutorRunnable(
+ Some(container),
+ conf,
+ sparkConf,
+ driverUrl,
+ executorId,
+ executorHostname,
+ executorMemory,
+ executorCores,
+ appAttemptId.getApplicationId.toString,
+ securityMgr,
+ localResources
+ ).run()
+ updateInternalState()
+ } catch {
+ case NonFatal(e) =>
+ logError(s"Failed to launch executor $executorId on container $containerId", e)
+ // Assigned container should be released immediately to avoid unnecessary resource
+ // occupation.
+ amClient.releaseAssignedContainer(containerId)
+ }
+ }
+ })
+ } else {
+ // For test only
+ updateInternalState()
+ }
+ } else {
+ logInfo(("Skip launching executorRunnable as runnning Excecutors count: %d " +
+ "reached target Executors count: %d.").format(numExecutorsRunning, targetNumExecutors))
+ }
+ }
+ }
+
+ // Visible for testing.
+ private[yarn] def processCompletedContainers(completedContainers: Seq[ContainerStatus]): Unit = {
+ for (completedContainer <- completedContainers) {
+ val containerId = completedContainer.getContainerId
+ val alreadyReleased = releasedContainers.remove(containerId)
+ val hostOpt = allocatedContainerToHostMap.get(containerId)
+ val onHostStr = hostOpt.map(host => s" on host: $host").getOrElse("")
+ val exitReason = if (!alreadyReleased) {
+ // Decrement the number of executors running. The next iteration of
+ // the ApplicationMaster's reporting thread will take care of allocating.
+ numExecutorsRunning -= 1
+ logInfo("Completed container %s%s (state: %s, exit status: %s)".format(
+ containerId,
+ onHostStr,
+ completedContainer.getState,
+ completedContainer.getExitStatus))
+ // Hadoop 2.2.X added a ContainerExitStatus we should switch to use
+ // there are some exit status' we shouldn't necessarily count against us, but for
+ // now I think its ok as none of the containers are expected to exit.
+ val exitStatus = completedContainer.getExitStatus
+ val (exitCausedByApp, containerExitReason) = exitStatus match {
+ case ContainerExitStatus.SUCCESS =>
+ (false, s"Executor for container $containerId exited because of a YARN event (e.g., " +
+ "pre-emption) and not because of an error in the running job.")
+ case ContainerExitStatus.PREEMPTED =>
+ // Preemption is not the fault of the running tasks, since YARN preempts containers
+ // merely to do resource sharing, and tasks that fail due to preempted executors could
+ // just as easily finish on any other executor. See SPARK-8167.
+ (false, s"Container ${containerId}${onHostStr} was preempted.")
+ // Should probably still count memory exceeded exit codes towards task failures
+ case VMEM_EXCEEDED_EXIT_CODE =>
+ (true, memLimitExceededLogMessage(
+ completedContainer.getDiagnostics,
+ VMEM_EXCEEDED_PATTERN))
+ case PMEM_EXCEEDED_EXIT_CODE =>
+ (true, memLimitExceededLogMessage(
+ completedContainer.getDiagnostics,
+ PMEM_EXCEEDED_PATTERN))
+ case _ =>
+ // Enqueue the timestamp of failed executor
+ failedExecutorsTimeStamps.enqueue(clock.getTimeMillis())
+ (true, "Container marked as failed: " + containerId + onHostStr +
+ ". Exit status: " + completedContainer.getExitStatus +
+ ". Diagnostics: " + completedContainer.getDiagnostics)
+
+ }
+ if (exitCausedByApp) {
+ logWarning(containerExitReason)
+ } else {
+ logInfo(containerExitReason)
+ }
+ ExecutorExited(exitStatus, exitCausedByApp, containerExitReason)
+ } else {
+ // If we have already released this container, then it must mean
+ // that the driver has explicitly requested it to be killed
+ ExecutorExited(completedContainer.getExitStatus, exitCausedByApp = false,
+ s"Container $containerId exited from explicit termination request.")
+ }
+
+ for {
+ host <- hostOpt
+ containerSet <- allocatedHostToContainersMap.get(host)
+ } {
+ containerSet.remove(containerId)
+ if (containerSet.isEmpty) {
+ allocatedHostToContainersMap.remove(host)
+ } else {
+ allocatedHostToContainersMap.update(host, containerSet)
+ }
+
+ allocatedContainerToHostMap.remove(containerId)
+ }
+
+ containerIdToExecutorId.remove(containerId).foreach { eid =>
+ executorIdToContainer.remove(eid)
+ pendingLossReasonRequests.remove(eid) match {
+ case Some(pendingRequests) =>
+ // Notify application of executor loss reasons so it can decide whether it should abort
+ pendingRequests.foreach(_.reply(exitReason))
+
+ case None =>
+ // We cannot find executor for pending reasons. This is because completed container
+ // is processed before querying pending result. We should store it for later query.
+ // This is usually happened when explicitly killing a container, the result will be
+ // returned in one AM-RM communication. So query RPC will be later than this completed
+ // container process.
+ releasedExecutorLossReasons.put(eid, exitReason)
+ }
+ if (!alreadyReleased) {
+ // The executor could have gone away (like no route to host, node failure, etc)
+ // Notify backend about the failure of the executor
+ numUnexpectedContainerRelease += 1
+ driverRef.send(RemoveExecutor(eid, exitReason))
+ }
+ }
+ }
+ }
+
+ /**
+ * Register that some RpcCallContext has asked the AM why the executor was lost. Note that
+ * we can only find the loss reason to send back in the next call to allocateResources().
+ */
+ private[yarn] def enqueueGetLossReasonRequest(
+ eid: String,
+ context: RpcCallContext): Unit = synchronized {
+ if (executorIdToContainer.contains(eid)) {
+ pendingLossReasonRequests
+ .getOrElseUpdate(eid, new ArrayBuffer[RpcCallContext]) += context
+ } else if (releasedExecutorLossReasons.contains(eid)) {
+ // Executor is already released explicitly before getting the loss reason, so directly send
+ // the pre-stored lost reason
+ context.reply(releasedExecutorLossReasons.remove(eid).get)
+ } else {
+ logWarning(s"Tried to get the loss reason for non-existent executor $eid")
+ context.sendFailure(
+ new SparkException(s"Fail to find loss reason for non-existent executor $eid"))
+ }
+ }
+
+ private def internalReleaseContainer(container: Container): Unit = {
+ releasedContainers.add(container.getId())
+ amClient.releaseAssignedContainer(container.getId())
+ }
+
+ private[yarn] def getNumUnexpectedContainerRelease = numUnexpectedContainerRelease
+
+ private[yarn] def getNumPendingLossReasonRequests: Int = synchronized {
+ pendingLossReasonRequests.size
+ }
+
+ /**
+ * Split the pending container requests into 3 groups based on current localities of pending
+ * tasks.
+ * @param hostToLocalTaskCount a map of preferred hostname to possible task counts to be used as
+ * container placement hint.
+ * @param pendingAllocations A sequence of pending allocation container request.
+ * @return A tuple of 3 sequences, first is a sequence of locality matched container
+ * requests, second is a sequence of locality unmatched container requests, and third is a
+ * sequence of locality free container requests.
+ */
+ private def splitPendingAllocationsByLocality(
+ hostToLocalTaskCount: Map[String, Int],
+ pendingAllocations: Seq[ContainerRequest]
+ ): (Seq[ContainerRequest], Seq[ContainerRequest], Seq[ContainerRequest]) = {
+ val localityMatched = ArrayBuffer[ContainerRequest]()
+ val localityUnMatched = ArrayBuffer[ContainerRequest]()
+ val localityFree = ArrayBuffer[ContainerRequest]()
+
+ val preferredHosts = hostToLocalTaskCount.keySet
+ pendingAllocations.foreach { cr =>
+ val nodes = cr.getNodes
+ if (nodes == null) {
+ localityFree += cr
+ } else if (nodes.asScala.toSet.intersect(preferredHosts).nonEmpty) {
+ localityMatched += cr
+ } else {
+ localityUnMatched += cr
+ }
+ }
+
+ (localityMatched.toSeq, localityUnMatched.toSeq, localityFree.toSeq)
+ }
+
+}
+
+private object YarnAllocator {
+ val MEM_REGEX = "[0-9.]+ [KMG]B"
+ val PMEM_EXCEEDED_PATTERN =
+ Pattern.compile(s"$MEM_REGEX of $MEM_REGEX physical memory used")
+ val VMEM_EXCEEDED_PATTERN =
+ Pattern.compile(s"$MEM_REGEX of $MEM_REGEX virtual memory used")
+ val VMEM_EXCEEDED_EXIT_CODE = -103
+ val PMEM_EXCEEDED_EXIT_CODE = -104
+
+ def memLimitExceededLogMessage(diagnostics: String, pattern: Pattern): String = {
+ val matcher = pattern.matcher(diagnostics)
+ val diag = if (matcher.find()) " " + matcher.group() + "." else ""
+ ("Container killed by YARN for exceeding memory limits." + diag
+ + " Consider boosting spark.yarn.executor.memoryOverhead.")
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
new file mode 100644
index 0000000..53df11e
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import java.util.{List => JList}
+
+import scala.collection.JavaConverters._
+import scala.util.Try
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.yarn.api.records._
+import org.apache.hadoop.yarn.client.api.AMRMClient
+import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
+import org.apache.hadoop.yarn.conf.YarnConfiguration
+import org.apache.hadoop.yarn.webapp.util.WebAppUtils
+
+import org.apache.spark.{SecurityManager, SparkConf}
+import org.apache.spark.deploy.yarn.config._
+import org.apache.spark.internal.Logging
+import org.apache.spark.rpc.RpcEndpointRef
+import org.apache.spark.util.Utils
+
+/**
+ * Handles registering and unregistering the application with the YARN ResourceManager.
+ */
+private[spark] class YarnRMClient extends Logging {
+
+ private var amClient: AMRMClient[ContainerRequest] = _
+ private var uiHistoryAddress: String = _
+ private var registered: Boolean = false
+
+ /**
+ * Registers the application master with the RM.
+ *
+ * @param conf The Yarn configuration.
+ * @param sparkConf The Spark configuration.
+ * @param uiAddress Address of the SparkUI.
+ * @param uiHistoryAddress Address of the application on the History Server.
+ * @param securityMgr The security manager.
+ * @param localResources Map with information about files distributed via YARN's cache.
+ */
+ def register(
+ driverUrl: String,
+ driverRef: RpcEndpointRef,
+ conf: YarnConfiguration,
+ sparkConf: SparkConf,
+ uiAddress: String,
+ uiHistoryAddress: String,
+ securityMgr: SecurityManager,
+ localResources: Map[String, LocalResource]
+ ): YarnAllocator = {
+ amClient = AMRMClient.createAMRMClient()
+ amClient.init(conf)
+ amClient.start()
+ this.uiHistoryAddress = uiHistoryAddress
+
+ logInfo("Registering the ApplicationMaster")
+ synchronized {
+ amClient.registerApplicationMaster(Utils.localHostName(), 0, uiAddress)
+ registered = true
+ }
+ new YarnAllocator(driverUrl, driverRef, conf, sparkConf, amClient, getAttemptId(), securityMgr,
+ localResources)
+ }
+
+ /**
+ * Unregister the AM. Guaranteed to only be called once.
+ *
+ * @param status The final status of the AM.
+ * @param diagnostics Diagnostics message to include in the final status.
+ */
+ def unregister(status: FinalApplicationStatus, diagnostics: String = ""): Unit = synchronized {
+ if (registered) {
+ amClient.unregisterApplicationMaster(status, diagnostics, uiHistoryAddress)
+ }
+ }
+
+ /** Returns the attempt ID. */
+ def getAttemptId(): ApplicationAttemptId = {
+ YarnSparkHadoopUtil.get.getContainerId.getApplicationAttemptId()
+ }
+
+ /** Returns the configuration for the AmIpFilter to add to the Spark UI. */
+ def getAmIpFilterParams(conf: YarnConfiguration, proxyBase: String): Map[String, String] = {
+ // Figure out which scheme Yarn is using. Note the method seems to have been added after 2.2,
+ // so not all stable releases have it.
+ val prefix = Try(classOf[WebAppUtils].getMethod("getHttpSchemePrefix", classOf[Configuration])
+ .invoke(null, conf).asInstanceOf[String]).getOrElse("http://")
+
+ // If running a new enough Yarn, use the HA-aware API for retrieving the RM addresses.
+ try {
+ val method = classOf[WebAppUtils].getMethod("getProxyHostsAndPortsForAmFilter",
+ classOf[Configuration])
+ val proxies = method.invoke(null, conf).asInstanceOf[JList[String]]
+ val hosts = proxies.asScala.map { proxy => proxy.split(":")(0) }
+ val uriBases = proxies.asScala.map { proxy => prefix + proxy + proxyBase }
+ Map("PROXY_HOSTS" -> hosts.mkString(","), "PROXY_URI_BASES" -> uriBases.mkString(","))
+ } catch {
+ case e: NoSuchMethodException =>
+ val proxy = WebAppUtils.getProxyHostAndPort(conf)
+ val parts = proxy.split(":")
+ val uriBase = prefix + proxy + proxyBase
+ Map("PROXY_HOST" -> parts(0), "PROXY_URI_BASE" -> uriBase)
+ }
+ }
+
+ /** Returns the maximum number of attempts to register the AM. */
+ def getMaxRegAttempts(sparkConf: SparkConf, yarnConf: YarnConfiguration): Int = {
+ val sparkMaxAttempts = sparkConf.get(MAX_APP_ATTEMPTS).map(_.toInt)
+ val yarnMaxAttempts = yarnConf.getInt(
+ YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS)
+ val retval: Int = sparkMaxAttempts match {
+ case Some(x) => if (x <= yarnMaxAttempts) x else yarnMaxAttempts
+ case None => yarnMaxAttempts
+ }
+
+ retval
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala
new file mode 100644
index 0000000..cc53b1b
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala
@@ -0,0 +1,317 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import java.io.File
+import java.nio.charset.StandardCharsets.UTF_8
+import java.util.regex.Matcher
+import java.util.regex.Pattern
+
+import scala.collection.mutable.{HashMap, ListBuffer}
+import scala.util.Try
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.io.Text
+import org.apache.hadoop.mapred.JobConf
+import org.apache.hadoop.security.Credentials
+import org.apache.hadoop.security.UserGroupInformation
+import org.apache.hadoop.yarn.api.ApplicationConstants
+import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
+import org.apache.hadoop.yarn.api.records.{ApplicationAccessType, ContainerId, Priority}
+import org.apache.hadoop.yarn.conf.YarnConfiguration
+import org.apache.hadoop.yarn.util.ConverterUtils
+
+import org.apache.spark.{SecurityManager, SparkConf, SparkException}
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.deploy.yarn.security.{ConfigurableCredentialManager, CredentialUpdater}
+import org.apache.spark.internal.config._
+import org.apache.spark.launcher.YarnCommandBuilderUtils
+import org.apache.spark.util.Utils
+
+/**
+ * Contains util methods to interact with Hadoop from spark.
+ */
+class YarnSparkHadoopUtil extends SparkHadoopUtil {
+
+ private var credentialUpdater: CredentialUpdater = _
+
+ override def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation) {
+ dest.addCredentials(source.getCredentials())
+ }
+
+ // Note that all params which start with SPARK are propagated all the way through, so if in yarn
+ // mode, this MUST be set to true.
+ override def isYarnMode(): Boolean = { true }
+
+ // Return an appropriate (subclass) of Configuration. Creating a config initializes some Hadoop
+ // subsystems. Always create a new config, don't reuse yarnConf.
+ override def newConfiguration(conf: SparkConf): Configuration =
+ new YarnConfiguration(super.newConfiguration(conf))
+
+ // Add any user credentials to the job conf which are necessary for running on a secure Hadoop
+ // cluster
+ override def addCredentials(conf: JobConf) {
+ val jobCreds = conf.getCredentials()
+ jobCreds.mergeAll(UserGroupInformation.getCurrentUser().getCredentials())
+ }
+
+ override def getCurrentUserCredentials(): Credentials = {
+ UserGroupInformation.getCurrentUser().getCredentials()
+ }
+
+ override def addCurrentUserCredentials(creds: Credentials) {
+ UserGroupInformation.getCurrentUser().addCredentials(creds)
+ }
+
+ override def addSecretKeyToUserCredentials(key: String, secret: String) {
+ val creds = new Credentials()
+ creds.addSecretKey(new Text(key), secret.getBytes(UTF_8))
+ addCurrentUserCredentials(creds)
+ }
+
+ override def getSecretKeyFromUserCredentials(key: String): Array[Byte] = {
+ val credentials = getCurrentUserCredentials()
+ if (credentials != null) credentials.getSecretKey(new Text(key)) else null
+ }
+
+ private[spark] override def startCredentialUpdater(sparkConf: SparkConf): Unit = {
+ credentialUpdater =
+ new ConfigurableCredentialManager(sparkConf, newConfiguration(sparkConf)).credentialUpdater()
+ credentialUpdater.start()
+ }
+
+ private[spark] override def stopCredentialUpdater(): Unit = {
+ if (credentialUpdater != null) {
+ credentialUpdater.stop()
+ credentialUpdater = null
+ }
+ }
+
+ private[spark] def getContainerId: ContainerId = {
+ val containerIdString = System.getenv(ApplicationConstants.Environment.CONTAINER_ID.name())
+ ConverterUtils.toContainerId(containerIdString)
+ }
+}
+
+object YarnSparkHadoopUtil {
+ // Additional memory overhead
+ // 10% was arrived at experimentally. In the interest of minimizing memory waste while covering
+ // the common cases. Memory overhead tends to grow with container size.
+
+ val MEMORY_OVERHEAD_FACTOR = 0.10
+ val MEMORY_OVERHEAD_MIN = 384L
+
+ val ANY_HOST = "*"
+
+ val DEFAULT_NUMBER_EXECUTORS = 2
+
+ // All RM requests are issued with same priority : we do not (yet) have any distinction between
+ // request types (like map/reduce in hadoop for example)
+ val RM_REQUEST_PRIORITY = Priority.newInstance(1)
+
+ def get: YarnSparkHadoopUtil = {
+ val yarnMode = java.lang.Boolean.parseBoolean(
+ System.getProperty("SPARK_YARN_MODE", System.getenv("SPARK_YARN_MODE")))
+ if (!yarnMode) {
+ throw new SparkException("YarnSparkHadoopUtil is not available in non-YARN mode!")
+ }
+ SparkHadoopUtil.get.asInstanceOf[YarnSparkHadoopUtil]
+ }
+ /**
+ * Add a path variable to the given environment map.
+ * If the map already contains this key, append the value to the existing value instead.
+ */
+ def addPathToEnvironment(env: HashMap[String, String], key: String, value: String): Unit = {
+ val newValue = if (env.contains(key)) { env(key) + getClassPathSeparator + value } else value
+ env.put(key, newValue)
+ }
+
+ /**
+ * Set zero or more environment variables specified by the given input string.
+ * The input string is expected to take the form "KEY1=VAL1,KEY2=VAL2,KEY3=VAL3".
+ */
+ def setEnvFromInputString(env: HashMap[String, String], inputString: String): Unit = {
+ if (inputString != null && inputString.length() > 0) {
+ val childEnvs = inputString.split(",")
+ val p = Pattern.compile(environmentVariableRegex)
+ for (cEnv <- childEnvs) {
+ val parts = cEnv.split("=") // split on '='
+ val m = p.matcher(parts(1))
+ val sb = new StringBuffer
+ while (m.find()) {
+ val variable = m.group(1)
+ var replace = ""
+ if (env.get(variable) != None) {
+ replace = env.get(variable).get
+ } else {
+ // if this key is not configured for the child .. get it from the env
+ replace = System.getenv(variable)
+ if (replace == null) {
+ // the env key is note present anywhere .. simply set it
+ replace = ""
+ }
+ }
+ m.appendReplacement(sb, Matcher.quoteReplacement(replace))
+ }
+ m.appendTail(sb)
+ // This treats the environment variable as path variable delimited by `File.pathSeparator`
+ // This is kept for backward compatibility and consistency with Hadoop's behavior
+ addPathToEnvironment(env, parts(0), sb.toString)
+ }
+ }
+ }
+
+ private val environmentVariableRegex: String = {
+ if (Utils.isWindows) {
+ "%([A-Za-z_][A-Za-z0-9_]*?)%"
+ } else {
+ "\\$([A-Za-z_][A-Za-z0-9_]*)"
+ }
+ }
+
+ /**
+ * Kill if OOM is raised - leverage yarn's failure handling to cause rescheduling.
+ * Not killing the task leaves various aspects of the executor and (to some extent) the jvm in
+ * an inconsistent state.
+ * TODO: If the OOM is not recoverable by rescheduling it on different node, then do
+ * 'something' to fail job ... akin to blacklisting trackers in mapred ?
+ *
+ * The handler if an OOM Exception is thrown by the JVM must be configured on Windows
+ * differently: the 'taskkill' command should be used, whereas Unix-based systems use 'kill'.
+ *
+ * As the JVM interprets both %p and %%p as the same, we can use either of them. However,
+ * some tests on Windows computers suggest, that the JVM only accepts '%%p'.
+ *
+ * Furthermore, the behavior of the character '%' on the Windows command line differs from
+ * the behavior of '%' in a .cmd file: it gets interpreted as an incomplete environment
+ * variable. Windows .cmd files escape a '%' by '%%'. Thus, the correct way of writing
+ * '%%p' in an escaped way is '%%%%p'.
+ */
+ private[yarn] def addOutOfMemoryErrorArgument(javaOpts: ListBuffer[String]): Unit = {
+ if (!javaOpts.exists(_.contains("-XX:OnOutOfMemoryError"))) {
+ if (Utils.isWindows) {
+ javaOpts += escapeForShell("-XX:OnOutOfMemoryError=taskkill /F /PID %%%%p")
+ } else {
+ javaOpts += "-XX:OnOutOfMemoryError='kill %p'"
+ }
+ }
+ }
+
+ /**
+ * Escapes a string for inclusion in a command line executed by Yarn. Yarn executes commands
+ * using either
+ *
+ * (Unix-based) `bash -c "command arg1 arg2"` and that means plain quoting doesn't really work.
+ * The argument is enclosed in single quotes and some key characters are escaped.
+ *
+ * (Windows-based) part of a .cmd file in which case windows escaping for each argument must be
+ * applied. Windows is quite lenient, however it is usually Java that causes trouble, needing to
+ * distinguish between arguments starting with '-' and class names. If arguments are surrounded
+ * by ' java takes the following string as is, hence an argument is mistakenly taken as a class
+ * name which happens to start with a '-'. The way to avoid this, is to surround nothing with
+ * a ', but instead with a ".
+ *
+ * @param arg A single argument.
+ * @return Argument quoted for execution via Yarn's generated shell script.
+ */
+ def escapeForShell(arg: String): String = {
+ if (arg != null) {
+ if (Utils.isWindows) {
+ YarnCommandBuilderUtils.quoteForBatchScript(arg)
+ } else {
+ val escaped = new StringBuilder("'")
+ for (i <- 0 to arg.length() - 1) {
+ arg.charAt(i) match {
+ case '$' => escaped.append("\\$")
+ case '"' => escaped.append("\\\"")
+ case '\'' => escaped.append("'\\''")
+ case c => escaped.append(c)
+ }
+ }
+ escaped.append("'").toString()
+ }
+ } else {
+ arg
+ }
+ }
+
+ // YARN/Hadoop acls are specified as user1,user2 group1,group2
+ // Users and groups are separated by a space and hence we need to pass the acls in same format
+ def getApplicationAclsForYarn(securityMgr: SecurityManager)
+ : Map[ApplicationAccessType, String] = {
+ Map[ApplicationAccessType, String] (
+ ApplicationAccessType.VIEW_APP -> (securityMgr.getViewAcls + " " +
+ securityMgr.getViewAclsGroups),
+ ApplicationAccessType.MODIFY_APP -> (securityMgr.getModifyAcls + " " +
+ securityMgr.getModifyAclsGroups)
+ )
+ }
+
+ /**
+ * Expand environment variable using Yarn API.
+ * If environment.$$() is implemented, return the result of it.
+ * Otherwise, return the result of environment.$()
+ * Note: $$() is added in Hadoop 2.4.
+ */
+ private lazy val expandMethod =
+ Try(classOf[Environment].getMethod("$$"))
+ .getOrElse(classOf[Environment].getMethod("$"))
+
+ def expandEnvironment(environment: Environment): String =
+ expandMethod.invoke(environment).asInstanceOf[String]
+
+ /**
+ * Get class path separator using Yarn API.
+ * If ApplicationConstants.CLASS_PATH_SEPARATOR is implemented, return it.
+ * Otherwise, return File.pathSeparator
+ * Note: CLASS_PATH_SEPARATOR is added in Hadoop 2.4.
+ */
+ private lazy val classPathSeparatorField =
+ Try(classOf[ApplicationConstants].getField("CLASS_PATH_SEPARATOR"))
+ .getOrElse(classOf[File].getField("pathSeparator"))
+
+ def getClassPathSeparator(): String = {
+ classPathSeparatorField.get(null).asInstanceOf[String]
+ }
+
+ /**
+ * Getting the initial target number of executors depends on whether dynamic allocation is
+ * enabled.
+ * If not using dynamic allocation it gets the number of executors requested by the user.
+ */
+ def getInitialTargetExecutorNumber(
+ conf: SparkConf,
+ numExecutors: Int = DEFAULT_NUMBER_EXECUTORS): Int = {
+ if (Utils.isDynamicAllocationEnabled(conf)) {
+ val minNumExecutors = conf.get(DYN_ALLOCATION_MIN_EXECUTORS)
+ val initialNumExecutors = Utils.getDynamicAllocationInitialExecutors(conf)
+ val maxNumExecutors = conf.get(DYN_ALLOCATION_MAX_EXECUTORS)
+ require(initialNumExecutors >= minNumExecutors && initialNumExecutors <= maxNumExecutors,
+ s"initial executor number $initialNumExecutors must between min executor number " +
+ s"$minNumExecutors and max executor number $maxNumExecutors")
+
+ initialNumExecutors
+ } else {
+ val targetNumExecutors =
+ sys.env.get("SPARK_EXECUTOR_INSTANCES").map(_.toInt).getOrElse(numExecutors)
+ // System property can override environment variable.
+ conf.get(EXECUTOR_INSTANCES).getOrElse(targetNumExecutors)
+ }
+ }
+}
+
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala
new file mode 100644
index 0000000..666cb45
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala
@@ -0,0 +1,347 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import java.util.concurrent.TimeUnit
+
+import org.apache.spark.internal.config.ConfigBuilder
+import org.apache.spark.network.util.ByteUnit
+
+package object config {
+
+ /* Common app configuration. */
+
+ private[spark] val APPLICATION_TAGS = ConfigBuilder("spark.yarn.tags")
+ .doc("Comma-separated list of strings to pass through as YARN application tags appearing " +
+ "in YARN Application Reports, which can be used for filtering when querying YARN.")
+ .stringConf
+ .toSequence
+ .createOptional
+
+ private[spark] val AM_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS =
+ ConfigBuilder("spark.yarn.am.attemptFailuresValidityInterval")
+ .doc("Interval after which AM failures will be considered independent and " +
+ "not accumulate towards the attempt count.")
+ .timeConf(TimeUnit.MILLISECONDS)
+ .createOptional
+
+ private[spark] val AM_PORT =
+ ConfigBuilder("spark.yarn.am.port")
+ .intConf
+ .createWithDefault(0)
+
+ private[spark] val EXECUTOR_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS =
+ ConfigBuilder("spark.yarn.executor.failuresValidityInterval")
+ .doc("Interval after which Executor failures will be considered independent and not " +
+ "accumulate towards the attempt count.")
+ .timeConf(TimeUnit.MILLISECONDS)
+ .createOptional
+
+ private[spark] val MAX_APP_ATTEMPTS = ConfigBuilder("spark.yarn.maxAppAttempts")
+ .doc("Maximum number of AM attempts before failing the app.")
+ .intConf
+ .createOptional
+
+ private[spark] val USER_CLASS_PATH_FIRST = ConfigBuilder("spark.yarn.user.classpath.first")
+ .doc("Whether to place user jars in front of Spark's classpath.")
+ .booleanConf
+ .createWithDefault(false)
+
+ private[spark] val GATEWAY_ROOT_PATH = ConfigBuilder("spark.yarn.config.gatewayPath")
+ .doc("Root of configuration paths that is present on gateway nodes, and will be replaced " +
+ "with the corresponding path in cluster machines.")
+ .stringConf
+ .createWithDefault(null)
+
+ private[spark] val REPLACEMENT_ROOT_PATH = ConfigBuilder("spark.yarn.config.replacementPath")
+ .doc(s"Path to use as a replacement for ${GATEWAY_ROOT_PATH.key} when launching processes " +
+ "in the YARN cluster.")
+ .stringConf
+ .createWithDefault(null)
+
+ private[spark] val QUEUE_NAME = ConfigBuilder("spark.yarn.queue")
+ .stringConf
+ .createWithDefault("default")
+
+ private[spark] val HISTORY_SERVER_ADDRESS = ConfigBuilder("spark.yarn.historyServer.address")
+ .stringConf
+ .createOptional
+
+ /* File distribution. */
+
+ private[spark] val SPARK_ARCHIVE = ConfigBuilder("spark.yarn.archive")
+ .doc("Location of archive containing jars files with Spark classes.")
+ .stringConf
+ .createOptional
+
+ private[spark] val SPARK_JARS = ConfigBuilder("spark.yarn.jars")
+ .doc("Location of jars containing Spark classes.")
+ .stringConf
+ .toSequence
+ .createOptional
+
+ private[spark] val ARCHIVES_TO_DISTRIBUTE = ConfigBuilder("spark.yarn.dist.archives")
+ .stringConf
+ .toSequence
+ .createWithDefault(Nil)
+
+ private[spark] val FILES_TO_DISTRIBUTE = ConfigBuilder("spark.yarn.dist.files")
+ .stringConf
+ .toSequence
+ .createWithDefault(Nil)
+
+ private[spark] val JARS_TO_DISTRIBUTE = ConfigBuilder("spark.yarn.dist.jars")
+ .stringConf
+ .toSequence
+ .createWithDefault(Nil)
+
+ private[spark] val PRESERVE_STAGING_FILES = ConfigBuilder("spark.yarn.preserve.staging.files")
+ .doc("Whether to preserve temporary files created by the job in HDFS.")
+ .booleanConf
+ .createWithDefault(false)
+
+ private[spark] val STAGING_FILE_REPLICATION = ConfigBuilder("spark.yarn.submit.file.replication")
+ .doc("Replication factor for files uploaded by Spark to HDFS.")
+ .intConf
+ .createOptional
+
+ private[spark] val STAGING_DIR = ConfigBuilder("spark.yarn.stagingDir")
+ .doc("Staging directory used while submitting applications.")
+ .stringConf
+ .createOptional
+
+ /* Cluster-mode launcher configuration. */
+
+ private[spark] val WAIT_FOR_APP_COMPLETION = ConfigBuilder("spark.yarn.submit.waitAppCompletion")
+ .doc("In cluster mode, whether to wait for the application to finish before exiting the " +
+ "launcher process.")
+ .booleanConf
+ .createWithDefault(true)
+
+ private[spark] val REPORT_INTERVAL = ConfigBuilder("spark.yarn.report.interval")
+ .doc("Interval between reports of the current app status in cluster mode.")
+ .timeConf(TimeUnit.MILLISECONDS)
+ .createWithDefaultString("1s")
+
+ /* Shared Client-mode AM / Driver configuration. */
+
+ private[spark] val AM_MAX_WAIT_TIME = ConfigBuilder("spark.yarn.am.waitTime")
+ .timeConf(TimeUnit.MILLISECONDS)
+ .createWithDefaultString("100s")
+
+ private[spark] val AM_NODE_LABEL_EXPRESSION = ConfigBuilder("spark.yarn.am.nodeLabelExpression")
+ .doc("Node label expression for the AM.")
+ .stringConf
+ .createOptional
+
+ private[spark] val CONTAINER_LAUNCH_MAX_THREADS =
+ ConfigBuilder("spark.yarn.containerLauncherMaxThreads")
+ .intConf
+ .createWithDefault(25)
+
+ private[spark] val MAX_EXECUTOR_FAILURES = ConfigBuilder("spark.yarn.max.executor.failures")
+ .intConf
+ .createOptional
+
+ private[spark] val MAX_REPORTER_THREAD_FAILURES =
+ ConfigBuilder("spark.yarn.scheduler.reporterThread.maxFailures")
+ .intConf
+ .createWithDefault(5)
+
+ private[spark] val RM_HEARTBEAT_INTERVAL =
+ ConfigBuilder("spark.yarn.scheduler.heartbeat.interval-ms")
+ .timeConf(TimeUnit.MILLISECONDS)
+ .createWithDefaultString("3s")
+
+ private[spark] val INITIAL_HEARTBEAT_INTERVAL =
+ ConfigBuilder("spark.yarn.scheduler.initial-allocation.interval")
+ .timeConf(TimeUnit.MILLISECONDS)
+ .createWithDefaultString("200ms")
+
+ private[spark] val SCHEDULER_SERVICES = ConfigBuilder("spark.yarn.services")
+ .doc("A comma-separated list of class names of services to add to the scheduler.")
+ .stringConf
+ .toSequence
+ .createWithDefault(Nil)
+
+ /* Client-mode AM configuration. */
+
+ private[spark] val AM_CORES = ConfigBuilder("spark.yarn.am.cores")
+ .intConf
+ .createWithDefault(1)
+
+ private[spark] val AM_JAVA_OPTIONS = ConfigBuilder("spark.yarn.am.extraJavaOptions")
+ .doc("Extra Java options for the client-mode AM.")
+ .stringConf
+ .createOptional
+
+ private[spark] val AM_LIBRARY_PATH = ConfigBuilder("spark.yarn.am.extraLibraryPath")
+ .doc("Extra native library path for the client-mode AM.")
+ .stringConf
+ .createOptional
+
+ private[spark] val AM_MEMORY_OVERHEAD = ConfigBuilder("spark.yarn.am.memoryOverhead")
+ .bytesConf(ByteUnit.MiB)
+ .createOptional
+
+ private[spark] val AM_MEMORY = ConfigBuilder("spark.yarn.am.memory")
+ .bytesConf(ByteUnit.MiB)
+ .createWithDefaultString("512m")
+
+ /* Driver configuration. */
+
+ private[spark] val DRIVER_CORES = ConfigBuilder("spark.driver.cores")
+ .intConf
+ .createWithDefault(1)
+
+ private[spark] val DRIVER_MEMORY_OVERHEAD = ConfigBuilder("spark.yarn.driver.memoryOverhead")
+ .bytesConf(ByteUnit.MiB)
+ .createOptional
+
+ /* Executor configuration. */
+
+ private[spark] val EXECUTOR_CORES = ConfigBuilder("spark.executor.cores")
+ .intConf
+ .createWithDefault(1)
+
+ private[spark] val EXECUTOR_MEMORY_OVERHEAD = ConfigBuilder("spark.yarn.executor.memoryOverhead")
+ .bytesConf(ByteUnit.MiB)
+ .createOptional
+
+ private[spark] val EXECUTOR_NODE_LABEL_EXPRESSION =
+ ConfigBuilder("spark.yarn.executor.nodeLabelExpression")
+ .doc("Node label expression for executors.")
+ .stringConf
+ .createOptional
+
+ /* Security configuration. */
+
+ private[spark] val CREDENTIAL_FILE_MAX_COUNT =
+ ConfigBuilder("spark.yarn.credentials.file.retention.count")
+ .intConf
+ .createWithDefault(5)
+
+ private[spark] val CREDENTIALS_FILE_MAX_RETENTION =
+ ConfigBuilder("spark.yarn.credentials.file.retention.days")
+ .intConf
+ .createWithDefault(5)
+
+ private[spark] val NAMENODES_TO_ACCESS = ConfigBuilder("spark.yarn.access.namenodes")
+ .doc("Extra NameNode URLs for which to request delegation tokens. The NameNode that hosts " +
+ "fs.defaultFS does not need to be listed here.")
+ .stringConf
+ .toSequence
+ .createWithDefault(Nil)
+
+ /* Rolled log aggregation configuration. */
+
+ private[spark] val ROLLED_LOG_INCLUDE_PATTERN =
+ ConfigBuilder("spark.yarn.rolledLog.includePattern")
+ .doc("Java Regex to filter the log files which match the defined include pattern and those " +
+ "log files will be aggregated in a rolling fashion.")
+ .stringConf
+ .createOptional
+
+ private[spark] val ROLLED_LOG_EXCLUDE_PATTERN =
+ ConfigBuilder("spark.yarn.rolledLog.excludePattern")
+ .doc("Java Regex to filter the log files which match the defined exclude pattern and those " +
+ "log files will not be aggregated in a rolling fashion.")
+ .stringConf
+ .createOptional
+
+ /* Private configs. */
+
+ private[spark] val CREDENTIALS_FILE_PATH = ConfigBuilder("spark.yarn.credentials.file")
+ .internal()
+ .stringConf
+ .createWithDefault(null)
+
+ // Internal config to propagate the location of the user's jar to the driver/executors
+ private[spark] val APP_JAR = ConfigBuilder("spark.yarn.user.jar")
+ .internal()
+ .stringConf
+ .createOptional
+
+ // Internal config to propagate the locations of any extra jars to add to the classpath
+ // of the executors
+ private[spark] val SECONDARY_JARS = ConfigBuilder("spark.yarn.secondary.jars")
+ .internal()
+ .stringConf
+ .toSequence
+ .createOptional
+
+ /* Configuration and cached file propagation. */
+
+ private[spark] val CACHED_FILES = ConfigBuilder("spark.yarn.cache.filenames")
+ .internal()
+ .stringConf
+ .toSequence
+ .createWithDefault(Nil)
+
+ private[spark] val CACHED_FILES_SIZES = ConfigBuilder("spark.yarn.cache.sizes")
+ .internal()
+ .longConf
+ .toSequence
+ .createWithDefault(Nil)
+
+ private[spark] val CACHED_FILES_TIMESTAMPS = ConfigBuilder("spark.yarn.cache.timestamps")
+ .internal()
+ .longConf
+ .toSequence
+ .createWithDefault(Nil)
+
+ private[spark] val CACHED_FILES_VISIBILITIES = ConfigBuilder("spark.yarn.cache.visibilities")
+ .internal()
+ .stringConf
+ .toSequence
+ .createWithDefault(Nil)
+
+ // Either "file" or "archive", for each file.
+ private[spark] val CACHED_FILES_TYPES = ConfigBuilder("spark.yarn.cache.types")
+ .internal()
+ .stringConf
+ .toSequence
+ .createWithDefault(Nil)
+
+ // The location of the conf archive in HDFS.
+ private[spark] val CACHED_CONF_ARCHIVE = ConfigBuilder("spark.yarn.cache.confArchive")
+ .internal()
+ .stringConf
+ .createOptional
+
+ private[spark] val CREDENTIALS_RENEWAL_TIME = ConfigBuilder("spark.yarn.credentials.renewalTime")
+ .internal()
+ .timeConf(TimeUnit.MILLISECONDS)
+ .createWithDefault(Long.MaxValue)
+
+ private[spark] val CREDENTIALS_UPDATE_TIME = ConfigBuilder("spark.yarn.credentials.updateTime")
+ .internal()
+ .timeConf(TimeUnit.MILLISECONDS)
+ .createWithDefault(Long.MaxValue)
+
+ // The list of cache-related config entries. This is used by Client and the AM to clean
+ // up the environment so that these settings do not appear on the web UI.
+ private[yarn] val CACHE_CONFIGS = Seq(
+ CACHED_FILES,
+ CACHED_FILES_SIZES,
+ CACHED_FILES_TIMESTAMPS,
+ CACHED_FILES_VISIBILITIES,
+ CACHED_FILES_TYPES,
+ CACHED_CONF_ARCHIVE)
+
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org
[03/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Posted by va...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/ConfigurableCredentialManager.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/ConfigurableCredentialManager.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/ConfigurableCredentialManager.scala
deleted file mode 100644
index c4c07b4..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/ConfigurableCredentialManager.scala
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn.security
-
-import java.util.ServiceLoader
-
-import scala.collection.JavaConverters._
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.security.Credentials
-
-import org.apache.spark.SparkConf
-import org.apache.spark.internal.Logging
-import org.apache.spark.util.Utils
-
-/**
- * A ConfigurableCredentialManager to manage all the registered credential providers and offer
- * APIs for other modules to obtain credentials as well as renewal time. By default
- * [[HDFSCredentialProvider]], [[HiveCredentialProvider]] and [[HBaseCredentialProvider]] will
- * be loaded in if not explicitly disabled, any plugged-in credential provider wants to be
- * managed by ConfigurableCredentialManager needs to implement [[ServiceCredentialProvider]]
- * interface and put into resources/META-INF/services to be loaded by ServiceLoader.
- *
- * Also each credential provider is controlled by
- * spark.yarn.security.credentials.{service}.enabled, it will not be loaded in if set to false.
- */
-private[yarn] final class ConfigurableCredentialManager(
- sparkConf: SparkConf, hadoopConf: Configuration) extends Logging {
- private val deprecatedProviderEnabledConfig = "spark.yarn.security.tokens.%s.enabled"
- private val providerEnabledConfig = "spark.yarn.security.credentials.%s.enabled"
-
- // Maintain all the registered credential providers
- private val credentialProviders = {
- val providers = ServiceLoader.load(classOf[ServiceCredentialProvider],
- Utils.getContextOrSparkClassLoader).asScala
-
- // Filter out credentials in which spark.yarn.security.credentials.{service}.enabled is false.
- providers.filter { p =>
- sparkConf.getOption(providerEnabledConfig.format(p.serviceName))
- .orElse {
- sparkConf.getOption(deprecatedProviderEnabledConfig.format(p.serviceName)).map { c =>
- logWarning(s"${deprecatedProviderEnabledConfig.format(p.serviceName)} is deprecated, " +
- s"using ${providerEnabledConfig.format(p.serviceName)} instead")
- c
- }
- }.map(_.toBoolean).getOrElse(true)
- }.map { p => (p.serviceName, p) }.toMap
- }
-
- /**
- * Get credential provider for the specified service.
- */
- def getServiceCredentialProvider(service: String): Option[ServiceCredentialProvider] = {
- credentialProviders.get(service)
- }
-
- /**
- * Obtain credentials from all the registered providers.
- * @return nearest time of next renewal, Long.MaxValue if all the credentials aren't renewable,
- * otherwise the nearest renewal time of any credentials will be returned.
- */
- def obtainCredentials(hadoopConf: Configuration, creds: Credentials): Long = {
- credentialProviders.values.flatMap { provider =>
- if (provider.credentialsRequired(hadoopConf)) {
- provider.obtainCredentials(hadoopConf, sparkConf, creds)
- } else {
- logDebug(s"Service ${provider.serviceName} does not require a token." +
- s" Check your configuration to see if security is disabled or not.")
- None
- }
- }.foldLeft(Long.MaxValue)(math.min)
- }
-
- /**
- * Create an [[AMCredentialRenewer]] instance, caller should be responsible to stop this
- * instance when it is not used. AM will use it to renew credentials periodically.
- */
- def credentialRenewer(): AMCredentialRenewer = {
- new AMCredentialRenewer(sparkConf, hadoopConf, this)
- }
-
- /**
- * Create an [[CredentialUpdater]] instance, caller should be resposible to stop this intance
- * when it is not used. Executors and driver (client mode) will use it to update credentials.
- * periodically.
- */
- def credentialUpdater(): CredentialUpdater = {
- new CredentialUpdater(sparkConf, hadoopConf, this)
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/CredentialUpdater.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/CredentialUpdater.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/CredentialUpdater.scala
deleted file mode 100644
index 5df4fbd..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/CredentialUpdater.scala
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn.security
-
-import java.util.concurrent.{Executors, TimeUnit}
-
-import scala.util.control.NonFatal
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileSystem, Path}
-import org.apache.hadoop.security.{Credentials, UserGroupInformation}
-
-import org.apache.spark.SparkConf
-import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.deploy.yarn.config._
-import org.apache.spark.internal.Logging
-import org.apache.spark.util.{ThreadUtils, Utils}
-
-private[spark] class CredentialUpdater(
- sparkConf: SparkConf,
- hadoopConf: Configuration,
- credentialManager: ConfigurableCredentialManager) extends Logging {
-
- @volatile private var lastCredentialsFileSuffix = 0
-
- private val credentialsFile = sparkConf.get(CREDENTIALS_FILE_PATH)
- private val freshHadoopConf =
- SparkHadoopUtil.get.getConfBypassingFSCache(
- hadoopConf, new Path(credentialsFile).toUri.getScheme)
-
- private val credentialUpdater =
- Executors.newSingleThreadScheduledExecutor(
- ThreadUtils.namedThreadFactory("Credential Refresh Thread"))
-
- // This thread wakes up and picks up new credentials from HDFS, if any.
- private val credentialUpdaterRunnable =
- new Runnable {
- override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired())
- }
-
- /** Start the credential updater task */
- def start(): Unit = {
- val startTime = sparkConf.get(CREDENTIALS_RENEWAL_TIME)
- val remainingTime = startTime - System.currentTimeMillis()
- if (remainingTime <= 0) {
- credentialUpdater.schedule(credentialUpdaterRunnable, 1, TimeUnit.MINUTES)
- } else {
- logInfo(s"Scheduling credentials refresh from HDFS in $remainingTime millis.")
- credentialUpdater.schedule(credentialUpdaterRunnable, remainingTime, TimeUnit.MILLISECONDS)
- }
- }
-
- private def updateCredentialsIfRequired(): Unit = {
- val timeToNextUpdate = try {
- val credentialsFilePath = new Path(credentialsFile)
- val remoteFs = FileSystem.get(freshHadoopConf)
- SparkHadoopUtil.get.listFilesSorted(
- remoteFs, credentialsFilePath.getParent,
- credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION)
- .lastOption.map { credentialsStatus =>
- val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath)
- if (suffix > lastCredentialsFileSuffix) {
- logInfo("Reading new credentials from " + credentialsStatus.getPath)
- val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath)
- lastCredentialsFileSuffix = suffix
- UserGroupInformation.getCurrentUser.addCredentials(newCredentials)
- logInfo("Credentials updated from credentials file.")
-
- val remainingTime = getTimeOfNextUpdateFromFileName(credentialsStatus.getPath)
- - System.currentTimeMillis()
- if (remainingTime <= 0) TimeUnit.MINUTES.toMillis(1) else remainingTime
- } else {
- // If current credential file is older than expected, sleep 1 hour and check again.
- TimeUnit.HOURS.toMillis(1)
- }
- }.getOrElse {
- // Wait for 1 minute to check again if there's no credential file currently
- TimeUnit.MINUTES.toMillis(1)
- }
- } catch {
- // Since the file may get deleted while we are reading it, catch the Exception and come
- // back in an hour to try again
- case NonFatal(e) =>
- logWarning("Error while trying to update credentials, will try again in 1 hour", e)
- TimeUnit.HOURS.toMillis(1)
- }
-
- credentialUpdater.schedule(
- credentialUpdaterRunnable, timeToNextUpdate, TimeUnit.MILLISECONDS)
- }
-
- private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = {
- val stream = remoteFs.open(tokenPath)
- try {
- val newCredentials = new Credentials()
- newCredentials.readTokenStorageStream(stream)
- newCredentials
- } finally {
- stream.close()
- }
- }
-
- private def getTimeOfNextUpdateFromFileName(credentialsPath: Path): Long = {
- val name = credentialsPath.getName
- val index = name.lastIndexOf(SparkHadoopUtil.SPARK_YARN_CREDS_COUNTER_DELIM)
- val slice = name.substring(0, index)
- val last2index = slice.lastIndexOf(SparkHadoopUtil.SPARK_YARN_CREDS_COUNTER_DELIM)
- name.substring(last2index + 1, index).toLong
- }
-
- def stop(): Unit = {
- credentialUpdater.shutdown()
- }
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HBaseCredentialProvider.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HBaseCredentialProvider.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HBaseCredentialProvider.scala
deleted file mode 100644
index 5571df0..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HBaseCredentialProvider.scala
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn.security
-
-import scala.reflect.runtime.universe
-import scala.util.control.NonFatal
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.security.Credentials
-import org.apache.hadoop.security.token.{Token, TokenIdentifier}
-
-import org.apache.spark.SparkConf
-import org.apache.spark.internal.Logging
-
-private[security] class HBaseCredentialProvider extends ServiceCredentialProvider with Logging {
-
- override def serviceName: String = "hbase"
-
- override def obtainCredentials(
- hadoopConf: Configuration,
- sparkConf: SparkConf,
- creds: Credentials): Option[Long] = {
- try {
- val mirror = universe.runtimeMirror(getClass.getClassLoader)
- val obtainToken = mirror.classLoader.
- loadClass("org.apache.hadoop.hbase.security.token.TokenUtil").
- getMethod("obtainToken", classOf[Configuration])
-
- logDebug("Attempting to fetch HBase security token.")
- val token = obtainToken.invoke(null, hbaseConf(hadoopConf))
- .asInstanceOf[Token[_ <: TokenIdentifier]]
- logInfo(s"Get token from HBase: ${token.toString}")
- creds.addToken(token.getService, token)
- } catch {
- case NonFatal(e) =>
- logDebug(s"Failed to get token from service $serviceName", e)
- }
-
- None
- }
-
- override def credentialsRequired(hadoopConf: Configuration): Boolean = {
- hbaseConf(hadoopConf).get("hbase.security.authentication") == "kerberos"
- }
-
- private def hbaseConf(conf: Configuration): Configuration = {
- try {
- val mirror = universe.runtimeMirror(getClass.getClassLoader)
- val confCreate = mirror.classLoader.
- loadClass("org.apache.hadoop.hbase.HBaseConfiguration").
- getMethod("create", classOf[Configuration])
- confCreate.invoke(null, conf).asInstanceOf[Configuration]
- } catch {
- case NonFatal(e) =>
- logDebug("Fail to invoke HBaseConfiguration", e)
- conf
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala
deleted file mode 100644
index 8d06d73..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn.security
-
-import java.io.{ByteArrayInputStream, DataInputStream}
-
-import scala.collection.JavaConverters._
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileSystem, Path}
-import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier
-import org.apache.hadoop.mapred.Master
-import org.apache.hadoop.security.Credentials
-
-import org.apache.spark.{SparkConf, SparkException}
-import org.apache.spark.deploy.yarn.config._
-import org.apache.spark.internal.Logging
-import org.apache.spark.internal.config._
-
-private[security] class HDFSCredentialProvider extends ServiceCredentialProvider with Logging {
- // Token renewal interval, this value will be set in the first call,
- // if None means no token renewer specified, so cannot get token renewal interval.
- private var tokenRenewalInterval: Option[Long] = null
-
- override val serviceName: String = "hdfs"
-
- override def obtainCredentials(
- hadoopConf: Configuration,
- sparkConf: SparkConf,
- creds: Credentials): Option[Long] = {
- // NameNode to access, used to get tokens from different FileSystems
- nnsToAccess(hadoopConf, sparkConf).foreach { dst =>
- val dstFs = dst.getFileSystem(hadoopConf)
- logInfo("getting token for namenode: " + dst)
- dstFs.addDelegationTokens(getTokenRenewer(hadoopConf), creds)
- }
-
- // Get the token renewal interval if it is not set. It will only be called once.
- if (tokenRenewalInterval == null) {
- tokenRenewalInterval = getTokenRenewalInterval(hadoopConf, sparkConf)
- }
-
- // Get the time of next renewal.
- tokenRenewalInterval.map { interval =>
- creds.getAllTokens.asScala
- .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND)
- .map { t =>
- val identifier = new DelegationTokenIdentifier()
- identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier)))
- identifier.getIssueDate + interval
- }.foldLeft(0L)(math.max)
- }
- }
-
- private def getTokenRenewalInterval(
- hadoopConf: Configuration, sparkConf: SparkConf): Option[Long] = {
- // We cannot use the tokens generated with renewer yarn. Trying to renew
- // those will fail with an access control issue. So create new tokens with the logged in
- // user as renewer.
- sparkConf.get(PRINCIPAL).map { renewer =>
- val creds = new Credentials()
- nnsToAccess(hadoopConf, sparkConf).foreach { dst =>
- val dstFs = dst.getFileSystem(hadoopConf)
- dstFs.addDelegationTokens(renewer, creds)
- }
- val t = creds.getAllTokens.asScala
- .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND)
- .head
- val newExpiration = t.renew(hadoopConf)
- val identifier = new DelegationTokenIdentifier()
- identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier)))
- val interval = newExpiration - identifier.getIssueDate
- logInfo(s"Renewal Interval is $interval")
- interval
- }
- }
-
- private def getTokenRenewer(conf: Configuration): String = {
- val delegTokenRenewer = Master.getMasterPrincipal(conf)
- logDebug("delegation token renewer is: " + delegTokenRenewer)
- if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) {
- val errorMessage = "Can't get Master Kerberos principal for use as renewer"
- logError(errorMessage)
- throw new SparkException(errorMessage)
- }
-
- delegTokenRenewer
- }
-
- private def nnsToAccess(hadoopConf: Configuration, sparkConf: SparkConf): Set[Path] = {
- sparkConf.get(NAMENODES_TO_ACCESS).map(new Path(_)).toSet +
- sparkConf.get(STAGING_DIR).map(new Path(_))
- .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory)
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HiveCredentialProvider.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HiveCredentialProvider.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HiveCredentialProvider.scala
deleted file mode 100644
index 16d8fc3..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HiveCredentialProvider.scala
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn.security
-
-import java.lang.reflect.UndeclaredThrowableException
-import java.security.PrivilegedExceptionAction
-
-import scala.reflect.runtime.universe
-import scala.util.control.NonFatal
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier
-import org.apache.hadoop.io.Text
-import org.apache.hadoop.security.{Credentials, UserGroupInformation}
-import org.apache.hadoop.security.token.Token
-
-import org.apache.spark.SparkConf
-import org.apache.spark.internal.Logging
-import org.apache.spark.util.Utils
-
-private[security] class HiveCredentialProvider extends ServiceCredentialProvider with Logging {
-
- override def serviceName: String = "hive"
-
- private def hiveConf(hadoopConf: Configuration): Configuration = {
- try {
- val mirror = universe.runtimeMirror(Utils.getContextOrSparkClassLoader)
- // the hive configuration class is a subclass of Hadoop Configuration, so can be cast down
- // to a Configuration and used without reflection
- val hiveConfClass = mirror.classLoader.loadClass("org.apache.hadoop.hive.conf.HiveConf")
- // using the (Configuration, Class) constructor allows the current configuration to be
- // included in the hive config.
- val ctor = hiveConfClass.getDeclaredConstructor(classOf[Configuration],
- classOf[Object].getClass)
- ctor.newInstance(hadoopConf, hiveConfClass).asInstanceOf[Configuration]
- } catch {
- case NonFatal(e) =>
- logDebug("Fail to create Hive Configuration", e)
- hadoopConf
- }
- }
-
- override def credentialsRequired(hadoopConf: Configuration): Boolean = {
- UserGroupInformation.isSecurityEnabled &&
- hiveConf(hadoopConf).getTrimmed("hive.metastore.uris", "").nonEmpty
- }
-
- override def obtainCredentials(
- hadoopConf: Configuration,
- sparkConf: SparkConf,
- creds: Credentials): Option[Long] = {
- val conf = hiveConf(hadoopConf)
-
- val principalKey = "hive.metastore.kerberos.principal"
- val principal = conf.getTrimmed(principalKey, "")
- require(principal.nonEmpty, s"Hive principal $principalKey undefined")
- val metastoreUri = conf.getTrimmed("hive.metastore.uris", "")
- require(metastoreUri.nonEmpty, "Hive metastore uri undefined")
-
- val currentUser = UserGroupInformation.getCurrentUser()
- logDebug(s"Getting Hive delegation token for ${currentUser.getUserName()} against " +
- s"$principal at $metastoreUri")
-
- val mirror = universe.runtimeMirror(Utils.getContextOrSparkClassLoader)
- val hiveClass = mirror.classLoader.loadClass("org.apache.hadoop.hive.ql.metadata.Hive")
- val hiveConfClass = mirror.classLoader.loadClass("org.apache.hadoop.hive.conf.HiveConf")
- val closeCurrent = hiveClass.getMethod("closeCurrent")
-
- try {
- // get all the instance methods before invoking any
- val getDelegationToken = hiveClass.getMethod("getDelegationToken",
- classOf[String], classOf[String])
- val getHive = hiveClass.getMethod("get", hiveConfClass)
-
- doAsRealUser {
- val hive = getHive.invoke(null, conf)
- val tokenStr = getDelegationToken.invoke(hive, currentUser.getUserName(), principal)
- .asInstanceOf[String]
- val hive2Token = new Token[DelegationTokenIdentifier]()
- hive2Token.decodeFromUrlString(tokenStr)
- logInfo(s"Get Token from hive metastore: ${hive2Token.toString}")
- creds.addToken(new Text("hive.server2.delegation.token"), hive2Token)
- }
- } catch {
- case NonFatal(e) =>
- logDebug(s"Fail to get token from service $serviceName", e)
- } finally {
- Utils.tryLogNonFatalError {
- closeCurrent.invoke(null)
- }
- }
-
- None
- }
-
- /**
- * Run some code as the real logged in user (which may differ from the current user, for
- * example, when using proxying).
- */
- private def doAsRealUser[T](fn: => T): T = {
- val currentUser = UserGroupInformation.getCurrentUser()
- val realUser = Option(currentUser.getRealUser()).getOrElse(currentUser)
-
- // For some reason the Scala-generated anonymous class ends up causing an
- // UndeclaredThrowableException, even if you annotate the method with @throws.
- try {
- realUser.doAs(new PrivilegedExceptionAction[T]() {
- override def run(): T = fn
- })
- } catch {
- case e: UndeclaredThrowableException => throw Option(e.getCause()).getOrElse(e)
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/ServiceCredentialProvider.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/ServiceCredentialProvider.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/ServiceCredentialProvider.scala
deleted file mode 100644
index 4e3fcce..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/ServiceCredentialProvider.scala
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn.security
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.security.{Credentials, UserGroupInformation}
-
-import org.apache.spark.SparkConf
-
-/**
- * A credential provider for a service. User must implement this if they need to access a
- * secure service from Spark.
- */
-trait ServiceCredentialProvider {
-
- /**
- * Name of the service to provide credentials. This name should unique, Spark internally will
- * use this name to differentiate credential provider.
- */
- def serviceName: String
-
- /**
- * To decide whether credential is required for this service. By default it based on whether
- * Hadoop security is enabled.
- */
- def credentialsRequired(hadoopConf: Configuration): Boolean = {
- UserGroupInformation.isSecurityEnabled
- }
-
- /**
- * Obtain credentials for this service and get the time of the next renewal.
- * @param hadoopConf Configuration of current Hadoop Compatible system.
- * @param sparkConf Spark configuration.
- * @param creds Credentials to add tokens and security keys to.
- * @return If this Credential is renewable and can be renewed, return the time of the next
- * renewal, otherwise None should be returned.
- */
- def obtainCredentials(
- hadoopConf: Configuration,
- sparkConf: SparkConf,
- creds: Credentials): Option[Long]
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/launcher/YarnCommandBuilderUtils.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/launcher/YarnCommandBuilderUtils.scala b/yarn/src/main/scala/org/apache/spark/launcher/YarnCommandBuilderUtils.scala
deleted file mode 100644
index 6c3556a..0000000
--- a/yarn/src/main/scala/org/apache/spark/launcher/YarnCommandBuilderUtils.scala
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.launcher
-
-import scala.collection.JavaConverters._
-import scala.collection.mutable.ListBuffer
-import scala.util.Properties
-
-/**
- * Exposes methods from the launcher library that are used by the YARN backend.
- */
-private[spark] object YarnCommandBuilderUtils {
-
- def quoteForBatchScript(arg: String): String = {
- CommandBuilderUtils.quoteForBatchScript(arg)
- }
-
- def findJarsDir(sparkHome: String): String = {
- val scalaVer = Properties.versionNumberString
- .split("\\.")
- .take(2)
- .mkString(".")
- CommandBuilderUtils.findJarsDir(sparkHome, scalaVer, true)
- }
-
- /**
- * Adds the perm gen configuration to the list of java options if needed and not yet added.
- *
- * Note that this method adds the option based on the local JVM version; if the node where
- * the container is running has a different Java version, there's a risk that the option will
- * not be added (e.g. if the AM is running Java 8 but the container's node is set up to use
- * Java 7).
- */
- def addPermGenSizeOpt(args: ListBuffer[String]): Unit = {
- CommandBuilderUtils.addPermGenSizeOpt(args.asJava)
- }
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/scheduler/cluster/SchedulerExtensionService.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/SchedulerExtensionService.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/SchedulerExtensionService.scala
deleted file mode 100644
index 4ed2852..0000000
--- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/SchedulerExtensionService.scala
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster
-
-import java.util.concurrent.atomic.AtomicBoolean
-
-import org.apache.hadoop.yarn.api.records.{ApplicationAttemptId, ApplicationId}
-
-import org.apache.spark.SparkContext
-import org.apache.spark.deploy.yarn.config._
-import org.apache.spark.internal.Logging
-import org.apache.spark.util.Utils
-
-/**
- * An extension service that can be loaded into a Spark YARN scheduler.
- * A Service that can be started and stopped.
- *
- * 1. For implementations to be loadable by `SchedulerExtensionServices`,
- * they must provide an empty constructor.
- * 2. The `stop()` operation MUST be idempotent, and succeed even if `start()` was
- * never invoked.
- */
-trait SchedulerExtensionService {
-
- /**
- * Start the extension service. This should be a no-op if
- * called more than once.
- * @param binding binding to the spark application and YARN
- */
- def start(binding: SchedulerExtensionServiceBinding): Unit
-
- /**
- * Stop the service
- * The `stop()` operation MUST be idempotent, and succeed even if `start()` was
- * never invoked.
- */
- def stop(): Unit
-}
-
-/**
- * Binding information for a [[SchedulerExtensionService]].
- *
- * The attempt ID will be set if the service is started within a YARN application master;
- * there is then a different attempt ID for every time that AM is restarted.
- * When the service binding is instantiated in client mode, there's no attempt ID, as it lacks
- * this information.
- * @param sparkContext current spark context
- * @param applicationId YARN application ID
- * @param attemptId YARN attemptID. This will always be unset in client mode, and always set in
- * cluster mode.
- */
-case class SchedulerExtensionServiceBinding(
- sparkContext: SparkContext,
- applicationId: ApplicationId,
- attemptId: Option[ApplicationAttemptId] = None)
-
-/**
- * Container for [[SchedulerExtensionService]] instances.
- *
- * Loads Extension Services from the configuration property
- * `"spark.yarn.services"`, instantiates and starts them.
- * When stopped, it stops all child entries.
- *
- * The order in which child extension services are started and stopped
- * is undefined.
- */
-private[spark] class SchedulerExtensionServices extends SchedulerExtensionService
- with Logging {
- private var serviceOption: Option[String] = None
- private var services: List[SchedulerExtensionService] = Nil
- private val started = new AtomicBoolean(false)
- private var binding: SchedulerExtensionServiceBinding = _
-
- /**
- * Binding operation will load the named services and call bind on them too; the
- * entire set of services are then ready for `init()` and `start()` calls.
- *
- * @param binding binding to the spark application and YARN
- */
- def start(binding: SchedulerExtensionServiceBinding): Unit = {
- if (started.getAndSet(true)) {
- logWarning("Ignoring re-entrant start operation")
- return
- }
- require(binding.sparkContext != null, "Null context parameter")
- require(binding.applicationId != null, "Null appId parameter")
- this.binding = binding
- val sparkContext = binding.sparkContext
- val appId = binding.applicationId
- val attemptId = binding.attemptId
- logInfo(s"Starting Yarn extension services with app $appId and attemptId $attemptId")
-
- services = sparkContext.conf.get(SCHEDULER_SERVICES).map { sClass =>
- val instance = Utils.classForName(sClass)
- .newInstance()
- .asInstanceOf[SchedulerExtensionService]
- // bind this service
- instance.start(binding)
- logInfo(s"Service $sClass started")
- instance
- }.toList
- }
-
- /**
- * Get the list of services.
- *
- * @return a list of services; Nil until the service is started
- */
- def getServices: List[SchedulerExtensionService] = services
-
- /**
- * Stop the services; idempotent.
- *
- */
- override def stop(): Unit = {
- if (started.getAndSet(false)) {
- logInfo(s"Stopping $this")
- services.foreach { s =>
- Utils.tryLogNonFatalError(s.stop())
- }
- }
- }
-
- override def toString(): String = s"""SchedulerExtensionServices
- |(serviceOption=$serviceOption,
- | services=$services,
- | started=$started)""".stripMargin
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala
deleted file mode 100644
index 60da356..0000000
--- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster
-
-import scala.collection.mutable.ArrayBuffer
-
-import org.apache.hadoop.yarn.api.records.YarnApplicationState
-
-import org.apache.spark.{SparkContext, SparkException}
-import org.apache.spark.deploy.yarn.{Client, ClientArguments, YarnSparkHadoopUtil}
-import org.apache.spark.internal.Logging
-import org.apache.spark.launcher.SparkAppHandle
-import org.apache.spark.scheduler.TaskSchedulerImpl
-
-private[spark] class YarnClientSchedulerBackend(
- scheduler: TaskSchedulerImpl,
- sc: SparkContext)
- extends YarnSchedulerBackend(scheduler, sc)
- with Logging {
-
- private var client: Client = null
- private var monitorThread: MonitorThread = null
-
- /**
- * Create a Yarn client to submit an application to the ResourceManager.
- * This waits until the application is running.
- */
- override def start() {
- val driverHost = conf.get("spark.driver.host")
- val driverPort = conf.get("spark.driver.port")
- val hostport = driverHost + ":" + driverPort
- sc.ui.foreach { ui => conf.set("spark.driver.appUIAddress", ui.webUrl) }
-
- val argsArrayBuf = new ArrayBuffer[String]()
- argsArrayBuf += ("--arg", hostport)
-
- logDebug("ClientArguments called with: " + argsArrayBuf.mkString(" "))
- val args = new ClientArguments(argsArrayBuf.toArray)
- totalExpectedExecutors = YarnSparkHadoopUtil.getInitialTargetExecutorNumber(conf)
- client = new Client(args, conf)
- bindToYarn(client.submitApplication(), None)
-
- // SPARK-8687: Ensure all necessary properties have already been set before
- // we initialize our driver scheduler backend, which serves these properties
- // to the executors
- super.start()
- waitForApplication()
-
- // SPARK-8851: In yarn-client mode, the AM still does the credentials refresh. The driver
- // reads the credentials from HDFS, just like the executors and updates its own credentials
- // cache.
- if (conf.contains("spark.yarn.credentials.file")) {
- YarnSparkHadoopUtil.get.startCredentialUpdater(conf)
- }
- monitorThread = asyncMonitorApplication()
- monitorThread.start()
- }
-
- /**
- * Report the state of the application until it is running.
- * If the application has finished, failed or been killed in the process, throw an exception.
- * This assumes both `client` and `appId` have already been set.
- */
- private def waitForApplication(): Unit = {
- assert(client != null && appId.isDefined, "Application has not been submitted yet!")
- val (state, _) = client.monitorApplication(appId.get, returnOnRunning = true) // blocking
- if (state == YarnApplicationState.FINISHED ||
- state == YarnApplicationState.FAILED ||
- state == YarnApplicationState.KILLED) {
- throw new SparkException("Yarn application has already ended! " +
- "It might have been killed or unable to launch application master.")
- }
- if (state == YarnApplicationState.RUNNING) {
- logInfo(s"Application ${appId.get} has started running.")
- }
- }
-
- /**
- * We create this class for SPARK-9519. Basically when we interrupt the monitor thread it's
- * because the SparkContext is being shut down(sc.stop() called by user code), but if
- * monitorApplication return, it means the Yarn application finished before sc.stop() was called,
- * which means we should call sc.stop() here, and we don't allow the monitor to be interrupted
- * before SparkContext stops successfully.
- */
- private class MonitorThread extends Thread {
- private var allowInterrupt = true
-
- override def run() {
- try {
- val (state, _) = client.monitorApplication(appId.get, logApplicationReport = false)
- logError(s"Yarn application has already exited with state $state!")
- allowInterrupt = false
- sc.stop()
- } catch {
- case e: InterruptedException => logInfo("Interrupting monitor thread")
- }
- }
-
- def stopMonitor(): Unit = {
- if (allowInterrupt) {
- this.interrupt()
- }
- }
- }
-
- /**
- * Monitor the application state in a separate thread.
- * If the application has exited for any reason, stop the SparkContext.
- * This assumes both `client` and `appId` have already been set.
- */
- private def asyncMonitorApplication(): MonitorThread = {
- assert(client != null && appId.isDefined, "Application has not been submitted yet!")
- val t = new MonitorThread
- t.setName("Yarn application state monitor")
- t.setDaemon(true)
- t
- }
-
- /**
- * Stop the scheduler. This assumes `start()` has already been called.
- */
- override def stop() {
- assert(client != null, "Attempted to stop this scheduler before starting it!")
- if (monitorThread != null) {
- monitorThread.stopMonitor()
- }
-
- // Report a final state to the launcher if one is connected. This is needed since in client
- // mode this backend doesn't let the app monitor loop run to completion, so it does not report
- // the final state itself.
- //
- // Note: there's not enough information at this point to provide a better final state,
- // so assume the application was successful.
- client.reportLauncherState(SparkAppHandle.State.FINISHED)
-
- super.stop()
- YarnSparkHadoopUtil.get.stopCredentialUpdater()
- client.stop()
- logInfo("Stopped")
- }
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterManager.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterManager.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterManager.scala
deleted file mode 100644
index 64cd1bd..0000000
--- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterManager.scala
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster
-
-import org.apache.spark.{SparkContext, SparkException}
-import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl}
-
-/**
- * Cluster Manager for creation of Yarn scheduler and backend
- */
-private[spark] class YarnClusterManager extends ExternalClusterManager {
-
- override def canCreate(masterURL: String): Boolean = {
- masterURL == "yarn"
- }
-
- override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = {
- sc.deployMode match {
- case "cluster" => new YarnClusterScheduler(sc)
- case "client" => new YarnScheduler(sc)
- case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn")
- }
- }
-
- override def createSchedulerBackend(sc: SparkContext,
- masterURL: String,
- scheduler: TaskScheduler): SchedulerBackend = {
- sc.deployMode match {
- case "cluster" =>
- new YarnClusterSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc)
- case "client" =>
- new YarnClientSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc)
- case _ =>
- throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn")
- }
- }
-
- override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
- scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend)
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterScheduler.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterScheduler.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterScheduler.scala
deleted file mode 100644
index 96c9151..0000000
--- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterScheduler.scala
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster
-
-import org.apache.spark._
-import org.apache.spark.deploy.yarn.ApplicationMaster
-
-/**
- * This is a simple extension to ClusterScheduler - to ensure that appropriate initialization of
- * ApplicationMaster, etc is done
- */
-private[spark] class YarnClusterScheduler(sc: SparkContext) extends YarnScheduler(sc) {
-
- logInfo("Created YarnClusterScheduler")
-
- override def postStartHook() {
- ApplicationMaster.sparkContextInitialized(sc)
- super.postStartHook()
- logInfo("YarnClusterScheduler.postStartHook done")
- }
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala
deleted file mode 100644
index 4f3d5eb..0000000
--- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster
-
-import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
-import org.apache.hadoop.yarn.conf.YarnConfiguration
-
-import org.apache.spark.SparkContext
-import org.apache.spark.deploy.yarn.{ApplicationMaster, YarnSparkHadoopUtil}
-import org.apache.spark.scheduler.TaskSchedulerImpl
-import org.apache.spark.util.Utils
-
-private[spark] class YarnClusterSchedulerBackend(
- scheduler: TaskSchedulerImpl,
- sc: SparkContext)
- extends YarnSchedulerBackend(scheduler, sc) {
-
- override def start() {
- val attemptId = ApplicationMaster.getAttemptId
- bindToYarn(attemptId.getApplicationId(), Some(attemptId))
- super.start()
- totalExpectedExecutors = YarnSparkHadoopUtil.getInitialTargetExecutorNumber(sc.conf)
- }
-
- override def getDriverLogUrls: Option[Map[String, String]] = {
- var driverLogs: Option[Map[String, String]] = None
- try {
- val yarnConf = new YarnConfiguration(sc.hadoopConfiguration)
- val containerId = YarnSparkHadoopUtil.get.getContainerId
-
- val httpAddress = System.getenv(Environment.NM_HOST.name()) +
- ":" + System.getenv(Environment.NM_HTTP_PORT.name())
- // lookup appropriate http scheme for container log urls
- val yarnHttpPolicy = yarnConf.get(
- YarnConfiguration.YARN_HTTP_POLICY_KEY,
- YarnConfiguration.YARN_HTTP_POLICY_DEFAULT
- )
- val user = Utils.getCurrentUserName()
- val httpScheme = if (yarnHttpPolicy == "HTTPS_ONLY") "https://" else "http://"
- val baseUrl = s"$httpScheme$httpAddress/node/containerlogs/$containerId/$user"
- logDebug(s"Base URL for logs: $baseUrl")
- driverLogs = Some(Map(
- "stdout" -> s"$baseUrl/stdout?start=-4096",
- "stderr" -> s"$baseUrl/stderr?start=-4096"))
- } catch {
- case e: Exception =>
- logInfo("Error while building AM log links, so AM" +
- " logs link will not appear in application UI", e)
- }
- driverLogs
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnScheduler.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnScheduler.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnScheduler.scala
deleted file mode 100644
index 0293821..0000000
--- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnScheduler.scala
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster
-
-import org.apache.hadoop.yarn.util.RackResolver
-import org.apache.log4j.{Level, Logger}
-
-import org.apache.spark._
-import org.apache.spark.scheduler.TaskSchedulerImpl
-import org.apache.spark.util.Utils
-
-private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {
-
- // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
- if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
- Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
- }
-
- // By default, rack is unknown
- override def getRackForHost(hostPort: String): Option[String] = {
- val host = Utils.parseHostPort(hostPort)._1
- Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
deleted file mode 100644
index 2f9ea19..0000000
--- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster
-
-import scala.concurrent.{ExecutionContext, Future}
-import scala.util.{Failure, Success}
-import scala.util.control.NonFatal
-
-import org.apache.hadoop.yarn.api.records.{ApplicationAttemptId, ApplicationId}
-
-import org.apache.spark.SparkContext
-import org.apache.spark.internal.Logging
-import org.apache.spark.rpc._
-import org.apache.spark.scheduler._
-import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._
-import org.apache.spark.ui.JettyUtils
-import org.apache.spark.util.{RpcUtils, ThreadUtils}
-
-/**
- * Abstract Yarn scheduler backend that contains common logic
- * between the client and cluster Yarn scheduler backends.
- */
-private[spark] abstract class YarnSchedulerBackend(
- scheduler: TaskSchedulerImpl,
- sc: SparkContext)
- extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) {
-
- override val minRegisteredRatio =
- if (conf.getOption("spark.scheduler.minRegisteredResourcesRatio").isEmpty) {
- 0.8
- } else {
- super.minRegisteredRatio
- }
-
- protected var totalExpectedExecutors = 0
-
- private val yarnSchedulerEndpoint = new YarnSchedulerEndpoint(rpcEnv)
-
- private val yarnSchedulerEndpointRef = rpcEnv.setupEndpoint(
- YarnSchedulerBackend.ENDPOINT_NAME, yarnSchedulerEndpoint)
-
- private implicit val askTimeout = RpcUtils.askRpcTimeout(sc.conf)
-
- /** Application ID. */
- protected var appId: Option[ApplicationId] = None
-
- /** Attempt ID. This is unset for client-mode schedulers */
- private var attemptId: Option[ApplicationAttemptId] = None
-
- /** Scheduler extension services. */
- private val services: SchedulerExtensionServices = new SchedulerExtensionServices()
-
- // Flag to specify whether this schedulerBackend should be reset.
- private var shouldResetOnAmRegister = false
-
- /**
- * Bind to YARN. This *must* be done before calling [[start()]].
- *
- * @param appId YARN application ID
- * @param attemptId Optional YARN attempt ID
- */
- protected def bindToYarn(appId: ApplicationId, attemptId: Option[ApplicationAttemptId]): Unit = {
- this.appId = Some(appId)
- this.attemptId = attemptId
- }
-
- override def start() {
- require(appId.isDefined, "application ID unset")
- val binding = SchedulerExtensionServiceBinding(sc, appId.get, attemptId)
- services.start(binding)
- super.start()
- }
-
- override def stop(): Unit = {
- try {
- // SPARK-12009: To prevent Yarn allocator from requesting backup for the executors which
- // was Stopped by SchedulerBackend.
- requestTotalExecutors(0, 0, Map.empty)
- super.stop()
- } finally {
- services.stop()
- }
- }
-
- /**
- * Get the attempt ID for this run, if the cluster manager supports multiple
- * attempts. Applications run in client mode will not have attempt IDs.
- * This attempt ID only includes attempt counter, like "1", "2".
- *
- * @return The application attempt id, if available.
- */
- override def applicationAttemptId(): Option[String] = {
- attemptId.map(_.getAttemptId.toString)
- }
-
- /**
- * Get an application ID associated with the job.
- * This returns the string value of [[appId]] if set, otherwise
- * the locally-generated ID from the superclass.
- * @return The application ID
- */
- override def applicationId(): String = {
- appId.map(_.toString).getOrElse {
- logWarning("Application ID is not initialized yet.")
- super.applicationId
- }
- }
-
- /**
- * Request executors from the ApplicationMaster by specifying the total number desired.
- * This includes executors already pending or running.
- */
- override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = {
- yarnSchedulerEndpointRef.ask[Boolean](
- RequestExecutors(requestedTotal, localityAwareTasks, hostToLocalTaskCount))
- }
-
- /**
- * Request that the ApplicationMaster kill the specified executors.
- */
- override def doKillExecutors(executorIds: Seq[String]): Future[Boolean] = {
- yarnSchedulerEndpointRef.ask[Boolean](KillExecutors(executorIds))
- }
-
- override def sufficientResourcesRegistered(): Boolean = {
- totalRegisteredExecutors.get() >= totalExpectedExecutors * minRegisteredRatio
- }
-
- /**
- * Add filters to the SparkUI.
- */
- private def addWebUIFilter(
- filterName: String,
- filterParams: Map[String, String],
- proxyBase: String): Unit = {
- if (proxyBase != null && proxyBase.nonEmpty) {
- System.setProperty("spark.ui.proxyBase", proxyBase)
- }
-
- val hasFilter =
- filterName != null && filterName.nonEmpty &&
- filterParams != null && filterParams.nonEmpty
- if (hasFilter) {
- logInfo(s"Add WebUI Filter. $filterName, $filterParams, $proxyBase")
- conf.set("spark.ui.filters", filterName)
- filterParams.foreach { case (k, v) => conf.set(s"spark.$filterName.param.$k", v) }
- scheduler.sc.ui.foreach { ui => JettyUtils.addFilters(ui.getHandlers, conf) }
- }
- }
-
- override def createDriverEndpoint(properties: Seq[(String, String)]): DriverEndpoint = {
- new YarnDriverEndpoint(rpcEnv, properties)
- }
-
- /**
- * Reset the state of SchedulerBackend to the initial state. This is happened when AM is failed
- * and re-registered itself to driver after a failure. The stale state in driver should be
- * cleaned.
- */
- override protected def reset(): Unit = {
- super.reset()
- sc.executorAllocationManager.foreach(_.reset())
- }
-
- /**
- * Override the DriverEndpoint to add extra logic for the case when an executor is disconnected.
- * This endpoint communicates with the executors and queries the AM for an executor's exit
- * status when the executor is disconnected.
- */
- private class YarnDriverEndpoint(rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)])
- extends DriverEndpoint(rpcEnv, sparkProperties) {
-
- /**
- * When onDisconnected is received at the driver endpoint, the superclass DriverEndpoint
- * handles it by assuming the Executor was lost for a bad reason and removes the executor
- * immediately.
- *
- * In YARN's case however it is crucial to talk to the application master and ask why the
- * executor had exited. If the executor exited for some reason unrelated to the running tasks
- * (e.g., preemption), according to the application master, then we pass that information down
- * to the TaskSetManager to inform the TaskSetManager that tasks on that lost executor should
- * not count towards a job failure.
- */
- override def onDisconnected(rpcAddress: RpcAddress): Unit = {
- addressToExecutorId.get(rpcAddress).foreach { executorId =>
- if (disableExecutor(executorId)) {
- yarnSchedulerEndpoint.handleExecutorDisconnectedFromDriver(executorId, rpcAddress)
- }
- }
- }
- }
-
- /**
- * An [[RpcEndpoint]] that communicates with the ApplicationMaster.
- */
- private class YarnSchedulerEndpoint(override val rpcEnv: RpcEnv)
- extends ThreadSafeRpcEndpoint with Logging {
- private var amEndpoint: Option[RpcEndpointRef] = None
-
- private[YarnSchedulerBackend] def handleExecutorDisconnectedFromDriver(
- executorId: String,
- executorRpcAddress: RpcAddress): Unit = {
- val removeExecutorMessage = amEndpoint match {
- case Some(am) =>
- val lossReasonRequest = GetExecutorLossReason(executorId)
- am.ask[ExecutorLossReason](lossReasonRequest, askTimeout)
- .map { reason => RemoveExecutor(executorId, reason) }(ThreadUtils.sameThread)
- .recover {
- case NonFatal(e) =>
- logWarning(s"Attempted to get executor loss reason" +
- s" for executor id ${executorId} at RPC address ${executorRpcAddress}," +
- s" but got no response. Marking as slave lost.", e)
- RemoveExecutor(executorId, SlaveLost())
- }(ThreadUtils.sameThread)
- case None =>
- logWarning("Attempted to check for an executor loss reason" +
- " before the AM has registered!")
- Future.successful(RemoveExecutor(executorId, SlaveLost("AM is not yet registered.")))
- }
-
- removeExecutorMessage
- .flatMap { message =>
- driverEndpoint.ask[Boolean](message)
- }(ThreadUtils.sameThread)
- .onFailure {
- case NonFatal(e) => logError(
- s"Error requesting driver to remove executor $executorId after disconnection.", e)
- }(ThreadUtils.sameThread)
- }
-
- override def receive: PartialFunction[Any, Unit] = {
- case RegisterClusterManager(am) =>
- logInfo(s"ApplicationMaster registered as $am")
- amEndpoint = Option(am)
- if (!shouldResetOnAmRegister) {
- shouldResetOnAmRegister = true
- } else {
- // AM is already registered before, this potentially means that AM failed and
- // a new one registered after the failure. This will only happen in yarn-client mode.
- reset()
- }
-
- case AddWebUIFilter(filterName, filterParams, proxyBase) =>
- addWebUIFilter(filterName, filterParams, proxyBase)
-
- case r @ RemoveExecutor(executorId, reason) =>
- logWarning(reason.toString)
- driverEndpoint.ask[Boolean](r).onFailure {
- case e =>
- logError("Error requesting driver to remove executor" +
- s" $executorId for reason $reason", e)
- }(ThreadUtils.sameThread)
- }
-
-
- override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
- case r: RequestExecutors =>
- amEndpoint match {
- case Some(am) =>
- am.ask[Boolean](r).andThen {
- case Success(b) => context.reply(b)
- case Failure(NonFatal(e)) =>
- logError(s"Sending $r to AM was unsuccessful", e)
- context.sendFailure(e)
- }(ThreadUtils.sameThread)
- case None =>
- logWarning("Attempted to request executors before the AM has registered!")
- context.reply(false)
- }
-
- case k: KillExecutors =>
- amEndpoint match {
- case Some(am) =>
- am.ask[Boolean](k).andThen {
- case Success(b) => context.reply(b)
- case Failure(NonFatal(e)) =>
- logError(s"Sending $k to AM was unsuccessful", e)
- context.sendFailure(e)
- }(ThreadUtils.sameThread)
- case None =>
- logWarning("Attempted to kill executors before the AM has registered!")
- context.reply(false)
- }
-
- case RetrieveLastAllocatedExecutorId =>
- context.reply(currentExecutorIdCounter)
- }
-
- override def onDisconnected(remoteAddress: RpcAddress): Unit = {
- if (amEndpoint.exists(_.address == remoteAddress)) {
- logWarning(s"ApplicationMaster has disassociated: $remoteAddress")
- amEndpoint = None
- }
- }
- }
-}
-
-private[spark] object YarnSchedulerBackend {
- val ENDPOINT_NAME = "YarnScheduler"
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/resources/META-INF/services/org.apache.spark.deploy.yarn.security.ServiceCredentialProvider
----------------------------------------------------------------------
diff --git a/yarn/src/test/resources/META-INF/services/org.apache.spark.deploy.yarn.security.ServiceCredentialProvider b/yarn/src/test/resources/META-INF/services/org.apache.spark.deploy.yarn.security.ServiceCredentialProvider
deleted file mode 100644
index d0ef5ef..0000000
--- a/yarn/src/test/resources/META-INF/services/org.apache.spark.deploy.yarn.security.ServiceCredentialProvider
+++ /dev/null
@@ -1 +0,0 @@
-org.apache.spark.deploy.yarn.security.TestCredentialProvider
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/yarn/src/test/resources/log4j.properties b/yarn/src/test/resources/log4j.properties
deleted file mode 100644
index d13454d..0000000
--- a/yarn/src/test/resources/log4j.properties
+++ /dev/null
@@ -1,31 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Set everything to be logged to the file target/unit-tests.log
-log4j.rootCategory=DEBUG, file
-log4j.appender.file=org.apache.log4j.FileAppender
-log4j.appender.file.append=true
-log4j.appender.file.file=target/unit-tests.log
-log4j.appender.file.layout=org.apache.log4j.PatternLayout
-log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
-
-# Ignore messages below warning level from a few verbose libraries.
-log4j.logger.com.sun.jersey=WARN
-log4j.logger.org.apache.hadoop=WARN
-log4j.logger.org.eclipse.jetty=WARN
-log4j.logger.org.mortbay=WARN
-log4j.logger.org.spark_project.jetty=WARN
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
deleted file mode 100644
index 9c3b18e..0000000
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import java.io.{File, FileOutputStream, OutputStreamWriter}
-import java.nio.charset.StandardCharsets
-import java.util.Properties
-import java.util.concurrent.TimeUnit
-
-import scala.collection.JavaConverters._
-import scala.concurrent.duration._
-import scala.language.postfixOps
-
-import com.google.common.io.Files
-import org.apache.commons.lang3.SerializationUtils
-import org.apache.hadoop.yarn.conf.YarnConfiguration
-import org.apache.hadoop.yarn.server.MiniYARNCluster
-import org.scalatest.{BeforeAndAfterAll, Matchers}
-import org.scalatest.concurrent.Eventually._
-
-import org.apache.spark._
-import org.apache.spark.deploy.yarn.config._
-import org.apache.spark.internal.Logging
-import org.apache.spark.launcher._
-import org.apache.spark.util.Utils
-
-abstract class BaseYarnClusterSuite
- extends SparkFunSuite with BeforeAndAfterAll with Matchers with Logging {
-
- // log4j configuration for the YARN containers, so that their output is collected
- // by YARN instead of trying to overwrite unit-tests.log.
- protected val LOG4J_CONF = """
- |log4j.rootCategory=DEBUG, console
- |log4j.appender.console=org.apache.log4j.ConsoleAppender
- |log4j.appender.console.target=System.err
- |log4j.appender.console.layout=org.apache.log4j.PatternLayout
- |log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
- |log4j.logger.org.apache.hadoop=WARN
- |log4j.logger.org.eclipse.jetty=WARN
- |log4j.logger.org.mortbay=WARN
- |log4j.logger.org.spark_project.jetty=WARN
- """.stripMargin
-
- private var yarnCluster: MiniYARNCluster = _
- protected var tempDir: File = _
- private var fakeSparkJar: File = _
- protected var hadoopConfDir: File = _
- private var logConfDir: File = _
-
- var oldSystemProperties: Properties = null
-
- def newYarnConfig(): YarnConfiguration
-
- override def beforeAll() {
- super.beforeAll()
- oldSystemProperties = SerializationUtils.clone(System.getProperties)
-
- tempDir = Utils.createTempDir()
- logConfDir = new File(tempDir, "log4j")
- logConfDir.mkdir()
- System.setProperty("SPARK_YARN_MODE", "true")
-
- val logConfFile = new File(logConfDir, "log4j.properties")
- Files.write(LOG4J_CONF, logConfFile, StandardCharsets.UTF_8)
-
- // Disable the disk utilization check to avoid the test hanging when people's disks are
- // getting full.
- val yarnConf = newYarnConfig()
- yarnConf.set("yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage",
- "100.0")
-
- yarnCluster = new MiniYARNCluster(getClass().getName(), 1, 1, 1)
- yarnCluster.init(yarnConf)
- yarnCluster.start()
-
- // There's a race in MiniYARNCluster in which start() may return before the RM has updated
- // its address in the configuration. You can see this in the logs by noticing that when
- // MiniYARNCluster prints the address, it still has port "0" assigned, although later the
- // test works sometimes:
- //
- // INFO MiniYARNCluster: MiniYARN ResourceManager address: blah:0
- //
- // That log message prints the contents of the RM_ADDRESS config variable. If you check it
- // later on, it looks something like this:
- //
- // INFO YarnClusterSuite: RM address in configuration is blah:42631
- //
- // This hack loops for a bit waiting for the port to change, and fails the test if it hasn't
- // done so in a timely manner (defined to be 10 seconds).
- val config = yarnCluster.getConfig()
- val deadline = System.currentTimeMillis() + TimeUnit.SECONDS.toMillis(10)
- while (config.get(YarnConfiguration.RM_ADDRESS).split(":")(1) == "0") {
- if (System.currentTimeMillis() > deadline) {
- throw new IllegalStateException("Timed out waiting for RM to come up.")
- }
- logDebug("RM address still not set in configuration, waiting...")
- TimeUnit.MILLISECONDS.sleep(100)
- }
-
- logInfo(s"RM address in configuration is ${config.get(YarnConfiguration.RM_ADDRESS)}")
-
- fakeSparkJar = File.createTempFile("sparkJar", null, tempDir)
- hadoopConfDir = new File(tempDir, Client.LOCALIZED_CONF_DIR)
- assert(hadoopConfDir.mkdir())
- File.createTempFile("token", ".txt", hadoopConfDir)
- }
-
- override def afterAll() {
- try {
- yarnCluster.stop()
- } finally {
- System.setProperties(oldSystemProperties)
- super.afterAll()
- }
- }
-
- protected def runSpark(
- clientMode: Boolean,
- klass: String,
- appArgs: Seq[String] = Nil,
- sparkArgs: Seq[(String, String)] = Nil,
- extraClassPath: Seq[String] = Nil,
- extraJars: Seq[String] = Nil,
- extraConf: Map[String, String] = Map(),
- extraEnv: Map[String, String] = Map()): SparkAppHandle.State = {
- val deployMode = if (clientMode) "client" else "cluster"
- val propsFile = createConfFile(extraClassPath = extraClassPath, extraConf = extraConf)
- val env = Map("YARN_CONF_DIR" -> hadoopConfDir.getAbsolutePath()) ++ extraEnv
-
- val launcher = new SparkLauncher(env.asJava)
- if (klass.endsWith(".py")) {
- launcher.setAppResource(klass)
- } else {
- launcher.setMainClass(klass)
- launcher.setAppResource(fakeSparkJar.getAbsolutePath())
- }
- launcher.setSparkHome(sys.props("spark.test.home"))
- .setMaster("yarn")
- .setDeployMode(deployMode)
- .setConf("spark.executor.instances", "1")
- .setPropertiesFile(propsFile)
- .addAppArgs(appArgs.toArray: _*)
-
- sparkArgs.foreach { case (name, value) =>
- if (value != null) {
- launcher.addSparkArg(name, value)
- } else {
- launcher.addSparkArg(name)
- }
- }
- extraJars.foreach(launcher.addJar)
-
- val handle = launcher.startApplication()
- try {
- eventually(timeout(2 minutes), interval(1 second)) {
- assert(handle.getState().isFinal())
- }
- } finally {
- handle.kill()
- }
-
- handle.getState()
- }
-
- /**
- * This is a workaround for an issue with yarn-cluster mode: the Client class will not provide
- * any sort of error when the job process finishes successfully, but the job itself fails. So
- * the tests enforce that something is written to a file after everything is ok to indicate
- * that the job succeeded.
- */
- protected def checkResult(finalState: SparkAppHandle.State, result: File): Unit = {
- checkResult(finalState, result, "success")
- }
-
- protected def checkResult(
- finalState: SparkAppHandle.State,
- result: File,
- expected: String): Unit = {
- finalState should be (SparkAppHandle.State.FINISHED)
- val resultString = Files.toString(result, StandardCharsets.UTF_8)
- resultString should be (expected)
- }
-
- protected def mainClassName(klass: Class[_]): String = {
- klass.getName().stripSuffix("$")
- }
-
- protected def createConfFile(
- extraClassPath: Seq[String] = Nil,
- extraConf: Map[String, String] = Map()): String = {
- val props = new Properties()
- props.put(SPARK_JARS.key, "local:" + fakeSparkJar.getAbsolutePath())
-
- val testClasspath = new TestClasspathBuilder()
- .buildClassPath(
- logConfDir.getAbsolutePath() +
- File.pathSeparator +
- extraClassPath.mkString(File.pathSeparator))
- .asScala
- .mkString(File.pathSeparator)
-
- props.put("spark.driver.extraClassPath", testClasspath)
- props.put("spark.executor.extraClassPath", testClasspath)
-
- // SPARK-4267: make sure java options are propagated correctly.
- props.setProperty("spark.driver.extraJavaOptions", "-Dfoo=\"one two three\"")
- props.setProperty("spark.executor.extraJavaOptions", "-Dfoo=\"one two three\"")
-
- yarnCluster.getConfig().asScala.foreach { e =>
- props.setProperty("spark.hadoop." + e.getKey(), e.getValue())
- }
- sys.props.foreach { case (k, v) =>
- if (k.startsWith("spark.")) {
- props.setProperty(k, v)
- }
- }
- extraConf.foreach { case (k, v) => props.setProperty(k, v) }
-
- val propsFile = File.createTempFile("spark", ".properties", tempDir)
- val writer = new OutputStreamWriter(new FileOutputStream(propsFile), StandardCharsets.UTF_8)
- props.store(writer, "Spark properties.")
- writer.close()
- propsFile.getAbsolutePath()
- }
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala
deleted file mode 100644
index b696e08..0000000
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManagerSuite.scala
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import java.net.URI
-
-import scala.collection.mutable.HashMap
-import scala.collection.mutable.Map
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.FileStatus
-import org.apache.hadoop.fs.FileSystem
-import org.apache.hadoop.fs.Path
-import org.apache.hadoop.yarn.api.records.LocalResource
-import org.apache.hadoop.yarn.api.records.LocalResourceType
-import org.apache.hadoop.yarn.api.records.LocalResourceVisibility
-import org.apache.hadoop.yarn.util.ConverterUtils
-import org.mockito.Mockito.when
-import org.scalatest.mock.MockitoSugar
-
-import org.apache.spark.{SparkConf, SparkFunSuite}
-import org.apache.spark.deploy.yarn.config._
-
-class ClientDistributedCacheManagerSuite extends SparkFunSuite with MockitoSugar {
-
- class MockClientDistributedCacheManager extends ClientDistributedCacheManager {
- override def getVisibility(conf: Configuration, uri: URI, statCache: Map[URI, FileStatus]):
- LocalResourceVisibility = {
- LocalResourceVisibility.PRIVATE
- }
- }
-
- test("test getFileStatus empty") {
- val distMgr = new ClientDistributedCacheManager()
- val fs = mock[FileSystem]
- val uri = new URI("/tmp/testing")
- when(fs.getFileStatus(new Path(uri))).thenReturn(new FileStatus())
- val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus]()
- val stat = distMgr.getFileStatus(fs, uri, statCache)
- assert(stat.getPath() === null)
- }
-
- test("test getFileStatus cached") {
- val distMgr = new ClientDistributedCacheManager()
- val fs = mock[FileSystem]
- val uri = new URI("/tmp/testing")
- val realFileStatus = new FileStatus(10, false, 1, 1024, 10, 10, null, "testOwner",
- null, new Path("/tmp/testing"))
- when(fs.getFileStatus(new Path(uri))).thenReturn(new FileStatus())
- val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus](uri -> realFileStatus)
- val stat = distMgr.getFileStatus(fs, uri, statCache)
- assert(stat.getPath().toString() === "/tmp/testing")
- }
-
- test("test addResource") {
- val distMgr = new MockClientDistributedCacheManager()
- val fs = mock[FileSystem]
- val conf = new Configuration()
- val destPath = new Path("file:///foo.invalid.com:8080/tmp/testing")
- val localResources = HashMap[String, LocalResource]()
- val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus]()
- when(fs.getFileStatus(destPath)).thenReturn(new FileStatus())
-
- distMgr.addResource(fs, conf, destPath, localResources, LocalResourceType.FILE, "link",
- statCache, false)
- val resource = localResources("link")
- assert(resource.getVisibility() === LocalResourceVisibility.PRIVATE)
- assert(ConverterUtils.getPathFromYarnURL(resource.getResource()) === destPath)
- assert(resource.getTimestamp() === 0)
- assert(resource.getSize() === 0)
- assert(resource.getType() === LocalResourceType.FILE)
-
- val sparkConf = new SparkConf(false)
- distMgr.updateConfiguration(sparkConf)
- assert(sparkConf.get(CACHED_FILES) === Seq("file:/foo.invalid.com:8080/tmp/testing#link"))
- assert(sparkConf.get(CACHED_FILES_TIMESTAMPS) === Seq(0L))
- assert(sparkConf.get(CACHED_FILES_SIZES) === Seq(0L))
- assert(sparkConf.get(CACHED_FILES_VISIBILITIES) === Seq(LocalResourceVisibility.PRIVATE.name()))
- assert(sparkConf.get(CACHED_FILES_TYPES) === Seq(LocalResourceType.FILE.name()))
-
- // add another one and verify both there and order correct
- val realFileStatus = new FileStatus(20, false, 1, 1024, 10, 30, null, "testOwner",
- null, new Path("/tmp/testing2"))
- val destPath2 = new Path("file:///foo.invalid.com:8080/tmp/testing2")
- when(fs.getFileStatus(destPath2)).thenReturn(realFileStatus)
- distMgr.addResource(fs, conf, destPath2, localResources, LocalResourceType.FILE, "link2",
- statCache, false)
- val resource2 = localResources("link2")
- assert(resource2.getVisibility() === LocalResourceVisibility.PRIVATE)
- assert(ConverterUtils.getPathFromYarnURL(resource2.getResource()) === destPath2)
- assert(resource2.getTimestamp() === 10)
- assert(resource2.getSize() === 20)
- assert(resource2.getType() === LocalResourceType.FILE)
-
- val sparkConf2 = new SparkConf(false)
- distMgr.updateConfiguration(sparkConf2)
-
- val files = sparkConf2.get(CACHED_FILES)
- val sizes = sparkConf2.get(CACHED_FILES_SIZES)
- val timestamps = sparkConf2.get(CACHED_FILES_TIMESTAMPS)
- val visibilities = sparkConf2.get(CACHED_FILES_VISIBILITIES)
-
- assert(files(0) === "file:/foo.invalid.com:8080/tmp/testing#link")
- assert(timestamps(0) === 0)
- assert(sizes(0) === 0)
- assert(visibilities(0) === LocalResourceVisibility.PRIVATE.name())
-
- assert(files(1) === "file:/foo.invalid.com:8080/tmp/testing2#link2")
- assert(timestamps(1) === 10)
- assert(sizes(1) === 20)
- assert(visibilities(1) === LocalResourceVisibility.PRIVATE.name())
- }
-
- test("test addResource link null") {
- val distMgr = new MockClientDistributedCacheManager()
- val fs = mock[FileSystem]
- val conf = new Configuration()
- val destPath = new Path("file:///foo.invalid.com:8080/tmp/testing")
- val localResources = HashMap[String, LocalResource]()
- val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus]()
- when(fs.getFileStatus(destPath)).thenReturn(new FileStatus())
-
- intercept[Exception] {
- distMgr.addResource(fs, conf, destPath, localResources, LocalResourceType.FILE, null,
- statCache, false)
- }
- assert(localResources.get("link") === None)
- assert(localResources.size === 0)
- }
-
- test("test addResource appmaster only") {
- val distMgr = new MockClientDistributedCacheManager()
- val fs = mock[FileSystem]
- val conf = new Configuration()
- val destPath = new Path("file:///foo.invalid.com:8080/tmp/testing")
- val localResources = HashMap[String, LocalResource]()
- val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus]()
- val realFileStatus = new FileStatus(20, false, 1, 1024, 10, 30, null, "testOwner",
- null, new Path("/tmp/testing"))
- when(fs.getFileStatus(destPath)).thenReturn(realFileStatus)
-
- distMgr.addResource(fs, conf, destPath, localResources, LocalResourceType.ARCHIVE, "link",
- statCache, true)
- val resource = localResources("link")
- assert(resource.getVisibility() === LocalResourceVisibility.PRIVATE)
- assert(ConverterUtils.getPathFromYarnURL(resource.getResource()) === destPath)
- assert(resource.getTimestamp() === 10)
- assert(resource.getSize() === 20)
- assert(resource.getType() === LocalResourceType.ARCHIVE)
-
- val sparkConf = new SparkConf(false)
- distMgr.updateConfiguration(sparkConf)
- assert(sparkConf.get(CACHED_FILES) === Nil)
- assert(sparkConf.get(CACHED_FILES_TIMESTAMPS) === Nil)
- assert(sparkConf.get(CACHED_FILES_SIZES) === Nil)
- assert(sparkConf.get(CACHED_FILES_VISIBILITIES) === Nil)
- assert(sparkConf.get(CACHED_FILES_TYPES) === Nil)
- }
-
- test("test addResource archive") {
- val distMgr = new MockClientDistributedCacheManager()
- val fs = mock[FileSystem]
- val conf = new Configuration()
- val destPath = new Path("file:///foo.invalid.com:8080/tmp/testing")
- val localResources = HashMap[String, LocalResource]()
- val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus]()
- val realFileStatus = new FileStatus(20, false, 1, 1024, 10, 30, null, "testOwner",
- null, new Path("/tmp/testing"))
- when(fs.getFileStatus(destPath)).thenReturn(realFileStatus)
-
- distMgr.addResource(fs, conf, destPath, localResources, LocalResourceType.ARCHIVE, "link",
- statCache, false)
- val resource = localResources("link")
- assert(resource.getVisibility() === LocalResourceVisibility.PRIVATE)
- assert(ConverterUtils.getPathFromYarnURL(resource.getResource()) === destPath)
- assert(resource.getTimestamp() === 10)
- assert(resource.getSize() === 20)
- assert(resource.getType() === LocalResourceType.ARCHIVE)
-
- val sparkConf = new SparkConf(false)
- distMgr.updateConfiguration(sparkConf)
- assert(sparkConf.get(CACHED_FILES) === Seq("file:/foo.invalid.com:8080/tmp/testing#link"))
- assert(sparkConf.get(CACHED_FILES_SIZES) === Seq(20L))
- assert(sparkConf.get(CACHED_FILES_TIMESTAMPS) === Seq(10L))
- assert(sparkConf.get(CACHED_FILES_VISIBILITIES) === Seq(LocalResourceVisibility.PRIVATE.name()))
- assert(sparkConf.get(CACHED_FILES_TYPES) === Seq(LocalResourceType.ARCHIVE.name()))
- }
-
-}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org
[09/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Posted by va...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/AMCredentialRenewer.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/AMCredentialRenewer.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/AMCredentialRenewer.scala
new file mode 100644
index 0000000..7e76f40
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/AMCredentialRenewer.scala
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.deploy.yarn.security
+
+import java.security.PrivilegedExceptionAction
+import java.util.concurrent.{Executors, TimeUnit}
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.hadoop.security.UserGroupInformation
+
+import org.apache.spark.SparkConf
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil
+import org.apache.spark.deploy.yarn.config._
+import org.apache.spark.internal.Logging
+import org.apache.spark.internal.config._
+import org.apache.spark.util.ThreadUtils
+
+/**
+ * The following methods are primarily meant to make sure long-running apps like Spark
+ * Streaming apps can run without interruption while accessing secured services. The
+ * scheduleLoginFromKeytab method is called on the AM to get the new credentials.
+ * This method wakes up a thread that logs into the KDC
+ * once 75% of the renewal interval of the original credentials used for the container
+ * has elapsed. It then obtains new credentials and writes them to HDFS in a
+ * pre-specified location - the prefix of which is specified in the sparkConf by
+ * spark.yarn.credentials.file (so the file(s) would be named c-timestamp1-1, c-timestamp2-2 etc.
+ * - each update goes to a new file, with a monotonically increasing suffix), also the
+ * timestamp1, timestamp2 here indicates the time of next update for CredentialUpdater.
+ * After this, the credentials are renewed once 75% of the new tokens renewal interval has elapsed.
+ *
+ * On the executor and driver (yarn client mode) side, the updateCredentialsIfRequired method is
+ * called once 80% of the validity of the original credentials has elapsed. At that time the
+ * executor finds the credentials file with the latest timestamp and checks if it has read those
+ * credentials before (by keeping track of the suffix of the last file it read). If a new file has
+ * appeared, it will read the credentials and update the currently running UGI with it. This
+ * process happens again once 80% of the validity of this has expired.
+ */
+private[yarn] class AMCredentialRenewer(
+ sparkConf: SparkConf,
+ hadoopConf: Configuration,
+ credentialManager: ConfigurableCredentialManager) extends Logging {
+
+ private var lastCredentialsFileSuffix = 0
+
+ private val credentialRenewer =
+ Executors.newSingleThreadScheduledExecutor(
+ ThreadUtils.namedThreadFactory("Credential Refresh Thread"))
+
+ private val hadoopUtil = YarnSparkHadoopUtil.get
+
+ private val credentialsFile = sparkConf.get(CREDENTIALS_FILE_PATH)
+ private val daysToKeepFiles = sparkConf.get(CREDENTIALS_FILE_MAX_RETENTION)
+ private val numFilesToKeep = sparkConf.get(CREDENTIAL_FILE_MAX_COUNT)
+ private val freshHadoopConf =
+ hadoopUtil.getConfBypassingFSCache(hadoopConf, new Path(credentialsFile).toUri.getScheme)
+
+ @volatile private var timeOfNextRenewal = sparkConf.get(CREDENTIALS_RENEWAL_TIME)
+
+ /**
+ * Schedule a login from the keytab and principal set using the --principal and --keytab
+ * arguments to spark-submit. This login happens only when the credentials of the current user
+ * are about to expire. This method reads spark.yarn.principal and spark.yarn.keytab from
+ * SparkConf to do the login. This method is a no-op in non-YARN mode.
+ *
+ */
+ private[spark] def scheduleLoginFromKeytab(): Unit = {
+ val principal = sparkConf.get(PRINCIPAL).get
+ val keytab = sparkConf.get(KEYTAB).get
+
+ /**
+ * Schedule re-login and creation of new credentials. If credentials have already expired, this
+ * method will synchronously create new ones.
+ */
+ def scheduleRenewal(runnable: Runnable): Unit = {
+ // Run now!
+ val remainingTime = timeOfNextRenewal - System.currentTimeMillis()
+ if (remainingTime <= 0) {
+ logInfo("Credentials have expired, creating new ones now.")
+ runnable.run()
+ } else {
+ logInfo(s"Scheduling login from keytab in $remainingTime millis.")
+ credentialRenewer.schedule(runnable, remainingTime, TimeUnit.MILLISECONDS)
+ }
+ }
+
+ // This thread periodically runs on the AM to update the credentials on HDFS.
+ val credentialRenewerRunnable =
+ new Runnable {
+ override def run(): Unit = {
+ try {
+ writeNewCredentialsToHDFS(principal, keytab)
+ cleanupOldFiles()
+ } catch {
+ case e: Exception =>
+ // Log the error and try to write new tokens back in an hour
+ logWarning("Failed to write out new credentials to HDFS, will try again in an " +
+ "hour! If this happens too often tasks will fail.", e)
+ credentialRenewer.schedule(this, 1, TimeUnit.HOURS)
+ return
+ }
+ scheduleRenewal(this)
+ }
+ }
+ // Schedule update of credentials. This handles the case of updating the credentials right now
+ // as well, since the renewal interval will be 0, and the thread will get scheduled
+ // immediately.
+ scheduleRenewal(credentialRenewerRunnable)
+ }
+
+ // Keeps only files that are newer than daysToKeepFiles days, and deletes everything else. At
+ // least numFilesToKeep files are kept for safety
+ private def cleanupOldFiles(): Unit = {
+ import scala.concurrent.duration._
+ try {
+ val remoteFs = FileSystem.get(freshHadoopConf)
+ val credentialsPath = new Path(credentialsFile)
+ val thresholdTime = System.currentTimeMillis() - (daysToKeepFiles.days).toMillis
+ hadoopUtil.listFilesSorted(
+ remoteFs, credentialsPath.getParent,
+ credentialsPath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION)
+ .dropRight(numFilesToKeep)
+ .takeWhile(_.getModificationTime < thresholdTime)
+ .foreach(x => remoteFs.delete(x.getPath, true))
+ } catch {
+ // Such errors are not fatal, so don't throw. Make sure they are logged though
+ case e: Exception =>
+ logWarning("Error while attempting to cleanup old credentials. If you are seeing many " +
+ "such warnings there may be an issue with your HDFS cluster.", e)
+ }
+ }
+
+ private def writeNewCredentialsToHDFS(principal: String, keytab: String): Unit = {
+ // Keytab is copied by YARN to the working directory of the AM, so full path is
+ // not needed.
+
+ // HACK:
+ // HDFS will not issue new delegation tokens, if the Credentials object
+ // passed in already has tokens for that FS even if the tokens are expired (it really only
+ // checks if there are tokens for the service, and not if they are valid). So the only real
+ // way to get new tokens is to make sure a different Credentials object is used each time to
+ // get new tokens and then the new tokens are copied over the current user's Credentials.
+ // So:
+ // - we login as a different user and get the UGI
+ // - use that UGI to get the tokens (see doAs block below)
+ // - copy the tokens over to the current user's credentials (this will overwrite the tokens
+ // in the current user's Credentials object for this FS).
+ // The login to KDC happens each time new tokens are required, but this is rare enough to not
+ // have to worry about (like once every day or so). This makes this code clearer than having
+ // to login and then relogin every time (the HDFS API may not relogin since we don't use this
+ // UGI directly for HDFS communication.
+ logInfo(s"Attempting to login to KDC using principal: $principal")
+ val keytabLoggedInUGI = UserGroupInformation.loginUserFromKeytabAndReturnUGI(principal, keytab)
+ logInfo("Successfully logged into KDC.")
+ val tempCreds = keytabLoggedInUGI.getCredentials
+ val credentialsPath = new Path(credentialsFile)
+ val dst = credentialsPath.getParent
+ var nearestNextRenewalTime = Long.MaxValue
+ keytabLoggedInUGI.doAs(new PrivilegedExceptionAction[Void] {
+ // Get a copy of the credentials
+ override def run(): Void = {
+ nearestNextRenewalTime = credentialManager.obtainCredentials(freshHadoopConf, tempCreds)
+ null
+ }
+ })
+
+ val currTime = System.currentTimeMillis()
+ val timeOfNextUpdate = if (nearestNextRenewalTime <= currTime) {
+ // If next renewal time is earlier than current time, we set next renewal time to current
+ // time, this will trigger next renewal immediately. Also set next update time to current
+ // time. There still has a gap between token renewal and update will potentially introduce
+ // issue.
+ logWarning(s"Next credential renewal time ($nearestNextRenewalTime) is earlier than " +
+ s"current time ($currTime), which is unexpected, please check your credential renewal " +
+ "related configurations in the target services.")
+ timeOfNextRenewal = currTime
+ currTime
+ } else {
+ // Next valid renewal time is about 75% of credential renewal time, and update time is
+ // slightly later than valid renewal time (80% of renewal time).
+ timeOfNextRenewal = ((nearestNextRenewalTime - currTime) * 0.75 + currTime).toLong
+ ((nearestNextRenewalTime - currTime) * 0.8 + currTime).toLong
+ }
+
+ // Add the temp credentials back to the original ones.
+ UserGroupInformation.getCurrentUser.addCredentials(tempCreds)
+ val remoteFs = FileSystem.get(freshHadoopConf)
+ // If lastCredentialsFileSuffix is 0, then the AM is either started or restarted. If the AM
+ // was restarted, then the lastCredentialsFileSuffix might be > 0, so find the newest file
+ // and update the lastCredentialsFileSuffix.
+ if (lastCredentialsFileSuffix == 0) {
+ hadoopUtil.listFilesSorted(
+ remoteFs, credentialsPath.getParent,
+ credentialsPath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION)
+ .lastOption.foreach { status =>
+ lastCredentialsFileSuffix = hadoopUtil.getSuffixForCredentialsPath(status.getPath)
+ }
+ }
+ val nextSuffix = lastCredentialsFileSuffix + 1
+
+ val tokenPathStr =
+ credentialsFile + SparkHadoopUtil.SPARK_YARN_CREDS_COUNTER_DELIM +
+ timeOfNextUpdate.toLong.toString + SparkHadoopUtil.SPARK_YARN_CREDS_COUNTER_DELIM +
+ nextSuffix
+ val tokenPath = new Path(tokenPathStr)
+ val tempTokenPath = new Path(tokenPathStr + SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION)
+
+ logInfo("Writing out delegation tokens to " + tempTokenPath.toString)
+ val credentials = UserGroupInformation.getCurrentUser.getCredentials
+ credentials.writeTokenStorageFile(tempTokenPath, freshHadoopConf)
+ logInfo(s"Delegation Tokens written out successfully. Renaming file to $tokenPathStr")
+ remoteFs.rename(tempTokenPath, tokenPath)
+ logInfo("Delegation token file rename complete.")
+ lastCredentialsFileSuffix = nextSuffix
+ }
+
+ def stop(): Unit = {
+ credentialRenewer.shutdown()
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/ConfigurableCredentialManager.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/ConfigurableCredentialManager.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/ConfigurableCredentialManager.scala
new file mode 100644
index 0000000..c4c07b4
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/ConfigurableCredentialManager.scala
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn.security
+
+import java.util.ServiceLoader
+
+import scala.collection.JavaConverters._
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.security.Credentials
+
+import org.apache.spark.SparkConf
+import org.apache.spark.internal.Logging
+import org.apache.spark.util.Utils
+
+/**
+ * A ConfigurableCredentialManager to manage all the registered credential providers and offer
+ * APIs for other modules to obtain credentials as well as renewal time. By default
+ * [[HDFSCredentialProvider]], [[HiveCredentialProvider]] and [[HBaseCredentialProvider]] will
+ * be loaded in if not explicitly disabled, any plugged-in credential provider wants to be
+ * managed by ConfigurableCredentialManager needs to implement [[ServiceCredentialProvider]]
+ * interface and put into resources/META-INF/services to be loaded by ServiceLoader.
+ *
+ * Also each credential provider is controlled by
+ * spark.yarn.security.credentials.{service}.enabled, it will not be loaded in if set to false.
+ */
+private[yarn] final class ConfigurableCredentialManager(
+ sparkConf: SparkConf, hadoopConf: Configuration) extends Logging {
+ private val deprecatedProviderEnabledConfig = "spark.yarn.security.tokens.%s.enabled"
+ private val providerEnabledConfig = "spark.yarn.security.credentials.%s.enabled"
+
+ // Maintain all the registered credential providers
+ private val credentialProviders = {
+ val providers = ServiceLoader.load(classOf[ServiceCredentialProvider],
+ Utils.getContextOrSparkClassLoader).asScala
+
+ // Filter out credentials in which spark.yarn.security.credentials.{service}.enabled is false.
+ providers.filter { p =>
+ sparkConf.getOption(providerEnabledConfig.format(p.serviceName))
+ .orElse {
+ sparkConf.getOption(deprecatedProviderEnabledConfig.format(p.serviceName)).map { c =>
+ logWarning(s"${deprecatedProviderEnabledConfig.format(p.serviceName)} is deprecated, " +
+ s"using ${providerEnabledConfig.format(p.serviceName)} instead")
+ c
+ }
+ }.map(_.toBoolean).getOrElse(true)
+ }.map { p => (p.serviceName, p) }.toMap
+ }
+
+ /**
+ * Get credential provider for the specified service.
+ */
+ def getServiceCredentialProvider(service: String): Option[ServiceCredentialProvider] = {
+ credentialProviders.get(service)
+ }
+
+ /**
+ * Obtain credentials from all the registered providers.
+ * @return nearest time of next renewal, Long.MaxValue if all the credentials aren't renewable,
+ * otherwise the nearest renewal time of any credentials will be returned.
+ */
+ def obtainCredentials(hadoopConf: Configuration, creds: Credentials): Long = {
+ credentialProviders.values.flatMap { provider =>
+ if (provider.credentialsRequired(hadoopConf)) {
+ provider.obtainCredentials(hadoopConf, sparkConf, creds)
+ } else {
+ logDebug(s"Service ${provider.serviceName} does not require a token." +
+ s" Check your configuration to see if security is disabled or not.")
+ None
+ }
+ }.foldLeft(Long.MaxValue)(math.min)
+ }
+
+ /**
+ * Create an [[AMCredentialRenewer]] instance, caller should be responsible to stop this
+ * instance when it is not used. AM will use it to renew credentials periodically.
+ */
+ def credentialRenewer(): AMCredentialRenewer = {
+ new AMCredentialRenewer(sparkConf, hadoopConf, this)
+ }
+
+ /**
+ * Create an [[CredentialUpdater]] instance, caller should be resposible to stop this intance
+ * when it is not used. Executors and driver (client mode) will use it to update credentials.
+ * periodically.
+ */
+ def credentialUpdater(): CredentialUpdater = {
+ new CredentialUpdater(sparkConf, hadoopConf, this)
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/CredentialUpdater.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/CredentialUpdater.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/CredentialUpdater.scala
new file mode 100644
index 0000000..5df4fbd
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/CredentialUpdater.scala
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn.security
+
+import java.util.concurrent.{Executors, TimeUnit}
+
+import scala.util.control.NonFatal
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.hadoop.security.{Credentials, UserGroupInformation}
+
+import org.apache.spark.SparkConf
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.deploy.yarn.config._
+import org.apache.spark.internal.Logging
+import org.apache.spark.util.{ThreadUtils, Utils}
+
+private[spark] class CredentialUpdater(
+ sparkConf: SparkConf,
+ hadoopConf: Configuration,
+ credentialManager: ConfigurableCredentialManager) extends Logging {
+
+ @volatile private var lastCredentialsFileSuffix = 0
+
+ private val credentialsFile = sparkConf.get(CREDENTIALS_FILE_PATH)
+ private val freshHadoopConf =
+ SparkHadoopUtil.get.getConfBypassingFSCache(
+ hadoopConf, new Path(credentialsFile).toUri.getScheme)
+
+ private val credentialUpdater =
+ Executors.newSingleThreadScheduledExecutor(
+ ThreadUtils.namedThreadFactory("Credential Refresh Thread"))
+
+ // This thread wakes up and picks up new credentials from HDFS, if any.
+ private val credentialUpdaterRunnable =
+ new Runnable {
+ override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired())
+ }
+
+ /** Start the credential updater task */
+ def start(): Unit = {
+ val startTime = sparkConf.get(CREDENTIALS_RENEWAL_TIME)
+ val remainingTime = startTime - System.currentTimeMillis()
+ if (remainingTime <= 0) {
+ credentialUpdater.schedule(credentialUpdaterRunnable, 1, TimeUnit.MINUTES)
+ } else {
+ logInfo(s"Scheduling credentials refresh from HDFS in $remainingTime millis.")
+ credentialUpdater.schedule(credentialUpdaterRunnable, remainingTime, TimeUnit.MILLISECONDS)
+ }
+ }
+
+ private def updateCredentialsIfRequired(): Unit = {
+ val timeToNextUpdate = try {
+ val credentialsFilePath = new Path(credentialsFile)
+ val remoteFs = FileSystem.get(freshHadoopConf)
+ SparkHadoopUtil.get.listFilesSorted(
+ remoteFs, credentialsFilePath.getParent,
+ credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION)
+ .lastOption.map { credentialsStatus =>
+ val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath)
+ if (suffix > lastCredentialsFileSuffix) {
+ logInfo("Reading new credentials from " + credentialsStatus.getPath)
+ val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath)
+ lastCredentialsFileSuffix = suffix
+ UserGroupInformation.getCurrentUser.addCredentials(newCredentials)
+ logInfo("Credentials updated from credentials file.")
+
+ val remainingTime = getTimeOfNextUpdateFromFileName(credentialsStatus.getPath)
+ - System.currentTimeMillis()
+ if (remainingTime <= 0) TimeUnit.MINUTES.toMillis(1) else remainingTime
+ } else {
+ // If current credential file is older than expected, sleep 1 hour and check again.
+ TimeUnit.HOURS.toMillis(1)
+ }
+ }.getOrElse {
+ // Wait for 1 minute to check again if there's no credential file currently
+ TimeUnit.MINUTES.toMillis(1)
+ }
+ } catch {
+ // Since the file may get deleted while we are reading it, catch the Exception and come
+ // back in an hour to try again
+ case NonFatal(e) =>
+ logWarning("Error while trying to update credentials, will try again in 1 hour", e)
+ TimeUnit.HOURS.toMillis(1)
+ }
+
+ credentialUpdater.schedule(
+ credentialUpdaterRunnable, timeToNextUpdate, TimeUnit.MILLISECONDS)
+ }
+
+ private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = {
+ val stream = remoteFs.open(tokenPath)
+ try {
+ val newCredentials = new Credentials()
+ newCredentials.readTokenStorageStream(stream)
+ newCredentials
+ } finally {
+ stream.close()
+ }
+ }
+
+ private def getTimeOfNextUpdateFromFileName(credentialsPath: Path): Long = {
+ val name = credentialsPath.getName
+ val index = name.lastIndexOf(SparkHadoopUtil.SPARK_YARN_CREDS_COUNTER_DELIM)
+ val slice = name.substring(0, index)
+ val last2index = slice.lastIndexOf(SparkHadoopUtil.SPARK_YARN_CREDS_COUNTER_DELIM)
+ name.substring(last2index + 1, index).toLong
+ }
+
+ def stop(): Unit = {
+ credentialUpdater.shutdown()
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HBaseCredentialProvider.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HBaseCredentialProvider.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HBaseCredentialProvider.scala
new file mode 100644
index 0000000..5571df0
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HBaseCredentialProvider.scala
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn.security
+
+import scala.reflect.runtime.universe
+import scala.util.control.NonFatal
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.security.Credentials
+import org.apache.hadoop.security.token.{Token, TokenIdentifier}
+
+import org.apache.spark.SparkConf
+import org.apache.spark.internal.Logging
+
+private[security] class HBaseCredentialProvider extends ServiceCredentialProvider with Logging {
+
+ override def serviceName: String = "hbase"
+
+ override def obtainCredentials(
+ hadoopConf: Configuration,
+ sparkConf: SparkConf,
+ creds: Credentials): Option[Long] = {
+ try {
+ val mirror = universe.runtimeMirror(getClass.getClassLoader)
+ val obtainToken = mirror.classLoader.
+ loadClass("org.apache.hadoop.hbase.security.token.TokenUtil").
+ getMethod("obtainToken", classOf[Configuration])
+
+ logDebug("Attempting to fetch HBase security token.")
+ val token = obtainToken.invoke(null, hbaseConf(hadoopConf))
+ .asInstanceOf[Token[_ <: TokenIdentifier]]
+ logInfo(s"Get token from HBase: ${token.toString}")
+ creds.addToken(token.getService, token)
+ } catch {
+ case NonFatal(e) =>
+ logDebug(s"Failed to get token from service $serviceName", e)
+ }
+
+ None
+ }
+
+ override def credentialsRequired(hadoopConf: Configuration): Boolean = {
+ hbaseConf(hadoopConf).get("hbase.security.authentication") == "kerberos"
+ }
+
+ private def hbaseConf(conf: Configuration): Configuration = {
+ try {
+ val mirror = universe.runtimeMirror(getClass.getClassLoader)
+ val confCreate = mirror.classLoader.
+ loadClass("org.apache.hadoop.hbase.HBaseConfiguration").
+ getMethod("create", classOf[Configuration])
+ confCreate.invoke(null, conf).asInstanceOf[Configuration]
+ } catch {
+ case NonFatal(e) =>
+ logDebug("Fail to invoke HBaseConfiguration", e)
+ conf
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala
new file mode 100644
index 0000000..8d06d73
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProvider.scala
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn.security
+
+import java.io.{ByteArrayInputStream, DataInputStream}
+
+import scala.collection.JavaConverters._
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier
+import org.apache.hadoop.mapred.Master
+import org.apache.hadoop.security.Credentials
+
+import org.apache.spark.{SparkConf, SparkException}
+import org.apache.spark.deploy.yarn.config._
+import org.apache.spark.internal.Logging
+import org.apache.spark.internal.config._
+
+private[security] class HDFSCredentialProvider extends ServiceCredentialProvider with Logging {
+ // Token renewal interval, this value will be set in the first call,
+ // if None means no token renewer specified, so cannot get token renewal interval.
+ private var tokenRenewalInterval: Option[Long] = null
+
+ override val serviceName: String = "hdfs"
+
+ override def obtainCredentials(
+ hadoopConf: Configuration,
+ sparkConf: SparkConf,
+ creds: Credentials): Option[Long] = {
+ // NameNode to access, used to get tokens from different FileSystems
+ nnsToAccess(hadoopConf, sparkConf).foreach { dst =>
+ val dstFs = dst.getFileSystem(hadoopConf)
+ logInfo("getting token for namenode: " + dst)
+ dstFs.addDelegationTokens(getTokenRenewer(hadoopConf), creds)
+ }
+
+ // Get the token renewal interval if it is not set. It will only be called once.
+ if (tokenRenewalInterval == null) {
+ tokenRenewalInterval = getTokenRenewalInterval(hadoopConf, sparkConf)
+ }
+
+ // Get the time of next renewal.
+ tokenRenewalInterval.map { interval =>
+ creds.getAllTokens.asScala
+ .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND)
+ .map { t =>
+ val identifier = new DelegationTokenIdentifier()
+ identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier)))
+ identifier.getIssueDate + interval
+ }.foldLeft(0L)(math.max)
+ }
+ }
+
+ private def getTokenRenewalInterval(
+ hadoopConf: Configuration, sparkConf: SparkConf): Option[Long] = {
+ // We cannot use the tokens generated with renewer yarn. Trying to renew
+ // those will fail with an access control issue. So create new tokens with the logged in
+ // user as renewer.
+ sparkConf.get(PRINCIPAL).map { renewer =>
+ val creds = new Credentials()
+ nnsToAccess(hadoopConf, sparkConf).foreach { dst =>
+ val dstFs = dst.getFileSystem(hadoopConf)
+ dstFs.addDelegationTokens(renewer, creds)
+ }
+ val t = creds.getAllTokens.asScala
+ .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND)
+ .head
+ val newExpiration = t.renew(hadoopConf)
+ val identifier = new DelegationTokenIdentifier()
+ identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier)))
+ val interval = newExpiration - identifier.getIssueDate
+ logInfo(s"Renewal Interval is $interval")
+ interval
+ }
+ }
+
+ private def getTokenRenewer(conf: Configuration): String = {
+ val delegTokenRenewer = Master.getMasterPrincipal(conf)
+ logDebug("delegation token renewer is: " + delegTokenRenewer)
+ if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) {
+ val errorMessage = "Can't get Master Kerberos principal for use as renewer"
+ logError(errorMessage)
+ throw new SparkException(errorMessage)
+ }
+
+ delegTokenRenewer
+ }
+
+ private def nnsToAccess(hadoopConf: Configuration, sparkConf: SparkConf): Set[Path] = {
+ sparkConf.get(NAMENODES_TO_ACCESS).map(new Path(_)).toSet +
+ sparkConf.get(STAGING_DIR).map(new Path(_))
+ .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory)
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HiveCredentialProvider.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HiveCredentialProvider.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HiveCredentialProvider.scala
new file mode 100644
index 0000000..16d8fc3
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/HiveCredentialProvider.scala
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn.security
+
+import java.lang.reflect.UndeclaredThrowableException
+import java.security.PrivilegedExceptionAction
+
+import scala.reflect.runtime.universe
+import scala.util.control.NonFatal
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier
+import org.apache.hadoop.io.Text
+import org.apache.hadoop.security.{Credentials, UserGroupInformation}
+import org.apache.hadoop.security.token.Token
+
+import org.apache.spark.SparkConf
+import org.apache.spark.internal.Logging
+import org.apache.spark.util.Utils
+
+private[security] class HiveCredentialProvider extends ServiceCredentialProvider with Logging {
+
+ override def serviceName: String = "hive"
+
+ private def hiveConf(hadoopConf: Configuration): Configuration = {
+ try {
+ val mirror = universe.runtimeMirror(Utils.getContextOrSparkClassLoader)
+ // the hive configuration class is a subclass of Hadoop Configuration, so can be cast down
+ // to a Configuration and used without reflection
+ val hiveConfClass = mirror.classLoader.loadClass("org.apache.hadoop.hive.conf.HiveConf")
+ // using the (Configuration, Class) constructor allows the current configuration to be
+ // included in the hive config.
+ val ctor = hiveConfClass.getDeclaredConstructor(classOf[Configuration],
+ classOf[Object].getClass)
+ ctor.newInstance(hadoopConf, hiveConfClass).asInstanceOf[Configuration]
+ } catch {
+ case NonFatal(e) =>
+ logDebug("Fail to create Hive Configuration", e)
+ hadoopConf
+ }
+ }
+
+ override def credentialsRequired(hadoopConf: Configuration): Boolean = {
+ UserGroupInformation.isSecurityEnabled &&
+ hiveConf(hadoopConf).getTrimmed("hive.metastore.uris", "").nonEmpty
+ }
+
+ override def obtainCredentials(
+ hadoopConf: Configuration,
+ sparkConf: SparkConf,
+ creds: Credentials): Option[Long] = {
+ val conf = hiveConf(hadoopConf)
+
+ val principalKey = "hive.metastore.kerberos.principal"
+ val principal = conf.getTrimmed(principalKey, "")
+ require(principal.nonEmpty, s"Hive principal $principalKey undefined")
+ val metastoreUri = conf.getTrimmed("hive.metastore.uris", "")
+ require(metastoreUri.nonEmpty, "Hive metastore uri undefined")
+
+ val currentUser = UserGroupInformation.getCurrentUser()
+ logDebug(s"Getting Hive delegation token for ${currentUser.getUserName()} against " +
+ s"$principal at $metastoreUri")
+
+ val mirror = universe.runtimeMirror(Utils.getContextOrSparkClassLoader)
+ val hiveClass = mirror.classLoader.loadClass("org.apache.hadoop.hive.ql.metadata.Hive")
+ val hiveConfClass = mirror.classLoader.loadClass("org.apache.hadoop.hive.conf.HiveConf")
+ val closeCurrent = hiveClass.getMethod("closeCurrent")
+
+ try {
+ // get all the instance methods before invoking any
+ val getDelegationToken = hiveClass.getMethod("getDelegationToken",
+ classOf[String], classOf[String])
+ val getHive = hiveClass.getMethod("get", hiveConfClass)
+
+ doAsRealUser {
+ val hive = getHive.invoke(null, conf)
+ val tokenStr = getDelegationToken.invoke(hive, currentUser.getUserName(), principal)
+ .asInstanceOf[String]
+ val hive2Token = new Token[DelegationTokenIdentifier]()
+ hive2Token.decodeFromUrlString(tokenStr)
+ logInfo(s"Get Token from hive metastore: ${hive2Token.toString}")
+ creds.addToken(new Text("hive.server2.delegation.token"), hive2Token)
+ }
+ } catch {
+ case NonFatal(e) =>
+ logDebug(s"Fail to get token from service $serviceName", e)
+ } finally {
+ Utils.tryLogNonFatalError {
+ closeCurrent.invoke(null)
+ }
+ }
+
+ None
+ }
+
+ /**
+ * Run some code as the real logged in user (which may differ from the current user, for
+ * example, when using proxying).
+ */
+ private def doAsRealUser[T](fn: => T): T = {
+ val currentUser = UserGroupInformation.getCurrentUser()
+ val realUser = Option(currentUser.getRealUser()).getOrElse(currentUser)
+
+ // For some reason the Scala-generated anonymous class ends up causing an
+ // UndeclaredThrowableException, even if you annotate the method with @throws.
+ try {
+ realUser.doAs(new PrivilegedExceptionAction[T]() {
+ override def run(): T = fn
+ })
+ } catch {
+ case e: UndeclaredThrowableException => throw Option(e.getCause()).getOrElse(e)
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/ServiceCredentialProvider.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/ServiceCredentialProvider.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/ServiceCredentialProvider.scala
new file mode 100644
index 0000000..4e3fcce
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/ServiceCredentialProvider.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn.security
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.security.{Credentials, UserGroupInformation}
+
+import org.apache.spark.SparkConf
+
+/**
+ * A credential provider for a service. User must implement this if they need to access a
+ * secure service from Spark.
+ */
+trait ServiceCredentialProvider {
+
+ /**
+ * Name of the service to provide credentials. This name should unique, Spark internally will
+ * use this name to differentiate credential provider.
+ */
+ def serviceName: String
+
+ /**
+ * To decide whether credential is required for this service. By default it based on whether
+ * Hadoop security is enabled.
+ */
+ def credentialsRequired(hadoopConf: Configuration): Boolean = {
+ UserGroupInformation.isSecurityEnabled
+ }
+
+ /**
+ * Obtain credentials for this service and get the time of the next renewal.
+ * @param hadoopConf Configuration of current Hadoop Compatible system.
+ * @param sparkConf Spark configuration.
+ * @param creds Credentials to add tokens and security keys to.
+ * @return If this Credential is renewable and can be renewed, return the time of the next
+ * renewal, otherwise None should be returned.
+ */
+ def obtainCredentials(
+ hadoopConf: Configuration,
+ sparkConf: SparkConf,
+ creds: Credentials): Option[Long]
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/launcher/YarnCommandBuilderUtils.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/launcher/YarnCommandBuilderUtils.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/launcher/YarnCommandBuilderUtils.scala
new file mode 100644
index 0000000..6c3556a
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/launcher/YarnCommandBuilderUtils.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.launcher
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable.ListBuffer
+import scala.util.Properties
+
+/**
+ * Exposes methods from the launcher library that are used by the YARN backend.
+ */
+private[spark] object YarnCommandBuilderUtils {
+
+ def quoteForBatchScript(arg: String): String = {
+ CommandBuilderUtils.quoteForBatchScript(arg)
+ }
+
+ def findJarsDir(sparkHome: String): String = {
+ val scalaVer = Properties.versionNumberString
+ .split("\\.")
+ .take(2)
+ .mkString(".")
+ CommandBuilderUtils.findJarsDir(sparkHome, scalaVer, true)
+ }
+
+ /**
+ * Adds the perm gen configuration to the list of java options if needed and not yet added.
+ *
+ * Note that this method adds the option based on the local JVM version; if the node where
+ * the container is running has a different Java version, there's a risk that the option will
+ * not be added (e.g. if the AM is running Java 8 but the container's node is set up to use
+ * Java 7).
+ */
+ def addPermGenSizeOpt(args: ListBuffer[String]): Unit = {
+ CommandBuilderUtils.addPermGenSizeOpt(args.asJava)
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/SchedulerExtensionService.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/SchedulerExtensionService.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/SchedulerExtensionService.scala
new file mode 100644
index 0000000..4ed2852
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/SchedulerExtensionService.scala
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster
+
+import java.util.concurrent.atomic.AtomicBoolean
+
+import org.apache.hadoop.yarn.api.records.{ApplicationAttemptId, ApplicationId}
+
+import org.apache.spark.SparkContext
+import org.apache.spark.deploy.yarn.config._
+import org.apache.spark.internal.Logging
+import org.apache.spark.util.Utils
+
+/**
+ * An extension service that can be loaded into a Spark YARN scheduler.
+ * A Service that can be started and stopped.
+ *
+ * 1. For implementations to be loadable by `SchedulerExtensionServices`,
+ * they must provide an empty constructor.
+ * 2. The `stop()` operation MUST be idempotent, and succeed even if `start()` was
+ * never invoked.
+ */
+trait SchedulerExtensionService {
+
+ /**
+ * Start the extension service. This should be a no-op if
+ * called more than once.
+ * @param binding binding to the spark application and YARN
+ */
+ def start(binding: SchedulerExtensionServiceBinding): Unit
+
+ /**
+ * Stop the service
+ * The `stop()` operation MUST be idempotent, and succeed even if `start()` was
+ * never invoked.
+ */
+ def stop(): Unit
+}
+
+/**
+ * Binding information for a [[SchedulerExtensionService]].
+ *
+ * The attempt ID will be set if the service is started within a YARN application master;
+ * there is then a different attempt ID for every time that AM is restarted.
+ * When the service binding is instantiated in client mode, there's no attempt ID, as it lacks
+ * this information.
+ * @param sparkContext current spark context
+ * @param applicationId YARN application ID
+ * @param attemptId YARN attemptID. This will always be unset in client mode, and always set in
+ * cluster mode.
+ */
+case class SchedulerExtensionServiceBinding(
+ sparkContext: SparkContext,
+ applicationId: ApplicationId,
+ attemptId: Option[ApplicationAttemptId] = None)
+
+/**
+ * Container for [[SchedulerExtensionService]] instances.
+ *
+ * Loads Extension Services from the configuration property
+ * `"spark.yarn.services"`, instantiates and starts them.
+ * When stopped, it stops all child entries.
+ *
+ * The order in which child extension services are started and stopped
+ * is undefined.
+ */
+private[spark] class SchedulerExtensionServices extends SchedulerExtensionService
+ with Logging {
+ private var serviceOption: Option[String] = None
+ private var services: List[SchedulerExtensionService] = Nil
+ private val started = new AtomicBoolean(false)
+ private var binding: SchedulerExtensionServiceBinding = _
+
+ /**
+ * Binding operation will load the named services and call bind on them too; the
+ * entire set of services are then ready for `init()` and `start()` calls.
+ *
+ * @param binding binding to the spark application and YARN
+ */
+ def start(binding: SchedulerExtensionServiceBinding): Unit = {
+ if (started.getAndSet(true)) {
+ logWarning("Ignoring re-entrant start operation")
+ return
+ }
+ require(binding.sparkContext != null, "Null context parameter")
+ require(binding.applicationId != null, "Null appId parameter")
+ this.binding = binding
+ val sparkContext = binding.sparkContext
+ val appId = binding.applicationId
+ val attemptId = binding.attemptId
+ logInfo(s"Starting Yarn extension services with app $appId and attemptId $attemptId")
+
+ services = sparkContext.conf.get(SCHEDULER_SERVICES).map { sClass =>
+ val instance = Utils.classForName(sClass)
+ .newInstance()
+ .asInstanceOf[SchedulerExtensionService]
+ // bind this service
+ instance.start(binding)
+ logInfo(s"Service $sClass started")
+ instance
+ }.toList
+ }
+
+ /**
+ * Get the list of services.
+ *
+ * @return a list of services; Nil until the service is started
+ */
+ def getServices: List[SchedulerExtensionService] = services
+
+ /**
+ * Stop the services; idempotent.
+ *
+ */
+ override def stop(): Unit = {
+ if (started.getAndSet(false)) {
+ logInfo(s"Stopping $this")
+ services.foreach { s =>
+ Utils.tryLogNonFatalError(s.stop())
+ }
+ }
+ }
+
+ override def toString(): String = s"""SchedulerExtensionServices
+ |(serviceOption=$serviceOption,
+ | services=$services,
+ | started=$started)""".stripMargin
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala
new file mode 100644
index 0000000..60da356
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster
+
+import scala.collection.mutable.ArrayBuffer
+
+import org.apache.hadoop.yarn.api.records.YarnApplicationState
+
+import org.apache.spark.{SparkContext, SparkException}
+import org.apache.spark.deploy.yarn.{Client, ClientArguments, YarnSparkHadoopUtil}
+import org.apache.spark.internal.Logging
+import org.apache.spark.launcher.SparkAppHandle
+import org.apache.spark.scheduler.TaskSchedulerImpl
+
+private[spark] class YarnClientSchedulerBackend(
+ scheduler: TaskSchedulerImpl,
+ sc: SparkContext)
+ extends YarnSchedulerBackend(scheduler, sc)
+ with Logging {
+
+ private var client: Client = null
+ private var monitorThread: MonitorThread = null
+
+ /**
+ * Create a Yarn client to submit an application to the ResourceManager.
+ * This waits until the application is running.
+ */
+ override def start() {
+ val driverHost = conf.get("spark.driver.host")
+ val driverPort = conf.get("spark.driver.port")
+ val hostport = driverHost + ":" + driverPort
+ sc.ui.foreach { ui => conf.set("spark.driver.appUIAddress", ui.webUrl) }
+
+ val argsArrayBuf = new ArrayBuffer[String]()
+ argsArrayBuf += ("--arg", hostport)
+
+ logDebug("ClientArguments called with: " + argsArrayBuf.mkString(" "))
+ val args = new ClientArguments(argsArrayBuf.toArray)
+ totalExpectedExecutors = YarnSparkHadoopUtil.getInitialTargetExecutorNumber(conf)
+ client = new Client(args, conf)
+ bindToYarn(client.submitApplication(), None)
+
+ // SPARK-8687: Ensure all necessary properties have already been set before
+ // we initialize our driver scheduler backend, which serves these properties
+ // to the executors
+ super.start()
+ waitForApplication()
+
+ // SPARK-8851: In yarn-client mode, the AM still does the credentials refresh. The driver
+ // reads the credentials from HDFS, just like the executors and updates its own credentials
+ // cache.
+ if (conf.contains("spark.yarn.credentials.file")) {
+ YarnSparkHadoopUtil.get.startCredentialUpdater(conf)
+ }
+ monitorThread = asyncMonitorApplication()
+ monitorThread.start()
+ }
+
+ /**
+ * Report the state of the application until it is running.
+ * If the application has finished, failed or been killed in the process, throw an exception.
+ * This assumes both `client` and `appId` have already been set.
+ */
+ private def waitForApplication(): Unit = {
+ assert(client != null && appId.isDefined, "Application has not been submitted yet!")
+ val (state, _) = client.monitorApplication(appId.get, returnOnRunning = true) // blocking
+ if (state == YarnApplicationState.FINISHED ||
+ state == YarnApplicationState.FAILED ||
+ state == YarnApplicationState.KILLED) {
+ throw new SparkException("Yarn application has already ended! " +
+ "It might have been killed or unable to launch application master.")
+ }
+ if (state == YarnApplicationState.RUNNING) {
+ logInfo(s"Application ${appId.get} has started running.")
+ }
+ }
+
+ /**
+ * We create this class for SPARK-9519. Basically when we interrupt the monitor thread it's
+ * because the SparkContext is being shut down(sc.stop() called by user code), but if
+ * monitorApplication return, it means the Yarn application finished before sc.stop() was called,
+ * which means we should call sc.stop() here, and we don't allow the monitor to be interrupted
+ * before SparkContext stops successfully.
+ */
+ private class MonitorThread extends Thread {
+ private var allowInterrupt = true
+
+ override def run() {
+ try {
+ val (state, _) = client.monitorApplication(appId.get, logApplicationReport = false)
+ logError(s"Yarn application has already exited with state $state!")
+ allowInterrupt = false
+ sc.stop()
+ } catch {
+ case e: InterruptedException => logInfo("Interrupting monitor thread")
+ }
+ }
+
+ def stopMonitor(): Unit = {
+ if (allowInterrupt) {
+ this.interrupt()
+ }
+ }
+ }
+
+ /**
+ * Monitor the application state in a separate thread.
+ * If the application has exited for any reason, stop the SparkContext.
+ * This assumes both `client` and `appId` have already been set.
+ */
+ private def asyncMonitorApplication(): MonitorThread = {
+ assert(client != null && appId.isDefined, "Application has not been submitted yet!")
+ val t = new MonitorThread
+ t.setName("Yarn application state monitor")
+ t.setDaemon(true)
+ t
+ }
+
+ /**
+ * Stop the scheduler. This assumes `start()` has already been called.
+ */
+ override def stop() {
+ assert(client != null, "Attempted to stop this scheduler before starting it!")
+ if (monitorThread != null) {
+ monitorThread.stopMonitor()
+ }
+
+ // Report a final state to the launcher if one is connected. This is needed since in client
+ // mode this backend doesn't let the app monitor loop run to completion, so it does not report
+ // the final state itself.
+ //
+ // Note: there's not enough information at this point to provide a better final state,
+ // so assume the application was successful.
+ client.reportLauncherState(SparkAppHandle.State.FINISHED)
+
+ super.stop()
+ YarnSparkHadoopUtil.get.stopCredentialUpdater()
+ client.stop()
+ logInfo("Stopped")
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterManager.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterManager.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterManager.scala
new file mode 100644
index 0000000..64cd1bd
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterManager.scala
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster
+
+import org.apache.spark.{SparkContext, SparkException}
+import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl}
+
+/**
+ * Cluster Manager for creation of Yarn scheduler and backend
+ */
+private[spark] class YarnClusterManager extends ExternalClusterManager {
+
+ override def canCreate(masterURL: String): Boolean = {
+ masterURL == "yarn"
+ }
+
+ override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = {
+ sc.deployMode match {
+ case "cluster" => new YarnClusterScheduler(sc)
+ case "client" => new YarnScheduler(sc)
+ case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn")
+ }
+ }
+
+ override def createSchedulerBackend(sc: SparkContext,
+ masterURL: String,
+ scheduler: TaskScheduler): SchedulerBackend = {
+ sc.deployMode match {
+ case "cluster" =>
+ new YarnClusterSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc)
+ case "client" =>
+ new YarnClientSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc)
+ case _ =>
+ throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn")
+ }
+ }
+
+ override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
+ scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend)
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterScheduler.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterScheduler.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterScheduler.scala
new file mode 100644
index 0000000..96c9151
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterScheduler.scala
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster
+
+import org.apache.spark._
+import org.apache.spark.deploy.yarn.ApplicationMaster
+
+/**
+ * This is a simple extension to ClusterScheduler - to ensure that appropriate initialization of
+ * ApplicationMaster, etc is done
+ */
+private[spark] class YarnClusterScheduler(sc: SparkContext) extends YarnScheduler(sc) {
+
+ logInfo("Created YarnClusterScheduler")
+
+ override def postStartHook() {
+ ApplicationMaster.sparkContextInitialized(sc)
+ super.postStartHook()
+ logInfo("YarnClusterScheduler.postStartHook done")
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala
new file mode 100644
index 0000000..4f3d5eb
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster
+
+import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
+import org.apache.hadoop.yarn.conf.YarnConfiguration
+
+import org.apache.spark.SparkContext
+import org.apache.spark.deploy.yarn.{ApplicationMaster, YarnSparkHadoopUtil}
+import org.apache.spark.scheduler.TaskSchedulerImpl
+import org.apache.spark.util.Utils
+
+private[spark] class YarnClusterSchedulerBackend(
+ scheduler: TaskSchedulerImpl,
+ sc: SparkContext)
+ extends YarnSchedulerBackend(scheduler, sc) {
+
+ override def start() {
+ val attemptId = ApplicationMaster.getAttemptId
+ bindToYarn(attemptId.getApplicationId(), Some(attemptId))
+ super.start()
+ totalExpectedExecutors = YarnSparkHadoopUtil.getInitialTargetExecutorNumber(sc.conf)
+ }
+
+ override def getDriverLogUrls: Option[Map[String, String]] = {
+ var driverLogs: Option[Map[String, String]] = None
+ try {
+ val yarnConf = new YarnConfiguration(sc.hadoopConfiguration)
+ val containerId = YarnSparkHadoopUtil.get.getContainerId
+
+ val httpAddress = System.getenv(Environment.NM_HOST.name()) +
+ ":" + System.getenv(Environment.NM_HTTP_PORT.name())
+ // lookup appropriate http scheme for container log urls
+ val yarnHttpPolicy = yarnConf.get(
+ YarnConfiguration.YARN_HTTP_POLICY_KEY,
+ YarnConfiguration.YARN_HTTP_POLICY_DEFAULT
+ )
+ val user = Utils.getCurrentUserName()
+ val httpScheme = if (yarnHttpPolicy == "HTTPS_ONLY") "https://" else "http://"
+ val baseUrl = s"$httpScheme$httpAddress/node/containerlogs/$containerId/$user"
+ logDebug(s"Base URL for logs: $baseUrl")
+ driverLogs = Some(Map(
+ "stdout" -> s"$baseUrl/stdout?start=-4096",
+ "stderr" -> s"$baseUrl/stderr?start=-4096"))
+ } catch {
+ case e: Exception =>
+ logInfo("Error while building AM log links, so AM" +
+ " logs link will not appear in application UI", e)
+ }
+ driverLogs
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnScheduler.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnScheduler.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnScheduler.scala
new file mode 100644
index 0000000..0293821
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnScheduler.scala
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster
+
+import org.apache.hadoop.yarn.util.RackResolver
+import org.apache.log4j.{Level, Logger}
+
+import org.apache.spark._
+import org.apache.spark.scheduler.TaskSchedulerImpl
+import org.apache.spark.util.Utils
+
+private[spark] class YarnScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {
+
+ // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
+ if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
+ Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
+ }
+
+ // By default, rack is unknown
+ override def getRackForHost(hostPort: String): Option[String] = {
+ val host = Utils.parseHostPort(hostPort)._1
+ Option(RackResolver.resolve(sc.hadoopConfiguration, host).getNetworkLocation)
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
new file mode 100644
index 0000000..2f9ea19
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala
@@ -0,0 +1,315 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster
+
+import scala.concurrent.{ExecutionContext, Future}
+import scala.util.{Failure, Success}
+import scala.util.control.NonFatal
+
+import org.apache.hadoop.yarn.api.records.{ApplicationAttemptId, ApplicationId}
+
+import org.apache.spark.SparkContext
+import org.apache.spark.internal.Logging
+import org.apache.spark.rpc._
+import org.apache.spark.scheduler._
+import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._
+import org.apache.spark.ui.JettyUtils
+import org.apache.spark.util.{RpcUtils, ThreadUtils}
+
+/**
+ * Abstract Yarn scheduler backend that contains common logic
+ * between the client and cluster Yarn scheduler backends.
+ */
+private[spark] abstract class YarnSchedulerBackend(
+ scheduler: TaskSchedulerImpl,
+ sc: SparkContext)
+ extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) {
+
+ override val minRegisteredRatio =
+ if (conf.getOption("spark.scheduler.minRegisteredResourcesRatio").isEmpty) {
+ 0.8
+ } else {
+ super.minRegisteredRatio
+ }
+
+ protected var totalExpectedExecutors = 0
+
+ private val yarnSchedulerEndpoint = new YarnSchedulerEndpoint(rpcEnv)
+
+ private val yarnSchedulerEndpointRef = rpcEnv.setupEndpoint(
+ YarnSchedulerBackend.ENDPOINT_NAME, yarnSchedulerEndpoint)
+
+ private implicit val askTimeout = RpcUtils.askRpcTimeout(sc.conf)
+
+ /** Application ID. */
+ protected var appId: Option[ApplicationId] = None
+
+ /** Attempt ID. This is unset for client-mode schedulers */
+ private var attemptId: Option[ApplicationAttemptId] = None
+
+ /** Scheduler extension services. */
+ private val services: SchedulerExtensionServices = new SchedulerExtensionServices()
+
+ // Flag to specify whether this schedulerBackend should be reset.
+ private var shouldResetOnAmRegister = false
+
+ /**
+ * Bind to YARN. This *must* be done before calling [[start()]].
+ *
+ * @param appId YARN application ID
+ * @param attemptId Optional YARN attempt ID
+ */
+ protected def bindToYarn(appId: ApplicationId, attemptId: Option[ApplicationAttemptId]): Unit = {
+ this.appId = Some(appId)
+ this.attemptId = attemptId
+ }
+
+ override def start() {
+ require(appId.isDefined, "application ID unset")
+ val binding = SchedulerExtensionServiceBinding(sc, appId.get, attemptId)
+ services.start(binding)
+ super.start()
+ }
+
+ override def stop(): Unit = {
+ try {
+ // SPARK-12009: To prevent Yarn allocator from requesting backup for the executors which
+ // was Stopped by SchedulerBackend.
+ requestTotalExecutors(0, 0, Map.empty)
+ super.stop()
+ } finally {
+ services.stop()
+ }
+ }
+
+ /**
+ * Get the attempt ID for this run, if the cluster manager supports multiple
+ * attempts. Applications run in client mode will not have attempt IDs.
+ * This attempt ID only includes attempt counter, like "1", "2".
+ *
+ * @return The application attempt id, if available.
+ */
+ override def applicationAttemptId(): Option[String] = {
+ attemptId.map(_.getAttemptId.toString)
+ }
+
+ /**
+ * Get an application ID associated with the job.
+ * This returns the string value of [[appId]] if set, otherwise
+ * the locally-generated ID from the superclass.
+ * @return The application ID
+ */
+ override def applicationId(): String = {
+ appId.map(_.toString).getOrElse {
+ logWarning("Application ID is not initialized yet.")
+ super.applicationId
+ }
+ }
+
+ /**
+ * Request executors from the ApplicationMaster by specifying the total number desired.
+ * This includes executors already pending or running.
+ */
+ override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = {
+ yarnSchedulerEndpointRef.ask[Boolean](
+ RequestExecutors(requestedTotal, localityAwareTasks, hostToLocalTaskCount))
+ }
+
+ /**
+ * Request that the ApplicationMaster kill the specified executors.
+ */
+ override def doKillExecutors(executorIds: Seq[String]): Future[Boolean] = {
+ yarnSchedulerEndpointRef.ask[Boolean](KillExecutors(executorIds))
+ }
+
+ override def sufficientResourcesRegistered(): Boolean = {
+ totalRegisteredExecutors.get() >= totalExpectedExecutors * minRegisteredRatio
+ }
+
+ /**
+ * Add filters to the SparkUI.
+ */
+ private def addWebUIFilter(
+ filterName: String,
+ filterParams: Map[String, String],
+ proxyBase: String): Unit = {
+ if (proxyBase != null && proxyBase.nonEmpty) {
+ System.setProperty("spark.ui.proxyBase", proxyBase)
+ }
+
+ val hasFilter =
+ filterName != null && filterName.nonEmpty &&
+ filterParams != null && filterParams.nonEmpty
+ if (hasFilter) {
+ logInfo(s"Add WebUI Filter. $filterName, $filterParams, $proxyBase")
+ conf.set("spark.ui.filters", filterName)
+ filterParams.foreach { case (k, v) => conf.set(s"spark.$filterName.param.$k", v) }
+ scheduler.sc.ui.foreach { ui => JettyUtils.addFilters(ui.getHandlers, conf) }
+ }
+ }
+
+ override def createDriverEndpoint(properties: Seq[(String, String)]): DriverEndpoint = {
+ new YarnDriverEndpoint(rpcEnv, properties)
+ }
+
+ /**
+ * Reset the state of SchedulerBackend to the initial state. This is happened when AM is failed
+ * and re-registered itself to driver after a failure. The stale state in driver should be
+ * cleaned.
+ */
+ override protected def reset(): Unit = {
+ super.reset()
+ sc.executorAllocationManager.foreach(_.reset())
+ }
+
+ /**
+ * Override the DriverEndpoint to add extra logic for the case when an executor is disconnected.
+ * This endpoint communicates with the executors and queries the AM for an executor's exit
+ * status when the executor is disconnected.
+ */
+ private class YarnDriverEndpoint(rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)])
+ extends DriverEndpoint(rpcEnv, sparkProperties) {
+
+ /**
+ * When onDisconnected is received at the driver endpoint, the superclass DriverEndpoint
+ * handles it by assuming the Executor was lost for a bad reason and removes the executor
+ * immediately.
+ *
+ * In YARN's case however it is crucial to talk to the application master and ask why the
+ * executor had exited. If the executor exited for some reason unrelated to the running tasks
+ * (e.g., preemption), according to the application master, then we pass that information down
+ * to the TaskSetManager to inform the TaskSetManager that tasks on that lost executor should
+ * not count towards a job failure.
+ */
+ override def onDisconnected(rpcAddress: RpcAddress): Unit = {
+ addressToExecutorId.get(rpcAddress).foreach { executorId =>
+ if (disableExecutor(executorId)) {
+ yarnSchedulerEndpoint.handleExecutorDisconnectedFromDriver(executorId, rpcAddress)
+ }
+ }
+ }
+ }
+
+ /**
+ * An [[RpcEndpoint]] that communicates with the ApplicationMaster.
+ */
+ private class YarnSchedulerEndpoint(override val rpcEnv: RpcEnv)
+ extends ThreadSafeRpcEndpoint with Logging {
+ private var amEndpoint: Option[RpcEndpointRef] = None
+
+ private[YarnSchedulerBackend] def handleExecutorDisconnectedFromDriver(
+ executorId: String,
+ executorRpcAddress: RpcAddress): Unit = {
+ val removeExecutorMessage = amEndpoint match {
+ case Some(am) =>
+ val lossReasonRequest = GetExecutorLossReason(executorId)
+ am.ask[ExecutorLossReason](lossReasonRequest, askTimeout)
+ .map { reason => RemoveExecutor(executorId, reason) }(ThreadUtils.sameThread)
+ .recover {
+ case NonFatal(e) =>
+ logWarning(s"Attempted to get executor loss reason" +
+ s" for executor id ${executorId} at RPC address ${executorRpcAddress}," +
+ s" but got no response. Marking as slave lost.", e)
+ RemoveExecutor(executorId, SlaveLost())
+ }(ThreadUtils.sameThread)
+ case None =>
+ logWarning("Attempted to check for an executor loss reason" +
+ " before the AM has registered!")
+ Future.successful(RemoveExecutor(executorId, SlaveLost("AM is not yet registered.")))
+ }
+
+ removeExecutorMessage
+ .flatMap { message =>
+ driverEndpoint.ask[Boolean](message)
+ }(ThreadUtils.sameThread)
+ .onFailure {
+ case NonFatal(e) => logError(
+ s"Error requesting driver to remove executor $executorId after disconnection.", e)
+ }(ThreadUtils.sameThread)
+ }
+
+ override def receive: PartialFunction[Any, Unit] = {
+ case RegisterClusterManager(am) =>
+ logInfo(s"ApplicationMaster registered as $am")
+ amEndpoint = Option(am)
+ if (!shouldResetOnAmRegister) {
+ shouldResetOnAmRegister = true
+ } else {
+ // AM is already registered before, this potentially means that AM failed and
+ // a new one registered after the failure. This will only happen in yarn-client mode.
+ reset()
+ }
+
+ case AddWebUIFilter(filterName, filterParams, proxyBase) =>
+ addWebUIFilter(filterName, filterParams, proxyBase)
+
+ case r @ RemoveExecutor(executorId, reason) =>
+ logWarning(reason.toString)
+ driverEndpoint.ask[Boolean](r).onFailure {
+ case e =>
+ logError("Error requesting driver to remove executor" +
+ s" $executorId for reason $reason", e)
+ }(ThreadUtils.sameThread)
+ }
+
+
+ override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
+ case r: RequestExecutors =>
+ amEndpoint match {
+ case Some(am) =>
+ am.ask[Boolean](r).andThen {
+ case Success(b) => context.reply(b)
+ case Failure(NonFatal(e)) =>
+ logError(s"Sending $r to AM was unsuccessful", e)
+ context.sendFailure(e)
+ }(ThreadUtils.sameThread)
+ case None =>
+ logWarning("Attempted to request executors before the AM has registered!")
+ context.reply(false)
+ }
+
+ case k: KillExecutors =>
+ amEndpoint match {
+ case Some(am) =>
+ am.ask[Boolean](k).andThen {
+ case Success(b) => context.reply(b)
+ case Failure(NonFatal(e)) =>
+ logError(s"Sending $k to AM was unsuccessful", e)
+ context.sendFailure(e)
+ }(ThreadUtils.sameThread)
+ case None =>
+ logWarning("Attempted to kill executors before the AM has registered!")
+ context.reply(false)
+ }
+
+ case RetrieveLastAllocatedExecutorId =>
+ context.reply(currentExecutorIdCounter)
+ }
+
+ override def onDisconnected(remoteAddress: RpcAddress): Unit = {
+ if (amEndpoint.exists(_.address == remoteAddress)) {
+ logWarning(s"ApplicationMaster has disassociated: $remoteAddress")
+ amEndpoint = None
+ }
+ }
+ }
+}
+
+private[spark] object YarnSchedulerBackend {
+ val ENDPOINT_NAME = "YarnScheduler"
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/resources/META-INF/services/org.apache.spark.deploy.yarn.security.ServiceCredentialProvider
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/resources/META-INF/services/org.apache.spark.deploy.yarn.security.ServiceCredentialProvider b/resource-managers/yarn/src/test/resources/META-INF/services/org.apache.spark.deploy.yarn.security.ServiceCredentialProvider
new file mode 100644
index 0000000..d0ef5ef
--- /dev/null
+++ b/resource-managers/yarn/src/test/resources/META-INF/services/org.apache.spark.deploy.yarn.security.ServiceCredentialProvider
@@ -0,0 +1 @@
+org.apache.spark.deploy.yarn.security.TestCredentialProvider
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/resources/log4j.properties b/resource-managers/yarn/src/test/resources/log4j.properties
new file mode 100644
index 0000000..d13454d
--- /dev/null
+++ b/resource-managers/yarn/src/test/resources/log4j.properties
@@ -0,0 +1,31 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Set everything to be logged to the file target/unit-tests.log
+log4j.rootCategory=DEBUG, file
+log4j.appender.file=org.apache.log4j.FileAppender
+log4j.appender.file.append=true
+log4j.appender.file.file=target/unit-tests.log
+log4j.appender.file.layout=org.apache.log4j.PatternLayout
+log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
+
+# Ignore messages below warning level from a few verbose libraries.
+log4j.logger.com.sun.jersey=WARN
+log4j.logger.org.apache.hadoop=WARN
+log4j.logger.org.eclipse.jetty=WARN
+log4j.logger.org.mortbay=WARN
+log4j.logger.org.spark_project.jetty=WARN
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org
[18/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Posted by va...@apache.org.
[SPARK-18662] Move resource managers to separate directory
## What changes were proposed in this pull request?
* Moves yarn and mesos scheduler backends to resource-managers/ sub-directory (in preparation for https://issues.apache.org/jira/browse/SPARK-18278)
* Corresponding change in top-level pom.xml.
Ref: https://github.com/apache/spark/pull/16061#issuecomment-263649340
## How was this patch tested?
* Manual tests
/cc rxin
Author: Anirudh <ra...@google.com>
Closes #16092 from foxish/fix-scheduler-structure-2.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/81e5619c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/81e5619c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/81e5619c
Branch: refs/heads/master
Commit: 81e5619ca141a1d3a06547d2b682cbe3f135b360
Parents: a8ced76
Author: Anirudh <ra...@google.com>
Authored: Tue Dec 6 16:23:27 2016 -0800
Committer: Marcelo Vanzin <va...@cloudera.com>
Committed: Tue Dec 6 16:23:27 2016 -0800
----------------------------------------------------------------------
dev/sparktestsupport/modules.py | 4 +-
mesos/pom.xml | 109 --
...pache.spark.scheduler.ExternalClusterManager | 1 -
.../deploy/mesos/MesosClusterDispatcher.scala | 119 --
.../mesos/MesosClusterDispatcherArguments.scala | 149 --
.../deploy/mesos/MesosDriverDescription.scala | 70 -
.../mesos/MesosExternalShuffleService.scala | 131 --
.../org/apache/spark/deploy/mesos/config.scala | 59 -
.../spark/deploy/mesos/ui/DriverPage.scala | 179 --
.../deploy/mesos/ui/MesosClusterPage.scala | 136 --
.../spark/deploy/mesos/ui/MesosClusterUI.scala | 49 -
.../deploy/rest/mesos/MesosRestServer.scala | 156 --
.../spark/executor/MesosExecutorBackend.scala | 131 --
.../cluster/mesos/MesosClusterManager.scala | 64 -
.../mesos/MesosClusterPersistenceEngine.scala | 134 --
.../cluster/mesos/MesosClusterScheduler.scala | 740 ---------
.../mesos/MesosClusterSchedulerSource.scala | 40 -
.../MesosCoarseGrainedSchedulerBackend.scala | 668 --------
.../MesosFineGrainedSchedulerBackend.scala | 444 -----
.../mesos/MesosSchedulerBackendUtil.scala | 165 --
.../cluster/mesos/MesosSchedulerUtils.scala | 524 ------
.../cluster/mesos/MesosTaskLaunchData.scala | 51 -
.../MesosClusterDispatcherArgumentsSuite.scala | 63 -
.../mesos/MesosClusterDispatcherSuite.scala | 40 -
.../mesos/MesosClusterManagerSuite.scala | 56 -
.../mesos/MesosClusterSchedulerSuite.scala | 239 ---
...esosCoarseGrainedSchedulerBackendSuite.scala | 601 -------
.../MesosFineGrainedSchedulerBackendSuite.scala | 385 -----
.../mesos/MesosSchedulerUtilsSuite.scala | 256 ---
.../mesos/MesosTaskLaunchDataSuite.scala | 36 -
.../spark/scheduler/cluster/mesos/Utils.scala | 91 --
pom.xml | 4 +-
resource-managers/mesos/pom.xml | 109 ++
...pache.spark.scheduler.ExternalClusterManager | 1 +
.../deploy/mesos/MesosClusterDispatcher.scala | 119 ++
.../mesos/MesosClusterDispatcherArguments.scala | 149 ++
.../deploy/mesos/MesosDriverDescription.scala | 70 +
.../mesos/MesosExternalShuffleService.scala | 131 ++
.../org/apache/spark/deploy/mesos/config.scala | 59 +
.../spark/deploy/mesos/ui/DriverPage.scala | 179 ++
.../deploy/mesos/ui/MesosClusterPage.scala | 136 ++
.../spark/deploy/mesos/ui/MesosClusterUI.scala | 49 +
.../deploy/rest/mesos/MesosRestServer.scala | 156 ++
.../spark/executor/MesosExecutorBackend.scala | 131 ++
.../cluster/mesos/MesosClusterManager.scala | 64 +
.../mesos/MesosClusterPersistenceEngine.scala | 134 ++
.../cluster/mesos/MesosClusterScheduler.scala | 740 +++++++++
.../mesos/MesosClusterSchedulerSource.scala | 40 +
.../MesosCoarseGrainedSchedulerBackend.scala | 668 ++++++++
.../MesosFineGrainedSchedulerBackend.scala | 444 +++++
.../mesos/MesosSchedulerBackendUtil.scala | 165 ++
.../cluster/mesos/MesosSchedulerUtils.scala | 524 ++++++
.../cluster/mesos/MesosTaskLaunchData.scala | 51 +
.../MesosClusterDispatcherArgumentsSuite.scala | 63 +
.../mesos/MesosClusterDispatcherSuite.scala | 40 +
.../mesos/MesosClusterManagerSuite.scala | 56 +
.../mesos/MesosClusterSchedulerSuite.scala | 239 +++
...esosCoarseGrainedSchedulerBackendSuite.scala | 601 +++++++
.../MesosFineGrainedSchedulerBackendSuite.scala | 385 +++++
.../mesos/MesosSchedulerUtilsSuite.scala | 256 +++
.../mesos/MesosTaskLaunchDataSuite.scala | 36 +
.../spark/scheduler/cluster/mesos/Utils.scala | 91 ++
resource-managers/yarn/pom.xml | 215 +++
...ploy.yarn.security.ServiceCredentialProvider | 3 +
...pache.spark.scheduler.ExternalClusterManager | 1 +
.../spark/deploy/yarn/ApplicationMaster.scala | 791 +++++++++
.../yarn/ApplicationMasterArguments.scala | 105 ++
.../org/apache/spark/deploy/yarn/Client.scala | 1541 ++++++++++++++++++
.../spark/deploy/yarn/ClientArguments.scala | 86 +
.../yarn/ClientDistributedCacheManager.scala | 186 +++
.../spark/deploy/yarn/ExecutorRunnable.scala | 266 +++
...ityPreferredContainerPlacementStrategy.scala | 224 +++
.../spark/deploy/yarn/YarnAllocator.scala | 727 +++++++++
.../apache/spark/deploy/yarn/YarnRMClient.scala | 135 ++
.../spark/deploy/yarn/YarnSparkHadoopUtil.scala | 317 ++++
.../org/apache/spark/deploy/yarn/config.scala | 347 ++++
.../yarn/security/AMCredentialRenewer.scala | 235 +++
.../ConfigurableCredentialManager.scala | 105 ++
.../yarn/security/CredentialUpdater.scala | 130 ++
.../yarn/security/HBaseCredentialProvider.scala | 74 +
.../yarn/security/HDFSCredentialProvider.scala | 110 ++
.../yarn/security/HiveCredentialProvider.scala | 129 ++
.../security/ServiceCredentialProvider.scala | 57 +
.../launcher/YarnCommandBuilderUtils.scala | 53 +
.../cluster/SchedulerExtensionService.scala | 143 ++
.../cluster/YarnClientSchedulerBackend.scala | 157 ++
.../scheduler/cluster/YarnClusterManager.scala | 56 +
.../cluster/YarnClusterScheduler.scala | 37 +
.../cluster/YarnClusterSchedulerBackend.scala | 67 +
.../spark/scheduler/cluster/YarnScheduler.scala | 39 +
.../cluster/YarnSchedulerBackend.scala | 315 ++++
...ploy.yarn.security.ServiceCredentialProvider | 1 +
.../yarn/src/test/resources/log4j.properties | 31 +
.../deploy/yarn/BaseYarnClusterSuite.scala | 241 +++
.../ClientDistributedCacheManagerSuite.scala | 204 +++
.../apache/spark/deploy/yarn/ClientSuite.scala | 462 ++++++
.../yarn/ContainerPlacementStrategySuite.scala | 153 ++
.../spark/deploy/yarn/YarnAllocatorSuite.scala | 344 ++++
.../spark/deploy/yarn/YarnClusterSuite.scala | 493 ++++++
.../yarn/YarnShuffleIntegrationSuite.scala | 112 ++
.../deploy/yarn/YarnSparkHadoopUtilSuite.scala | 213 +++
.../ConfigurableCredentialManagerSuite.scala | 150 ++
.../security/HDFSCredentialProviderSuite.scala | 71 +
.../spark/launcher/TestClasspathBuilder.scala | 36 +
.../network/shuffle/ShuffleTestAccessor.scala | 70 +
.../network/yarn/YarnShuffleServiceSuite.scala | 372 +++++
.../spark/network/yarn/YarnTestAccessor.scala | 37 +
.../ExtensionServiceIntegrationSuite.scala | 72 +
.../cluster/SimpleExtensionService.scala | 34 +
.../cluster/StubApplicationAttemptId.scala | 48 +
.../scheduler/cluster/StubApplicationId.scala | 42 +
yarn/pom.xml | 215 ---
...ploy.yarn.security.ServiceCredentialProvider | 3 -
...pache.spark.scheduler.ExternalClusterManager | 1 -
.../spark/deploy/yarn/ApplicationMaster.scala | 791 ---------
.../yarn/ApplicationMasterArguments.scala | 105 --
.../org/apache/spark/deploy/yarn/Client.scala | 1541 ------------------
.../spark/deploy/yarn/ClientArguments.scala | 86 -
.../yarn/ClientDistributedCacheManager.scala | 186 ---
.../spark/deploy/yarn/ExecutorRunnable.scala | 266 ---
...ityPreferredContainerPlacementStrategy.scala | 224 ---
.../spark/deploy/yarn/YarnAllocator.scala | 727 ---------
.../apache/spark/deploy/yarn/YarnRMClient.scala | 135 --
.../spark/deploy/yarn/YarnSparkHadoopUtil.scala | 317 ----
.../org/apache/spark/deploy/yarn/config.scala | 347 ----
.../yarn/security/AMCredentialRenewer.scala | 235 ---
.../ConfigurableCredentialManager.scala | 105 --
.../yarn/security/CredentialUpdater.scala | 130 --
.../yarn/security/HBaseCredentialProvider.scala | 74 -
.../yarn/security/HDFSCredentialProvider.scala | 110 --
.../yarn/security/HiveCredentialProvider.scala | 129 --
.../security/ServiceCredentialProvider.scala | 57 -
.../launcher/YarnCommandBuilderUtils.scala | 53 -
.../cluster/SchedulerExtensionService.scala | 143 --
.../cluster/YarnClientSchedulerBackend.scala | 157 --
.../scheduler/cluster/YarnClusterManager.scala | 56 -
.../cluster/YarnClusterScheduler.scala | 37 -
.../cluster/YarnClusterSchedulerBackend.scala | 67 -
.../spark/scheduler/cluster/YarnScheduler.scala | 39 -
.../cluster/YarnSchedulerBackend.scala | 315 ----
...ploy.yarn.security.ServiceCredentialProvider | 1 -
yarn/src/test/resources/log4j.properties | 31 -
.../deploy/yarn/BaseYarnClusterSuite.scala | 241 ---
.../ClientDistributedCacheManagerSuite.scala | 204 ---
.../apache/spark/deploy/yarn/ClientSuite.scala | 462 ------
.../yarn/ContainerPlacementStrategySuite.scala | 153 --
.../spark/deploy/yarn/YarnAllocatorSuite.scala | 344 ----
.../spark/deploy/yarn/YarnClusterSuite.scala | 493 ------
.../yarn/YarnShuffleIntegrationSuite.scala | 112 --
.../deploy/yarn/YarnSparkHadoopUtilSuite.scala | 213 ---
.../ConfigurableCredentialManagerSuite.scala | 150 --
.../security/HDFSCredentialProviderSuite.scala | 71 -
.../spark/launcher/TestClasspathBuilder.scala | 36 -
.../network/shuffle/ShuffleTestAccessor.scala | 70 -
.../network/yarn/YarnShuffleServiceSuite.scala | 372 -----
.../spark/network/yarn/YarnTestAccessor.scala | 37 -
.../ExtensionServiceIntegrationSuite.scala | 72 -
.../cluster/SimpleExtensionService.scala | 34 -
.../cluster/StubApplicationAttemptId.scala | 48 -
.../scheduler/cluster/StubApplicationId.scala | 42 -
160 files changed, 15727 insertions(+), 15727 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/dev/sparktestsupport/modules.py
----------------------------------------------------------------------
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index b34ab51..1a7cf9a 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -469,7 +469,7 @@ yarn = Module(
name="yarn",
dependencies=[],
source_file_regexes=[
- "yarn/",
+ "resource-managers/yarn/",
"common/network-yarn/",
],
build_profile_flags=["-Pyarn"],
@@ -485,7 +485,7 @@ yarn = Module(
mesos = Module(
name="mesos",
dependencies=[],
- source_file_regexes=["mesos/"],
+ source_file_regexes=["resource-managers/mesos/"],
build_profile_flags=["-Pmesos"],
sbt_test_goals=["mesos/test"]
)
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/pom.xml
----------------------------------------------------------------------
diff --git a/mesos/pom.xml b/mesos/pom.xml
deleted file mode 100644
index 60f2359..0000000
--- a/mesos/pom.xml
+++ /dev/null
@@ -1,109 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- ~ Licensed to the Apache Software Foundation (ASF) under one or more
- ~ contributor license agreements. See the NOTICE file distributed with
- ~ this work for additional information regarding copyright ownership.
- ~ The ASF licenses this file to You under the Apache License, Version 2.0
- ~ (the "License"); you may not use this file except in compliance with
- ~ the License. You may obtain a copy of the License at
- ~
- ~ http://www.apache.org/licenses/LICENSE-2.0
- ~
- ~ Unless required by applicable law or agreed to in writing, software
- ~ distributed under the License is distributed on an "AS IS" BASIS,
- ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ~ See the License for the specific language governing permissions and
- ~ limitations under the License.
- -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
- <parent>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-parent_2.11</artifactId>
- <version>2.2.0-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <artifactId>spark-mesos_2.11</artifactId>
- <packaging>jar</packaging>
- <name>Spark Project Mesos</name>
- <properties>
- <sbt.project.name>mesos</sbt.project.name>
- <mesos.version>1.0.0</mesos.version>
- <mesos.classifier>shaded-protobuf</mesos.classifier>
- </properties>
-
- <dependencies>
- <dependency>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-core_${scala.binary.version}</artifactId>
- <version>${project.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-core_${scala.binary.version}</artifactId>
- <version>${project.version}</version>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.apache.mesos</groupId>
- <artifactId>mesos</artifactId>
- <version>${mesos.version}</version>
- <classifier>${mesos.classifier}</classifier>
- <exclusions>
- <exclusion>
- <groupId>com.google.protobuf</groupId>
- <artifactId>protobuf-java</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
-
- <dependency>
- <groupId>org.mockito</groupId>
- <artifactId>mockito-core</artifactId>
- <scope>test</scope>
- </dependency>
-
- <!-- Explicitly depend on shaded dependencies from the parent, since shaded deps aren't transitive -->
- <dependency>
- <groupId>com.google.guava</groupId>
- <artifactId>guava</artifactId>
- </dependency>
- <dependency>
- <groupId>org.eclipse.jetty</groupId>
- <artifactId>jetty-server</artifactId>
- </dependency>
- <dependency>
- <groupId>org.eclipse.jetty</groupId>
- <artifactId>jetty-plus</artifactId>
- </dependency>
- <dependency>
- <groupId>org.eclipse.jetty</groupId>
- <artifactId>jetty-util</artifactId>
- </dependency>
- <dependency>
- <groupId>org.eclipse.jetty</groupId>
- <artifactId>jetty-http</artifactId>
- </dependency>
- <dependency>
- <groupId>org.eclipse.jetty</groupId>
- <artifactId>jetty-servlet</artifactId>
- </dependency>
- <dependency>
- <groupId>org.eclipse.jetty</groupId>
- <artifactId>jetty-servlets</artifactId>
- </dependency>
- <!-- End of shaded deps. -->
-
- </dependencies>
-
-
- <build>
- <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
- <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
- </build>
-
-</project>
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager
----------------------------------------------------------------------
diff --git a/mesos/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager b/mesos/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager
deleted file mode 100644
index 12b6d5b..0000000
--- a/mesos/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager
+++ /dev/null
@@ -1 +0,0 @@
-org.apache.spark.scheduler.cluster.mesos.MesosClusterManager
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala b/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala
deleted file mode 100644
index 792ade8..0000000
--- a/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.mesos
-
-import java.util.concurrent.CountDownLatch
-
-import org.apache.spark.{SecurityManager, SparkConf}
-import org.apache.spark.deploy.mesos.config._
-import org.apache.spark.deploy.mesos.ui.MesosClusterUI
-import org.apache.spark.deploy.rest.mesos.MesosRestServer
-import org.apache.spark.internal.Logging
-import org.apache.spark.scheduler.cluster.mesos._
-import org.apache.spark.util.{CommandLineUtils, ShutdownHookManager, Utils}
-
-/*
- * A dispatcher that is responsible for managing and launching drivers, and is intended to be
- * used for Mesos cluster mode. The dispatcher is a long-running process started by the user in
- * the cluster independently of Spark applications.
- * It contains a [[MesosRestServer]] that listens for requests to submit drivers and a
- * [[MesosClusterScheduler]] that processes these requests by negotiating with the Mesos master
- * for resources.
- *
- * A typical new driver lifecycle is the following:
- * - Driver submitted via spark-submit talking to the [[MesosRestServer]]
- * - [[MesosRestServer]] queues the driver request to [[MesosClusterScheduler]]
- * - [[MesosClusterScheduler]] gets resource offers and launches the drivers that are in queue
- *
- * This dispatcher supports both Mesos fine-grain or coarse-grain mode as the mode is configurable
- * per driver launched.
- * This class is needed since Mesos doesn't manage frameworks, so the dispatcher acts as
- * a daemon to launch drivers as Mesos frameworks upon request. The dispatcher is also started and
- * stopped by sbin/start-mesos-dispatcher and sbin/stop-mesos-dispatcher respectively.
- */
-private[mesos] class MesosClusterDispatcher(
- args: MesosClusterDispatcherArguments,
- conf: SparkConf)
- extends Logging {
-
- private val publicAddress = Option(conf.getenv("SPARK_PUBLIC_DNS")).getOrElse(args.host)
- private val recoveryMode = conf.get(RECOVERY_MODE).toUpperCase()
- logInfo("Recovery mode in Mesos dispatcher set to: " + recoveryMode)
-
- private val engineFactory = recoveryMode match {
- case "NONE" => new BlackHoleMesosClusterPersistenceEngineFactory
- case "ZOOKEEPER" => new ZookeeperMesosClusterPersistenceEngineFactory(conf)
- case _ => throw new IllegalArgumentException("Unsupported recovery mode: " + recoveryMode)
- }
-
- private val scheduler = new MesosClusterScheduler(engineFactory, conf)
-
- private val server = new MesosRestServer(args.host, args.port, conf, scheduler)
- private val webUi = new MesosClusterUI(
- new SecurityManager(conf),
- args.webUiPort,
- conf,
- publicAddress,
- scheduler)
-
- private val shutdownLatch = new CountDownLatch(1)
-
- def start(): Unit = {
- webUi.bind()
- scheduler.frameworkUrl = conf.get(DISPATCHER_WEBUI_URL).getOrElse(webUi.activeWebUiUrl)
- scheduler.start()
- server.start()
- }
-
- def awaitShutdown(): Unit = {
- shutdownLatch.await()
- }
-
- def stop(): Unit = {
- webUi.stop()
- server.stop()
- scheduler.stop()
- shutdownLatch.countDown()
- }
-}
-
-private[mesos] object MesosClusterDispatcher
- extends Logging
- with CommandLineUtils {
-
- override def main(args: Array[String]) {
- Utils.initDaemon(log)
- val conf = new SparkConf
- val dispatcherArgs = new MesosClusterDispatcherArguments(args, conf)
- conf.setMaster(dispatcherArgs.masterUrl)
- conf.setAppName(dispatcherArgs.name)
- dispatcherArgs.zookeeperUrl.foreach { z =>
- conf.set(RECOVERY_MODE, "ZOOKEEPER")
- conf.set(ZOOKEEPER_URL, z)
- }
- val dispatcher = new MesosClusterDispatcher(dispatcherArgs, conf)
- dispatcher.start()
- logDebug("Adding shutdown hook") // force eager creation of logger
- ShutdownHookManager.addShutdownHook { () =>
- logInfo("Shutdown hook is shutting down dispatcher")
- dispatcher.stop()
- dispatcher.awaitShutdown()
- }
- dispatcher.awaitShutdown()
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArguments.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArguments.scala b/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArguments.scala
deleted file mode 100644
index ef08502..0000000
--- a/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArguments.scala
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.mesos
-
-import scala.annotation.tailrec
-import scala.collection.mutable
-
-import org.apache.spark.util.{IntParam, Utils}
-import org.apache.spark.SparkConf
-
-private[mesos] class MesosClusterDispatcherArguments(args: Array[String], conf: SparkConf) {
- var host: String = Utils.localHostName()
- var port: Int = 7077
- var name: String = "Spark Cluster"
- var webUiPort: Int = 8081
- var verbose: Boolean = false
- var masterUrl: String = _
- var zookeeperUrl: Option[String] = None
- var propertiesFile: String = _
- val confProperties: mutable.HashMap[String, String] =
- new mutable.HashMap[String, String]()
-
- parse(args.toList)
-
- // scalastyle:on println
- propertiesFile = Utils.loadDefaultSparkProperties(conf, propertiesFile)
- Utils.updateSparkConfigFromProperties(conf, confProperties)
-
- // scalastyle:off println
- if (verbose) {
- MesosClusterDispatcher.printStream.println(s"Using host: $host")
- MesosClusterDispatcher.printStream.println(s"Using port: $port")
- MesosClusterDispatcher.printStream.println(s"Using webUiPort: $webUiPort")
- MesosClusterDispatcher.printStream.println(s"Framework Name: $name")
-
- Option(propertiesFile).foreach { file =>
- MesosClusterDispatcher.printStream.println(s"Using properties file: $file")
- }
-
- MesosClusterDispatcher.printStream.println(s"Spark Config properties set:")
- conf.getAll.foreach(println)
- }
-
- @tailrec
- private def parse(args: List[String]): Unit = args match {
- case ("--host" | "-h") :: value :: tail =>
- Utils.checkHost(value, "Please use hostname " + value)
- host = value
- parse(tail)
-
- case ("--port" | "-p") :: IntParam(value) :: tail =>
- port = value
- parse(tail)
-
- case ("--webui-port") :: IntParam(value) :: tail =>
- webUiPort = value
- parse(tail)
-
- case ("--zk" | "-z") :: value :: tail =>
- zookeeperUrl = Some(value)
- parse(tail)
-
- case ("--master" | "-m") :: value :: tail =>
- if (!value.startsWith("mesos://")) {
- // scalastyle:off println
- MesosClusterDispatcher.printStream
- .println("Cluster dispatcher only supports mesos (uri begins with mesos://)")
- // scalastyle:on println
- MesosClusterDispatcher.exitFn(1)
- }
- masterUrl = value.stripPrefix("mesos://")
- parse(tail)
-
- case ("--name") :: value :: tail =>
- name = value
- parse(tail)
-
- case ("--properties-file") :: value :: tail =>
- propertiesFile = value
- parse(tail)
-
- case ("--conf") :: value :: tail =>
- val pair = MesosClusterDispatcher.
- parseSparkConfProperty(value)
- confProperties(pair._1) = pair._2
- parse(tail)
-
- case ("--help") :: tail =>
- printUsageAndExit(0)
-
- case ("--verbose") :: tail =>
- verbose = true
- parse(tail)
-
- case Nil =>
- if (Option(masterUrl).isEmpty) {
- // scalastyle:off println
- MesosClusterDispatcher.printStream.println("--master is required")
- // scalastyle:on println
- printUsageAndExit(1)
- }
-
- case value =>
- // scalastyle:off println
- MesosClusterDispatcher.printStream.println(s"Unrecognized option: '${value.head}'")
- // scalastyle:on println
- printUsageAndExit(1)
- }
-
- private def printUsageAndExit(exitCode: Int): Unit = {
- val outStream = MesosClusterDispatcher.printStream
-
- // scalastyle:off println
- outStream.println(
- "Usage: MesosClusterDispatcher [options]\n" +
- "\n" +
- "Options:\n" +
- " -h HOST, --host HOST Hostname to listen on\n" +
- " --help Show this help message and exit.\n" +
- " --verbose, Print additional debug output.\n" +
- " -p PORT, --port PORT Port to listen on (default: 7077)\n" +
- " --webui-port WEBUI_PORT WebUI Port to listen on (default: 8081)\n" +
- " --name NAME Framework name to show in Mesos UI\n" +
- " -m --master MASTER URI for connecting to Mesos master\n" +
- " -z --zk ZOOKEEPER Comma delimited URLs for connecting to \n" +
- " Zookeeper for persistence\n" +
- " --properties-file FILE Path to a custom Spark properties file.\n" +
- " Default is conf/spark-defaults.conf \n" +
- " --conf PROP=VALUE Arbitrary Spark configuration property.\n" +
- " Takes precedence over defined properties in properties-file.")
- // scalastyle:on println
- MesosClusterDispatcher.exitFn(exitCode)
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosDriverDescription.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosDriverDescription.scala b/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosDriverDescription.scala
deleted file mode 100644
index d4c7022..0000000
--- a/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosDriverDescription.scala
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.mesos
-
-import java.util.Date
-
-import org.apache.spark.SparkConf
-import org.apache.spark.deploy.Command
-import org.apache.spark.scheduler.cluster.mesos.MesosClusterRetryState
-
-/**
- * Describes a Spark driver that is submitted from the
- * [[org.apache.spark.deploy.rest.mesos.MesosRestServer]], to be launched by
- * [[org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler]].
- * @param jarUrl URL to the application jar
- * @param mem Amount of memory for the driver
- * @param cores Number of cores for the driver
- * @param supervise Supervise the driver for long running app
- * @param command The command to launch the driver.
- * @param schedulerProperties Extra properties to pass the Mesos scheduler
- */
-private[spark] class MesosDriverDescription(
- val name: String,
- val jarUrl: String,
- val mem: Int,
- val cores: Double,
- val supervise: Boolean,
- val command: Command,
- schedulerProperties: Map[String, String],
- val submissionId: String,
- val submissionDate: Date,
- val retryState: Option[MesosClusterRetryState] = None)
- extends Serializable {
-
- val conf = new SparkConf(false)
- schedulerProperties.foreach {case (k, v) => conf.set(k, v)}
-
- def copy(
- name: String = name,
- jarUrl: String = jarUrl,
- mem: Int = mem,
- cores: Double = cores,
- supervise: Boolean = supervise,
- command: Command = command,
- schedulerProperties: SparkConf = conf,
- submissionId: String = submissionId,
- submissionDate: Date = submissionDate,
- retryState: Option[MesosClusterRetryState] = retryState): MesosDriverDescription = {
-
- new MesosDriverDescription(name, jarUrl, mem, cores, supervise, command, conf.getAll.toMap,
- submissionId, submissionDate, retryState)
- }
-
- override def toString: String = s"MesosDriverDescription (${command.mainClass})"
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosExternalShuffleService.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosExternalShuffleService.scala b/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosExternalShuffleService.scala
deleted file mode 100644
index 859aa83..0000000
--- a/mesos/src/main/scala/org/apache/spark/deploy/mesos/MesosExternalShuffleService.scala
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.mesos
-
-import java.nio.ByteBuffer
-import java.util.concurrent.{ConcurrentHashMap, TimeUnit}
-
-import scala.collection.JavaConverters._
-
-import org.apache.spark.{SecurityManager, SparkConf}
-import org.apache.spark.deploy.ExternalShuffleService
-import org.apache.spark.deploy.mesos.config._
-import org.apache.spark.internal.Logging
-import org.apache.spark.network.client.{RpcResponseCallback, TransportClient}
-import org.apache.spark.network.shuffle.ExternalShuffleBlockHandler
-import org.apache.spark.network.shuffle.protocol.BlockTransferMessage
-import org.apache.spark.network.shuffle.protocol.mesos.{RegisterDriver, ShuffleServiceHeartbeat}
-import org.apache.spark.network.util.TransportConf
-import org.apache.spark.util.ThreadUtils
-
-/**
- * An RPC endpoint that receives registration requests from Spark drivers running on Mesos.
- * It detects driver termination and calls the cleanup callback to [[ExternalShuffleService]].
- */
-private[mesos] class MesosExternalShuffleBlockHandler(
- transportConf: TransportConf,
- cleanerIntervalS: Long)
- extends ExternalShuffleBlockHandler(transportConf, null) with Logging {
-
- ThreadUtils.newDaemonSingleThreadScheduledExecutor("shuffle-cleaner-watcher")
- .scheduleAtFixedRate(new CleanerThread(), 0, cleanerIntervalS, TimeUnit.SECONDS)
-
- // Stores a map of app id to app state (timeout value and last heartbeat)
- private val connectedApps = new ConcurrentHashMap[String, AppState]()
-
- protected override def handleMessage(
- message: BlockTransferMessage,
- client: TransportClient,
- callback: RpcResponseCallback): Unit = {
- message match {
- case RegisterDriverParam(appId, appState) =>
- val address = client.getSocketAddress
- val timeout = appState.heartbeatTimeout
- logInfo(s"Received registration request from app $appId (remote address $address, " +
- s"heartbeat timeout $timeout ms).")
- if (connectedApps.containsKey(appId)) {
- logWarning(s"Received a registration request from app $appId, but it was already " +
- s"registered")
- }
- connectedApps.put(appId, appState)
- callback.onSuccess(ByteBuffer.allocate(0))
- case Heartbeat(appId) =>
- val address = client.getSocketAddress
- Option(connectedApps.get(appId)) match {
- case Some(existingAppState) =>
- logTrace(s"Received ShuffleServiceHeartbeat from app '$appId' (remote " +
- s"address $address).")
- existingAppState.lastHeartbeat = System.nanoTime()
- case None =>
- logWarning(s"Received ShuffleServiceHeartbeat from an unknown app (remote " +
- s"address $address, appId '$appId').")
- }
- case _ => super.handleMessage(message, client, callback)
- }
- }
-
- /** An extractor object for matching [[RegisterDriver]] message. */
- private object RegisterDriverParam {
- def unapply(r: RegisterDriver): Option[(String, AppState)] =
- Some((r.getAppId, new AppState(r.getHeartbeatTimeoutMs, System.nanoTime())))
- }
-
- private object Heartbeat {
- def unapply(h: ShuffleServiceHeartbeat): Option[String] = Some(h.getAppId)
- }
-
- private class AppState(val heartbeatTimeout: Long, @volatile var lastHeartbeat: Long)
-
- private class CleanerThread extends Runnable {
- override def run(): Unit = {
- val now = System.nanoTime()
- connectedApps.asScala.foreach { case (appId, appState) =>
- if (now - appState.lastHeartbeat > appState.heartbeatTimeout * 1000 * 1000) {
- logInfo(s"Application $appId timed out. Removing shuffle files.")
- connectedApps.remove(appId)
- applicationRemoved(appId, true)
- }
- }
- }
- }
-}
-
-/**
- * A wrapper of [[ExternalShuffleService]] that provides an additional endpoint for drivers
- * to associate with. This allows the shuffle service to detect when a driver is terminated
- * and can clean up the associated shuffle files.
- */
-private[mesos] class MesosExternalShuffleService(conf: SparkConf, securityManager: SecurityManager)
- extends ExternalShuffleService(conf, securityManager) {
-
- protected override def newShuffleBlockHandler(
- conf: TransportConf): ExternalShuffleBlockHandler = {
- val cleanerIntervalS = this.conf.get(SHUFFLE_CLEANER_INTERVAL_S)
- new MesosExternalShuffleBlockHandler(conf, cleanerIntervalS)
- }
-}
-
-private[spark] object MesosExternalShuffleService extends Logging {
-
- def main(args: Array[String]): Unit = {
- ExternalShuffleService.main(args,
- (conf: SparkConf, sm: SecurityManager) => new MesosExternalShuffleService(conf, sm))
- }
-}
-
-
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala b/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala
deleted file mode 100644
index 19e2533..0000000
--- a/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.mesos
-
-import java.util.concurrent.TimeUnit
-
-import org.apache.spark.internal.config.ConfigBuilder
-
-package object config {
-
- /* Common app configuration. */
-
- private[spark] val SHUFFLE_CLEANER_INTERVAL_S =
- ConfigBuilder("spark.shuffle.cleaner.interval")
- .timeConf(TimeUnit.SECONDS)
- .createWithDefaultString("30s")
-
- private[spark] val RECOVERY_MODE =
- ConfigBuilder("spark.deploy.recoveryMode")
- .stringConf
- .createWithDefault("NONE")
-
- private[spark] val DISPATCHER_WEBUI_URL =
- ConfigBuilder("spark.mesos.dispatcher.webui.url")
- .doc("Set the Spark Mesos dispatcher webui_url for interacting with the " +
- "framework. If unset it will point to Spark's internal web UI.")
- .stringConf
- .createOptional
-
- private[spark] val ZOOKEEPER_URL =
- ConfigBuilder("spark.deploy.zookeeper.url")
- .doc("When `spark.deploy.recoveryMode` is set to ZOOKEEPER, this " +
- "configuration is used to set the zookeeper URL to connect to.")
- .stringConf
- .createOptional
-
- private[spark] val HISTORY_SERVER_URL =
- ConfigBuilder("spark.mesos.dispatcher.historyServer.url")
- .doc("Set the URL of the history server. The dispatcher will then " +
- "link each driver to its entry in the history server.")
- .stringConf
- .createOptional
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/DriverPage.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/DriverPage.scala b/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/DriverPage.scala
deleted file mode 100644
index cd98110..0000000
--- a/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/DriverPage.scala
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.mesos.ui
-
-import javax.servlet.http.HttpServletRequest
-
-import scala.xml.Node
-
-import org.apache.spark.deploy.Command
-import org.apache.spark.deploy.mesos.MesosDriverDescription
-import org.apache.spark.scheduler.cluster.mesos.{MesosClusterRetryState, MesosClusterSubmissionState}
-import org.apache.spark.ui.{UIUtils, WebUIPage}
-
-private[ui] class DriverPage(parent: MesosClusterUI) extends WebUIPage("driver") {
-
- override def render(request: HttpServletRequest): Seq[Node] = {
- val driverId = request.getParameter("id")
- require(driverId != null && driverId.nonEmpty, "Missing id parameter")
-
- val state = parent.scheduler.getDriverState(driverId)
- if (state.isEmpty) {
- val content =
- <div>
- <p>Cannot find driver {driverId}</p>
- </div>
- return UIUtils.basicSparkPage(content, s"Details for Job $driverId")
- }
- val driverState = state.get
- val driverHeaders = Seq("Driver property", "Value")
- val schedulerHeaders = Seq("Scheduler property", "Value")
- val commandEnvHeaders = Seq("Command environment variable", "Value")
- val launchedHeaders = Seq("Launched property", "Value")
- val commandHeaders = Seq("Command property", "Value")
- val retryHeaders = Seq("Last failed status", "Next retry time", "Retry count")
- val driverDescription = Iterable.apply(driverState.description)
- val submissionState = Iterable.apply(driverState.submissionState)
- val command = Iterable.apply(driverState.description.command)
- val schedulerProperties = Iterable.apply(driverState.description.conf.getAll.toMap)
- val commandEnv = Iterable.apply(driverState.description.command.environment)
- val driverTable =
- UIUtils.listingTable(driverHeaders, driverRow, driverDescription)
- val commandTable =
- UIUtils.listingTable(commandHeaders, commandRow, command)
- val commandEnvTable =
- UIUtils.listingTable(commandEnvHeaders, propertiesRow, commandEnv)
- val schedulerTable =
- UIUtils.listingTable(schedulerHeaders, propertiesRow, schedulerProperties)
- val launchedTable =
- UIUtils.listingTable(launchedHeaders, launchedRow, submissionState)
- val retryTable =
- UIUtils.listingTable(
- retryHeaders, retryRow, Iterable.apply(driverState.description.retryState))
- val content =
- <p>Driver state information for driver id {driverId}</p>
- <a href={UIUtils.prependBaseUri("/")}>Back to Drivers</a>
- <div class="row-fluid">
- <div class="span12">
- <h4>Driver state: {driverState.state}</h4>
- <h4>Driver properties</h4>
- {driverTable}
- <h4>Driver command</h4>
- {commandTable}
- <h4>Driver command environment</h4>
- {commandEnvTable}
- <h4>Scheduler properties</h4>
- {schedulerTable}
- <h4>Launched state</h4>
- {launchedTable}
- <h4>Retry state</h4>
- {retryTable}
- </div>
- </div>;
-
- UIUtils.basicSparkPage(content, s"Details for Job $driverId")
- }
-
- private def launchedRow(submissionState: Option[MesosClusterSubmissionState]): Seq[Node] = {
- submissionState.map { state =>
- <tr>
- <td>Mesos Slave ID</td>
- <td>{state.slaveId.getValue}</td>
- </tr>
- <tr>
- <td>Mesos Task ID</td>
- <td>{state.taskId.getValue}</td>
- </tr>
- <tr>
- <td>Launch Time</td>
- <td>{state.startDate}</td>
- </tr>
- <tr>
- <td>Finish Time</td>
- <td>{state.finishDate.map(_.toString).getOrElse("")}</td>
- </tr>
- <tr>
- <td>Last Task Status</td>
- <td>{state.mesosTaskStatus.map(_.toString).getOrElse("")}</td>
- </tr>
- }.getOrElse(Seq[Node]())
- }
-
- private def propertiesRow(properties: collection.Map[String, String]): Seq[Node] = {
- properties.map { case (k, v) =>
- <tr>
- <td>{k}</td><td>{v}</td>
- </tr>
- }.toSeq
- }
-
- private def commandRow(command: Command): Seq[Node] = {
- <tr>
- <td>Main class</td><td>{command.mainClass}</td>
- </tr>
- <tr>
- <td>Arguments</td><td>{command.arguments.mkString(" ")}</td>
- </tr>
- <tr>
- <td>Class path entries</td><td>{command.classPathEntries.mkString(" ")}</td>
- </tr>
- <tr>
- <td>Java options</td><td>{command.javaOpts.mkString((" "))}</td>
- </tr>
- <tr>
- <td>Library path entries</td><td>{command.libraryPathEntries.mkString((" "))}</td>
- </tr>
- }
-
- private def driverRow(driver: MesosDriverDescription): Seq[Node] = {
- <tr>
- <td>Name</td><td>{driver.name}</td>
- </tr>
- <tr>
- <td>Id</td><td>{driver.submissionId}</td>
- </tr>
- <tr>
- <td>Cores</td><td>{driver.cores}</td>
- </tr>
- <tr>
- <td>Memory</td><td>{driver.mem}</td>
- </tr>
- <tr>
- <td>Submitted</td><td>{driver.submissionDate}</td>
- </tr>
- <tr>
- <td>Supervise</td><td>{driver.supervise}</td>
- </tr>
- }
-
- private def retryRow(retryState: Option[MesosClusterRetryState]): Seq[Node] = {
- retryState.map { state =>
- <tr>
- <td>
- {state.lastFailureStatus}
- </td>
- <td>
- {state.nextRetry}
- </td>
- <td>
- {state.retries}
- </td>
- </tr>
- }.getOrElse(Seq[Node]())
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterPage.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterPage.scala b/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterPage.scala
deleted file mode 100644
index 13ba7d3..0000000
--- a/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterPage.scala
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.mesos.ui
-
-import javax.servlet.http.HttpServletRequest
-
-import scala.xml.Node
-
-import org.apache.mesos.Protos.TaskStatus
-
-import org.apache.spark.deploy.mesos.config._
-import org.apache.spark.deploy.mesos.MesosDriverDescription
-import org.apache.spark.scheduler.cluster.mesos.MesosClusterSubmissionState
-import org.apache.spark.ui.{UIUtils, WebUIPage}
-
-private[mesos] class MesosClusterPage(parent: MesosClusterUI) extends WebUIPage("") {
- private val historyServerURL = parent.conf.get(HISTORY_SERVER_URL)
-
- def render(request: HttpServletRequest): Seq[Node] = {
- val state = parent.scheduler.getSchedulerState()
-
- val driverHeader = Seq("Driver ID")
- val historyHeader = historyServerURL.map(url => Seq("History")).getOrElse(Nil)
- val submissionHeader = Seq("Submit Date", "Main Class", "Driver Resources")
-
- val queuedHeaders = driverHeader ++ submissionHeader
- val driverHeaders = driverHeader ++ historyHeader ++ submissionHeader ++
- Seq("Start Date", "Mesos Slave ID", "State")
- val retryHeaders = Seq("Driver ID", "Submit Date", "Description") ++
- Seq("Last Failed Status", "Next Retry Time", "Attempt Count")
- val queuedTable = UIUtils.listingTable(queuedHeaders, queuedRow, state.queuedDrivers)
- val launchedTable = UIUtils.listingTable(driverHeaders, driverRow, state.launchedDrivers)
- val finishedTable = UIUtils.listingTable(driverHeaders, driverRow, state.finishedDrivers)
- val retryTable = UIUtils.listingTable(retryHeaders, retryRow, state.pendingRetryDrivers)
- val content =
- <p>Mesos Framework ID: {state.frameworkId}</p>
- <div class="row-fluid">
- <div class="span12">
- <h4>Queued Drivers:</h4>
- {queuedTable}
- <h4>Launched Drivers:</h4>
- {launchedTable}
- <h4>Finished Drivers:</h4>
- {finishedTable}
- <h4>Supervise drivers waiting for retry:</h4>
- {retryTable}
- </div>
- </div>;
- UIUtils.basicSparkPage(content, "Spark Drivers for Mesos cluster")
- }
-
- private def queuedRow(submission: MesosDriverDescription): Seq[Node] = {
- val id = submission.submissionId
- <tr>
- <td><a href={s"driver?id=$id"}>{id}</a></td>
- <td>{submission.submissionDate}</td>
- <td>{submission.command.mainClass}</td>
- <td>cpus: {submission.cores}, mem: {submission.mem}</td>
- </tr>
- }
-
- private def driverRow(state: MesosClusterSubmissionState): Seq[Node] = {
- val id = state.driverDescription.submissionId
-
- val historyCol = if (historyServerURL.isDefined) {
- <td>
- <a href={s"${historyServerURL.get}/history/${state.frameworkId}"}>
- {state.frameworkId}
- </a>
- </td>
- } else Nil
-
- <tr>
- <td><a href={s"driver?id=$id"}>{id}</a></td>
- {historyCol}
- <td>{state.driverDescription.submissionDate}</td>
- <td>{state.driverDescription.command.mainClass}</td>
- <td>cpus: {state.driverDescription.cores}, mem: {state.driverDescription.mem}</td>
- <td>{state.startDate}</td>
- <td>{state.slaveId.getValue}</td>
- <td>{stateString(state.mesosTaskStatus)}</td>
- </tr>
- }
-
- private def retryRow(submission: MesosDriverDescription): Seq[Node] = {
- val id = submission.submissionId
- <tr>
- <td><a href={s"driver?id=$id"}>{id}</a></td>
- <td>{submission.submissionDate}</td>
- <td>{submission.command.mainClass}</td>
- <td>{submission.retryState.get.lastFailureStatus}</td>
- <td>{submission.retryState.get.nextRetry}</td>
- <td>{submission.retryState.get.retries}</td>
- </tr>
- }
-
- private def stateString(status: Option[TaskStatus]): String = {
- if (status.isEmpty) {
- return ""
- }
- val sb = new StringBuilder
- val s = status.get
- sb.append(s"State: ${s.getState}")
- if (status.get.hasMessage) {
- sb.append(s", Message: ${s.getMessage}")
- }
- if (status.get.hasHealthy) {
- sb.append(s", Healthy: ${s.getHealthy}")
- }
- if (status.get.hasSource) {
- sb.append(s", Source: ${s.getSource}")
- }
- if (status.get.hasReason) {
- sb.append(s", Reason: ${s.getReason}")
- }
- if (status.get.hasTimestamp) {
- sb.append(s", Time: ${s.getTimestamp}")
- }
- sb.toString()
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterUI.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterUI.scala b/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterUI.scala
deleted file mode 100644
index 6049789..0000000
--- a/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterUI.scala
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.mesos.ui
-
-import org.apache.spark.{SecurityManager, SparkConf}
-import org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler
-import org.apache.spark.ui.{SparkUI, WebUI}
-import org.apache.spark.ui.JettyUtils._
-
-/**
- * UI that displays driver results from the [[org.apache.spark.deploy.mesos.MesosClusterDispatcher]]
- */
-private[spark] class MesosClusterUI(
- securityManager: SecurityManager,
- port: Int,
- val conf: SparkConf,
- dispatcherPublicAddress: String,
- val scheduler: MesosClusterScheduler)
- extends WebUI(securityManager, securityManager.getSSLOptions("mesos"), port, conf) {
-
- initialize()
-
- def activeWebUiUrl: String = "http://" + dispatcherPublicAddress + ":" + boundPort
-
- override def initialize() {
- attachPage(new MesosClusterPage(this))
- attachPage(new DriverPage(this))
- attachHandler(createStaticHandler(MesosClusterUI.STATIC_RESOURCE_DIR, "/static"))
- }
-}
-
-private object MesosClusterUI {
- val STATIC_RESOURCE_DIR = SparkUI.STATIC_RESOURCE_DIR
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala b/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala
deleted file mode 100644
index ff60b88..0000000
--- a/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.rest.mesos
-
-import java.io.File
-import java.text.SimpleDateFormat
-import java.util.{Date, Locale}
-import java.util.concurrent.atomic.AtomicLong
-import javax.servlet.http.HttpServletResponse
-
-import org.apache.spark.{SPARK_VERSION => sparkVersion, SparkConf}
-import org.apache.spark.deploy.Command
-import org.apache.spark.deploy.mesos.MesosDriverDescription
-import org.apache.spark.deploy.rest._
-import org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler
-import org.apache.spark.util.Utils
-
-/**
- * A server that responds to requests submitted by the [[RestSubmissionClient]].
- * All requests are forwarded to
- * [[org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler]].
- * This is intended to be used in Mesos cluster mode only.
- * For more details about the REST submission please refer to [[RestSubmissionServer]] javadocs.
- */
-private[spark] class MesosRestServer(
- host: String,
- requestedPort: Int,
- masterConf: SparkConf,
- scheduler: MesosClusterScheduler)
- extends RestSubmissionServer(host, requestedPort, masterConf) {
-
- protected override val submitRequestServlet =
- new MesosSubmitRequestServlet(scheduler, masterConf)
- protected override val killRequestServlet =
- new MesosKillRequestServlet(scheduler, masterConf)
- protected override val statusRequestServlet =
- new MesosStatusRequestServlet(scheduler, masterConf)
-}
-
-private[mesos] class MesosSubmitRequestServlet(
- scheduler: MesosClusterScheduler,
- conf: SparkConf)
- extends SubmitRequestServlet {
-
- private val DEFAULT_SUPERVISE = false
- private val DEFAULT_MEMORY = Utils.DEFAULT_DRIVER_MEM_MB // mb
- private val DEFAULT_CORES = 1.0
-
- private val nextDriverNumber = new AtomicLong(0)
- // For application IDs
- private def createDateFormat = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US)
- private def newDriverId(submitDate: Date): String =
- f"driver-${createDateFormat.format(submitDate)}-${nextDriverNumber.incrementAndGet()}%04d"
-
- /**
- * Build a driver description from the fields specified in the submit request.
- *
- * This involves constructing a command that launches a mesos framework for the job.
- * This does not currently consider fields used by python applications since python
- * is not supported in mesos cluster mode yet.
- */
- private def buildDriverDescription(request: CreateSubmissionRequest): MesosDriverDescription = {
- // Required fields, including the main class because python is not yet supported
- val appResource = Option(request.appResource).getOrElse {
- throw new SubmitRestMissingFieldException("Application jar is missing.")
- }
- val mainClass = Option(request.mainClass).getOrElse {
- throw new SubmitRestMissingFieldException("Main class is missing.")
- }
-
- // Optional fields
- val sparkProperties = request.sparkProperties
- val driverExtraJavaOptions = sparkProperties.get("spark.driver.extraJavaOptions")
- val driverExtraClassPath = sparkProperties.get("spark.driver.extraClassPath")
- val driverExtraLibraryPath = sparkProperties.get("spark.driver.extraLibraryPath")
- val superviseDriver = sparkProperties.get("spark.driver.supervise")
- val driverMemory = sparkProperties.get("spark.driver.memory")
- val driverCores = sparkProperties.get("spark.driver.cores")
- val appArgs = request.appArgs
- val environmentVariables = request.environmentVariables
- val name = request.sparkProperties.getOrElse("spark.app.name", mainClass)
-
- // Construct driver description
- val conf = new SparkConf(false).setAll(sparkProperties)
- val extraClassPath = driverExtraClassPath.toSeq.flatMap(_.split(File.pathSeparator))
- val extraLibraryPath = driverExtraLibraryPath.toSeq.flatMap(_.split(File.pathSeparator))
- val extraJavaOpts = driverExtraJavaOptions.map(Utils.splitCommandString).getOrElse(Seq.empty)
- val sparkJavaOpts = Utils.sparkJavaOpts(conf)
- val javaOpts = sparkJavaOpts ++ extraJavaOpts
- val command = new Command(
- mainClass, appArgs, environmentVariables, extraClassPath, extraLibraryPath, javaOpts)
- val actualSuperviseDriver = superviseDriver.map(_.toBoolean).getOrElse(DEFAULT_SUPERVISE)
- val actualDriverMemory = driverMemory.map(Utils.memoryStringToMb).getOrElse(DEFAULT_MEMORY)
- val actualDriverCores = driverCores.map(_.toDouble).getOrElse(DEFAULT_CORES)
- val submitDate = new Date()
- val submissionId = newDriverId(submitDate)
-
- new MesosDriverDescription(
- name, appResource, actualDriverMemory, actualDriverCores, actualSuperviseDriver,
- command, request.sparkProperties, submissionId, submitDate)
- }
-
- protected override def handleSubmit(
- requestMessageJson: String,
- requestMessage: SubmitRestProtocolMessage,
- responseServlet: HttpServletResponse): SubmitRestProtocolResponse = {
- requestMessage match {
- case submitRequest: CreateSubmissionRequest =>
- val driverDescription = buildDriverDescription(submitRequest)
- val s = scheduler.submitDriver(driverDescription)
- s.serverSparkVersion = sparkVersion
- val unknownFields = findUnknownFields(requestMessageJson, requestMessage)
- if (unknownFields.nonEmpty) {
- // If there are fields that the server does not know about, warn the client
- s.unknownFields = unknownFields
- }
- s
- case unexpected =>
- responseServlet.setStatus(HttpServletResponse.SC_BAD_REQUEST)
- handleError(s"Received message of unexpected type ${unexpected.messageType}.")
- }
- }
-}
-
-private[mesos] class MesosKillRequestServlet(scheduler: MesosClusterScheduler, conf: SparkConf)
- extends KillRequestServlet {
- protected override def handleKill(submissionId: String): KillSubmissionResponse = {
- val k = scheduler.killDriver(submissionId)
- k.serverSparkVersion = sparkVersion
- k
- }
-}
-
-private[mesos] class MesosStatusRequestServlet(scheduler: MesosClusterScheduler, conf: SparkConf)
- extends StatusRequestServlet {
- protected override def handleStatus(submissionId: String): SubmissionStatusResponse = {
- val d = scheduler.getDriverStatus(submissionId)
- d.serverSparkVersion = sparkVersion
- d
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala b/mesos/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala
deleted file mode 100644
index ee9149c..0000000
--- a/mesos/src/main/scala/org/apache/spark/executor/MesosExecutorBackend.scala
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.executor
-
-import java.nio.ByteBuffer
-
-import scala.collection.JavaConverters._
-
-import org.apache.mesos.{Executor => MesosExecutor, ExecutorDriver, MesosExecutorDriver}
-import org.apache.mesos.Protos.{TaskStatus => MesosTaskStatus, _}
-import org.apache.mesos.protobuf.ByteString
-
-import org.apache.spark.{SparkConf, SparkEnv, TaskState}
-import org.apache.spark.TaskState
-import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.internal.Logging
-import org.apache.spark.scheduler.cluster.mesos.{MesosSchedulerUtils, MesosTaskLaunchData}
-import org.apache.spark.util.Utils
-
-private[spark] class MesosExecutorBackend
- extends MesosExecutor
- with MesosSchedulerUtils // TODO: fix
- with ExecutorBackend
- with Logging {
-
- var executor: Executor = null
- var driver: ExecutorDriver = null
-
- override def statusUpdate(taskId: Long, state: TaskState.TaskState, data: ByteBuffer) {
- val mesosTaskId = TaskID.newBuilder().setValue(taskId.toString).build()
- driver.sendStatusUpdate(MesosTaskStatus.newBuilder()
- .setTaskId(mesosTaskId)
- .setState(taskStateToMesos(state))
- .setData(ByteString.copyFrom(data))
- .build())
- }
-
- override def registered(
- driver: ExecutorDriver,
- executorInfo: ExecutorInfo,
- frameworkInfo: FrameworkInfo,
- slaveInfo: SlaveInfo) {
-
- // Get num cores for this task from ExecutorInfo, created in MesosSchedulerBackend.
- val cpusPerTask = executorInfo.getResourcesList.asScala
- .find(_.getName == "cpus")
- .map(_.getScalar.getValue.toInt)
- .getOrElse(0)
- val executorId = executorInfo.getExecutorId.getValue
-
- logInfo(s"Registered with Mesos as executor ID $executorId with $cpusPerTask cpus")
- this.driver = driver
- // Set a context class loader to be picked up by the serializer. Without this call
- // the serializer would default to the null class loader, and fail to find Spark classes
- // See SPARK-10986.
- Thread.currentThread().setContextClassLoader(this.getClass.getClassLoader)
-
- val properties = Utils.deserialize[Array[(String, String)]](executorInfo.getData.toByteArray) ++
- Seq[(String, String)](("spark.app.id", frameworkInfo.getId.getValue))
- val conf = new SparkConf(loadDefaults = true).setAll(properties)
- val port = conf.getInt("spark.executor.port", 0)
- val env = SparkEnv.createExecutorEnv(
- conf, executorId, slaveInfo.getHostname, port, cpusPerTask, None, isLocal = false)
-
- executor = new Executor(
- executorId,
- slaveInfo.getHostname,
- env)
- }
-
- override def launchTask(d: ExecutorDriver, taskInfo: TaskInfo) {
- val taskId = taskInfo.getTaskId.getValue.toLong
- val taskData = MesosTaskLaunchData.fromByteString(taskInfo.getData)
- if (executor == null) {
- logError("Received launchTask but executor was null")
- } else {
- SparkHadoopUtil.get.runAsSparkUser { () =>
- executor.launchTask(this, taskId = taskId, attemptNumber = taskData.attemptNumber,
- taskInfo.getName, taskData.serializedTask)
- }
- }
- }
-
- override def error(d: ExecutorDriver, message: String) {
- logError("Error from Mesos: " + message)
- }
-
- override def killTask(d: ExecutorDriver, t: TaskID) {
- if (executor == null) {
- logError("Received KillTask but executor was null")
- } else {
- // TODO: Determine the 'interruptOnCancel' property set for the given job.
- executor.killTask(t.getValue.toLong, interruptThread = false)
- }
- }
-
- override def reregistered(d: ExecutorDriver, p2: SlaveInfo) {}
-
- override def disconnected(d: ExecutorDriver) {}
-
- override def frameworkMessage(d: ExecutorDriver, data: Array[Byte]) {}
-
- override def shutdown(d: ExecutorDriver) {}
-}
-
-/**
- * Entry point for Mesos executor.
- */
-private[spark] object MesosExecutorBackend extends Logging {
- def main(args: Array[String]) {
- Utils.initDaemon(log)
- // Create a new Executor and start it running
- val runner = new MesosExecutorBackend()
- new MesosExecutorDriver(runner).run()
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterManager.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterManager.scala b/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterManager.scala
deleted file mode 100644
index ed29b34..0000000
--- a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterManager.scala
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import org.apache.spark.{SparkContext, SparkException}
-import org.apache.spark.internal.config._
-import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl}
-
-/**
- * Cluster Manager for creation of Yarn scheduler and backend
- */
-private[spark] class MesosClusterManager extends ExternalClusterManager {
- private val MESOS_REGEX = """mesos://(.*)""".r
-
- override def canCreate(masterURL: String): Boolean = {
- masterURL.startsWith("mesos")
- }
-
- override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = {
- new TaskSchedulerImpl(sc)
- }
-
- override def createSchedulerBackend(sc: SparkContext,
- masterURL: String,
- scheduler: TaskScheduler): SchedulerBackend = {
- require(!sc.conf.get(IO_ENCRYPTION_ENABLED),
- "I/O encryption is currently not supported in Mesos.")
-
- val mesosUrl = MESOS_REGEX.findFirstMatchIn(masterURL).get.group(1)
- val coarse = sc.conf.getBoolean("spark.mesos.coarse", defaultValue = true)
- if (coarse) {
- new MesosCoarseGrainedSchedulerBackend(
- scheduler.asInstanceOf[TaskSchedulerImpl],
- sc,
- mesosUrl,
- sc.env.securityManager)
- } else {
- new MesosFineGrainedSchedulerBackend(
- scheduler.asInstanceOf[TaskSchedulerImpl],
- sc,
- mesosUrl)
- }
- }
-
- override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
- scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend)
- }
-}
-
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterPersistenceEngine.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterPersistenceEngine.scala b/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterPersistenceEngine.scala
deleted file mode 100644
index 61ab3e8..0000000
--- a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterPersistenceEngine.scala
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import scala.collection.JavaConverters._
-
-import org.apache.curator.framework.CuratorFramework
-import org.apache.zookeeper.CreateMode
-import org.apache.zookeeper.KeeperException.NoNodeException
-
-import org.apache.spark.SparkConf
-import org.apache.spark.deploy.SparkCuratorUtil
-import org.apache.spark.internal.Logging
-import org.apache.spark.util.Utils
-
-/**
- * Persistence engine factory that is responsible for creating new persistence engines
- * to store Mesos cluster mode state.
- */
-private[spark] abstract class MesosClusterPersistenceEngineFactory(conf: SparkConf) {
- def createEngine(path: String): MesosClusterPersistenceEngine
-}
-
-/**
- * Mesos cluster persistence engine is responsible for persisting Mesos cluster mode
- * specific state, so that on failover all the state can be recovered and the scheduler
- * can resume managing the drivers.
- */
-private[spark] trait MesosClusterPersistenceEngine {
- def persist(name: String, obj: Object): Unit
- def expunge(name: String): Unit
- def fetch[T](name: String): Option[T]
- def fetchAll[T](): Iterable[T]
-}
-
-/**
- * Zookeeper backed persistence engine factory.
- * All Zk engines created from this factory shares the same Zookeeper client, so
- * all of them reuses the same connection pool.
- */
-private[spark] class ZookeeperMesosClusterPersistenceEngineFactory(conf: SparkConf)
- extends MesosClusterPersistenceEngineFactory(conf) with Logging {
-
- lazy val zk = SparkCuratorUtil.newClient(conf)
-
- def createEngine(path: String): MesosClusterPersistenceEngine = {
- new ZookeeperMesosClusterPersistenceEngine(path, zk, conf)
- }
-}
-
-/**
- * Black hole persistence engine factory that creates black hole
- * persistence engines, which stores nothing.
- */
-private[spark] class BlackHoleMesosClusterPersistenceEngineFactory
- extends MesosClusterPersistenceEngineFactory(null) {
- def createEngine(path: String): MesosClusterPersistenceEngine = {
- new BlackHoleMesosClusterPersistenceEngine
- }
-}
-
-/**
- * Black hole persistence engine that stores nothing.
- */
-private[spark] class BlackHoleMesosClusterPersistenceEngine extends MesosClusterPersistenceEngine {
- override def persist(name: String, obj: Object): Unit = {}
- override def fetch[T](name: String): Option[T] = None
- override def expunge(name: String): Unit = {}
- override def fetchAll[T](): Iterable[T] = Iterable.empty[T]
-}
-
-/**
- * Zookeeper based Mesos cluster persistence engine, that stores cluster mode state
- * into Zookeeper. Each engine object is operating under one folder in Zookeeper, but
- * reuses a shared Zookeeper client.
- */
-private[spark] class ZookeeperMesosClusterPersistenceEngine(
- baseDir: String,
- zk: CuratorFramework,
- conf: SparkConf)
- extends MesosClusterPersistenceEngine with Logging {
- private val WORKING_DIR =
- conf.get("spark.deploy.zookeeper.dir", "/spark_mesos_dispatcher") + "/" + baseDir
-
- SparkCuratorUtil.mkdir(zk, WORKING_DIR)
-
- def path(name: String): String = {
- WORKING_DIR + "/" + name
- }
-
- override def expunge(name: String): Unit = {
- zk.delete().forPath(path(name))
- }
-
- override def persist(name: String, obj: Object): Unit = {
- val serialized = Utils.serialize(obj)
- val zkPath = path(name)
- zk.create().withMode(CreateMode.PERSISTENT).forPath(zkPath, serialized)
- }
-
- override def fetch[T](name: String): Option[T] = {
- val zkPath = path(name)
-
- try {
- val fileData = zk.getData().forPath(zkPath)
- Some(Utils.deserialize[T](fileData))
- } catch {
- case e: NoNodeException => None
- case e: Exception =>
- logWarning("Exception while reading persisted file, deleting", e)
- zk.delete().forPath(zkPath)
- None
- }
- }
-
- override def fetchAll[T](): Iterable[T] = {
- zk.getChildren.forPath(WORKING_DIR).asScala.flatMap(fetch[T])
- }
-}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org
[06/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Posted by va...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
deleted file mode 100644
index 0378ef4..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
+++ /dev/null
@@ -1,791 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import java.io.{File, IOException}
-import java.lang.reflect.InvocationTargetException
-import java.net.{Socket, URI, URL}
-import java.util.concurrent.{TimeoutException, TimeUnit}
-
-import scala.collection.mutable.HashMap
-import scala.concurrent.Promise
-import scala.concurrent.duration.Duration
-import scala.util.control.NonFatal
-
-import org.apache.hadoop.fs.{FileSystem, Path}
-import org.apache.hadoop.yarn.api._
-import org.apache.hadoop.yarn.api.records._
-import org.apache.hadoop.yarn.conf.YarnConfiguration
-import org.apache.hadoop.yarn.util.{ConverterUtils, Records}
-
-import org.apache.spark._
-import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.deploy.history.HistoryServer
-import org.apache.spark.deploy.yarn.config._
-import org.apache.spark.deploy.yarn.security.{AMCredentialRenewer, ConfigurableCredentialManager}
-import org.apache.spark.internal.Logging
-import org.apache.spark.internal.config._
-import org.apache.spark.rpc._
-import org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend, YarnSchedulerBackend}
-import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._
-import org.apache.spark.util._
-
-/**
- * Common application master functionality for Spark on Yarn.
- */
-private[spark] class ApplicationMaster(
- args: ApplicationMasterArguments,
- client: YarnRMClient)
- extends Logging {
-
- // TODO: Currently, task to container is computed once (TaskSetManager) - which need not be
- // optimal as more containers are available. Might need to handle this better.
-
- private val sparkConf = new SparkConf()
- private val yarnConf: YarnConfiguration = SparkHadoopUtil.get.newConfiguration(sparkConf)
- .asInstanceOf[YarnConfiguration]
- private val isClusterMode = args.userClass != null
-
- // Default to twice the number of executors (twice the maximum number of executors if dynamic
- // allocation is enabled), with a minimum of 3.
-
- private val maxNumExecutorFailures = {
- val effectiveNumExecutors =
- if (Utils.isDynamicAllocationEnabled(sparkConf)) {
- sparkConf.get(DYN_ALLOCATION_MAX_EXECUTORS)
- } else {
- sparkConf.get(EXECUTOR_INSTANCES).getOrElse(0)
- }
- // By default, effectiveNumExecutors is Int.MaxValue if dynamic allocation is enabled. We need
- // avoid the integer overflow here.
- val defaultMaxNumExecutorFailures = math.max(3,
- if (effectiveNumExecutors > Int.MaxValue / 2) Int.MaxValue else (2 * effectiveNumExecutors))
-
- sparkConf.get(MAX_EXECUTOR_FAILURES).getOrElse(defaultMaxNumExecutorFailures)
- }
-
- @volatile private var exitCode = 0
- @volatile private var unregistered = false
- @volatile private var finished = false
- @volatile private var finalStatus = getDefaultFinalStatus
- @volatile private var finalMsg: String = ""
- @volatile private var userClassThread: Thread = _
-
- @volatile private var reporterThread: Thread = _
- @volatile private var allocator: YarnAllocator = _
-
- // Lock for controlling the allocator (heartbeat) thread.
- private val allocatorLock = new Object()
-
- // Steady state heartbeat interval. We want to be reasonably responsive without causing too many
- // requests to RM.
- private val heartbeatInterval = {
- // Ensure that progress is sent before YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS elapses.
- val expiryInterval = yarnConf.getInt(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, 120000)
- math.max(0, math.min(expiryInterval / 2, sparkConf.get(RM_HEARTBEAT_INTERVAL)))
- }
-
- // Initial wait interval before allocator poll, to allow for quicker ramp up when executors are
- // being requested.
- private val initialAllocationInterval = math.min(heartbeatInterval,
- sparkConf.get(INITIAL_HEARTBEAT_INTERVAL))
-
- // Next wait interval before allocator poll.
- private var nextAllocationInterval = initialAllocationInterval
-
- private var rpcEnv: RpcEnv = null
- private var amEndpoint: RpcEndpointRef = _
-
- // In cluster mode, used to tell the AM when the user's SparkContext has been initialized.
- private val sparkContextPromise = Promise[SparkContext]()
-
- private var credentialRenewer: AMCredentialRenewer = _
-
- // Load the list of localized files set by the client. This is used when launching executors,
- // and is loaded here so that these configs don't pollute the Web UI's environment page in
- // cluster mode.
- private val localResources = {
- logInfo("Preparing Local resources")
- val resources = HashMap[String, LocalResource]()
-
- def setupDistributedCache(
- file: String,
- rtype: LocalResourceType,
- timestamp: String,
- size: String,
- vis: String): Unit = {
- val uri = new URI(file)
- val amJarRsrc = Records.newRecord(classOf[LocalResource])
- amJarRsrc.setType(rtype)
- amJarRsrc.setVisibility(LocalResourceVisibility.valueOf(vis))
- amJarRsrc.setResource(ConverterUtils.getYarnUrlFromURI(uri))
- amJarRsrc.setTimestamp(timestamp.toLong)
- amJarRsrc.setSize(size.toLong)
-
- val fileName = Option(uri.getFragment()).getOrElse(new Path(uri).getName())
- resources(fileName) = amJarRsrc
- }
-
- val distFiles = sparkConf.get(CACHED_FILES)
- val fileSizes = sparkConf.get(CACHED_FILES_SIZES)
- val timeStamps = sparkConf.get(CACHED_FILES_TIMESTAMPS)
- val visibilities = sparkConf.get(CACHED_FILES_VISIBILITIES)
- val resTypes = sparkConf.get(CACHED_FILES_TYPES)
-
- for (i <- 0 to distFiles.size - 1) {
- val resType = LocalResourceType.valueOf(resTypes(i))
- setupDistributedCache(distFiles(i), resType, timeStamps(i).toString, fileSizes(i).toString,
- visibilities(i))
- }
-
- // Distribute the conf archive to executors.
- sparkConf.get(CACHED_CONF_ARCHIVE).foreach { path =>
- val uri = new URI(path)
- val fs = FileSystem.get(uri, yarnConf)
- val status = fs.getFileStatus(new Path(uri))
- // SPARK-16080: Make sure to use the correct name for the destination when distributing the
- // conf archive to executors.
- val destUri = new URI(uri.getScheme(), uri.getRawSchemeSpecificPart(),
- Client.LOCALIZED_CONF_DIR)
- setupDistributedCache(destUri.toString(), LocalResourceType.ARCHIVE,
- status.getModificationTime().toString, status.getLen.toString,
- LocalResourceVisibility.PRIVATE.name())
- }
-
- // Clean up the configuration so it doesn't show up in the Web UI (since it's really noisy).
- CACHE_CONFIGS.foreach { e =>
- sparkConf.remove(e)
- sys.props.remove(e.key)
- }
-
- resources.toMap
- }
-
- def getAttemptId(): ApplicationAttemptId = {
- client.getAttemptId()
- }
-
- final def run(): Int = {
- try {
- val appAttemptId = client.getAttemptId()
-
- var attemptID: Option[String] = None
-
- if (isClusterMode) {
- // Set the web ui port to be ephemeral for yarn so we don't conflict with
- // other spark processes running on the same box
- System.setProperty("spark.ui.port", "0")
-
- // Set the master and deploy mode property to match the requested mode.
- System.setProperty("spark.master", "yarn")
- System.setProperty("spark.submit.deployMode", "cluster")
-
- // Set this internal configuration if it is running on cluster mode, this
- // configuration will be checked in SparkContext to avoid misuse of yarn cluster mode.
- System.setProperty("spark.yarn.app.id", appAttemptId.getApplicationId().toString())
-
- attemptID = Option(appAttemptId.getAttemptId.toString)
- }
-
- new CallerContext(
- "APPMASTER", sparkConf.get(APP_CALLER_CONTEXT),
- Option(appAttemptId.getApplicationId.toString), attemptID).setCurrentContext()
-
- logInfo("ApplicationAttemptId: " + appAttemptId)
-
- val fs = FileSystem.get(yarnConf)
-
- // This shutdown hook should run *after* the SparkContext is shut down.
- val priority = ShutdownHookManager.SPARK_CONTEXT_SHUTDOWN_PRIORITY - 1
- ShutdownHookManager.addShutdownHook(priority) { () =>
- val maxAppAttempts = client.getMaxRegAttempts(sparkConf, yarnConf)
- val isLastAttempt = client.getAttemptId().getAttemptId() >= maxAppAttempts
-
- if (!finished) {
- // The default state of ApplicationMaster is failed if it is invoked by shut down hook.
- // This behavior is different compared to 1.x version.
- // If user application is exited ahead of time by calling System.exit(N), here mark
- // this application as failed with EXIT_EARLY. For a good shutdown, user shouldn't call
- // System.exit(0) to terminate the application.
- finish(finalStatus,
- ApplicationMaster.EXIT_EARLY,
- "Shutdown hook called before final status was reported.")
- }
-
- if (!unregistered) {
- // we only want to unregister if we don't want the RM to retry
- if (finalStatus == FinalApplicationStatus.SUCCEEDED || isLastAttempt) {
- unregister(finalStatus, finalMsg)
- cleanupStagingDir(fs)
- }
- }
- }
-
- // Call this to force generation of secret so it gets populated into the
- // Hadoop UGI. This has to happen before the startUserApplication which does a
- // doAs in order for the credentials to be passed on to the executor containers.
- val securityMgr = new SecurityManager(sparkConf)
-
- // If the credentials file config is present, we must periodically renew tokens. So create
- // a new AMDelegationTokenRenewer
- if (sparkConf.contains(CREDENTIALS_FILE_PATH.key)) {
- // If a principal and keytab have been set, use that to create new credentials for executors
- // periodically
- credentialRenewer =
- new ConfigurableCredentialManager(sparkConf, yarnConf).credentialRenewer()
- credentialRenewer.scheduleLoginFromKeytab()
- }
-
- if (isClusterMode) {
- runDriver(securityMgr)
- } else {
- runExecutorLauncher(securityMgr)
- }
- } catch {
- case e: Exception =>
- // catch everything else if not specifically handled
- logError("Uncaught exception: ", e)
- finish(FinalApplicationStatus.FAILED,
- ApplicationMaster.EXIT_UNCAUGHT_EXCEPTION,
- "Uncaught exception: " + e)
- }
- exitCode
- }
-
- /**
- * Set the default final application status for client mode to UNDEFINED to handle
- * if YARN HA restarts the application so that it properly retries. Set the final
- * status to SUCCEEDED in cluster mode to handle if the user calls System.exit
- * from the application code.
- */
- final def getDefaultFinalStatus(): FinalApplicationStatus = {
- if (isClusterMode) {
- FinalApplicationStatus.FAILED
- } else {
- FinalApplicationStatus.UNDEFINED
- }
- }
-
- /**
- * unregister is used to completely unregister the application from the ResourceManager.
- * This means the ResourceManager will not retry the application attempt on your behalf if
- * a failure occurred.
- */
- final def unregister(status: FinalApplicationStatus, diagnostics: String = null): Unit = {
- synchronized {
- if (!unregistered) {
- logInfo(s"Unregistering ApplicationMaster with $status" +
- Option(diagnostics).map(msg => s" (diag message: $msg)").getOrElse(""))
- unregistered = true
- client.unregister(status, Option(diagnostics).getOrElse(""))
- }
- }
- }
-
- final def finish(status: FinalApplicationStatus, code: Int, msg: String = null): Unit = {
- synchronized {
- if (!finished) {
- val inShutdown = ShutdownHookManager.inShutdown()
- logInfo(s"Final app status: $status, exitCode: $code" +
- Option(msg).map(msg => s", (reason: $msg)").getOrElse(""))
- exitCode = code
- finalStatus = status
- finalMsg = msg
- finished = true
- if (!inShutdown && Thread.currentThread() != reporterThread && reporterThread != null) {
- logDebug("shutting down reporter thread")
- reporterThread.interrupt()
- }
- if (!inShutdown && Thread.currentThread() != userClassThread && userClassThread != null) {
- logDebug("shutting down user thread")
- userClassThread.interrupt()
- }
- if (!inShutdown && credentialRenewer != null) {
- credentialRenewer.stop()
- credentialRenewer = null
- }
- }
- }
- }
-
- private def sparkContextInitialized(sc: SparkContext) = {
- sparkContextPromise.success(sc)
- }
-
- private def registerAM(
- _sparkConf: SparkConf,
- _rpcEnv: RpcEnv,
- driverRef: RpcEndpointRef,
- uiAddress: String,
- securityMgr: SecurityManager) = {
- val appId = client.getAttemptId().getApplicationId().toString()
- val attemptId = client.getAttemptId().getAttemptId().toString()
- val historyAddress =
- _sparkConf.get(HISTORY_SERVER_ADDRESS)
- .map { text => SparkHadoopUtil.get.substituteHadoopVariables(text, yarnConf) }
- .map { address => s"${address}${HistoryServer.UI_PATH_PREFIX}/${appId}/${attemptId}" }
- .getOrElse("")
-
- val driverUrl = RpcEndpointAddress(
- _sparkConf.get("spark.driver.host"),
- _sparkConf.get("spark.driver.port").toInt,
- CoarseGrainedSchedulerBackend.ENDPOINT_NAME).toString
-
- // Before we initialize the allocator, let's log the information about how executors will
- // be run up front, to avoid printing this out for every single executor being launched.
- // Use placeholders for information that changes such as executor IDs.
- logInfo {
- val executorMemory = sparkConf.get(EXECUTOR_MEMORY).toInt
- val executorCores = sparkConf.get(EXECUTOR_CORES)
- val dummyRunner = new ExecutorRunnable(None, yarnConf, sparkConf, driverUrl, "<executorId>",
- "<hostname>", executorMemory, executorCores, appId, securityMgr, localResources)
- dummyRunner.launchContextDebugInfo()
- }
-
- allocator = client.register(driverUrl,
- driverRef,
- yarnConf,
- _sparkConf,
- uiAddress,
- historyAddress,
- securityMgr,
- localResources)
-
- allocator.allocateResources()
- reporterThread = launchReporterThread()
- }
-
- /**
- * Create an [[RpcEndpoint]] that communicates with the driver.
- *
- * In cluster mode, the AM and the driver belong to same process
- * so the AMEndpoint need not monitor lifecycle of the driver.
- *
- * @return A reference to the driver's RPC endpoint.
- */
- private def runAMEndpoint(
- host: String,
- port: String,
- isClusterMode: Boolean): RpcEndpointRef = {
- val driverEndpoint = rpcEnv.setupEndpointRef(
- RpcAddress(host, port.toInt),
- YarnSchedulerBackend.ENDPOINT_NAME)
- amEndpoint =
- rpcEnv.setupEndpoint("YarnAM", new AMEndpoint(rpcEnv, driverEndpoint, isClusterMode))
- driverEndpoint
- }
-
- private def runDriver(securityMgr: SecurityManager): Unit = {
- addAmIpFilter()
- userClassThread = startUserApplication()
-
- // This a bit hacky, but we need to wait until the spark.driver.port property has
- // been set by the Thread executing the user class.
- logInfo("Waiting for spark context initialization...")
- val totalWaitTime = sparkConf.get(AM_MAX_WAIT_TIME)
- try {
- val sc = ThreadUtils.awaitResult(sparkContextPromise.future,
- Duration(totalWaitTime, TimeUnit.MILLISECONDS))
- if (sc != null) {
- rpcEnv = sc.env.rpcEnv
- val driverRef = runAMEndpoint(
- sc.getConf.get("spark.driver.host"),
- sc.getConf.get("spark.driver.port"),
- isClusterMode = true)
- registerAM(sc.getConf, rpcEnv, driverRef, sc.ui.map(_.webUrl).getOrElse(""),
- securityMgr)
- } else {
- // Sanity check; should never happen in normal operation, since sc should only be null
- // if the user app did not create a SparkContext.
- if (!finished) {
- throw new IllegalStateException("SparkContext is null but app is still running!")
- }
- }
- userClassThread.join()
- } catch {
- case e: SparkException if e.getCause().isInstanceOf[TimeoutException] =>
- logError(
- s"SparkContext did not initialize after waiting for $totalWaitTime ms. " +
- "Please check earlier log output for errors. Failing the application.")
- finish(FinalApplicationStatus.FAILED,
- ApplicationMaster.EXIT_SC_NOT_INITED,
- "Timed out waiting for SparkContext.")
- }
- }
-
- private def runExecutorLauncher(securityMgr: SecurityManager): Unit = {
- val port = sparkConf.get(AM_PORT)
- rpcEnv = RpcEnv.create("sparkYarnAM", Utils.localHostName, port, sparkConf, securityMgr,
- clientMode = true)
- val driverRef = waitForSparkDriver()
- addAmIpFilter()
- registerAM(sparkConf, rpcEnv, driverRef, sparkConf.get("spark.driver.appUIAddress", ""),
- securityMgr)
-
- // In client mode the actor will stop the reporter thread.
- reporterThread.join()
- }
-
- private def launchReporterThread(): Thread = {
- // The number of failures in a row until Reporter thread give up
- val reporterMaxFailures = sparkConf.get(MAX_REPORTER_THREAD_FAILURES)
-
- val t = new Thread {
- override def run() {
- var failureCount = 0
- while (!finished) {
- try {
- if (allocator.getNumExecutorsFailed >= maxNumExecutorFailures) {
- finish(FinalApplicationStatus.FAILED,
- ApplicationMaster.EXIT_MAX_EXECUTOR_FAILURES,
- s"Max number of executor failures ($maxNumExecutorFailures) reached")
- } else {
- logDebug("Sending progress")
- allocator.allocateResources()
- }
- failureCount = 0
- } catch {
- case i: InterruptedException =>
- case e: Throwable =>
- failureCount += 1
- // this exception was introduced in hadoop 2.4 and this code would not compile
- // with earlier versions if we refer it directly.
- if ("org.apache.hadoop.yarn.exceptions.ApplicationAttemptNotFoundException" ==
- e.getClass().getName()) {
- logError("Exception from Reporter thread.", e)
- finish(FinalApplicationStatus.FAILED, ApplicationMaster.EXIT_REPORTER_FAILURE,
- e.getMessage)
- } else if (!NonFatal(e) || failureCount >= reporterMaxFailures) {
- finish(FinalApplicationStatus.FAILED,
- ApplicationMaster.EXIT_REPORTER_FAILURE, "Exception was thrown " +
- s"$failureCount time(s) from Reporter thread.")
- } else {
- logWarning(s"Reporter thread fails $failureCount time(s) in a row.", e)
- }
- }
- try {
- val numPendingAllocate = allocator.getPendingAllocate.size
- var sleepStart = 0L
- var sleepInterval = 200L // ms
- allocatorLock.synchronized {
- sleepInterval =
- if (numPendingAllocate > 0 || allocator.getNumPendingLossReasonRequests > 0) {
- val currentAllocationInterval =
- math.min(heartbeatInterval, nextAllocationInterval)
- nextAllocationInterval = currentAllocationInterval * 2 // avoid overflow
- currentAllocationInterval
- } else {
- nextAllocationInterval = initialAllocationInterval
- heartbeatInterval
- }
- sleepStart = System.currentTimeMillis()
- allocatorLock.wait(sleepInterval)
- }
- val sleepDuration = System.currentTimeMillis() - sleepStart
- if (sleepDuration < sleepInterval) {
- // log when sleep is interrupted
- logDebug(s"Number of pending allocations is $numPendingAllocate. " +
- s"Slept for $sleepDuration/$sleepInterval ms.")
- // if sleep was less than the minimum interval, sleep for the rest of it
- val toSleep = math.max(0, initialAllocationInterval - sleepDuration)
- if (toSleep > 0) {
- logDebug(s"Going back to sleep for $toSleep ms")
- // use Thread.sleep instead of allocatorLock.wait. there is no need to be woken up
- // by the methods that signal allocatorLock because this is just finishing the min
- // sleep interval, which should happen even if this is signalled again.
- Thread.sleep(toSleep)
- }
- } else {
- logDebug(s"Number of pending allocations is $numPendingAllocate. " +
- s"Slept for $sleepDuration/$sleepInterval.")
- }
- } catch {
- case e: InterruptedException =>
- }
- }
- }
- }
- // setting to daemon status, though this is usually not a good idea.
- t.setDaemon(true)
- t.setName("Reporter")
- t.start()
- logInfo(s"Started progress reporter thread with (heartbeat : $heartbeatInterval, " +
- s"initial allocation : $initialAllocationInterval) intervals")
- t
- }
-
- /**
- * Clean up the staging directory.
- */
- private def cleanupStagingDir(fs: FileSystem) {
- var stagingDirPath: Path = null
- try {
- val preserveFiles = sparkConf.get(PRESERVE_STAGING_FILES)
- if (!preserveFiles) {
- stagingDirPath = new Path(System.getenv("SPARK_YARN_STAGING_DIR"))
- if (stagingDirPath == null) {
- logError("Staging directory is null")
- return
- }
- logInfo("Deleting staging directory " + stagingDirPath)
- fs.delete(stagingDirPath, true)
- }
- } catch {
- case ioe: IOException =>
- logError("Failed to cleanup staging dir " + stagingDirPath, ioe)
- }
- }
-
- private def waitForSparkDriver(): RpcEndpointRef = {
- logInfo("Waiting for Spark driver to be reachable.")
- var driverUp = false
- val hostport = args.userArgs(0)
- val (driverHost, driverPort) = Utils.parseHostPort(hostport)
-
- // Spark driver should already be up since it launched us, but we don't want to
- // wait forever, so wait 100 seconds max to match the cluster mode setting.
- val totalWaitTimeMs = sparkConf.get(AM_MAX_WAIT_TIME)
- val deadline = System.currentTimeMillis + totalWaitTimeMs
-
- while (!driverUp && !finished && System.currentTimeMillis < deadline) {
- try {
- val socket = new Socket(driverHost, driverPort)
- socket.close()
- logInfo("Driver now available: %s:%s".format(driverHost, driverPort))
- driverUp = true
- } catch {
- case e: Exception =>
- logError("Failed to connect to driver at %s:%s, retrying ...".
- format(driverHost, driverPort))
- Thread.sleep(100L)
- }
- }
-
- if (!driverUp) {
- throw new SparkException("Failed to connect to driver!")
- }
-
- sparkConf.set("spark.driver.host", driverHost)
- sparkConf.set("spark.driver.port", driverPort.toString)
-
- runAMEndpoint(driverHost, driverPort.toString, isClusterMode = false)
- }
-
- /** Add the Yarn IP filter that is required for properly securing the UI. */
- private def addAmIpFilter() = {
- val proxyBase = System.getenv(ApplicationConstants.APPLICATION_WEB_PROXY_BASE_ENV)
- val amFilter = "org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter"
- val params = client.getAmIpFilterParams(yarnConf, proxyBase)
- if (isClusterMode) {
- System.setProperty("spark.ui.filters", amFilter)
- params.foreach { case (k, v) => System.setProperty(s"spark.$amFilter.param.$k", v) }
- } else {
- amEndpoint.send(AddWebUIFilter(amFilter, params.toMap, proxyBase))
- }
- }
-
- /**
- * Start the user class, which contains the spark driver, in a separate Thread.
- * If the main routine exits cleanly or exits with System.exit(N) for any N
- * we assume it was successful, for all other cases we assume failure.
- *
- * Returns the user thread that was started.
- */
- private def startUserApplication(): Thread = {
- logInfo("Starting the user application in a separate Thread")
-
- val classpath = Client.getUserClasspath(sparkConf)
- val urls = classpath.map { entry =>
- new URL("file:" + new File(entry.getPath()).getAbsolutePath())
- }
- val userClassLoader =
- if (Client.isUserClassPathFirst(sparkConf, isDriver = true)) {
- new ChildFirstURLClassLoader(urls, Utils.getContextOrSparkClassLoader)
- } else {
- new MutableURLClassLoader(urls, Utils.getContextOrSparkClassLoader)
- }
-
- var userArgs = args.userArgs
- if (args.primaryPyFile != null && args.primaryPyFile.endsWith(".py")) {
- // When running pyspark, the app is run using PythonRunner. The second argument is the list
- // of files to add to PYTHONPATH, which Client.scala already handles, so it's empty.
- userArgs = Seq(args.primaryPyFile, "") ++ userArgs
- }
- if (args.primaryRFile != null && args.primaryRFile.endsWith(".R")) {
- // TODO(davies): add R dependencies here
- }
- val mainMethod = userClassLoader.loadClass(args.userClass)
- .getMethod("main", classOf[Array[String]])
-
- val userThread = new Thread {
- override def run() {
- try {
- mainMethod.invoke(null, userArgs.toArray)
- finish(FinalApplicationStatus.SUCCEEDED, ApplicationMaster.EXIT_SUCCESS)
- logDebug("Done running users class")
- } catch {
- case e: InvocationTargetException =>
- e.getCause match {
- case _: InterruptedException =>
- // Reporter thread can interrupt to stop user class
- case SparkUserAppException(exitCode) =>
- val msg = s"User application exited with status $exitCode"
- logError(msg)
- finish(FinalApplicationStatus.FAILED, exitCode, msg)
- case cause: Throwable =>
- logError("User class threw exception: " + cause, cause)
- finish(FinalApplicationStatus.FAILED,
- ApplicationMaster.EXIT_EXCEPTION_USER_CLASS,
- "User class threw exception: " + cause)
- }
- sparkContextPromise.tryFailure(e.getCause())
- } finally {
- // Notify the thread waiting for the SparkContext, in case the application did not
- // instantiate one. This will do nothing when the user code instantiates a SparkContext
- // (with the correct master), or when the user code throws an exception (due to the
- // tryFailure above).
- sparkContextPromise.trySuccess(null)
- }
- }
- }
- userThread.setContextClassLoader(userClassLoader)
- userThread.setName("Driver")
- userThread.start()
- userThread
- }
-
- private def resetAllocatorInterval(): Unit = allocatorLock.synchronized {
- nextAllocationInterval = initialAllocationInterval
- allocatorLock.notifyAll()
- }
-
- /**
- * An [[RpcEndpoint]] that communicates with the driver's scheduler backend.
- */
- private class AMEndpoint(
- override val rpcEnv: RpcEnv, driver: RpcEndpointRef, isClusterMode: Boolean)
- extends RpcEndpoint with Logging {
-
- override def onStart(): Unit = {
- driver.send(RegisterClusterManager(self))
- }
-
- override def receive: PartialFunction[Any, Unit] = {
- case x: AddWebUIFilter =>
- logInfo(s"Add WebUI Filter. $x")
- driver.send(x)
- }
-
- override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
- case RequestExecutors(requestedTotal, localityAwareTasks, hostToLocalTaskCount) =>
- Option(allocator) match {
- case Some(a) =>
- if (a.requestTotalExecutorsWithPreferredLocalities(requestedTotal,
- localityAwareTasks, hostToLocalTaskCount)) {
- resetAllocatorInterval()
- }
- context.reply(true)
-
- case None =>
- logWarning("Container allocator is not ready to request executors yet.")
- context.reply(false)
- }
-
- case KillExecutors(executorIds) =>
- logInfo(s"Driver requested to kill executor(s) ${executorIds.mkString(", ")}.")
- Option(allocator) match {
- case Some(a) => executorIds.foreach(a.killExecutor)
- case None => logWarning("Container allocator is not ready to kill executors yet.")
- }
- context.reply(true)
-
- case GetExecutorLossReason(eid) =>
- Option(allocator) match {
- case Some(a) =>
- a.enqueueGetLossReasonRequest(eid, context)
- resetAllocatorInterval()
- case None =>
- logWarning("Container allocator is not ready to find executor loss reasons yet.")
- }
- }
-
- override def onDisconnected(remoteAddress: RpcAddress): Unit = {
- // In cluster mode, do not rely on the disassociated event to exit
- // This avoids potentially reporting incorrect exit codes if the driver fails
- if (!isClusterMode) {
- logInfo(s"Driver terminated or disconnected! Shutting down. $remoteAddress")
- finish(FinalApplicationStatus.SUCCEEDED, ApplicationMaster.EXIT_SUCCESS)
- }
- }
- }
-
-}
-
-object ApplicationMaster extends Logging {
-
- // exit codes for different causes, no reason behind the values
- private val EXIT_SUCCESS = 0
- private val EXIT_UNCAUGHT_EXCEPTION = 10
- private val EXIT_MAX_EXECUTOR_FAILURES = 11
- private val EXIT_REPORTER_FAILURE = 12
- private val EXIT_SC_NOT_INITED = 13
- private val EXIT_SECURITY = 14
- private val EXIT_EXCEPTION_USER_CLASS = 15
- private val EXIT_EARLY = 16
-
- private var master: ApplicationMaster = _
-
- def main(args: Array[String]): Unit = {
- SignalUtils.registerLogger(log)
- val amArgs = new ApplicationMasterArguments(args)
-
- // Load the properties file with the Spark configuration and set entries as system properties,
- // so that user code run inside the AM also has access to them.
- // Note: we must do this before SparkHadoopUtil instantiated
- if (amArgs.propertiesFile != null) {
- Utils.getPropertiesFromFile(amArgs.propertiesFile).foreach { case (k, v) =>
- sys.props(k) = v
- }
- }
- SparkHadoopUtil.get.runAsSparkUser { () =>
- master = new ApplicationMaster(amArgs, new YarnRMClient)
- System.exit(master.run())
- }
- }
-
- private[spark] def sparkContextInitialized(sc: SparkContext): Unit = {
- master.sparkContextInitialized(sc)
- }
-
- private[spark] def getAttemptId(): ApplicationAttemptId = {
- master.getAttemptId
- }
-
-}
-
-/**
- * This object does not provide any special functionality. It exists so that it's easy to tell
- * apart the client-mode AM from the cluster-mode AM when using tools such as ps or jps.
- */
-object ExecutorLauncher {
-
- def main(args: Array[String]): Unit = {
- ApplicationMaster.main(args)
- }
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala
deleted file mode 100644
index 5cdec87..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import scala.collection.mutable.ArrayBuffer
-
-import org.apache.spark.util.{IntParam, MemoryParam}
-
-class ApplicationMasterArguments(val args: Array[String]) {
- var userJar: String = null
- var userClass: String = null
- var primaryPyFile: String = null
- var primaryRFile: String = null
- var userArgs: Seq[String] = Nil
- var propertiesFile: String = null
-
- parseArgs(args.toList)
-
- private def parseArgs(inputArgs: List[String]): Unit = {
- val userArgsBuffer = new ArrayBuffer[String]()
-
- var args = inputArgs
-
- while (!args.isEmpty) {
- // --num-workers, --worker-memory, and --worker-cores are deprecated since 1.0,
- // the properties with executor in their names are preferred.
- args match {
- case ("--jar") :: value :: tail =>
- userJar = value
- args = tail
-
- case ("--class") :: value :: tail =>
- userClass = value
- args = tail
-
- case ("--primary-py-file") :: value :: tail =>
- primaryPyFile = value
- args = tail
-
- case ("--primary-r-file") :: value :: tail =>
- primaryRFile = value
- args = tail
-
- case ("--arg") :: value :: tail =>
- userArgsBuffer += value
- args = tail
-
- case ("--properties-file") :: value :: tail =>
- propertiesFile = value
- args = tail
-
- case _ =>
- printUsageAndExit(1, args)
- }
- }
-
- if (primaryPyFile != null && primaryRFile != null) {
- // scalastyle:off println
- System.err.println("Cannot have primary-py-file and primary-r-file at the same time")
- // scalastyle:on println
- System.exit(-1)
- }
-
- userArgs = userArgsBuffer.toList
- }
-
- def printUsageAndExit(exitCode: Int, unknownParam: Any = null) {
- // scalastyle:off println
- if (unknownParam != null) {
- System.err.println("Unknown/unsupported param " + unknownParam)
- }
- System.err.println("""
- |Usage: org.apache.spark.deploy.yarn.ApplicationMaster [options]
- |Options:
- | --jar JAR_PATH Path to your application's JAR file
- | --class CLASS_NAME Name of your application's main class
- | --primary-py-file A main Python file
- | --primary-r-file A main R file
- | --arg ARG Argument to be passed to your application's main class.
- | Multiple invocations are possible, each will be passed in order.
- | --properties-file FILE Path to a custom Spark properties file.
- """.stripMargin)
- // scalastyle:on println
- System.exit(exitCode)
- }
-}
-
-object ApplicationMasterArguments {
- val DEFAULT_NUMBER_EXECUTORS = 2
-}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org
[11/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Posted by va...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
new file mode 100644
index 0000000..be419ce
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -0,0 +1,1541 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import java.io.{File, FileOutputStream, IOException, OutputStreamWriter}
+import java.net.{InetAddress, UnknownHostException, URI}
+import java.nio.ByteBuffer
+import java.nio.charset.StandardCharsets
+import java.util.{Properties, UUID}
+import java.util.zip.{ZipEntry, ZipOutputStream}
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, ListBuffer, Map}
+import scala.util.{Failure, Success, Try}
+import scala.util.control.NonFatal
+
+import com.google.common.base.Objects
+import com.google.common.io.Files
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs._
+import org.apache.hadoop.fs.permission.FsPermission
+import org.apache.hadoop.io.DataOutputBuffer
+import org.apache.hadoop.mapreduce.MRJobConfig
+import org.apache.hadoop.security.{Credentials, UserGroupInformation}
+import org.apache.hadoop.util.StringUtils
+import org.apache.hadoop.yarn.api._
+import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
+import org.apache.hadoop.yarn.api.protocolrecords._
+import org.apache.hadoop.yarn.api.records._
+import org.apache.hadoop.yarn.client.api.{YarnClient, YarnClientApplication}
+import org.apache.hadoop.yarn.conf.YarnConfiguration
+import org.apache.hadoop.yarn.exceptions.ApplicationNotFoundException
+import org.apache.hadoop.yarn.util.Records
+
+import org.apache.spark.{SecurityManager, SparkConf, SparkContext, SparkException}
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.deploy.yarn.config._
+import org.apache.spark.deploy.yarn.security.ConfigurableCredentialManager
+import org.apache.spark.internal.Logging
+import org.apache.spark.internal.config._
+import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle, YarnCommandBuilderUtils}
+import org.apache.spark.util.{CallerContext, Utils}
+
+private[spark] class Client(
+ val args: ClientArguments,
+ val hadoopConf: Configuration,
+ val sparkConf: SparkConf)
+ extends Logging {
+
+ import Client._
+ import YarnSparkHadoopUtil._
+
+ def this(clientArgs: ClientArguments, spConf: SparkConf) =
+ this(clientArgs, SparkHadoopUtil.get.newConfiguration(spConf), spConf)
+
+ private val yarnClient = YarnClient.createYarnClient
+ private val yarnConf = new YarnConfiguration(hadoopConf)
+
+ private val isClusterMode = sparkConf.get("spark.submit.deployMode", "client") == "cluster"
+
+ // AM related configurations
+ private val amMemory = if (isClusterMode) {
+ sparkConf.get(DRIVER_MEMORY).toInt
+ } else {
+ sparkConf.get(AM_MEMORY).toInt
+ }
+ private val amMemoryOverhead = {
+ val amMemoryOverheadEntry = if (isClusterMode) DRIVER_MEMORY_OVERHEAD else AM_MEMORY_OVERHEAD
+ sparkConf.get(amMemoryOverheadEntry).getOrElse(
+ math.max((MEMORY_OVERHEAD_FACTOR * amMemory).toLong, MEMORY_OVERHEAD_MIN)).toInt
+ }
+ private val amCores = if (isClusterMode) {
+ sparkConf.get(DRIVER_CORES)
+ } else {
+ sparkConf.get(AM_CORES)
+ }
+
+ // Executor related configurations
+ private val executorMemory = sparkConf.get(EXECUTOR_MEMORY)
+ private val executorMemoryOverhead = sparkConf.get(EXECUTOR_MEMORY_OVERHEAD).getOrElse(
+ math.max((MEMORY_OVERHEAD_FACTOR * executorMemory).toLong, MEMORY_OVERHEAD_MIN)).toInt
+
+ private val distCacheMgr = new ClientDistributedCacheManager()
+
+ private var loginFromKeytab = false
+ private var principal: String = null
+ private var keytab: String = null
+ private var credentials: Credentials = null
+
+ private val launcherBackend = new LauncherBackend() {
+ override def onStopRequest(): Unit = {
+ if (isClusterMode && appId != null) {
+ yarnClient.killApplication(appId)
+ } else {
+ setState(SparkAppHandle.State.KILLED)
+ stop()
+ }
+ }
+ }
+ private val fireAndForget = isClusterMode && !sparkConf.get(WAIT_FOR_APP_COMPLETION)
+
+ private var appId: ApplicationId = null
+
+ // The app staging dir based on the STAGING_DIR configuration if configured
+ // otherwise based on the users home directory.
+ private val appStagingBaseDir = sparkConf.get(STAGING_DIR).map { new Path(_) }
+ .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory())
+
+ private val credentialManager = new ConfigurableCredentialManager(sparkConf, hadoopConf)
+
+ def reportLauncherState(state: SparkAppHandle.State): Unit = {
+ launcherBackend.setState(state)
+ }
+
+ def stop(): Unit = {
+ launcherBackend.close()
+ yarnClient.stop()
+ // Unset YARN mode system env variable, to allow switching between cluster types.
+ System.clearProperty("SPARK_YARN_MODE")
+ }
+
+ /**
+ * Submit an application running our ApplicationMaster to the ResourceManager.
+ *
+ * The stable Yarn API provides a convenience method (YarnClient#createApplication) for
+ * creating applications and setting up the application submission context. This was not
+ * available in the alpha API.
+ */
+ def submitApplication(): ApplicationId = {
+ var appId: ApplicationId = null
+ try {
+ launcherBackend.connect()
+ // Setup the credentials before doing anything else,
+ // so we have don't have issues at any point.
+ setupCredentials()
+ yarnClient.init(yarnConf)
+ yarnClient.start()
+
+ logInfo("Requesting a new application from cluster with %d NodeManagers"
+ .format(yarnClient.getYarnClusterMetrics.getNumNodeManagers))
+
+ // Get a new application from our RM
+ val newApp = yarnClient.createApplication()
+ val newAppResponse = newApp.getNewApplicationResponse()
+ appId = newAppResponse.getApplicationId()
+ reportLauncherState(SparkAppHandle.State.SUBMITTED)
+ launcherBackend.setAppId(appId.toString)
+
+ new CallerContext("CLIENT", sparkConf.get(APP_CALLER_CONTEXT),
+ Option(appId.toString)).setCurrentContext()
+
+ // Verify whether the cluster has enough resources for our AM
+ verifyClusterResources(newAppResponse)
+
+ // Set up the appropriate contexts to launch our AM
+ val containerContext = createContainerLaunchContext(newAppResponse)
+ val appContext = createApplicationSubmissionContext(newApp, containerContext)
+
+ // Finally, submit and monitor the application
+ logInfo(s"Submitting application $appId to ResourceManager")
+ yarnClient.submitApplication(appContext)
+ appId
+ } catch {
+ case e: Throwable =>
+ if (appId != null) {
+ cleanupStagingDir(appId)
+ }
+ throw e
+ }
+ }
+
+ /**
+ * Cleanup application staging directory.
+ */
+ private def cleanupStagingDir(appId: ApplicationId): Unit = {
+ val stagingDirPath = new Path(appStagingBaseDir, getAppStagingDir(appId))
+ try {
+ val preserveFiles = sparkConf.get(PRESERVE_STAGING_FILES)
+ val fs = stagingDirPath.getFileSystem(hadoopConf)
+ if (!preserveFiles && fs.delete(stagingDirPath, true)) {
+ logInfo(s"Deleted staging directory $stagingDirPath")
+ }
+ } catch {
+ case ioe: IOException =>
+ logWarning("Failed to cleanup staging dir " + stagingDirPath, ioe)
+ }
+ }
+
+ /**
+ * Set up the context for submitting our ApplicationMaster.
+ * This uses the YarnClientApplication not available in the Yarn alpha API.
+ */
+ def createApplicationSubmissionContext(
+ newApp: YarnClientApplication,
+ containerContext: ContainerLaunchContext): ApplicationSubmissionContext = {
+ val appContext = newApp.getApplicationSubmissionContext
+ appContext.setApplicationName(sparkConf.get("spark.app.name", "Spark"))
+ appContext.setQueue(sparkConf.get(QUEUE_NAME))
+ appContext.setAMContainerSpec(containerContext)
+ appContext.setApplicationType("SPARK")
+
+ sparkConf.get(APPLICATION_TAGS).foreach { tags =>
+ try {
+ // The setApplicationTags method was only introduced in Hadoop 2.4+, so we need to use
+ // reflection to set it, printing a warning if a tag was specified but the YARN version
+ // doesn't support it.
+ val method = appContext.getClass().getMethod(
+ "setApplicationTags", classOf[java.util.Set[String]])
+ method.invoke(appContext, new java.util.HashSet[String](tags.asJava))
+ } catch {
+ case e: NoSuchMethodException =>
+ logWarning(s"Ignoring ${APPLICATION_TAGS.key} because this version of " +
+ "YARN does not support it")
+ }
+ }
+ sparkConf.get(MAX_APP_ATTEMPTS) match {
+ case Some(v) => appContext.setMaxAppAttempts(v)
+ case None => logDebug(s"${MAX_APP_ATTEMPTS.key} is not set. " +
+ "Cluster's default value will be used.")
+ }
+
+ sparkConf.get(AM_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS).foreach { interval =>
+ try {
+ val method = appContext.getClass().getMethod(
+ "setAttemptFailuresValidityInterval", classOf[Long])
+ method.invoke(appContext, interval: java.lang.Long)
+ } catch {
+ case e: NoSuchMethodException =>
+ logWarning(s"Ignoring ${AM_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS.key} because " +
+ "the version of YARN does not support it")
+ }
+ }
+
+ val capability = Records.newRecord(classOf[Resource])
+ capability.setMemory(amMemory + amMemoryOverhead)
+ capability.setVirtualCores(amCores)
+
+ sparkConf.get(AM_NODE_LABEL_EXPRESSION) match {
+ case Some(expr) =>
+ try {
+ val amRequest = Records.newRecord(classOf[ResourceRequest])
+ amRequest.setResourceName(ResourceRequest.ANY)
+ amRequest.setPriority(Priority.newInstance(0))
+ amRequest.setCapability(capability)
+ amRequest.setNumContainers(1)
+ val method = amRequest.getClass.getMethod("setNodeLabelExpression", classOf[String])
+ method.invoke(amRequest, expr)
+
+ val setResourceRequestMethod =
+ appContext.getClass.getMethod("setAMContainerResourceRequest", classOf[ResourceRequest])
+ setResourceRequestMethod.invoke(appContext, amRequest)
+ } catch {
+ case e: NoSuchMethodException =>
+ logWarning(s"Ignoring ${AM_NODE_LABEL_EXPRESSION.key} because the version " +
+ "of YARN does not support it")
+ appContext.setResource(capability)
+ }
+ case None =>
+ appContext.setResource(capability)
+ }
+
+ sparkConf.get(ROLLED_LOG_INCLUDE_PATTERN).foreach { includePattern =>
+ try {
+ val logAggregationContext = Records.newRecord(
+ Utils.classForName("org.apache.hadoop.yarn.api.records.LogAggregationContext"))
+ .asInstanceOf[Object]
+
+ val setRolledLogsIncludePatternMethod =
+ logAggregationContext.getClass.getMethod("setRolledLogsIncludePattern", classOf[String])
+ setRolledLogsIncludePatternMethod.invoke(logAggregationContext, includePattern)
+
+ sparkConf.get(ROLLED_LOG_EXCLUDE_PATTERN).foreach { excludePattern =>
+ val setRolledLogsExcludePatternMethod =
+ logAggregationContext.getClass.getMethod("setRolledLogsExcludePattern", classOf[String])
+ setRolledLogsExcludePatternMethod.invoke(logAggregationContext, excludePattern)
+ }
+
+ val setLogAggregationContextMethod =
+ appContext.getClass.getMethod("setLogAggregationContext",
+ Utils.classForName("org.apache.hadoop.yarn.api.records.LogAggregationContext"))
+ setLogAggregationContextMethod.invoke(appContext, logAggregationContext)
+ } catch {
+ case NonFatal(e) =>
+ logWarning(s"Ignoring ${ROLLED_LOG_INCLUDE_PATTERN.key} because the version of YARN " +
+ s"does not support it", e)
+ }
+ }
+
+ appContext
+ }
+
+ /** Set up security tokens for launching our ApplicationMaster container. */
+ private def setupSecurityToken(amContainer: ContainerLaunchContext): Unit = {
+ val dob = new DataOutputBuffer
+ credentials.writeTokenStorageToStream(dob)
+ amContainer.setTokens(ByteBuffer.wrap(dob.getData))
+ }
+
+ /** Get the application report from the ResourceManager for an application we have submitted. */
+ def getApplicationReport(appId: ApplicationId): ApplicationReport =
+ yarnClient.getApplicationReport(appId)
+
+ /**
+ * Return the security token used by this client to communicate with the ApplicationMaster.
+ * If no security is enabled, the token returned by the report is null.
+ */
+ private def getClientToken(report: ApplicationReport): String =
+ Option(report.getClientToAMToken).map(_.toString).getOrElse("")
+
+ /**
+ * Fail fast if we have requested more resources per container than is available in the cluster.
+ */
+ private def verifyClusterResources(newAppResponse: GetNewApplicationResponse): Unit = {
+ val maxMem = newAppResponse.getMaximumResourceCapability().getMemory()
+ logInfo("Verifying our application has not requested more than the maximum " +
+ s"memory capability of the cluster ($maxMem MB per container)")
+ val executorMem = executorMemory + executorMemoryOverhead
+ if (executorMem > maxMem) {
+ throw new IllegalArgumentException(s"Required executor memory ($executorMemory" +
+ s"+$executorMemoryOverhead MB) is above the max threshold ($maxMem MB) of this cluster! " +
+ "Please check the values of 'yarn.scheduler.maximum-allocation-mb' and/or " +
+ "'yarn.nodemanager.resource.memory-mb'.")
+ }
+ val amMem = amMemory + amMemoryOverhead
+ if (amMem > maxMem) {
+ throw new IllegalArgumentException(s"Required AM memory ($amMemory" +
+ s"+$amMemoryOverhead MB) is above the max threshold ($maxMem MB) of this cluster! " +
+ "Please increase the value of 'yarn.scheduler.maximum-allocation-mb'.")
+ }
+ logInfo("Will allocate AM container, with %d MB memory including %d MB overhead".format(
+ amMem,
+ amMemoryOverhead))
+
+ // We could add checks to make sure the entire cluster has enough resources but that involves
+ // getting all the node reports and computing ourselves.
+ }
+
+ /**
+ * Copy the given file to a remote file system (e.g. HDFS) if needed.
+ * The file is only copied if the source and destination file systems are different. This is used
+ * for preparing resources for launching the ApplicationMaster container. Exposed for testing.
+ */
+ private[yarn] def copyFileToRemote(
+ destDir: Path,
+ srcPath: Path,
+ replication: Short,
+ force: Boolean = false,
+ destName: Option[String] = None): Path = {
+ val destFs = destDir.getFileSystem(hadoopConf)
+ val srcFs = srcPath.getFileSystem(hadoopConf)
+ var destPath = srcPath
+ if (force || !compareFs(srcFs, destFs)) {
+ destPath = new Path(destDir, destName.getOrElse(srcPath.getName()))
+ logInfo(s"Uploading resource $srcPath -> $destPath")
+ FileUtil.copy(srcFs, srcPath, destFs, destPath, false, hadoopConf)
+ destFs.setReplication(destPath, replication)
+ destFs.setPermission(destPath, new FsPermission(APP_FILE_PERMISSION))
+ } else {
+ logInfo(s"Source and destination file systems are the same. Not copying $srcPath")
+ }
+ // Resolve any symlinks in the URI path so using a "current" symlink to point to a specific
+ // version shows the specific version in the distributed cache configuration
+ val qualifiedDestPath = destFs.makeQualified(destPath)
+ val fc = FileContext.getFileContext(qualifiedDestPath.toUri(), hadoopConf)
+ fc.resolvePath(qualifiedDestPath)
+ }
+
+ /**
+ * Upload any resources to the distributed cache if needed. If a resource is intended to be
+ * consumed locally, set up the appropriate config for downstream code to handle it properly.
+ * This is used for setting up a container launch context for our ApplicationMaster.
+ * Exposed for testing.
+ */
+ def prepareLocalResources(
+ destDir: Path,
+ pySparkArchives: Seq[String]): HashMap[String, LocalResource] = {
+ logInfo("Preparing resources for our AM container")
+ // Upload Spark and the application JAR to the remote file system if necessary,
+ // and add them as local resources to the application master.
+ val fs = destDir.getFileSystem(hadoopConf)
+
+ // Merge credentials obtained from registered providers
+ val nearestTimeOfNextRenewal = credentialManager.obtainCredentials(hadoopConf, credentials)
+
+ if (credentials != null) {
+ logDebug(YarnSparkHadoopUtil.get.dumpTokens(credentials).mkString("\n"))
+ }
+
+ // If we use principal and keytab to login, also credentials can be renewed some time
+ // after current time, we should pass the next renewal and updating time to credential
+ // renewer and updater.
+ if (loginFromKeytab && nearestTimeOfNextRenewal > System.currentTimeMillis() &&
+ nearestTimeOfNextRenewal != Long.MaxValue) {
+
+ // Valid renewal time is 75% of next renewal time, and the valid update time will be
+ // slightly later then renewal time (80% of next renewal time). This is to make sure
+ // credentials are renewed and updated before expired.
+ val currTime = System.currentTimeMillis()
+ val renewalTime = (nearestTimeOfNextRenewal - currTime) * 0.75 + currTime
+ val updateTime = (nearestTimeOfNextRenewal - currTime) * 0.8 + currTime
+
+ sparkConf.set(CREDENTIALS_RENEWAL_TIME, renewalTime.toLong)
+ sparkConf.set(CREDENTIALS_UPDATE_TIME, updateTime.toLong)
+ }
+
+ // Used to keep track of URIs added to the distributed cache. If the same URI is added
+ // multiple times, YARN will fail to launch containers for the app with an internal
+ // error.
+ val distributedUris = new HashSet[String]
+ // Used to keep track of URIs(files) added to the distribute cache have the same name. If
+ // same name but different path files are added multiple time, YARN will fail to launch
+ // containers for the app with an internal error.
+ val distributedNames = new HashSet[String]
+
+ val replication = sparkConf.get(STAGING_FILE_REPLICATION).map(_.toShort)
+ .getOrElse(fs.getDefaultReplication(destDir))
+ val localResources = HashMap[String, LocalResource]()
+ FileSystem.mkdirs(fs, destDir, new FsPermission(STAGING_DIR_PERMISSION))
+
+ val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus]()
+
+ def addDistributedUri(uri: URI): Boolean = {
+ val uriStr = uri.toString()
+ val fileName = new File(uri.getPath).getName
+ if (distributedUris.contains(uriStr)) {
+ logWarning(s"Same path resource $uri added multiple times to distributed cache.")
+ false
+ } else if (distributedNames.contains(fileName)) {
+ logWarning(s"Same name resource $uri added multiple times to distributed cache")
+ false
+ } else {
+ distributedUris += uriStr
+ distributedNames += fileName
+ true
+ }
+ }
+
+ /**
+ * Distribute a file to the cluster.
+ *
+ * If the file's path is a "local:" URI, it's actually not distributed. Other files are copied
+ * to HDFS (if not already there) and added to the application's distributed cache.
+ *
+ * @param path URI of the file to distribute.
+ * @param resType Type of resource being distributed.
+ * @param destName Name of the file in the distributed cache.
+ * @param targetDir Subdirectory where to place the file.
+ * @param appMasterOnly Whether to distribute only to the AM.
+ * @return A 2-tuple. First item is whether the file is a "local:" URI. Second item is the
+ * localized path for non-local paths, or the input `path` for local paths.
+ * The localized path will be null if the URI has already been added to the cache.
+ */
+ def distribute(
+ path: String,
+ resType: LocalResourceType = LocalResourceType.FILE,
+ destName: Option[String] = None,
+ targetDir: Option[String] = None,
+ appMasterOnly: Boolean = false): (Boolean, String) = {
+ val trimmedPath = path.trim()
+ val localURI = Utils.resolveURI(trimmedPath)
+ if (localURI.getScheme != LOCAL_SCHEME) {
+ if (addDistributedUri(localURI)) {
+ val localPath = getQualifiedLocalPath(localURI, hadoopConf)
+ val linkname = targetDir.map(_ + "/").getOrElse("") +
+ destName.orElse(Option(localURI.getFragment())).getOrElse(localPath.getName())
+ val destPath = copyFileToRemote(destDir, localPath, replication)
+ val destFs = FileSystem.get(destPath.toUri(), hadoopConf)
+ distCacheMgr.addResource(
+ destFs, hadoopConf, destPath, localResources, resType, linkname, statCache,
+ appMasterOnly = appMasterOnly)
+ (false, linkname)
+ } else {
+ (false, null)
+ }
+ } else {
+ (true, trimmedPath)
+ }
+ }
+
+ // If we passed in a keytab, make sure we copy the keytab to the staging directory on
+ // HDFS, and setup the relevant environment vars, so the AM can login again.
+ if (loginFromKeytab) {
+ logInfo("To enable the AM to login from keytab, credentials are being copied over to the AM" +
+ " via the YARN Secure Distributed Cache.")
+ val (_, localizedPath) = distribute(keytab,
+ destName = sparkConf.get(KEYTAB),
+ appMasterOnly = true)
+ require(localizedPath != null, "Keytab file already distributed.")
+ }
+
+ /**
+ * Add Spark to the cache. There are two settings that control what files to add to the cache:
+ * - if a Spark archive is defined, use the archive. The archive is expected to contain
+ * jar files at its root directory.
+ * - if a list of jars is provided, filter the non-local ones, resolve globs, and
+ * add the found files to the cache.
+ *
+ * Note that the archive cannot be a "local" URI. If none of the above settings are found,
+ * then upload all files found in $SPARK_HOME/jars.
+ */
+ val sparkArchive = sparkConf.get(SPARK_ARCHIVE)
+ if (sparkArchive.isDefined) {
+ val archive = sparkArchive.get
+ require(!isLocalUri(archive), s"${SPARK_ARCHIVE.key} cannot be a local URI.")
+ distribute(Utils.resolveURI(archive).toString,
+ resType = LocalResourceType.ARCHIVE,
+ destName = Some(LOCALIZED_LIB_DIR))
+ } else {
+ sparkConf.get(SPARK_JARS) match {
+ case Some(jars) =>
+ // Break the list of jars to upload, and resolve globs.
+ val localJars = new ArrayBuffer[String]()
+ jars.foreach { jar =>
+ if (!isLocalUri(jar)) {
+ val path = getQualifiedLocalPath(Utils.resolveURI(jar), hadoopConf)
+ val pathFs = FileSystem.get(path.toUri(), hadoopConf)
+ pathFs.globStatus(path).filter(_.isFile()).foreach { entry =>
+ distribute(entry.getPath().toUri().toString(),
+ targetDir = Some(LOCALIZED_LIB_DIR))
+ }
+ } else {
+ localJars += jar
+ }
+ }
+
+ // Propagate the local URIs to the containers using the configuration.
+ sparkConf.set(SPARK_JARS, localJars)
+
+ case None =>
+ // No configuration, so fall back to uploading local jar files.
+ logWarning(s"Neither ${SPARK_JARS.key} nor ${SPARK_ARCHIVE.key} is set, falling back " +
+ "to uploading libraries under SPARK_HOME.")
+ val jarsDir = new File(YarnCommandBuilderUtils.findJarsDir(
+ sparkConf.getenv("SPARK_HOME")))
+ val jarsArchive = File.createTempFile(LOCALIZED_LIB_DIR, ".zip",
+ new File(Utils.getLocalDir(sparkConf)))
+ val jarsStream = new ZipOutputStream(new FileOutputStream(jarsArchive))
+
+ try {
+ jarsStream.setLevel(0)
+ jarsDir.listFiles().foreach { f =>
+ if (f.isFile && f.getName.toLowerCase().endsWith(".jar") && f.canRead) {
+ jarsStream.putNextEntry(new ZipEntry(f.getName))
+ Files.copy(f, jarsStream)
+ jarsStream.closeEntry()
+ }
+ }
+ } finally {
+ jarsStream.close()
+ }
+
+ distribute(jarsArchive.toURI.getPath,
+ resType = LocalResourceType.ARCHIVE,
+ destName = Some(LOCALIZED_LIB_DIR))
+ }
+ }
+
+ /**
+ * Copy user jar to the distributed cache if their scheme is not "local".
+ * Otherwise, set the corresponding key in our SparkConf to handle it downstream.
+ */
+ Option(args.userJar).filter(_.trim.nonEmpty).foreach { jar =>
+ val (isLocal, localizedPath) = distribute(jar, destName = Some(APP_JAR_NAME))
+ if (isLocal) {
+ require(localizedPath != null, s"Path $jar already distributed")
+ // If the resource is intended for local use only, handle this downstream
+ // by setting the appropriate property
+ sparkConf.set(APP_JAR, localizedPath)
+ }
+ }
+
+ /**
+ * Do the same for any additional resources passed in through ClientArguments.
+ * Each resource category is represented by a 3-tuple of:
+ * (1) comma separated list of resources in this category,
+ * (2) resource type, and
+ * (3) whether to add these resources to the classpath
+ */
+ val cachedSecondaryJarLinks = ListBuffer.empty[String]
+ List(
+ (sparkConf.get(JARS_TO_DISTRIBUTE), LocalResourceType.FILE, true),
+ (sparkConf.get(FILES_TO_DISTRIBUTE), LocalResourceType.FILE, false),
+ (sparkConf.get(ARCHIVES_TO_DISTRIBUTE), LocalResourceType.ARCHIVE, false)
+ ).foreach { case (flist, resType, addToClasspath) =>
+ flist.foreach { file =>
+ val (_, localizedPath) = distribute(file, resType = resType)
+ // If addToClassPath, we ignore adding jar multiple times to distitrbuted cache.
+ if (addToClasspath) {
+ if (localizedPath != null) {
+ cachedSecondaryJarLinks += localizedPath
+ }
+ } else {
+ if (localizedPath == null) {
+ throw new IllegalArgumentException(s"Attempt to add ($file) multiple times" +
+ " to the distributed cache.")
+ }
+ }
+ }
+ }
+ if (cachedSecondaryJarLinks.nonEmpty) {
+ sparkConf.set(SECONDARY_JARS, cachedSecondaryJarLinks)
+ }
+
+ if (isClusterMode && args.primaryPyFile != null) {
+ distribute(args.primaryPyFile, appMasterOnly = true)
+ }
+
+ pySparkArchives.foreach { f => distribute(f) }
+
+ // The python files list needs to be treated especially. All files that are not an
+ // archive need to be placed in a subdirectory that will be added to PYTHONPATH.
+ sparkConf.get(PY_FILES).foreach { f =>
+ val targetDir = if (f.endsWith(".py")) Some(LOCALIZED_PYTHON_DIR) else None
+ distribute(f, targetDir = targetDir)
+ }
+
+ // Update the configuration with all the distributed files, minus the conf archive. The
+ // conf archive will be handled by the AM differently so that we avoid having to send
+ // this configuration by other means. See SPARK-14602 for one reason of why this is needed.
+ distCacheMgr.updateConfiguration(sparkConf)
+
+ // Upload the conf archive to HDFS manually, and record its location in the configuration.
+ // This will allow the AM to know where the conf archive is in HDFS, so that it can be
+ // distributed to the containers.
+ //
+ // This code forces the archive to be copied, so that unit tests pass (since in that case both
+ // file systems are the same and the archive wouldn't normally be copied). In most (all?)
+ // deployments, the archive would be copied anyway, since it's a temp file in the local file
+ // system.
+ val remoteConfArchivePath = new Path(destDir, LOCALIZED_CONF_ARCHIVE)
+ val remoteFs = FileSystem.get(remoteConfArchivePath.toUri(), hadoopConf)
+ sparkConf.set(CACHED_CONF_ARCHIVE, remoteConfArchivePath.toString())
+
+ val localConfArchive = new Path(createConfArchive().toURI())
+ copyFileToRemote(destDir, localConfArchive, replication, force = true,
+ destName = Some(LOCALIZED_CONF_ARCHIVE))
+
+ // Manually add the config archive to the cache manager so that the AM is launched with
+ // the proper files set up.
+ distCacheMgr.addResource(
+ remoteFs, hadoopConf, remoteConfArchivePath, localResources, LocalResourceType.ARCHIVE,
+ LOCALIZED_CONF_DIR, statCache, appMasterOnly = false)
+
+ // Clear the cache-related entries from the configuration to avoid them polluting the
+ // UI's environment page. This works for client mode; for cluster mode, this is handled
+ // by the AM.
+ CACHE_CONFIGS.foreach(sparkConf.remove)
+
+ localResources
+ }
+
+ /**
+ * Create an archive with the config files for distribution.
+ *
+ * These will be used by AM and executors. The files are zipped and added to the job as an
+ * archive, so that YARN will explode it when distributing to AM and executors. This directory
+ * is then added to the classpath of AM and executor process, just to make sure that everybody
+ * is using the same default config.
+ *
+ * This follows the order of precedence set by the startup scripts, in which HADOOP_CONF_DIR
+ * shows up in the classpath before YARN_CONF_DIR.
+ *
+ * Currently this makes a shallow copy of the conf directory. If there are cases where a
+ * Hadoop config directory contains subdirectories, this code will have to be fixed.
+ *
+ * The archive also contains some Spark configuration. Namely, it saves the contents of
+ * SparkConf in a file to be loaded by the AM process.
+ */
+ private def createConfArchive(): File = {
+ val hadoopConfFiles = new HashMap[String, File]()
+
+ // Uploading $SPARK_CONF_DIR/log4j.properties file to the distributed cache to make sure that
+ // the executors will use the latest configurations instead of the default values. This is
+ // required when user changes log4j.properties directly to set the log configurations. If
+ // configuration file is provided through --files then executors will be taking configurations
+ // from --files instead of $SPARK_CONF_DIR/log4j.properties.
+
+ // Also uploading metrics.properties to distributed cache if exists in classpath.
+ // If user specify this file using --files then executors will use the one
+ // from --files instead.
+ for { prop <- Seq("log4j.properties", "metrics.properties")
+ url <- Option(Utils.getContextOrSparkClassLoader.getResource(prop))
+ if url.getProtocol == "file" } {
+ hadoopConfFiles(prop) = new File(url.getPath)
+ }
+
+ Seq("HADOOP_CONF_DIR", "YARN_CONF_DIR").foreach { envKey =>
+ sys.env.get(envKey).foreach { path =>
+ val dir = new File(path)
+ if (dir.isDirectory()) {
+ val files = dir.listFiles()
+ if (files == null) {
+ logWarning("Failed to list files under directory " + dir)
+ } else {
+ files.foreach { file =>
+ if (file.isFile && !hadoopConfFiles.contains(file.getName())) {
+ hadoopConfFiles(file.getName()) = file
+ }
+ }
+ }
+ }
+ }
+ }
+
+ val confArchive = File.createTempFile(LOCALIZED_CONF_DIR, ".zip",
+ new File(Utils.getLocalDir(sparkConf)))
+ val confStream = new ZipOutputStream(new FileOutputStream(confArchive))
+
+ try {
+ confStream.setLevel(0)
+ hadoopConfFiles.foreach { case (name, file) =>
+ if (file.canRead()) {
+ confStream.putNextEntry(new ZipEntry(name))
+ Files.copy(file, confStream)
+ confStream.closeEntry()
+ }
+ }
+
+ // Save Spark configuration to a file in the archive.
+ val props = new Properties()
+ sparkConf.getAll.foreach { case (k, v) => props.setProperty(k, v) }
+ confStream.putNextEntry(new ZipEntry(SPARK_CONF_FILE))
+ val writer = new OutputStreamWriter(confStream, StandardCharsets.UTF_8)
+ props.store(writer, "Spark configuration.")
+ writer.flush()
+ confStream.closeEntry()
+ } finally {
+ confStream.close()
+ }
+ confArchive
+ }
+
+ /**
+ * Set up the environment for launching our ApplicationMaster container.
+ */
+ private def setupLaunchEnv(
+ stagingDirPath: Path,
+ pySparkArchives: Seq[String]): HashMap[String, String] = {
+ logInfo("Setting up the launch environment for our AM container")
+ val env = new HashMap[String, String]()
+ populateClasspath(args, yarnConf, sparkConf, env, sparkConf.get(DRIVER_CLASS_PATH))
+ env("SPARK_YARN_MODE") = "true"
+ env("SPARK_YARN_STAGING_DIR") = stagingDirPath.toString
+ env("SPARK_USER") = UserGroupInformation.getCurrentUser().getShortUserName()
+ if (loginFromKeytab) {
+ val credentialsFile = "credentials-" + UUID.randomUUID().toString
+ sparkConf.set(CREDENTIALS_FILE_PATH, new Path(stagingDirPath, credentialsFile).toString)
+ logInfo(s"Credentials file set to: $credentialsFile")
+ }
+
+ // Pick up any environment variables for the AM provided through spark.yarn.appMasterEnv.*
+ val amEnvPrefix = "spark.yarn.appMasterEnv."
+ sparkConf.getAll
+ .filter { case (k, v) => k.startsWith(amEnvPrefix) }
+ .map { case (k, v) => (k.substring(amEnvPrefix.length), v) }
+ .foreach { case (k, v) => YarnSparkHadoopUtil.addPathToEnvironment(env, k, v) }
+
+ // Keep this for backwards compatibility but users should move to the config
+ sys.env.get("SPARK_YARN_USER_ENV").foreach { userEnvs =>
+ // Allow users to specify some environment variables.
+ YarnSparkHadoopUtil.setEnvFromInputString(env, userEnvs)
+ // Pass SPARK_YARN_USER_ENV itself to the AM so it can use it to set up executor environments.
+ env("SPARK_YARN_USER_ENV") = userEnvs
+ }
+
+ // If pyFiles contains any .py files, we need to add LOCALIZED_PYTHON_DIR to the PYTHONPATH
+ // of the container processes too. Add all non-.py files directly to PYTHONPATH.
+ //
+ // NOTE: the code currently does not handle .py files defined with a "local:" scheme.
+ val pythonPath = new ListBuffer[String]()
+ val (pyFiles, pyArchives) = sparkConf.get(PY_FILES).partition(_.endsWith(".py"))
+ if (pyFiles.nonEmpty) {
+ pythonPath += buildPath(YarnSparkHadoopUtil.expandEnvironment(Environment.PWD),
+ LOCALIZED_PYTHON_DIR)
+ }
+ (pySparkArchives ++ pyArchives).foreach { path =>
+ val uri = Utils.resolveURI(path)
+ if (uri.getScheme != LOCAL_SCHEME) {
+ pythonPath += buildPath(YarnSparkHadoopUtil.expandEnvironment(Environment.PWD),
+ new Path(uri).getName())
+ } else {
+ pythonPath += uri.getPath()
+ }
+ }
+
+ // Finally, update the Spark config to propagate PYTHONPATH to the AM and executors.
+ if (pythonPath.nonEmpty) {
+ val pythonPathStr = (sys.env.get("PYTHONPATH") ++ pythonPath)
+ .mkString(YarnSparkHadoopUtil.getClassPathSeparator)
+ env("PYTHONPATH") = pythonPathStr
+ sparkConf.setExecutorEnv("PYTHONPATH", pythonPathStr)
+ }
+
+ // In cluster mode, if the deprecated SPARK_JAVA_OPTS is set, we need to propagate it to
+ // executors. But we can't just set spark.executor.extraJavaOptions, because the driver's
+ // SparkContext will not let that set spark* system properties, which is expected behavior for
+ // Yarn clients. So propagate it through the environment.
+ //
+ // Note that to warn the user about the deprecation in cluster mode, some code from
+ // SparkConf#validateSettings() is duplicated here (to avoid triggering the condition
+ // described above).
+ if (isClusterMode) {
+ sys.env.get("SPARK_JAVA_OPTS").foreach { value =>
+ val warning =
+ s"""
+ |SPARK_JAVA_OPTS was detected (set to '$value').
+ |This is deprecated in Spark 1.0+.
+ |
+ |Please instead use:
+ | - ./spark-submit with conf/spark-defaults.conf to set defaults for an application
+ | - ./spark-submit with --driver-java-options to set -X options for a driver
+ | - spark.executor.extraJavaOptions to set -X options for executors
+ """.stripMargin
+ logWarning(warning)
+ for (proc <- Seq("driver", "executor")) {
+ val key = s"spark.$proc.extraJavaOptions"
+ if (sparkConf.contains(key)) {
+ throw new SparkException(s"Found both $key and SPARK_JAVA_OPTS. Use only the former.")
+ }
+ }
+ env("SPARK_JAVA_OPTS") = value
+ }
+ // propagate PYSPARK_DRIVER_PYTHON and PYSPARK_PYTHON to driver in cluster mode
+ Seq("PYSPARK_DRIVER_PYTHON", "PYSPARK_PYTHON").foreach { envname =>
+ if (!env.contains(envname)) {
+ sys.env.get(envname).foreach(env(envname) = _)
+ }
+ }
+ }
+
+ sys.env.get(ENV_DIST_CLASSPATH).foreach { dcp =>
+ env(ENV_DIST_CLASSPATH) = dcp
+ }
+
+ env
+ }
+
+ /**
+ * Set up a ContainerLaunchContext to launch our ApplicationMaster container.
+ * This sets up the launch environment, java options, and the command for launching the AM.
+ */
+ private def createContainerLaunchContext(newAppResponse: GetNewApplicationResponse)
+ : ContainerLaunchContext = {
+ logInfo("Setting up container launch context for our AM")
+ val appId = newAppResponse.getApplicationId
+ val appStagingDirPath = new Path(appStagingBaseDir, getAppStagingDir(appId))
+ val pySparkArchives =
+ if (sparkConf.get(IS_PYTHON_APP)) {
+ findPySparkArchives()
+ } else {
+ Nil
+ }
+ val launchEnv = setupLaunchEnv(appStagingDirPath, pySparkArchives)
+ val localResources = prepareLocalResources(appStagingDirPath, pySparkArchives)
+
+ val amContainer = Records.newRecord(classOf[ContainerLaunchContext])
+ amContainer.setLocalResources(localResources.asJava)
+ amContainer.setEnvironment(launchEnv.asJava)
+
+ val javaOpts = ListBuffer[String]()
+
+ // Set the environment variable through a command prefix
+ // to append to the existing value of the variable
+ var prefixEnv: Option[String] = None
+
+ // Add Xmx for AM memory
+ javaOpts += "-Xmx" + amMemory + "m"
+
+ val tmpDir = new Path(
+ YarnSparkHadoopUtil.expandEnvironment(Environment.PWD),
+ YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR
+ )
+ javaOpts += "-Djava.io.tmpdir=" + tmpDir
+
+ // TODO: Remove once cpuset version is pushed out.
+ // The context is, default gc for server class machines ends up using all cores to do gc -
+ // hence if there are multiple containers in same node, Spark GC affects all other containers'
+ // performance (which can be that of other Spark containers)
+ // Instead of using this, rely on cpusets by YARN to enforce "proper" Spark behavior in
+ // multi-tenant environments. Not sure how default Java GC behaves if it is limited to subset
+ // of cores on a node.
+ val useConcurrentAndIncrementalGC = launchEnv.get("SPARK_USE_CONC_INCR_GC").exists(_.toBoolean)
+ if (useConcurrentAndIncrementalGC) {
+ // In our expts, using (default) throughput collector has severe perf ramifications in
+ // multi-tenant machines
+ javaOpts += "-XX:+UseConcMarkSweepGC"
+ javaOpts += "-XX:MaxTenuringThreshold=31"
+ javaOpts += "-XX:SurvivorRatio=8"
+ javaOpts += "-XX:+CMSIncrementalMode"
+ javaOpts += "-XX:+CMSIncrementalPacing"
+ javaOpts += "-XX:CMSIncrementalDutyCycleMin=0"
+ javaOpts += "-XX:CMSIncrementalDutyCycle=10"
+ }
+
+ // Include driver-specific java options if we are launching a driver
+ if (isClusterMode) {
+ val driverOpts = sparkConf.get(DRIVER_JAVA_OPTIONS).orElse(sys.env.get("SPARK_JAVA_OPTS"))
+ driverOpts.foreach { opts =>
+ javaOpts ++= Utils.splitCommandString(opts).map(YarnSparkHadoopUtil.escapeForShell)
+ }
+ val libraryPaths = Seq(sparkConf.get(DRIVER_LIBRARY_PATH),
+ sys.props.get("spark.driver.libraryPath")).flatten
+ if (libraryPaths.nonEmpty) {
+ prefixEnv = Some(getClusterPath(sparkConf, Utils.libraryPathEnvPrefix(libraryPaths)))
+ }
+ if (sparkConf.get(AM_JAVA_OPTIONS).isDefined) {
+ logWarning(s"${AM_JAVA_OPTIONS.key} will not take effect in cluster mode")
+ }
+ } else {
+ // Validate and include yarn am specific java options in yarn-client mode.
+ sparkConf.get(AM_JAVA_OPTIONS).foreach { opts =>
+ if (opts.contains("-Dspark")) {
+ val msg = s"${AM_JAVA_OPTIONS.key} is not allowed to set Spark options (was '$opts')."
+ throw new SparkException(msg)
+ }
+ if (opts.contains("-Xmx")) {
+ val msg = s"${AM_JAVA_OPTIONS.key} is not allowed to specify max heap memory settings " +
+ s"(was '$opts'). Use spark.yarn.am.memory instead."
+ throw new SparkException(msg)
+ }
+ javaOpts ++= Utils.splitCommandString(opts).map(YarnSparkHadoopUtil.escapeForShell)
+ }
+ sparkConf.get(AM_LIBRARY_PATH).foreach { paths =>
+ prefixEnv = Some(getClusterPath(sparkConf, Utils.libraryPathEnvPrefix(Seq(paths))))
+ }
+ }
+
+ // For log4j configuration to reference
+ javaOpts += ("-Dspark.yarn.app.container.log.dir=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR)
+ YarnCommandBuilderUtils.addPermGenSizeOpt(javaOpts)
+
+ val userClass =
+ if (isClusterMode) {
+ Seq("--class", YarnSparkHadoopUtil.escapeForShell(args.userClass))
+ } else {
+ Nil
+ }
+ val userJar =
+ if (args.userJar != null) {
+ Seq("--jar", args.userJar)
+ } else {
+ Nil
+ }
+ val primaryPyFile =
+ if (isClusterMode && args.primaryPyFile != null) {
+ Seq("--primary-py-file", new Path(args.primaryPyFile).getName())
+ } else {
+ Nil
+ }
+ val primaryRFile =
+ if (args.primaryRFile != null) {
+ Seq("--primary-r-file", args.primaryRFile)
+ } else {
+ Nil
+ }
+ val amClass =
+ if (isClusterMode) {
+ Utils.classForName("org.apache.spark.deploy.yarn.ApplicationMaster").getName
+ } else {
+ Utils.classForName("org.apache.spark.deploy.yarn.ExecutorLauncher").getName
+ }
+ if (args.primaryRFile != null && args.primaryRFile.endsWith(".R")) {
+ args.userArgs = ArrayBuffer(args.primaryRFile) ++ args.userArgs
+ }
+ val userArgs = args.userArgs.flatMap { arg =>
+ Seq("--arg", YarnSparkHadoopUtil.escapeForShell(arg))
+ }
+ val amArgs =
+ Seq(amClass) ++ userClass ++ userJar ++ primaryPyFile ++ primaryRFile ++
+ userArgs ++ Seq(
+ "--properties-file", buildPath(YarnSparkHadoopUtil.expandEnvironment(Environment.PWD),
+ LOCALIZED_CONF_DIR, SPARK_CONF_FILE))
+
+ // Command for the ApplicationMaster
+ val commands = prefixEnv ++ Seq(
+ YarnSparkHadoopUtil.expandEnvironment(Environment.JAVA_HOME) + "/bin/java", "-server"
+ ) ++
+ javaOpts ++ amArgs ++
+ Seq(
+ "1>", ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout",
+ "2>", ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr")
+
+ // TODO: it would be nicer to just make sure there are no null commands here
+ val printableCommands = commands.map(s => if (s == null) "null" else s).toList
+ amContainer.setCommands(printableCommands.asJava)
+
+ logDebug("===============================================================================")
+ logDebug("YARN AM launch context:")
+ logDebug(s" user class: ${Option(args.userClass).getOrElse("N/A")}")
+ logDebug(" env:")
+ launchEnv.foreach { case (k, v) => logDebug(s" $k -> $v") }
+ logDebug(" resources:")
+ localResources.foreach { case (k, v) => logDebug(s" $k -> $v")}
+ logDebug(" command:")
+ logDebug(s" ${printableCommands.mkString(" ")}")
+ logDebug("===============================================================================")
+
+ // send the acl settings into YARN to control who has access via YARN interfaces
+ val securityManager = new SecurityManager(sparkConf)
+ amContainer.setApplicationACLs(
+ YarnSparkHadoopUtil.getApplicationAclsForYarn(securityManager).asJava)
+ setupSecurityToken(amContainer)
+ amContainer
+ }
+
+ def setupCredentials(): Unit = {
+ loginFromKeytab = sparkConf.contains(PRINCIPAL.key)
+ if (loginFromKeytab) {
+ principal = sparkConf.get(PRINCIPAL).get
+ keytab = sparkConf.get(KEYTAB).orNull
+
+ require(keytab != null, "Keytab must be specified when principal is specified.")
+ logInfo("Attempting to login to the Kerberos" +
+ s" using principal: $principal and keytab: $keytab")
+ val f = new File(keytab)
+ // Generate a file name that can be used for the keytab file, that does not conflict
+ // with any user file.
+ val keytabFileName = f.getName + "-" + UUID.randomUUID().toString
+ sparkConf.set(KEYTAB.key, keytabFileName)
+ sparkConf.set(PRINCIPAL.key, principal)
+ }
+ // Defensive copy of the credentials
+ credentials = new Credentials(UserGroupInformation.getCurrentUser.getCredentials)
+ }
+
+ /**
+ * Report the state of an application until it has exited, either successfully or
+ * due to some failure, then return a pair of the yarn application state (FINISHED, FAILED,
+ * KILLED, or RUNNING) and the final application state (UNDEFINED, SUCCEEDED, FAILED,
+ * or KILLED).
+ *
+ * @param appId ID of the application to monitor.
+ * @param returnOnRunning Whether to also return the application state when it is RUNNING.
+ * @param logApplicationReport Whether to log details of the application report every iteration.
+ * @return A pair of the yarn application state and the final application state.
+ */
+ def monitorApplication(
+ appId: ApplicationId,
+ returnOnRunning: Boolean = false,
+ logApplicationReport: Boolean = true): (YarnApplicationState, FinalApplicationStatus) = {
+ val interval = sparkConf.get(REPORT_INTERVAL)
+ var lastState: YarnApplicationState = null
+ while (true) {
+ Thread.sleep(interval)
+ val report: ApplicationReport =
+ try {
+ getApplicationReport(appId)
+ } catch {
+ case e: ApplicationNotFoundException =>
+ logError(s"Application $appId not found.")
+ cleanupStagingDir(appId)
+ return (YarnApplicationState.KILLED, FinalApplicationStatus.KILLED)
+ case NonFatal(e) =>
+ logError(s"Failed to contact YARN for application $appId.", e)
+ // Don't necessarily clean up staging dir because status is unknown
+ return (YarnApplicationState.FAILED, FinalApplicationStatus.FAILED)
+ }
+ val state = report.getYarnApplicationState
+
+ if (logApplicationReport) {
+ logInfo(s"Application report for $appId (state: $state)")
+
+ // If DEBUG is enabled, log report details every iteration
+ // Otherwise, log them every time the application changes state
+ if (log.isDebugEnabled) {
+ logDebug(formatReportDetails(report))
+ } else if (lastState != state) {
+ logInfo(formatReportDetails(report))
+ }
+ }
+
+ if (lastState != state) {
+ state match {
+ case YarnApplicationState.RUNNING =>
+ reportLauncherState(SparkAppHandle.State.RUNNING)
+ case YarnApplicationState.FINISHED =>
+ report.getFinalApplicationStatus match {
+ case FinalApplicationStatus.FAILED =>
+ reportLauncherState(SparkAppHandle.State.FAILED)
+ case FinalApplicationStatus.KILLED =>
+ reportLauncherState(SparkAppHandle.State.KILLED)
+ case _ =>
+ reportLauncherState(SparkAppHandle.State.FINISHED)
+ }
+ case YarnApplicationState.FAILED =>
+ reportLauncherState(SparkAppHandle.State.FAILED)
+ case YarnApplicationState.KILLED =>
+ reportLauncherState(SparkAppHandle.State.KILLED)
+ case _ =>
+ }
+ }
+
+ if (state == YarnApplicationState.FINISHED ||
+ state == YarnApplicationState.FAILED ||
+ state == YarnApplicationState.KILLED) {
+ cleanupStagingDir(appId)
+ return (state, report.getFinalApplicationStatus)
+ }
+
+ if (returnOnRunning && state == YarnApplicationState.RUNNING) {
+ return (state, report.getFinalApplicationStatus)
+ }
+
+ lastState = state
+ }
+
+ // Never reached, but keeps compiler happy
+ throw new SparkException("While loop is depleted! This should never happen...")
+ }
+
+ private def formatReportDetails(report: ApplicationReport): String = {
+ val details = Seq[(String, String)](
+ ("client token", getClientToken(report)),
+ ("diagnostics", report.getDiagnostics),
+ ("ApplicationMaster host", report.getHost),
+ ("ApplicationMaster RPC port", report.getRpcPort.toString),
+ ("queue", report.getQueue),
+ ("start time", report.getStartTime.toString),
+ ("final status", report.getFinalApplicationStatus.toString),
+ ("tracking URL", report.getTrackingUrl),
+ ("user", report.getUser)
+ )
+
+ // Use more loggable format if value is null or empty
+ details.map { case (k, v) =>
+ val newValue = Option(v).filter(_.nonEmpty).getOrElse("N/A")
+ s"\n\t $k: $newValue"
+ }.mkString("")
+ }
+
+ /**
+ * Submit an application to the ResourceManager.
+ * If set spark.yarn.submit.waitAppCompletion to true, it will stay alive
+ * reporting the application's status until the application has exited for any reason.
+ * Otherwise, the client process will exit after submission.
+ * If the application finishes with a failed, killed, or undefined status,
+ * throw an appropriate SparkException.
+ */
+ def run(): Unit = {
+ this.appId = submitApplication()
+ if (!launcherBackend.isConnected() && fireAndForget) {
+ val report = getApplicationReport(appId)
+ val state = report.getYarnApplicationState
+ logInfo(s"Application report for $appId (state: $state)")
+ logInfo(formatReportDetails(report))
+ if (state == YarnApplicationState.FAILED || state == YarnApplicationState.KILLED) {
+ throw new SparkException(s"Application $appId finished with status: $state")
+ }
+ } else {
+ val (yarnApplicationState, finalApplicationStatus) = monitorApplication(appId)
+ if (yarnApplicationState == YarnApplicationState.FAILED ||
+ finalApplicationStatus == FinalApplicationStatus.FAILED) {
+ throw new SparkException(s"Application $appId finished with failed status")
+ }
+ if (yarnApplicationState == YarnApplicationState.KILLED ||
+ finalApplicationStatus == FinalApplicationStatus.KILLED) {
+ throw new SparkException(s"Application $appId is killed")
+ }
+ if (finalApplicationStatus == FinalApplicationStatus.UNDEFINED) {
+ throw new SparkException(s"The final status of application $appId is undefined")
+ }
+ }
+ }
+
+ private def findPySparkArchives(): Seq[String] = {
+ sys.env.get("PYSPARK_ARCHIVES_PATH")
+ .map(_.split(",").toSeq)
+ .getOrElse {
+ val pyLibPath = Seq(sys.env("SPARK_HOME"), "python", "lib").mkString(File.separator)
+ val pyArchivesFile = new File(pyLibPath, "pyspark.zip")
+ require(pyArchivesFile.exists(),
+ s"$pyArchivesFile not found; cannot run pyspark application in YARN mode.")
+ val py4jFile = new File(pyLibPath, "py4j-0.10.4-src.zip")
+ require(py4jFile.exists(),
+ s"$py4jFile not found; cannot run pyspark application in YARN mode.")
+ Seq(pyArchivesFile.getAbsolutePath(), py4jFile.getAbsolutePath())
+ }
+ }
+
+}
+
+private object Client extends Logging {
+
+ def main(argStrings: Array[String]) {
+ if (!sys.props.contains("SPARK_SUBMIT")) {
+ logWarning("WARNING: This client is deprecated and will be removed in a " +
+ "future version of Spark. Use ./bin/spark-submit with \"--master yarn\"")
+ }
+
+ // Set an env variable indicating we are running in YARN mode.
+ // Note that any env variable with the SPARK_ prefix gets propagated to all (remote) processes
+ System.setProperty("SPARK_YARN_MODE", "true")
+ val sparkConf = new SparkConf
+ // SparkSubmit would use yarn cache to distribute files & jars in yarn mode,
+ // so remove them from sparkConf here for yarn mode.
+ sparkConf.remove("spark.jars")
+ sparkConf.remove("spark.files")
+ val args = new ClientArguments(argStrings)
+ new Client(args, sparkConf).run()
+ }
+
+ // Alias for the user jar
+ val APP_JAR_NAME: String = "__app__.jar"
+
+ // URI scheme that identifies local resources
+ val LOCAL_SCHEME = "local"
+
+ // Staging directory for any temporary jars or files
+ val SPARK_STAGING: String = ".sparkStaging"
+
+
+ // Staging directory is private! -> rwx--------
+ val STAGING_DIR_PERMISSION: FsPermission =
+ FsPermission.createImmutable(Integer.parseInt("700", 8).toShort)
+
+ // App files are world-wide readable and owner writable -> rw-r--r--
+ val APP_FILE_PERMISSION: FsPermission =
+ FsPermission.createImmutable(Integer.parseInt("644", 8).toShort)
+
+ // Distribution-defined classpath to add to processes
+ val ENV_DIST_CLASSPATH = "SPARK_DIST_CLASSPATH"
+
+ // Subdirectory where the user's Spark and Hadoop config files will be placed.
+ val LOCALIZED_CONF_DIR = "__spark_conf__"
+
+ // File containing the conf archive in the AM. See prepareLocalResources().
+ val LOCALIZED_CONF_ARCHIVE = LOCALIZED_CONF_DIR + ".zip"
+
+ // Name of the file in the conf archive containing Spark configuration.
+ val SPARK_CONF_FILE = "__spark_conf__.properties"
+
+ // Subdirectory where the user's python files (not archives) will be placed.
+ val LOCALIZED_PYTHON_DIR = "__pyfiles__"
+
+ // Subdirectory where Spark libraries will be placed.
+ val LOCALIZED_LIB_DIR = "__spark_libs__"
+
+ /**
+ * Return the path to the given application's staging directory.
+ */
+ private def getAppStagingDir(appId: ApplicationId): String = {
+ buildPath(SPARK_STAGING, appId.toString())
+ }
+
+ /**
+ * Populate the classpath entry in the given environment map with any application
+ * classpath specified through the Hadoop and Yarn configurations.
+ */
+ private[yarn] def populateHadoopClasspath(conf: Configuration, env: HashMap[String, String])
+ : Unit = {
+ val classPathElementsToAdd = getYarnAppClasspath(conf) ++ getMRAppClasspath(conf)
+ for (c <- classPathElementsToAdd.flatten) {
+ YarnSparkHadoopUtil.addPathToEnvironment(env, Environment.CLASSPATH.name, c.trim)
+ }
+ }
+
+ private def getYarnAppClasspath(conf: Configuration): Option[Seq[String]] =
+ Option(conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH)) match {
+ case Some(s) => Some(s.toSeq)
+ case None => getDefaultYarnApplicationClasspath
+ }
+
+ private def getMRAppClasspath(conf: Configuration): Option[Seq[String]] =
+ Option(conf.getStrings("mapreduce.application.classpath")) match {
+ case Some(s) => Some(s.toSeq)
+ case None => getDefaultMRApplicationClasspath
+ }
+
+ private[yarn] def getDefaultYarnApplicationClasspath: Option[Seq[String]] = {
+ val triedDefault = Try[Seq[String]] {
+ val field = classOf[YarnConfiguration].getField("DEFAULT_YARN_APPLICATION_CLASSPATH")
+ val value = field.get(null).asInstanceOf[Array[String]]
+ value.toSeq
+ } recoverWith {
+ case e: NoSuchFieldException => Success(Seq.empty[String])
+ }
+
+ triedDefault match {
+ case f: Failure[_] =>
+ logError("Unable to obtain the default YARN Application classpath.", f.exception)
+ case s: Success[Seq[String]] =>
+ logDebug(s"Using the default YARN application classpath: ${s.get.mkString(",")}")
+ }
+
+ triedDefault.toOption
+ }
+
+ private[yarn] def getDefaultMRApplicationClasspath: Option[Seq[String]] = {
+ val triedDefault = Try[Seq[String]] {
+ val field = classOf[MRJobConfig].getField("DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH")
+ StringUtils.getStrings(field.get(null).asInstanceOf[String]).toSeq
+ } recoverWith {
+ case e: NoSuchFieldException => Success(Seq.empty[String])
+ }
+
+ triedDefault match {
+ case f: Failure[_] =>
+ logError("Unable to obtain the default MR Application classpath.", f.exception)
+ case s: Success[Seq[String]] =>
+ logDebug(s"Using the default MR application classpath: ${s.get.mkString(",")}")
+ }
+
+ triedDefault.toOption
+ }
+
+ /**
+ * Populate the classpath entry in the given environment map.
+ *
+ * User jars are generally not added to the JVM's system classpath; those are handled by the AM
+ * and executor backend. When the deprecated `spark.yarn.user.classpath.first` is used, user jars
+ * are included in the system classpath, though. The extra class path and other uploaded files are
+ * always made available through the system class path.
+ *
+ * @param args Client arguments (when starting the AM) or null (when starting executors).
+ */
+ private[yarn] def populateClasspath(
+ args: ClientArguments,
+ conf: Configuration,
+ sparkConf: SparkConf,
+ env: HashMap[String, String],
+ extraClassPath: Option[String] = None): Unit = {
+ extraClassPath.foreach { cp =>
+ addClasspathEntry(getClusterPath(sparkConf, cp), env)
+ }
+
+ addClasspathEntry(YarnSparkHadoopUtil.expandEnvironment(Environment.PWD), env)
+
+ addClasspathEntry(
+ YarnSparkHadoopUtil.expandEnvironment(Environment.PWD) + Path.SEPARATOR +
+ LOCALIZED_CONF_DIR, env)
+
+ if (sparkConf.get(USER_CLASS_PATH_FIRST)) {
+ // in order to properly add the app jar when user classpath is first
+ // we have to do the mainJar separate in order to send the right thing
+ // into addFileToClasspath
+ val mainJar =
+ if (args != null) {
+ getMainJarUri(Option(args.userJar))
+ } else {
+ getMainJarUri(sparkConf.get(APP_JAR))
+ }
+ mainJar.foreach(addFileToClasspath(sparkConf, conf, _, APP_JAR_NAME, env))
+
+ val secondaryJars =
+ if (args != null) {
+ getSecondaryJarUris(Option(sparkConf.get(JARS_TO_DISTRIBUTE)))
+ } else {
+ getSecondaryJarUris(sparkConf.get(SECONDARY_JARS))
+ }
+ secondaryJars.foreach { x =>
+ addFileToClasspath(sparkConf, conf, x, null, env)
+ }
+ }
+
+ // Add the Spark jars to the classpath, depending on how they were distributed.
+ addClasspathEntry(buildPath(YarnSparkHadoopUtil.expandEnvironment(Environment.PWD),
+ LOCALIZED_LIB_DIR, "*"), env)
+ if (!sparkConf.get(SPARK_ARCHIVE).isDefined) {
+ sparkConf.get(SPARK_JARS).foreach { jars =>
+ jars.filter(isLocalUri).foreach { jar =>
+ addClasspathEntry(getClusterPath(sparkConf, jar), env)
+ }
+ }
+ }
+
+ populateHadoopClasspath(conf, env)
+ sys.env.get(ENV_DIST_CLASSPATH).foreach { cp =>
+ addClasspathEntry(getClusterPath(sparkConf, cp), env)
+ }
+ }
+
+ /**
+ * Returns a list of URIs representing the user classpath.
+ *
+ * @param conf Spark configuration.
+ */
+ def getUserClasspath(conf: SparkConf): Array[URI] = {
+ val mainUri = getMainJarUri(conf.get(APP_JAR))
+ val secondaryUris = getSecondaryJarUris(conf.get(SECONDARY_JARS))
+ (mainUri ++ secondaryUris).toArray
+ }
+
+ private def getMainJarUri(mainJar: Option[String]): Option[URI] = {
+ mainJar.flatMap { path =>
+ val uri = Utils.resolveURI(path)
+ if (uri.getScheme == LOCAL_SCHEME) Some(uri) else None
+ }.orElse(Some(new URI(APP_JAR_NAME)))
+ }
+
+ private def getSecondaryJarUris(secondaryJars: Option[Seq[String]]): Seq[URI] = {
+ secondaryJars.getOrElse(Nil).map(new URI(_))
+ }
+
+ /**
+ * Adds the given path to the classpath, handling "local:" URIs correctly.
+ *
+ * If an alternate name for the file is given, and it's not a "local:" file, the alternate
+ * name will be added to the classpath (relative to the job's work directory).
+ *
+ * If not a "local:" file and no alternate name, the linkName will be added to the classpath.
+ *
+ * @param conf Spark configuration.
+ * @param hadoopConf Hadoop configuration.
+ * @param uri URI to add to classpath (optional).
+ * @param fileName Alternate name for the file (optional).
+ * @param env Map holding the environment variables.
+ */
+ private def addFileToClasspath(
+ conf: SparkConf,
+ hadoopConf: Configuration,
+ uri: URI,
+ fileName: String,
+ env: HashMap[String, String]): Unit = {
+ if (uri != null && uri.getScheme == LOCAL_SCHEME) {
+ addClasspathEntry(getClusterPath(conf, uri.getPath), env)
+ } else if (fileName != null) {
+ addClasspathEntry(buildPath(
+ YarnSparkHadoopUtil.expandEnvironment(Environment.PWD), fileName), env)
+ } else if (uri != null) {
+ val localPath = getQualifiedLocalPath(uri, hadoopConf)
+ val linkName = Option(uri.getFragment()).getOrElse(localPath.getName())
+ addClasspathEntry(buildPath(
+ YarnSparkHadoopUtil.expandEnvironment(Environment.PWD), linkName), env)
+ }
+ }
+
+ /**
+ * Add the given path to the classpath entry of the given environment map.
+ * If the classpath is already set, this appends the new path to the existing classpath.
+ */
+ private def addClasspathEntry(path: String, env: HashMap[String, String]): Unit =
+ YarnSparkHadoopUtil.addPathToEnvironment(env, Environment.CLASSPATH.name, path)
+
+ /**
+ * Returns the path to be sent to the NM for a path that is valid on the gateway.
+ *
+ * This method uses two configuration values:
+ *
+ * - spark.yarn.config.gatewayPath: a string that identifies a portion of the input path that may
+ * only be valid in the gateway node.
+ * - spark.yarn.config.replacementPath: a string with which to replace the gateway path. This may
+ * contain, for example, env variable references, which will be expanded by the NMs when
+ * starting containers.
+ *
+ * If either config is not available, the input path is returned.
+ */
+ def getClusterPath(conf: SparkConf, path: String): String = {
+ val localPath = conf.get(GATEWAY_ROOT_PATH)
+ val clusterPath = conf.get(REPLACEMENT_ROOT_PATH)
+ if (localPath != null && clusterPath != null) {
+ path.replace(localPath, clusterPath)
+ } else {
+ path
+ }
+ }
+
+ /**
+ * Return whether the two file systems are the same.
+ */
+ private def compareFs(srcFs: FileSystem, destFs: FileSystem): Boolean = {
+ val srcUri = srcFs.getUri()
+ val dstUri = destFs.getUri()
+ if (srcUri.getScheme() == null || srcUri.getScheme() != dstUri.getScheme()) {
+ return false
+ }
+
+ var srcHost = srcUri.getHost()
+ var dstHost = dstUri.getHost()
+
+ // In HA or when using viewfs, the host part of the URI may not actually be a host, but the
+ // name of the HDFS namespace. Those names won't resolve, so avoid even trying if they
+ // match.
+ if (srcHost != null && dstHost != null && srcHost != dstHost) {
+ try {
+ srcHost = InetAddress.getByName(srcHost).getCanonicalHostName()
+ dstHost = InetAddress.getByName(dstHost).getCanonicalHostName()
+ } catch {
+ case e: UnknownHostException =>
+ return false
+ }
+ }
+
+ Objects.equal(srcHost, dstHost) && srcUri.getPort() == dstUri.getPort()
+ }
+
+ /**
+ * Given a local URI, resolve it and return a qualified local path that corresponds to the URI.
+ * This is used for preparing local resources to be included in the container launch context.
+ */
+ private def getQualifiedLocalPath(localURI: URI, hadoopConf: Configuration): Path = {
+ val qualifiedURI =
+ if (localURI.getScheme == null) {
+ // If not specified, assume this is in the local filesystem to keep the behavior
+ // consistent with that of Hadoop
+ new URI(FileSystem.getLocal(hadoopConf).makeQualified(new Path(localURI)).toString)
+ } else {
+ localURI
+ }
+ new Path(qualifiedURI)
+ }
+
+ /**
+ * Whether to consider jars provided by the user to have precedence over the Spark jars when
+ * loading user classes.
+ */
+ def isUserClassPathFirst(conf: SparkConf, isDriver: Boolean): Boolean = {
+ if (isDriver) {
+ conf.get(DRIVER_USER_CLASS_PATH_FIRST)
+ } else {
+ conf.get(EXECUTOR_USER_CLASS_PATH_FIRST)
+ }
+ }
+
+ /**
+ * Joins all the path components using Path.SEPARATOR.
+ */
+ def buildPath(components: String*): String = {
+ components.mkString(Path.SEPARATOR)
+ }
+
+ /** Returns whether the URI is a "local:" URI. */
+ def isLocalUri(uri: String): Boolean = {
+ uri.startsWith(s"$LOCAL_SCHEME:")
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
new file mode 100644
index 0000000..61c027e
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import scala.collection.mutable.ArrayBuffer
+
+// TODO: Add code and support for ensuring that yarn resource 'tasks' are location aware !
+private[spark] class ClientArguments(args: Array[String]) {
+
+ var userJar: String = null
+ var userClass: String = null
+ var primaryPyFile: String = null
+ var primaryRFile: String = null
+ var userArgs: ArrayBuffer[String] = new ArrayBuffer[String]()
+
+ parseArgs(args.toList)
+
+ private def parseArgs(inputArgs: List[String]): Unit = {
+ var args = inputArgs
+
+ while (!args.isEmpty) {
+ args match {
+ case ("--jar") :: value :: tail =>
+ userJar = value
+ args = tail
+
+ case ("--class") :: value :: tail =>
+ userClass = value
+ args = tail
+
+ case ("--primary-py-file") :: value :: tail =>
+ primaryPyFile = value
+ args = tail
+
+ case ("--primary-r-file") :: value :: tail =>
+ primaryRFile = value
+ args = tail
+
+ case ("--arg") :: value :: tail =>
+ userArgs += value
+ args = tail
+
+ case Nil =>
+
+ case _ =>
+ throw new IllegalArgumentException(getUsageMessage(args))
+ }
+ }
+
+ if (primaryPyFile != null && primaryRFile != null) {
+ throw new IllegalArgumentException("Cannot have primary-py-file and primary-r-file" +
+ " at the same time")
+ }
+ }
+
+ private def getUsageMessage(unknownParam: List[String] = null): String = {
+ val message = if (unknownParam != null) s"Unknown/unsupported param $unknownParam\n" else ""
+ message +
+ s"""
+ |Usage: org.apache.spark.deploy.yarn.Client [options]
+ |Options:
+ | --jar JAR_PATH Path to your application's JAR file (required in yarn-cluster
+ | mode)
+ | --class CLASS_NAME Name of your application's main class (required)
+ | --primary-py-file A main Python file
+ | --primary-r-file A main R file
+ | --arg ARG Argument to be passed to your application's main class.
+ | Multiple invocations are possible, each will be passed in order.
+ """.stripMargin
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala
new file mode 100644
index 0000000..dcc2288
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import java.net.URI
+
+import scala.collection.mutable.{HashMap, ListBuffer, Map}
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
+import org.apache.hadoop.fs.permission.FsAction
+import org.apache.hadoop.yarn.api.records._
+import org.apache.hadoop.yarn.util.{ConverterUtils, Records}
+
+import org.apache.spark.SparkConf
+import org.apache.spark.deploy.yarn.config._
+import org.apache.spark.internal.Logging
+
+private case class CacheEntry(
+ uri: URI,
+ size: Long,
+ modTime: Long,
+ visibility: LocalResourceVisibility,
+ resType: LocalResourceType)
+
+/** Client side methods to setup the Hadoop distributed cache */
+private[spark] class ClientDistributedCacheManager() extends Logging {
+
+ private val distCacheEntries = new ListBuffer[CacheEntry]()
+
+ /**
+ * Add a resource to the list of distributed cache resources. This list can
+ * be sent to the ApplicationMaster and possibly the executors so that it can
+ * be downloaded into the Hadoop distributed cache for use by this application.
+ * Adds the LocalResource to the localResources HashMap passed in and saves
+ * the stats of the resources to they can be sent to the executors and verified.
+ *
+ * @param fs FileSystem
+ * @param conf Configuration
+ * @param destPath path to the resource
+ * @param localResources localResource hashMap to insert the resource into
+ * @param resourceType LocalResourceType
+ * @param link link presented in the distributed cache to the destination
+ * @param statCache cache to store the file/directory stats
+ * @param appMasterOnly Whether to only add the resource to the app master
+ */
+ def addResource(
+ fs: FileSystem,
+ conf: Configuration,
+ destPath: Path,
+ localResources: HashMap[String, LocalResource],
+ resourceType: LocalResourceType,
+ link: String,
+ statCache: Map[URI, FileStatus],
+ appMasterOnly: Boolean = false): Unit = {
+ val destStatus = fs.getFileStatus(destPath)
+ val amJarRsrc = Records.newRecord(classOf[LocalResource])
+ amJarRsrc.setType(resourceType)
+ val visibility = getVisibility(conf, destPath.toUri(), statCache)
+ amJarRsrc.setVisibility(visibility)
+ amJarRsrc.setResource(ConverterUtils.getYarnUrlFromPath(destPath))
+ amJarRsrc.setTimestamp(destStatus.getModificationTime())
+ amJarRsrc.setSize(destStatus.getLen())
+ require(link != null && link.nonEmpty, "You must specify a valid link name.")
+ localResources(link) = amJarRsrc
+
+ if (!appMasterOnly) {
+ val uri = destPath.toUri()
+ val pathURI = new URI(uri.getScheme(), uri.getAuthority(), uri.getPath(), null, link)
+ distCacheEntries += CacheEntry(pathURI, destStatus.getLen(), destStatus.getModificationTime(),
+ visibility, resourceType)
+ }
+ }
+
+ /**
+ * Writes down information about cached files needed in executors to the given configuration.
+ */
+ def updateConfiguration(conf: SparkConf): Unit = {
+ conf.set(CACHED_FILES, distCacheEntries.map(_.uri.toString))
+ conf.set(CACHED_FILES_SIZES, distCacheEntries.map(_.size))
+ conf.set(CACHED_FILES_TIMESTAMPS, distCacheEntries.map(_.modTime))
+ conf.set(CACHED_FILES_VISIBILITIES, distCacheEntries.map(_.visibility.name()))
+ conf.set(CACHED_FILES_TYPES, distCacheEntries.map(_.resType.name()))
+ }
+
+ /**
+ * Returns the local resource visibility depending on the cache file permissions
+ * @return LocalResourceVisibility
+ */
+ private[yarn] def getVisibility(
+ conf: Configuration,
+ uri: URI,
+ statCache: Map[URI, FileStatus]): LocalResourceVisibility = {
+ if (isPublic(conf, uri, statCache)) {
+ LocalResourceVisibility.PUBLIC
+ } else {
+ LocalResourceVisibility.PRIVATE
+ }
+ }
+
+ /**
+ * Returns a boolean to denote whether a cache file is visible to all (public)
+ * @return true if the path in the uri is visible to all, false otherwise
+ */
+ private def isPublic(conf: Configuration, uri: URI, statCache: Map[URI, FileStatus]): Boolean = {
+ val fs = FileSystem.get(uri, conf)
+ val current = new Path(uri.getPath())
+ // the leaf level file should be readable by others
+ if (!checkPermissionOfOther(fs, current, FsAction.READ, statCache)) {
+ return false
+ }
+ ancestorsHaveExecutePermissions(fs, current.getParent(), statCache)
+ }
+
+ /**
+ * Returns true if all ancestors of the specified path have the 'execute'
+ * permission set for all users (i.e. that other users can traverse
+ * the directory hierarchy to the given path)
+ * @return true if all ancestors have the 'execute' permission set for all users
+ */
+ private def ancestorsHaveExecutePermissions(
+ fs: FileSystem,
+ path: Path,
+ statCache: Map[URI, FileStatus]): Boolean = {
+ var current = path
+ while (current != null) {
+ // the subdirs in the path should have execute permissions for others
+ if (!checkPermissionOfOther(fs, current, FsAction.EXECUTE, statCache)) {
+ return false
+ }
+ current = current.getParent()
+ }
+ true
+ }
+
+ /**
+ * Checks for a given path whether the Other permissions on it
+ * imply the permission in the passed FsAction
+ * @return true if the path in the uri is visible to all, false otherwise
+ */
+ private def checkPermissionOfOther(
+ fs: FileSystem,
+ path: Path,
+ action: FsAction,
+ statCache: Map[URI, FileStatus]): Boolean = {
+ val status = getFileStatus(fs, path.toUri(), statCache)
+ val perms = status.getPermission()
+ val otherAction = perms.getOtherAction()
+ otherAction.implies(action)
+ }
+
+ /**
+ * Checks to see if the given uri exists in the cache, if it does it
+ * returns the existing FileStatus, otherwise it stats the uri, stores
+ * it in the cache, and returns the FileStatus.
+ * @return FileStatus
+ */
+ private[yarn] def getFileStatus(
+ fs: FileSystem,
+ uri: URI,
+ statCache: Map[URI, FileStatus]): FileStatus = {
+ val stat = statCache.get(uri) match {
+ case Some(existstat) => existstat
+ case None =>
+ val newStat = fs.getFileStatus(new Path(uri))
+ statCache.put(uri, newStat)
+ newStat
+ }
+ stat
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
new file mode 100644
index 0000000..868c2ed
--- /dev/null
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
@@ -0,0 +1,266 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import java.io.File
+import java.nio.ByteBuffer
+import java.util.Collections
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable.{HashMap, ListBuffer}
+
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.io.DataOutputBuffer
+import org.apache.hadoop.security.UserGroupInformation
+import org.apache.hadoop.yarn.api._
+import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
+import org.apache.hadoop.yarn.api.records._
+import org.apache.hadoop.yarn.client.api.NMClient
+import org.apache.hadoop.yarn.conf.YarnConfiguration
+import org.apache.hadoop.yarn.ipc.YarnRPC
+import org.apache.hadoop.yarn.util.{ConverterUtils, Records}
+
+import org.apache.spark.{SecurityManager, SparkConf, SparkException}
+import org.apache.spark.internal.Logging
+import org.apache.spark.internal.config._
+import org.apache.spark.launcher.YarnCommandBuilderUtils
+import org.apache.spark.network.util.JavaUtils
+import org.apache.spark.util.Utils
+
+private[yarn] class ExecutorRunnable(
+ container: Option[Container],
+ conf: YarnConfiguration,
+ sparkConf: SparkConf,
+ masterAddress: String,
+ executorId: String,
+ hostname: String,
+ executorMemory: Int,
+ executorCores: Int,
+ appId: String,
+ securityMgr: SecurityManager,
+ localResources: Map[String, LocalResource]) extends Logging {
+
+ var rpc: YarnRPC = YarnRPC.create(conf)
+ var nmClient: NMClient = _
+
+ def run(): Unit = {
+ logDebug("Starting Executor Container")
+ nmClient = NMClient.createNMClient()
+ nmClient.init(conf)
+ nmClient.start()
+ startContainer()
+ }
+
+ def launchContextDebugInfo(): String = {
+ val commands = prepareCommand()
+ val env = prepareEnvironment()
+
+ s"""
+ |===============================================================================
+ |YARN executor launch context:
+ | env:
+ |${Utils.redact(sparkConf, env.toSeq).map { case (k, v) => s" $k -> $v\n" }.mkString}
+ | command:
+ | ${commands.mkString(" \\ \n ")}
+ |
+ | resources:
+ |${localResources.map { case (k, v) => s" $k -> $v\n" }.mkString}
+ |===============================================================================""".stripMargin
+ }
+
+ def startContainer(): java.util.Map[String, ByteBuffer] = {
+ val ctx = Records.newRecord(classOf[ContainerLaunchContext])
+ .asInstanceOf[ContainerLaunchContext]
+ val env = prepareEnvironment().asJava
+
+ ctx.setLocalResources(localResources.asJava)
+ ctx.setEnvironment(env)
+
+ val credentials = UserGroupInformation.getCurrentUser().getCredentials()
+ val dob = new DataOutputBuffer()
+ credentials.writeTokenStorageToStream(dob)
+ ctx.setTokens(ByteBuffer.wrap(dob.getData()))
+
+ val commands = prepareCommand()
+
+ ctx.setCommands(commands.asJava)
+ ctx.setApplicationACLs(
+ YarnSparkHadoopUtil.getApplicationAclsForYarn(securityMgr).asJava)
+
+ // If external shuffle service is enabled, register with the Yarn shuffle service already
+ // started on the NodeManager and, if authentication is enabled, provide it with our secret
+ // key for fetching shuffle files later
+ if (sparkConf.get(SHUFFLE_SERVICE_ENABLED)) {
+ val secretString = securityMgr.getSecretKey()
+ val secretBytes =
+ if (secretString != null) {
+ // This conversion must match how the YarnShuffleService decodes our secret
+ JavaUtils.stringToBytes(secretString)
+ } else {
+ // Authentication is not enabled, so just provide dummy metadata
+ ByteBuffer.allocate(0)
+ }
+ ctx.setServiceData(Collections.singletonMap("spark_shuffle", secretBytes))
+ }
+
+ // Send the start request to the ContainerManager
+ try {
+ nmClient.startContainer(container.get, ctx)
+ } catch {
+ case ex: Exception =>
+ throw new SparkException(s"Exception while starting container ${container.get.getId}" +
+ s" on host $hostname", ex)
+ }
+ }
+
+ private def prepareCommand(): List[String] = {
+ // Extra options for the JVM
+ val javaOpts = ListBuffer[String]()
+
+ // Set the environment variable through a command prefix
+ // to append to the existing value of the variable
+ var prefixEnv: Option[String] = None
+
+ // Set the JVM memory
+ val executorMemoryString = executorMemory + "m"
+ javaOpts += "-Xmx" + executorMemoryString
+
+ // Set extra Java options for the executor, if defined
+ sparkConf.get(EXECUTOR_JAVA_OPTIONS).foreach { opts =>
+ javaOpts ++= Utils.splitCommandString(opts).map(YarnSparkHadoopUtil.escapeForShell)
+ }
+ sys.env.get("SPARK_JAVA_OPTS").foreach { opts =>
+ javaOpts ++= Utils.splitCommandString(opts).map(YarnSparkHadoopUtil.escapeForShell)
+ }
+ sparkConf.get(EXECUTOR_LIBRARY_PATH).foreach { p =>
+ prefixEnv = Some(Client.getClusterPath(sparkConf, Utils.libraryPathEnvPrefix(Seq(p))))
+ }
+
+ javaOpts += "-Djava.io.tmpdir=" +
+ new Path(
+ YarnSparkHadoopUtil.expandEnvironment(Environment.PWD),
+ YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR
+ )
+
+ // Certain configs need to be passed here because they are needed before the Executor
+ // registers with the Scheduler and transfers the spark configs. Since the Executor backend
+ // uses RPC to connect to the scheduler, the RPC settings are needed as well as the
+ // authentication settings.
+ sparkConf.getAll
+ .filter { case (k, v) => SparkConf.isExecutorStartupConf(k) }
+ .foreach { case (k, v) => javaOpts += YarnSparkHadoopUtil.escapeForShell(s"-D$k=$v") }
+
+ // Commenting it out for now - so that people can refer to the properties if required. Remove
+ // it once cpuset version is pushed out.
+ // The context is, default gc for server class machines end up using all cores to do gc - hence
+ // if there are multiple containers in same node, spark gc effects all other containers
+ // performance (which can also be other spark containers)
+ // Instead of using this, rely on cpusets by YARN to enforce spark behaves 'properly' in
+ // multi-tenant environments. Not sure how default java gc behaves if it is limited to subset
+ // of cores on a node.
+ /*
+ else {
+ // If no java_opts specified, default to using -XX:+CMSIncrementalMode
+ // It might be possible that other modes/config is being done in
+ // spark.executor.extraJavaOptions, so we don't want to mess with it.
+ // In our expts, using (default) throughput collector has severe perf ramifications in
+ // multi-tenant machines
+ // The options are based on
+ // http://www.oracle.com/technetwork/java/gc-tuning-5-138395.html#0.0.0.%20When%20to%20Use
+ // %20the%20Concurrent%20Low%20Pause%20Collector|outline
+ javaOpts += "-XX:+UseConcMarkSweepGC"
+ javaOpts += "-XX:+CMSIncrementalMode"
+ javaOpts += "-XX:+CMSIncrementalPacing"
+ javaOpts += "-XX:CMSIncrementalDutyCycleMin=0"
+ javaOpts += "-XX:CMSIncrementalDutyCycle=10"
+ }
+ */
+
+ // For log4j configuration to reference
+ javaOpts += ("-Dspark.yarn.app.container.log.dir=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR)
+ YarnCommandBuilderUtils.addPermGenSizeOpt(javaOpts)
+
+ val userClassPath = Client.getUserClasspath(sparkConf).flatMap { uri =>
+ val absPath =
+ if (new File(uri.getPath()).isAbsolute()) {
+ Client.getClusterPath(sparkConf, uri.getPath())
+ } else {
+ Client.buildPath(Environment.PWD.$(), uri.getPath())
+ }
+ Seq("--user-class-path", "file:" + absPath)
+ }.toSeq
+
+ YarnSparkHadoopUtil.addOutOfMemoryErrorArgument(javaOpts)
+ val commands = prefixEnv ++ Seq(
+ YarnSparkHadoopUtil.expandEnvironment(Environment.JAVA_HOME) + "/bin/java",
+ "-server") ++
+ javaOpts ++
+ Seq("org.apache.spark.executor.CoarseGrainedExecutorBackend",
+ "--driver-url", masterAddress,
+ "--executor-id", executorId,
+ "--hostname", hostname,
+ "--cores", executorCores.toString,
+ "--app-id", appId) ++
+ userClassPath ++
+ Seq(
+ s"1>${ApplicationConstants.LOG_DIR_EXPANSION_VAR}/stdout",
+ s"2>${ApplicationConstants.LOG_DIR_EXPANSION_VAR}/stderr")
+
+ // TODO: it would be nicer to just make sure there are no null commands here
+ commands.map(s => if (s == null) "null" else s).toList
+ }
+
+ private def prepareEnvironment(): HashMap[String, String] = {
+ val env = new HashMap[String, String]()
+ Client.populateClasspath(null, conf, sparkConf, env, sparkConf.get(EXECUTOR_CLASS_PATH))
+
+ sparkConf.getExecutorEnv.foreach { case (key, value) =>
+ // This assumes each executor environment variable set here is a path
+ // This is kept for backward compatibility and consistency with hadoop
+ YarnSparkHadoopUtil.addPathToEnvironment(env, key, value)
+ }
+
+ // Keep this for backwards compatibility but users should move to the config
+ sys.env.get("SPARK_YARN_USER_ENV").foreach { userEnvs =>
+ YarnSparkHadoopUtil.setEnvFromInputString(env, userEnvs)
+ }
+
+ // lookup appropriate http scheme for container log urls
+ val yarnHttpPolicy = conf.get(
+ YarnConfiguration.YARN_HTTP_POLICY_KEY,
+ YarnConfiguration.YARN_HTTP_POLICY_DEFAULT
+ )
+ val httpScheme = if (yarnHttpPolicy == "HTTPS_ONLY") "https://" else "http://"
+
+ // Add log urls
+ container.foreach { c =>
+ sys.env.get("SPARK_USER").foreach { user =>
+ val containerId = ConverterUtils.toString(c.getId)
+ val address = c.getNodeHttpAddress
+ val baseUrl = s"$httpScheme$address/node/containerlogs/$containerId/$user"
+
+ env("SPARK_LOG_URL_STDERR") = s"$baseUrl/stderr?start=-4096"
+ env("SPARK_LOG_URL_STDOUT") = s"$baseUrl/stdout?start=-4096"
+ }
+ }
+
+ System.getenv().asScala.filterKeys(_.startsWith("SPARK"))
+ .foreach { case (k, v) => env(k) = v }
+ env
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org
[02/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Posted by va...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
deleted file mode 100644
index 7deaf0a..0000000
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
+++ /dev/null
@@ -1,462 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import java.io.{File, FileInputStream, FileOutputStream}
-import java.net.URI
-import java.util.Properties
-
-import scala.collection.JavaConverters._
-import scala.collection.mutable.{HashMap => MutableHashMap}
-import scala.reflect.ClassTag
-import scala.util.Try
-
-import org.apache.commons.lang3.SerializationUtils
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.Path
-import org.apache.hadoop.mapreduce.MRJobConfig
-import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
-import org.apache.hadoop.yarn.api.protocolrecords.GetNewApplicationResponse
-import org.apache.hadoop.yarn.api.records._
-import org.apache.hadoop.yarn.client.api.YarnClientApplication
-import org.apache.hadoop.yarn.conf.YarnConfiguration
-import org.apache.hadoop.yarn.util.Records
-import org.mockito.Matchers.{eq => meq, _}
-import org.mockito.Mockito._
-import org.scalatest.{BeforeAndAfterAll, Matchers}
-
-import org.apache.spark.{SparkConf, SparkFunSuite, TestUtils}
-import org.apache.spark.deploy.yarn.config._
-import org.apache.spark.util.{ResetSystemProperties, SparkConfWithEnv, Utils}
-
-class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll
- with ResetSystemProperties {
-
- import Client._
-
- var oldSystemProperties: Properties = null
-
- override def beforeAll(): Unit = {
- super.beforeAll()
- oldSystemProperties = SerializationUtils.clone(System.getProperties)
- System.setProperty("SPARK_YARN_MODE", "true")
- }
-
- override def afterAll(): Unit = {
- try {
- System.setProperties(oldSystemProperties)
- oldSystemProperties = null
- } finally {
- super.afterAll()
- }
- }
-
- test("default Yarn application classpath") {
- getDefaultYarnApplicationClasspath should be(Some(Fixtures.knownDefYarnAppCP))
- }
-
- test("default MR application classpath") {
- getDefaultMRApplicationClasspath should be(Some(Fixtures.knownDefMRAppCP))
- }
-
- test("resultant classpath for an application that defines a classpath for YARN") {
- withAppConf(Fixtures.mapYARNAppConf) { conf =>
- val env = newEnv
- populateHadoopClasspath(conf, env)
- classpath(env) should be(
- flatten(Fixtures.knownYARNAppCP, getDefaultMRApplicationClasspath))
- }
- }
-
- test("resultant classpath for an application that defines a classpath for MR") {
- withAppConf(Fixtures.mapMRAppConf) { conf =>
- val env = newEnv
- populateHadoopClasspath(conf, env)
- classpath(env) should be(
- flatten(getDefaultYarnApplicationClasspath, Fixtures.knownMRAppCP))
- }
- }
-
- test("resultant classpath for an application that defines both classpaths, YARN and MR") {
- withAppConf(Fixtures.mapAppConf) { conf =>
- val env = newEnv
- populateHadoopClasspath(conf, env)
- classpath(env) should be(flatten(Fixtures.knownYARNAppCP, Fixtures.knownMRAppCP))
- }
- }
-
- private val SPARK = "local:/sparkJar"
- private val USER = "local:/userJar"
- private val ADDED = "local:/addJar1,local:/addJar2,/addJar3"
-
- private val PWD =
- if (classOf[Environment].getMethods().exists(_.getName == "$$")) {
- "{{PWD}}"
- } else if (Utils.isWindows) {
- "%PWD%"
- } else {
- Environment.PWD.$()
- }
-
- test("Local jar URIs") {
- val conf = new Configuration()
- val sparkConf = new SparkConf()
- .set(SPARK_JARS, Seq(SPARK))
- .set(USER_CLASS_PATH_FIRST, true)
- .set("spark.yarn.dist.jars", ADDED)
- val env = new MutableHashMap[String, String]()
- val args = new ClientArguments(Array("--jar", USER))
-
- populateClasspath(args, conf, sparkConf, env)
-
- val cp = env("CLASSPATH").split(":|;|<CPS>")
- s"$SPARK,$USER,$ADDED".split(",").foreach({ entry =>
- val uri = new URI(entry)
- if (LOCAL_SCHEME.equals(uri.getScheme())) {
- cp should contain (uri.getPath())
- } else {
- cp should not contain (uri.getPath())
- }
- })
- cp should contain(PWD)
- cp should contain (s"$PWD${Path.SEPARATOR}${LOCALIZED_CONF_DIR}")
- cp should not contain (APP_JAR)
- }
-
- test("Jar path propagation through SparkConf") {
- val conf = new Configuration()
- val sparkConf = new SparkConf()
- .set(SPARK_JARS, Seq(SPARK))
- .set("spark.yarn.dist.jars", ADDED)
- val client = createClient(sparkConf, args = Array("--jar", USER))
- doReturn(new Path("/")).when(client).copyFileToRemote(any(classOf[Path]),
- any(classOf[Path]), anyShort(), anyBoolean(), any())
-
- val tempDir = Utils.createTempDir()
- try {
- // Because we mocked "copyFileToRemote" above to avoid having to create fake local files,
- // we need to create a fake config archive in the temp dir to avoid having
- // prepareLocalResources throw an exception.
- new FileOutputStream(new File(tempDir, LOCALIZED_CONF_ARCHIVE)).close()
-
- client.prepareLocalResources(new Path(tempDir.getAbsolutePath()), Nil)
- sparkConf.get(APP_JAR) should be (Some(USER))
-
- // The non-local path should be propagated by name only, since it will end up in the app's
- // staging dir.
- val expected = ADDED.split(",")
- .map(p => {
- val uri = new URI(p)
- if (LOCAL_SCHEME == uri.getScheme()) {
- p
- } else {
- Option(uri.getFragment()).getOrElse(new File(p).getName())
- }
- })
- .mkString(",")
-
- sparkConf.get(SECONDARY_JARS) should be (Some(expected.split(",").toSeq))
- } finally {
- Utils.deleteRecursively(tempDir)
- }
- }
-
- test("Cluster path translation") {
- val conf = new Configuration()
- val sparkConf = new SparkConf()
- .set(SPARK_JARS, Seq("local:/localPath/spark.jar"))
- .set(GATEWAY_ROOT_PATH, "/localPath")
- .set(REPLACEMENT_ROOT_PATH, "/remotePath")
-
- getClusterPath(sparkConf, "/localPath") should be ("/remotePath")
- getClusterPath(sparkConf, "/localPath/1:/localPath/2") should be (
- "/remotePath/1:/remotePath/2")
-
- val env = new MutableHashMap[String, String]()
- populateClasspath(null, conf, sparkConf, env, extraClassPath = Some("/localPath/my1.jar"))
- val cp = classpath(env)
- cp should contain ("/remotePath/spark.jar")
- cp should contain ("/remotePath/my1.jar")
- }
-
- test("configuration and args propagate through createApplicationSubmissionContext") {
- val conf = new Configuration()
- // When parsing tags, duplicates and leading/trailing whitespace should be removed.
- // Spaces between non-comma strings should be preserved as single tags. Empty strings may or
- // may not be removed depending on the version of Hadoop being used.
- val sparkConf = new SparkConf()
- .set(APPLICATION_TAGS.key, ",tag1, dup,tag2 , ,multi word , dup")
- .set(MAX_APP_ATTEMPTS, 42)
- .set("spark.app.name", "foo-test-app")
- .set(QUEUE_NAME, "staging-queue")
- val args = new ClientArguments(Array())
-
- val appContext = Records.newRecord(classOf[ApplicationSubmissionContext])
- val getNewApplicationResponse = Records.newRecord(classOf[GetNewApplicationResponse])
- val containerLaunchContext = Records.newRecord(classOf[ContainerLaunchContext])
-
- val client = new Client(args, conf, sparkConf)
- client.createApplicationSubmissionContext(
- new YarnClientApplication(getNewApplicationResponse, appContext),
- containerLaunchContext)
-
- appContext.getApplicationName should be ("foo-test-app")
- appContext.getQueue should be ("staging-queue")
- appContext.getAMContainerSpec should be (containerLaunchContext)
- appContext.getApplicationType should be ("SPARK")
- appContext.getClass.getMethods.filter(_.getName.equals("getApplicationTags")).foreach{ method =>
- val tags = method.invoke(appContext).asInstanceOf[java.util.Set[String]]
- tags should contain allOf ("tag1", "dup", "tag2", "multi word")
- tags.asScala.count(_.nonEmpty) should be (4)
- }
- appContext.getMaxAppAttempts should be (42)
- }
-
- test("spark.yarn.jars with multiple paths and globs") {
- val libs = Utils.createTempDir()
- val single = Utils.createTempDir()
- val jar1 = TestUtils.createJarWithFiles(Map(), libs)
- val jar2 = TestUtils.createJarWithFiles(Map(), libs)
- val jar3 = TestUtils.createJarWithFiles(Map(), single)
- val jar4 = TestUtils.createJarWithFiles(Map(), single)
-
- val jarsConf = Seq(
- s"${libs.getAbsolutePath()}/*",
- jar3.getPath(),
- s"local:${jar4.getPath()}",
- s"local:${single.getAbsolutePath()}/*")
-
- val sparkConf = new SparkConf().set(SPARK_JARS, jarsConf)
- val client = createClient(sparkConf)
-
- val tempDir = Utils.createTempDir()
- client.prepareLocalResources(new Path(tempDir.getAbsolutePath()), Nil)
-
- assert(sparkConf.get(SPARK_JARS) ===
- Some(Seq(s"local:${jar4.getPath()}", s"local:${single.getAbsolutePath()}/*")))
-
- verify(client).copyFileToRemote(any(classOf[Path]), meq(new Path(jar1.toURI())), anyShort(),
- anyBoolean(), any())
- verify(client).copyFileToRemote(any(classOf[Path]), meq(new Path(jar2.toURI())), anyShort(),
- anyBoolean(), any())
- verify(client).copyFileToRemote(any(classOf[Path]), meq(new Path(jar3.toURI())), anyShort(),
- anyBoolean(), any())
-
- val cp = classpath(client)
- cp should contain (buildPath(PWD, LOCALIZED_LIB_DIR, "*"))
- cp should not contain (jar3.getPath())
- cp should contain (jar4.getPath())
- cp should contain (buildPath(single.getAbsolutePath(), "*"))
- }
-
- test("distribute jars archive") {
- val temp = Utils.createTempDir()
- val archive = TestUtils.createJarWithFiles(Map(), temp)
-
- val sparkConf = new SparkConf().set(SPARK_ARCHIVE, archive.getPath())
- val client = createClient(sparkConf)
- client.prepareLocalResources(new Path(temp.getAbsolutePath()), Nil)
-
- verify(client).copyFileToRemote(any(classOf[Path]), meq(new Path(archive.toURI())), anyShort(),
- anyBoolean(), any())
- classpath(client) should contain (buildPath(PWD, LOCALIZED_LIB_DIR, "*"))
-
- sparkConf.set(SPARK_ARCHIVE, LOCAL_SCHEME + ":" + archive.getPath())
- intercept[IllegalArgumentException] {
- client.prepareLocalResources(new Path(temp.getAbsolutePath()), Nil)
- }
- }
-
- test("distribute archive multiple times") {
- val libs = Utils.createTempDir()
- // Create jars dir and RELEASE file to avoid IllegalStateException.
- val jarsDir = new File(libs, "jars")
- assert(jarsDir.mkdir())
- new FileOutputStream(new File(libs, "RELEASE")).close()
-
- val userLib1 = Utils.createTempDir()
- val testJar = TestUtils.createJarWithFiles(Map(), userLib1)
-
- // Case 1: FILES_TO_DISTRIBUTE and ARCHIVES_TO_DISTRIBUTE can't have duplicate files
- val sparkConf = new SparkConfWithEnv(Map("SPARK_HOME" -> libs.getAbsolutePath))
- .set(FILES_TO_DISTRIBUTE, Seq(testJar.getPath))
- .set(ARCHIVES_TO_DISTRIBUTE, Seq(testJar.getPath))
-
- val client = createClient(sparkConf)
- val tempDir = Utils.createTempDir()
- intercept[IllegalArgumentException] {
- client.prepareLocalResources(new Path(tempDir.getAbsolutePath()), Nil)
- }
-
- // Case 2: FILES_TO_DISTRIBUTE can't have duplicate files.
- val sparkConfFiles = new SparkConfWithEnv(Map("SPARK_HOME" -> libs.getAbsolutePath))
- .set(FILES_TO_DISTRIBUTE, Seq(testJar.getPath, testJar.getPath))
-
- val clientFiles = createClient(sparkConfFiles)
- val tempDirForFiles = Utils.createTempDir()
- intercept[IllegalArgumentException] {
- clientFiles.prepareLocalResources(new Path(tempDirForFiles.getAbsolutePath()), Nil)
- }
-
- // Case 3: ARCHIVES_TO_DISTRIBUTE can't have duplicate files.
- val sparkConfArchives = new SparkConfWithEnv(Map("SPARK_HOME" -> libs.getAbsolutePath))
- .set(ARCHIVES_TO_DISTRIBUTE, Seq(testJar.getPath, testJar.getPath))
-
- val clientArchives = createClient(sparkConfArchives)
- val tempDirForArchives = Utils.createTempDir()
- intercept[IllegalArgumentException] {
- clientArchives.prepareLocalResources(new Path(tempDirForArchives.getAbsolutePath()), Nil)
- }
-
- // Case 4: FILES_TO_DISTRIBUTE can have unique file.
- val sparkConfFilesUniq = new SparkConfWithEnv(Map("SPARK_HOME" -> libs.getAbsolutePath))
- .set(FILES_TO_DISTRIBUTE, Seq(testJar.getPath))
-
- val clientFilesUniq = createClient(sparkConfFilesUniq)
- val tempDirForFilesUniq = Utils.createTempDir()
- clientFilesUniq.prepareLocalResources(new Path(tempDirForFilesUniq.getAbsolutePath()), Nil)
-
- // Case 5: ARCHIVES_TO_DISTRIBUTE can have unique file.
- val sparkConfArchivesUniq = new SparkConfWithEnv(Map("SPARK_HOME" -> libs.getAbsolutePath))
- .set(ARCHIVES_TO_DISTRIBUTE, Seq(testJar.getPath))
-
- val clientArchivesUniq = createClient(sparkConfArchivesUniq)
- val tempDirArchivesUniq = Utils.createTempDir()
- clientArchivesUniq.prepareLocalResources(new Path(tempDirArchivesUniq.getAbsolutePath()), Nil)
-
- }
-
- test("distribute local spark jars") {
- val temp = Utils.createTempDir()
- val jarsDir = new File(temp, "jars")
- assert(jarsDir.mkdir())
- val jar = TestUtils.createJarWithFiles(Map(), jarsDir)
- new FileOutputStream(new File(temp, "RELEASE")).close()
-
- val sparkConf = new SparkConfWithEnv(Map("SPARK_HOME" -> temp.getAbsolutePath()))
- val client = createClient(sparkConf)
- client.prepareLocalResources(new Path(temp.getAbsolutePath()), Nil)
- classpath(client) should contain (buildPath(PWD, LOCALIZED_LIB_DIR, "*"))
- }
-
- test("ignore same name jars") {
- val libs = Utils.createTempDir()
- val jarsDir = new File(libs, "jars")
- assert(jarsDir.mkdir())
- new FileOutputStream(new File(libs, "RELEASE")).close()
- val userLib1 = Utils.createTempDir()
- val userLib2 = Utils.createTempDir()
-
- val jar1 = TestUtils.createJarWithFiles(Map(), jarsDir)
- val jar2 = TestUtils.createJarWithFiles(Map(), userLib1)
- // Copy jar2 to jar3 with same name
- val jar3 = {
- val target = new File(userLib2, new File(jar2.toURI).getName)
- val input = new FileInputStream(jar2.getPath)
- val output = new FileOutputStream(target)
- Utils.copyStream(input, output, closeStreams = true)
- target.toURI.toURL
- }
-
- val sparkConf = new SparkConfWithEnv(Map("SPARK_HOME" -> libs.getAbsolutePath))
- .set(JARS_TO_DISTRIBUTE, Seq(jar2.getPath, jar3.getPath))
-
- val client = createClient(sparkConf)
- val tempDir = Utils.createTempDir()
- client.prepareLocalResources(new Path(tempDir.getAbsolutePath()), Nil)
-
- // Only jar2 will be added to SECONDARY_JARS, jar3 which has the same name with jar2 will be
- // ignored.
- sparkConf.get(SECONDARY_JARS) should be (Some(Seq(new File(jar2.toURI).getName)))
- }
-
- object Fixtures {
-
- val knownDefYarnAppCP: Seq[String] =
- getFieldValue[Array[String], Seq[String]](classOf[YarnConfiguration],
- "DEFAULT_YARN_APPLICATION_CLASSPATH",
- Seq[String]())(a => a.toSeq)
-
-
- val knownDefMRAppCP: Seq[String] =
- getFieldValue2[String, Array[String], Seq[String]](
- classOf[MRJobConfig],
- "DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH",
- Seq[String]())(a => a.split(","))(a => a.toSeq)
-
- val knownYARNAppCP = Some(Seq("/known/yarn/path"))
-
- val knownMRAppCP = Some(Seq("/known/mr/path"))
-
- val mapMRAppConf =
- Map("mapreduce.application.classpath" -> knownMRAppCP.map(_.mkString(":")).get)
-
- val mapYARNAppConf =
- Map(YarnConfiguration.YARN_APPLICATION_CLASSPATH -> knownYARNAppCP.map(_.mkString(":")).get)
-
- val mapAppConf = mapYARNAppConf ++ mapMRAppConf
- }
-
- def withAppConf(m: Map[String, String] = Map())(testCode: (Configuration) => Any) {
- val conf = new Configuration
- m.foreach { case (k, v) => conf.set(k, v, "ClientSpec") }
- testCode(conf)
- }
-
- def newEnv: MutableHashMap[String, String] = MutableHashMap[String, String]()
-
- def classpath(env: MutableHashMap[String, String]): Array[String] =
- env(Environment.CLASSPATH.name).split(":|;|<CPS>")
-
- def flatten(a: Option[Seq[String]], b: Option[Seq[String]]): Array[String] =
- (a ++ b).flatten.toArray
-
- def getFieldValue[A, B](clazz: Class[_], field: String, defaults: => B)(mapTo: A => B): B = {
- Try(clazz.getField(field))
- .map(_.get(null).asInstanceOf[A])
- .toOption
- .map(mapTo)
- .getOrElse(defaults)
- }
-
- def getFieldValue2[A: ClassTag, A1: ClassTag, B](
- clazz: Class[_],
- field: String,
- defaults: => B)(mapTo: A => B)(mapTo1: A1 => B): B = {
- Try(clazz.getField(field)).map(_.get(null)).map {
- case v: A => mapTo(v)
- case v1: A1 => mapTo1(v1)
- case _ => defaults
- }.toOption.getOrElse(defaults)
- }
-
- private def createClient(
- sparkConf: SparkConf,
- conf: Configuration = new Configuration(),
- args: Array[String] = Array()): Client = {
- val clientArgs = new ClientArguments(args)
- spy(new Client(clientArgs, conf, sparkConf))
- }
-
- private def classpath(client: Client): Array[String] = {
- val env = new MutableHashMap[String, String]()
- populateClasspath(null, client.hadoopConf, client.sparkConf, env)
- classpath(env)
- }
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/deploy/yarn/ContainerPlacementStrategySuite.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ContainerPlacementStrategySuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/ContainerPlacementStrategySuite.scala
deleted file mode 100644
index afb4b69..0000000
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/ContainerPlacementStrategySuite.scala
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
-import org.scalatest.{BeforeAndAfterEach, Matchers}
-
-import org.apache.spark.SparkFunSuite
-
-class ContainerPlacementStrategySuite extends SparkFunSuite with Matchers with BeforeAndAfterEach {
-
- private val yarnAllocatorSuite = new YarnAllocatorSuite
- import yarnAllocatorSuite._
-
- def createContainerRequest(nodes: Array[String]): ContainerRequest =
- new ContainerRequest(containerResource, nodes, null, YarnSparkHadoopUtil.RM_REQUEST_PRIORITY)
-
- override def beforeEach() {
- yarnAllocatorSuite.beforeEach()
- }
-
- override def afterEach() {
- yarnAllocatorSuite.afterEach()
- }
-
- test("allocate locality preferred containers with enough resource and no matched existed " +
- "containers") {
- // 1. All the locations of current containers cannot satisfy the new requirements
- // 2. Current requested container number can fully satisfy the pending tasks.
-
- val handler = createAllocator(2)
- handler.updateResourceRequests()
- handler.handleAllocatedContainers(Array(createContainer("host1"), createContainer("host2")))
-
- val localities = handler.containerPlacementStrategy.localityOfRequestedContainers(
- 3, 15, Map("host3" -> 15, "host4" -> 15, "host5" -> 10),
- handler.allocatedHostToContainersMap, Seq.empty)
-
- assert(localities.map(_.nodes) === Array(
- Array("host3", "host4", "host5"),
- Array("host3", "host4", "host5"),
- Array("host3", "host4")))
- }
-
- test("allocate locality preferred containers with enough resource and partially matched " +
- "containers") {
- // 1. Parts of current containers' locations can satisfy the new requirements
- // 2. Current requested container number can fully satisfy the pending tasks.
-
- val handler = createAllocator(3)
- handler.updateResourceRequests()
- handler.handleAllocatedContainers(Array(
- createContainer("host1"),
- createContainer("host1"),
- createContainer("host2")
- ))
-
- val localities = handler.containerPlacementStrategy.localityOfRequestedContainers(
- 3, 15, Map("host1" -> 15, "host2" -> 15, "host3" -> 10),
- handler.allocatedHostToContainersMap, Seq.empty)
-
- assert(localities.map(_.nodes) ===
- Array(null, Array("host2", "host3"), Array("host2", "host3")))
- }
-
- test("allocate locality preferred containers with limited resource and partially matched " +
- "containers") {
- // 1. Parts of current containers' locations can satisfy the new requirements
- // 2. Current requested container number cannot fully satisfy the pending tasks.
-
- val handler = createAllocator(3)
- handler.updateResourceRequests()
- handler.handleAllocatedContainers(Array(
- createContainer("host1"),
- createContainer("host1"),
- createContainer("host2")
- ))
-
- val localities = handler.containerPlacementStrategy.localityOfRequestedContainers(
- 1, 15, Map("host1" -> 15, "host2" -> 15, "host3" -> 10),
- handler.allocatedHostToContainersMap, Seq.empty)
-
- assert(localities.map(_.nodes) === Array(Array("host2", "host3")))
- }
-
- test("allocate locality preferred containers with fully matched containers") {
- // Current containers' locations can fully satisfy the new requirements
-
- val handler = createAllocator(5)
- handler.updateResourceRequests()
- handler.handleAllocatedContainers(Array(
- createContainer("host1"),
- createContainer("host1"),
- createContainer("host2"),
- createContainer("host2"),
- createContainer("host3")
- ))
-
- val localities = handler.containerPlacementStrategy.localityOfRequestedContainers(
- 3, 15, Map("host1" -> 15, "host2" -> 15, "host3" -> 10),
- handler.allocatedHostToContainersMap, Seq.empty)
-
- assert(localities.map(_.nodes) === Array(null, null, null))
- }
-
- test("allocate containers with no locality preference") {
- // Request new container without locality preference
-
- val handler = createAllocator(2)
- handler.updateResourceRequests()
- handler.handleAllocatedContainers(Array(createContainer("host1"), createContainer("host2")))
-
- val localities = handler.containerPlacementStrategy.localityOfRequestedContainers(
- 1, 0, Map.empty, handler.allocatedHostToContainersMap, Seq.empty)
-
- assert(localities.map(_.nodes) === Array(null))
- }
-
- test("allocate locality preferred containers by considering the localities of pending requests") {
- val handler = createAllocator(3)
- handler.updateResourceRequests()
- handler.handleAllocatedContainers(Array(
- createContainer("host1"),
- createContainer("host1"),
- createContainer("host2")
- ))
-
- val pendingAllocationRequests = Seq(
- createContainerRequest(Array("host2", "host3")),
- createContainerRequest(Array("host1", "host4")))
-
- val localities = handler.containerPlacementStrategy.localityOfRequestedContainers(
- 1, 15, Map("host1" -> 15, "host2" -> 15, "host3" -> 10),
- handler.allocatedHostToContainersMap, pendingAllocationRequests)
-
- assert(localities.map(_.nodes) === Array(Array("host3")))
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
deleted file mode 100644
index 994dc75..0000000
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import java.util.{Arrays, List => JList}
-
-import org.apache.hadoop.fs.CommonConfigurationKeysPublic
-import org.apache.hadoop.net.DNSToSwitchMapping
-import org.apache.hadoop.yarn.api.records._
-import org.apache.hadoop.yarn.client.api.AMRMClient
-import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
-import org.apache.hadoop.yarn.conf.YarnConfiguration
-import org.mockito.Mockito._
-import org.scalatest.{BeforeAndAfterEach, Matchers}
-
-import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite}
-import org.apache.spark.deploy.yarn.config._
-import org.apache.spark.deploy.yarn.YarnAllocator._
-import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._
-import org.apache.spark.rpc.RpcEndpointRef
-import org.apache.spark.scheduler.SplitInfo
-import org.apache.spark.util.ManualClock
-
-class MockResolver extends DNSToSwitchMapping {
-
- override def resolve(names: JList[String]): JList[String] = {
- if (names.size > 0 && names.get(0) == "host3") Arrays.asList("/rack2")
- else Arrays.asList("/rack1")
- }
-
- override def reloadCachedMappings() {}
-
- def reloadCachedMappings(names: JList[String]) {}
-}
-
-class YarnAllocatorSuite extends SparkFunSuite with Matchers with BeforeAndAfterEach {
- val conf = new YarnConfiguration()
- conf.setClass(
- CommonConfigurationKeysPublic.NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY,
- classOf[MockResolver], classOf[DNSToSwitchMapping])
-
- val sparkConf = new SparkConf()
- sparkConf.set("spark.driver.host", "localhost")
- sparkConf.set("spark.driver.port", "4040")
- sparkConf.set(SPARK_JARS, Seq("notarealjar.jar"))
- sparkConf.set("spark.yarn.launchContainers", "false")
-
- val appAttemptId = ApplicationAttemptId.newInstance(ApplicationId.newInstance(0, 0), 0)
-
- // Resource returned by YARN. YARN can give larger containers than requested, so give 6 cores
- // instead of the 5 requested and 3 GB instead of the 2 requested.
- val containerResource = Resource.newInstance(3072, 6)
-
- var rmClient: AMRMClient[ContainerRequest] = _
-
- var containerNum = 0
-
- override def beforeEach() {
- super.beforeEach()
- rmClient = AMRMClient.createAMRMClient()
- rmClient.init(conf)
- rmClient.start()
- }
-
- override def afterEach() {
- try {
- rmClient.stop()
- } finally {
- super.afterEach()
- }
- }
-
- class MockSplitInfo(host: String) extends SplitInfo(null, host, null, 1, null) {
- override def hashCode(): Int = 0
- override def equals(other: Any): Boolean = false
- }
-
- def createAllocator(maxExecutors: Int = 5): YarnAllocator = {
- val args = Array(
- "--jar", "somejar.jar",
- "--class", "SomeClass")
- val sparkConfClone = sparkConf.clone()
- sparkConfClone
- .set("spark.executor.instances", maxExecutors.toString)
- .set("spark.executor.cores", "5")
- .set("spark.executor.memory", "2048")
- new YarnAllocator(
- "not used",
- mock(classOf[RpcEndpointRef]),
- conf,
- sparkConfClone,
- rmClient,
- appAttemptId,
- new SecurityManager(sparkConf),
- Map())
- }
-
- def createContainer(host: String): Container = {
- // When YARN 2.6+ is required, avoid deprecation by using version with long second arg
- val containerId = ContainerId.newInstance(appAttemptId, containerNum)
- containerNum += 1
- val nodeId = NodeId.newInstance(host, 1000)
- Container.newInstance(containerId, nodeId, "", containerResource, RM_REQUEST_PRIORITY, null)
- }
-
- test("single container allocated") {
- // request a single container and receive it
- val handler = createAllocator(1)
- handler.updateResourceRequests()
- handler.getNumExecutorsRunning should be (0)
- handler.getPendingAllocate.size should be (1)
-
- val container = createContainer("host1")
- handler.handleAllocatedContainers(Array(container))
-
- handler.getNumExecutorsRunning should be (1)
- handler.allocatedContainerToHostMap.get(container.getId).get should be ("host1")
- handler.allocatedHostToContainersMap.get("host1").get should contain (container.getId)
-
- val size = rmClient.getMatchingRequests(container.getPriority, "host1", containerResource).size
- size should be (0)
- }
-
- test("container should not be created if requested number if met") {
- // request a single container and receive it
- val handler = createAllocator(1)
- handler.updateResourceRequests()
- handler.getNumExecutorsRunning should be (0)
- handler.getPendingAllocate.size should be (1)
-
- val container = createContainer("host1")
- handler.handleAllocatedContainers(Array(container))
-
- handler.getNumExecutorsRunning should be (1)
- handler.allocatedContainerToHostMap.get(container.getId).get should be ("host1")
- handler.allocatedHostToContainersMap.get("host1").get should contain (container.getId)
-
- val container2 = createContainer("host2")
- handler.handleAllocatedContainers(Array(container2))
- handler.getNumExecutorsRunning should be (1)
- }
-
- test("some containers allocated") {
- // request a few containers and receive some of them
- val handler = createAllocator(4)
- handler.updateResourceRequests()
- handler.getNumExecutorsRunning should be (0)
- handler.getPendingAllocate.size should be (4)
-
- val container1 = createContainer("host1")
- val container2 = createContainer("host1")
- val container3 = createContainer("host2")
- handler.handleAllocatedContainers(Array(container1, container2, container3))
-
- handler.getNumExecutorsRunning should be (3)
- handler.allocatedContainerToHostMap.get(container1.getId).get should be ("host1")
- handler.allocatedContainerToHostMap.get(container2.getId).get should be ("host1")
- handler.allocatedContainerToHostMap.get(container3.getId).get should be ("host2")
- handler.allocatedHostToContainersMap.get("host1").get should contain (container1.getId)
- handler.allocatedHostToContainersMap.get("host1").get should contain (container2.getId)
- handler.allocatedHostToContainersMap.get("host2").get should contain (container3.getId)
- }
-
- test("receive more containers than requested") {
- val handler = createAllocator(2)
- handler.updateResourceRequests()
- handler.getNumExecutorsRunning should be (0)
- handler.getPendingAllocate.size should be (2)
-
- val container1 = createContainer("host1")
- val container2 = createContainer("host2")
- val container3 = createContainer("host4")
- handler.handleAllocatedContainers(Array(container1, container2, container3))
-
- handler.getNumExecutorsRunning should be (2)
- handler.allocatedContainerToHostMap.get(container1.getId).get should be ("host1")
- handler.allocatedContainerToHostMap.get(container2.getId).get should be ("host2")
- handler.allocatedContainerToHostMap.contains(container3.getId) should be (false)
- handler.allocatedHostToContainersMap.get("host1").get should contain (container1.getId)
- handler.allocatedHostToContainersMap.get("host2").get should contain (container2.getId)
- handler.allocatedHostToContainersMap.contains("host4") should be (false)
- }
-
- test("decrease total requested executors") {
- val handler = createAllocator(4)
- handler.updateResourceRequests()
- handler.getNumExecutorsRunning should be (0)
- handler.getPendingAllocate.size should be (4)
-
- handler.requestTotalExecutorsWithPreferredLocalities(3, 0, Map.empty)
- handler.updateResourceRequests()
- handler.getPendingAllocate.size should be (3)
-
- val container = createContainer("host1")
- handler.handleAllocatedContainers(Array(container))
-
- handler.getNumExecutorsRunning should be (1)
- handler.allocatedContainerToHostMap.get(container.getId).get should be ("host1")
- handler.allocatedHostToContainersMap.get("host1").get should contain (container.getId)
-
- handler.requestTotalExecutorsWithPreferredLocalities(2, 0, Map.empty)
- handler.updateResourceRequests()
- handler.getPendingAllocate.size should be (1)
- }
-
- test("decrease total requested executors to less than currently running") {
- val handler = createAllocator(4)
- handler.updateResourceRequests()
- handler.getNumExecutorsRunning should be (0)
- handler.getPendingAllocate.size should be (4)
-
- handler.requestTotalExecutorsWithPreferredLocalities(3, 0, Map.empty)
- handler.updateResourceRequests()
- handler.getPendingAllocate.size should be (3)
-
- val container1 = createContainer("host1")
- val container2 = createContainer("host2")
- handler.handleAllocatedContainers(Array(container1, container2))
-
- handler.getNumExecutorsRunning should be (2)
-
- handler.requestTotalExecutorsWithPreferredLocalities(1, 0, Map.empty)
- handler.updateResourceRequests()
- handler.getPendingAllocate.size should be (0)
- handler.getNumExecutorsRunning should be (2)
- }
-
- test("kill executors") {
- val handler = createAllocator(4)
- handler.updateResourceRequests()
- handler.getNumExecutorsRunning should be (0)
- handler.getPendingAllocate.size should be (4)
-
- val container1 = createContainer("host1")
- val container2 = createContainer("host2")
- handler.handleAllocatedContainers(Array(container1, container2))
-
- handler.requestTotalExecutorsWithPreferredLocalities(1, 0, Map.empty)
- handler.executorIdToContainer.keys.foreach { id => handler.killExecutor(id ) }
-
- val statuses = Seq(container1, container2).map { c =>
- ContainerStatus.newInstance(c.getId(), ContainerState.COMPLETE, "Finished", 0)
- }
- handler.updateResourceRequests()
- handler.processCompletedContainers(statuses.toSeq)
- handler.getNumExecutorsRunning should be (0)
- handler.getPendingAllocate.size should be (1)
- }
-
- test("lost executor removed from backend") {
- val handler = createAllocator(4)
- handler.updateResourceRequests()
- handler.getNumExecutorsRunning should be (0)
- handler.getPendingAllocate.size should be (4)
-
- val container1 = createContainer("host1")
- val container2 = createContainer("host2")
- handler.handleAllocatedContainers(Array(container1, container2))
-
- handler.requestTotalExecutorsWithPreferredLocalities(2, 0, Map())
-
- val statuses = Seq(container1, container2).map { c =>
- ContainerStatus.newInstance(c.getId(), ContainerState.COMPLETE, "Failed", -1)
- }
- handler.updateResourceRequests()
- handler.processCompletedContainers(statuses.toSeq)
- handler.updateResourceRequests()
- handler.getNumExecutorsRunning should be (0)
- handler.getPendingAllocate.size should be (2)
- handler.getNumExecutorsFailed should be (2)
- handler.getNumUnexpectedContainerRelease should be (2)
- }
-
- test("memory exceeded diagnostic regexes") {
- val diagnostics =
- "Container [pid=12465,containerID=container_1412887393566_0003_01_000002] is running " +
- "beyond physical memory limits. Current usage: 2.1 MB of 2 GB physical memory used; " +
- "5.8 GB of 4.2 GB virtual memory used. Killing container."
- val vmemMsg = memLimitExceededLogMessage(diagnostics, VMEM_EXCEEDED_PATTERN)
- val pmemMsg = memLimitExceededLogMessage(diagnostics, PMEM_EXCEEDED_PATTERN)
- assert(vmemMsg.contains("5.8 GB of 4.2 GB virtual memory used."))
- assert(pmemMsg.contains("2.1 MB of 2 GB physical memory used."))
- }
-
- test("window based failure executor counting") {
- sparkConf.set("spark.yarn.executor.failuresValidityInterval", "100s")
- val handler = createAllocator(4)
- val clock = new ManualClock(0L)
- handler.setClock(clock)
-
- handler.updateResourceRequests()
- handler.getNumExecutorsRunning should be (0)
- handler.getPendingAllocate.size should be (4)
-
- val containers = Seq(
- createContainer("host1"),
- createContainer("host2"),
- createContainer("host3"),
- createContainer("host4")
- )
- handler.handleAllocatedContainers(containers)
-
- val failedStatuses = containers.map { c =>
- ContainerStatus.newInstance(c.getId, ContainerState.COMPLETE, "Failed", -1)
- }
-
- handler.getNumExecutorsFailed should be (0)
-
- clock.advance(100 * 1000L)
- handler.processCompletedContainers(failedStatuses.slice(0, 1))
- handler.getNumExecutorsFailed should be (1)
-
- clock.advance(101 * 1000L)
- handler.getNumExecutorsFailed should be (0)
-
- handler.processCompletedContainers(failedStatuses.slice(1, 3))
- handler.getNumExecutorsFailed should be (2)
-
- clock.advance(50 * 1000L)
- handler.processCompletedContainers(failedStatuses.slice(3, 4))
- handler.getNumExecutorsFailed should be (3)
-
- clock.advance(51 * 1000L)
- handler.getNumExecutorsFailed should be (1)
-
- clock.advance(50 * 1000L)
- handler.getNumExecutorsFailed should be (0)
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
deleted file mode 100644
index 99fb58a..0000000
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala
+++ /dev/null
@@ -1,493 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import java.io.File
-import java.net.URL
-import java.nio.charset.StandardCharsets
-import java.util.{HashMap => JHashMap}
-
-import scala.collection.mutable
-import scala.concurrent.duration._
-import scala.language.postfixOps
-
-import com.google.common.io.{ByteStreams, Files}
-import org.apache.hadoop.yarn.conf.YarnConfiguration
-import org.scalatest.Matchers
-import org.scalatest.concurrent.Eventually._
-
-import org.apache.spark._
-import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.deploy.yarn.config._
-import org.apache.spark.internal.Logging
-import org.apache.spark.launcher._
-import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationStart,
- SparkListenerExecutorAdded}
-import org.apache.spark.scheduler.cluster.ExecutorInfo
-import org.apache.spark.tags.ExtendedYarnTest
-import org.apache.spark.util.Utils
-
-/**
- * Integration tests for YARN; these tests use a mini Yarn cluster to run Spark-on-YARN
- * applications, and require the Spark assembly to be built before they can be successfully
- * run.
- */
-@ExtendedYarnTest
-class YarnClusterSuite extends BaseYarnClusterSuite {
-
- override def newYarnConfig(): YarnConfiguration = new YarnConfiguration()
-
- private val TEST_PYFILE = """
- |import mod1, mod2
- |import sys
- |from operator import add
- |
- |from pyspark import SparkConf , SparkContext
- |if __name__ == "__main__":
- | if len(sys.argv) != 2:
- | print >> sys.stderr, "Usage: test.py [result file]"
- | exit(-1)
- | sc = SparkContext(conf=SparkConf())
- | status = open(sys.argv[1],'w')
- | result = "failure"
- | rdd = sc.parallelize(range(10)).map(lambda x: x * mod1.func() * mod2.func())
- | cnt = rdd.count()
- | if cnt == 10:
- | result = "success"
- | status.write(result)
- | status.close()
- | sc.stop()
- """.stripMargin
-
- private val TEST_PYMODULE = """
- |def func():
- | return 42
- """.stripMargin
-
- test("run Spark in yarn-client mode") {
- testBasicYarnApp(true)
- }
-
- test("run Spark in yarn-cluster mode") {
- testBasicYarnApp(false)
- }
-
- test("run Spark in yarn-client mode with different configurations") {
- testBasicYarnApp(true,
- Map(
- "spark.driver.memory" -> "512m",
- "spark.executor.cores" -> "1",
- "spark.executor.memory" -> "512m",
- "spark.executor.instances" -> "2"
- ))
- }
-
- test("run Spark in yarn-cluster mode with different configurations") {
- testBasicYarnApp(false,
- Map(
- "spark.driver.memory" -> "512m",
- "spark.driver.cores" -> "1",
- "spark.executor.cores" -> "1",
- "spark.executor.memory" -> "512m",
- "spark.executor.instances" -> "2"
- ))
- }
-
- test("run Spark in yarn-cluster mode with using SparkHadoopUtil.conf") {
- testYarnAppUseSparkHadoopUtilConf()
- }
-
- test("run Spark in yarn-client mode with additional jar") {
- testWithAddJar(true)
- }
-
- test("run Spark in yarn-cluster mode with additional jar") {
- testWithAddJar(false)
- }
-
- test("run Spark in yarn-cluster mode unsuccessfully") {
- // Don't provide arguments so the driver will fail.
- val finalState = runSpark(false, mainClassName(YarnClusterDriver.getClass))
- finalState should be (SparkAppHandle.State.FAILED)
- }
-
- test("run Spark in yarn-cluster mode failure after sc initialized") {
- val finalState = runSpark(false, mainClassName(YarnClusterDriverWithFailure.getClass))
- finalState should be (SparkAppHandle.State.FAILED)
- }
-
- test("run Python application in yarn-client mode") {
- testPySpark(true)
- }
-
- test("run Python application in yarn-cluster mode") {
- testPySpark(false)
- }
-
- test("run Python application in yarn-cluster mode using " +
- " spark.yarn.appMasterEnv to override local envvar") {
- testPySpark(
- clientMode = false,
- extraConf = Map(
- "spark.yarn.appMasterEnv.PYSPARK_DRIVER_PYTHON"
- -> sys.env.getOrElse("PYSPARK_DRIVER_PYTHON", "python"),
- "spark.yarn.appMasterEnv.PYSPARK_PYTHON"
- -> sys.env.getOrElse("PYSPARK_PYTHON", "python")),
- extraEnv = Map(
- "PYSPARK_DRIVER_PYTHON" -> "not python",
- "PYSPARK_PYTHON" -> "not python"))
- }
-
- test("user class path first in client mode") {
- testUseClassPathFirst(true)
- }
-
- test("user class path first in cluster mode") {
- testUseClassPathFirst(false)
- }
-
- test("monitor app using launcher library") {
- val env = new JHashMap[String, String]()
- env.put("YARN_CONF_DIR", hadoopConfDir.getAbsolutePath())
-
- val propsFile = createConfFile()
- val handle = new SparkLauncher(env)
- .setSparkHome(sys.props("spark.test.home"))
- .setConf("spark.ui.enabled", "false")
- .setPropertiesFile(propsFile)
- .setMaster("yarn")
- .setDeployMode("client")
- .setAppResource(SparkLauncher.NO_RESOURCE)
- .setMainClass(mainClassName(YarnLauncherTestApp.getClass))
- .startApplication()
-
- try {
- eventually(timeout(30 seconds), interval(100 millis)) {
- handle.getState() should be (SparkAppHandle.State.RUNNING)
- }
-
- handle.getAppId() should not be (null)
- handle.getAppId() should startWith ("application_")
- handle.stop()
-
- eventually(timeout(30 seconds), interval(100 millis)) {
- handle.getState() should be (SparkAppHandle.State.KILLED)
- }
- } finally {
- handle.kill()
- }
- }
-
- test("timeout to get SparkContext in cluster mode triggers failure") {
- val timeout = 2000
- val finalState = runSpark(false, mainClassName(SparkContextTimeoutApp.getClass),
- appArgs = Seq((timeout * 4).toString),
- extraConf = Map(AM_MAX_WAIT_TIME.key -> timeout.toString))
- finalState should be (SparkAppHandle.State.FAILED)
- }
-
- private def testBasicYarnApp(clientMode: Boolean, conf: Map[String, String] = Map()): Unit = {
- val result = File.createTempFile("result", null, tempDir)
- val finalState = runSpark(clientMode, mainClassName(YarnClusterDriver.getClass),
- appArgs = Seq(result.getAbsolutePath()),
- extraConf = conf)
- checkResult(finalState, result)
- }
-
- private def testYarnAppUseSparkHadoopUtilConf(): Unit = {
- val result = File.createTempFile("result", null, tempDir)
- val finalState = runSpark(false,
- mainClassName(YarnClusterDriverUseSparkHadoopUtilConf.getClass),
- appArgs = Seq("key=value", result.getAbsolutePath()),
- extraConf = Map("spark.hadoop.key" -> "value"))
- checkResult(finalState, result)
- }
-
- private def testWithAddJar(clientMode: Boolean): Unit = {
- val originalJar = TestUtils.createJarWithFiles(Map("test.resource" -> "ORIGINAL"), tempDir)
- val driverResult = File.createTempFile("driver", null, tempDir)
- val executorResult = File.createTempFile("executor", null, tempDir)
- val finalState = runSpark(clientMode, mainClassName(YarnClasspathTest.getClass),
- appArgs = Seq(driverResult.getAbsolutePath(), executorResult.getAbsolutePath()),
- extraClassPath = Seq(originalJar.getPath()),
- extraJars = Seq("local:" + originalJar.getPath()))
- checkResult(finalState, driverResult, "ORIGINAL")
- checkResult(finalState, executorResult, "ORIGINAL")
- }
-
- private def testPySpark(
- clientMode: Boolean,
- extraConf: Map[String, String] = Map(),
- extraEnv: Map[String, String] = Map()): Unit = {
- val primaryPyFile = new File(tempDir, "test.py")
- Files.write(TEST_PYFILE, primaryPyFile, StandardCharsets.UTF_8)
-
- // When running tests, let's not assume the user has built the assembly module, which also
- // creates the pyspark archive. Instead, let's use PYSPARK_ARCHIVES_PATH to point at the
- // needed locations.
- val sparkHome = sys.props("spark.test.home")
- val pythonPath = Seq(
- s"$sparkHome/python/lib/py4j-0.10.4-src.zip",
- s"$sparkHome/python")
- val extraEnvVars = Map(
- "PYSPARK_ARCHIVES_PATH" -> pythonPath.map("local:" + _).mkString(File.pathSeparator),
- "PYTHONPATH" -> pythonPath.mkString(File.pathSeparator)) ++ extraEnv
-
- val moduleDir =
- if (clientMode) {
- // In client-mode, .py files added with --py-files are not visible in the driver.
- // This is something that the launcher library would have to handle.
- tempDir
- } else {
- val subdir = new File(tempDir, "pyModules")
- subdir.mkdir()
- subdir
- }
- val pyModule = new File(moduleDir, "mod1.py")
- Files.write(TEST_PYMODULE, pyModule, StandardCharsets.UTF_8)
-
- val mod2Archive = TestUtils.createJarWithFiles(Map("mod2.py" -> TEST_PYMODULE), moduleDir)
- val pyFiles = Seq(pyModule.getAbsolutePath(), mod2Archive.getPath()).mkString(",")
- val result = File.createTempFile("result", null, tempDir)
-
- val finalState = runSpark(clientMode, primaryPyFile.getAbsolutePath(),
- sparkArgs = Seq("--py-files" -> pyFiles),
- appArgs = Seq(result.getAbsolutePath()),
- extraEnv = extraEnvVars,
- extraConf = extraConf)
- checkResult(finalState, result)
- }
-
- private def testUseClassPathFirst(clientMode: Boolean): Unit = {
- // Create a jar file that contains a different version of "test.resource".
- val originalJar = TestUtils.createJarWithFiles(Map("test.resource" -> "ORIGINAL"), tempDir)
- val userJar = TestUtils.createJarWithFiles(Map("test.resource" -> "OVERRIDDEN"), tempDir)
- val driverResult = File.createTempFile("driver", null, tempDir)
- val executorResult = File.createTempFile("executor", null, tempDir)
- val finalState = runSpark(clientMode, mainClassName(YarnClasspathTest.getClass),
- appArgs = Seq(driverResult.getAbsolutePath(), executorResult.getAbsolutePath()),
- extraClassPath = Seq(originalJar.getPath()),
- extraJars = Seq("local:" + userJar.getPath()),
- extraConf = Map(
- "spark.driver.userClassPathFirst" -> "true",
- "spark.executor.userClassPathFirst" -> "true"))
- checkResult(finalState, driverResult, "OVERRIDDEN")
- checkResult(finalState, executorResult, "OVERRIDDEN")
- }
-
-}
-
-private[spark] class SaveExecutorInfo extends SparkListener {
- val addedExecutorInfos = mutable.Map[String, ExecutorInfo]()
- var driverLogs: Option[collection.Map[String, String]] = None
-
- override def onExecutorAdded(executor: SparkListenerExecutorAdded) {
- addedExecutorInfos(executor.executorId) = executor.executorInfo
- }
-
- override def onApplicationStart(appStart: SparkListenerApplicationStart): Unit = {
- driverLogs = appStart.driverLogs
- }
-}
-
-private object YarnClusterDriverWithFailure extends Logging with Matchers {
- def main(args: Array[String]): Unit = {
- val sc = new SparkContext(new SparkConf()
- .set("spark.extraListeners", classOf[SaveExecutorInfo].getName)
- .setAppName("yarn test with failure"))
-
- throw new Exception("exception after sc initialized")
- }
-}
-
-private object YarnClusterDriverUseSparkHadoopUtilConf extends Logging with Matchers {
- def main(args: Array[String]): Unit = {
- if (args.length != 2) {
- // scalastyle:off println
- System.err.println(
- s"""
- |Invalid command line: ${args.mkString(" ")}
- |
- |Usage: YarnClusterDriverUseSparkHadoopUtilConf [hadoopConfKey=value] [result file]
- """.stripMargin)
- // scalastyle:on println
- System.exit(1)
- }
-
- val sc = new SparkContext(new SparkConf()
- .set("spark.extraListeners", classOf[SaveExecutorInfo].getName)
- .setAppName("yarn test using SparkHadoopUtil's conf"))
-
- val kv = args(0).split("=")
- val status = new File(args(1))
- var result = "failure"
- try {
- SparkHadoopUtil.get.conf.get(kv(0)) should be (kv(1))
- result = "success"
- } finally {
- Files.write(result, status, StandardCharsets.UTF_8)
- sc.stop()
- }
- }
-}
-
-private object YarnClusterDriver extends Logging with Matchers {
-
- val WAIT_TIMEOUT_MILLIS = 10000
-
- def main(args: Array[String]): Unit = {
- if (args.length != 1) {
- // scalastyle:off println
- System.err.println(
- s"""
- |Invalid command line: ${args.mkString(" ")}
- |
- |Usage: YarnClusterDriver [result file]
- """.stripMargin)
- // scalastyle:on println
- System.exit(1)
- }
-
- val sc = new SparkContext(new SparkConf()
- .set("spark.extraListeners", classOf[SaveExecutorInfo].getName)
- .setAppName("yarn \"test app\" 'with quotes' and \\back\\slashes and $dollarSigns"))
- val conf = sc.getConf
- val status = new File(args(0))
- var result = "failure"
- try {
- val data = sc.parallelize(1 to 4, 4).collect().toSet
- sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
- data should be (Set(1, 2, 3, 4))
- result = "success"
-
- // Verify that the config archive is correctly placed in the classpath of all containers.
- val confFile = "/" + Client.SPARK_CONF_FILE
- assert(getClass().getResource(confFile) != null)
- val configFromExecutors = sc.parallelize(1 to 4, 4)
- .map { _ => Option(getClass().getResource(confFile)).map(_.toString).orNull }
- .collect()
- assert(configFromExecutors.find(_ == null) === None)
- } finally {
- Files.write(result, status, StandardCharsets.UTF_8)
- sc.stop()
- }
-
- // verify log urls are present
- val listeners = sc.listenerBus.findListenersByClass[SaveExecutorInfo]
- assert(listeners.size === 1)
- val listener = listeners(0)
- val executorInfos = listener.addedExecutorInfos.values
- assert(executorInfos.nonEmpty)
- executorInfos.foreach { info =>
- assert(info.logUrlMap.nonEmpty)
- }
-
- // If we are running in yarn-cluster mode, verify that driver logs links and present and are
- // in the expected format.
- if (conf.get("spark.submit.deployMode") == "cluster") {
- assert(listener.driverLogs.nonEmpty)
- val driverLogs = listener.driverLogs.get
- assert(driverLogs.size === 2)
- assert(driverLogs.contains("stderr"))
- assert(driverLogs.contains("stdout"))
- val urlStr = driverLogs("stderr")
- // Ensure that this is a valid URL, else this will throw an exception
- new URL(urlStr)
- val containerId = YarnSparkHadoopUtil.get.getContainerId
- val user = Utils.getCurrentUserName()
- assert(urlStr.endsWith(s"/node/containerlogs/$containerId/$user/stderr?start=-4096"))
- }
- }
-
-}
-
-private object YarnClasspathTest extends Logging {
- def error(m: String, ex: Throwable = null): Unit = {
- logError(m, ex)
- // scalastyle:off println
- System.out.println(m)
- if (ex != null) {
- ex.printStackTrace(System.out)
- }
- // scalastyle:on println
- }
-
- def main(args: Array[String]): Unit = {
- if (args.length != 2) {
- error(
- s"""
- |Invalid command line: ${args.mkString(" ")}
- |
- |Usage: YarnClasspathTest [driver result file] [executor result file]
- """.stripMargin)
- // scalastyle:on println
- }
-
- readResource(args(0))
- val sc = new SparkContext(new SparkConf())
- try {
- sc.parallelize(Seq(1)).foreach { x => readResource(args(1)) }
- } finally {
- sc.stop()
- }
- }
-
- private def readResource(resultPath: String): Unit = {
- var result = "failure"
- try {
- val ccl = Thread.currentThread().getContextClassLoader()
- val resource = ccl.getResourceAsStream("test.resource")
- val bytes = ByteStreams.toByteArray(resource)
- result = new String(bytes, 0, bytes.length, StandardCharsets.UTF_8)
- } catch {
- case t: Throwable =>
- error(s"loading test.resource to $resultPath", t)
- } finally {
- Files.write(result, new File(resultPath), StandardCharsets.UTF_8)
- }
- }
-
-}
-
-private object YarnLauncherTestApp {
-
- def main(args: Array[String]): Unit = {
- // Do not stop the application; the test will stop it using the launcher lib. Just run a task
- // that will prevent the process from exiting.
- val sc = new SparkContext(new SparkConf())
- sc.parallelize(Seq(1)).foreach { i =>
- this.synchronized {
- wait()
- }
- }
- }
-
-}
-
-/**
- * Used to test code in the AM that detects the SparkContext instance. Expects a single argument
- * with the duration to sleep for, in ms.
- */
-private object SparkContextTimeoutApp {
-
- def main(args: Array[String]): Unit = {
- val Array(sleepTime) = args
- Thread.sleep(java.lang.Long.parseLong(sleepTime))
- }
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala
deleted file mode 100644
index 950ebd9..0000000
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements. See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package org.apache.spark.deploy.yarn
-
-import java.io.File
-import java.nio.charset.StandardCharsets
-
-import com.google.common.io.Files
-import org.apache.commons.io.FileUtils
-import org.apache.hadoop.yarn.conf.YarnConfiguration
-import org.scalatest.Matchers
-
-import org.apache.spark._
-import org.apache.spark.internal.Logging
-import org.apache.spark.network.shuffle.ShuffleTestAccessor
-import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor}
-import org.apache.spark.tags.ExtendedYarnTest
-
-/**
- * Integration test for the external shuffle service with a yarn mini-cluster
- */
-@ExtendedYarnTest
-class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite {
-
- override def newYarnConfig(): YarnConfiguration = {
- val yarnConfig = new YarnConfiguration()
- yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle")
- yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"),
- classOf[YarnShuffleService].getCanonicalName)
- yarnConfig.set("spark.shuffle.service.port", "0")
- yarnConfig
- }
-
- test("external shuffle service") {
- val shuffleServicePort = YarnTestAccessor.getShuffleServicePort
- val shuffleService = YarnTestAccessor.getShuffleServiceInstance
-
- val registeredExecFile = YarnTestAccessor.getRegisteredExecutorFile(shuffleService)
-
- logInfo("Shuffle service port = " + shuffleServicePort)
- val result = File.createTempFile("result", null, tempDir)
- val finalState = runSpark(
- false,
- mainClassName(YarnExternalShuffleDriver.getClass),
- appArgs = Seq(result.getAbsolutePath(), registeredExecFile.getAbsolutePath),
- extraConf = Map(
- "spark.shuffle.service.enabled" -> "true",
- "spark.shuffle.service.port" -> shuffleServicePort.toString
- )
- )
- checkResult(finalState, result)
- assert(YarnTestAccessor.getRegisteredExecutorFile(shuffleService).exists())
- }
-}
-
-private object YarnExternalShuffleDriver extends Logging with Matchers {
-
- val WAIT_TIMEOUT_MILLIS = 10000
-
- def main(args: Array[String]): Unit = {
- if (args.length != 2) {
- // scalastyle:off println
- System.err.println(
- s"""
- |Invalid command line: ${args.mkString(" ")}
- |
- |Usage: ExternalShuffleDriver [result file] [registered exec file]
- """.stripMargin)
- // scalastyle:on println
- System.exit(1)
- }
-
- val sc = new SparkContext(new SparkConf()
- .setAppName("External Shuffle Test"))
- val conf = sc.getConf
- val status = new File(args(0))
- val registeredExecFile = new File(args(1))
- logInfo("shuffle service executor file = " + registeredExecFile)
- var result = "failure"
- val execStateCopy = new File(registeredExecFile.getAbsolutePath + "_dup")
- try {
- val data = sc.parallelize(0 until 100, 10).map { x => (x % 10) -> x }.reduceByKey{ _ + _ }.
- collect().toSet
- sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
- data should be ((0 until 10).map{x => x -> (x * 10 + 450)}.toSet)
- result = "success"
- // only one process can open a leveldb file at a time, so we copy the files
- FileUtils.copyDirectory(registeredExecFile, execStateCopy)
- assert(!ShuffleTestAccessor.reloadRegisteredExecutors(execStateCopy).isEmpty)
- } finally {
- sc.stop()
- FileUtils.deleteDirectory(execStateCopy)
- Files.write(result, status, StandardCharsets.UTF_8)
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala
deleted file mode 100644
index 7fbbe12..0000000
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import java.io.{File, IOException}
-import java.nio.charset.StandardCharsets
-
-import com.google.common.io.{ByteStreams, Files}
-import org.apache.hadoop.io.Text
-import org.apache.hadoop.yarn.api.ApplicationConstants
-import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
-import org.apache.hadoop.yarn.api.records.ApplicationAccessType
-import org.apache.hadoop.yarn.conf.YarnConfiguration
-import org.scalatest.Matchers
-
-import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite}
-import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.internal.Logging
-import org.apache.spark.util.{ResetSystemProperties, Utils}
-
-class YarnSparkHadoopUtilSuite extends SparkFunSuite with Matchers with Logging
- with ResetSystemProperties {
-
- val hasBash =
- try {
- val exitCode = Runtime.getRuntime().exec(Array("bash", "--version")).waitFor()
- exitCode == 0
- } catch {
- case e: IOException =>
- false
- }
-
- if (!hasBash) {
- logWarning("Cannot execute bash, skipping bash tests.")
- }
-
- def bashTest(name: String)(fn: => Unit): Unit =
- if (hasBash) test(name)(fn) else ignore(name)(fn)
-
- bashTest("shell script escaping") {
- val scriptFile = File.createTempFile("script.", ".sh", Utils.createTempDir())
- val args = Array("arg1", "${arg.2}", "\"arg3\"", "'arg4'", "$arg5", "\\arg6")
- try {
- val argLine = args.map(a => YarnSparkHadoopUtil.escapeForShell(a)).mkString(" ")
- Files.write(("bash -c \"echo " + argLine + "\"").getBytes(StandardCharsets.UTF_8), scriptFile)
- scriptFile.setExecutable(true)
-
- val proc = Runtime.getRuntime().exec(Array(scriptFile.getAbsolutePath()))
- val out = new String(ByteStreams.toByteArray(proc.getInputStream())).trim()
- val err = new String(ByteStreams.toByteArray(proc.getErrorStream()))
- val exitCode = proc.waitFor()
- exitCode should be (0)
- out should be (args.mkString(" "))
- } finally {
- scriptFile.delete()
- }
- }
-
- test("Yarn configuration override") {
- val key = "yarn.nodemanager.hostname"
- val default = new YarnConfiguration()
-
- val sparkConf = new SparkConf()
- .set("spark.hadoop." + key, "someHostName")
- val yarnConf = new YarnSparkHadoopUtil().newConfiguration(sparkConf)
-
- yarnConf.getClass() should be (classOf[YarnConfiguration])
- yarnConf.get(key) should not be default.get(key)
- }
-
-
- test("test getApplicationAclsForYarn acls on") {
-
- // spark acls on, just pick up default user
- val sparkConf = new SparkConf()
- sparkConf.set("spark.acls.enable", "true")
-
- val securityMgr = new SecurityManager(sparkConf)
- val acls = YarnSparkHadoopUtil.getApplicationAclsForYarn(securityMgr)
-
- val viewAcls = acls.get(ApplicationAccessType.VIEW_APP)
- val modifyAcls = acls.get(ApplicationAccessType.MODIFY_APP)
-
- viewAcls match {
- case Some(vacls) =>
- val aclSet = vacls.split(',').map(_.trim).toSet
- assert(aclSet.contains(System.getProperty("user.name", "invalid")))
- case None =>
- fail()
- }
- modifyAcls match {
- case Some(macls) =>
- val aclSet = macls.split(',').map(_.trim).toSet
- assert(aclSet.contains(System.getProperty("user.name", "invalid")))
- case None =>
- fail()
- }
- }
-
- test("test getApplicationAclsForYarn acls on and specify users") {
-
- // default spark acls are on and specify acls
- val sparkConf = new SparkConf()
- sparkConf.set("spark.acls.enable", "true")
- sparkConf.set("spark.ui.view.acls", "user1,user2")
- sparkConf.set("spark.modify.acls", "user3,user4")
-
- val securityMgr = new SecurityManager(sparkConf)
- val acls = YarnSparkHadoopUtil.getApplicationAclsForYarn(securityMgr)
-
- val viewAcls = acls.get(ApplicationAccessType.VIEW_APP)
- val modifyAcls = acls.get(ApplicationAccessType.MODIFY_APP)
-
- viewAcls match {
- case Some(vacls) =>
- val aclSet = vacls.split(',').map(_.trim).toSet
- assert(aclSet.contains("user1"))
- assert(aclSet.contains("user2"))
- assert(aclSet.contains(System.getProperty("user.name", "invalid")))
- case None =>
- fail()
- }
- modifyAcls match {
- case Some(macls) =>
- val aclSet = macls.split(',').map(_.trim).toSet
- assert(aclSet.contains("user3"))
- assert(aclSet.contains("user4"))
- assert(aclSet.contains(System.getProperty("user.name", "invalid")))
- case None =>
- fail()
- }
-
- }
-
- test("test expandEnvironment result") {
- val target = Environment.PWD
- if (classOf[Environment].getMethods().exists(_.getName == "$$")) {
- YarnSparkHadoopUtil.expandEnvironment(target) should be ("{{" + target + "}}")
- } else if (Utils.isWindows) {
- YarnSparkHadoopUtil.expandEnvironment(target) should be ("%" + target + "%")
- } else {
- YarnSparkHadoopUtil.expandEnvironment(target) should be ("$" + target)
- }
-
- }
-
- test("test getClassPathSeparator result") {
- if (classOf[ApplicationConstants].getFields().exists(_.getName == "CLASS_PATH_SEPARATOR")) {
- YarnSparkHadoopUtil.getClassPathSeparator() should be ("<CPS>")
- } else if (Utils.isWindows) {
- YarnSparkHadoopUtil.getClassPathSeparator() should be (";")
- } else {
- YarnSparkHadoopUtil.getClassPathSeparator() should be (":")
- }
- }
-
- test("check different hadoop utils based on env variable") {
- try {
- System.setProperty("SPARK_YARN_MODE", "true")
- assert(SparkHadoopUtil.get.getClass === classOf[YarnSparkHadoopUtil])
- System.setProperty("SPARK_YARN_MODE", "false")
- assert(SparkHadoopUtil.get.getClass === classOf[SparkHadoopUtil])
- } finally {
- System.clearProperty("SPARK_YARN_MODE")
- }
- }
-
-
-
- // This test needs to live here because it depends on isYarnMode returning true, which can only
- // happen in the YARN module.
- test("security manager token generation") {
- try {
- System.setProperty("SPARK_YARN_MODE", "true")
- val initial = SparkHadoopUtil.get
- .getSecretKeyFromUserCredentials(SecurityManager.SECRET_LOOKUP_KEY)
- assert(initial === null || initial.length === 0)
-
- val conf = new SparkConf()
- .set(SecurityManager.SPARK_AUTH_CONF, "true")
- .set(SecurityManager.SPARK_AUTH_SECRET_CONF, "unused")
- val sm = new SecurityManager(conf)
-
- val generated = SparkHadoopUtil.get
- .getSecretKeyFromUserCredentials(SecurityManager.SECRET_LOOKUP_KEY)
- assert(generated != null)
- val genString = new Text(generated).toString()
- assert(genString != "unused")
- assert(sm.getSecretKey() === genString)
- } finally {
- // removeSecretKey() was only added in Hadoop 2.6, so instead we just set the secret
- // to an empty string.
- SparkHadoopUtil.get.addSecretKeyToUserCredentials(SecurityManager.SECRET_LOOKUP_KEY, "")
- System.clearProperty("SPARK_YARN_MODE")
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/ConfigurableCredentialManagerSuite.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/ConfigurableCredentialManagerSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/ConfigurableCredentialManagerSuite.scala
deleted file mode 100644
index db4619e..0000000
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/ConfigurableCredentialManagerSuite.scala
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn.security
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.io.Text
-import org.apache.hadoop.security.Credentials
-import org.apache.hadoop.security.token.Token
-import org.scalatest.{BeforeAndAfter, Matchers}
-
-import org.apache.spark.{SparkConf, SparkFunSuite}
-import org.apache.spark.deploy.yarn.config._
-
-class ConfigurableCredentialManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfter {
- private var credentialManager: ConfigurableCredentialManager = null
- private var sparkConf: SparkConf = null
- private var hadoopConf: Configuration = null
-
- override def beforeAll(): Unit = {
- super.beforeAll()
-
- sparkConf = new SparkConf()
- hadoopConf = new Configuration()
- System.setProperty("SPARK_YARN_MODE", "true")
- }
-
- override def afterAll(): Unit = {
- System.clearProperty("SPARK_YARN_MODE")
-
- super.afterAll()
- }
-
- test("Correctly load default credential providers") {
- credentialManager = new ConfigurableCredentialManager(sparkConf, hadoopConf)
-
- credentialManager.getServiceCredentialProvider("hdfs") should not be (None)
- credentialManager.getServiceCredentialProvider("hbase") should not be (None)
- credentialManager.getServiceCredentialProvider("hive") should not be (None)
- }
-
- test("disable hive credential provider") {
- sparkConf.set("spark.yarn.security.credentials.hive.enabled", "false")
- credentialManager = new ConfigurableCredentialManager(sparkConf, hadoopConf)
-
- credentialManager.getServiceCredentialProvider("hdfs") should not be (None)
- credentialManager.getServiceCredentialProvider("hbase") should not be (None)
- credentialManager.getServiceCredentialProvider("hive") should be (None)
- }
-
- test("using deprecated configurations") {
- sparkConf.set("spark.yarn.security.tokens.hdfs.enabled", "false")
- sparkConf.set("spark.yarn.security.tokens.hive.enabled", "false")
- credentialManager = new ConfigurableCredentialManager(sparkConf, hadoopConf)
-
- credentialManager.getServiceCredentialProvider("hdfs") should be (None)
- credentialManager.getServiceCredentialProvider("hive") should be (None)
- credentialManager.getServiceCredentialProvider("test") should not be (None)
- credentialManager.getServiceCredentialProvider("hbase") should not be (None)
- }
-
- test("verify obtaining credentials from provider") {
- credentialManager = new ConfigurableCredentialManager(sparkConf, hadoopConf)
- val creds = new Credentials()
-
- // Tokens can only be obtained from TestTokenProvider, for hdfs, hbase and hive tokens cannot
- // be obtained.
- credentialManager.obtainCredentials(hadoopConf, creds)
- val tokens = creds.getAllTokens
- tokens.size() should be (1)
- tokens.iterator().next().getService should be (new Text("test"))
- }
-
- test("verify getting credential renewal info") {
- credentialManager = new ConfigurableCredentialManager(sparkConf, hadoopConf)
- val creds = new Credentials()
-
- val testCredentialProvider = credentialManager.getServiceCredentialProvider("test").get
- .asInstanceOf[TestCredentialProvider]
- // Only TestTokenProvider can get the time of next token renewal
- val nextRenewal = credentialManager.obtainCredentials(hadoopConf, creds)
- nextRenewal should be (testCredentialProvider.timeOfNextTokenRenewal)
- }
-
- test("obtain tokens For HiveMetastore") {
- val hadoopConf = new Configuration()
- hadoopConf.set("hive.metastore.kerberos.principal", "bob")
- // thrift picks up on port 0 and bails out, without trying to talk to endpoint
- hadoopConf.set("hive.metastore.uris", "http://localhost:0")
-
- val hiveCredentialProvider = new HiveCredentialProvider()
- val credentials = new Credentials()
- hiveCredentialProvider.obtainCredentials(hadoopConf, sparkConf, credentials)
-
- credentials.getAllTokens.size() should be (0)
- }
-
- test("Obtain tokens For HBase") {
- val hadoopConf = new Configuration()
- hadoopConf.set("hbase.security.authentication", "kerberos")
-
- val hbaseTokenProvider = new HBaseCredentialProvider()
- val creds = new Credentials()
- hbaseTokenProvider.obtainCredentials(hadoopConf, sparkConf, creds)
-
- creds.getAllTokens.size should be (0)
- }
-}
-
-class TestCredentialProvider extends ServiceCredentialProvider {
- val tokenRenewalInterval = 86400 * 1000L
- var timeOfNextTokenRenewal = 0L
-
- override def serviceName: String = "test"
-
- override def credentialsRequired(conf: Configuration): Boolean = true
-
- override def obtainCredentials(
- hadoopConf: Configuration,
- sparkConf: SparkConf,
- creds: Credentials): Option[Long] = {
- if (creds == null) {
- // Guard out other unit test failures.
- return None
- }
-
- val emptyToken = new Token()
- emptyToken.setService(new Text("test"))
- creds.addToken(emptyToken.getService, emptyToken)
-
- val currTime = System.currentTimeMillis()
- timeOfNextTokenRenewal = (currTime - currTime % tokenRenewalInterval) + tokenRenewalInterval
-
- Some(timeOfNextTokenRenewal)
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProviderSuite.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProviderSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProviderSuite.scala
deleted file mode 100644
index 7b2da3f..0000000
--- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProviderSuite.scala
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn.security
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.Path
-import org.scalatest.{Matchers, PrivateMethodTester}
-
-import org.apache.spark.{SparkConf, SparkException, SparkFunSuite}
-
-class HDFSCredentialProviderSuite
- extends SparkFunSuite
- with PrivateMethodTester
- with Matchers {
- private val _getTokenRenewer = PrivateMethod[String]('getTokenRenewer)
-
- private def getTokenRenewer(
- hdfsCredentialProvider: HDFSCredentialProvider, conf: Configuration): String = {
- hdfsCredentialProvider invokePrivate _getTokenRenewer(conf)
- }
-
- private var hdfsCredentialProvider: HDFSCredentialProvider = null
-
- override def beforeAll() {
- super.beforeAll()
-
- if (hdfsCredentialProvider == null) {
- hdfsCredentialProvider = new HDFSCredentialProvider()
- }
- }
-
- override def afterAll() {
- if (hdfsCredentialProvider != null) {
- hdfsCredentialProvider = null
- }
-
- super.afterAll()
- }
-
- test("check token renewer") {
- val hadoopConf = new Configuration()
- hadoopConf.set("yarn.resourcemanager.address", "myrm:8033")
- hadoopConf.set("yarn.resourcemanager.principal", "yarn/myrm:8032@SPARKTEST.COM")
- val renewer = getTokenRenewer(hdfsCredentialProvider, hadoopConf)
- renewer should be ("yarn/myrm:8032@SPARKTEST.COM")
- }
-
- test("check token renewer default") {
- val hadoopConf = new Configuration()
- val caught =
- intercept[SparkException] {
- getTokenRenewer(hdfsCredentialProvider, hadoopConf)
- }
- assert(caught.getMessage === "Can't get Master Kerberos principal for use as renewer")
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/launcher/TestClasspathBuilder.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/launcher/TestClasspathBuilder.scala b/yarn/src/test/scala/org/apache/spark/launcher/TestClasspathBuilder.scala
deleted file mode 100644
index da9e8e2..0000000
--- a/yarn/src/test/scala/org/apache/spark/launcher/TestClasspathBuilder.scala
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.launcher
-
-import java.util.{List => JList, Map => JMap}
-
-/**
- * Exposes AbstractCommandBuilder to the YARN tests, so that they can build classpaths the same
- * way other cluster managers do.
- */
-private[spark] class TestClasspathBuilder extends AbstractCommandBuilder {
-
- childEnv.put(CommandBuilderUtils.ENV_SPARK_HOME, sys.props("spark.test.home"))
-
- override def buildClassPath(extraCp: String): JList[String] = super.buildClassPath(extraCp)
-
- /** Not used by the YARN tests. */
- override def buildCommand(env: JMap[String, String]): JList[String] =
- throw new UnsupportedOperationException()
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/test/scala/org/apache/spark/network/shuffle/ShuffleTestAccessor.scala
----------------------------------------------------------------------
diff --git a/yarn/src/test/scala/org/apache/spark/network/shuffle/ShuffleTestAccessor.scala b/yarn/src/test/scala/org/apache/spark/network/shuffle/ShuffleTestAccessor.scala
deleted file mode 100644
index 1fed256..0000000
--- a/yarn/src/test/scala/org/apache/spark/network/shuffle/ShuffleTestAccessor.scala
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.network.shuffle
-
-import java.io.File
-import java.util.concurrent.ConcurrentMap
-
-import org.apache.hadoop.yarn.api.records.ApplicationId
-import org.fusesource.leveldbjni.JniDBFactory
-import org.iq80.leveldb.{DB, Options}
-
-import org.apache.spark.network.shuffle.ExternalShuffleBlockResolver.AppExecId
-import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo
-
-/**
- * just a cheat to get package-visible members in tests
- */
-object ShuffleTestAccessor {
-
- def getBlockResolver(handler: ExternalShuffleBlockHandler): ExternalShuffleBlockResolver = {
- handler.blockManager
- }
-
- def getExecutorInfo(
- appId: ApplicationId,
- execId: String,
- resolver: ExternalShuffleBlockResolver
- ): Option[ExecutorShuffleInfo] = {
- val id = new AppExecId(appId.toString, execId)
- Option(resolver.executors.get(id))
- }
-
- def registeredExecutorFile(resolver: ExternalShuffleBlockResolver): File = {
- resolver.registeredExecutorFile
- }
-
- def shuffleServiceLevelDB(resolver: ExternalShuffleBlockResolver): DB = {
- resolver.db
- }
-
- def reloadRegisteredExecutors(
- file: File): ConcurrentMap[ExternalShuffleBlockResolver.AppExecId, ExecutorShuffleInfo] = {
- val options: Options = new Options
- options.createIfMissing(true)
- val factory = new JniDBFactory
- val db = factory.open(file, options)
- val result = ExternalShuffleBlockResolver.reloadRegisteredExecutors(db)
- db.close()
- result
- }
-
- def reloadRegisteredExecutors(
- db: DB): ConcurrentMap[ExternalShuffleBlockResolver.AppExecId, ExecutorShuffleInfo] = {
- ExternalShuffleBlockResolver.reloadRegisteredExecutors(db)
- }
-}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org
[04/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Posted by va...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala
deleted file mode 100644
index 8772e26..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import scala.collection.mutable.{ArrayBuffer, HashMap, Set}
-import scala.collection.JavaConverters._
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.yarn.api.records.{ContainerId, Resource}
-import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
-import org.apache.hadoop.yarn.util.RackResolver
-
-import org.apache.spark.SparkConf
-import org.apache.spark.internal.config._
-
-private[yarn] case class ContainerLocalityPreferences(nodes: Array[String], racks: Array[String])
-
-/**
- * This strategy is calculating the optimal locality preferences of YARN containers by considering
- * the node ratio of pending tasks, number of required cores/containers and and locality of current
- * existing and pending allocated containers. The target of this algorithm is to maximize the number
- * of tasks that would run locally.
- *
- * Consider a situation in which we have 20 tasks that require (host1, host2, host3)
- * and 10 tasks that require (host1, host2, host4), besides each container has 2 cores
- * and cpus per task is 1, so the required container number is 15,
- * and host ratio is (host1: 30, host2: 30, host3: 20, host4: 10).
- *
- * 1. If requested container number (18) is more than the required container number (15):
- *
- * requests for 5 containers with nodes: (host1, host2, host3, host4)
- * requests for 5 containers with nodes: (host1, host2, host3)
- * requests for 5 containers with nodes: (host1, host2)
- * requests for 3 containers with no locality preferences.
- *
- * The placement ratio is 3 : 3 : 2 : 1, and set the additional containers with no locality
- * preferences.
- *
- * 2. If requested container number (10) is less than or equal to the required container number
- * (15):
- *
- * requests for 4 containers with nodes: (host1, host2, host3, host4)
- * requests for 3 containers with nodes: (host1, host2, host3)
- * requests for 3 containers with nodes: (host1, host2)
- *
- * The placement ratio is 10 : 10 : 7 : 4, close to expected ratio (3 : 3 : 2 : 1)
- *
- * 3. If containers exist but none of them can match the requested localities,
- * follow the method of 1 and 2.
- *
- * 4. If containers exist and some of them can match the requested localities.
- * For example if we have 1 containers on each node (host1: 1, host2: 1: host3: 1, host4: 1),
- * and the expected containers on each node would be (host1: 5, host2: 5, host3: 4, host4: 2),
- * so the newly requested containers on each node would be updated to (host1: 4, host2: 4,
- * host3: 3, host4: 1), 12 containers by total.
- *
- * 4.1 If requested container number (18) is more than newly required containers (12). Follow
- * method 1 with updated ratio 4 : 4 : 3 : 1.
- *
- * 4.2 If request container number (10) is more than newly required containers (12). Follow
- * method 2 with updated ratio 4 : 4 : 3 : 1.
- *
- * 5. If containers exist and existing localities can fully cover the requested localities.
- * For example if we have 5 containers on each node (host1: 5, host2: 5, host3: 5, host4: 5),
- * which could cover the current requested localities. This algorithm will allocate all the
- * requested containers with no localities.
- */
-private[yarn] class LocalityPreferredContainerPlacementStrategy(
- val sparkConf: SparkConf,
- val yarnConf: Configuration,
- val resource: Resource) {
-
- /**
- * Calculate each container's node locality and rack locality
- * @param numContainer number of containers to calculate
- * @param numLocalityAwareTasks number of locality required tasks
- * @param hostToLocalTaskCount a map to store the preferred hostname and possible task
- * numbers running on it, used as hints for container allocation
- * @param allocatedHostToContainersMap host to allocated containers map, used to calculate the
- * expected locality preference by considering the existing
- * containers
- * @param localityMatchedPendingAllocations A sequence of pending container request which
- * matches the localities of current required tasks.
- * @return node localities and rack localities, each locality is an array of string,
- * the length of localities is the same as number of containers
- */
- def localityOfRequestedContainers(
- numContainer: Int,
- numLocalityAwareTasks: Int,
- hostToLocalTaskCount: Map[String, Int],
- allocatedHostToContainersMap: HashMap[String, Set[ContainerId]],
- localityMatchedPendingAllocations: Seq[ContainerRequest]
- ): Array[ContainerLocalityPreferences] = {
- val updatedHostToContainerCount = expectedHostToContainerCount(
- numLocalityAwareTasks, hostToLocalTaskCount, allocatedHostToContainersMap,
- localityMatchedPendingAllocations)
- val updatedLocalityAwareContainerNum = updatedHostToContainerCount.values.sum
-
- // The number of containers to allocate, divided into two groups, one with preferred locality,
- // and the other without locality preference.
- val requiredLocalityFreeContainerNum =
- math.max(0, numContainer - updatedLocalityAwareContainerNum)
- val requiredLocalityAwareContainerNum = numContainer - requiredLocalityFreeContainerNum
-
- val containerLocalityPreferences = ArrayBuffer[ContainerLocalityPreferences]()
- if (requiredLocalityFreeContainerNum > 0) {
- for (i <- 0 until requiredLocalityFreeContainerNum) {
- containerLocalityPreferences += ContainerLocalityPreferences(
- null.asInstanceOf[Array[String]], null.asInstanceOf[Array[String]])
- }
- }
-
- if (requiredLocalityAwareContainerNum > 0) {
- val largestRatio = updatedHostToContainerCount.values.max
- // Round the ratio of preferred locality to the number of locality required container
- // number, which is used for locality preferred host calculating.
- var preferredLocalityRatio = updatedHostToContainerCount.mapValues { ratio =>
- val adjustedRatio = ratio.toDouble * requiredLocalityAwareContainerNum / largestRatio
- adjustedRatio.ceil.toInt
- }
-
- for (i <- 0 until requiredLocalityAwareContainerNum) {
- // Only filter out the ratio which is larger than 0, which means the current host can
- // still be allocated with new container request.
- val hosts = preferredLocalityRatio.filter(_._2 > 0).keys.toArray
- val racks = hosts.map { h =>
- RackResolver.resolve(yarnConf, h).getNetworkLocation
- }.toSet
- containerLocalityPreferences += ContainerLocalityPreferences(hosts, racks.toArray)
-
- // Minus 1 each time when the host is used. When the current ratio is 0,
- // which means all the required ratio is satisfied, this host will not be allocated again.
- preferredLocalityRatio = preferredLocalityRatio.mapValues(_ - 1)
- }
- }
-
- containerLocalityPreferences.toArray
- }
-
- /**
- * Calculate the number of executors need to satisfy the given number of pending tasks.
- */
- private def numExecutorsPending(numTasksPending: Int): Int = {
- val coresPerExecutor = resource.getVirtualCores
- (numTasksPending * sparkConf.get(CPUS_PER_TASK) + coresPerExecutor - 1) / coresPerExecutor
- }
-
- /**
- * Calculate the expected host to number of containers by considering with allocated containers.
- * @param localityAwareTasks number of locality aware tasks
- * @param hostToLocalTaskCount a map to store the preferred hostname and possible task
- * numbers running on it, used as hints for container allocation
- * @param allocatedHostToContainersMap host to allocated containers map, used to calculate the
- * expected locality preference by considering the existing
- * containers
- * @param localityMatchedPendingAllocations A sequence of pending container request which
- * matches the localities of current required tasks.
- * @return a map with hostname as key and required number of containers on this host as value
- */
- private def expectedHostToContainerCount(
- localityAwareTasks: Int,
- hostToLocalTaskCount: Map[String, Int],
- allocatedHostToContainersMap: HashMap[String, Set[ContainerId]],
- localityMatchedPendingAllocations: Seq[ContainerRequest]
- ): Map[String, Int] = {
- val totalLocalTaskNum = hostToLocalTaskCount.values.sum
- val pendingHostToContainersMap = pendingHostToContainerCount(localityMatchedPendingAllocations)
-
- hostToLocalTaskCount.map { case (host, count) =>
- val expectedCount =
- count.toDouble * numExecutorsPending(localityAwareTasks) / totalLocalTaskNum
- // Take the locality of pending containers into consideration
- val existedCount = allocatedHostToContainersMap.get(host).map(_.size).getOrElse(0) +
- pendingHostToContainersMap.getOrElse(host, 0.0)
-
- // If existing container can not fully satisfy the expected number of container,
- // the required container number is expected count minus existed count. Otherwise the
- // required container number is 0.
- (host, math.max(0, (expectedCount - existedCount).ceil.toInt))
- }
- }
-
- /**
- * According to the locality ratio and number of container requests, calculate the host to
- * possible number of containers for pending allocated containers.
- *
- * If current locality ratio of hosts is: Host1 : Host2 : Host3 = 20 : 20 : 10,
- * and pending container requests is 3, so the possible number of containers on
- * Host1 : Host2 : Host3 will be 1.2 : 1.2 : 0.6.
- * @param localityMatchedPendingAllocations A sequence of pending container request which
- * matches the localities of current required tasks.
- * @return a Map with hostname as key and possible number of containers on this host as value
- */
- private def pendingHostToContainerCount(
- localityMatchedPendingAllocations: Seq[ContainerRequest]): Map[String, Double] = {
- val pendingHostToContainerCount = new HashMap[String, Int]()
- localityMatchedPendingAllocations.foreach { cr =>
- cr.getNodes.asScala.foreach { n =>
- val count = pendingHostToContainerCount.getOrElse(n, 0) + 1
- pendingHostToContainerCount(n) = count
- }
- }
-
- val possibleTotalContainerNum = pendingHostToContainerCount.values.sum
- val localityMatchedPendingNum = localityMatchedPendingAllocations.size.toDouble
- pendingHostToContainerCount.mapValues(_ * localityMatchedPendingNum / possibleTotalContainerNum)
- .toMap
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
deleted file mode 100644
index 0b66d1c..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
+++ /dev/null
@@ -1,727 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import java.util.Collections
-import java.util.concurrent._
-import java.util.regex.Pattern
-
-import scala.collection.mutable
-import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, Queue}
-import scala.collection.JavaConverters._
-import scala.util.control.NonFatal
-
-import org.apache.hadoop.yarn.api.records._
-import org.apache.hadoop.yarn.client.api.AMRMClient
-import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
-import org.apache.hadoop.yarn.conf.YarnConfiguration
-import org.apache.hadoop.yarn.util.RackResolver
-import org.apache.log4j.{Level, Logger}
-
-import org.apache.spark.{SecurityManager, SparkConf, SparkException}
-import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._
-import org.apache.spark.deploy.yarn.config._
-import org.apache.spark.internal.Logging
-import org.apache.spark.internal.config._
-import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef}
-import org.apache.spark.scheduler.{ExecutorExited, ExecutorLossReason}
-import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RemoveExecutor
-import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RetrieveLastAllocatedExecutorId
-import org.apache.spark.util.{Clock, SystemClock, ThreadUtils}
-
-/**
- * YarnAllocator is charged with requesting containers from the YARN ResourceManager and deciding
- * what to do with containers when YARN fulfills these requests.
- *
- * This class makes use of YARN's AMRMClient APIs. We interact with the AMRMClient in three ways:
- * * Making our resource needs known, which updates local bookkeeping about containers requested.
- * * Calling "allocate", which syncs our local container requests with the RM, and returns any
- * containers that YARN has granted to us. This also functions as a heartbeat.
- * * Processing the containers granted to us to possibly launch executors inside of them.
- *
- * The public methods of this class are thread-safe. All methods that mutate state are
- * synchronized.
- */
-private[yarn] class YarnAllocator(
- driverUrl: String,
- driverRef: RpcEndpointRef,
- conf: YarnConfiguration,
- sparkConf: SparkConf,
- amClient: AMRMClient[ContainerRequest],
- appAttemptId: ApplicationAttemptId,
- securityMgr: SecurityManager,
- localResources: Map[String, LocalResource])
- extends Logging {
-
- import YarnAllocator._
-
- // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
- if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
- Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
- }
-
- // Visible for testing.
- val allocatedHostToContainersMap = new HashMap[String, collection.mutable.Set[ContainerId]]
- val allocatedContainerToHostMap = new HashMap[ContainerId, String]
-
- // Containers that we no longer care about. We've either already told the RM to release them or
- // will on the next heartbeat. Containers get removed from this map after the RM tells us they've
- // completed.
- private val releasedContainers = Collections.newSetFromMap[ContainerId](
- new ConcurrentHashMap[ContainerId, java.lang.Boolean])
-
- @volatile private var numExecutorsRunning = 0
-
- /**
- * Used to generate a unique ID per executor
- *
- * Init `executorIdCounter`. when AM restart, `executorIdCounter` will reset to 0. Then
- * the id of new executor will start from 1, this will conflict with the executor has
- * already created before. So, we should initialize the `executorIdCounter` by getting
- * the max executorId from driver.
- *
- * And this situation of executorId conflict is just in yarn client mode, so this is an issue
- * in yarn client mode. For more details, can check in jira.
- *
- * @see SPARK-12864
- */
- private var executorIdCounter: Int =
- driverRef.askWithRetry[Int](RetrieveLastAllocatedExecutorId)
-
- // Queue to store the timestamp of failed executors
- private val failedExecutorsTimeStamps = new Queue[Long]()
-
- private var clock: Clock = new SystemClock
-
- private val executorFailuresValidityInterval =
- sparkConf.get(EXECUTOR_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS).getOrElse(-1L)
-
- @volatile private var targetNumExecutors =
- YarnSparkHadoopUtil.getInitialTargetExecutorNumber(sparkConf)
-
- // Executor loss reason requests that are pending - maps from executor ID for inquiry to a
- // list of requesters that should be responded to once we find out why the given executor
- // was lost.
- private val pendingLossReasonRequests = new HashMap[String, mutable.Buffer[RpcCallContext]]
-
- // Maintain loss reasons for already released executors, it will be added when executor loss
- // reason is got from AM-RM call, and be removed after querying this loss reason.
- private val releasedExecutorLossReasons = new HashMap[String, ExecutorLossReason]
-
- // Keep track of which container is running which executor to remove the executors later
- // Visible for testing.
- private[yarn] val executorIdToContainer = new HashMap[String, Container]
-
- private var numUnexpectedContainerRelease = 0L
- private val containerIdToExecutorId = new HashMap[ContainerId, String]
-
- // Executor memory in MB.
- protected val executorMemory = sparkConf.get(EXECUTOR_MEMORY).toInt
- // Additional memory overhead.
- protected val memoryOverhead: Int = sparkConf.get(EXECUTOR_MEMORY_OVERHEAD).getOrElse(
- math.max((MEMORY_OVERHEAD_FACTOR * executorMemory).toInt, MEMORY_OVERHEAD_MIN)).toInt
- // Number of cores per executor.
- protected val executorCores = sparkConf.get(EXECUTOR_CORES)
- // Resource capability requested for each executors
- private[yarn] val resource = Resource.newInstance(executorMemory + memoryOverhead, executorCores)
-
- private val launcherPool = ThreadUtils.newDaemonCachedThreadPool(
- "ContainerLauncher", sparkConf.get(CONTAINER_LAUNCH_MAX_THREADS))
-
- // For testing
- private val launchContainers = sparkConf.getBoolean("spark.yarn.launchContainers", true)
-
- private val labelExpression = sparkConf.get(EXECUTOR_NODE_LABEL_EXPRESSION)
-
- // ContainerRequest constructor that can take a node label expression. We grab it through
- // reflection because it's only available in later versions of YARN.
- private val nodeLabelConstructor = labelExpression.flatMap { expr =>
- try {
- Some(classOf[ContainerRequest].getConstructor(classOf[Resource],
- classOf[Array[String]], classOf[Array[String]], classOf[Priority], classOf[Boolean],
- classOf[String]))
- } catch {
- case e: NoSuchMethodException =>
- logWarning(s"Node label expression $expr will be ignored because YARN version on" +
- " classpath does not support it.")
- None
- }
- }
-
- // A map to store preferred hostname and possible task numbers running on it.
- private var hostToLocalTaskCounts: Map[String, Int] = Map.empty
-
- // Number of tasks that have locality preferences in active stages
- private var numLocalityAwareTasks: Int = 0
-
- // A container placement strategy based on pending tasks' locality preference
- private[yarn] val containerPlacementStrategy =
- new LocalityPreferredContainerPlacementStrategy(sparkConf, conf, resource)
-
- /**
- * Use a different clock for YarnAllocator. This is mainly used for testing.
- */
- def setClock(newClock: Clock): Unit = {
- clock = newClock
- }
-
- def getNumExecutorsRunning: Int = numExecutorsRunning
-
- def getNumExecutorsFailed: Int = synchronized {
- val endTime = clock.getTimeMillis()
-
- while (executorFailuresValidityInterval > 0
- && failedExecutorsTimeStamps.nonEmpty
- && failedExecutorsTimeStamps.head < endTime - executorFailuresValidityInterval) {
- failedExecutorsTimeStamps.dequeue()
- }
-
- failedExecutorsTimeStamps.size
- }
-
- /**
- * A sequence of pending container requests that have not yet been fulfilled.
- */
- def getPendingAllocate: Seq[ContainerRequest] = getPendingAtLocation(ANY_HOST)
-
- /**
- * A sequence of pending container requests at the given location that have not yet been
- * fulfilled.
- */
- private def getPendingAtLocation(location: String): Seq[ContainerRequest] = {
- amClient.getMatchingRequests(RM_REQUEST_PRIORITY, location, resource).asScala
- .flatMap(_.asScala)
- .toSeq
- }
-
- /**
- * Request as many executors from the ResourceManager as needed to reach the desired total. If
- * the requested total is smaller than the current number of running executors, no executors will
- * be killed.
- * @param requestedTotal total number of containers requested
- * @param localityAwareTasks number of locality aware tasks to be used as container placement hint
- * @param hostToLocalTaskCount a map of preferred hostname to possible task counts to be used as
- * container placement hint.
- * @return Whether the new requested total is different than the old value.
- */
- def requestTotalExecutorsWithPreferredLocalities(
- requestedTotal: Int,
- localityAwareTasks: Int,
- hostToLocalTaskCount: Map[String, Int]): Boolean = synchronized {
- this.numLocalityAwareTasks = localityAwareTasks
- this.hostToLocalTaskCounts = hostToLocalTaskCount
-
- if (requestedTotal != targetNumExecutors) {
- logInfo(s"Driver requested a total number of $requestedTotal executor(s).")
- targetNumExecutors = requestedTotal
- true
- } else {
- false
- }
- }
-
- /**
- * Request that the ResourceManager release the container running the specified executor.
- */
- def killExecutor(executorId: String): Unit = synchronized {
- if (executorIdToContainer.contains(executorId)) {
- val container = executorIdToContainer.get(executorId).get
- internalReleaseContainer(container)
- numExecutorsRunning -= 1
- } else {
- logWarning(s"Attempted to kill unknown executor $executorId!")
- }
- }
-
- /**
- * Request resources such that, if YARN gives us all we ask for, we'll have a number of containers
- * equal to maxExecutors.
- *
- * Deal with any containers YARN has granted to us by possibly launching executors in them.
- *
- * This must be synchronized because variables read in this method are mutated by other methods.
- */
- def allocateResources(): Unit = synchronized {
- updateResourceRequests()
-
- val progressIndicator = 0.1f
- // Poll the ResourceManager. This doubles as a heartbeat if there are no pending container
- // requests.
- val allocateResponse = amClient.allocate(progressIndicator)
-
- val allocatedContainers = allocateResponse.getAllocatedContainers()
-
- if (allocatedContainers.size > 0) {
- logDebug("Allocated containers: %d. Current executor count: %d. Cluster resources: %s."
- .format(
- allocatedContainers.size,
- numExecutorsRunning,
- allocateResponse.getAvailableResources))
-
- handleAllocatedContainers(allocatedContainers.asScala)
- }
-
- val completedContainers = allocateResponse.getCompletedContainersStatuses()
- if (completedContainers.size > 0) {
- logDebug("Completed %d containers".format(completedContainers.size))
- processCompletedContainers(completedContainers.asScala)
- logDebug("Finished processing %d completed containers. Current running executor count: %d."
- .format(completedContainers.size, numExecutorsRunning))
- }
- }
-
- /**
- * Update the set of container requests that we will sync with the RM based on the number of
- * executors we have currently running and our target number of executors.
- *
- * Visible for testing.
- */
- def updateResourceRequests(): Unit = {
- val pendingAllocate = getPendingAllocate
- val numPendingAllocate = pendingAllocate.size
- val missing = targetNumExecutors - numPendingAllocate - numExecutorsRunning
-
- if (missing > 0) {
- logInfo(s"Will request $missing executor container(s), each with " +
- s"${resource.getVirtualCores} core(s) and " +
- s"${resource.getMemory} MB memory (including $memoryOverhead MB of overhead)")
-
- // Split the pending container request into three groups: locality matched list, locality
- // unmatched list and non-locality list. Take the locality matched container request into
- // consideration of container placement, treat as allocated containers.
- // For locality unmatched and locality free container requests, cancel these container
- // requests, since required locality preference has been changed, recalculating using
- // container placement strategy.
- val (localRequests, staleRequests, anyHostRequests) = splitPendingAllocationsByLocality(
- hostToLocalTaskCounts, pendingAllocate)
-
- // cancel "stale" requests for locations that are no longer needed
- staleRequests.foreach { stale =>
- amClient.removeContainerRequest(stale)
- }
- val cancelledContainers = staleRequests.size
- if (cancelledContainers > 0) {
- logInfo(s"Canceled $cancelledContainers container request(s) (locality no longer needed)")
- }
-
- // consider the number of new containers and cancelled stale containers available
- val availableContainers = missing + cancelledContainers
-
- // to maximize locality, include requests with no locality preference that can be cancelled
- val potentialContainers = availableContainers + anyHostRequests.size
-
- val containerLocalityPreferences = containerPlacementStrategy.localityOfRequestedContainers(
- potentialContainers, numLocalityAwareTasks, hostToLocalTaskCounts,
- allocatedHostToContainersMap, localRequests)
-
- val newLocalityRequests = new mutable.ArrayBuffer[ContainerRequest]
- containerLocalityPreferences.foreach {
- case ContainerLocalityPreferences(nodes, racks) if nodes != null =>
- newLocalityRequests += createContainerRequest(resource, nodes, racks)
- case _ =>
- }
-
- if (availableContainers >= newLocalityRequests.size) {
- // more containers are available than needed for locality, fill in requests for any host
- for (i <- 0 until (availableContainers - newLocalityRequests.size)) {
- newLocalityRequests += createContainerRequest(resource, null, null)
- }
- } else {
- val numToCancel = newLocalityRequests.size - availableContainers
- // cancel some requests without locality preferences to schedule more local containers
- anyHostRequests.slice(0, numToCancel).foreach { nonLocal =>
- amClient.removeContainerRequest(nonLocal)
- }
- if (numToCancel > 0) {
- logInfo(s"Canceled $numToCancel unlocalized container requests to resubmit with locality")
- }
- }
-
- newLocalityRequests.foreach { request =>
- amClient.addContainerRequest(request)
- }
-
- if (log.isInfoEnabled()) {
- val (localized, anyHost) = newLocalityRequests.partition(_.getNodes() != null)
- if (anyHost.nonEmpty) {
- logInfo(s"Submitted ${anyHost.size} unlocalized container requests.")
- }
- localized.foreach { request =>
- logInfo(s"Submitted container request for host ${hostStr(request)}.")
- }
- }
- } else if (numPendingAllocate > 0 && missing < 0) {
- val numToCancel = math.min(numPendingAllocate, -missing)
- logInfo(s"Canceling requests for $numToCancel executor container(s) to have a new desired " +
- s"total $targetNumExecutors executors.")
-
- val matchingRequests = amClient.getMatchingRequests(RM_REQUEST_PRIORITY, ANY_HOST, resource)
- if (!matchingRequests.isEmpty) {
- matchingRequests.iterator().next().asScala
- .take(numToCancel).foreach(amClient.removeContainerRequest)
- } else {
- logWarning("Expected to find pending requests, but found none.")
- }
- }
- }
-
- private def hostStr(request: ContainerRequest): String = {
- Option(request.getNodes) match {
- case Some(nodes) => nodes.asScala.mkString(",")
- case None => "Any"
- }
- }
-
- /**
- * Creates a container request, handling the reflection required to use YARN features that were
- * added in recent versions.
- */
- private def createContainerRequest(
- resource: Resource,
- nodes: Array[String],
- racks: Array[String]): ContainerRequest = {
- nodeLabelConstructor.map { constructor =>
- constructor.newInstance(resource, nodes, racks, RM_REQUEST_PRIORITY, true: java.lang.Boolean,
- labelExpression.orNull)
- }.getOrElse(new ContainerRequest(resource, nodes, racks, RM_REQUEST_PRIORITY))
- }
-
- /**
- * Handle containers granted by the RM by launching executors on them.
- *
- * Due to the way the YARN allocation protocol works, certain healthy race conditions can result
- * in YARN granting containers that we no longer need. In this case, we release them.
- *
- * Visible for testing.
- */
- def handleAllocatedContainers(allocatedContainers: Seq[Container]): Unit = {
- val containersToUse = new ArrayBuffer[Container](allocatedContainers.size)
-
- // Match incoming requests by host
- val remainingAfterHostMatches = new ArrayBuffer[Container]
- for (allocatedContainer <- allocatedContainers) {
- matchContainerToRequest(allocatedContainer, allocatedContainer.getNodeId.getHost,
- containersToUse, remainingAfterHostMatches)
- }
-
- // Match remaining by rack
- val remainingAfterRackMatches = new ArrayBuffer[Container]
- for (allocatedContainer <- remainingAfterHostMatches) {
- val rack = RackResolver.resolve(conf, allocatedContainer.getNodeId.getHost).getNetworkLocation
- matchContainerToRequest(allocatedContainer, rack, containersToUse,
- remainingAfterRackMatches)
- }
-
- // Assign remaining that are neither node-local nor rack-local
- val remainingAfterOffRackMatches = new ArrayBuffer[Container]
- for (allocatedContainer <- remainingAfterRackMatches) {
- matchContainerToRequest(allocatedContainer, ANY_HOST, containersToUse,
- remainingAfterOffRackMatches)
- }
-
- if (!remainingAfterOffRackMatches.isEmpty) {
- logDebug(s"Releasing ${remainingAfterOffRackMatches.size} unneeded containers that were " +
- s"allocated to us")
- for (container <- remainingAfterOffRackMatches) {
- internalReleaseContainer(container)
- }
- }
-
- runAllocatedContainers(containersToUse)
-
- logInfo("Received %d containers from YARN, launching executors on %d of them."
- .format(allocatedContainers.size, containersToUse.size))
- }
-
- /**
- * Looks for requests for the given location that match the given container allocation. If it
- * finds one, removes the request so that it won't be submitted again. Places the container into
- * containersToUse or remaining.
- *
- * @param allocatedContainer container that was given to us by YARN
- * @param location resource name, either a node, rack, or *
- * @param containersToUse list of containers that will be used
- * @param remaining list of containers that will not be used
- */
- private def matchContainerToRequest(
- allocatedContainer: Container,
- location: String,
- containersToUse: ArrayBuffer[Container],
- remaining: ArrayBuffer[Container]): Unit = {
- // SPARK-6050: certain Yarn configurations return a virtual core count that doesn't match the
- // request; for example, capacity scheduler + DefaultResourceCalculator. So match on requested
- // memory, but use the asked vcore count for matching, effectively disabling matching on vcore
- // count.
- val matchingResource = Resource.newInstance(allocatedContainer.getResource.getMemory,
- resource.getVirtualCores)
- val matchingRequests = amClient.getMatchingRequests(allocatedContainer.getPriority, location,
- matchingResource)
-
- // Match the allocation to a request
- if (!matchingRequests.isEmpty) {
- val containerRequest = matchingRequests.get(0).iterator.next
- amClient.removeContainerRequest(containerRequest)
- containersToUse += allocatedContainer
- } else {
- remaining += allocatedContainer
- }
- }
-
- /**
- * Launches executors in the allocated containers.
- */
- private def runAllocatedContainers(containersToUse: ArrayBuffer[Container]): Unit = {
- for (container <- containersToUse) {
- executorIdCounter += 1
- val executorHostname = container.getNodeId.getHost
- val containerId = container.getId
- val executorId = executorIdCounter.toString
- assert(container.getResource.getMemory >= resource.getMemory)
- logInfo(s"Launching container $containerId on host $executorHostname")
-
- def updateInternalState(): Unit = synchronized {
- numExecutorsRunning += 1
- executorIdToContainer(executorId) = container
- containerIdToExecutorId(container.getId) = executorId
-
- val containerSet = allocatedHostToContainersMap.getOrElseUpdate(executorHostname,
- new HashSet[ContainerId])
- containerSet += containerId
- allocatedContainerToHostMap.put(containerId, executorHostname)
- }
-
- if (numExecutorsRunning < targetNumExecutors) {
- if (launchContainers) {
- launcherPool.execute(new Runnable {
- override def run(): Unit = {
- try {
- new ExecutorRunnable(
- Some(container),
- conf,
- sparkConf,
- driverUrl,
- executorId,
- executorHostname,
- executorMemory,
- executorCores,
- appAttemptId.getApplicationId.toString,
- securityMgr,
- localResources
- ).run()
- updateInternalState()
- } catch {
- case NonFatal(e) =>
- logError(s"Failed to launch executor $executorId on container $containerId", e)
- // Assigned container should be released immediately to avoid unnecessary resource
- // occupation.
- amClient.releaseAssignedContainer(containerId)
- }
- }
- })
- } else {
- // For test only
- updateInternalState()
- }
- } else {
- logInfo(("Skip launching executorRunnable as runnning Excecutors count: %d " +
- "reached target Executors count: %d.").format(numExecutorsRunning, targetNumExecutors))
- }
- }
- }
-
- // Visible for testing.
- private[yarn] def processCompletedContainers(completedContainers: Seq[ContainerStatus]): Unit = {
- for (completedContainer <- completedContainers) {
- val containerId = completedContainer.getContainerId
- val alreadyReleased = releasedContainers.remove(containerId)
- val hostOpt = allocatedContainerToHostMap.get(containerId)
- val onHostStr = hostOpt.map(host => s" on host: $host").getOrElse("")
- val exitReason = if (!alreadyReleased) {
- // Decrement the number of executors running. The next iteration of
- // the ApplicationMaster's reporting thread will take care of allocating.
- numExecutorsRunning -= 1
- logInfo("Completed container %s%s (state: %s, exit status: %s)".format(
- containerId,
- onHostStr,
- completedContainer.getState,
- completedContainer.getExitStatus))
- // Hadoop 2.2.X added a ContainerExitStatus we should switch to use
- // there are some exit status' we shouldn't necessarily count against us, but for
- // now I think its ok as none of the containers are expected to exit.
- val exitStatus = completedContainer.getExitStatus
- val (exitCausedByApp, containerExitReason) = exitStatus match {
- case ContainerExitStatus.SUCCESS =>
- (false, s"Executor for container $containerId exited because of a YARN event (e.g., " +
- "pre-emption) and not because of an error in the running job.")
- case ContainerExitStatus.PREEMPTED =>
- // Preemption is not the fault of the running tasks, since YARN preempts containers
- // merely to do resource sharing, and tasks that fail due to preempted executors could
- // just as easily finish on any other executor. See SPARK-8167.
- (false, s"Container ${containerId}${onHostStr} was preempted.")
- // Should probably still count memory exceeded exit codes towards task failures
- case VMEM_EXCEEDED_EXIT_CODE =>
- (true, memLimitExceededLogMessage(
- completedContainer.getDiagnostics,
- VMEM_EXCEEDED_PATTERN))
- case PMEM_EXCEEDED_EXIT_CODE =>
- (true, memLimitExceededLogMessage(
- completedContainer.getDiagnostics,
- PMEM_EXCEEDED_PATTERN))
- case _ =>
- // Enqueue the timestamp of failed executor
- failedExecutorsTimeStamps.enqueue(clock.getTimeMillis())
- (true, "Container marked as failed: " + containerId + onHostStr +
- ". Exit status: " + completedContainer.getExitStatus +
- ". Diagnostics: " + completedContainer.getDiagnostics)
-
- }
- if (exitCausedByApp) {
- logWarning(containerExitReason)
- } else {
- logInfo(containerExitReason)
- }
- ExecutorExited(exitStatus, exitCausedByApp, containerExitReason)
- } else {
- // If we have already released this container, then it must mean
- // that the driver has explicitly requested it to be killed
- ExecutorExited(completedContainer.getExitStatus, exitCausedByApp = false,
- s"Container $containerId exited from explicit termination request.")
- }
-
- for {
- host <- hostOpt
- containerSet <- allocatedHostToContainersMap.get(host)
- } {
- containerSet.remove(containerId)
- if (containerSet.isEmpty) {
- allocatedHostToContainersMap.remove(host)
- } else {
- allocatedHostToContainersMap.update(host, containerSet)
- }
-
- allocatedContainerToHostMap.remove(containerId)
- }
-
- containerIdToExecutorId.remove(containerId).foreach { eid =>
- executorIdToContainer.remove(eid)
- pendingLossReasonRequests.remove(eid) match {
- case Some(pendingRequests) =>
- // Notify application of executor loss reasons so it can decide whether it should abort
- pendingRequests.foreach(_.reply(exitReason))
-
- case None =>
- // We cannot find executor for pending reasons. This is because completed container
- // is processed before querying pending result. We should store it for later query.
- // This is usually happened when explicitly killing a container, the result will be
- // returned in one AM-RM communication. So query RPC will be later than this completed
- // container process.
- releasedExecutorLossReasons.put(eid, exitReason)
- }
- if (!alreadyReleased) {
- // The executor could have gone away (like no route to host, node failure, etc)
- // Notify backend about the failure of the executor
- numUnexpectedContainerRelease += 1
- driverRef.send(RemoveExecutor(eid, exitReason))
- }
- }
- }
- }
-
- /**
- * Register that some RpcCallContext has asked the AM why the executor was lost. Note that
- * we can only find the loss reason to send back in the next call to allocateResources().
- */
- private[yarn] def enqueueGetLossReasonRequest(
- eid: String,
- context: RpcCallContext): Unit = synchronized {
- if (executorIdToContainer.contains(eid)) {
- pendingLossReasonRequests
- .getOrElseUpdate(eid, new ArrayBuffer[RpcCallContext]) += context
- } else if (releasedExecutorLossReasons.contains(eid)) {
- // Executor is already released explicitly before getting the loss reason, so directly send
- // the pre-stored lost reason
- context.reply(releasedExecutorLossReasons.remove(eid).get)
- } else {
- logWarning(s"Tried to get the loss reason for non-existent executor $eid")
- context.sendFailure(
- new SparkException(s"Fail to find loss reason for non-existent executor $eid"))
- }
- }
-
- private def internalReleaseContainer(container: Container): Unit = {
- releasedContainers.add(container.getId())
- amClient.releaseAssignedContainer(container.getId())
- }
-
- private[yarn] def getNumUnexpectedContainerRelease = numUnexpectedContainerRelease
-
- private[yarn] def getNumPendingLossReasonRequests: Int = synchronized {
- pendingLossReasonRequests.size
- }
-
- /**
- * Split the pending container requests into 3 groups based on current localities of pending
- * tasks.
- * @param hostToLocalTaskCount a map of preferred hostname to possible task counts to be used as
- * container placement hint.
- * @param pendingAllocations A sequence of pending allocation container request.
- * @return A tuple of 3 sequences, first is a sequence of locality matched container
- * requests, second is a sequence of locality unmatched container requests, and third is a
- * sequence of locality free container requests.
- */
- private def splitPendingAllocationsByLocality(
- hostToLocalTaskCount: Map[String, Int],
- pendingAllocations: Seq[ContainerRequest]
- ): (Seq[ContainerRequest], Seq[ContainerRequest], Seq[ContainerRequest]) = {
- val localityMatched = ArrayBuffer[ContainerRequest]()
- val localityUnMatched = ArrayBuffer[ContainerRequest]()
- val localityFree = ArrayBuffer[ContainerRequest]()
-
- val preferredHosts = hostToLocalTaskCount.keySet
- pendingAllocations.foreach { cr =>
- val nodes = cr.getNodes
- if (nodes == null) {
- localityFree += cr
- } else if (nodes.asScala.toSet.intersect(preferredHosts).nonEmpty) {
- localityMatched += cr
- } else {
- localityUnMatched += cr
- }
- }
-
- (localityMatched.toSeq, localityUnMatched.toSeq, localityFree.toSeq)
- }
-
-}
-
-private object YarnAllocator {
- val MEM_REGEX = "[0-9.]+ [KMG]B"
- val PMEM_EXCEEDED_PATTERN =
- Pattern.compile(s"$MEM_REGEX of $MEM_REGEX physical memory used")
- val VMEM_EXCEEDED_PATTERN =
- Pattern.compile(s"$MEM_REGEX of $MEM_REGEX virtual memory used")
- val VMEM_EXCEEDED_EXIT_CODE = -103
- val PMEM_EXCEEDED_EXIT_CODE = -104
-
- def memLimitExceededLogMessage(diagnostics: String, pattern: Pattern): String = {
- val matcher = pattern.matcher(diagnostics)
- val diag = if (matcher.find()) " " + matcher.group() + "." else ""
- ("Container killed by YARN for exceeding memory limits." + diag
- + " Consider boosting spark.yarn.executor.memoryOverhead.")
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
deleted file mode 100644
index 53df11e..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import java.util.{List => JList}
-
-import scala.collection.JavaConverters._
-import scala.util.Try
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.yarn.api.records._
-import org.apache.hadoop.yarn.client.api.AMRMClient
-import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
-import org.apache.hadoop.yarn.conf.YarnConfiguration
-import org.apache.hadoop.yarn.webapp.util.WebAppUtils
-
-import org.apache.spark.{SecurityManager, SparkConf}
-import org.apache.spark.deploy.yarn.config._
-import org.apache.spark.internal.Logging
-import org.apache.spark.rpc.RpcEndpointRef
-import org.apache.spark.util.Utils
-
-/**
- * Handles registering and unregistering the application with the YARN ResourceManager.
- */
-private[spark] class YarnRMClient extends Logging {
-
- private var amClient: AMRMClient[ContainerRequest] = _
- private var uiHistoryAddress: String = _
- private var registered: Boolean = false
-
- /**
- * Registers the application master with the RM.
- *
- * @param conf The Yarn configuration.
- * @param sparkConf The Spark configuration.
- * @param uiAddress Address of the SparkUI.
- * @param uiHistoryAddress Address of the application on the History Server.
- * @param securityMgr The security manager.
- * @param localResources Map with information about files distributed via YARN's cache.
- */
- def register(
- driverUrl: String,
- driverRef: RpcEndpointRef,
- conf: YarnConfiguration,
- sparkConf: SparkConf,
- uiAddress: String,
- uiHistoryAddress: String,
- securityMgr: SecurityManager,
- localResources: Map[String, LocalResource]
- ): YarnAllocator = {
- amClient = AMRMClient.createAMRMClient()
- amClient.init(conf)
- amClient.start()
- this.uiHistoryAddress = uiHistoryAddress
-
- logInfo("Registering the ApplicationMaster")
- synchronized {
- amClient.registerApplicationMaster(Utils.localHostName(), 0, uiAddress)
- registered = true
- }
- new YarnAllocator(driverUrl, driverRef, conf, sparkConf, amClient, getAttemptId(), securityMgr,
- localResources)
- }
-
- /**
- * Unregister the AM. Guaranteed to only be called once.
- *
- * @param status The final status of the AM.
- * @param diagnostics Diagnostics message to include in the final status.
- */
- def unregister(status: FinalApplicationStatus, diagnostics: String = ""): Unit = synchronized {
- if (registered) {
- amClient.unregisterApplicationMaster(status, diagnostics, uiHistoryAddress)
- }
- }
-
- /** Returns the attempt ID. */
- def getAttemptId(): ApplicationAttemptId = {
- YarnSparkHadoopUtil.get.getContainerId.getApplicationAttemptId()
- }
-
- /** Returns the configuration for the AmIpFilter to add to the Spark UI. */
- def getAmIpFilterParams(conf: YarnConfiguration, proxyBase: String): Map[String, String] = {
- // Figure out which scheme Yarn is using. Note the method seems to have been added after 2.2,
- // so not all stable releases have it.
- val prefix = Try(classOf[WebAppUtils].getMethod("getHttpSchemePrefix", classOf[Configuration])
- .invoke(null, conf).asInstanceOf[String]).getOrElse("http://")
-
- // If running a new enough Yarn, use the HA-aware API for retrieving the RM addresses.
- try {
- val method = classOf[WebAppUtils].getMethod("getProxyHostsAndPortsForAmFilter",
- classOf[Configuration])
- val proxies = method.invoke(null, conf).asInstanceOf[JList[String]]
- val hosts = proxies.asScala.map { proxy => proxy.split(":")(0) }
- val uriBases = proxies.asScala.map { proxy => prefix + proxy + proxyBase }
- Map("PROXY_HOSTS" -> hosts.mkString(","), "PROXY_URI_BASES" -> uriBases.mkString(","))
- } catch {
- case e: NoSuchMethodException =>
- val proxy = WebAppUtils.getProxyHostAndPort(conf)
- val parts = proxy.split(":")
- val uriBase = prefix + proxy + proxyBase
- Map("PROXY_HOST" -> parts(0), "PROXY_URI_BASE" -> uriBase)
- }
- }
-
- /** Returns the maximum number of attempts to register the AM. */
- def getMaxRegAttempts(sparkConf: SparkConf, yarnConf: YarnConfiguration): Int = {
- val sparkMaxAttempts = sparkConf.get(MAX_APP_ATTEMPTS).map(_.toInt)
- val yarnMaxAttempts = yarnConf.getInt(
- YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS)
- val retval: Int = sparkMaxAttempts match {
- case Some(x) => if (x <= yarnMaxAttempts) x else yarnMaxAttempts
- case None => yarnMaxAttempts
- }
-
- retval
- }
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala
deleted file mode 100644
index cc53b1b..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala
+++ /dev/null
@@ -1,317 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import java.io.File
-import java.nio.charset.StandardCharsets.UTF_8
-import java.util.regex.Matcher
-import java.util.regex.Pattern
-
-import scala.collection.mutable.{HashMap, ListBuffer}
-import scala.util.Try
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.io.Text
-import org.apache.hadoop.mapred.JobConf
-import org.apache.hadoop.security.Credentials
-import org.apache.hadoop.security.UserGroupInformation
-import org.apache.hadoop.yarn.api.ApplicationConstants
-import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
-import org.apache.hadoop.yarn.api.records.{ApplicationAccessType, ContainerId, Priority}
-import org.apache.hadoop.yarn.conf.YarnConfiguration
-import org.apache.hadoop.yarn.util.ConverterUtils
-
-import org.apache.spark.{SecurityManager, SparkConf, SparkException}
-import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.deploy.yarn.security.{ConfigurableCredentialManager, CredentialUpdater}
-import org.apache.spark.internal.config._
-import org.apache.spark.launcher.YarnCommandBuilderUtils
-import org.apache.spark.util.Utils
-
-/**
- * Contains util methods to interact with Hadoop from spark.
- */
-class YarnSparkHadoopUtil extends SparkHadoopUtil {
-
- private var credentialUpdater: CredentialUpdater = _
-
- override def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation) {
- dest.addCredentials(source.getCredentials())
- }
-
- // Note that all params which start with SPARK are propagated all the way through, so if in yarn
- // mode, this MUST be set to true.
- override def isYarnMode(): Boolean = { true }
-
- // Return an appropriate (subclass) of Configuration. Creating a config initializes some Hadoop
- // subsystems. Always create a new config, don't reuse yarnConf.
- override def newConfiguration(conf: SparkConf): Configuration =
- new YarnConfiguration(super.newConfiguration(conf))
-
- // Add any user credentials to the job conf which are necessary for running on a secure Hadoop
- // cluster
- override def addCredentials(conf: JobConf) {
- val jobCreds = conf.getCredentials()
- jobCreds.mergeAll(UserGroupInformation.getCurrentUser().getCredentials())
- }
-
- override def getCurrentUserCredentials(): Credentials = {
- UserGroupInformation.getCurrentUser().getCredentials()
- }
-
- override def addCurrentUserCredentials(creds: Credentials) {
- UserGroupInformation.getCurrentUser().addCredentials(creds)
- }
-
- override def addSecretKeyToUserCredentials(key: String, secret: String) {
- val creds = new Credentials()
- creds.addSecretKey(new Text(key), secret.getBytes(UTF_8))
- addCurrentUserCredentials(creds)
- }
-
- override def getSecretKeyFromUserCredentials(key: String): Array[Byte] = {
- val credentials = getCurrentUserCredentials()
- if (credentials != null) credentials.getSecretKey(new Text(key)) else null
- }
-
- private[spark] override def startCredentialUpdater(sparkConf: SparkConf): Unit = {
- credentialUpdater =
- new ConfigurableCredentialManager(sparkConf, newConfiguration(sparkConf)).credentialUpdater()
- credentialUpdater.start()
- }
-
- private[spark] override def stopCredentialUpdater(): Unit = {
- if (credentialUpdater != null) {
- credentialUpdater.stop()
- credentialUpdater = null
- }
- }
-
- private[spark] def getContainerId: ContainerId = {
- val containerIdString = System.getenv(ApplicationConstants.Environment.CONTAINER_ID.name())
- ConverterUtils.toContainerId(containerIdString)
- }
-}
-
-object YarnSparkHadoopUtil {
- // Additional memory overhead
- // 10% was arrived at experimentally. In the interest of minimizing memory waste while covering
- // the common cases. Memory overhead tends to grow with container size.
-
- val MEMORY_OVERHEAD_FACTOR = 0.10
- val MEMORY_OVERHEAD_MIN = 384L
-
- val ANY_HOST = "*"
-
- val DEFAULT_NUMBER_EXECUTORS = 2
-
- // All RM requests are issued with same priority : we do not (yet) have any distinction between
- // request types (like map/reduce in hadoop for example)
- val RM_REQUEST_PRIORITY = Priority.newInstance(1)
-
- def get: YarnSparkHadoopUtil = {
- val yarnMode = java.lang.Boolean.parseBoolean(
- System.getProperty("SPARK_YARN_MODE", System.getenv("SPARK_YARN_MODE")))
- if (!yarnMode) {
- throw new SparkException("YarnSparkHadoopUtil is not available in non-YARN mode!")
- }
- SparkHadoopUtil.get.asInstanceOf[YarnSparkHadoopUtil]
- }
- /**
- * Add a path variable to the given environment map.
- * If the map already contains this key, append the value to the existing value instead.
- */
- def addPathToEnvironment(env: HashMap[String, String], key: String, value: String): Unit = {
- val newValue = if (env.contains(key)) { env(key) + getClassPathSeparator + value } else value
- env.put(key, newValue)
- }
-
- /**
- * Set zero or more environment variables specified by the given input string.
- * The input string is expected to take the form "KEY1=VAL1,KEY2=VAL2,KEY3=VAL3".
- */
- def setEnvFromInputString(env: HashMap[String, String], inputString: String): Unit = {
- if (inputString != null && inputString.length() > 0) {
- val childEnvs = inputString.split(",")
- val p = Pattern.compile(environmentVariableRegex)
- for (cEnv <- childEnvs) {
- val parts = cEnv.split("=") // split on '='
- val m = p.matcher(parts(1))
- val sb = new StringBuffer
- while (m.find()) {
- val variable = m.group(1)
- var replace = ""
- if (env.get(variable) != None) {
- replace = env.get(variable).get
- } else {
- // if this key is not configured for the child .. get it from the env
- replace = System.getenv(variable)
- if (replace == null) {
- // the env key is note present anywhere .. simply set it
- replace = ""
- }
- }
- m.appendReplacement(sb, Matcher.quoteReplacement(replace))
- }
- m.appendTail(sb)
- // This treats the environment variable as path variable delimited by `File.pathSeparator`
- // This is kept for backward compatibility and consistency with Hadoop's behavior
- addPathToEnvironment(env, parts(0), sb.toString)
- }
- }
- }
-
- private val environmentVariableRegex: String = {
- if (Utils.isWindows) {
- "%([A-Za-z_][A-Za-z0-9_]*?)%"
- } else {
- "\\$([A-Za-z_][A-Za-z0-9_]*)"
- }
- }
-
- /**
- * Kill if OOM is raised - leverage yarn's failure handling to cause rescheduling.
- * Not killing the task leaves various aspects of the executor and (to some extent) the jvm in
- * an inconsistent state.
- * TODO: If the OOM is not recoverable by rescheduling it on different node, then do
- * 'something' to fail job ... akin to blacklisting trackers in mapred ?
- *
- * The handler if an OOM Exception is thrown by the JVM must be configured on Windows
- * differently: the 'taskkill' command should be used, whereas Unix-based systems use 'kill'.
- *
- * As the JVM interprets both %p and %%p as the same, we can use either of them. However,
- * some tests on Windows computers suggest, that the JVM only accepts '%%p'.
- *
- * Furthermore, the behavior of the character '%' on the Windows command line differs from
- * the behavior of '%' in a .cmd file: it gets interpreted as an incomplete environment
- * variable. Windows .cmd files escape a '%' by '%%'. Thus, the correct way of writing
- * '%%p' in an escaped way is '%%%%p'.
- */
- private[yarn] def addOutOfMemoryErrorArgument(javaOpts: ListBuffer[String]): Unit = {
- if (!javaOpts.exists(_.contains("-XX:OnOutOfMemoryError"))) {
- if (Utils.isWindows) {
- javaOpts += escapeForShell("-XX:OnOutOfMemoryError=taskkill /F /PID %%%%p")
- } else {
- javaOpts += "-XX:OnOutOfMemoryError='kill %p'"
- }
- }
- }
-
- /**
- * Escapes a string for inclusion in a command line executed by Yarn. Yarn executes commands
- * using either
- *
- * (Unix-based) `bash -c "command arg1 arg2"` and that means plain quoting doesn't really work.
- * The argument is enclosed in single quotes and some key characters are escaped.
- *
- * (Windows-based) part of a .cmd file in which case windows escaping for each argument must be
- * applied. Windows is quite lenient, however it is usually Java that causes trouble, needing to
- * distinguish between arguments starting with '-' and class names. If arguments are surrounded
- * by ' java takes the following string as is, hence an argument is mistakenly taken as a class
- * name which happens to start with a '-'. The way to avoid this, is to surround nothing with
- * a ', but instead with a ".
- *
- * @param arg A single argument.
- * @return Argument quoted for execution via Yarn's generated shell script.
- */
- def escapeForShell(arg: String): String = {
- if (arg != null) {
- if (Utils.isWindows) {
- YarnCommandBuilderUtils.quoteForBatchScript(arg)
- } else {
- val escaped = new StringBuilder("'")
- for (i <- 0 to arg.length() - 1) {
- arg.charAt(i) match {
- case '$' => escaped.append("\\$")
- case '"' => escaped.append("\\\"")
- case '\'' => escaped.append("'\\''")
- case c => escaped.append(c)
- }
- }
- escaped.append("'").toString()
- }
- } else {
- arg
- }
- }
-
- // YARN/Hadoop acls are specified as user1,user2 group1,group2
- // Users and groups are separated by a space and hence we need to pass the acls in same format
- def getApplicationAclsForYarn(securityMgr: SecurityManager)
- : Map[ApplicationAccessType, String] = {
- Map[ApplicationAccessType, String] (
- ApplicationAccessType.VIEW_APP -> (securityMgr.getViewAcls + " " +
- securityMgr.getViewAclsGroups),
- ApplicationAccessType.MODIFY_APP -> (securityMgr.getModifyAcls + " " +
- securityMgr.getModifyAclsGroups)
- )
- }
-
- /**
- * Expand environment variable using Yarn API.
- * If environment.$$() is implemented, return the result of it.
- * Otherwise, return the result of environment.$()
- * Note: $$() is added in Hadoop 2.4.
- */
- private lazy val expandMethod =
- Try(classOf[Environment].getMethod("$$"))
- .getOrElse(classOf[Environment].getMethod("$"))
-
- def expandEnvironment(environment: Environment): String =
- expandMethod.invoke(environment).asInstanceOf[String]
-
- /**
- * Get class path separator using Yarn API.
- * If ApplicationConstants.CLASS_PATH_SEPARATOR is implemented, return it.
- * Otherwise, return File.pathSeparator
- * Note: CLASS_PATH_SEPARATOR is added in Hadoop 2.4.
- */
- private lazy val classPathSeparatorField =
- Try(classOf[ApplicationConstants].getField("CLASS_PATH_SEPARATOR"))
- .getOrElse(classOf[File].getField("pathSeparator"))
-
- def getClassPathSeparator(): String = {
- classPathSeparatorField.get(null).asInstanceOf[String]
- }
-
- /**
- * Getting the initial target number of executors depends on whether dynamic allocation is
- * enabled.
- * If not using dynamic allocation it gets the number of executors requested by the user.
- */
- def getInitialTargetExecutorNumber(
- conf: SparkConf,
- numExecutors: Int = DEFAULT_NUMBER_EXECUTORS): Int = {
- if (Utils.isDynamicAllocationEnabled(conf)) {
- val minNumExecutors = conf.get(DYN_ALLOCATION_MIN_EXECUTORS)
- val initialNumExecutors = Utils.getDynamicAllocationInitialExecutors(conf)
- val maxNumExecutors = conf.get(DYN_ALLOCATION_MAX_EXECUTORS)
- require(initialNumExecutors >= minNumExecutors && initialNumExecutors <= maxNumExecutors,
- s"initial executor number $initialNumExecutors must between min executor number " +
- s"$minNumExecutors and max executor number $maxNumExecutors")
-
- initialNumExecutors
- } else {
- val targetNumExecutors =
- sys.env.get("SPARK_EXECUTOR_INSTANCES").map(_.toInt).getOrElse(numExecutors)
- // System property can override environment variable.
- conf.get(EXECUTOR_INSTANCES).getOrElse(targetNumExecutors)
- }
- }
-}
-
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala
deleted file mode 100644
index 666cb45..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala
+++ /dev/null
@@ -1,347 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import java.util.concurrent.TimeUnit
-
-import org.apache.spark.internal.config.ConfigBuilder
-import org.apache.spark.network.util.ByteUnit
-
-package object config {
-
- /* Common app configuration. */
-
- private[spark] val APPLICATION_TAGS = ConfigBuilder("spark.yarn.tags")
- .doc("Comma-separated list of strings to pass through as YARN application tags appearing " +
- "in YARN Application Reports, which can be used for filtering when querying YARN.")
- .stringConf
- .toSequence
- .createOptional
-
- private[spark] val AM_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS =
- ConfigBuilder("spark.yarn.am.attemptFailuresValidityInterval")
- .doc("Interval after which AM failures will be considered independent and " +
- "not accumulate towards the attempt count.")
- .timeConf(TimeUnit.MILLISECONDS)
- .createOptional
-
- private[spark] val AM_PORT =
- ConfigBuilder("spark.yarn.am.port")
- .intConf
- .createWithDefault(0)
-
- private[spark] val EXECUTOR_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS =
- ConfigBuilder("spark.yarn.executor.failuresValidityInterval")
- .doc("Interval after which Executor failures will be considered independent and not " +
- "accumulate towards the attempt count.")
- .timeConf(TimeUnit.MILLISECONDS)
- .createOptional
-
- private[spark] val MAX_APP_ATTEMPTS = ConfigBuilder("spark.yarn.maxAppAttempts")
- .doc("Maximum number of AM attempts before failing the app.")
- .intConf
- .createOptional
-
- private[spark] val USER_CLASS_PATH_FIRST = ConfigBuilder("spark.yarn.user.classpath.first")
- .doc("Whether to place user jars in front of Spark's classpath.")
- .booleanConf
- .createWithDefault(false)
-
- private[spark] val GATEWAY_ROOT_PATH = ConfigBuilder("spark.yarn.config.gatewayPath")
- .doc("Root of configuration paths that is present on gateway nodes, and will be replaced " +
- "with the corresponding path in cluster machines.")
- .stringConf
- .createWithDefault(null)
-
- private[spark] val REPLACEMENT_ROOT_PATH = ConfigBuilder("spark.yarn.config.replacementPath")
- .doc(s"Path to use as a replacement for ${GATEWAY_ROOT_PATH.key} when launching processes " +
- "in the YARN cluster.")
- .stringConf
- .createWithDefault(null)
-
- private[spark] val QUEUE_NAME = ConfigBuilder("spark.yarn.queue")
- .stringConf
- .createWithDefault("default")
-
- private[spark] val HISTORY_SERVER_ADDRESS = ConfigBuilder("spark.yarn.historyServer.address")
- .stringConf
- .createOptional
-
- /* File distribution. */
-
- private[spark] val SPARK_ARCHIVE = ConfigBuilder("spark.yarn.archive")
- .doc("Location of archive containing jars files with Spark classes.")
- .stringConf
- .createOptional
-
- private[spark] val SPARK_JARS = ConfigBuilder("spark.yarn.jars")
- .doc("Location of jars containing Spark classes.")
- .stringConf
- .toSequence
- .createOptional
-
- private[spark] val ARCHIVES_TO_DISTRIBUTE = ConfigBuilder("spark.yarn.dist.archives")
- .stringConf
- .toSequence
- .createWithDefault(Nil)
-
- private[spark] val FILES_TO_DISTRIBUTE = ConfigBuilder("spark.yarn.dist.files")
- .stringConf
- .toSequence
- .createWithDefault(Nil)
-
- private[spark] val JARS_TO_DISTRIBUTE = ConfigBuilder("spark.yarn.dist.jars")
- .stringConf
- .toSequence
- .createWithDefault(Nil)
-
- private[spark] val PRESERVE_STAGING_FILES = ConfigBuilder("spark.yarn.preserve.staging.files")
- .doc("Whether to preserve temporary files created by the job in HDFS.")
- .booleanConf
- .createWithDefault(false)
-
- private[spark] val STAGING_FILE_REPLICATION = ConfigBuilder("spark.yarn.submit.file.replication")
- .doc("Replication factor for files uploaded by Spark to HDFS.")
- .intConf
- .createOptional
-
- private[spark] val STAGING_DIR = ConfigBuilder("spark.yarn.stagingDir")
- .doc("Staging directory used while submitting applications.")
- .stringConf
- .createOptional
-
- /* Cluster-mode launcher configuration. */
-
- private[spark] val WAIT_FOR_APP_COMPLETION = ConfigBuilder("spark.yarn.submit.waitAppCompletion")
- .doc("In cluster mode, whether to wait for the application to finish before exiting the " +
- "launcher process.")
- .booleanConf
- .createWithDefault(true)
-
- private[spark] val REPORT_INTERVAL = ConfigBuilder("spark.yarn.report.interval")
- .doc("Interval between reports of the current app status in cluster mode.")
- .timeConf(TimeUnit.MILLISECONDS)
- .createWithDefaultString("1s")
-
- /* Shared Client-mode AM / Driver configuration. */
-
- private[spark] val AM_MAX_WAIT_TIME = ConfigBuilder("spark.yarn.am.waitTime")
- .timeConf(TimeUnit.MILLISECONDS)
- .createWithDefaultString("100s")
-
- private[spark] val AM_NODE_LABEL_EXPRESSION = ConfigBuilder("spark.yarn.am.nodeLabelExpression")
- .doc("Node label expression for the AM.")
- .stringConf
- .createOptional
-
- private[spark] val CONTAINER_LAUNCH_MAX_THREADS =
- ConfigBuilder("spark.yarn.containerLauncherMaxThreads")
- .intConf
- .createWithDefault(25)
-
- private[spark] val MAX_EXECUTOR_FAILURES = ConfigBuilder("spark.yarn.max.executor.failures")
- .intConf
- .createOptional
-
- private[spark] val MAX_REPORTER_THREAD_FAILURES =
- ConfigBuilder("spark.yarn.scheduler.reporterThread.maxFailures")
- .intConf
- .createWithDefault(5)
-
- private[spark] val RM_HEARTBEAT_INTERVAL =
- ConfigBuilder("spark.yarn.scheduler.heartbeat.interval-ms")
- .timeConf(TimeUnit.MILLISECONDS)
- .createWithDefaultString("3s")
-
- private[spark] val INITIAL_HEARTBEAT_INTERVAL =
- ConfigBuilder("spark.yarn.scheduler.initial-allocation.interval")
- .timeConf(TimeUnit.MILLISECONDS)
- .createWithDefaultString("200ms")
-
- private[spark] val SCHEDULER_SERVICES = ConfigBuilder("spark.yarn.services")
- .doc("A comma-separated list of class names of services to add to the scheduler.")
- .stringConf
- .toSequence
- .createWithDefault(Nil)
-
- /* Client-mode AM configuration. */
-
- private[spark] val AM_CORES = ConfigBuilder("spark.yarn.am.cores")
- .intConf
- .createWithDefault(1)
-
- private[spark] val AM_JAVA_OPTIONS = ConfigBuilder("spark.yarn.am.extraJavaOptions")
- .doc("Extra Java options for the client-mode AM.")
- .stringConf
- .createOptional
-
- private[spark] val AM_LIBRARY_PATH = ConfigBuilder("spark.yarn.am.extraLibraryPath")
- .doc("Extra native library path for the client-mode AM.")
- .stringConf
- .createOptional
-
- private[spark] val AM_MEMORY_OVERHEAD = ConfigBuilder("spark.yarn.am.memoryOverhead")
- .bytesConf(ByteUnit.MiB)
- .createOptional
-
- private[spark] val AM_MEMORY = ConfigBuilder("spark.yarn.am.memory")
- .bytesConf(ByteUnit.MiB)
- .createWithDefaultString("512m")
-
- /* Driver configuration. */
-
- private[spark] val DRIVER_CORES = ConfigBuilder("spark.driver.cores")
- .intConf
- .createWithDefault(1)
-
- private[spark] val DRIVER_MEMORY_OVERHEAD = ConfigBuilder("spark.yarn.driver.memoryOverhead")
- .bytesConf(ByteUnit.MiB)
- .createOptional
-
- /* Executor configuration. */
-
- private[spark] val EXECUTOR_CORES = ConfigBuilder("spark.executor.cores")
- .intConf
- .createWithDefault(1)
-
- private[spark] val EXECUTOR_MEMORY_OVERHEAD = ConfigBuilder("spark.yarn.executor.memoryOverhead")
- .bytesConf(ByteUnit.MiB)
- .createOptional
-
- private[spark] val EXECUTOR_NODE_LABEL_EXPRESSION =
- ConfigBuilder("spark.yarn.executor.nodeLabelExpression")
- .doc("Node label expression for executors.")
- .stringConf
- .createOptional
-
- /* Security configuration. */
-
- private[spark] val CREDENTIAL_FILE_MAX_COUNT =
- ConfigBuilder("spark.yarn.credentials.file.retention.count")
- .intConf
- .createWithDefault(5)
-
- private[spark] val CREDENTIALS_FILE_MAX_RETENTION =
- ConfigBuilder("spark.yarn.credentials.file.retention.days")
- .intConf
- .createWithDefault(5)
-
- private[spark] val NAMENODES_TO_ACCESS = ConfigBuilder("spark.yarn.access.namenodes")
- .doc("Extra NameNode URLs for which to request delegation tokens. The NameNode that hosts " +
- "fs.defaultFS does not need to be listed here.")
- .stringConf
- .toSequence
- .createWithDefault(Nil)
-
- /* Rolled log aggregation configuration. */
-
- private[spark] val ROLLED_LOG_INCLUDE_PATTERN =
- ConfigBuilder("spark.yarn.rolledLog.includePattern")
- .doc("Java Regex to filter the log files which match the defined include pattern and those " +
- "log files will be aggregated in a rolling fashion.")
- .stringConf
- .createOptional
-
- private[spark] val ROLLED_LOG_EXCLUDE_PATTERN =
- ConfigBuilder("spark.yarn.rolledLog.excludePattern")
- .doc("Java Regex to filter the log files which match the defined exclude pattern and those " +
- "log files will not be aggregated in a rolling fashion.")
- .stringConf
- .createOptional
-
- /* Private configs. */
-
- private[spark] val CREDENTIALS_FILE_PATH = ConfigBuilder("spark.yarn.credentials.file")
- .internal()
- .stringConf
- .createWithDefault(null)
-
- // Internal config to propagate the location of the user's jar to the driver/executors
- private[spark] val APP_JAR = ConfigBuilder("spark.yarn.user.jar")
- .internal()
- .stringConf
- .createOptional
-
- // Internal config to propagate the locations of any extra jars to add to the classpath
- // of the executors
- private[spark] val SECONDARY_JARS = ConfigBuilder("spark.yarn.secondary.jars")
- .internal()
- .stringConf
- .toSequence
- .createOptional
-
- /* Configuration and cached file propagation. */
-
- private[spark] val CACHED_FILES = ConfigBuilder("spark.yarn.cache.filenames")
- .internal()
- .stringConf
- .toSequence
- .createWithDefault(Nil)
-
- private[spark] val CACHED_FILES_SIZES = ConfigBuilder("spark.yarn.cache.sizes")
- .internal()
- .longConf
- .toSequence
- .createWithDefault(Nil)
-
- private[spark] val CACHED_FILES_TIMESTAMPS = ConfigBuilder("spark.yarn.cache.timestamps")
- .internal()
- .longConf
- .toSequence
- .createWithDefault(Nil)
-
- private[spark] val CACHED_FILES_VISIBILITIES = ConfigBuilder("spark.yarn.cache.visibilities")
- .internal()
- .stringConf
- .toSequence
- .createWithDefault(Nil)
-
- // Either "file" or "archive", for each file.
- private[spark] val CACHED_FILES_TYPES = ConfigBuilder("spark.yarn.cache.types")
- .internal()
- .stringConf
- .toSequence
- .createWithDefault(Nil)
-
- // The location of the conf archive in HDFS.
- private[spark] val CACHED_CONF_ARCHIVE = ConfigBuilder("spark.yarn.cache.confArchive")
- .internal()
- .stringConf
- .createOptional
-
- private[spark] val CREDENTIALS_RENEWAL_TIME = ConfigBuilder("spark.yarn.credentials.renewalTime")
- .internal()
- .timeConf(TimeUnit.MILLISECONDS)
- .createWithDefault(Long.MaxValue)
-
- private[spark] val CREDENTIALS_UPDATE_TIME = ConfigBuilder("spark.yarn.credentials.updateTime")
- .internal()
- .timeConf(TimeUnit.MILLISECONDS)
- .createWithDefault(Long.MaxValue)
-
- // The list of cache-related config entries. This is used by Client and the AM to clean
- // up the environment so that these settings do not appear on the web UI.
- private[yarn] val CACHE_CONFIGS = Seq(
- CACHED_FILES,
- CACHED_FILES_SIZES,
- CACHED_FILES_TIMESTAMPS,
- CACHED_FILES_VISIBILITIES,
- CACHED_FILES_TYPES,
- CACHED_CONF_ARCHIVE)
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/AMCredentialRenewer.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/AMCredentialRenewer.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/AMCredentialRenewer.scala
deleted file mode 100644
index 7e76f40..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/AMCredentialRenewer.scala
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.deploy.yarn.security
-
-import java.security.PrivilegedExceptionAction
-import java.util.concurrent.{Executors, TimeUnit}
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileSystem, Path}
-import org.apache.hadoop.security.UserGroupInformation
-
-import org.apache.spark.SparkConf
-import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil
-import org.apache.spark.deploy.yarn.config._
-import org.apache.spark.internal.Logging
-import org.apache.spark.internal.config._
-import org.apache.spark.util.ThreadUtils
-
-/**
- * The following methods are primarily meant to make sure long-running apps like Spark
- * Streaming apps can run without interruption while accessing secured services. The
- * scheduleLoginFromKeytab method is called on the AM to get the new credentials.
- * This method wakes up a thread that logs into the KDC
- * once 75% of the renewal interval of the original credentials used for the container
- * has elapsed. It then obtains new credentials and writes them to HDFS in a
- * pre-specified location - the prefix of which is specified in the sparkConf by
- * spark.yarn.credentials.file (so the file(s) would be named c-timestamp1-1, c-timestamp2-2 etc.
- * - each update goes to a new file, with a monotonically increasing suffix), also the
- * timestamp1, timestamp2 here indicates the time of next update for CredentialUpdater.
- * After this, the credentials are renewed once 75% of the new tokens renewal interval has elapsed.
- *
- * On the executor and driver (yarn client mode) side, the updateCredentialsIfRequired method is
- * called once 80% of the validity of the original credentials has elapsed. At that time the
- * executor finds the credentials file with the latest timestamp and checks if it has read those
- * credentials before (by keeping track of the suffix of the last file it read). If a new file has
- * appeared, it will read the credentials and update the currently running UGI with it. This
- * process happens again once 80% of the validity of this has expired.
- */
-private[yarn] class AMCredentialRenewer(
- sparkConf: SparkConf,
- hadoopConf: Configuration,
- credentialManager: ConfigurableCredentialManager) extends Logging {
-
- private var lastCredentialsFileSuffix = 0
-
- private val credentialRenewer =
- Executors.newSingleThreadScheduledExecutor(
- ThreadUtils.namedThreadFactory("Credential Refresh Thread"))
-
- private val hadoopUtil = YarnSparkHadoopUtil.get
-
- private val credentialsFile = sparkConf.get(CREDENTIALS_FILE_PATH)
- private val daysToKeepFiles = sparkConf.get(CREDENTIALS_FILE_MAX_RETENTION)
- private val numFilesToKeep = sparkConf.get(CREDENTIAL_FILE_MAX_COUNT)
- private val freshHadoopConf =
- hadoopUtil.getConfBypassingFSCache(hadoopConf, new Path(credentialsFile).toUri.getScheme)
-
- @volatile private var timeOfNextRenewal = sparkConf.get(CREDENTIALS_RENEWAL_TIME)
-
- /**
- * Schedule a login from the keytab and principal set using the --principal and --keytab
- * arguments to spark-submit. This login happens only when the credentials of the current user
- * are about to expire. This method reads spark.yarn.principal and spark.yarn.keytab from
- * SparkConf to do the login. This method is a no-op in non-YARN mode.
- *
- */
- private[spark] def scheduleLoginFromKeytab(): Unit = {
- val principal = sparkConf.get(PRINCIPAL).get
- val keytab = sparkConf.get(KEYTAB).get
-
- /**
- * Schedule re-login and creation of new credentials. If credentials have already expired, this
- * method will synchronously create new ones.
- */
- def scheduleRenewal(runnable: Runnable): Unit = {
- // Run now!
- val remainingTime = timeOfNextRenewal - System.currentTimeMillis()
- if (remainingTime <= 0) {
- logInfo("Credentials have expired, creating new ones now.")
- runnable.run()
- } else {
- logInfo(s"Scheduling login from keytab in $remainingTime millis.")
- credentialRenewer.schedule(runnable, remainingTime, TimeUnit.MILLISECONDS)
- }
- }
-
- // This thread periodically runs on the AM to update the credentials on HDFS.
- val credentialRenewerRunnable =
- new Runnable {
- override def run(): Unit = {
- try {
- writeNewCredentialsToHDFS(principal, keytab)
- cleanupOldFiles()
- } catch {
- case e: Exception =>
- // Log the error and try to write new tokens back in an hour
- logWarning("Failed to write out new credentials to HDFS, will try again in an " +
- "hour! If this happens too often tasks will fail.", e)
- credentialRenewer.schedule(this, 1, TimeUnit.HOURS)
- return
- }
- scheduleRenewal(this)
- }
- }
- // Schedule update of credentials. This handles the case of updating the credentials right now
- // as well, since the renewal interval will be 0, and the thread will get scheduled
- // immediately.
- scheduleRenewal(credentialRenewerRunnable)
- }
-
- // Keeps only files that are newer than daysToKeepFiles days, and deletes everything else. At
- // least numFilesToKeep files are kept for safety
- private def cleanupOldFiles(): Unit = {
- import scala.concurrent.duration._
- try {
- val remoteFs = FileSystem.get(freshHadoopConf)
- val credentialsPath = new Path(credentialsFile)
- val thresholdTime = System.currentTimeMillis() - (daysToKeepFiles.days).toMillis
- hadoopUtil.listFilesSorted(
- remoteFs, credentialsPath.getParent,
- credentialsPath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION)
- .dropRight(numFilesToKeep)
- .takeWhile(_.getModificationTime < thresholdTime)
- .foreach(x => remoteFs.delete(x.getPath, true))
- } catch {
- // Such errors are not fatal, so don't throw. Make sure they are logged though
- case e: Exception =>
- logWarning("Error while attempting to cleanup old credentials. If you are seeing many " +
- "such warnings there may be an issue with your HDFS cluster.", e)
- }
- }
-
- private def writeNewCredentialsToHDFS(principal: String, keytab: String): Unit = {
- // Keytab is copied by YARN to the working directory of the AM, so full path is
- // not needed.
-
- // HACK:
- // HDFS will not issue new delegation tokens, if the Credentials object
- // passed in already has tokens for that FS even if the tokens are expired (it really only
- // checks if there are tokens for the service, and not if they are valid). So the only real
- // way to get new tokens is to make sure a different Credentials object is used each time to
- // get new tokens and then the new tokens are copied over the current user's Credentials.
- // So:
- // - we login as a different user and get the UGI
- // - use that UGI to get the tokens (see doAs block below)
- // - copy the tokens over to the current user's credentials (this will overwrite the tokens
- // in the current user's Credentials object for this FS).
- // The login to KDC happens each time new tokens are required, but this is rare enough to not
- // have to worry about (like once every day or so). This makes this code clearer than having
- // to login and then relogin every time (the HDFS API may not relogin since we don't use this
- // UGI directly for HDFS communication.
- logInfo(s"Attempting to login to KDC using principal: $principal")
- val keytabLoggedInUGI = UserGroupInformation.loginUserFromKeytabAndReturnUGI(principal, keytab)
- logInfo("Successfully logged into KDC.")
- val tempCreds = keytabLoggedInUGI.getCredentials
- val credentialsPath = new Path(credentialsFile)
- val dst = credentialsPath.getParent
- var nearestNextRenewalTime = Long.MaxValue
- keytabLoggedInUGI.doAs(new PrivilegedExceptionAction[Void] {
- // Get a copy of the credentials
- override def run(): Void = {
- nearestNextRenewalTime = credentialManager.obtainCredentials(freshHadoopConf, tempCreds)
- null
- }
- })
-
- val currTime = System.currentTimeMillis()
- val timeOfNextUpdate = if (nearestNextRenewalTime <= currTime) {
- // If next renewal time is earlier than current time, we set next renewal time to current
- // time, this will trigger next renewal immediately. Also set next update time to current
- // time. There still has a gap between token renewal and update will potentially introduce
- // issue.
- logWarning(s"Next credential renewal time ($nearestNextRenewalTime) is earlier than " +
- s"current time ($currTime), which is unexpected, please check your credential renewal " +
- "related configurations in the target services.")
- timeOfNextRenewal = currTime
- currTime
- } else {
- // Next valid renewal time is about 75% of credential renewal time, and update time is
- // slightly later than valid renewal time (80% of renewal time).
- timeOfNextRenewal = ((nearestNextRenewalTime - currTime) * 0.75 + currTime).toLong
- ((nearestNextRenewalTime - currTime) * 0.8 + currTime).toLong
- }
-
- // Add the temp credentials back to the original ones.
- UserGroupInformation.getCurrentUser.addCredentials(tempCreds)
- val remoteFs = FileSystem.get(freshHadoopConf)
- // If lastCredentialsFileSuffix is 0, then the AM is either started or restarted. If the AM
- // was restarted, then the lastCredentialsFileSuffix might be > 0, so find the newest file
- // and update the lastCredentialsFileSuffix.
- if (lastCredentialsFileSuffix == 0) {
- hadoopUtil.listFilesSorted(
- remoteFs, credentialsPath.getParent,
- credentialsPath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION)
- .lastOption.foreach { status =>
- lastCredentialsFileSuffix = hadoopUtil.getSuffixForCredentialsPath(status.getPath)
- }
- }
- val nextSuffix = lastCredentialsFileSuffix + 1
-
- val tokenPathStr =
- credentialsFile + SparkHadoopUtil.SPARK_YARN_CREDS_COUNTER_DELIM +
- timeOfNextUpdate.toLong.toString + SparkHadoopUtil.SPARK_YARN_CREDS_COUNTER_DELIM +
- nextSuffix
- val tokenPath = new Path(tokenPathStr)
- val tempTokenPath = new Path(tokenPathStr + SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION)
-
- logInfo("Writing out delegation tokens to " + tempTokenPath.toString)
- val credentials = UserGroupInformation.getCurrentUser.getCredentials
- credentials.writeTokenStorageFile(tempTokenPath, freshHadoopConf)
- logInfo(s"Delegation Tokens written out successfully. Renaming file to $tokenPathStr")
- remoteFs.rename(tempTokenPath, tokenPath)
- logInfo("Delegation token file rename complete.")
- lastCredentialsFileSuffix = nextSuffix
- }
-
- def stop(): Unit = {
- credentialRenewer.shutdown()
- }
-}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org
[07/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Posted by va...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala
new file mode 100644
index 0000000..7fbbe12
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala
@@ -0,0 +1,213 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn
+
+import java.io.{File, IOException}
+import java.nio.charset.StandardCharsets
+
+import com.google.common.io.{ByteStreams, Files}
+import org.apache.hadoop.io.Text
+import org.apache.hadoop.yarn.api.ApplicationConstants
+import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
+import org.apache.hadoop.yarn.api.records.ApplicationAccessType
+import org.apache.hadoop.yarn.conf.YarnConfiguration
+import org.scalatest.Matchers
+
+import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite}
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.internal.Logging
+import org.apache.spark.util.{ResetSystemProperties, Utils}
+
+class YarnSparkHadoopUtilSuite extends SparkFunSuite with Matchers with Logging
+ with ResetSystemProperties {
+
+ val hasBash =
+ try {
+ val exitCode = Runtime.getRuntime().exec(Array("bash", "--version")).waitFor()
+ exitCode == 0
+ } catch {
+ case e: IOException =>
+ false
+ }
+
+ if (!hasBash) {
+ logWarning("Cannot execute bash, skipping bash tests.")
+ }
+
+ def bashTest(name: String)(fn: => Unit): Unit =
+ if (hasBash) test(name)(fn) else ignore(name)(fn)
+
+ bashTest("shell script escaping") {
+ val scriptFile = File.createTempFile("script.", ".sh", Utils.createTempDir())
+ val args = Array("arg1", "${arg.2}", "\"arg3\"", "'arg4'", "$arg5", "\\arg6")
+ try {
+ val argLine = args.map(a => YarnSparkHadoopUtil.escapeForShell(a)).mkString(" ")
+ Files.write(("bash -c \"echo " + argLine + "\"").getBytes(StandardCharsets.UTF_8), scriptFile)
+ scriptFile.setExecutable(true)
+
+ val proc = Runtime.getRuntime().exec(Array(scriptFile.getAbsolutePath()))
+ val out = new String(ByteStreams.toByteArray(proc.getInputStream())).trim()
+ val err = new String(ByteStreams.toByteArray(proc.getErrorStream()))
+ val exitCode = proc.waitFor()
+ exitCode should be (0)
+ out should be (args.mkString(" "))
+ } finally {
+ scriptFile.delete()
+ }
+ }
+
+ test("Yarn configuration override") {
+ val key = "yarn.nodemanager.hostname"
+ val default = new YarnConfiguration()
+
+ val sparkConf = new SparkConf()
+ .set("spark.hadoop." + key, "someHostName")
+ val yarnConf = new YarnSparkHadoopUtil().newConfiguration(sparkConf)
+
+ yarnConf.getClass() should be (classOf[YarnConfiguration])
+ yarnConf.get(key) should not be default.get(key)
+ }
+
+
+ test("test getApplicationAclsForYarn acls on") {
+
+ // spark acls on, just pick up default user
+ val sparkConf = new SparkConf()
+ sparkConf.set("spark.acls.enable", "true")
+
+ val securityMgr = new SecurityManager(sparkConf)
+ val acls = YarnSparkHadoopUtil.getApplicationAclsForYarn(securityMgr)
+
+ val viewAcls = acls.get(ApplicationAccessType.VIEW_APP)
+ val modifyAcls = acls.get(ApplicationAccessType.MODIFY_APP)
+
+ viewAcls match {
+ case Some(vacls) =>
+ val aclSet = vacls.split(',').map(_.trim).toSet
+ assert(aclSet.contains(System.getProperty("user.name", "invalid")))
+ case None =>
+ fail()
+ }
+ modifyAcls match {
+ case Some(macls) =>
+ val aclSet = macls.split(',').map(_.trim).toSet
+ assert(aclSet.contains(System.getProperty("user.name", "invalid")))
+ case None =>
+ fail()
+ }
+ }
+
+ test("test getApplicationAclsForYarn acls on and specify users") {
+
+ // default spark acls are on and specify acls
+ val sparkConf = new SparkConf()
+ sparkConf.set("spark.acls.enable", "true")
+ sparkConf.set("spark.ui.view.acls", "user1,user2")
+ sparkConf.set("spark.modify.acls", "user3,user4")
+
+ val securityMgr = new SecurityManager(sparkConf)
+ val acls = YarnSparkHadoopUtil.getApplicationAclsForYarn(securityMgr)
+
+ val viewAcls = acls.get(ApplicationAccessType.VIEW_APP)
+ val modifyAcls = acls.get(ApplicationAccessType.MODIFY_APP)
+
+ viewAcls match {
+ case Some(vacls) =>
+ val aclSet = vacls.split(',').map(_.trim).toSet
+ assert(aclSet.contains("user1"))
+ assert(aclSet.contains("user2"))
+ assert(aclSet.contains(System.getProperty("user.name", "invalid")))
+ case None =>
+ fail()
+ }
+ modifyAcls match {
+ case Some(macls) =>
+ val aclSet = macls.split(',').map(_.trim).toSet
+ assert(aclSet.contains("user3"))
+ assert(aclSet.contains("user4"))
+ assert(aclSet.contains(System.getProperty("user.name", "invalid")))
+ case None =>
+ fail()
+ }
+
+ }
+
+ test("test expandEnvironment result") {
+ val target = Environment.PWD
+ if (classOf[Environment].getMethods().exists(_.getName == "$$")) {
+ YarnSparkHadoopUtil.expandEnvironment(target) should be ("{{" + target + "}}")
+ } else if (Utils.isWindows) {
+ YarnSparkHadoopUtil.expandEnvironment(target) should be ("%" + target + "%")
+ } else {
+ YarnSparkHadoopUtil.expandEnvironment(target) should be ("$" + target)
+ }
+
+ }
+
+ test("test getClassPathSeparator result") {
+ if (classOf[ApplicationConstants].getFields().exists(_.getName == "CLASS_PATH_SEPARATOR")) {
+ YarnSparkHadoopUtil.getClassPathSeparator() should be ("<CPS>")
+ } else if (Utils.isWindows) {
+ YarnSparkHadoopUtil.getClassPathSeparator() should be (";")
+ } else {
+ YarnSparkHadoopUtil.getClassPathSeparator() should be (":")
+ }
+ }
+
+ test("check different hadoop utils based on env variable") {
+ try {
+ System.setProperty("SPARK_YARN_MODE", "true")
+ assert(SparkHadoopUtil.get.getClass === classOf[YarnSparkHadoopUtil])
+ System.setProperty("SPARK_YARN_MODE", "false")
+ assert(SparkHadoopUtil.get.getClass === classOf[SparkHadoopUtil])
+ } finally {
+ System.clearProperty("SPARK_YARN_MODE")
+ }
+ }
+
+
+
+ // This test needs to live here because it depends on isYarnMode returning true, which can only
+ // happen in the YARN module.
+ test("security manager token generation") {
+ try {
+ System.setProperty("SPARK_YARN_MODE", "true")
+ val initial = SparkHadoopUtil.get
+ .getSecretKeyFromUserCredentials(SecurityManager.SECRET_LOOKUP_KEY)
+ assert(initial === null || initial.length === 0)
+
+ val conf = new SparkConf()
+ .set(SecurityManager.SPARK_AUTH_CONF, "true")
+ .set(SecurityManager.SPARK_AUTH_SECRET_CONF, "unused")
+ val sm = new SecurityManager(conf)
+
+ val generated = SparkHadoopUtil.get
+ .getSecretKeyFromUserCredentials(SecurityManager.SECRET_LOOKUP_KEY)
+ assert(generated != null)
+ val genString = new Text(generated).toString()
+ assert(genString != "unused")
+ assert(sm.getSecretKey() === genString)
+ } finally {
+ // removeSecretKey() was only added in Hadoop 2.6, so instead we just set the secret
+ // to an empty string.
+ SparkHadoopUtil.get.addSecretKeyToUserCredentials(SecurityManager.SECRET_LOOKUP_KEY, "")
+ System.clearProperty("SPARK_YARN_MODE")
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/ConfigurableCredentialManagerSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/ConfigurableCredentialManagerSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/ConfigurableCredentialManagerSuite.scala
new file mode 100644
index 0000000..db4619e
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/ConfigurableCredentialManagerSuite.scala
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn.security
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.io.Text
+import org.apache.hadoop.security.Credentials
+import org.apache.hadoop.security.token.Token
+import org.scalatest.{BeforeAndAfter, Matchers}
+
+import org.apache.spark.{SparkConf, SparkFunSuite}
+import org.apache.spark.deploy.yarn.config._
+
+class ConfigurableCredentialManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfter {
+ private var credentialManager: ConfigurableCredentialManager = null
+ private var sparkConf: SparkConf = null
+ private var hadoopConf: Configuration = null
+
+ override def beforeAll(): Unit = {
+ super.beforeAll()
+
+ sparkConf = new SparkConf()
+ hadoopConf = new Configuration()
+ System.setProperty("SPARK_YARN_MODE", "true")
+ }
+
+ override def afterAll(): Unit = {
+ System.clearProperty("SPARK_YARN_MODE")
+
+ super.afterAll()
+ }
+
+ test("Correctly load default credential providers") {
+ credentialManager = new ConfigurableCredentialManager(sparkConf, hadoopConf)
+
+ credentialManager.getServiceCredentialProvider("hdfs") should not be (None)
+ credentialManager.getServiceCredentialProvider("hbase") should not be (None)
+ credentialManager.getServiceCredentialProvider("hive") should not be (None)
+ }
+
+ test("disable hive credential provider") {
+ sparkConf.set("spark.yarn.security.credentials.hive.enabled", "false")
+ credentialManager = new ConfigurableCredentialManager(sparkConf, hadoopConf)
+
+ credentialManager.getServiceCredentialProvider("hdfs") should not be (None)
+ credentialManager.getServiceCredentialProvider("hbase") should not be (None)
+ credentialManager.getServiceCredentialProvider("hive") should be (None)
+ }
+
+ test("using deprecated configurations") {
+ sparkConf.set("spark.yarn.security.tokens.hdfs.enabled", "false")
+ sparkConf.set("spark.yarn.security.tokens.hive.enabled", "false")
+ credentialManager = new ConfigurableCredentialManager(sparkConf, hadoopConf)
+
+ credentialManager.getServiceCredentialProvider("hdfs") should be (None)
+ credentialManager.getServiceCredentialProvider("hive") should be (None)
+ credentialManager.getServiceCredentialProvider("test") should not be (None)
+ credentialManager.getServiceCredentialProvider("hbase") should not be (None)
+ }
+
+ test("verify obtaining credentials from provider") {
+ credentialManager = new ConfigurableCredentialManager(sparkConf, hadoopConf)
+ val creds = new Credentials()
+
+ // Tokens can only be obtained from TestTokenProvider, for hdfs, hbase and hive tokens cannot
+ // be obtained.
+ credentialManager.obtainCredentials(hadoopConf, creds)
+ val tokens = creds.getAllTokens
+ tokens.size() should be (1)
+ tokens.iterator().next().getService should be (new Text("test"))
+ }
+
+ test("verify getting credential renewal info") {
+ credentialManager = new ConfigurableCredentialManager(sparkConf, hadoopConf)
+ val creds = new Credentials()
+
+ val testCredentialProvider = credentialManager.getServiceCredentialProvider("test").get
+ .asInstanceOf[TestCredentialProvider]
+ // Only TestTokenProvider can get the time of next token renewal
+ val nextRenewal = credentialManager.obtainCredentials(hadoopConf, creds)
+ nextRenewal should be (testCredentialProvider.timeOfNextTokenRenewal)
+ }
+
+ test("obtain tokens For HiveMetastore") {
+ val hadoopConf = new Configuration()
+ hadoopConf.set("hive.metastore.kerberos.principal", "bob")
+ // thrift picks up on port 0 and bails out, without trying to talk to endpoint
+ hadoopConf.set("hive.metastore.uris", "http://localhost:0")
+
+ val hiveCredentialProvider = new HiveCredentialProvider()
+ val credentials = new Credentials()
+ hiveCredentialProvider.obtainCredentials(hadoopConf, sparkConf, credentials)
+
+ credentials.getAllTokens.size() should be (0)
+ }
+
+ test("Obtain tokens For HBase") {
+ val hadoopConf = new Configuration()
+ hadoopConf.set("hbase.security.authentication", "kerberos")
+
+ val hbaseTokenProvider = new HBaseCredentialProvider()
+ val creds = new Credentials()
+ hbaseTokenProvider.obtainCredentials(hadoopConf, sparkConf, creds)
+
+ creds.getAllTokens.size should be (0)
+ }
+}
+
+class TestCredentialProvider extends ServiceCredentialProvider {
+ val tokenRenewalInterval = 86400 * 1000L
+ var timeOfNextTokenRenewal = 0L
+
+ override def serviceName: String = "test"
+
+ override def credentialsRequired(conf: Configuration): Boolean = true
+
+ override def obtainCredentials(
+ hadoopConf: Configuration,
+ sparkConf: SparkConf,
+ creds: Credentials): Option[Long] = {
+ if (creds == null) {
+ // Guard out other unit test failures.
+ return None
+ }
+
+ val emptyToken = new Token()
+ emptyToken.setService(new Text("test"))
+ creds.addToken(emptyToken.getService, emptyToken)
+
+ val currTime = System.currentTimeMillis()
+ timeOfNextTokenRenewal = (currTime - currTime % tokenRenewalInterval) + tokenRenewalInterval
+
+ Some(timeOfNextTokenRenewal)
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProviderSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProviderSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProviderSuite.scala
new file mode 100644
index 0000000..7b2da3f
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/HDFSCredentialProviderSuite.scala
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.yarn.security
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.scalatest.{Matchers, PrivateMethodTester}
+
+import org.apache.spark.{SparkConf, SparkException, SparkFunSuite}
+
+class HDFSCredentialProviderSuite
+ extends SparkFunSuite
+ with PrivateMethodTester
+ with Matchers {
+ private val _getTokenRenewer = PrivateMethod[String]('getTokenRenewer)
+
+ private def getTokenRenewer(
+ hdfsCredentialProvider: HDFSCredentialProvider, conf: Configuration): String = {
+ hdfsCredentialProvider invokePrivate _getTokenRenewer(conf)
+ }
+
+ private var hdfsCredentialProvider: HDFSCredentialProvider = null
+
+ override def beforeAll() {
+ super.beforeAll()
+
+ if (hdfsCredentialProvider == null) {
+ hdfsCredentialProvider = new HDFSCredentialProvider()
+ }
+ }
+
+ override def afterAll() {
+ if (hdfsCredentialProvider != null) {
+ hdfsCredentialProvider = null
+ }
+
+ super.afterAll()
+ }
+
+ test("check token renewer") {
+ val hadoopConf = new Configuration()
+ hadoopConf.set("yarn.resourcemanager.address", "myrm:8033")
+ hadoopConf.set("yarn.resourcemanager.principal", "yarn/myrm:8032@SPARKTEST.COM")
+ val renewer = getTokenRenewer(hdfsCredentialProvider, hadoopConf)
+ renewer should be ("yarn/myrm:8032@SPARKTEST.COM")
+ }
+
+ test("check token renewer default") {
+ val hadoopConf = new Configuration()
+ val caught =
+ intercept[SparkException] {
+ getTokenRenewer(hdfsCredentialProvider, hadoopConf)
+ }
+ assert(caught.getMessage === "Can't get Master Kerberos principal for use as renewer")
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/launcher/TestClasspathBuilder.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/launcher/TestClasspathBuilder.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/launcher/TestClasspathBuilder.scala
new file mode 100644
index 0000000..da9e8e2
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/launcher/TestClasspathBuilder.scala
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.launcher
+
+import java.util.{List => JList, Map => JMap}
+
+/**
+ * Exposes AbstractCommandBuilder to the YARN tests, so that they can build classpaths the same
+ * way other cluster managers do.
+ */
+private[spark] class TestClasspathBuilder extends AbstractCommandBuilder {
+
+ childEnv.put(CommandBuilderUtils.ENV_SPARK_HOME, sys.props("spark.test.home"))
+
+ override def buildClassPath(extraCp: String): JList[String] = super.buildClassPath(extraCp)
+
+ /** Not used by the YARN tests. */
+ override def buildCommand(env: JMap[String, String]): JList[String] =
+ throw new UnsupportedOperationException()
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/network/shuffle/ShuffleTestAccessor.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/network/shuffle/ShuffleTestAccessor.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/network/shuffle/ShuffleTestAccessor.scala
new file mode 100644
index 0000000..1fed256
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/network/shuffle/ShuffleTestAccessor.scala
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.network.shuffle
+
+import java.io.File
+import java.util.concurrent.ConcurrentMap
+
+import org.apache.hadoop.yarn.api.records.ApplicationId
+import org.fusesource.leveldbjni.JniDBFactory
+import org.iq80.leveldb.{DB, Options}
+
+import org.apache.spark.network.shuffle.ExternalShuffleBlockResolver.AppExecId
+import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo
+
+/**
+ * just a cheat to get package-visible members in tests
+ */
+object ShuffleTestAccessor {
+
+ def getBlockResolver(handler: ExternalShuffleBlockHandler): ExternalShuffleBlockResolver = {
+ handler.blockManager
+ }
+
+ def getExecutorInfo(
+ appId: ApplicationId,
+ execId: String,
+ resolver: ExternalShuffleBlockResolver
+ ): Option[ExecutorShuffleInfo] = {
+ val id = new AppExecId(appId.toString, execId)
+ Option(resolver.executors.get(id))
+ }
+
+ def registeredExecutorFile(resolver: ExternalShuffleBlockResolver): File = {
+ resolver.registeredExecutorFile
+ }
+
+ def shuffleServiceLevelDB(resolver: ExternalShuffleBlockResolver): DB = {
+ resolver.db
+ }
+
+ def reloadRegisteredExecutors(
+ file: File): ConcurrentMap[ExternalShuffleBlockResolver.AppExecId, ExecutorShuffleInfo] = {
+ val options: Options = new Options
+ options.createIfMissing(true)
+ val factory = new JniDBFactory
+ val db = factory.open(file, options)
+ val result = ExternalShuffleBlockResolver.reloadRegisteredExecutors(db)
+ db.close()
+ result
+ }
+
+ def reloadRegisteredExecutors(
+ db: DB): ConcurrentMap[ExternalShuffleBlockResolver.AppExecId, ExecutorShuffleInfo] = {
+ ExternalShuffleBlockResolver.reloadRegisteredExecutors(db)
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala
new file mode 100644
index 0000000..a58784f
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnShuffleServiceSuite.scala
@@ -0,0 +1,372 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.network.yarn
+
+import java.io.{DataOutputStream, File, FileOutputStream, IOException}
+import java.nio.ByteBuffer
+import java.nio.file.Files
+import java.nio.file.attribute.PosixFilePermission._
+import java.util.EnumSet
+
+import scala.annotation.tailrec
+import scala.concurrent.duration._
+import scala.language.postfixOps
+
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.service.ServiceStateException
+import org.apache.hadoop.yarn.api.records.ApplicationId
+import org.apache.hadoop.yarn.conf.YarnConfiguration
+import org.apache.hadoop.yarn.server.api.{ApplicationInitializationContext, ApplicationTerminationContext}
+import org.scalatest.{BeforeAndAfterEach, Matchers}
+import org.scalatest.concurrent.Eventually._
+
+import org.apache.spark.SecurityManager
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.network.shuffle.ShuffleTestAccessor
+import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo
+import org.apache.spark.util.Utils
+
+class YarnShuffleServiceSuite extends SparkFunSuite with Matchers with BeforeAndAfterEach {
+ private[yarn] var yarnConfig: YarnConfiguration = null
+ private[yarn] val SORT_MANAGER = "org.apache.spark.shuffle.sort.SortShuffleManager"
+
+ override def beforeEach(): Unit = {
+ super.beforeEach()
+ yarnConfig = new YarnConfiguration()
+ yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle")
+ yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"),
+ classOf[YarnShuffleService].getCanonicalName)
+ yarnConfig.setInt("spark.shuffle.service.port", 0)
+ yarnConfig.setBoolean(YarnShuffleService.STOP_ON_FAILURE_KEY, true)
+ val localDir = Utils.createTempDir()
+ yarnConfig.set(YarnConfiguration.NM_LOCAL_DIRS, localDir.getAbsolutePath)
+ }
+
+ var s1: YarnShuffleService = null
+ var s2: YarnShuffleService = null
+ var s3: YarnShuffleService = null
+
+ override def afterEach(): Unit = {
+ try {
+ if (s1 != null) {
+ s1.stop()
+ s1 = null
+ }
+ if (s2 != null) {
+ s2.stop()
+ s2 = null
+ }
+ if (s3 != null) {
+ s3.stop()
+ s3 = null
+ }
+ } finally {
+ super.afterEach()
+ }
+ }
+
+ test("executor state kept across NM restart") {
+ s1 = new YarnShuffleService
+ // set auth to true to test the secrets recovery
+ yarnConfig.setBoolean(SecurityManager.SPARK_AUTH_CONF, true)
+ s1.init(yarnConfig)
+ val app1Id = ApplicationId.newInstance(0, 1)
+ val app1Data = makeAppInfo("user", app1Id)
+ s1.initializeApplication(app1Data)
+ val app2Id = ApplicationId.newInstance(0, 2)
+ val app2Data = makeAppInfo("user", app2Id)
+ s1.initializeApplication(app2Data)
+
+ val execStateFile = s1.registeredExecutorFile
+ execStateFile should not be (null)
+ val secretsFile = s1.secretsFile
+ secretsFile should not be (null)
+ val shuffleInfo1 = new ExecutorShuffleInfo(Array("/foo", "/bar"), 3, SORT_MANAGER)
+ val shuffleInfo2 = new ExecutorShuffleInfo(Array("/bippy"), 5, SORT_MANAGER)
+
+ val blockHandler = s1.blockHandler
+ val blockResolver = ShuffleTestAccessor.getBlockResolver(blockHandler)
+ ShuffleTestAccessor.registeredExecutorFile(blockResolver) should be (execStateFile)
+
+ blockResolver.registerExecutor(app1Id.toString, "exec-1", shuffleInfo1)
+ blockResolver.registerExecutor(app2Id.toString, "exec-2", shuffleInfo2)
+ ShuffleTestAccessor.getExecutorInfo(app1Id, "exec-1", blockResolver) should
+ be (Some(shuffleInfo1))
+ ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", blockResolver) should
+ be (Some(shuffleInfo2))
+
+ if (!execStateFile.exists()) {
+ @tailrec def findExistingParent(file: File): File = {
+ if (file == null) file
+ else if (file.exists()) file
+ else findExistingParent(file.getParentFile())
+ }
+ val existingParent = findExistingParent(execStateFile)
+ assert(false, s"$execStateFile does not exist -- closest existing parent is $existingParent")
+ }
+ assert(execStateFile.exists(), s"$execStateFile did not exist")
+
+ // now we pretend the shuffle service goes down, and comes back up
+ s1.stop()
+ s2 = new YarnShuffleService
+ s2.init(yarnConfig)
+ s2.secretsFile should be (secretsFile)
+ s2.registeredExecutorFile should be (execStateFile)
+
+ val handler2 = s2.blockHandler
+ val resolver2 = ShuffleTestAccessor.getBlockResolver(handler2)
+
+ // now we reinitialize only one of the apps, and expect yarn to tell us that app2 was stopped
+ // during the restart
+ s2.initializeApplication(app1Data)
+ s2.stopApplication(new ApplicationTerminationContext(app2Id))
+ ShuffleTestAccessor.getExecutorInfo(app1Id, "exec-1", resolver2) should be (Some(shuffleInfo1))
+ ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver2) should be (None)
+
+ // Act like the NM restarts one more time
+ s2.stop()
+ s3 = new YarnShuffleService
+ s3.init(yarnConfig)
+ s3.registeredExecutorFile should be (execStateFile)
+ s3.secretsFile should be (secretsFile)
+
+ val handler3 = s3.blockHandler
+ val resolver3 = ShuffleTestAccessor.getBlockResolver(handler3)
+
+ // app1 is still running
+ s3.initializeApplication(app1Data)
+ ShuffleTestAccessor.getExecutorInfo(app1Id, "exec-1", resolver3) should be (Some(shuffleInfo1))
+ ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver3) should be (None)
+ s3.stop()
+ }
+
+ test("removed applications should not be in registered executor file") {
+ s1 = new YarnShuffleService
+ yarnConfig.setBoolean(SecurityManager.SPARK_AUTH_CONF, false)
+ s1.init(yarnConfig)
+ val secretsFile = s1.secretsFile
+ secretsFile should be (null)
+ val app1Id = ApplicationId.newInstance(0, 1)
+ val app1Data = makeAppInfo("user", app1Id)
+ s1.initializeApplication(app1Data)
+ val app2Id = ApplicationId.newInstance(0, 2)
+ val app2Data = makeAppInfo("user", app2Id)
+ s1.initializeApplication(app2Data)
+
+ val execStateFile = s1.registeredExecutorFile
+ execStateFile should not be (null)
+ val shuffleInfo1 = new ExecutorShuffleInfo(Array("/foo", "/bar"), 3, SORT_MANAGER)
+ val shuffleInfo2 = new ExecutorShuffleInfo(Array("/bippy"), 5, SORT_MANAGER)
+
+ val blockHandler = s1.blockHandler
+ val blockResolver = ShuffleTestAccessor.getBlockResolver(blockHandler)
+ ShuffleTestAccessor.registeredExecutorFile(blockResolver) should be (execStateFile)
+
+ blockResolver.registerExecutor(app1Id.toString, "exec-1", shuffleInfo1)
+ blockResolver.registerExecutor(app2Id.toString, "exec-2", shuffleInfo2)
+
+ val db = ShuffleTestAccessor.shuffleServiceLevelDB(blockResolver)
+ ShuffleTestAccessor.reloadRegisteredExecutors(db) should not be empty
+
+ s1.stopApplication(new ApplicationTerminationContext(app1Id))
+ ShuffleTestAccessor.reloadRegisteredExecutors(db) should not be empty
+ s1.stopApplication(new ApplicationTerminationContext(app2Id))
+ ShuffleTestAccessor.reloadRegisteredExecutors(db) shouldBe empty
+ }
+
+ test("shuffle service should be robust to corrupt registered executor file") {
+ s1 = new YarnShuffleService
+ s1.init(yarnConfig)
+ val app1Id = ApplicationId.newInstance(0, 1)
+ val app1Data = makeAppInfo("user", app1Id)
+ s1.initializeApplication(app1Data)
+
+ val execStateFile = s1.registeredExecutorFile
+ val shuffleInfo1 = new ExecutorShuffleInfo(Array("/foo", "/bar"), 3, SORT_MANAGER)
+
+ val blockHandler = s1.blockHandler
+ val blockResolver = ShuffleTestAccessor.getBlockResolver(blockHandler)
+ ShuffleTestAccessor.registeredExecutorFile(blockResolver) should be (execStateFile)
+
+ blockResolver.registerExecutor(app1Id.toString, "exec-1", shuffleInfo1)
+
+ // now we pretend the shuffle service goes down, and comes back up. But we'll also
+ // make a corrupt registeredExecutor File
+ s1.stop()
+
+ execStateFile.listFiles().foreach{_.delete()}
+
+ val out = new DataOutputStream(new FileOutputStream(execStateFile + "/CURRENT"))
+ out.writeInt(42)
+ out.close()
+
+ s2 = new YarnShuffleService
+ s2.init(yarnConfig)
+ s2.registeredExecutorFile should be (execStateFile)
+
+ val handler2 = s2.blockHandler
+ val resolver2 = ShuffleTestAccessor.getBlockResolver(handler2)
+
+ // we re-initialize app1, but since the file was corrupt there is nothing we can do about it ...
+ s2.initializeApplication(app1Data)
+ // however, when we initialize a totally new app2, everything is still happy
+ val app2Id = ApplicationId.newInstance(0, 2)
+ val app2Data = makeAppInfo("user", app2Id)
+ s2.initializeApplication(app2Data)
+ val shuffleInfo2 = new ExecutorShuffleInfo(Array("/bippy"), 5, SORT_MANAGER)
+ resolver2.registerExecutor(app2Id.toString, "exec-2", shuffleInfo2)
+ ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver2) should be (Some(shuffleInfo2))
+ s2.stop()
+
+ // another stop & restart should be fine though (eg., we recover from previous corruption)
+ s3 = new YarnShuffleService
+ s3.init(yarnConfig)
+ s3.registeredExecutorFile should be (execStateFile)
+ val handler3 = s3.blockHandler
+ val resolver3 = ShuffleTestAccessor.getBlockResolver(handler3)
+
+ s3.initializeApplication(app2Data)
+ ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver3) should be (Some(shuffleInfo2))
+ s3.stop()
+ }
+
+ test("get correct recovery path") {
+ // Test recovery path is set outside the shuffle service, this is to simulate NM recovery
+ // enabled scenario, where recovery path will be set by yarn.
+ s1 = new YarnShuffleService
+ val recoveryPath = new Path(Utils.createTempDir().toURI)
+ s1.setRecoveryPath(recoveryPath)
+
+ s1.init(yarnConfig)
+ s1._recoveryPath should be (recoveryPath)
+ s1.stop()
+
+ // Test recovery path is set inside the shuffle service, this will be happened when NM
+ // recovery is not enabled or there's no NM recovery (Hadoop 2.5-).
+ s2 = new YarnShuffleService
+ s2.init(yarnConfig)
+ s2._recoveryPath should be
+ (new Path(yarnConfig.getTrimmedStrings("yarn.nodemanager.local-dirs")(0)))
+ s2.stop()
+ }
+
+ test("moving recovery file from NM local dir to recovery path") {
+ // This is to test when Hadoop is upgrade to 2.5+ and NM recovery is enabled, we should move
+ // old recovery file to the new path to keep compatibility
+
+ // Simulate s1 is running on old version of Hadoop in which recovery file is in the NM local
+ // dir.
+ s1 = new YarnShuffleService
+ // set auth to true to test the secrets recovery
+ yarnConfig.setBoolean(SecurityManager.SPARK_AUTH_CONF, true)
+ s1.init(yarnConfig)
+ val app1Id = ApplicationId.newInstance(0, 1)
+ val app1Data = makeAppInfo("user", app1Id)
+ s1.initializeApplication(app1Data)
+ val app2Id = ApplicationId.newInstance(0, 2)
+ val app2Data = makeAppInfo("user", app2Id)
+ s1.initializeApplication(app2Data)
+
+ assert(s1.secretManager.getSecretKey(app1Id.toString()) != null)
+ assert(s1.secretManager.getSecretKey(app2Id.toString()) != null)
+
+ val execStateFile = s1.registeredExecutorFile
+ execStateFile should not be (null)
+ val secretsFile = s1.secretsFile
+ secretsFile should not be (null)
+ val shuffleInfo1 = new ExecutorShuffleInfo(Array("/foo", "/bar"), 3, SORT_MANAGER)
+ val shuffleInfo2 = new ExecutorShuffleInfo(Array("/bippy"), 5, SORT_MANAGER)
+
+ val blockHandler = s1.blockHandler
+ val blockResolver = ShuffleTestAccessor.getBlockResolver(blockHandler)
+ ShuffleTestAccessor.registeredExecutorFile(blockResolver) should be (execStateFile)
+
+ blockResolver.registerExecutor(app1Id.toString, "exec-1", shuffleInfo1)
+ blockResolver.registerExecutor(app2Id.toString, "exec-2", shuffleInfo2)
+ ShuffleTestAccessor.getExecutorInfo(app1Id, "exec-1", blockResolver) should
+ be (Some(shuffleInfo1))
+ ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", blockResolver) should
+ be (Some(shuffleInfo2))
+
+ assert(execStateFile.exists(), s"$execStateFile did not exist")
+
+ s1.stop()
+
+ // Simulate s2 is running on Hadoop 2.5+ with NM recovery is enabled.
+ assert(execStateFile.exists())
+ val recoveryPath = new Path(Utils.createTempDir().toURI)
+ s2 = new YarnShuffleService
+ s2.setRecoveryPath(recoveryPath)
+ s2.init(yarnConfig)
+
+ // Ensure that s2 has loaded known apps from the secrets db.
+ assert(s2.secretManager.getSecretKey(app1Id.toString()) != null)
+ assert(s2.secretManager.getSecretKey(app2Id.toString()) != null)
+
+ val execStateFile2 = s2.registeredExecutorFile
+ val secretsFile2 = s2.secretsFile
+
+ recoveryPath.toString should be (new Path(execStateFile2.getParentFile.toURI).toString)
+ recoveryPath.toString should be (new Path(secretsFile2.getParentFile.toURI).toString)
+ eventually(timeout(10 seconds), interval(5 millis)) {
+ assert(!execStateFile.exists())
+ }
+ eventually(timeout(10 seconds), interval(5 millis)) {
+ assert(!secretsFile.exists())
+ }
+
+ val handler2 = s2.blockHandler
+ val resolver2 = ShuffleTestAccessor.getBlockResolver(handler2)
+
+ // now we reinitialize only one of the apps, and expect yarn to tell us that app2 was stopped
+ // during the restart
+ // Since recovery file is got from old path, so the previous state should be stored.
+ s2.initializeApplication(app1Data)
+ s2.stopApplication(new ApplicationTerminationContext(app2Id))
+ ShuffleTestAccessor.getExecutorInfo(app1Id, "exec-1", resolver2) should be (Some(shuffleInfo1))
+ ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver2) should be (None)
+
+ s2.stop()
+ }
+
+ test("service throws error if cannot start") {
+ // Set up a read-only local dir.
+ val roDir = Utils.createTempDir()
+ Files.setPosixFilePermissions(roDir.toPath(), EnumSet.of(OWNER_READ, OWNER_EXECUTE))
+ yarnConfig.set(YarnConfiguration.NM_LOCAL_DIRS, roDir.getAbsolutePath())
+
+ // Try to start the shuffle service, it should fail.
+ val service = new YarnShuffleService()
+
+ try {
+ val error = intercept[ServiceStateException] {
+ service.init(yarnConfig)
+ }
+ assert(error.getCause().isInstanceOf[IOException])
+ } finally {
+ service.stop()
+ Files.setPosixFilePermissions(roDir.toPath(),
+ EnumSet.of(OWNER_READ, OWNER_WRITE, OWNER_EXECUTE))
+ }
+ }
+
+ private def makeAppInfo(user: String, appId: ApplicationId): ApplicationInitializationContext = {
+ val secret = ByteBuffer.wrap(new Array[Byte](0))
+ new ApplicationInitializationContext(user, appId, secret)
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnTestAccessor.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnTestAccessor.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnTestAccessor.scala
new file mode 100644
index 0000000..db322cd
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/network/yarn/YarnTestAccessor.scala
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.network.yarn
+
+import java.io.File
+
+/**
+ * just a cheat to get package-visible members in tests
+ */
+object YarnTestAccessor {
+ def getShuffleServicePort: Int = {
+ YarnShuffleService.boundPort
+ }
+
+ def getShuffleServiceInstance: YarnShuffleService = {
+ YarnShuffleService.instance
+ }
+
+ def getRegisteredExecutorFile(service: YarnShuffleService): File = {
+ service.registeredExecutorFile
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/ExtensionServiceIntegrationSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/ExtensionServiceIntegrationSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/ExtensionServiceIntegrationSuite.scala
new file mode 100644
index 0000000..6ea7984
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/ExtensionServiceIntegrationSuite.scala
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster
+
+import org.scalatest.BeforeAndAfter
+
+import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
+import org.apache.spark.deploy.yarn.config._
+import org.apache.spark.internal.Logging
+
+/**
+ * Test the integration with [[SchedulerExtensionServices]]
+ */
+class ExtensionServiceIntegrationSuite extends SparkFunSuite
+ with LocalSparkContext with BeforeAndAfter
+ with Logging {
+
+ val applicationId = new StubApplicationId(0, 1111L)
+ val attemptId = new StubApplicationAttemptId(applicationId, 1)
+
+ /*
+ * Setup phase creates the spark context
+ */
+ before {
+ val sparkConf = new SparkConf()
+ sparkConf.set(SCHEDULER_SERVICES, Seq(classOf[SimpleExtensionService].getName()))
+ sparkConf.setMaster("local").setAppName("ExtensionServiceIntegrationSuite")
+ sc = new SparkContext(sparkConf)
+ }
+
+ test("Instantiate") {
+ val services = new SchedulerExtensionServices()
+ assertResult(Nil, "non-nil service list") {
+ services.getServices
+ }
+ services.start(SchedulerExtensionServiceBinding(sc, applicationId))
+ services.stop()
+ }
+
+ test("Contains SimpleExtensionService Service") {
+ val services = new SchedulerExtensionServices()
+ try {
+ services.start(SchedulerExtensionServiceBinding(sc, applicationId))
+ val serviceList = services.getServices
+ assert(serviceList.nonEmpty, "empty service list")
+ val (service :: Nil) = serviceList
+ val simpleService = service.asInstanceOf[SimpleExtensionService]
+ assert(simpleService.started.get, "service not started")
+ services.stop()
+ assert(!simpleService.started.get, "service not stopped")
+ } finally {
+ services.stop()
+ }
+ }
+}
+
+
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/SimpleExtensionService.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/SimpleExtensionService.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/SimpleExtensionService.scala
new file mode 100644
index 0000000..9b8c98c
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/SimpleExtensionService.scala
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster
+
+import java.util.concurrent.atomic.AtomicBoolean
+
+private[spark] class SimpleExtensionService extends SchedulerExtensionService {
+
+ /** started flag; set in the `start()` call, stopped in `stop()`. */
+ val started = new AtomicBoolean(false)
+
+ override def start(binding: SchedulerExtensionServiceBinding): Unit = {
+ started.set(true)
+ }
+
+ override def stop(): Unit = {
+ started.set(false)
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/StubApplicationAttemptId.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/StubApplicationAttemptId.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/StubApplicationAttemptId.scala
new file mode 100644
index 0000000..4b57b95
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/StubApplicationAttemptId.scala
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster
+
+import org.apache.hadoop.yarn.api.records.{ApplicationAttemptId, ApplicationId}
+
+/**
+ * A stub application ID; can be set in constructor and/or updated later.
+ * @param applicationId application ID
+ * @param attempt an attempt counter
+ */
+class StubApplicationAttemptId(var applicationId: ApplicationId, var attempt: Int)
+ extends ApplicationAttemptId {
+
+ override def setApplicationId(appID: ApplicationId): Unit = {
+ applicationId = appID
+ }
+
+ override def getAttemptId: Int = {
+ attempt
+ }
+
+ override def setAttemptId(attemptId: Int): Unit = {
+ attempt = attemptId
+ }
+
+ override def getApplicationId: ApplicationId = {
+ applicationId
+ }
+
+ override def build(): Unit = {
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/StubApplicationId.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/StubApplicationId.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/StubApplicationId.scala
new file mode 100644
index 0000000..bffa0e0
--- /dev/null
+++ b/resource-managers/yarn/src/test/scala/org/apache/spark/scheduler/cluster/StubApplicationId.scala
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster
+
+import org.apache.hadoop.yarn.api.records.ApplicationId
+
+/**
+ * Simple Testing Application Id; ID and cluster timestamp are set in constructor
+ * and cannot be updated.
+ * @param id app id
+ * @param clusterTimestamp timestamp
+ */
+private[spark] class StubApplicationId(id: Int, clusterTimestamp: Long) extends ApplicationId {
+ override def getId: Int = {
+ id
+ }
+
+ override def getClusterTimestamp: Long = {
+ clusterTimestamp
+ }
+
+ override def setId(id: Int): Unit = {}
+
+ override def setClusterTimestamp(clusterTimestamp: Long): Unit = {}
+
+ override def build(): Unit = {}
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/pom.xml
----------------------------------------------------------------------
diff --git a/yarn/pom.xml b/yarn/pom.xml
deleted file mode 100644
index 91e3437..0000000
--- a/yarn/pom.xml
+++ /dev/null
@@ -1,215 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- ~ Licensed to the Apache Software Foundation (ASF) under one or more
- ~ contributor license agreements. See the NOTICE file distributed with
- ~ this work for additional information regarding copyright ownership.
- ~ The ASF licenses this file to You under the Apache License, Version 2.0
- ~ (the "License"); you may not use this file except in compliance with
- ~ the License. You may obtain a copy of the License at
- ~
- ~ http://www.apache.org/licenses/LICENSE-2.0
- ~
- ~ Unless required by applicable law or agreed to in writing, software
- ~ distributed under the License is distributed on an "AS IS" BASIS,
- ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ~ See the License for the specific language governing permissions and
- ~ limitations under the License.
- -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
- <parent>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-parent_2.11</artifactId>
- <version>2.2.0-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <artifactId>spark-yarn_2.11</artifactId>
- <packaging>jar</packaging>
- <name>Spark Project YARN</name>
- <properties>
- <sbt.project.name>yarn</sbt.project.name>
- <jersey-1.version>1.9</jersey-1.version>
- </properties>
-
- <dependencies>
- <dependency>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-core_${scala.binary.version}</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-network-yarn_${scala.binary.version}</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-core_${scala.binary.version}</artifactId>
- <version>${project.version}</version>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-tags_${scala.binary.version}</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-yarn-api</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-yarn-common</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-yarn-server-web-proxy</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-yarn-client</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-client</artifactId>
- </dependency>
-
- <!-- Explicit listing of transitive deps that are shaded. Otherwise, odd compiler crashes. -->
- <dependency>
- <groupId>com.google.guava</groupId>
- <artifactId>guava</artifactId>
- </dependency>
- <dependency>
- <groupId>org.eclipse.jetty</groupId>
- <artifactId>jetty-server</artifactId>
- </dependency>
- <dependency>
- <groupId>org.eclipse.jetty</groupId>
- <artifactId>jetty-plus</artifactId>
- </dependency>
- <dependency>
- <groupId>org.eclipse.jetty</groupId>
- <artifactId>jetty-util</artifactId>
- </dependency>
- <dependency>
- <groupId>org.eclipse.jetty</groupId>
- <artifactId>jetty-http</artifactId>
- </dependency>
- <dependency>
- <groupId>org.eclipse.jetty</groupId>
- <artifactId>jetty-servlet</artifactId>
- </dependency>
- <dependency>
- <groupId>org.eclipse.jetty</groupId>
- <artifactId>jetty-servlets</artifactId>
- </dependency>
- <!-- End of shaded deps. -->
-
- <!--
- SPARK-10059: Explicitly add JSP dependencies for tests since the MiniYARN cluster needs them.
- -->
- <dependency>
- <groupId>org.eclipse.jetty.orbit</groupId>
- <artifactId>javax.servlet.jsp</artifactId>
- <version>2.2.0.v201112011158</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.eclipse.jetty.orbit</groupId>
- <artifactId>javax.servlet.jsp.jstl</artifactId>
- <version>1.2.0.v201105211821</version>
- <scope>test</scope>
- </dependency>
-
- <!--
- See SPARK-3710. hadoop-yarn-server-tests in Hadoop 2.2 fails to pull some needed
- dependencies, so they need to be added manually for the tests to work.
- -->
-
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-yarn-server-tests</artifactId>
- <classifier>tests</classifier>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.mockito</groupId>
- <artifactId>mockito-core</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.mortbay.jetty</groupId>
- <artifactId>jetty</artifactId>
- <version>6.1.26</version>
- <exclusions>
- <exclusion>
- <groupId>org.mortbay.jetty</groupId>
- <artifactId>servlet-api</artifactId>
- </exclusion>
- </exclusions>
- <scope>test</scope>
- </dependency>
-
- <!--
- Jersey 1 dependencies only required for YARN integration testing. Creating a YARN cluster
- in the JVM requires starting a Jersey 1-based web application.
- -->
- <dependency>
- <groupId>com.sun.jersey</groupId>
- <artifactId>jersey-core</artifactId>
- <scope>test</scope>
- <version>${jersey-1.version}</version>
- </dependency>
- <dependency>
- <groupId>com.sun.jersey</groupId>
- <artifactId>jersey-json</artifactId>
- <scope>test</scope>
- <version>${jersey-1.version}</version>
- </dependency>
- <dependency>
- <groupId>com.sun.jersey</groupId>
- <artifactId>jersey-server</artifactId>
- <scope>test</scope>
- <version>${jersey-1.version}</version>
- </dependency>
- <dependency>
- <groupId>com.sun.jersey.contribs</groupId>
- <artifactId>jersey-guice</artifactId>
- <scope>test</scope>
- <version>${jersey-1.version}</version>
- </dependency>
-
- <!--
- Testing Hive reflection needs hive on the test classpath only.
- It doesn't need the spark hive modules, so the -Phive flag is not checked.
- -->
- <dependency>
- <groupId>${hive.group}</groupId>
- <artifactId>hive-exec</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${hive.group}</groupId>
- <artifactId>hive-metastore</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.thrift</groupId>
- <artifactId>libthrift</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.thrift</groupId>
- <artifactId>libfb303</artifactId>
- <scope>test</scope>
- </dependency>
- </dependencies>
-
- <build>
- <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
- <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
- </build>
-
-</project>
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/resources/META-INF/services/org.apache.spark.deploy.yarn.security.ServiceCredentialProvider
----------------------------------------------------------------------
diff --git a/yarn/src/main/resources/META-INF/services/org.apache.spark.deploy.yarn.security.ServiceCredentialProvider b/yarn/src/main/resources/META-INF/services/org.apache.spark.deploy.yarn.security.ServiceCredentialProvider
deleted file mode 100644
index 22ead56..0000000
--- a/yarn/src/main/resources/META-INF/services/org.apache.spark.deploy.yarn.security.ServiceCredentialProvider
+++ /dev/null
@@ -1,3 +0,0 @@
-org.apache.spark.deploy.yarn.security.HDFSCredentialProvider
-org.apache.spark.deploy.yarn.security.HBaseCredentialProvider
-org.apache.spark.deploy.yarn.security.HiveCredentialProvider
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager
----------------------------------------------------------------------
diff --git a/yarn/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager b/yarn/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager
deleted file mode 100644
index 6e8a1eb..0000000
--- a/yarn/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager
+++ /dev/null
@@ -1 +0,0 @@
-org.apache.spark.scheduler.cluster.YarnClusterManager
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org
[05/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Posted by va...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
deleted file mode 100644
index be419ce..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ /dev/null
@@ -1,1541 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import java.io.{File, FileOutputStream, IOException, OutputStreamWriter}
-import java.net.{InetAddress, UnknownHostException, URI}
-import java.nio.ByteBuffer
-import java.nio.charset.StandardCharsets
-import java.util.{Properties, UUID}
-import java.util.zip.{ZipEntry, ZipOutputStream}
-
-import scala.collection.JavaConverters._
-import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, ListBuffer, Map}
-import scala.util.{Failure, Success, Try}
-import scala.util.control.NonFatal
-
-import com.google.common.base.Objects
-import com.google.common.io.Files
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs._
-import org.apache.hadoop.fs.permission.FsPermission
-import org.apache.hadoop.io.DataOutputBuffer
-import org.apache.hadoop.mapreduce.MRJobConfig
-import org.apache.hadoop.security.{Credentials, UserGroupInformation}
-import org.apache.hadoop.util.StringUtils
-import org.apache.hadoop.yarn.api._
-import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
-import org.apache.hadoop.yarn.api.protocolrecords._
-import org.apache.hadoop.yarn.api.records._
-import org.apache.hadoop.yarn.client.api.{YarnClient, YarnClientApplication}
-import org.apache.hadoop.yarn.conf.YarnConfiguration
-import org.apache.hadoop.yarn.exceptions.ApplicationNotFoundException
-import org.apache.hadoop.yarn.util.Records
-
-import org.apache.spark.{SecurityManager, SparkConf, SparkContext, SparkException}
-import org.apache.spark.deploy.SparkHadoopUtil
-import org.apache.spark.deploy.yarn.config._
-import org.apache.spark.deploy.yarn.security.ConfigurableCredentialManager
-import org.apache.spark.internal.Logging
-import org.apache.spark.internal.config._
-import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle, YarnCommandBuilderUtils}
-import org.apache.spark.util.{CallerContext, Utils}
-
-private[spark] class Client(
- val args: ClientArguments,
- val hadoopConf: Configuration,
- val sparkConf: SparkConf)
- extends Logging {
-
- import Client._
- import YarnSparkHadoopUtil._
-
- def this(clientArgs: ClientArguments, spConf: SparkConf) =
- this(clientArgs, SparkHadoopUtil.get.newConfiguration(spConf), spConf)
-
- private val yarnClient = YarnClient.createYarnClient
- private val yarnConf = new YarnConfiguration(hadoopConf)
-
- private val isClusterMode = sparkConf.get("spark.submit.deployMode", "client") == "cluster"
-
- // AM related configurations
- private val amMemory = if (isClusterMode) {
- sparkConf.get(DRIVER_MEMORY).toInt
- } else {
- sparkConf.get(AM_MEMORY).toInt
- }
- private val amMemoryOverhead = {
- val amMemoryOverheadEntry = if (isClusterMode) DRIVER_MEMORY_OVERHEAD else AM_MEMORY_OVERHEAD
- sparkConf.get(amMemoryOverheadEntry).getOrElse(
- math.max((MEMORY_OVERHEAD_FACTOR * amMemory).toLong, MEMORY_OVERHEAD_MIN)).toInt
- }
- private val amCores = if (isClusterMode) {
- sparkConf.get(DRIVER_CORES)
- } else {
- sparkConf.get(AM_CORES)
- }
-
- // Executor related configurations
- private val executorMemory = sparkConf.get(EXECUTOR_MEMORY)
- private val executorMemoryOverhead = sparkConf.get(EXECUTOR_MEMORY_OVERHEAD).getOrElse(
- math.max((MEMORY_OVERHEAD_FACTOR * executorMemory).toLong, MEMORY_OVERHEAD_MIN)).toInt
-
- private val distCacheMgr = new ClientDistributedCacheManager()
-
- private var loginFromKeytab = false
- private var principal: String = null
- private var keytab: String = null
- private var credentials: Credentials = null
-
- private val launcherBackend = new LauncherBackend() {
- override def onStopRequest(): Unit = {
- if (isClusterMode && appId != null) {
- yarnClient.killApplication(appId)
- } else {
- setState(SparkAppHandle.State.KILLED)
- stop()
- }
- }
- }
- private val fireAndForget = isClusterMode && !sparkConf.get(WAIT_FOR_APP_COMPLETION)
-
- private var appId: ApplicationId = null
-
- // The app staging dir based on the STAGING_DIR configuration if configured
- // otherwise based on the users home directory.
- private val appStagingBaseDir = sparkConf.get(STAGING_DIR).map { new Path(_) }
- .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory())
-
- private val credentialManager = new ConfigurableCredentialManager(sparkConf, hadoopConf)
-
- def reportLauncherState(state: SparkAppHandle.State): Unit = {
- launcherBackend.setState(state)
- }
-
- def stop(): Unit = {
- launcherBackend.close()
- yarnClient.stop()
- // Unset YARN mode system env variable, to allow switching between cluster types.
- System.clearProperty("SPARK_YARN_MODE")
- }
-
- /**
- * Submit an application running our ApplicationMaster to the ResourceManager.
- *
- * The stable Yarn API provides a convenience method (YarnClient#createApplication) for
- * creating applications and setting up the application submission context. This was not
- * available in the alpha API.
- */
- def submitApplication(): ApplicationId = {
- var appId: ApplicationId = null
- try {
- launcherBackend.connect()
- // Setup the credentials before doing anything else,
- // so we have don't have issues at any point.
- setupCredentials()
- yarnClient.init(yarnConf)
- yarnClient.start()
-
- logInfo("Requesting a new application from cluster with %d NodeManagers"
- .format(yarnClient.getYarnClusterMetrics.getNumNodeManagers))
-
- // Get a new application from our RM
- val newApp = yarnClient.createApplication()
- val newAppResponse = newApp.getNewApplicationResponse()
- appId = newAppResponse.getApplicationId()
- reportLauncherState(SparkAppHandle.State.SUBMITTED)
- launcherBackend.setAppId(appId.toString)
-
- new CallerContext("CLIENT", sparkConf.get(APP_CALLER_CONTEXT),
- Option(appId.toString)).setCurrentContext()
-
- // Verify whether the cluster has enough resources for our AM
- verifyClusterResources(newAppResponse)
-
- // Set up the appropriate contexts to launch our AM
- val containerContext = createContainerLaunchContext(newAppResponse)
- val appContext = createApplicationSubmissionContext(newApp, containerContext)
-
- // Finally, submit and monitor the application
- logInfo(s"Submitting application $appId to ResourceManager")
- yarnClient.submitApplication(appContext)
- appId
- } catch {
- case e: Throwable =>
- if (appId != null) {
- cleanupStagingDir(appId)
- }
- throw e
- }
- }
-
- /**
- * Cleanup application staging directory.
- */
- private def cleanupStagingDir(appId: ApplicationId): Unit = {
- val stagingDirPath = new Path(appStagingBaseDir, getAppStagingDir(appId))
- try {
- val preserveFiles = sparkConf.get(PRESERVE_STAGING_FILES)
- val fs = stagingDirPath.getFileSystem(hadoopConf)
- if (!preserveFiles && fs.delete(stagingDirPath, true)) {
- logInfo(s"Deleted staging directory $stagingDirPath")
- }
- } catch {
- case ioe: IOException =>
- logWarning("Failed to cleanup staging dir " + stagingDirPath, ioe)
- }
- }
-
- /**
- * Set up the context for submitting our ApplicationMaster.
- * This uses the YarnClientApplication not available in the Yarn alpha API.
- */
- def createApplicationSubmissionContext(
- newApp: YarnClientApplication,
- containerContext: ContainerLaunchContext): ApplicationSubmissionContext = {
- val appContext = newApp.getApplicationSubmissionContext
- appContext.setApplicationName(sparkConf.get("spark.app.name", "Spark"))
- appContext.setQueue(sparkConf.get(QUEUE_NAME))
- appContext.setAMContainerSpec(containerContext)
- appContext.setApplicationType("SPARK")
-
- sparkConf.get(APPLICATION_TAGS).foreach { tags =>
- try {
- // The setApplicationTags method was only introduced in Hadoop 2.4+, so we need to use
- // reflection to set it, printing a warning if a tag was specified but the YARN version
- // doesn't support it.
- val method = appContext.getClass().getMethod(
- "setApplicationTags", classOf[java.util.Set[String]])
- method.invoke(appContext, new java.util.HashSet[String](tags.asJava))
- } catch {
- case e: NoSuchMethodException =>
- logWarning(s"Ignoring ${APPLICATION_TAGS.key} because this version of " +
- "YARN does not support it")
- }
- }
- sparkConf.get(MAX_APP_ATTEMPTS) match {
- case Some(v) => appContext.setMaxAppAttempts(v)
- case None => logDebug(s"${MAX_APP_ATTEMPTS.key} is not set. " +
- "Cluster's default value will be used.")
- }
-
- sparkConf.get(AM_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS).foreach { interval =>
- try {
- val method = appContext.getClass().getMethod(
- "setAttemptFailuresValidityInterval", classOf[Long])
- method.invoke(appContext, interval: java.lang.Long)
- } catch {
- case e: NoSuchMethodException =>
- logWarning(s"Ignoring ${AM_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS.key} because " +
- "the version of YARN does not support it")
- }
- }
-
- val capability = Records.newRecord(classOf[Resource])
- capability.setMemory(amMemory + amMemoryOverhead)
- capability.setVirtualCores(amCores)
-
- sparkConf.get(AM_NODE_LABEL_EXPRESSION) match {
- case Some(expr) =>
- try {
- val amRequest = Records.newRecord(classOf[ResourceRequest])
- amRequest.setResourceName(ResourceRequest.ANY)
- amRequest.setPriority(Priority.newInstance(0))
- amRequest.setCapability(capability)
- amRequest.setNumContainers(1)
- val method = amRequest.getClass.getMethod("setNodeLabelExpression", classOf[String])
- method.invoke(amRequest, expr)
-
- val setResourceRequestMethod =
- appContext.getClass.getMethod("setAMContainerResourceRequest", classOf[ResourceRequest])
- setResourceRequestMethod.invoke(appContext, amRequest)
- } catch {
- case e: NoSuchMethodException =>
- logWarning(s"Ignoring ${AM_NODE_LABEL_EXPRESSION.key} because the version " +
- "of YARN does not support it")
- appContext.setResource(capability)
- }
- case None =>
- appContext.setResource(capability)
- }
-
- sparkConf.get(ROLLED_LOG_INCLUDE_PATTERN).foreach { includePattern =>
- try {
- val logAggregationContext = Records.newRecord(
- Utils.classForName("org.apache.hadoop.yarn.api.records.LogAggregationContext"))
- .asInstanceOf[Object]
-
- val setRolledLogsIncludePatternMethod =
- logAggregationContext.getClass.getMethod("setRolledLogsIncludePattern", classOf[String])
- setRolledLogsIncludePatternMethod.invoke(logAggregationContext, includePattern)
-
- sparkConf.get(ROLLED_LOG_EXCLUDE_PATTERN).foreach { excludePattern =>
- val setRolledLogsExcludePatternMethod =
- logAggregationContext.getClass.getMethod("setRolledLogsExcludePattern", classOf[String])
- setRolledLogsExcludePatternMethod.invoke(logAggregationContext, excludePattern)
- }
-
- val setLogAggregationContextMethod =
- appContext.getClass.getMethod("setLogAggregationContext",
- Utils.classForName("org.apache.hadoop.yarn.api.records.LogAggregationContext"))
- setLogAggregationContextMethod.invoke(appContext, logAggregationContext)
- } catch {
- case NonFatal(e) =>
- logWarning(s"Ignoring ${ROLLED_LOG_INCLUDE_PATTERN.key} because the version of YARN " +
- s"does not support it", e)
- }
- }
-
- appContext
- }
-
- /** Set up security tokens for launching our ApplicationMaster container. */
- private def setupSecurityToken(amContainer: ContainerLaunchContext): Unit = {
- val dob = new DataOutputBuffer
- credentials.writeTokenStorageToStream(dob)
- amContainer.setTokens(ByteBuffer.wrap(dob.getData))
- }
-
- /** Get the application report from the ResourceManager for an application we have submitted. */
- def getApplicationReport(appId: ApplicationId): ApplicationReport =
- yarnClient.getApplicationReport(appId)
-
- /**
- * Return the security token used by this client to communicate with the ApplicationMaster.
- * If no security is enabled, the token returned by the report is null.
- */
- private def getClientToken(report: ApplicationReport): String =
- Option(report.getClientToAMToken).map(_.toString).getOrElse("")
-
- /**
- * Fail fast if we have requested more resources per container than is available in the cluster.
- */
- private def verifyClusterResources(newAppResponse: GetNewApplicationResponse): Unit = {
- val maxMem = newAppResponse.getMaximumResourceCapability().getMemory()
- logInfo("Verifying our application has not requested more than the maximum " +
- s"memory capability of the cluster ($maxMem MB per container)")
- val executorMem = executorMemory + executorMemoryOverhead
- if (executorMem > maxMem) {
- throw new IllegalArgumentException(s"Required executor memory ($executorMemory" +
- s"+$executorMemoryOverhead MB) is above the max threshold ($maxMem MB) of this cluster! " +
- "Please check the values of 'yarn.scheduler.maximum-allocation-mb' and/or " +
- "'yarn.nodemanager.resource.memory-mb'.")
- }
- val amMem = amMemory + amMemoryOverhead
- if (amMem > maxMem) {
- throw new IllegalArgumentException(s"Required AM memory ($amMemory" +
- s"+$amMemoryOverhead MB) is above the max threshold ($maxMem MB) of this cluster! " +
- "Please increase the value of 'yarn.scheduler.maximum-allocation-mb'.")
- }
- logInfo("Will allocate AM container, with %d MB memory including %d MB overhead".format(
- amMem,
- amMemoryOverhead))
-
- // We could add checks to make sure the entire cluster has enough resources but that involves
- // getting all the node reports and computing ourselves.
- }
-
- /**
- * Copy the given file to a remote file system (e.g. HDFS) if needed.
- * The file is only copied if the source and destination file systems are different. This is used
- * for preparing resources for launching the ApplicationMaster container. Exposed for testing.
- */
- private[yarn] def copyFileToRemote(
- destDir: Path,
- srcPath: Path,
- replication: Short,
- force: Boolean = false,
- destName: Option[String] = None): Path = {
- val destFs = destDir.getFileSystem(hadoopConf)
- val srcFs = srcPath.getFileSystem(hadoopConf)
- var destPath = srcPath
- if (force || !compareFs(srcFs, destFs)) {
- destPath = new Path(destDir, destName.getOrElse(srcPath.getName()))
- logInfo(s"Uploading resource $srcPath -> $destPath")
- FileUtil.copy(srcFs, srcPath, destFs, destPath, false, hadoopConf)
- destFs.setReplication(destPath, replication)
- destFs.setPermission(destPath, new FsPermission(APP_FILE_PERMISSION))
- } else {
- logInfo(s"Source and destination file systems are the same. Not copying $srcPath")
- }
- // Resolve any symlinks in the URI path so using a "current" symlink to point to a specific
- // version shows the specific version in the distributed cache configuration
- val qualifiedDestPath = destFs.makeQualified(destPath)
- val fc = FileContext.getFileContext(qualifiedDestPath.toUri(), hadoopConf)
- fc.resolvePath(qualifiedDestPath)
- }
-
- /**
- * Upload any resources to the distributed cache if needed. If a resource is intended to be
- * consumed locally, set up the appropriate config for downstream code to handle it properly.
- * This is used for setting up a container launch context for our ApplicationMaster.
- * Exposed for testing.
- */
- def prepareLocalResources(
- destDir: Path,
- pySparkArchives: Seq[String]): HashMap[String, LocalResource] = {
- logInfo("Preparing resources for our AM container")
- // Upload Spark and the application JAR to the remote file system if necessary,
- // and add them as local resources to the application master.
- val fs = destDir.getFileSystem(hadoopConf)
-
- // Merge credentials obtained from registered providers
- val nearestTimeOfNextRenewal = credentialManager.obtainCredentials(hadoopConf, credentials)
-
- if (credentials != null) {
- logDebug(YarnSparkHadoopUtil.get.dumpTokens(credentials).mkString("\n"))
- }
-
- // If we use principal and keytab to login, also credentials can be renewed some time
- // after current time, we should pass the next renewal and updating time to credential
- // renewer and updater.
- if (loginFromKeytab && nearestTimeOfNextRenewal > System.currentTimeMillis() &&
- nearestTimeOfNextRenewal != Long.MaxValue) {
-
- // Valid renewal time is 75% of next renewal time, and the valid update time will be
- // slightly later then renewal time (80% of next renewal time). This is to make sure
- // credentials are renewed and updated before expired.
- val currTime = System.currentTimeMillis()
- val renewalTime = (nearestTimeOfNextRenewal - currTime) * 0.75 + currTime
- val updateTime = (nearestTimeOfNextRenewal - currTime) * 0.8 + currTime
-
- sparkConf.set(CREDENTIALS_RENEWAL_TIME, renewalTime.toLong)
- sparkConf.set(CREDENTIALS_UPDATE_TIME, updateTime.toLong)
- }
-
- // Used to keep track of URIs added to the distributed cache. If the same URI is added
- // multiple times, YARN will fail to launch containers for the app with an internal
- // error.
- val distributedUris = new HashSet[String]
- // Used to keep track of URIs(files) added to the distribute cache have the same name. If
- // same name but different path files are added multiple time, YARN will fail to launch
- // containers for the app with an internal error.
- val distributedNames = new HashSet[String]
-
- val replication = sparkConf.get(STAGING_FILE_REPLICATION).map(_.toShort)
- .getOrElse(fs.getDefaultReplication(destDir))
- val localResources = HashMap[String, LocalResource]()
- FileSystem.mkdirs(fs, destDir, new FsPermission(STAGING_DIR_PERMISSION))
-
- val statCache: Map[URI, FileStatus] = HashMap[URI, FileStatus]()
-
- def addDistributedUri(uri: URI): Boolean = {
- val uriStr = uri.toString()
- val fileName = new File(uri.getPath).getName
- if (distributedUris.contains(uriStr)) {
- logWarning(s"Same path resource $uri added multiple times to distributed cache.")
- false
- } else if (distributedNames.contains(fileName)) {
- logWarning(s"Same name resource $uri added multiple times to distributed cache")
- false
- } else {
- distributedUris += uriStr
- distributedNames += fileName
- true
- }
- }
-
- /**
- * Distribute a file to the cluster.
- *
- * If the file's path is a "local:" URI, it's actually not distributed. Other files are copied
- * to HDFS (if not already there) and added to the application's distributed cache.
- *
- * @param path URI of the file to distribute.
- * @param resType Type of resource being distributed.
- * @param destName Name of the file in the distributed cache.
- * @param targetDir Subdirectory where to place the file.
- * @param appMasterOnly Whether to distribute only to the AM.
- * @return A 2-tuple. First item is whether the file is a "local:" URI. Second item is the
- * localized path for non-local paths, or the input `path` for local paths.
- * The localized path will be null if the URI has already been added to the cache.
- */
- def distribute(
- path: String,
- resType: LocalResourceType = LocalResourceType.FILE,
- destName: Option[String] = None,
- targetDir: Option[String] = None,
- appMasterOnly: Boolean = false): (Boolean, String) = {
- val trimmedPath = path.trim()
- val localURI = Utils.resolveURI(trimmedPath)
- if (localURI.getScheme != LOCAL_SCHEME) {
- if (addDistributedUri(localURI)) {
- val localPath = getQualifiedLocalPath(localURI, hadoopConf)
- val linkname = targetDir.map(_ + "/").getOrElse("") +
- destName.orElse(Option(localURI.getFragment())).getOrElse(localPath.getName())
- val destPath = copyFileToRemote(destDir, localPath, replication)
- val destFs = FileSystem.get(destPath.toUri(), hadoopConf)
- distCacheMgr.addResource(
- destFs, hadoopConf, destPath, localResources, resType, linkname, statCache,
- appMasterOnly = appMasterOnly)
- (false, linkname)
- } else {
- (false, null)
- }
- } else {
- (true, trimmedPath)
- }
- }
-
- // If we passed in a keytab, make sure we copy the keytab to the staging directory on
- // HDFS, and setup the relevant environment vars, so the AM can login again.
- if (loginFromKeytab) {
- logInfo("To enable the AM to login from keytab, credentials are being copied over to the AM" +
- " via the YARN Secure Distributed Cache.")
- val (_, localizedPath) = distribute(keytab,
- destName = sparkConf.get(KEYTAB),
- appMasterOnly = true)
- require(localizedPath != null, "Keytab file already distributed.")
- }
-
- /**
- * Add Spark to the cache. There are two settings that control what files to add to the cache:
- * - if a Spark archive is defined, use the archive. The archive is expected to contain
- * jar files at its root directory.
- * - if a list of jars is provided, filter the non-local ones, resolve globs, and
- * add the found files to the cache.
- *
- * Note that the archive cannot be a "local" URI. If none of the above settings are found,
- * then upload all files found in $SPARK_HOME/jars.
- */
- val sparkArchive = sparkConf.get(SPARK_ARCHIVE)
- if (sparkArchive.isDefined) {
- val archive = sparkArchive.get
- require(!isLocalUri(archive), s"${SPARK_ARCHIVE.key} cannot be a local URI.")
- distribute(Utils.resolveURI(archive).toString,
- resType = LocalResourceType.ARCHIVE,
- destName = Some(LOCALIZED_LIB_DIR))
- } else {
- sparkConf.get(SPARK_JARS) match {
- case Some(jars) =>
- // Break the list of jars to upload, and resolve globs.
- val localJars = new ArrayBuffer[String]()
- jars.foreach { jar =>
- if (!isLocalUri(jar)) {
- val path = getQualifiedLocalPath(Utils.resolveURI(jar), hadoopConf)
- val pathFs = FileSystem.get(path.toUri(), hadoopConf)
- pathFs.globStatus(path).filter(_.isFile()).foreach { entry =>
- distribute(entry.getPath().toUri().toString(),
- targetDir = Some(LOCALIZED_LIB_DIR))
- }
- } else {
- localJars += jar
- }
- }
-
- // Propagate the local URIs to the containers using the configuration.
- sparkConf.set(SPARK_JARS, localJars)
-
- case None =>
- // No configuration, so fall back to uploading local jar files.
- logWarning(s"Neither ${SPARK_JARS.key} nor ${SPARK_ARCHIVE.key} is set, falling back " +
- "to uploading libraries under SPARK_HOME.")
- val jarsDir = new File(YarnCommandBuilderUtils.findJarsDir(
- sparkConf.getenv("SPARK_HOME")))
- val jarsArchive = File.createTempFile(LOCALIZED_LIB_DIR, ".zip",
- new File(Utils.getLocalDir(sparkConf)))
- val jarsStream = new ZipOutputStream(new FileOutputStream(jarsArchive))
-
- try {
- jarsStream.setLevel(0)
- jarsDir.listFiles().foreach { f =>
- if (f.isFile && f.getName.toLowerCase().endsWith(".jar") && f.canRead) {
- jarsStream.putNextEntry(new ZipEntry(f.getName))
- Files.copy(f, jarsStream)
- jarsStream.closeEntry()
- }
- }
- } finally {
- jarsStream.close()
- }
-
- distribute(jarsArchive.toURI.getPath,
- resType = LocalResourceType.ARCHIVE,
- destName = Some(LOCALIZED_LIB_DIR))
- }
- }
-
- /**
- * Copy user jar to the distributed cache if their scheme is not "local".
- * Otherwise, set the corresponding key in our SparkConf to handle it downstream.
- */
- Option(args.userJar).filter(_.trim.nonEmpty).foreach { jar =>
- val (isLocal, localizedPath) = distribute(jar, destName = Some(APP_JAR_NAME))
- if (isLocal) {
- require(localizedPath != null, s"Path $jar already distributed")
- // If the resource is intended for local use only, handle this downstream
- // by setting the appropriate property
- sparkConf.set(APP_JAR, localizedPath)
- }
- }
-
- /**
- * Do the same for any additional resources passed in through ClientArguments.
- * Each resource category is represented by a 3-tuple of:
- * (1) comma separated list of resources in this category,
- * (2) resource type, and
- * (3) whether to add these resources to the classpath
- */
- val cachedSecondaryJarLinks = ListBuffer.empty[String]
- List(
- (sparkConf.get(JARS_TO_DISTRIBUTE), LocalResourceType.FILE, true),
- (sparkConf.get(FILES_TO_DISTRIBUTE), LocalResourceType.FILE, false),
- (sparkConf.get(ARCHIVES_TO_DISTRIBUTE), LocalResourceType.ARCHIVE, false)
- ).foreach { case (flist, resType, addToClasspath) =>
- flist.foreach { file =>
- val (_, localizedPath) = distribute(file, resType = resType)
- // If addToClassPath, we ignore adding jar multiple times to distitrbuted cache.
- if (addToClasspath) {
- if (localizedPath != null) {
- cachedSecondaryJarLinks += localizedPath
- }
- } else {
- if (localizedPath == null) {
- throw new IllegalArgumentException(s"Attempt to add ($file) multiple times" +
- " to the distributed cache.")
- }
- }
- }
- }
- if (cachedSecondaryJarLinks.nonEmpty) {
- sparkConf.set(SECONDARY_JARS, cachedSecondaryJarLinks)
- }
-
- if (isClusterMode && args.primaryPyFile != null) {
- distribute(args.primaryPyFile, appMasterOnly = true)
- }
-
- pySparkArchives.foreach { f => distribute(f) }
-
- // The python files list needs to be treated especially. All files that are not an
- // archive need to be placed in a subdirectory that will be added to PYTHONPATH.
- sparkConf.get(PY_FILES).foreach { f =>
- val targetDir = if (f.endsWith(".py")) Some(LOCALIZED_PYTHON_DIR) else None
- distribute(f, targetDir = targetDir)
- }
-
- // Update the configuration with all the distributed files, minus the conf archive. The
- // conf archive will be handled by the AM differently so that we avoid having to send
- // this configuration by other means. See SPARK-14602 for one reason of why this is needed.
- distCacheMgr.updateConfiguration(sparkConf)
-
- // Upload the conf archive to HDFS manually, and record its location in the configuration.
- // This will allow the AM to know where the conf archive is in HDFS, so that it can be
- // distributed to the containers.
- //
- // This code forces the archive to be copied, so that unit tests pass (since in that case both
- // file systems are the same and the archive wouldn't normally be copied). In most (all?)
- // deployments, the archive would be copied anyway, since it's a temp file in the local file
- // system.
- val remoteConfArchivePath = new Path(destDir, LOCALIZED_CONF_ARCHIVE)
- val remoteFs = FileSystem.get(remoteConfArchivePath.toUri(), hadoopConf)
- sparkConf.set(CACHED_CONF_ARCHIVE, remoteConfArchivePath.toString())
-
- val localConfArchive = new Path(createConfArchive().toURI())
- copyFileToRemote(destDir, localConfArchive, replication, force = true,
- destName = Some(LOCALIZED_CONF_ARCHIVE))
-
- // Manually add the config archive to the cache manager so that the AM is launched with
- // the proper files set up.
- distCacheMgr.addResource(
- remoteFs, hadoopConf, remoteConfArchivePath, localResources, LocalResourceType.ARCHIVE,
- LOCALIZED_CONF_DIR, statCache, appMasterOnly = false)
-
- // Clear the cache-related entries from the configuration to avoid them polluting the
- // UI's environment page. This works for client mode; for cluster mode, this is handled
- // by the AM.
- CACHE_CONFIGS.foreach(sparkConf.remove)
-
- localResources
- }
-
- /**
- * Create an archive with the config files for distribution.
- *
- * These will be used by AM and executors. The files are zipped and added to the job as an
- * archive, so that YARN will explode it when distributing to AM and executors. This directory
- * is then added to the classpath of AM and executor process, just to make sure that everybody
- * is using the same default config.
- *
- * This follows the order of precedence set by the startup scripts, in which HADOOP_CONF_DIR
- * shows up in the classpath before YARN_CONF_DIR.
- *
- * Currently this makes a shallow copy of the conf directory. If there are cases where a
- * Hadoop config directory contains subdirectories, this code will have to be fixed.
- *
- * The archive also contains some Spark configuration. Namely, it saves the contents of
- * SparkConf in a file to be loaded by the AM process.
- */
- private def createConfArchive(): File = {
- val hadoopConfFiles = new HashMap[String, File]()
-
- // Uploading $SPARK_CONF_DIR/log4j.properties file to the distributed cache to make sure that
- // the executors will use the latest configurations instead of the default values. This is
- // required when user changes log4j.properties directly to set the log configurations. If
- // configuration file is provided through --files then executors will be taking configurations
- // from --files instead of $SPARK_CONF_DIR/log4j.properties.
-
- // Also uploading metrics.properties to distributed cache if exists in classpath.
- // If user specify this file using --files then executors will use the one
- // from --files instead.
- for { prop <- Seq("log4j.properties", "metrics.properties")
- url <- Option(Utils.getContextOrSparkClassLoader.getResource(prop))
- if url.getProtocol == "file" } {
- hadoopConfFiles(prop) = new File(url.getPath)
- }
-
- Seq("HADOOP_CONF_DIR", "YARN_CONF_DIR").foreach { envKey =>
- sys.env.get(envKey).foreach { path =>
- val dir = new File(path)
- if (dir.isDirectory()) {
- val files = dir.listFiles()
- if (files == null) {
- logWarning("Failed to list files under directory " + dir)
- } else {
- files.foreach { file =>
- if (file.isFile && !hadoopConfFiles.contains(file.getName())) {
- hadoopConfFiles(file.getName()) = file
- }
- }
- }
- }
- }
- }
-
- val confArchive = File.createTempFile(LOCALIZED_CONF_DIR, ".zip",
- new File(Utils.getLocalDir(sparkConf)))
- val confStream = new ZipOutputStream(new FileOutputStream(confArchive))
-
- try {
- confStream.setLevel(0)
- hadoopConfFiles.foreach { case (name, file) =>
- if (file.canRead()) {
- confStream.putNextEntry(new ZipEntry(name))
- Files.copy(file, confStream)
- confStream.closeEntry()
- }
- }
-
- // Save Spark configuration to a file in the archive.
- val props = new Properties()
- sparkConf.getAll.foreach { case (k, v) => props.setProperty(k, v) }
- confStream.putNextEntry(new ZipEntry(SPARK_CONF_FILE))
- val writer = new OutputStreamWriter(confStream, StandardCharsets.UTF_8)
- props.store(writer, "Spark configuration.")
- writer.flush()
- confStream.closeEntry()
- } finally {
- confStream.close()
- }
- confArchive
- }
-
- /**
- * Set up the environment for launching our ApplicationMaster container.
- */
- private def setupLaunchEnv(
- stagingDirPath: Path,
- pySparkArchives: Seq[String]): HashMap[String, String] = {
- logInfo("Setting up the launch environment for our AM container")
- val env = new HashMap[String, String]()
- populateClasspath(args, yarnConf, sparkConf, env, sparkConf.get(DRIVER_CLASS_PATH))
- env("SPARK_YARN_MODE") = "true"
- env("SPARK_YARN_STAGING_DIR") = stagingDirPath.toString
- env("SPARK_USER") = UserGroupInformation.getCurrentUser().getShortUserName()
- if (loginFromKeytab) {
- val credentialsFile = "credentials-" + UUID.randomUUID().toString
- sparkConf.set(CREDENTIALS_FILE_PATH, new Path(stagingDirPath, credentialsFile).toString)
- logInfo(s"Credentials file set to: $credentialsFile")
- }
-
- // Pick up any environment variables for the AM provided through spark.yarn.appMasterEnv.*
- val amEnvPrefix = "spark.yarn.appMasterEnv."
- sparkConf.getAll
- .filter { case (k, v) => k.startsWith(amEnvPrefix) }
- .map { case (k, v) => (k.substring(amEnvPrefix.length), v) }
- .foreach { case (k, v) => YarnSparkHadoopUtil.addPathToEnvironment(env, k, v) }
-
- // Keep this for backwards compatibility but users should move to the config
- sys.env.get("SPARK_YARN_USER_ENV").foreach { userEnvs =>
- // Allow users to specify some environment variables.
- YarnSparkHadoopUtil.setEnvFromInputString(env, userEnvs)
- // Pass SPARK_YARN_USER_ENV itself to the AM so it can use it to set up executor environments.
- env("SPARK_YARN_USER_ENV") = userEnvs
- }
-
- // If pyFiles contains any .py files, we need to add LOCALIZED_PYTHON_DIR to the PYTHONPATH
- // of the container processes too. Add all non-.py files directly to PYTHONPATH.
- //
- // NOTE: the code currently does not handle .py files defined with a "local:" scheme.
- val pythonPath = new ListBuffer[String]()
- val (pyFiles, pyArchives) = sparkConf.get(PY_FILES).partition(_.endsWith(".py"))
- if (pyFiles.nonEmpty) {
- pythonPath += buildPath(YarnSparkHadoopUtil.expandEnvironment(Environment.PWD),
- LOCALIZED_PYTHON_DIR)
- }
- (pySparkArchives ++ pyArchives).foreach { path =>
- val uri = Utils.resolveURI(path)
- if (uri.getScheme != LOCAL_SCHEME) {
- pythonPath += buildPath(YarnSparkHadoopUtil.expandEnvironment(Environment.PWD),
- new Path(uri).getName())
- } else {
- pythonPath += uri.getPath()
- }
- }
-
- // Finally, update the Spark config to propagate PYTHONPATH to the AM and executors.
- if (pythonPath.nonEmpty) {
- val pythonPathStr = (sys.env.get("PYTHONPATH") ++ pythonPath)
- .mkString(YarnSparkHadoopUtil.getClassPathSeparator)
- env("PYTHONPATH") = pythonPathStr
- sparkConf.setExecutorEnv("PYTHONPATH", pythonPathStr)
- }
-
- // In cluster mode, if the deprecated SPARK_JAVA_OPTS is set, we need to propagate it to
- // executors. But we can't just set spark.executor.extraJavaOptions, because the driver's
- // SparkContext will not let that set spark* system properties, which is expected behavior for
- // Yarn clients. So propagate it through the environment.
- //
- // Note that to warn the user about the deprecation in cluster mode, some code from
- // SparkConf#validateSettings() is duplicated here (to avoid triggering the condition
- // described above).
- if (isClusterMode) {
- sys.env.get("SPARK_JAVA_OPTS").foreach { value =>
- val warning =
- s"""
- |SPARK_JAVA_OPTS was detected (set to '$value').
- |This is deprecated in Spark 1.0+.
- |
- |Please instead use:
- | - ./spark-submit with conf/spark-defaults.conf to set defaults for an application
- | - ./spark-submit with --driver-java-options to set -X options for a driver
- | - spark.executor.extraJavaOptions to set -X options for executors
- """.stripMargin
- logWarning(warning)
- for (proc <- Seq("driver", "executor")) {
- val key = s"spark.$proc.extraJavaOptions"
- if (sparkConf.contains(key)) {
- throw new SparkException(s"Found both $key and SPARK_JAVA_OPTS. Use only the former.")
- }
- }
- env("SPARK_JAVA_OPTS") = value
- }
- // propagate PYSPARK_DRIVER_PYTHON and PYSPARK_PYTHON to driver in cluster mode
- Seq("PYSPARK_DRIVER_PYTHON", "PYSPARK_PYTHON").foreach { envname =>
- if (!env.contains(envname)) {
- sys.env.get(envname).foreach(env(envname) = _)
- }
- }
- }
-
- sys.env.get(ENV_DIST_CLASSPATH).foreach { dcp =>
- env(ENV_DIST_CLASSPATH) = dcp
- }
-
- env
- }
-
- /**
- * Set up a ContainerLaunchContext to launch our ApplicationMaster container.
- * This sets up the launch environment, java options, and the command for launching the AM.
- */
- private def createContainerLaunchContext(newAppResponse: GetNewApplicationResponse)
- : ContainerLaunchContext = {
- logInfo("Setting up container launch context for our AM")
- val appId = newAppResponse.getApplicationId
- val appStagingDirPath = new Path(appStagingBaseDir, getAppStagingDir(appId))
- val pySparkArchives =
- if (sparkConf.get(IS_PYTHON_APP)) {
- findPySparkArchives()
- } else {
- Nil
- }
- val launchEnv = setupLaunchEnv(appStagingDirPath, pySparkArchives)
- val localResources = prepareLocalResources(appStagingDirPath, pySparkArchives)
-
- val amContainer = Records.newRecord(classOf[ContainerLaunchContext])
- amContainer.setLocalResources(localResources.asJava)
- amContainer.setEnvironment(launchEnv.asJava)
-
- val javaOpts = ListBuffer[String]()
-
- // Set the environment variable through a command prefix
- // to append to the existing value of the variable
- var prefixEnv: Option[String] = None
-
- // Add Xmx for AM memory
- javaOpts += "-Xmx" + amMemory + "m"
-
- val tmpDir = new Path(
- YarnSparkHadoopUtil.expandEnvironment(Environment.PWD),
- YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR
- )
- javaOpts += "-Djava.io.tmpdir=" + tmpDir
-
- // TODO: Remove once cpuset version is pushed out.
- // The context is, default gc for server class machines ends up using all cores to do gc -
- // hence if there are multiple containers in same node, Spark GC affects all other containers'
- // performance (which can be that of other Spark containers)
- // Instead of using this, rely on cpusets by YARN to enforce "proper" Spark behavior in
- // multi-tenant environments. Not sure how default Java GC behaves if it is limited to subset
- // of cores on a node.
- val useConcurrentAndIncrementalGC = launchEnv.get("SPARK_USE_CONC_INCR_GC").exists(_.toBoolean)
- if (useConcurrentAndIncrementalGC) {
- // In our expts, using (default) throughput collector has severe perf ramifications in
- // multi-tenant machines
- javaOpts += "-XX:+UseConcMarkSweepGC"
- javaOpts += "-XX:MaxTenuringThreshold=31"
- javaOpts += "-XX:SurvivorRatio=8"
- javaOpts += "-XX:+CMSIncrementalMode"
- javaOpts += "-XX:+CMSIncrementalPacing"
- javaOpts += "-XX:CMSIncrementalDutyCycleMin=0"
- javaOpts += "-XX:CMSIncrementalDutyCycle=10"
- }
-
- // Include driver-specific java options if we are launching a driver
- if (isClusterMode) {
- val driverOpts = sparkConf.get(DRIVER_JAVA_OPTIONS).orElse(sys.env.get("SPARK_JAVA_OPTS"))
- driverOpts.foreach { opts =>
- javaOpts ++= Utils.splitCommandString(opts).map(YarnSparkHadoopUtil.escapeForShell)
- }
- val libraryPaths = Seq(sparkConf.get(DRIVER_LIBRARY_PATH),
- sys.props.get("spark.driver.libraryPath")).flatten
- if (libraryPaths.nonEmpty) {
- prefixEnv = Some(getClusterPath(sparkConf, Utils.libraryPathEnvPrefix(libraryPaths)))
- }
- if (sparkConf.get(AM_JAVA_OPTIONS).isDefined) {
- logWarning(s"${AM_JAVA_OPTIONS.key} will not take effect in cluster mode")
- }
- } else {
- // Validate and include yarn am specific java options in yarn-client mode.
- sparkConf.get(AM_JAVA_OPTIONS).foreach { opts =>
- if (opts.contains("-Dspark")) {
- val msg = s"${AM_JAVA_OPTIONS.key} is not allowed to set Spark options (was '$opts')."
- throw new SparkException(msg)
- }
- if (opts.contains("-Xmx")) {
- val msg = s"${AM_JAVA_OPTIONS.key} is not allowed to specify max heap memory settings " +
- s"(was '$opts'). Use spark.yarn.am.memory instead."
- throw new SparkException(msg)
- }
- javaOpts ++= Utils.splitCommandString(opts).map(YarnSparkHadoopUtil.escapeForShell)
- }
- sparkConf.get(AM_LIBRARY_PATH).foreach { paths =>
- prefixEnv = Some(getClusterPath(sparkConf, Utils.libraryPathEnvPrefix(Seq(paths))))
- }
- }
-
- // For log4j configuration to reference
- javaOpts += ("-Dspark.yarn.app.container.log.dir=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR)
- YarnCommandBuilderUtils.addPermGenSizeOpt(javaOpts)
-
- val userClass =
- if (isClusterMode) {
- Seq("--class", YarnSparkHadoopUtil.escapeForShell(args.userClass))
- } else {
- Nil
- }
- val userJar =
- if (args.userJar != null) {
- Seq("--jar", args.userJar)
- } else {
- Nil
- }
- val primaryPyFile =
- if (isClusterMode && args.primaryPyFile != null) {
- Seq("--primary-py-file", new Path(args.primaryPyFile).getName())
- } else {
- Nil
- }
- val primaryRFile =
- if (args.primaryRFile != null) {
- Seq("--primary-r-file", args.primaryRFile)
- } else {
- Nil
- }
- val amClass =
- if (isClusterMode) {
- Utils.classForName("org.apache.spark.deploy.yarn.ApplicationMaster").getName
- } else {
- Utils.classForName("org.apache.spark.deploy.yarn.ExecutorLauncher").getName
- }
- if (args.primaryRFile != null && args.primaryRFile.endsWith(".R")) {
- args.userArgs = ArrayBuffer(args.primaryRFile) ++ args.userArgs
- }
- val userArgs = args.userArgs.flatMap { arg =>
- Seq("--arg", YarnSparkHadoopUtil.escapeForShell(arg))
- }
- val amArgs =
- Seq(amClass) ++ userClass ++ userJar ++ primaryPyFile ++ primaryRFile ++
- userArgs ++ Seq(
- "--properties-file", buildPath(YarnSparkHadoopUtil.expandEnvironment(Environment.PWD),
- LOCALIZED_CONF_DIR, SPARK_CONF_FILE))
-
- // Command for the ApplicationMaster
- val commands = prefixEnv ++ Seq(
- YarnSparkHadoopUtil.expandEnvironment(Environment.JAVA_HOME) + "/bin/java", "-server"
- ) ++
- javaOpts ++ amArgs ++
- Seq(
- "1>", ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout",
- "2>", ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr")
-
- // TODO: it would be nicer to just make sure there are no null commands here
- val printableCommands = commands.map(s => if (s == null) "null" else s).toList
- amContainer.setCommands(printableCommands.asJava)
-
- logDebug("===============================================================================")
- logDebug("YARN AM launch context:")
- logDebug(s" user class: ${Option(args.userClass).getOrElse("N/A")}")
- logDebug(" env:")
- launchEnv.foreach { case (k, v) => logDebug(s" $k -> $v") }
- logDebug(" resources:")
- localResources.foreach { case (k, v) => logDebug(s" $k -> $v")}
- logDebug(" command:")
- logDebug(s" ${printableCommands.mkString(" ")}")
- logDebug("===============================================================================")
-
- // send the acl settings into YARN to control who has access via YARN interfaces
- val securityManager = new SecurityManager(sparkConf)
- amContainer.setApplicationACLs(
- YarnSparkHadoopUtil.getApplicationAclsForYarn(securityManager).asJava)
- setupSecurityToken(amContainer)
- amContainer
- }
-
- def setupCredentials(): Unit = {
- loginFromKeytab = sparkConf.contains(PRINCIPAL.key)
- if (loginFromKeytab) {
- principal = sparkConf.get(PRINCIPAL).get
- keytab = sparkConf.get(KEYTAB).orNull
-
- require(keytab != null, "Keytab must be specified when principal is specified.")
- logInfo("Attempting to login to the Kerberos" +
- s" using principal: $principal and keytab: $keytab")
- val f = new File(keytab)
- // Generate a file name that can be used for the keytab file, that does not conflict
- // with any user file.
- val keytabFileName = f.getName + "-" + UUID.randomUUID().toString
- sparkConf.set(KEYTAB.key, keytabFileName)
- sparkConf.set(PRINCIPAL.key, principal)
- }
- // Defensive copy of the credentials
- credentials = new Credentials(UserGroupInformation.getCurrentUser.getCredentials)
- }
-
- /**
- * Report the state of an application until it has exited, either successfully or
- * due to some failure, then return a pair of the yarn application state (FINISHED, FAILED,
- * KILLED, or RUNNING) and the final application state (UNDEFINED, SUCCEEDED, FAILED,
- * or KILLED).
- *
- * @param appId ID of the application to monitor.
- * @param returnOnRunning Whether to also return the application state when it is RUNNING.
- * @param logApplicationReport Whether to log details of the application report every iteration.
- * @return A pair of the yarn application state and the final application state.
- */
- def monitorApplication(
- appId: ApplicationId,
- returnOnRunning: Boolean = false,
- logApplicationReport: Boolean = true): (YarnApplicationState, FinalApplicationStatus) = {
- val interval = sparkConf.get(REPORT_INTERVAL)
- var lastState: YarnApplicationState = null
- while (true) {
- Thread.sleep(interval)
- val report: ApplicationReport =
- try {
- getApplicationReport(appId)
- } catch {
- case e: ApplicationNotFoundException =>
- logError(s"Application $appId not found.")
- cleanupStagingDir(appId)
- return (YarnApplicationState.KILLED, FinalApplicationStatus.KILLED)
- case NonFatal(e) =>
- logError(s"Failed to contact YARN for application $appId.", e)
- // Don't necessarily clean up staging dir because status is unknown
- return (YarnApplicationState.FAILED, FinalApplicationStatus.FAILED)
- }
- val state = report.getYarnApplicationState
-
- if (logApplicationReport) {
- logInfo(s"Application report for $appId (state: $state)")
-
- // If DEBUG is enabled, log report details every iteration
- // Otherwise, log them every time the application changes state
- if (log.isDebugEnabled) {
- logDebug(formatReportDetails(report))
- } else if (lastState != state) {
- logInfo(formatReportDetails(report))
- }
- }
-
- if (lastState != state) {
- state match {
- case YarnApplicationState.RUNNING =>
- reportLauncherState(SparkAppHandle.State.RUNNING)
- case YarnApplicationState.FINISHED =>
- report.getFinalApplicationStatus match {
- case FinalApplicationStatus.FAILED =>
- reportLauncherState(SparkAppHandle.State.FAILED)
- case FinalApplicationStatus.KILLED =>
- reportLauncherState(SparkAppHandle.State.KILLED)
- case _ =>
- reportLauncherState(SparkAppHandle.State.FINISHED)
- }
- case YarnApplicationState.FAILED =>
- reportLauncherState(SparkAppHandle.State.FAILED)
- case YarnApplicationState.KILLED =>
- reportLauncherState(SparkAppHandle.State.KILLED)
- case _ =>
- }
- }
-
- if (state == YarnApplicationState.FINISHED ||
- state == YarnApplicationState.FAILED ||
- state == YarnApplicationState.KILLED) {
- cleanupStagingDir(appId)
- return (state, report.getFinalApplicationStatus)
- }
-
- if (returnOnRunning && state == YarnApplicationState.RUNNING) {
- return (state, report.getFinalApplicationStatus)
- }
-
- lastState = state
- }
-
- // Never reached, but keeps compiler happy
- throw new SparkException("While loop is depleted! This should never happen...")
- }
-
- private def formatReportDetails(report: ApplicationReport): String = {
- val details = Seq[(String, String)](
- ("client token", getClientToken(report)),
- ("diagnostics", report.getDiagnostics),
- ("ApplicationMaster host", report.getHost),
- ("ApplicationMaster RPC port", report.getRpcPort.toString),
- ("queue", report.getQueue),
- ("start time", report.getStartTime.toString),
- ("final status", report.getFinalApplicationStatus.toString),
- ("tracking URL", report.getTrackingUrl),
- ("user", report.getUser)
- )
-
- // Use more loggable format if value is null or empty
- details.map { case (k, v) =>
- val newValue = Option(v).filter(_.nonEmpty).getOrElse("N/A")
- s"\n\t $k: $newValue"
- }.mkString("")
- }
-
- /**
- * Submit an application to the ResourceManager.
- * If set spark.yarn.submit.waitAppCompletion to true, it will stay alive
- * reporting the application's status until the application has exited for any reason.
- * Otherwise, the client process will exit after submission.
- * If the application finishes with a failed, killed, or undefined status,
- * throw an appropriate SparkException.
- */
- def run(): Unit = {
- this.appId = submitApplication()
- if (!launcherBackend.isConnected() && fireAndForget) {
- val report = getApplicationReport(appId)
- val state = report.getYarnApplicationState
- logInfo(s"Application report for $appId (state: $state)")
- logInfo(formatReportDetails(report))
- if (state == YarnApplicationState.FAILED || state == YarnApplicationState.KILLED) {
- throw new SparkException(s"Application $appId finished with status: $state")
- }
- } else {
- val (yarnApplicationState, finalApplicationStatus) = monitorApplication(appId)
- if (yarnApplicationState == YarnApplicationState.FAILED ||
- finalApplicationStatus == FinalApplicationStatus.FAILED) {
- throw new SparkException(s"Application $appId finished with failed status")
- }
- if (yarnApplicationState == YarnApplicationState.KILLED ||
- finalApplicationStatus == FinalApplicationStatus.KILLED) {
- throw new SparkException(s"Application $appId is killed")
- }
- if (finalApplicationStatus == FinalApplicationStatus.UNDEFINED) {
- throw new SparkException(s"The final status of application $appId is undefined")
- }
- }
- }
-
- private def findPySparkArchives(): Seq[String] = {
- sys.env.get("PYSPARK_ARCHIVES_PATH")
- .map(_.split(",").toSeq)
- .getOrElse {
- val pyLibPath = Seq(sys.env("SPARK_HOME"), "python", "lib").mkString(File.separator)
- val pyArchivesFile = new File(pyLibPath, "pyspark.zip")
- require(pyArchivesFile.exists(),
- s"$pyArchivesFile not found; cannot run pyspark application in YARN mode.")
- val py4jFile = new File(pyLibPath, "py4j-0.10.4-src.zip")
- require(py4jFile.exists(),
- s"$py4jFile not found; cannot run pyspark application in YARN mode.")
- Seq(pyArchivesFile.getAbsolutePath(), py4jFile.getAbsolutePath())
- }
- }
-
-}
-
-private object Client extends Logging {
-
- def main(argStrings: Array[String]) {
- if (!sys.props.contains("SPARK_SUBMIT")) {
- logWarning("WARNING: This client is deprecated and will be removed in a " +
- "future version of Spark. Use ./bin/spark-submit with \"--master yarn\"")
- }
-
- // Set an env variable indicating we are running in YARN mode.
- // Note that any env variable with the SPARK_ prefix gets propagated to all (remote) processes
- System.setProperty("SPARK_YARN_MODE", "true")
- val sparkConf = new SparkConf
- // SparkSubmit would use yarn cache to distribute files & jars in yarn mode,
- // so remove them from sparkConf here for yarn mode.
- sparkConf.remove("spark.jars")
- sparkConf.remove("spark.files")
- val args = new ClientArguments(argStrings)
- new Client(args, sparkConf).run()
- }
-
- // Alias for the user jar
- val APP_JAR_NAME: String = "__app__.jar"
-
- // URI scheme that identifies local resources
- val LOCAL_SCHEME = "local"
-
- // Staging directory for any temporary jars or files
- val SPARK_STAGING: String = ".sparkStaging"
-
-
- // Staging directory is private! -> rwx--------
- val STAGING_DIR_PERMISSION: FsPermission =
- FsPermission.createImmutable(Integer.parseInt("700", 8).toShort)
-
- // App files are world-wide readable and owner writable -> rw-r--r--
- val APP_FILE_PERMISSION: FsPermission =
- FsPermission.createImmutable(Integer.parseInt("644", 8).toShort)
-
- // Distribution-defined classpath to add to processes
- val ENV_DIST_CLASSPATH = "SPARK_DIST_CLASSPATH"
-
- // Subdirectory where the user's Spark and Hadoop config files will be placed.
- val LOCALIZED_CONF_DIR = "__spark_conf__"
-
- // File containing the conf archive in the AM. See prepareLocalResources().
- val LOCALIZED_CONF_ARCHIVE = LOCALIZED_CONF_DIR + ".zip"
-
- // Name of the file in the conf archive containing Spark configuration.
- val SPARK_CONF_FILE = "__spark_conf__.properties"
-
- // Subdirectory where the user's python files (not archives) will be placed.
- val LOCALIZED_PYTHON_DIR = "__pyfiles__"
-
- // Subdirectory where Spark libraries will be placed.
- val LOCALIZED_LIB_DIR = "__spark_libs__"
-
- /**
- * Return the path to the given application's staging directory.
- */
- private def getAppStagingDir(appId: ApplicationId): String = {
- buildPath(SPARK_STAGING, appId.toString())
- }
-
- /**
- * Populate the classpath entry in the given environment map with any application
- * classpath specified through the Hadoop and Yarn configurations.
- */
- private[yarn] def populateHadoopClasspath(conf: Configuration, env: HashMap[String, String])
- : Unit = {
- val classPathElementsToAdd = getYarnAppClasspath(conf) ++ getMRAppClasspath(conf)
- for (c <- classPathElementsToAdd.flatten) {
- YarnSparkHadoopUtil.addPathToEnvironment(env, Environment.CLASSPATH.name, c.trim)
- }
- }
-
- private def getYarnAppClasspath(conf: Configuration): Option[Seq[String]] =
- Option(conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH)) match {
- case Some(s) => Some(s.toSeq)
- case None => getDefaultYarnApplicationClasspath
- }
-
- private def getMRAppClasspath(conf: Configuration): Option[Seq[String]] =
- Option(conf.getStrings("mapreduce.application.classpath")) match {
- case Some(s) => Some(s.toSeq)
- case None => getDefaultMRApplicationClasspath
- }
-
- private[yarn] def getDefaultYarnApplicationClasspath: Option[Seq[String]] = {
- val triedDefault = Try[Seq[String]] {
- val field = classOf[YarnConfiguration].getField("DEFAULT_YARN_APPLICATION_CLASSPATH")
- val value = field.get(null).asInstanceOf[Array[String]]
- value.toSeq
- } recoverWith {
- case e: NoSuchFieldException => Success(Seq.empty[String])
- }
-
- triedDefault match {
- case f: Failure[_] =>
- logError("Unable to obtain the default YARN Application classpath.", f.exception)
- case s: Success[Seq[String]] =>
- logDebug(s"Using the default YARN application classpath: ${s.get.mkString(",")}")
- }
-
- triedDefault.toOption
- }
-
- private[yarn] def getDefaultMRApplicationClasspath: Option[Seq[String]] = {
- val triedDefault = Try[Seq[String]] {
- val field = classOf[MRJobConfig].getField("DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH")
- StringUtils.getStrings(field.get(null).asInstanceOf[String]).toSeq
- } recoverWith {
- case e: NoSuchFieldException => Success(Seq.empty[String])
- }
-
- triedDefault match {
- case f: Failure[_] =>
- logError("Unable to obtain the default MR Application classpath.", f.exception)
- case s: Success[Seq[String]] =>
- logDebug(s"Using the default MR application classpath: ${s.get.mkString(",")}")
- }
-
- triedDefault.toOption
- }
-
- /**
- * Populate the classpath entry in the given environment map.
- *
- * User jars are generally not added to the JVM's system classpath; those are handled by the AM
- * and executor backend. When the deprecated `spark.yarn.user.classpath.first` is used, user jars
- * are included in the system classpath, though. The extra class path and other uploaded files are
- * always made available through the system class path.
- *
- * @param args Client arguments (when starting the AM) or null (when starting executors).
- */
- private[yarn] def populateClasspath(
- args: ClientArguments,
- conf: Configuration,
- sparkConf: SparkConf,
- env: HashMap[String, String],
- extraClassPath: Option[String] = None): Unit = {
- extraClassPath.foreach { cp =>
- addClasspathEntry(getClusterPath(sparkConf, cp), env)
- }
-
- addClasspathEntry(YarnSparkHadoopUtil.expandEnvironment(Environment.PWD), env)
-
- addClasspathEntry(
- YarnSparkHadoopUtil.expandEnvironment(Environment.PWD) + Path.SEPARATOR +
- LOCALIZED_CONF_DIR, env)
-
- if (sparkConf.get(USER_CLASS_PATH_FIRST)) {
- // in order to properly add the app jar when user classpath is first
- // we have to do the mainJar separate in order to send the right thing
- // into addFileToClasspath
- val mainJar =
- if (args != null) {
- getMainJarUri(Option(args.userJar))
- } else {
- getMainJarUri(sparkConf.get(APP_JAR))
- }
- mainJar.foreach(addFileToClasspath(sparkConf, conf, _, APP_JAR_NAME, env))
-
- val secondaryJars =
- if (args != null) {
- getSecondaryJarUris(Option(sparkConf.get(JARS_TO_DISTRIBUTE)))
- } else {
- getSecondaryJarUris(sparkConf.get(SECONDARY_JARS))
- }
- secondaryJars.foreach { x =>
- addFileToClasspath(sparkConf, conf, x, null, env)
- }
- }
-
- // Add the Spark jars to the classpath, depending on how they were distributed.
- addClasspathEntry(buildPath(YarnSparkHadoopUtil.expandEnvironment(Environment.PWD),
- LOCALIZED_LIB_DIR, "*"), env)
- if (!sparkConf.get(SPARK_ARCHIVE).isDefined) {
- sparkConf.get(SPARK_JARS).foreach { jars =>
- jars.filter(isLocalUri).foreach { jar =>
- addClasspathEntry(getClusterPath(sparkConf, jar), env)
- }
- }
- }
-
- populateHadoopClasspath(conf, env)
- sys.env.get(ENV_DIST_CLASSPATH).foreach { cp =>
- addClasspathEntry(getClusterPath(sparkConf, cp), env)
- }
- }
-
- /**
- * Returns a list of URIs representing the user classpath.
- *
- * @param conf Spark configuration.
- */
- def getUserClasspath(conf: SparkConf): Array[URI] = {
- val mainUri = getMainJarUri(conf.get(APP_JAR))
- val secondaryUris = getSecondaryJarUris(conf.get(SECONDARY_JARS))
- (mainUri ++ secondaryUris).toArray
- }
-
- private def getMainJarUri(mainJar: Option[String]): Option[URI] = {
- mainJar.flatMap { path =>
- val uri = Utils.resolveURI(path)
- if (uri.getScheme == LOCAL_SCHEME) Some(uri) else None
- }.orElse(Some(new URI(APP_JAR_NAME)))
- }
-
- private def getSecondaryJarUris(secondaryJars: Option[Seq[String]]): Seq[URI] = {
- secondaryJars.getOrElse(Nil).map(new URI(_))
- }
-
- /**
- * Adds the given path to the classpath, handling "local:" URIs correctly.
- *
- * If an alternate name for the file is given, and it's not a "local:" file, the alternate
- * name will be added to the classpath (relative to the job's work directory).
- *
- * If not a "local:" file and no alternate name, the linkName will be added to the classpath.
- *
- * @param conf Spark configuration.
- * @param hadoopConf Hadoop configuration.
- * @param uri URI to add to classpath (optional).
- * @param fileName Alternate name for the file (optional).
- * @param env Map holding the environment variables.
- */
- private def addFileToClasspath(
- conf: SparkConf,
- hadoopConf: Configuration,
- uri: URI,
- fileName: String,
- env: HashMap[String, String]): Unit = {
- if (uri != null && uri.getScheme == LOCAL_SCHEME) {
- addClasspathEntry(getClusterPath(conf, uri.getPath), env)
- } else if (fileName != null) {
- addClasspathEntry(buildPath(
- YarnSparkHadoopUtil.expandEnvironment(Environment.PWD), fileName), env)
- } else if (uri != null) {
- val localPath = getQualifiedLocalPath(uri, hadoopConf)
- val linkName = Option(uri.getFragment()).getOrElse(localPath.getName())
- addClasspathEntry(buildPath(
- YarnSparkHadoopUtil.expandEnvironment(Environment.PWD), linkName), env)
- }
- }
-
- /**
- * Add the given path to the classpath entry of the given environment map.
- * If the classpath is already set, this appends the new path to the existing classpath.
- */
- private def addClasspathEntry(path: String, env: HashMap[String, String]): Unit =
- YarnSparkHadoopUtil.addPathToEnvironment(env, Environment.CLASSPATH.name, path)
-
- /**
- * Returns the path to be sent to the NM for a path that is valid on the gateway.
- *
- * This method uses two configuration values:
- *
- * - spark.yarn.config.gatewayPath: a string that identifies a portion of the input path that may
- * only be valid in the gateway node.
- * - spark.yarn.config.replacementPath: a string with which to replace the gateway path. This may
- * contain, for example, env variable references, which will be expanded by the NMs when
- * starting containers.
- *
- * If either config is not available, the input path is returned.
- */
- def getClusterPath(conf: SparkConf, path: String): String = {
- val localPath = conf.get(GATEWAY_ROOT_PATH)
- val clusterPath = conf.get(REPLACEMENT_ROOT_PATH)
- if (localPath != null && clusterPath != null) {
- path.replace(localPath, clusterPath)
- } else {
- path
- }
- }
-
- /**
- * Return whether the two file systems are the same.
- */
- private def compareFs(srcFs: FileSystem, destFs: FileSystem): Boolean = {
- val srcUri = srcFs.getUri()
- val dstUri = destFs.getUri()
- if (srcUri.getScheme() == null || srcUri.getScheme() != dstUri.getScheme()) {
- return false
- }
-
- var srcHost = srcUri.getHost()
- var dstHost = dstUri.getHost()
-
- // In HA or when using viewfs, the host part of the URI may not actually be a host, but the
- // name of the HDFS namespace. Those names won't resolve, so avoid even trying if they
- // match.
- if (srcHost != null && dstHost != null && srcHost != dstHost) {
- try {
- srcHost = InetAddress.getByName(srcHost).getCanonicalHostName()
- dstHost = InetAddress.getByName(dstHost).getCanonicalHostName()
- } catch {
- case e: UnknownHostException =>
- return false
- }
- }
-
- Objects.equal(srcHost, dstHost) && srcUri.getPort() == dstUri.getPort()
- }
-
- /**
- * Given a local URI, resolve it and return a qualified local path that corresponds to the URI.
- * This is used for preparing local resources to be included in the container launch context.
- */
- private def getQualifiedLocalPath(localURI: URI, hadoopConf: Configuration): Path = {
- val qualifiedURI =
- if (localURI.getScheme == null) {
- // If not specified, assume this is in the local filesystem to keep the behavior
- // consistent with that of Hadoop
- new URI(FileSystem.getLocal(hadoopConf).makeQualified(new Path(localURI)).toString)
- } else {
- localURI
- }
- new Path(qualifiedURI)
- }
-
- /**
- * Whether to consider jars provided by the user to have precedence over the Spark jars when
- * loading user classes.
- */
- def isUserClassPathFirst(conf: SparkConf, isDriver: Boolean): Boolean = {
- if (isDriver) {
- conf.get(DRIVER_USER_CLASS_PATH_FIRST)
- } else {
- conf.get(EXECUTOR_USER_CLASS_PATH_FIRST)
- }
- }
-
- /**
- * Joins all the path components using Path.SEPARATOR.
- */
- def buildPath(components: String*): String = {
- components.mkString(Path.SEPARATOR)
- }
-
- /** Returns whether the URI is a "local:" URI. */
- def isLocalUri(uri: String): Boolean = {
- uri.startsWith(s"$LOCAL_SCHEME:")
- }
-
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
deleted file mode 100644
index 61c027e..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientArguments.scala
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import scala.collection.mutable.ArrayBuffer
-
-// TODO: Add code and support for ensuring that yarn resource 'tasks' are location aware !
-private[spark] class ClientArguments(args: Array[String]) {
-
- var userJar: String = null
- var userClass: String = null
- var primaryPyFile: String = null
- var primaryRFile: String = null
- var userArgs: ArrayBuffer[String] = new ArrayBuffer[String]()
-
- parseArgs(args.toList)
-
- private def parseArgs(inputArgs: List[String]): Unit = {
- var args = inputArgs
-
- while (!args.isEmpty) {
- args match {
- case ("--jar") :: value :: tail =>
- userJar = value
- args = tail
-
- case ("--class") :: value :: tail =>
- userClass = value
- args = tail
-
- case ("--primary-py-file") :: value :: tail =>
- primaryPyFile = value
- args = tail
-
- case ("--primary-r-file") :: value :: tail =>
- primaryRFile = value
- args = tail
-
- case ("--arg") :: value :: tail =>
- userArgs += value
- args = tail
-
- case Nil =>
-
- case _ =>
- throw new IllegalArgumentException(getUsageMessage(args))
- }
- }
-
- if (primaryPyFile != null && primaryRFile != null) {
- throw new IllegalArgumentException("Cannot have primary-py-file and primary-r-file" +
- " at the same time")
- }
- }
-
- private def getUsageMessage(unknownParam: List[String] = null): String = {
- val message = if (unknownParam != null) s"Unknown/unsupported param $unknownParam\n" else ""
- message +
- s"""
- |Usage: org.apache.spark.deploy.yarn.Client [options]
- |Options:
- | --jar JAR_PATH Path to your application's JAR file (required in yarn-cluster
- | mode)
- | --class CLASS_NAME Name of your application's main class (required)
- | --primary-py-file A main Python file
- | --primary-r-file A main R file
- | --arg ARG Argument to be passed to your application's main class.
- | Multiple invocations are possible, each will be passed in order.
- """.stripMargin
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala
deleted file mode 100644
index dcc2288..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ClientDistributedCacheManager.scala
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import java.net.URI
-
-import scala.collection.mutable.{HashMap, ListBuffer, Map}
-
-import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
-import org.apache.hadoop.fs.permission.FsAction
-import org.apache.hadoop.yarn.api.records._
-import org.apache.hadoop.yarn.util.{ConverterUtils, Records}
-
-import org.apache.spark.SparkConf
-import org.apache.spark.deploy.yarn.config._
-import org.apache.spark.internal.Logging
-
-private case class CacheEntry(
- uri: URI,
- size: Long,
- modTime: Long,
- visibility: LocalResourceVisibility,
- resType: LocalResourceType)
-
-/** Client side methods to setup the Hadoop distributed cache */
-private[spark] class ClientDistributedCacheManager() extends Logging {
-
- private val distCacheEntries = new ListBuffer[CacheEntry]()
-
- /**
- * Add a resource to the list of distributed cache resources. This list can
- * be sent to the ApplicationMaster and possibly the executors so that it can
- * be downloaded into the Hadoop distributed cache for use by this application.
- * Adds the LocalResource to the localResources HashMap passed in and saves
- * the stats of the resources to they can be sent to the executors and verified.
- *
- * @param fs FileSystem
- * @param conf Configuration
- * @param destPath path to the resource
- * @param localResources localResource hashMap to insert the resource into
- * @param resourceType LocalResourceType
- * @param link link presented in the distributed cache to the destination
- * @param statCache cache to store the file/directory stats
- * @param appMasterOnly Whether to only add the resource to the app master
- */
- def addResource(
- fs: FileSystem,
- conf: Configuration,
- destPath: Path,
- localResources: HashMap[String, LocalResource],
- resourceType: LocalResourceType,
- link: String,
- statCache: Map[URI, FileStatus],
- appMasterOnly: Boolean = false): Unit = {
- val destStatus = fs.getFileStatus(destPath)
- val amJarRsrc = Records.newRecord(classOf[LocalResource])
- amJarRsrc.setType(resourceType)
- val visibility = getVisibility(conf, destPath.toUri(), statCache)
- amJarRsrc.setVisibility(visibility)
- amJarRsrc.setResource(ConverterUtils.getYarnUrlFromPath(destPath))
- amJarRsrc.setTimestamp(destStatus.getModificationTime())
- amJarRsrc.setSize(destStatus.getLen())
- require(link != null && link.nonEmpty, "You must specify a valid link name.")
- localResources(link) = amJarRsrc
-
- if (!appMasterOnly) {
- val uri = destPath.toUri()
- val pathURI = new URI(uri.getScheme(), uri.getAuthority(), uri.getPath(), null, link)
- distCacheEntries += CacheEntry(pathURI, destStatus.getLen(), destStatus.getModificationTime(),
- visibility, resourceType)
- }
- }
-
- /**
- * Writes down information about cached files needed in executors to the given configuration.
- */
- def updateConfiguration(conf: SparkConf): Unit = {
- conf.set(CACHED_FILES, distCacheEntries.map(_.uri.toString))
- conf.set(CACHED_FILES_SIZES, distCacheEntries.map(_.size))
- conf.set(CACHED_FILES_TIMESTAMPS, distCacheEntries.map(_.modTime))
- conf.set(CACHED_FILES_VISIBILITIES, distCacheEntries.map(_.visibility.name()))
- conf.set(CACHED_FILES_TYPES, distCacheEntries.map(_.resType.name()))
- }
-
- /**
- * Returns the local resource visibility depending on the cache file permissions
- * @return LocalResourceVisibility
- */
- private[yarn] def getVisibility(
- conf: Configuration,
- uri: URI,
- statCache: Map[URI, FileStatus]): LocalResourceVisibility = {
- if (isPublic(conf, uri, statCache)) {
- LocalResourceVisibility.PUBLIC
- } else {
- LocalResourceVisibility.PRIVATE
- }
- }
-
- /**
- * Returns a boolean to denote whether a cache file is visible to all (public)
- * @return true if the path in the uri is visible to all, false otherwise
- */
- private def isPublic(conf: Configuration, uri: URI, statCache: Map[URI, FileStatus]): Boolean = {
- val fs = FileSystem.get(uri, conf)
- val current = new Path(uri.getPath())
- // the leaf level file should be readable by others
- if (!checkPermissionOfOther(fs, current, FsAction.READ, statCache)) {
- return false
- }
- ancestorsHaveExecutePermissions(fs, current.getParent(), statCache)
- }
-
- /**
- * Returns true if all ancestors of the specified path have the 'execute'
- * permission set for all users (i.e. that other users can traverse
- * the directory hierarchy to the given path)
- * @return true if all ancestors have the 'execute' permission set for all users
- */
- private def ancestorsHaveExecutePermissions(
- fs: FileSystem,
- path: Path,
- statCache: Map[URI, FileStatus]): Boolean = {
- var current = path
- while (current != null) {
- // the subdirs in the path should have execute permissions for others
- if (!checkPermissionOfOther(fs, current, FsAction.EXECUTE, statCache)) {
- return false
- }
- current = current.getParent()
- }
- true
- }
-
- /**
- * Checks for a given path whether the Other permissions on it
- * imply the permission in the passed FsAction
- * @return true if the path in the uri is visible to all, false otherwise
- */
- private def checkPermissionOfOther(
- fs: FileSystem,
- path: Path,
- action: FsAction,
- statCache: Map[URI, FileStatus]): Boolean = {
- val status = getFileStatus(fs, path.toUri(), statCache)
- val perms = status.getPermission()
- val otherAction = perms.getOtherAction()
- otherAction.implies(action)
- }
-
- /**
- * Checks to see if the given uri exists in the cache, if it does it
- * returns the existing FileStatus, otherwise it stats the uri, stores
- * it in the cache, and returns the FileStatus.
- * @return FileStatus
- */
- private[yarn] def getFileStatus(
- fs: FileSystem,
- uri: URI,
- statCache: Map[URI, FileStatus]): FileStatus = {
- val stat = statCache.get(uri) match {
- case Some(existstat) => existstat
- case None =>
- val newStat = fs.getFileStatus(new Path(uri))
- statCache.put(uri, newStat)
- newStat
- }
- stat
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
----------------------------------------------------------------------
diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
deleted file mode 100644
index 868c2ed..0000000
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.yarn
-
-import java.io.File
-import java.nio.ByteBuffer
-import java.util.Collections
-
-import scala.collection.JavaConverters._
-import scala.collection.mutable.{HashMap, ListBuffer}
-
-import org.apache.hadoop.fs.Path
-import org.apache.hadoop.io.DataOutputBuffer
-import org.apache.hadoop.security.UserGroupInformation
-import org.apache.hadoop.yarn.api._
-import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
-import org.apache.hadoop.yarn.api.records._
-import org.apache.hadoop.yarn.client.api.NMClient
-import org.apache.hadoop.yarn.conf.YarnConfiguration
-import org.apache.hadoop.yarn.ipc.YarnRPC
-import org.apache.hadoop.yarn.util.{ConverterUtils, Records}
-
-import org.apache.spark.{SecurityManager, SparkConf, SparkException}
-import org.apache.spark.internal.Logging
-import org.apache.spark.internal.config._
-import org.apache.spark.launcher.YarnCommandBuilderUtils
-import org.apache.spark.network.util.JavaUtils
-import org.apache.spark.util.Utils
-
-private[yarn] class ExecutorRunnable(
- container: Option[Container],
- conf: YarnConfiguration,
- sparkConf: SparkConf,
- masterAddress: String,
- executorId: String,
- hostname: String,
- executorMemory: Int,
- executorCores: Int,
- appId: String,
- securityMgr: SecurityManager,
- localResources: Map[String, LocalResource]) extends Logging {
-
- var rpc: YarnRPC = YarnRPC.create(conf)
- var nmClient: NMClient = _
-
- def run(): Unit = {
- logDebug("Starting Executor Container")
- nmClient = NMClient.createNMClient()
- nmClient.init(conf)
- nmClient.start()
- startContainer()
- }
-
- def launchContextDebugInfo(): String = {
- val commands = prepareCommand()
- val env = prepareEnvironment()
-
- s"""
- |===============================================================================
- |YARN executor launch context:
- | env:
- |${Utils.redact(sparkConf, env.toSeq).map { case (k, v) => s" $k -> $v\n" }.mkString}
- | command:
- | ${commands.mkString(" \\ \n ")}
- |
- | resources:
- |${localResources.map { case (k, v) => s" $k -> $v\n" }.mkString}
- |===============================================================================""".stripMargin
- }
-
- def startContainer(): java.util.Map[String, ByteBuffer] = {
- val ctx = Records.newRecord(classOf[ContainerLaunchContext])
- .asInstanceOf[ContainerLaunchContext]
- val env = prepareEnvironment().asJava
-
- ctx.setLocalResources(localResources.asJava)
- ctx.setEnvironment(env)
-
- val credentials = UserGroupInformation.getCurrentUser().getCredentials()
- val dob = new DataOutputBuffer()
- credentials.writeTokenStorageToStream(dob)
- ctx.setTokens(ByteBuffer.wrap(dob.getData()))
-
- val commands = prepareCommand()
-
- ctx.setCommands(commands.asJava)
- ctx.setApplicationACLs(
- YarnSparkHadoopUtil.getApplicationAclsForYarn(securityMgr).asJava)
-
- // If external shuffle service is enabled, register with the Yarn shuffle service already
- // started on the NodeManager and, if authentication is enabled, provide it with our secret
- // key for fetching shuffle files later
- if (sparkConf.get(SHUFFLE_SERVICE_ENABLED)) {
- val secretString = securityMgr.getSecretKey()
- val secretBytes =
- if (secretString != null) {
- // This conversion must match how the YarnShuffleService decodes our secret
- JavaUtils.stringToBytes(secretString)
- } else {
- // Authentication is not enabled, so just provide dummy metadata
- ByteBuffer.allocate(0)
- }
- ctx.setServiceData(Collections.singletonMap("spark_shuffle", secretBytes))
- }
-
- // Send the start request to the ContainerManager
- try {
- nmClient.startContainer(container.get, ctx)
- } catch {
- case ex: Exception =>
- throw new SparkException(s"Exception while starting container ${container.get.getId}" +
- s" on host $hostname", ex)
- }
- }
-
- private def prepareCommand(): List[String] = {
- // Extra options for the JVM
- val javaOpts = ListBuffer[String]()
-
- // Set the environment variable through a command prefix
- // to append to the existing value of the variable
- var prefixEnv: Option[String] = None
-
- // Set the JVM memory
- val executorMemoryString = executorMemory + "m"
- javaOpts += "-Xmx" + executorMemoryString
-
- // Set extra Java options for the executor, if defined
- sparkConf.get(EXECUTOR_JAVA_OPTIONS).foreach { opts =>
- javaOpts ++= Utils.splitCommandString(opts).map(YarnSparkHadoopUtil.escapeForShell)
- }
- sys.env.get("SPARK_JAVA_OPTS").foreach { opts =>
- javaOpts ++= Utils.splitCommandString(opts).map(YarnSparkHadoopUtil.escapeForShell)
- }
- sparkConf.get(EXECUTOR_LIBRARY_PATH).foreach { p =>
- prefixEnv = Some(Client.getClusterPath(sparkConf, Utils.libraryPathEnvPrefix(Seq(p))))
- }
-
- javaOpts += "-Djava.io.tmpdir=" +
- new Path(
- YarnSparkHadoopUtil.expandEnvironment(Environment.PWD),
- YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR
- )
-
- // Certain configs need to be passed here because they are needed before the Executor
- // registers with the Scheduler and transfers the spark configs. Since the Executor backend
- // uses RPC to connect to the scheduler, the RPC settings are needed as well as the
- // authentication settings.
- sparkConf.getAll
- .filter { case (k, v) => SparkConf.isExecutorStartupConf(k) }
- .foreach { case (k, v) => javaOpts += YarnSparkHadoopUtil.escapeForShell(s"-D$k=$v") }
-
- // Commenting it out for now - so that people can refer to the properties if required. Remove
- // it once cpuset version is pushed out.
- // The context is, default gc for server class machines end up using all cores to do gc - hence
- // if there are multiple containers in same node, spark gc effects all other containers
- // performance (which can also be other spark containers)
- // Instead of using this, rely on cpusets by YARN to enforce spark behaves 'properly' in
- // multi-tenant environments. Not sure how default java gc behaves if it is limited to subset
- // of cores on a node.
- /*
- else {
- // If no java_opts specified, default to using -XX:+CMSIncrementalMode
- // It might be possible that other modes/config is being done in
- // spark.executor.extraJavaOptions, so we don't want to mess with it.
- // In our expts, using (default) throughput collector has severe perf ramifications in
- // multi-tenant machines
- // The options are based on
- // http://www.oracle.com/technetwork/java/gc-tuning-5-138395.html#0.0.0.%20When%20to%20Use
- // %20the%20Concurrent%20Low%20Pause%20Collector|outline
- javaOpts += "-XX:+UseConcMarkSweepGC"
- javaOpts += "-XX:+CMSIncrementalMode"
- javaOpts += "-XX:+CMSIncrementalPacing"
- javaOpts += "-XX:CMSIncrementalDutyCycleMin=0"
- javaOpts += "-XX:CMSIncrementalDutyCycle=10"
- }
- */
-
- // For log4j configuration to reference
- javaOpts += ("-Dspark.yarn.app.container.log.dir=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR)
- YarnCommandBuilderUtils.addPermGenSizeOpt(javaOpts)
-
- val userClassPath = Client.getUserClasspath(sparkConf).flatMap { uri =>
- val absPath =
- if (new File(uri.getPath()).isAbsolute()) {
- Client.getClusterPath(sparkConf, uri.getPath())
- } else {
- Client.buildPath(Environment.PWD.$(), uri.getPath())
- }
- Seq("--user-class-path", "file:" + absPath)
- }.toSeq
-
- YarnSparkHadoopUtil.addOutOfMemoryErrorArgument(javaOpts)
- val commands = prefixEnv ++ Seq(
- YarnSparkHadoopUtil.expandEnvironment(Environment.JAVA_HOME) + "/bin/java",
- "-server") ++
- javaOpts ++
- Seq("org.apache.spark.executor.CoarseGrainedExecutorBackend",
- "--driver-url", masterAddress,
- "--executor-id", executorId,
- "--hostname", hostname,
- "--cores", executorCores.toString,
- "--app-id", appId) ++
- userClassPath ++
- Seq(
- s"1>${ApplicationConstants.LOG_DIR_EXPANSION_VAR}/stdout",
- s"2>${ApplicationConstants.LOG_DIR_EXPANSION_VAR}/stderr")
-
- // TODO: it would be nicer to just make sure there are no null commands here
- commands.map(s => if (s == null) "null" else s).toList
- }
-
- private def prepareEnvironment(): HashMap[String, String] = {
- val env = new HashMap[String, String]()
- Client.populateClasspath(null, conf, sparkConf, env, sparkConf.get(EXECUTOR_CLASS_PATH))
-
- sparkConf.getExecutorEnv.foreach { case (key, value) =>
- // This assumes each executor environment variable set here is a path
- // This is kept for backward compatibility and consistency with hadoop
- YarnSparkHadoopUtil.addPathToEnvironment(env, key, value)
- }
-
- // Keep this for backwards compatibility but users should move to the config
- sys.env.get("SPARK_YARN_USER_ENV").foreach { userEnvs =>
- YarnSparkHadoopUtil.setEnvFromInputString(env, userEnvs)
- }
-
- // lookup appropriate http scheme for container log urls
- val yarnHttpPolicy = conf.get(
- YarnConfiguration.YARN_HTTP_POLICY_KEY,
- YarnConfiguration.YARN_HTTP_POLICY_DEFAULT
- )
- val httpScheme = if (yarnHttpPolicy == "HTTPS_ONLY") "https://" else "http://"
-
- // Add log urls
- container.foreach { c =>
- sys.env.get("SPARK_USER").foreach { user =>
- val containerId = ConverterUtils.toString(c.getId)
- val address = c.getNodeHttpAddress
- val baseUrl = s"$httpScheme$address/node/containerlogs/$containerId/$user"
-
- env("SPARK_LOG_URL_STDERR") = s"$baseUrl/stderr?start=-4096"
- env("SPARK_LOG_URL_STDOUT") = s"$baseUrl/stdout?start=-4096"
- }
- }
-
- System.getenv().asScala.filterKeys(_.startsWith("SPARK"))
- .foreach { case (k, v) => env(k) = v }
- env
- }
-}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org
[16/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Posted by va...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala b/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala
deleted file mode 100644
index 1d742fe..0000000
--- a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala
+++ /dev/null
@@ -1,524 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import java.util.{List => JList}
-import java.util.concurrent.CountDownLatch
-
-import scala.collection.JavaConverters._
-import scala.collection.mutable.ArrayBuffer
-import scala.util.control.NonFatal
-
-import com.google.common.base.Splitter
-import org.apache.mesos.{MesosSchedulerDriver, Protos, Scheduler, SchedulerDriver}
-import org.apache.mesos.Protos.{TaskState => MesosTaskState, _}
-import org.apache.mesos.Protos.FrameworkInfo.Capability
-import org.apache.mesos.protobuf.{ByteString, GeneratedMessage}
-
-import org.apache.spark.{SparkConf, SparkContext, SparkException}
-import org.apache.spark.TaskState
-import org.apache.spark.internal.Logging
-import org.apache.spark.internal.config._
-import org.apache.spark.util.Utils
-
-
-
-/**
- * Shared trait for implementing a Mesos Scheduler. This holds common state and helper
- * methods and Mesos scheduler will use.
- */
-trait MesosSchedulerUtils extends Logging {
- // Lock used to wait for scheduler to be registered
- private final val registerLatch = new CountDownLatch(1)
-
- // Driver for talking to Mesos
- protected var mesosDriver: SchedulerDriver = null
-
- /**
- * Creates a new MesosSchedulerDriver that communicates to the Mesos master.
- *
- * @param masterUrl The url to connect to Mesos master
- * @param scheduler the scheduler class to receive scheduler callbacks
- * @param sparkUser User to impersonate with when running tasks
- * @param appName The framework name to display on the Mesos UI
- * @param conf Spark configuration
- * @param webuiUrl The WebUI url to link from Mesos UI
- * @param checkpoint Option to checkpoint tasks for failover
- * @param failoverTimeout Duration Mesos master expect scheduler to reconnect on disconnect
- * @param frameworkId The id of the new framework
- */
- protected def createSchedulerDriver(
- masterUrl: String,
- scheduler: Scheduler,
- sparkUser: String,
- appName: String,
- conf: SparkConf,
- webuiUrl: Option[String] = None,
- checkpoint: Option[Boolean] = None,
- failoverTimeout: Option[Double] = None,
- frameworkId: Option[String] = None): SchedulerDriver = {
- val fwInfoBuilder = FrameworkInfo.newBuilder().setUser(sparkUser).setName(appName)
- val credBuilder = Credential.newBuilder()
- webuiUrl.foreach { url => fwInfoBuilder.setWebuiUrl(url) }
- checkpoint.foreach { checkpoint => fwInfoBuilder.setCheckpoint(checkpoint) }
- failoverTimeout.foreach { timeout => fwInfoBuilder.setFailoverTimeout(timeout) }
- frameworkId.foreach { id =>
- fwInfoBuilder.setId(FrameworkID.newBuilder().setValue(id).build())
- }
- fwInfoBuilder.setHostname(Option(conf.getenv("SPARK_PUBLIC_DNS")).getOrElse(
- conf.get(DRIVER_HOST_ADDRESS)))
- conf.getOption("spark.mesos.principal").foreach { principal =>
- fwInfoBuilder.setPrincipal(principal)
- credBuilder.setPrincipal(principal)
- }
- conf.getOption("spark.mesos.secret").foreach { secret =>
- credBuilder.setSecret(secret)
- }
- if (credBuilder.hasSecret && !fwInfoBuilder.hasPrincipal) {
- throw new SparkException(
- "spark.mesos.principal must be configured when spark.mesos.secret is set")
- }
- conf.getOption("spark.mesos.role").foreach { role =>
- fwInfoBuilder.setRole(role)
- }
- val maxGpus = conf.getInt("spark.mesos.gpus.max", 0)
- if (maxGpus > 0) {
- fwInfoBuilder.addCapabilities(Capability.newBuilder().setType(Capability.Type.GPU_RESOURCES))
- }
- if (credBuilder.hasPrincipal) {
- new MesosSchedulerDriver(
- scheduler, fwInfoBuilder.build(), masterUrl, credBuilder.build())
- } else {
- new MesosSchedulerDriver(scheduler, fwInfoBuilder.build(), masterUrl)
- }
- }
-
- /**
- * Starts the MesosSchedulerDriver and stores the current running driver to this new instance.
- * This driver is expected to not be running.
- * This method returns only after the scheduler has registered with Mesos.
- */
- def startScheduler(newDriver: SchedulerDriver): Unit = {
- synchronized {
- if (mesosDriver != null) {
- registerLatch.await()
- return
- }
- @volatile
- var error: Option[Exception] = None
-
- // We create a new thread that will block inside `mesosDriver.run`
- // until the scheduler exists
- new Thread(Utils.getFormattedClassName(this) + "-mesos-driver") {
- setDaemon(true)
- override def run() {
- try {
- mesosDriver = newDriver
- val ret = mesosDriver.run()
- logInfo("driver.run() returned with code " + ret)
- if (ret != null && ret.equals(Status.DRIVER_ABORTED)) {
- error = Some(new SparkException("Error starting driver, DRIVER_ABORTED"))
- markErr()
- }
- } catch {
- case e: Exception =>
- logError("driver.run() failed", e)
- error = Some(e)
- markErr()
- }
- }
- }.start()
-
- registerLatch.await()
-
- // propagate any error to the calling thread. This ensures that SparkContext creation fails
- // without leaving a broken context that won't be able to schedule any tasks
- error.foreach(throw _)
- }
- }
-
- def getResource(res: JList[Resource], name: String): Double = {
- // A resource can have multiple values in the offer since it can either be from
- // a specific role or wildcard.
- res.asScala.filter(_.getName == name).map(_.getScalar.getValue).sum
- }
-
- /**
- * Transforms a range resource to a list of ranges
- *
- * @param res the mesos resource list
- * @param name the name of the resource
- * @return the list of ranges returned
- */
- protected def getRangeResource(res: JList[Resource], name: String): List[(Long, Long)] = {
- // A resource can have multiple values in the offer since it can either be from
- // a specific role or wildcard.
- res.asScala.filter(_.getName == name).flatMap(_.getRanges.getRangeList.asScala
- .map(r => (r.getBegin, r.getEnd)).toList).toList
- }
-
- /**
- * Signal that the scheduler has registered with Mesos.
- */
- protected def markRegistered(): Unit = {
- registerLatch.countDown()
- }
-
- protected def markErr(): Unit = {
- registerLatch.countDown()
- }
-
- def createResource(name: String, amount: Double, role: Option[String] = None): Resource = {
- val builder = Resource.newBuilder()
- .setName(name)
- .setType(Value.Type.SCALAR)
- .setScalar(Value.Scalar.newBuilder().setValue(amount).build())
-
- role.foreach { r => builder.setRole(r) }
-
- builder.build()
- }
-
- /**
- * Partition the existing set of resources into two groups, those remaining to be
- * scheduled and those requested to be used for a new task.
- *
- * @param resources The full list of available resources
- * @param resourceName The name of the resource to take from the available resources
- * @param amountToUse The amount of resources to take from the available resources
- * @return The remaining resources list and the used resources list.
- */
- def partitionResources(
- resources: JList[Resource],
- resourceName: String,
- amountToUse: Double): (List[Resource], List[Resource]) = {
- var remain = amountToUse
- var requestedResources = new ArrayBuffer[Resource]
- val remainingResources = resources.asScala.map {
- case r =>
- if (remain > 0 &&
- r.getType == Value.Type.SCALAR &&
- r.getScalar.getValue > 0.0 &&
- r.getName == resourceName) {
- val usage = Math.min(remain, r.getScalar.getValue)
- requestedResources += createResource(resourceName, usage, Some(r.getRole))
- remain -= usage
- createResource(resourceName, r.getScalar.getValue - usage, Some(r.getRole))
- } else {
- r
- }
- }
-
- // Filter any resource that has depleted.
- val filteredResources =
- remainingResources.filter(r => r.getType != Value.Type.SCALAR || r.getScalar.getValue > 0.0)
-
- (filteredResources.toList, requestedResources.toList)
- }
-
- /** Helper method to get the key,value-set pair for a Mesos Attribute protobuf */
- protected def getAttribute(attr: Attribute): (String, Set[String]) = {
- (attr.getName, attr.getText.getValue.split(',').toSet)
- }
-
-
- /** Build a Mesos resource protobuf object */
- protected def createResource(resourceName: String, quantity: Double): Protos.Resource = {
- Resource.newBuilder()
- .setName(resourceName)
- .setType(Value.Type.SCALAR)
- .setScalar(Value.Scalar.newBuilder().setValue(quantity).build())
- .build()
- }
-
- /**
- * Converts the attributes from the resource offer into a Map of name -> Attribute Value
- * The attribute values are the mesos attribute types and they are
- *
- * @param offerAttributes the attributes offered
- * @return
- */
- protected def toAttributeMap(offerAttributes: JList[Attribute]): Map[String, GeneratedMessage] = {
- offerAttributes.asScala.map { attr =>
- val attrValue = attr.getType match {
- case Value.Type.SCALAR => attr.getScalar
- case Value.Type.RANGES => attr.getRanges
- case Value.Type.SET => attr.getSet
- case Value.Type.TEXT => attr.getText
- }
- (attr.getName, attrValue)
- }.toMap
- }
-
-
- /**
- * Match the requirements (if any) to the offer attributes.
- * if attribute requirements are not specified - return true
- * else if attribute is defined and no values are given, simple attribute presence is performed
- * else if attribute name and value is specified, subset match is performed on slave attributes
- */
- def matchesAttributeRequirements(
- slaveOfferConstraints: Map[String, Set[String]],
- offerAttributes: Map[String, GeneratedMessage]): Boolean = {
- slaveOfferConstraints.forall {
- // offer has the required attribute and subsumes the required values for that attribute
- case (name, requiredValues) =>
- offerAttributes.get(name) match {
- case None => false
- case Some(_) if requiredValues.isEmpty => true // empty value matches presence
- case Some(scalarValue: Value.Scalar) =>
- // check if provided values is less than equal to the offered values
- requiredValues.map(_.toDouble).exists(_ <= scalarValue.getValue)
- case Some(rangeValue: Value.Range) =>
- val offerRange = rangeValue.getBegin to rangeValue.getEnd
- // Check if there is some required value that is between the ranges specified
- // Note: We only support the ability to specify discrete values, in the future
- // we may expand it to subsume ranges specified with a XX..YY value or something
- // similar to that.
- requiredValues.map(_.toLong).exists(offerRange.contains(_))
- case Some(offeredValue: Value.Set) =>
- // check if the specified required values is a subset of offered set
- requiredValues.subsetOf(offeredValue.getItemList.asScala.toSet)
- case Some(textValue: Value.Text) =>
- // check if the specified value is equal, if multiple values are specified
- // we succeed if any of them match.
- requiredValues.contains(textValue.getValue)
- }
- }
- }
-
- /**
- * Parses the attributes constraints provided to spark and build a matching data struct:
- * Map[<attribute-name>, Set[values-to-match]]
- * The constraints are specified as ';' separated key-value pairs where keys and values
- * are separated by ':'. The ':' implies equality (for singular values) and "is one of" for
- * multiple values (comma separated). For example:
- * {{{
- * parseConstraintString("os:centos7;zone:us-east-1a,us-east-1b")
- * // would result in
- * <code>
- * Map(
- * "os" -> Set("centos7"),
- * "zone": -> Set("us-east-1a", "us-east-1b")
- * )
- * }}}
- *
- * Mesos documentation: http://mesos.apache.org/documentation/attributes-resources/
- * https://github.com/apache/mesos/blob/master/src/common/values.cpp
- * https://github.com/apache/mesos/blob/master/src/common/attributes.cpp
- *
- * @param constraintsVal constaints string consisting of ';' separated key-value pairs (separated
- * by ':')
- * @return Map of constraints to match resources offers.
- */
- def parseConstraintString(constraintsVal: String): Map[String, Set[String]] = {
- /*
- Based on mesos docs:
- attributes : attribute ( ";" attribute )*
- attribute : labelString ":" ( labelString | "," )+
- labelString : [a-zA-Z0-9_/.-]
- */
- val splitter = Splitter.on(';').trimResults().withKeyValueSeparator(':')
- // kv splitter
- if (constraintsVal.isEmpty) {
- Map()
- } else {
- try {
- splitter.split(constraintsVal).asScala.toMap.mapValues(v =>
- if (v == null || v.isEmpty) {
- Set[String]()
- } else {
- v.split(',').toSet
- }
- )
- } catch {
- case NonFatal(e) =>
- throw new IllegalArgumentException(s"Bad constraint string: $constraintsVal", e)
- }
- }
- }
-
- // These defaults copied from YARN
- private val MEMORY_OVERHEAD_FRACTION = 0.10
- private val MEMORY_OVERHEAD_MINIMUM = 384
-
- /**
- * Return the amount of memory to allocate to each executor, taking into account
- * container overheads.
- *
- * @param sc SparkContext to use to get `spark.mesos.executor.memoryOverhead` value
- * @return memory requirement as (0.1 * <memoryOverhead>) or MEMORY_OVERHEAD_MINIMUM
- * (whichever is larger)
- */
- def executorMemory(sc: SparkContext): Int = {
- sc.conf.getInt("spark.mesos.executor.memoryOverhead",
- math.max(MEMORY_OVERHEAD_FRACTION * sc.executorMemory, MEMORY_OVERHEAD_MINIMUM).toInt) +
- sc.executorMemory
- }
-
- def setupUris(uris: String,
- builder: CommandInfo.Builder,
- useFetcherCache: Boolean = false): Unit = {
- uris.split(",").foreach { uri =>
- builder.addUris(CommandInfo.URI.newBuilder().setValue(uri.trim()).setCache(useFetcherCache))
- }
- }
-
- protected def getRejectOfferDurationForUnmetConstraints(sc: SparkContext): Long = {
- sc.conf.getTimeAsSeconds("spark.mesos.rejectOfferDurationForUnmetConstraints", "120s")
- }
-
- protected def getRejectOfferDurationForReachedMaxCores(sc: SparkContext): Long = {
- sc.conf.getTimeAsSeconds("spark.mesos.rejectOfferDurationForReachedMaxCores", "120s")
- }
-
- /**
- * Checks executor ports if they are within some range of the offered list of ports ranges,
- *
- * @param conf the Spark Config
- * @param ports the list of ports to check
- * @return true if ports are within range false otherwise
- */
- protected def checkPorts(conf: SparkConf, ports: List[(Long, Long)]): Boolean = {
-
- def checkIfInRange(port: Long, ps: List[(Long, Long)]): Boolean = {
- ps.exists{case (rangeStart, rangeEnd) => rangeStart <= port & rangeEnd >= port }
- }
-
- val portsToCheck = nonZeroPortValuesFromConfig(conf)
- val withinRange = portsToCheck.forall(p => checkIfInRange(p, ports))
- // make sure we have enough ports to allocate per offer
- val enoughPorts =
- ports.map{case (rangeStart, rangeEnd) => rangeEnd - rangeStart + 1}.sum >= portsToCheck.size
- enoughPorts && withinRange
- }
-
- /**
- * Partitions port resources.
- *
- * @param requestedPorts non-zero ports to assign
- * @param offeredResources the resources offered
- * @return resources left, port resources to be used.
- */
- def partitionPortResources(requestedPorts: List[Long], offeredResources: List[Resource])
- : (List[Resource], List[Resource]) = {
- if (requestedPorts.isEmpty) {
- (offeredResources, List[Resource]())
- } else {
- // partition port offers
- val (resourcesWithoutPorts, portResources) = filterPortResources(offeredResources)
-
- val portsAndRoles = requestedPorts.
- map(x => (x, findPortAndGetAssignedRangeRole(x, portResources)))
-
- val assignedPortResources = createResourcesFromPorts(portsAndRoles)
-
- // ignore non-assigned port resources, they will be declined implicitly by mesos
- // no need for splitting port resources.
- (resourcesWithoutPorts, assignedPortResources)
- }
- }
-
- val managedPortNames = List("spark.executor.port", BLOCK_MANAGER_PORT.key)
-
- /**
- * The values of the non-zero ports to be used by the executor process.
- * @param conf the spark config to use
- * @return the ono-zero values of the ports
- */
- def nonZeroPortValuesFromConfig(conf: SparkConf): List[Long] = {
- managedPortNames.map(conf.getLong(_, 0)).filter( _ != 0)
- }
-
- /** Creates a mesos resource for a specific port number. */
- private def createResourcesFromPorts(portsAndRoles: List[(Long, String)]) : List[Resource] = {
- portsAndRoles.flatMap{ case (port, role) =>
- createMesosPortResource(List((port, port)), Some(role))}
- }
-
- /** Helper to create mesos resources for specific port ranges. */
- private def createMesosPortResource(
- ranges: List[(Long, Long)],
- role: Option[String] = None): List[Resource] = {
- ranges.map { case (rangeStart, rangeEnd) =>
- val rangeValue = Value.Range.newBuilder()
- .setBegin(rangeStart)
- .setEnd(rangeEnd)
- val builder = Resource.newBuilder()
- .setName("ports")
- .setType(Value.Type.RANGES)
- .setRanges(Value.Ranges.newBuilder().addRange(rangeValue))
- role.foreach(r => builder.setRole(r))
- builder.build()
- }
- }
-
- /**
- * Helper to assign a port to an offered range and get the latter's role
- * info to use it later on.
- */
- private def findPortAndGetAssignedRangeRole(port: Long, portResources: List[Resource])
- : String = {
-
- val ranges = portResources.
- map(resource =>
- (resource.getRole, resource.getRanges.getRangeList.asScala
- .map(r => (r.getBegin, r.getEnd)).toList))
-
- val rangePortRole = ranges
- .find { case (role, rangeList) => rangeList
- .exists{ case (rangeStart, rangeEnd) => rangeStart <= port & rangeEnd >= port}}
- // this is safe since we have previously checked about the ranges (see checkPorts method)
- rangePortRole.map{ case (role, rangeList) => role}.get
- }
-
- /** Retrieves the port resources from a list of mesos offered resources */
- private def filterPortResources(resources: List[Resource]): (List[Resource], List[Resource]) = {
- resources.partition { r => !(r.getType == Value.Type.RANGES && r.getName == "ports") }
- }
-
- /**
- * spark.mesos.driver.frameworkId is set by the cluster dispatcher to correlate driver
- * submissions with frameworkIDs. However, this causes issues when a driver process launches
- * more than one framework (more than one SparkContext(, because they all try to register with
- * the same frameworkID. To enforce that only the first driver registers with the configured
- * framework ID, the driver calls this method after the first registration.
- */
- def unsetFrameworkID(sc: SparkContext) {
- sc.conf.remove("spark.mesos.driver.frameworkId")
- System.clearProperty("spark.mesos.driver.frameworkId")
- }
-
- def mesosToTaskState(state: MesosTaskState): TaskState.TaskState = state match {
- case MesosTaskState.TASK_STAGING | MesosTaskState.TASK_STARTING => TaskState.LAUNCHING
- case MesosTaskState.TASK_RUNNING | MesosTaskState.TASK_KILLING => TaskState.RUNNING
- case MesosTaskState.TASK_FINISHED => TaskState.FINISHED
- case MesosTaskState.TASK_FAILED => TaskState.FAILED
- case MesosTaskState.TASK_KILLED => TaskState.KILLED
- case MesosTaskState.TASK_LOST | MesosTaskState.TASK_ERROR => TaskState.LOST
- }
-
- def taskStateToMesos(state: TaskState.TaskState): MesosTaskState = state match {
- case TaskState.LAUNCHING => MesosTaskState.TASK_STARTING
- case TaskState.RUNNING => MesosTaskState.TASK_RUNNING
- case TaskState.FINISHED => MesosTaskState.TASK_FINISHED
- case TaskState.FAILED => MesosTaskState.TASK_FAILED
- case TaskState.KILLED => MesosTaskState.TASK_KILLED
- case TaskState.LOST => MesosTaskState.TASK_LOST
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchData.scala
----------------------------------------------------------------------
diff --git a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchData.scala b/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchData.scala
deleted file mode 100644
index 8370b61..0000000
--- a/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchData.scala
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import java.nio.ByteBuffer
-
-import org.apache.mesos.protobuf.ByteString
-
-import org.apache.spark.internal.Logging
-
-/**
- * Wrapper for serializing the data sent when launching Mesos tasks.
- */
-private[spark] case class MesosTaskLaunchData(
- serializedTask: ByteBuffer,
- attemptNumber: Int) extends Logging {
-
- def toByteString: ByteString = {
- val dataBuffer = ByteBuffer.allocate(4 + serializedTask.limit)
- dataBuffer.putInt(attemptNumber)
- dataBuffer.put(serializedTask)
- dataBuffer.rewind
- logDebug(s"ByteBuffer size: [${dataBuffer.remaining}]")
- ByteString.copyFrom(dataBuffer)
- }
-}
-
-private[spark] object MesosTaskLaunchData extends Logging {
- def fromByteString(byteString: ByteString): MesosTaskLaunchData = {
- val byteBuffer = byteString.asReadOnlyByteBuffer()
- logDebug(s"ByteBuffer size: [${byteBuffer.remaining}]")
- val attemptNumber = byteBuffer.getInt // updates the position by 4 bytes
- val serializedTask = byteBuffer.slice() // subsequence starting at the current position
- MesosTaskLaunchData(serializedTask, attemptNumber)
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArgumentsSuite.scala
----------------------------------------------------------------------
diff --git a/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArgumentsSuite.scala b/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArgumentsSuite.scala
deleted file mode 100644
index 33e7d69..0000000
--- a/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArgumentsSuite.scala
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.mesos
-
-import org.apache.spark.{SparkConf, SparkFunSuite}
-import org.apache.spark.deploy.TestPrematureExit
-
-class MesosClusterDispatcherArgumentsSuite extends SparkFunSuite
- with TestPrematureExit {
-
- test("test if spark config args are passed sucessfully") {
- val args = Array[String]("--master", "mesos://localhost:5050", "--conf", "key1=value1",
- "--conf", "spark.mesos.key2=value2", "--verbose")
- val conf = new SparkConf()
- new MesosClusterDispatcherArguments(args, conf)
-
- assert(conf.getOption("key1").isEmpty)
- assert(conf.get("spark.mesos.key2") == "value2")
- }
-
- test("test non conf settings") {
- val masterUrl = "mesos://localhost:5050"
- val port = "1212"
- val zookeeperUrl = "zk://localhost:2181"
- val host = "localhost"
- val webUiPort = "2323"
- val name = "myFramework"
-
- val args1 = Array("--master", masterUrl, "--verbose", "--name", name)
- val args2 = Array("-p", port, "-h", host, "-z", zookeeperUrl)
- val args3 = Array("--webui-port", webUiPort)
-
- val args = args1 ++ args2 ++ args3
- val conf = new SparkConf()
- val mesosDispClusterArgs = new MesosClusterDispatcherArguments(args, conf)
-
- assert(mesosDispClusterArgs.verbose)
- assert(mesosDispClusterArgs.confProperties.isEmpty)
- assert(mesosDispClusterArgs.host == host)
- assert(Option(mesosDispClusterArgs.masterUrl).isDefined)
- assert(mesosDispClusterArgs.masterUrl == masterUrl.stripPrefix("mesos://"))
- assert(Option(mesosDispClusterArgs.zookeeperUrl).isDefined)
- assert(mesosDispClusterArgs.zookeeperUrl == Some(zookeeperUrl))
- assert(mesosDispClusterArgs.name == name)
- assert(mesosDispClusterArgs.webUiPort == webUiPort.toInt)
- assert(mesosDispClusterArgs.port == port.toInt)
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherSuite.scala
----------------------------------------------------------------------
diff --git a/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherSuite.scala b/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherSuite.scala
deleted file mode 100644
index 7484e3b..0000000
--- a/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherSuite.scala
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.deploy.mesos
-
-import org.apache.spark.SparkFunSuite
-import org.apache.spark.deploy.TestPrematureExit
-
-class MesosClusterDispatcherSuite extends SparkFunSuite
- with TestPrematureExit{
-
- test("prints usage on empty input") {
- testPrematureExit(Array[String](),
- "Usage: MesosClusterDispatcher", MesosClusterDispatcher)
- }
-
- test("prints usage with only --help") {
- testPrematureExit(Array("--help"),
- "Usage: MesosClusterDispatcher", MesosClusterDispatcher)
- }
-
- test("prints error with unrecognized options") {
- testPrematureExit(Array("--blarg"), "Unrecognized option: '--blarg'", MesosClusterDispatcher)
- testPrematureExit(Array("-bleg"), "Unrecognized option: '-bleg'", MesosClusterDispatcher)
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterManagerSuite.scala
----------------------------------------------------------------------
diff --git a/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterManagerSuite.scala b/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterManagerSuite.scala
deleted file mode 100644
index a558554..0000000
--- a/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterManagerSuite.scala
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import org.apache.spark._
-import org.apache.spark.internal.config._
-
-class MesosClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
- def testURL(masterURL: String, expectedClass: Class[_], coarse: Boolean) {
- val conf = new SparkConf().set("spark.mesos.coarse", coarse.toString)
- sc = new SparkContext("local", "test", conf)
- val clusterManager = new MesosClusterManager()
-
- assert(clusterManager.canCreate(masterURL))
- val taskScheduler = clusterManager.createTaskScheduler(sc, masterURL)
- val sched = clusterManager.createSchedulerBackend(sc, masterURL, taskScheduler)
- assert(sched.getClass === expectedClass)
- }
-
- test("mesos fine-grained") {
- testURL("mesos://localhost:1234", classOf[MesosFineGrainedSchedulerBackend], coarse = false)
- }
-
- test("mesos coarse-grained") {
- testURL("mesos://localhost:1234", classOf[MesosCoarseGrainedSchedulerBackend], coarse = true)
- }
-
- test("mesos with zookeeper") {
- testURL("mesos://zk://localhost:1234,localhost:2345",
- classOf[MesosFineGrainedSchedulerBackend],
- coarse = false)
- }
-
- test("mesos with i/o encryption throws error") {
- val se = intercept[SparkException] {
- val conf = new SparkConf().setAppName("test").set(IO_ENCRYPTION_ENABLED, true)
- sc = new SparkContext("mesos", "test", conf)
- }
- assert(se.getCause().isInstanceOf[IllegalArgumentException])
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala
----------------------------------------------------------------------
diff --git a/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala b/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala
deleted file mode 100644
index 74e5ce2..0000000
--- a/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import java.util.{Collection, Collections, Date}
-
-import scala.collection.JavaConverters._
-
-import org.apache.mesos.Protos._
-import org.apache.mesos.Protos.Value.{Scalar, Type}
-import org.apache.mesos.SchedulerDriver
-import org.mockito.{ArgumentCaptor, Matchers}
-import org.mockito.Mockito._
-import org.scalatest.mock.MockitoSugar
-
-import org.apache.spark.{LocalSparkContext, SparkConf, SparkFunSuite}
-import org.apache.spark.deploy.Command
-import org.apache.spark.deploy.mesos.MesosDriverDescription
-
-class MesosClusterSchedulerSuite extends SparkFunSuite with LocalSparkContext with MockitoSugar {
-
- private val command = new Command("mainClass", Seq("arg"), Map(), Seq(), Seq(), Seq())
- private var driver: SchedulerDriver = _
- private var scheduler: MesosClusterScheduler = _
-
- private def setScheduler(sparkConfVars: Map[String, String] = null): Unit = {
- val conf = new SparkConf()
- conf.setMaster("mesos://localhost:5050")
- conf.setAppName("spark mesos")
-
- if (sparkConfVars != null) {
- conf.setAll(sparkConfVars)
- }
-
- driver = mock[SchedulerDriver]
- scheduler = new MesosClusterScheduler(
- new BlackHoleMesosClusterPersistenceEngineFactory, conf) {
- override def start(): Unit = { ready = true }
- }
- scheduler.start()
- }
-
- test("can queue drivers") {
- setScheduler()
-
- val response = scheduler.submitDriver(
- new MesosDriverDescription("d1", "jar", 1000, 1, true,
- command, Map[String, String](), "s1", new Date()))
- assert(response.success)
- val response2 =
- scheduler.submitDriver(new MesosDriverDescription(
- "d1", "jar", 1000, 1, true, command, Map[String, String](), "s2", new Date()))
- assert(response2.success)
- val state = scheduler.getSchedulerState()
- val queuedDrivers = state.queuedDrivers.toList
- assert(queuedDrivers(0).submissionId == response.submissionId)
- assert(queuedDrivers(1).submissionId == response2.submissionId)
- }
-
- test("can kill queued drivers") {
- setScheduler()
-
- val response = scheduler.submitDriver(
- new MesosDriverDescription("d1", "jar", 1000, 1, true,
- command, Map[String, String](), "s1", new Date()))
- assert(response.success)
- val killResponse = scheduler.killDriver(response.submissionId)
- assert(killResponse.success)
- val state = scheduler.getSchedulerState()
- assert(state.queuedDrivers.isEmpty)
- }
-
- test("can handle multiple roles") {
- setScheduler()
-
- val driver = mock[SchedulerDriver]
- val response = scheduler.submitDriver(
- new MesosDriverDescription("d1", "jar", 1200, 1.5, true,
- command,
- Map(("spark.mesos.executor.home", "test"), ("spark.app.name", "test")),
- "s1",
- new Date()))
- assert(response.success)
- val offer = Offer.newBuilder()
- .addResources(
- Resource.newBuilder().setRole("*")
- .setScalar(Scalar.newBuilder().setValue(1).build()).setName("cpus").setType(Type.SCALAR))
- .addResources(
- Resource.newBuilder().setRole("*")
- .setScalar(Scalar.newBuilder().setValue(1000).build())
- .setName("mem")
- .setType(Type.SCALAR))
- .addResources(
- Resource.newBuilder().setRole("role2")
- .setScalar(Scalar.newBuilder().setValue(1).build()).setName("cpus").setType(Type.SCALAR))
- .addResources(
- Resource.newBuilder().setRole("role2")
- .setScalar(Scalar.newBuilder().setValue(500).build()).setName("mem").setType(Type.SCALAR))
- .setId(OfferID.newBuilder().setValue("o1").build())
- .setFrameworkId(FrameworkID.newBuilder().setValue("f1").build())
- .setSlaveId(SlaveID.newBuilder().setValue("s1").build())
- .setHostname("host1")
- .build()
-
- val capture = ArgumentCaptor.forClass(classOf[Collection[TaskInfo]])
-
- when(
- driver.launchTasks(
- Matchers.eq(Collections.singleton(offer.getId)),
- capture.capture())
- ).thenReturn(Status.valueOf(1))
-
- scheduler.resourceOffers(driver, Collections.singletonList(offer))
-
- val taskInfos = capture.getValue
- assert(taskInfos.size() == 1)
- val taskInfo = taskInfos.iterator().next()
- val resources = taskInfo.getResourcesList
- assert(scheduler.getResource(resources, "cpus") == 1.5)
- assert(scheduler.getResource(resources, "mem") == 1200)
- val resourcesSeq: Seq[Resource] = resources.asScala
- val cpus = resourcesSeq.filter(_.getName.equals("cpus")).toList
- assert(cpus.size == 2)
- assert(cpus.exists(_.getRole().equals("role2")))
- assert(cpus.exists(_.getRole().equals("*")))
- val mem = resourcesSeq.filter(_.getName.equals("mem")).toList
- assert(mem.size == 2)
- assert(mem.exists(_.getRole().equals("role2")))
- assert(mem.exists(_.getRole().equals("*")))
-
- verify(driver, times(1)).launchTasks(
- Matchers.eq(Collections.singleton(offer.getId)),
- capture.capture()
- )
- }
-
- test("escapes commandline args for the shell") {
- setScheduler()
-
- val conf = new SparkConf()
- conf.setMaster("mesos://localhost:5050")
- conf.setAppName("spark mesos")
- val scheduler = new MesosClusterScheduler(
- new BlackHoleMesosClusterPersistenceEngineFactory, conf) {
- override def start(): Unit = { ready = true }
- }
- val escape = scheduler.shellEscape _
- def wrapped(str: String): String = "\"" + str + "\""
-
- // Wrapped in quotes
- assert(escape("'should be left untouched'") === "'should be left untouched'")
- assert(escape("\"should be left untouched\"") === "\"should be left untouched\"")
-
- // Harmless
- assert(escape("") === "")
- assert(escape("harmless") === "harmless")
- assert(escape("har-m.l3ss") === "har-m.l3ss")
-
- // Special Chars escape
- assert(escape("should escape this \" quote") === wrapped("should escape this \\\" quote"))
- assert(escape("shouldescape\"quote") === wrapped("shouldescape\\\"quote"))
- assert(escape("should escape this $ dollar") === wrapped("should escape this \\$ dollar"))
- assert(escape("should escape this ` backtick") === wrapped("should escape this \\` backtick"))
- assert(escape("""should escape this \ backslash""")
- === wrapped("""should escape this \\ backslash"""))
- assert(escape("""\"?""") === wrapped("""\\\"?"""))
-
-
- // Special Chars no escape only wrap
- List(" ", "'", "<", ">", "&", "|", "?", "*", ";", "!", "#", "(", ")").foreach(char => {
- assert(escape(s"onlywrap${char}this") === wrapped(s"onlywrap${char}this"))
- })
- }
-
- test("supports spark.mesos.driverEnv.*") {
- setScheduler()
-
- val mem = 1000
- val cpu = 1
-
- val response = scheduler.submitDriver(
- new MesosDriverDescription("d1", "jar", mem, cpu, true,
- command,
- Map("spark.mesos.executor.home" -> "test",
- "spark.app.name" -> "test",
- "spark.mesos.driverEnv.TEST_ENV" -> "TEST_VAL"),
- "s1",
- new Date()))
- assert(response.success)
-
- val offer = Utils.createOffer("o1", "s1", mem, cpu)
- scheduler.resourceOffers(driver, List(offer).asJava)
- val tasks = Utils.verifyTaskLaunched(driver, "o1")
- val env = tasks.head.getCommand.getEnvironment.getVariablesList.asScala.map(v =>
- (v.getName, v.getValue)).toMap
- assert(env.getOrElse("TEST_ENV", null) == "TEST_VAL")
- }
-
- test("supports spark.mesos.network.name") {
- setScheduler()
-
- val mem = 1000
- val cpu = 1
-
- val response = scheduler.submitDriver(
- new MesosDriverDescription("d1", "jar", mem, cpu, true,
- command,
- Map("spark.mesos.executor.home" -> "test",
- "spark.app.name" -> "test",
- "spark.mesos.network.name" -> "test-network-name"),
- "s1",
- new Date()))
-
- assert(response.success)
-
- val offer = Utils.createOffer("o1", "s1", mem, cpu)
- scheduler.resourceOffers(driver, List(offer).asJava)
-
- val launchedTasks = Utils.verifyTaskLaunched(driver, "o1")
- val networkInfos = launchedTasks.head.getContainer.getNetworkInfosList
- assert(networkInfos.size == 1)
- assert(networkInfos.get(0).getName == "test-network-name")
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala
----------------------------------------------------------------------
diff --git a/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala b/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala
deleted file mode 100644
index a674da4..0000000
--- a/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala
+++ /dev/null
@@ -1,601 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import java.util.concurrent.TimeUnit
-
-import scala.collection.JavaConverters._
-import scala.collection.mutable.ArrayBuffer
-import scala.concurrent.duration._
-import scala.concurrent.Promise
-import scala.reflect.ClassTag
-
-import org.apache.mesos.{Protos, Scheduler, SchedulerDriver}
-import org.apache.mesos.Protos._
-import org.mockito.Matchers
-import org.mockito.Matchers._
-import org.mockito.Mockito._
-import org.scalatest.concurrent.ScalaFutures
-import org.scalatest.mock.MockitoSugar
-import org.scalatest.BeforeAndAfter
-
-import org.apache.spark.{LocalSparkContext, SecurityManager, SparkConf, SparkContext, SparkFunSuite}
-import org.apache.spark.internal.config._
-import org.apache.spark.network.shuffle.mesos.MesosExternalShuffleClient
-import org.apache.spark.rpc.RpcEndpointRef
-import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RemoveExecutor
-import org.apache.spark.scheduler.TaskSchedulerImpl
-import org.apache.spark.scheduler.cluster.mesos.Utils._
-
-class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite
- with LocalSparkContext
- with MockitoSugar
- with BeforeAndAfter
- with ScalaFutures {
-
- private var sparkConf: SparkConf = _
- private var driver: SchedulerDriver = _
- private var taskScheduler: TaskSchedulerImpl = _
- private var backend: MesosCoarseGrainedSchedulerBackend = _
- private var externalShuffleClient: MesosExternalShuffleClient = _
- private var driverEndpoint: RpcEndpointRef = _
- @volatile private var stopCalled = false
-
- // All 'requests' to the scheduler run immediately on the same thread, so
- // demand that all futures have their value available immediately.
- implicit override val patienceConfig = PatienceConfig(timeout = Duration(0, TimeUnit.SECONDS))
-
- test("mesos supports killing and limiting executors") {
- setBackend()
- sparkConf.set("spark.driver.host", "driverHost")
- sparkConf.set("spark.driver.port", "1234")
-
- val minMem = backend.executorMemory(sc)
- val minCpu = 4
- val offers = List(Resources(minMem, minCpu))
-
- // launches a task on a valid offer
- offerResources(offers)
- verifyTaskLaunched(driver, "o1")
-
- // kills executors
- assert(backend.doRequestTotalExecutors(0).futureValue)
- assert(backend.doKillExecutors(Seq("0")).futureValue)
- val taskID0 = createTaskId("0")
- verify(driver, times(1)).killTask(taskID0)
-
- // doesn't launch a new task when requested executors == 0
- offerResources(offers, 2)
- verifyDeclinedOffer(driver, createOfferId("o2"))
-
- // Launches a new task when requested executors is positive
- backend.doRequestTotalExecutors(2)
- offerResources(offers, 2)
- verifyTaskLaunched(driver, "o2")
- }
-
- test("mesos supports killing and relaunching tasks with executors") {
- setBackend()
-
- // launches a task on a valid offer
- val minMem = backend.executorMemory(sc) + 1024
- val minCpu = 4
- val offer1 = Resources(minMem, minCpu)
- val offer2 = Resources(minMem, 1)
- offerResources(List(offer1, offer2))
- verifyTaskLaunched(driver, "o1")
-
- // accounts for a killed task
- val status = createTaskStatus("0", "s1", TaskState.TASK_KILLED)
- backend.statusUpdate(driver, status)
- verify(driver, times(1)).reviveOffers()
-
- // Launches a new task on a valid offer from the same slave
- offerResources(List(offer2))
- verifyTaskLaunched(driver, "o2")
- }
-
- test("mesos supports spark.executor.cores") {
- val executorCores = 4
- setBackend(Map("spark.executor.cores" -> executorCores.toString))
-
- val executorMemory = backend.executorMemory(sc)
- val offers = List(Resources(executorMemory * 2, executorCores + 1))
- offerResources(offers)
-
- val taskInfos = verifyTaskLaunched(driver, "o1")
- assert(taskInfos.length == 1)
-
- val cpus = backend.getResource(taskInfos.head.getResourcesList, "cpus")
- assert(cpus == executorCores)
- }
-
- test("mesos supports unset spark.executor.cores") {
- setBackend()
-
- val executorMemory = backend.executorMemory(sc)
- val offerCores = 10
- offerResources(List(Resources(executorMemory * 2, offerCores)))
-
- val taskInfos = verifyTaskLaunched(driver, "o1")
- assert(taskInfos.length == 1)
-
- val cpus = backend.getResource(taskInfos.head.getResourcesList, "cpus")
- assert(cpus == offerCores)
- }
-
- test("mesos does not acquire more than spark.cores.max") {
- val maxCores = 10
- setBackend(Map("spark.cores.max" -> maxCores.toString))
-
- val executorMemory = backend.executorMemory(sc)
- offerResources(List(Resources(executorMemory, maxCores + 1)))
-
- val taskInfos = verifyTaskLaunched(driver, "o1")
- assert(taskInfos.length == 1)
-
- val cpus = backend.getResource(taskInfos.head.getResourcesList, "cpus")
- assert(cpus == maxCores)
- }
-
- test("mesos does not acquire gpus if not specified") {
- setBackend()
-
- val executorMemory = backend.executorMemory(sc)
- offerResources(List(Resources(executorMemory, 1, 1)))
-
- val taskInfos = verifyTaskLaunched(driver, "o1")
- assert(taskInfos.length == 1)
-
- val gpus = backend.getResource(taskInfos.head.getResourcesList, "gpus")
- assert(gpus == 0.0)
- }
-
-
- test("mesos does not acquire more than spark.mesos.gpus.max") {
- val maxGpus = 5
- setBackend(Map("spark.mesos.gpus.max" -> maxGpus.toString))
-
- val executorMemory = backend.executorMemory(sc)
- offerResources(List(Resources(executorMemory, 1, maxGpus + 1)))
-
- val taskInfos = verifyTaskLaunched(driver, "o1")
- assert(taskInfos.length == 1)
-
- val gpus = backend.getResource(taskInfos.head.getResourcesList, "gpus")
- assert(gpus == maxGpus)
- }
-
-
- test("mesos declines offers that violate attribute constraints") {
- setBackend(Map("spark.mesos.constraints" -> "x:true"))
- offerResources(List(Resources(backend.executorMemory(sc), 4)))
- verifyDeclinedOffer(driver, createOfferId("o1"), true)
- }
-
- test("mesos declines offers with a filter when reached spark.cores.max") {
- val maxCores = 3
- setBackend(Map("spark.cores.max" -> maxCores.toString))
-
- val executorMemory = backend.executorMemory(sc)
- offerResources(List(
- Resources(executorMemory, maxCores + 1),
- Resources(executorMemory, maxCores + 1)))
-
- verifyTaskLaunched(driver, "o1")
- verifyDeclinedOffer(driver, createOfferId("o2"), true)
- }
-
- test("mesos assigns tasks round-robin on offers") {
- val executorCores = 4
- val maxCores = executorCores * 2
- setBackend(Map("spark.executor.cores" -> executorCores.toString,
- "spark.cores.max" -> maxCores.toString))
-
- val executorMemory = backend.executorMemory(sc)
- offerResources(List(
- Resources(executorMemory * 2, executorCores * 2),
- Resources(executorMemory * 2, executorCores * 2)))
-
- verifyTaskLaunched(driver, "o1")
- verifyTaskLaunched(driver, "o2")
- }
-
- test("mesos creates multiple executors on a single slave") {
- val executorCores = 4
- setBackend(Map("spark.executor.cores" -> executorCores.toString))
-
- // offer with room for two executors
- val executorMemory = backend.executorMemory(sc)
- offerResources(List(Resources(executorMemory * 2, executorCores * 2)))
-
- // verify two executors were started on a single offer
- val taskInfos = verifyTaskLaunched(driver, "o1")
- assert(taskInfos.length == 2)
- }
-
- test("mesos doesn't register twice with the same shuffle service") {
- setBackend(Map("spark.shuffle.service.enabled" -> "true"))
- val (mem, cpu) = (backend.executorMemory(sc), 4)
-
- val offer1 = createOffer("o1", "s1", mem, cpu)
- backend.resourceOffers(driver, List(offer1).asJava)
- verifyTaskLaunched(driver, "o1")
-
- val offer2 = createOffer("o2", "s1", mem, cpu)
- backend.resourceOffers(driver, List(offer2).asJava)
- verifyTaskLaunched(driver, "o2")
-
- val status1 = createTaskStatus("0", "s1", TaskState.TASK_RUNNING)
- backend.statusUpdate(driver, status1)
-
- val status2 = createTaskStatus("1", "s1", TaskState.TASK_RUNNING)
- backend.statusUpdate(driver, status2)
- verify(externalShuffleClient, times(1))
- .registerDriverWithShuffleService(anyString, anyInt, anyLong, anyLong)
- }
-
- test("Port offer decline when there is no appropriate range") {
- setBackend(Map(BLOCK_MANAGER_PORT.key -> "30100"))
- val offeredPorts = (31100L, 31200L)
- val (mem, cpu) = (backend.executorMemory(sc), 4)
-
- val offer1 = createOffer("o1", "s1", mem, cpu, Some(offeredPorts))
- backend.resourceOffers(driver, List(offer1).asJava)
- verify(driver, times(1)).declineOffer(offer1.getId)
- }
-
- test("Port offer accepted when ephemeral ports are used") {
- setBackend()
- val offeredPorts = (31100L, 31200L)
- val (mem, cpu) = (backend.executorMemory(sc), 4)
-
- val offer1 = createOffer("o1", "s1", mem, cpu, Some(offeredPorts))
- backend.resourceOffers(driver, List(offer1).asJava)
- verifyTaskLaunched(driver, "o1")
- }
-
- test("Port offer accepted with user defined port numbers") {
- val port = 30100
- setBackend(Map(BLOCK_MANAGER_PORT.key -> s"$port"))
- val offeredPorts = (30000L, 31000L)
- val (mem, cpu) = (backend.executorMemory(sc), 4)
-
- val offer1 = createOffer("o1", "s1", mem, cpu, Some(offeredPorts))
- backend.resourceOffers(driver, List(offer1).asJava)
- val taskInfo = verifyTaskLaunched(driver, "o1")
-
- val taskPortResources = taskInfo.head.getResourcesList.asScala.
- find(r => r.getType == Value.Type.RANGES && r.getName == "ports")
-
- val isPortInOffer = (r: Resource) => {
- r.getRanges().getRangeList
- .asScala.exists(range => range.getBegin == port && range.getEnd == port)
- }
- assert(taskPortResources.exists(isPortInOffer))
- }
-
- test("mesos kills an executor when told") {
- setBackend()
-
- val (mem, cpu) = (backend.executorMemory(sc), 4)
-
- val offer1 = createOffer("o1", "s1", mem, cpu)
- backend.resourceOffers(driver, List(offer1).asJava)
- verifyTaskLaunched(driver, "o1")
-
- backend.doKillExecutors(List("0"))
- verify(driver, times(1)).killTask(createTaskId("0"))
- }
-
- test("weburi is set in created scheduler driver") {
- setBackend()
- val taskScheduler = mock[TaskSchedulerImpl]
- when(taskScheduler.sc).thenReturn(sc)
- val driver = mock[SchedulerDriver]
- when(driver.start()).thenReturn(Protos.Status.DRIVER_RUNNING)
- val securityManager = mock[SecurityManager]
-
- val backend = new MesosCoarseGrainedSchedulerBackend(
- taskScheduler, sc, "master", securityManager) {
- override protected def createSchedulerDriver(
- masterUrl: String,
- scheduler: Scheduler,
- sparkUser: String,
- appName: String,
- conf: SparkConf,
- webuiUrl: Option[String] = None,
- checkpoint: Option[Boolean] = None,
- failoverTimeout: Option[Double] = None,
- frameworkId: Option[String] = None): SchedulerDriver = {
- markRegistered()
- assert(webuiUrl.isDefined)
- assert(webuiUrl.get.equals("http://webui"))
- driver
- }
- }
-
- backend.start()
- }
-
- test("honors unset spark.mesos.containerizer") {
- setBackend(Map("spark.mesos.executor.docker.image" -> "test"))
-
- val (mem, cpu) = (backend.executorMemory(sc), 4)
-
- val offer1 = createOffer("o1", "s1", mem, cpu)
- backend.resourceOffers(driver, List(offer1).asJava)
-
- val taskInfos = verifyTaskLaunched(driver, "o1")
- assert(taskInfos.head.getContainer.getType == ContainerInfo.Type.DOCKER)
- }
-
- test("honors spark.mesos.containerizer=\"mesos\"") {
- setBackend(Map(
- "spark.mesos.executor.docker.image" -> "test",
- "spark.mesos.containerizer" -> "mesos"))
-
- val (mem, cpu) = (backend.executorMemory(sc), 4)
-
- val offer1 = createOffer("o1", "s1", mem, cpu)
- backend.resourceOffers(driver, List(offer1).asJava)
-
- val taskInfos = verifyTaskLaunched(driver, "o1")
- assert(taskInfos.head.getContainer.getType == ContainerInfo.Type.MESOS)
- }
-
- test("docker settings are reflected in created tasks") {
- setBackend(Map(
- "spark.mesos.executor.docker.image" -> "some_image",
- "spark.mesos.executor.docker.forcePullImage" -> "true",
- "spark.mesos.executor.docker.volumes" -> "/host_vol:/container_vol:ro",
- "spark.mesos.executor.docker.portmaps" -> "8080:80:tcp"
- ))
-
- val (mem, cpu) = (backend.executorMemory(sc), 4)
-
- val offer1 = createOffer("o1", "s1", mem, cpu)
- backend.resourceOffers(driver, List(offer1).asJava)
-
- val launchedTasks = verifyTaskLaunched(driver, "o1")
- assert(launchedTasks.size == 1)
-
- val containerInfo = launchedTasks.head.getContainer
- assert(containerInfo.getType == ContainerInfo.Type.DOCKER)
-
- val volumes = containerInfo.getVolumesList.asScala
- assert(volumes.size == 1)
-
- val volume = volumes.head
- assert(volume.getHostPath == "/host_vol")
- assert(volume.getContainerPath == "/container_vol")
- assert(volume.getMode == Volume.Mode.RO)
-
- val dockerInfo = containerInfo.getDocker
-
- val portMappings = dockerInfo.getPortMappingsList.asScala
- assert(portMappings.size == 1)
-
- val portMapping = portMappings.head
- assert(portMapping.getHostPort == 8080)
- assert(portMapping.getContainerPort == 80)
- assert(portMapping.getProtocol == "tcp")
- }
-
- test("force-pull-image option is disabled by default") {
- setBackend(Map(
- "spark.mesos.executor.docker.image" -> "some_image"
- ))
-
- val (mem, cpu) = (backend.executorMemory(sc), 4)
-
- val offer1 = createOffer("o1", "s1", mem, cpu)
- backend.resourceOffers(driver, List(offer1).asJava)
-
- val launchedTasks = verifyTaskLaunched(driver, "o1")
- assert(launchedTasks.size == 1)
-
- val containerInfo = launchedTasks.head.getContainer
- assert(containerInfo.getType == ContainerInfo.Type.DOCKER)
-
- val dockerInfo = containerInfo.getDocker
-
- assert(dockerInfo.getImage == "some_image")
- assert(!dockerInfo.getForcePullImage)
- }
-
- test("Do not call removeExecutor() after backend is stopped") {
- setBackend()
-
- // launches a task on a valid offer
- val offers = List(Resources(backend.executorMemory(sc), 1))
- offerResources(offers)
- verifyTaskLaunched(driver, "o1")
-
- // launches a thread simulating status update
- val statusUpdateThread = new Thread {
- override def run(): Unit = {
- while (!stopCalled) {
- Thread.sleep(100)
- }
-
- val status = createTaskStatus("0", "s1", TaskState.TASK_FINISHED)
- backend.statusUpdate(driver, status)
- }
- }.start
-
- backend.stop()
- // Any method of the backend involving sending messages to the driver endpoint should not
- // be called after the backend is stopped.
- verify(driverEndpoint, never()).askWithRetry(isA(classOf[RemoveExecutor]))(any[ClassTag[_]])
- }
-
- test("mesos supports spark.executor.uri") {
- val url = "spark.spark.spark.com"
- setBackend(Map(
- "spark.executor.uri" -> url
- ), false)
-
- val (mem, cpu) = (backend.executorMemory(sc), 4)
-
- val offer1 = createOffer("o1", "s1", mem, cpu)
- backend.resourceOffers(driver, List(offer1).asJava)
-
- val launchedTasks = verifyTaskLaunched(driver, "o1")
- assert(launchedTasks.head.getCommand.getUrisList.asScala(0).getValue == url)
- }
-
- test("mesos supports setting fetcher cache") {
- val url = "spark.spark.spark.com"
- setBackend(Map(
- "spark.mesos.fetcherCache.enable" -> "true",
- "spark.executor.uri" -> url
- ), false)
- val offers = List(Resources(backend.executorMemory(sc), 1))
- offerResources(offers)
- val launchedTasks = verifyTaskLaunched(driver, "o1")
- val uris = launchedTasks.head.getCommand.getUrisList
- assert(uris.size() == 1)
- assert(uris.asScala.head.getCache)
- }
-
- test("mesos supports disabling fetcher cache") {
- val url = "spark.spark.spark.com"
- setBackend(Map(
- "spark.mesos.fetcherCache.enable" -> "false",
- "spark.executor.uri" -> url
- ), false)
- val offers = List(Resources(backend.executorMemory(sc), 1))
- offerResources(offers)
- val launchedTasks = verifyTaskLaunched(driver, "o1")
- val uris = launchedTasks.head.getCommand.getUrisList
- assert(uris.size() == 1)
- assert(!uris.asScala.head.getCache)
- }
-
- test("mesos supports spark.mesos.network.name") {
- setBackend(Map(
- "spark.mesos.network.name" -> "test-network-name"
- ))
-
- val (mem, cpu) = (backend.executorMemory(sc), 4)
-
- val offer1 = createOffer("o1", "s1", mem, cpu)
- backend.resourceOffers(driver, List(offer1).asJava)
-
- val launchedTasks = verifyTaskLaunched(driver, "o1")
- val networkInfos = launchedTasks.head.getContainer.getNetworkInfosList
- assert(networkInfos.size == 1)
- assert(networkInfos.get(0).getName == "test-network-name")
- }
-
- private case class Resources(mem: Int, cpus: Int, gpus: Int = 0)
-
- private def verifyDeclinedOffer(driver: SchedulerDriver,
- offerId: OfferID,
- filter: Boolean = false): Unit = {
- if (filter) {
- verify(driver, times(1)).declineOffer(Matchers.eq(offerId), anyObject[Filters])
- } else {
- verify(driver, times(1)).declineOffer(Matchers.eq(offerId))
- }
- }
-
- private def offerResources(offers: List[Resources], startId: Int = 1): Unit = {
- val mesosOffers = offers.zipWithIndex.map {case (offer, i) =>
- createOffer(s"o${i + startId}", s"s${i + startId}", offer.mem, offer.cpus, None, offer.gpus)}
-
- backend.resourceOffers(driver, mesosOffers.asJava)
- }
-
- private def createTaskStatus(taskId: String, slaveId: String, state: TaskState): TaskStatus = {
- TaskStatus.newBuilder()
- .setTaskId(TaskID.newBuilder().setValue(taskId).build())
- .setSlaveId(SlaveID.newBuilder().setValue(slaveId).build())
- .setState(state)
- .build
- }
-
- private def createSchedulerBackend(
- taskScheduler: TaskSchedulerImpl,
- driver: SchedulerDriver,
- shuffleClient: MesosExternalShuffleClient,
- endpoint: RpcEndpointRef): MesosCoarseGrainedSchedulerBackend = {
- val securityManager = mock[SecurityManager]
-
- val backend = new MesosCoarseGrainedSchedulerBackend(
- taskScheduler, sc, "master", securityManager) {
- override protected def createSchedulerDriver(
- masterUrl: String,
- scheduler: Scheduler,
- sparkUser: String,
- appName: String,
- conf: SparkConf,
- webuiUrl: Option[String] = None,
- checkpoint: Option[Boolean] = None,
- failoverTimeout: Option[Double] = None,
- frameworkId: Option[String] = None): SchedulerDriver = driver
-
- override protected def getShuffleClient(): MesosExternalShuffleClient = shuffleClient
-
- override protected def createDriverEndpointRef(
- properties: ArrayBuffer[(String, String)]): RpcEndpointRef = endpoint
-
- // override to avoid race condition with the driver thread on `mesosDriver`
- override def startScheduler(newDriver: SchedulerDriver): Unit = {
- mesosDriver = newDriver
- }
-
- override def stopExecutors(): Unit = {
- stopCalled = true
- }
-
- markRegistered()
- }
- backend.start()
- backend
- }
-
- private def setBackend(sparkConfVars: Map[String, String] = null,
- setHome: Boolean = true) {
- sparkConf = (new SparkConf)
- .setMaster("local[*]")
- .setAppName("test-mesos-dynamic-alloc")
- .set("spark.mesos.driver.webui.url", "http://webui")
-
- if (setHome) {
- sparkConf.setSparkHome("/path")
- }
-
- if (sparkConfVars != null) {
- sparkConf.setAll(sparkConfVars)
- }
-
- sc = new SparkContext(sparkConf)
-
- driver = mock[SchedulerDriver]
- when(driver.start()).thenReturn(Protos.Status.DRIVER_RUNNING)
- taskScheduler = mock[TaskSchedulerImpl]
- when(taskScheduler.sc).thenReturn(sc)
- externalShuffleClient = mock[MesosExternalShuffleClient]
- driverEndpoint = mock[RpcEndpointRef]
- when(driverEndpoint.ask(any())(any())).thenReturn(Promise().future)
-
- backend = createSchedulerBackend(taskScheduler, driver, externalShuffleClient, driverEndpoint)
- }
-}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala
----------------------------------------------------------------------
diff --git a/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala b/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala
deleted file mode 100644
index 1d7a86f..0000000
--- a/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala
+++ /dev/null
@@ -1,385 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.scheduler.cluster.mesos
-
-import java.nio.ByteBuffer
-import java.util.Arrays
-import java.util.Collection
-import java.util.Collections
-
-import scala.collection.JavaConverters._
-import scala.collection.mutable
-import scala.collection.mutable.ArrayBuffer
-
-import org.apache.mesos.{Protos, Scheduler, SchedulerDriver}
-import org.apache.mesos.Protos._
-import org.apache.mesos.Protos.Value.Scalar
-import org.mockito.{ArgumentCaptor, Matchers}
-import org.mockito.Matchers._
-import org.mockito.Mockito._
-import org.scalatest.mock.MockitoSugar
-
-import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
-import org.apache.spark.executor.MesosExecutorBackend
-import org.apache.spark.scheduler.{LiveListenerBus, SparkListenerExecutorAdded,
- TaskDescription, TaskSchedulerImpl, WorkerOffer}
-import org.apache.spark.scheduler.cluster.ExecutorInfo
-
-class MesosFineGrainedSchedulerBackendSuite
- extends SparkFunSuite with LocalSparkContext with MockitoSugar {
-
- test("weburi is set in created scheduler driver") {
- val conf = new SparkConf
- conf.set("spark.mesos.driver.webui.url", "http://webui")
- conf.set("spark.app.name", "name1")
-
- val sc = mock[SparkContext]
- when(sc.conf).thenReturn(conf)
- when(sc.sparkUser).thenReturn("sparkUser1")
- when(sc.appName).thenReturn("appName1")
-
- val taskScheduler = mock[TaskSchedulerImpl]
- val driver = mock[SchedulerDriver]
- when(driver.start()).thenReturn(Protos.Status.DRIVER_RUNNING)
-
- val backend = new MesosFineGrainedSchedulerBackend(taskScheduler, sc, "master") {
- override protected def createSchedulerDriver(
- masterUrl: String,
- scheduler: Scheduler,
- sparkUser: String,
- appName: String,
- conf: SparkConf,
- webuiUrl: Option[String] = None,
- checkpoint: Option[Boolean] = None,
- failoverTimeout: Option[Double] = None,
- frameworkId: Option[String] = None): SchedulerDriver = {
- markRegistered()
- assert(webuiUrl.isDefined)
- assert(webuiUrl.get.equals("http://webui"))
- driver
- }
- }
-
- backend.start()
- }
-
- test("Use configured mesosExecutor.cores for ExecutorInfo") {
- val mesosExecutorCores = 3
- val conf = new SparkConf
- conf.set("spark.mesos.mesosExecutor.cores", mesosExecutorCores.toString)
-
- val listenerBus = mock[LiveListenerBus]
- listenerBus.post(
- SparkListenerExecutorAdded(anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty)))
-
- val sc = mock[SparkContext]
- when(sc.getSparkHome()).thenReturn(Option("/spark-home"))
-
- when(sc.conf).thenReturn(conf)
- when(sc.executorEnvs).thenReturn(new mutable.HashMap[String, String])
- when(sc.executorMemory).thenReturn(100)
- when(sc.listenerBus).thenReturn(listenerBus)
- val taskScheduler = mock[TaskSchedulerImpl]
- when(taskScheduler.CPUS_PER_TASK).thenReturn(2)
-
- val mesosSchedulerBackend = new MesosFineGrainedSchedulerBackend(taskScheduler, sc, "master")
-
- val resources = Arrays.asList(
- mesosSchedulerBackend.createResource("cpus", 4),
- mesosSchedulerBackend.createResource("mem", 1024))
- // uri is null.
- val (executorInfo, _) = mesosSchedulerBackend.createExecutorInfo(resources, "test-id")
- val executorResources = executorInfo.getResourcesList
- val cpus = executorResources.asScala.find(_.getName.equals("cpus")).get.getScalar.getValue
-
- assert(cpus === mesosExecutorCores)
- }
-
- test("check spark-class location correctly") {
- val conf = new SparkConf
- conf.set("spark.mesos.executor.home", "/mesos-home")
-
- val listenerBus = mock[LiveListenerBus]
- listenerBus.post(
- SparkListenerExecutorAdded(anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty)))
-
- val sc = mock[SparkContext]
- when(sc.getSparkHome()).thenReturn(Option("/spark-home"))
-
- when(sc.conf).thenReturn(conf)
- when(sc.executorEnvs).thenReturn(new mutable.HashMap[String, String])
- when(sc.executorMemory).thenReturn(100)
- when(sc.listenerBus).thenReturn(listenerBus)
- val taskScheduler = mock[TaskSchedulerImpl]
- when(taskScheduler.CPUS_PER_TASK).thenReturn(2)
-
- val mesosSchedulerBackend = new MesosFineGrainedSchedulerBackend(taskScheduler, sc, "master")
-
- val resources = Arrays.asList(
- mesosSchedulerBackend.createResource("cpus", 4),
- mesosSchedulerBackend.createResource("mem", 1024))
- // uri is null.
- val (executorInfo, _) = mesosSchedulerBackend.createExecutorInfo(resources, "test-id")
- assert(executorInfo.getCommand.getValue ===
- s" /mesos-home/bin/spark-class ${classOf[MesosExecutorBackend].getName}")
-
- // uri exists.
- conf.set("spark.executor.uri", "hdfs:///test-app-1.0.0.tgz")
- val (executorInfo1, _) = mesosSchedulerBackend.createExecutorInfo(resources, "test-id")
- assert(executorInfo1.getCommand.getValue ===
- s"cd test-app-1*; ./bin/spark-class ${classOf[MesosExecutorBackend].getName}")
- }
-
- test("spark docker properties correctly populate the DockerInfo message") {
- val taskScheduler = mock[TaskSchedulerImpl]
-
- val conf = new SparkConf()
- .set("spark.mesos.executor.docker.image", "spark/mock")
- .set("spark.mesos.executor.docker.forcePullImage", "true")
- .set("spark.mesos.executor.docker.volumes", "/a,/b:/b,/c:/c:rw,/d:ro,/e:/e:ro")
- .set("spark.mesos.executor.docker.portmaps", "80:8080,53:53:tcp")
-
- val listenerBus = mock[LiveListenerBus]
- listenerBus.post(
- SparkListenerExecutorAdded(anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty)))
-
- val sc = mock[SparkContext]
- when(sc.executorMemory).thenReturn(100)
- when(sc.getSparkHome()).thenReturn(Option("/spark-home"))
- when(sc.executorEnvs).thenReturn(new mutable.HashMap[String, String])
- when(sc.conf).thenReturn(conf)
- when(sc.listenerBus).thenReturn(listenerBus)
-
- val backend = new MesosFineGrainedSchedulerBackend(taskScheduler, sc, "master")
-
- val (execInfo, _) = backend.createExecutorInfo(
- Arrays.asList(backend.createResource("cpus", 4)), "mockExecutor")
- assert(execInfo.getContainer.getDocker.getImage.equals("spark/mock"))
- assert(execInfo.getContainer.getDocker.getForcePullImage.equals(true))
- val portmaps = execInfo.getContainer.getDocker.getPortMappingsList
- assert(portmaps.get(0).getHostPort.equals(80))
- assert(portmaps.get(0).getContainerPort.equals(8080))
- assert(portmaps.get(0).getProtocol.equals("tcp"))
- assert(portmaps.get(1).getHostPort.equals(53))
- assert(portmaps.get(1).getContainerPort.equals(53))
- assert(portmaps.get(1).getProtocol.equals("tcp"))
- val volumes = execInfo.getContainer.getVolumesList
- assert(volumes.get(0).getContainerPath.equals("/a"))
- assert(volumes.get(0).getMode.equals(Volume.Mode.RW))
- assert(volumes.get(1).getContainerPath.equals("/b"))
- assert(volumes.get(1).getHostPath.equals("/b"))
- assert(volumes.get(1).getMode.equals(Volume.Mode.RW))
- assert(volumes.get(2).getContainerPath.equals("/c"))
- assert(volumes.get(2).getHostPath.equals("/c"))
- assert(volumes.get(2).getMode.equals(Volume.Mode.RW))
- assert(volumes.get(3).getContainerPath.equals("/d"))
- assert(volumes.get(3).getMode.equals(Volume.Mode.RO))
- assert(volumes.get(4).getContainerPath.equals("/e"))
- assert(volumes.get(4).getHostPath.equals("/e"))
- assert(volumes.get(4).getMode.equals(Volume.Mode.RO))
- }
-
- test("mesos resource offers result in launching tasks") {
- def createOffer(id: Int, mem: Int, cpu: Int): Offer = {
- val builder = Offer.newBuilder()
- builder.addResourcesBuilder()
- .setName("mem")
- .setType(Value.Type.SCALAR)
- .setScalar(Scalar.newBuilder().setValue(mem))
- builder.addResourcesBuilder()
- .setName("cpus")
- .setType(Value.Type.SCALAR)
- .setScalar(Scalar.newBuilder().setValue(cpu))
- builder.setId(OfferID.newBuilder().setValue(s"o${id.toString}").build())
- .setFrameworkId(FrameworkID.newBuilder().setValue("f1"))
- .setSlaveId(SlaveID.newBuilder().setValue(s"s${id.toString}"))
- .setHostname(s"host${id.toString}").build()
- }
-
- val driver = mock[SchedulerDriver]
- val taskScheduler = mock[TaskSchedulerImpl]
-
- val listenerBus = mock[LiveListenerBus]
- listenerBus.post(
- SparkListenerExecutorAdded(anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty)))
-
- val sc = mock[SparkContext]
- when(sc.executorMemory).thenReturn(100)
- when(sc.getSparkHome()).thenReturn(Option("/path"))
- when(sc.executorEnvs).thenReturn(new mutable.HashMap[String, String])
- when(sc.conf).thenReturn(new SparkConf)
- when(sc.listenerBus).thenReturn(listenerBus)
-
- val backend = new MesosFineGrainedSchedulerBackend(taskScheduler, sc, "master")
-
- val minMem = backend.executorMemory(sc)
- val minCpu = 4
-
- val mesosOffers = new java.util.ArrayList[Offer]
- mesosOffers.add(createOffer(1, minMem, minCpu))
- mesosOffers.add(createOffer(2, minMem - 1, minCpu))
- mesosOffers.add(createOffer(3, minMem, minCpu))
-
- val expectedWorkerOffers = new ArrayBuffer[WorkerOffer](2)
- expectedWorkerOffers += new WorkerOffer(
- mesosOffers.get(0).getSlaveId.getValue,
- mesosOffers.get(0).getHostname,
- (minCpu - backend.mesosExecutorCores).toInt
- )
- expectedWorkerOffers += new WorkerOffer(
- mesosOffers.get(2).getSlaveId.getValue,
- mesosOffers.get(2).getHostname,
- (minCpu - backend.mesosExecutorCores).toInt
- )
- val taskDesc = new TaskDescription(1L, 0, "s1", "n1", 0, ByteBuffer.wrap(new Array[Byte](0)))
- when(taskScheduler.resourceOffers(expectedWorkerOffers)).thenReturn(Seq(Seq(taskDesc)))
- when(taskScheduler.CPUS_PER_TASK).thenReturn(2)
-
- val capture = ArgumentCaptor.forClass(classOf[Collection[TaskInfo]])
- when(
- driver.launchTasks(
- Matchers.eq(Collections.singleton(mesosOffers.get(0).getId)),
- capture.capture(),
- any(classOf[Filters])
- )
- ).thenReturn(Status.valueOf(1))
- when(driver.declineOffer(mesosOffers.get(1).getId)).thenReturn(Status.valueOf(1))
- when(driver.declineOffer(mesosOffers.get(2).getId)).thenReturn(Status.valueOf(1))
-
- backend.resourceOffers(driver, mesosOffers)
-
- verify(driver, times(1)).launchTasks(
- Matchers.eq(Collections.singleton(mesosOffers.get(0).getId)),
- capture.capture(),
- any(classOf[Filters])
- )
- verify(driver, times(1)).declineOffer(mesosOffers.get(1).getId)
- verify(driver, times(1)).declineOffer(mesosOffers.get(2).getId)
- assert(capture.getValue.size() === 1)
- val taskInfo = capture.getValue.iterator().next()
- assert(taskInfo.getName.equals("n1"))
- val cpus = taskInfo.getResourcesList.get(0)
- assert(cpus.getName.equals("cpus"))
- assert(cpus.getScalar.getValue.equals(2.0))
- assert(taskInfo.getSlaveId.getValue.equals("s1"))
-
- // Unwanted resources offered on an existing node. Make sure they are declined
- val mesosOffers2 = new java.util.ArrayList[Offer]
- mesosOffers2.add(createOffer(1, minMem, minCpu))
- reset(taskScheduler)
- reset(driver)
- when(taskScheduler.resourceOffers(any(classOf[IndexedSeq[WorkerOffer]]))).thenReturn(Seq(Seq()))
- when(taskScheduler.CPUS_PER_TASK).thenReturn(2)
- when(driver.declineOffer(mesosOffers2.get(0).getId)).thenReturn(Status.valueOf(1))
-
- backend.resourceOffers(driver, mesosOffers2)
- verify(driver, times(1)).declineOffer(mesosOffers2.get(0).getId)
- }
-
- test("can handle multiple roles") {
- val driver = mock[SchedulerDriver]
- val taskScheduler = mock[TaskSchedulerImpl]
-
- val listenerBus = mock[LiveListenerBus]
- listenerBus.post(
- SparkListenerExecutorAdded(anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty)))
-
- val sc = mock[SparkContext]
- when(sc.executorMemory).thenReturn(100)
- when(sc.getSparkHome()).thenReturn(Option("/path"))
- when(sc.executorEnvs).thenReturn(new mutable.HashMap[String, String])
- when(sc.conf).thenReturn(new SparkConf)
- when(sc.listenerBus).thenReturn(listenerBus)
-
- val id = 1
- val builder = Offer.newBuilder()
- builder.addResourcesBuilder()
- .setName("mem")
- .setType(Value.Type.SCALAR)
- .setRole("prod")
- .setScalar(Scalar.newBuilder().setValue(500))
- builder.addResourcesBuilder()
- .setName("cpus")
- .setRole("prod")
- .setType(Value.Type.SCALAR)
- .setScalar(Scalar.newBuilder().setValue(1))
- builder.addResourcesBuilder()
- .setName("mem")
- .setRole("dev")
- .setType(Value.Type.SCALAR)
- .setScalar(Scalar.newBuilder().setValue(600))
- builder.addResourcesBuilder()
- .setName("cpus")
- .setRole("dev")
- .setType(Value.Type.SCALAR)
- .setScalar(Scalar.newBuilder().setValue(2))
- val offer = builder.setId(OfferID.newBuilder().setValue(s"o${id.toString}").build())
- .setFrameworkId(FrameworkID.newBuilder().setValue("f1"))
- .setSlaveId(SlaveID.newBuilder().setValue(s"s${id.toString}"))
- .setHostname(s"host${id.toString}").build()
-
- val mesosOffers = new java.util.ArrayList[Offer]
- mesosOffers.add(offer)
-
- val backend = new MesosFineGrainedSchedulerBackend(taskScheduler, sc, "master")
-
- val expectedWorkerOffers = new ArrayBuffer[WorkerOffer](1)
- expectedWorkerOffers += new WorkerOffer(
- mesosOffers.get(0).getSlaveId.getValue,
- mesosOffers.get(0).getHostname,
- 2 // Deducting 1 for executor
- )
-
- val taskDesc = new TaskDescription(1L, 0, "s1", "n1", 0, ByteBuffer.wrap(new Array[Byte](0)))
- when(taskScheduler.resourceOffers(expectedWorkerOffers)).thenReturn(Seq(Seq(taskDesc)))
- when(taskScheduler.CPUS_PER_TASK).thenReturn(1)
-
- val capture = ArgumentCaptor.forClass(classOf[Collection[TaskInfo]])
- when(
- driver.launchTasks(
- Matchers.eq(Collections.singleton(mesosOffers.get(0).getId)),
- capture.capture(),
- any(classOf[Filters])
- )
- ).thenReturn(Status.valueOf(1))
-
- backend.resourceOffers(driver, mesosOffers)
-
- verify(driver, times(1)).launchTasks(
- Matchers.eq(Collections.singleton(mesosOffers.get(0).getId)),
- capture.capture(),
- any(classOf[Filters])
- )
-
- assert(capture.getValue.size() === 1)
- val taskInfo = capture.getValue.iterator().next()
- assert(taskInfo.getName.equals("n1"))
- assert(taskInfo.getResourcesCount === 1)
- val cpusDev = taskInfo.getResourcesList.get(0)
- assert(cpusDev.getName.equals("cpus"))
- assert(cpusDev.getScalar.getValue.equals(1.0))
- assert(cpusDev.getRole.equals("dev"))
- val executorResources = taskInfo.getExecutor.getResourcesList.asScala
- assert(executorResources.exists { r =>
- r.getName.equals("mem") && r.getScalar.getValue.equals(484.0) && r.getRole.equals("prod")
- })
- assert(executorResources.exists { r =>
- r.getName.equals("cpus") && r.getScalar.getValue.equals(1.0) && r.getRole.equals("prod")
- })
- }
-}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org
[13/18] spark git commit: [SPARK-18662] Move resource managers to
separate directory
Posted by va...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala
new file mode 100644
index 0000000..1d742fe
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala
@@ -0,0 +1,524 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import java.util.{List => JList}
+import java.util.concurrent.CountDownLatch
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable.ArrayBuffer
+import scala.util.control.NonFatal
+
+import com.google.common.base.Splitter
+import org.apache.mesos.{MesosSchedulerDriver, Protos, Scheduler, SchedulerDriver}
+import org.apache.mesos.Protos.{TaskState => MesosTaskState, _}
+import org.apache.mesos.Protos.FrameworkInfo.Capability
+import org.apache.mesos.protobuf.{ByteString, GeneratedMessage}
+
+import org.apache.spark.{SparkConf, SparkContext, SparkException}
+import org.apache.spark.TaskState
+import org.apache.spark.internal.Logging
+import org.apache.spark.internal.config._
+import org.apache.spark.util.Utils
+
+
+
+/**
+ * Shared trait for implementing a Mesos Scheduler. This holds common state and helper
+ * methods and Mesos scheduler will use.
+ */
+trait MesosSchedulerUtils extends Logging {
+ // Lock used to wait for scheduler to be registered
+ private final val registerLatch = new CountDownLatch(1)
+
+ // Driver for talking to Mesos
+ protected var mesosDriver: SchedulerDriver = null
+
+ /**
+ * Creates a new MesosSchedulerDriver that communicates to the Mesos master.
+ *
+ * @param masterUrl The url to connect to Mesos master
+ * @param scheduler the scheduler class to receive scheduler callbacks
+ * @param sparkUser User to impersonate with when running tasks
+ * @param appName The framework name to display on the Mesos UI
+ * @param conf Spark configuration
+ * @param webuiUrl The WebUI url to link from Mesos UI
+ * @param checkpoint Option to checkpoint tasks for failover
+ * @param failoverTimeout Duration Mesos master expect scheduler to reconnect on disconnect
+ * @param frameworkId The id of the new framework
+ */
+ protected def createSchedulerDriver(
+ masterUrl: String,
+ scheduler: Scheduler,
+ sparkUser: String,
+ appName: String,
+ conf: SparkConf,
+ webuiUrl: Option[String] = None,
+ checkpoint: Option[Boolean] = None,
+ failoverTimeout: Option[Double] = None,
+ frameworkId: Option[String] = None): SchedulerDriver = {
+ val fwInfoBuilder = FrameworkInfo.newBuilder().setUser(sparkUser).setName(appName)
+ val credBuilder = Credential.newBuilder()
+ webuiUrl.foreach { url => fwInfoBuilder.setWebuiUrl(url) }
+ checkpoint.foreach { checkpoint => fwInfoBuilder.setCheckpoint(checkpoint) }
+ failoverTimeout.foreach { timeout => fwInfoBuilder.setFailoverTimeout(timeout) }
+ frameworkId.foreach { id =>
+ fwInfoBuilder.setId(FrameworkID.newBuilder().setValue(id).build())
+ }
+ fwInfoBuilder.setHostname(Option(conf.getenv("SPARK_PUBLIC_DNS")).getOrElse(
+ conf.get(DRIVER_HOST_ADDRESS)))
+ conf.getOption("spark.mesos.principal").foreach { principal =>
+ fwInfoBuilder.setPrincipal(principal)
+ credBuilder.setPrincipal(principal)
+ }
+ conf.getOption("spark.mesos.secret").foreach { secret =>
+ credBuilder.setSecret(secret)
+ }
+ if (credBuilder.hasSecret && !fwInfoBuilder.hasPrincipal) {
+ throw new SparkException(
+ "spark.mesos.principal must be configured when spark.mesos.secret is set")
+ }
+ conf.getOption("spark.mesos.role").foreach { role =>
+ fwInfoBuilder.setRole(role)
+ }
+ val maxGpus = conf.getInt("spark.mesos.gpus.max", 0)
+ if (maxGpus > 0) {
+ fwInfoBuilder.addCapabilities(Capability.newBuilder().setType(Capability.Type.GPU_RESOURCES))
+ }
+ if (credBuilder.hasPrincipal) {
+ new MesosSchedulerDriver(
+ scheduler, fwInfoBuilder.build(), masterUrl, credBuilder.build())
+ } else {
+ new MesosSchedulerDriver(scheduler, fwInfoBuilder.build(), masterUrl)
+ }
+ }
+
+ /**
+ * Starts the MesosSchedulerDriver and stores the current running driver to this new instance.
+ * This driver is expected to not be running.
+ * This method returns only after the scheduler has registered with Mesos.
+ */
+ def startScheduler(newDriver: SchedulerDriver): Unit = {
+ synchronized {
+ if (mesosDriver != null) {
+ registerLatch.await()
+ return
+ }
+ @volatile
+ var error: Option[Exception] = None
+
+ // We create a new thread that will block inside `mesosDriver.run`
+ // until the scheduler exists
+ new Thread(Utils.getFormattedClassName(this) + "-mesos-driver") {
+ setDaemon(true)
+ override def run() {
+ try {
+ mesosDriver = newDriver
+ val ret = mesosDriver.run()
+ logInfo("driver.run() returned with code " + ret)
+ if (ret != null && ret.equals(Status.DRIVER_ABORTED)) {
+ error = Some(new SparkException("Error starting driver, DRIVER_ABORTED"))
+ markErr()
+ }
+ } catch {
+ case e: Exception =>
+ logError("driver.run() failed", e)
+ error = Some(e)
+ markErr()
+ }
+ }
+ }.start()
+
+ registerLatch.await()
+
+ // propagate any error to the calling thread. This ensures that SparkContext creation fails
+ // without leaving a broken context that won't be able to schedule any tasks
+ error.foreach(throw _)
+ }
+ }
+
+ def getResource(res: JList[Resource], name: String): Double = {
+ // A resource can have multiple values in the offer since it can either be from
+ // a specific role or wildcard.
+ res.asScala.filter(_.getName == name).map(_.getScalar.getValue).sum
+ }
+
+ /**
+ * Transforms a range resource to a list of ranges
+ *
+ * @param res the mesos resource list
+ * @param name the name of the resource
+ * @return the list of ranges returned
+ */
+ protected def getRangeResource(res: JList[Resource], name: String): List[(Long, Long)] = {
+ // A resource can have multiple values in the offer since it can either be from
+ // a specific role or wildcard.
+ res.asScala.filter(_.getName == name).flatMap(_.getRanges.getRangeList.asScala
+ .map(r => (r.getBegin, r.getEnd)).toList).toList
+ }
+
+ /**
+ * Signal that the scheduler has registered with Mesos.
+ */
+ protected def markRegistered(): Unit = {
+ registerLatch.countDown()
+ }
+
+ protected def markErr(): Unit = {
+ registerLatch.countDown()
+ }
+
+ def createResource(name: String, amount: Double, role: Option[String] = None): Resource = {
+ val builder = Resource.newBuilder()
+ .setName(name)
+ .setType(Value.Type.SCALAR)
+ .setScalar(Value.Scalar.newBuilder().setValue(amount).build())
+
+ role.foreach { r => builder.setRole(r) }
+
+ builder.build()
+ }
+
+ /**
+ * Partition the existing set of resources into two groups, those remaining to be
+ * scheduled and those requested to be used for a new task.
+ *
+ * @param resources The full list of available resources
+ * @param resourceName The name of the resource to take from the available resources
+ * @param amountToUse The amount of resources to take from the available resources
+ * @return The remaining resources list and the used resources list.
+ */
+ def partitionResources(
+ resources: JList[Resource],
+ resourceName: String,
+ amountToUse: Double): (List[Resource], List[Resource]) = {
+ var remain = amountToUse
+ var requestedResources = new ArrayBuffer[Resource]
+ val remainingResources = resources.asScala.map {
+ case r =>
+ if (remain > 0 &&
+ r.getType == Value.Type.SCALAR &&
+ r.getScalar.getValue > 0.0 &&
+ r.getName == resourceName) {
+ val usage = Math.min(remain, r.getScalar.getValue)
+ requestedResources += createResource(resourceName, usage, Some(r.getRole))
+ remain -= usage
+ createResource(resourceName, r.getScalar.getValue - usage, Some(r.getRole))
+ } else {
+ r
+ }
+ }
+
+ // Filter any resource that has depleted.
+ val filteredResources =
+ remainingResources.filter(r => r.getType != Value.Type.SCALAR || r.getScalar.getValue > 0.0)
+
+ (filteredResources.toList, requestedResources.toList)
+ }
+
+ /** Helper method to get the key,value-set pair for a Mesos Attribute protobuf */
+ protected def getAttribute(attr: Attribute): (String, Set[String]) = {
+ (attr.getName, attr.getText.getValue.split(',').toSet)
+ }
+
+
+ /** Build a Mesos resource protobuf object */
+ protected def createResource(resourceName: String, quantity: Double): Protos.Resource = {
+ Resource.newBuilder()
+ .setName(resourceName)
+ .setType(Value.Type.SCALAR)
+ .setScalar(Value.Scalar.newBuilder().setValue(quantity).build())
+ .build()
+ }
+
+ /**
+ * Converts the attributes from the resource offer into a Map of name -> Attribute Value
+ * The attribute values are the mesos attribute types and they are
+ *
+ * @param offerAttributes the attributes offered
+ * @return
+ */
+ protected def toAttributeMap(offerAttributes: JList[Attribute]): Map[String, GeneratedMessage] = {
+ offerAttributes.asScala.map { attr =>
+ val attrValue = attr.getType match {
+ case Value.Type.SCALAR => attr.getScalar
+ case Value.Type.RANGES => attr.getRanges
+ case Value.Type.SET => attr.getSet
+ case Value.Type.TEXT => attr.getText
+ }
+ (attr.getName, attrValue)
+ }.toMap
+ }
+
+
+ /**
+ * Match the requirements (if any) to the offer attributes.
+ * if attribute requirements are not specified - return true
+ * else if attribute is defined and no values are given, simple attribute presence is performed
+ * else if attribute name and value is specified, subset match is performed on slave attributes
+ */
+ def matchesAttributeRequirements(
+ slaveOfferConstraints: Map[String, Set[String]],
+ offerAttributes: Map[String, GeneratedMessage]): Boolean = {
+ slaveOfferConstraints.forall {
+ // offer has the required attribute and subsumes the required values for that attribute
+ case (name, requiredValues) =>
+ offerAttributes.get(name) match {
+ case None => false
+ case Some(_) if requiredValues.isEmpty => true // empty value matches presence
+ case Some(scalarValue: Value.Scalar) =>
+ // check if provided values is less than equal to the offered values
+ requiredValues.map(_.toDouble).exists(_ <= scalarValue.getValue)
+ case Some(rangeValue: Value.Range) =>
+ val offerRange = rangeValue.getBegin to rangeValue.getEnd
+ // Check if there is some required value that is between the ranges specified
+ // Note: We only support the ability to specify discrete values, in the future
+ // we may expand it to subsume ranges specified with a XX..YY value or something
+ // similar to that.
+ requiredValues.map(_.toLong).exists(offerRange.contains(_))
+ case Some(offeredValue: Value.Set) =>
+ // check if the specified required values is a subset of offered set
+ requiredValues.subsetOf(offeredValue.getItemList.asScala.toSet)
+ case Some(textValue: Value.Text) =>
+ // check if the specified value is equal, if multiple values are specified
+ // we succeed if any of them match.
+ requiredValues.contains(textValue.getValue)
+ }
+ }
+ }
+
+ /**
+ * Parses the attributes constraints provided to spark and build a matching data struct:
+ * Map[<attribute-name>, Set[values-to-match]]
+ * The constraints are specified as ';' separated key-value pairs where keys and values
+ * are separated by ':'. The ':' implies equality (for singular values) and "is one of" for
+ * multiple values (comma separated). For example:
+ * {{{
+ * parseConstraintString("os:centos7;zone:us-east-1a,us-east-1b")
+ * // would result in
+ * <code>
+ * Map(
+ * "os" -> Set("centos7"),
+ * "zone": -> Set("us-east-1a", "us-east-1b")
+ * )
+ * }}}
+ *
+ * Mesos documentation: http://mesos.apache.org/documentation/attributes-resources/
+ * https://github.com/apache/mesos/blob/master/src/common/values.cpp
+ * https://github.com/apache/mesos/blob/master/src/common/attributes.cpp
+ *
+ * @param constraintsVal constaints string consisting of ';' separated key-value pairs (separated
+ * by ':')
+ * @return Map of constraints to match resources offers.
+ */
+ def parseConstraintString(constraintsVal: String): Map[String, Set[String]] = {
+ /*
+ Based on mesos docs:
+ attributes : attribute ( ";" attribute )*
+ attribute : labelString ":" ( labelString | "," )+
+ labelString : [a-zA-Z0-9_/.-]
+ */
+ val splitter = Splitter.on(';').trimResults().withKeyValueSeparator(':')
+ // kv splitter
+ if (constraintsVal.isEmpty) {
+ Map()
+ } else {
+ try {
+ splitter.split(constraintsVal).asScala.toMap.mapValues(v =>
+ if (v == null || v.isEmpty) {
+ Set[String]()
+ } else {
+ v.split(',').toSet
+ }
+ )
+ } catch {
+ case NonFatal(e) =>
+ throw new IllegalArgumentException(s"Bad constraint string: $constraintsVal", e)
+ }
+ }
+ }
+
+ // These defaults copied from YARN
+ private val MEMORY_OVERHEAD_FRACTION = 0.10
+ private val MEMORY_OVERHEAD_MINIMUM = 384
+
+ /**
+ * Return the amount of memory to allocate to each executor, taking into account
+ * container overheads.
+ *
+ * @param sc SparkContext to use to get `spark.mesos.executor.memoryOverhead` value
+ * @return memory requirement as (0.1 * <memoryOverhead>) or MEMORY_OVERHEAD_MINIMUM
+ * (whichever is larger)
+ */
+ def executorMemory(sc: SparkContext): Int = {
+ sc.conf.getInt("spark.mesos.executor.memoryOverhead",
+ math.max(MEMORY_OVERHEAD_FRACTION * sc.executorMemory, MEMORY_OVERHEAD_MINIMUM).toInt) +
+ sc.executorMemory
+ }
+
+ def setupUris(uris: String,
+ builder: CommandInfo.Builder,
+ useFetcherCache: Boolean = false): Unit = {
+ uris.split(",").foreach { uri =>
+ builder.addUris(CommandInfo.URI.newBuilder().setValue(uri.trim()).setCache(useFetcherCache))
+ }
+ }
+
+ protected def getRejectOfferDurationForUnmetConstraints(sc: SparkContext): Long = {
+ sc.conf.getTimeAsSeconds("spark.mesos.rejectOfferDurationForUnmetConstraints", "120s")
+ }
+
+ protected def getRejectOfferDurationForReachedMaxCores(sc: SparkContext): Long = {
+ sc.conf.getTimeAsSeconds("spark.mesos.rejectOfferDurationForReachedMaxCores", "120s")
+ }
+
+ /**
+ * Checks executor ports if they are within some range of the offered list of ports ranges,
+ *
+ * @param conf the Spark Config
+ * @param ports the list of ports to check
+ * @return true if ports are within range false otherwise
+ */
+ protected def checkPorts(conf: SparkConf, ports: List[(Long, Long)]): Boolean = {
+
+ def checkIfInRange(port: Long, ps: List[(Long, Long)]): Boolean = {
+ ps.exists{case (rangeStart, rangeEnd) => rangeStart <= port & rangeEnd >= port }
+ }
+
+ val portsToCheck = nonZeroPortValuesFromConfig(conf)
+ val withinRange = portsToCheck.forall(p => checkIfInRange(p, ports))
+ // make sure we have enough ports to allocate per offer
+ val enoughPorts =
+ ports.map{case (rangeStart, rangeEnd) => rangeEnd - rangeStart + 1}.sum >= portsToCheck.size
+ enoughPorts && withinRange
+ }
+
+ /**
+ * Partitions port resources.
+ *
+ * @param requestedPorts non-zero ports to assign
+ * @param offeredResources the resources offered
+ * @return resources left, port resources to be used.
+ */
+ def partitionPortResources(requestedPorts: List[Long], offeredResources: List[Resource])
+ : (List[Resource], List[Resource]) = {
+ if (requestedPorts.isEmpty) {
+ (offeredResources, List[Resource]())
+ } else {
+ // partition port offers
+ val (resourcesWithoutPorts, portResources) = filterPortResources(offeredResources)
+
+ val portsAndRoles = requestedPorts.
+ map(x => (x, findPortAndGetAssignedRangeRole(x, portResources)))
+
+ val assignedPortResources = createResourcesFromPorts(portsAndRoles)
+
+ // ignore non-assigned port resources, they will be declined implicitly by mesos
+ // no need for splitting port resources.
+ (resourcesWithoutPorts, assignedPortResources)
+ }
+ }
+
+ val managedPortNames = List("spark.executor.port", BLOCK_MANAGER_PORT.key)
+
+ /**
+ * The values of the non-zero ports to be used by the executor process.
+ * @param conf the spark config to use
+ * @return the ono-zero values of the ports
+ */
+ def nonZeroPortValuesFromConfig(conf: SparkConf): List[Long] = {
+ managedPortNames.map(conf.getLong(_, 0)).filter( _ != 0)
+ }
+
+ /** Creates a mesos resource for a specific port number. */
+ private def createResourcesFromPorts(portsAndRoles: List[(Long, String)]) : List[Resource] = {
+ portsAndRoles.flatMap{ case (port, role) =>
+ createMesosPortResource(List((port, port)), Some(role))}
+ }
+
+ /** Helper to create mesos resources for specific port ranges. */
+ private def createMesosPortResource(
+ ranges: List[(Long, Long)],
+ role: Option[String] = None): List[Resource] = {
+ ranges.map { case (rangeStart, rangeEnd) =>
+ val rangeValue = Value.Range.newBuilder()
+ .setBegin(rangeStart)
+ .setEnd(rangeEnd)
+ val builder = Resource.newBuilder()
+ .setName("ports")
+ .setType(Value.Type.RANGES)
+ .setRanges(Value.Ranges.newBuilder().addRange(rangeValue))
+ role.foreach(r => builder.setRole(r))
+ builder.build()
+ }
+ }
+
+ /**
+ * Helper to assign a port to an offered range and get the latter's role
+ * info to use it later on.
+ */
+ private def findPortAndGetAssignedRangeRole(port: Long, portResources: List[Resource])
+ : String = {
+
+ val ranges = portResources.
+ map(resource =>
+ (resource.getRole, resource.getRanges.getRangeList.asScala
+ .map(r => (r.getBegin, r.getEnd)).toList))
+
+ val rangePortRole = ranges
+ .find { case (role, rangeList) => rangeList
+ .exists{ case (rangeStart, rangeEnd) => rangeStart <= port & rangeEnd >= port}}
+ // this is safe since we have previously checked about the ranges (see checkPorts method)
+ rangePortRole.map{ case (role, rangeList) => role}.get
+ }
+
+ /** Retrieves the port resources from a list of mesos offered resources */
+ private def filterPortResources(resources: List[Resource]): (List[Resource], List[Resource]) = {
+ resources.partition { r => !(r.getType == Value.Type.RANGES && r.getName == "ports") }
+ }
+
+ /**
+ * spark.mesos.driver.frameworkId is set by the cluster dispatcher to correlate driver
+ * submissions with frameworkIDs. However, this causes issues when a driver process launches
+ * more than one framework (more than one SparkContext(, because they all try to register with
+ * the same frameworkID. To enforce that only the first driver registers with the configured
+ * framework ID, the driver calls this method after the first registration.
+ */
+ def unsetFrameworkID(sc: SparkContext) {
+ sc.conf.remove("spark.mesos.driver.frameworkId")
+ System.clearProperty("spark.mesos.driver.frameworkId")
+ }
+
+ def mesosToTaskState(state: MesosTaskState): TaskState.TaskState = state match {
+ case MesosTaskState.TASK_STAGING | MesosTaskState.TASK_STARTING => TaskState.LAUNCHING
+ case MesosTaskState.TASK_RUNNING | MesosTaskState.TASK_KILLING => TaskState.RUNNING
+ case MesosTaskState.TASK_FINISHED => TaskState.FINISHED
+ case MesosTaskState.TASK_FAILED => TaskState.FAILED
+ case MesosTaskState.TASK_KILLED => TaskState.KILLED
+ case MesosTaskState.TASK_LOST | MesosTaskState.TASK_ERROR => TaskState.LOST
+ }
+
+ def taskStateToMesos(state: TaskState.TaskState): MesosTaskState = state match {
+ case TaskState.LAUNCHING => MesosTaskState.TASK_STARTING
+ case TaskState.RUNNING => MesosTaskState.TASK_RUNNING
+ case TaskState.FINISHED => MesosTaskState.TASK_FINISHED
+ case TaskState.FAILED => MesosTaskState.TASK_FAILED
+ case TaskState.KILLED => MesosTaskState.TASK_KILLED
+ case TaskState.LOST => MesosTaskState.TASK_LOST
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchData.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchData.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchData.scala
new file mode 100644
index 0000000..8370b61
--- /dev/null
+++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosTaskLaunchData.scala
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import java.nio.ByteBuffer
+
+import org.apache.mesos.protobuf.ByteString
+
+import org.apache.spark.internal.Logging
+
+/**
+ * Wrapper for serializing the data sent when launching Mesos tasks.
+ */
+private[spark] case class MesosTaskLaunchData(
+ serializedTask: ByteBuffer,
+ attemptNumber: Int) extends Logging {
+
+ def toByteString: ByteString = {
+ val dataBuffer = ByteBuffer.allocate(4 + serializedTask.limit)
+ dataBuffer.putInt(attemptNumber)
+ dataBuffer.put(serializedTask)
+ dataBuffer.rewind
+ logDebug(s"ByteBuffer size: [${dataBuffer.remaining}]")
+ ByteString.copyFrom(dataBuffer)
+ }
+}
+
+private[spark] object MesosTaskLaunchData extends Logging {
+ def fromByteString(byteString: ByteString): MesosTaskLaunchData = {
+ val byteBuffer = byteString.asReadOnlyByteBuffer()
+ logDebug(s"ByteBuffer size: [${byteBuffer.remaining}]")
+ val attemptNumber = byteBuffer.getInt // updates the position by 4 bytes
+ val serializedTask = byteBuffer.slice() // subsequence starting at the current position
+ MesosTaskLaunchData(serializedTask, attemptNumber)
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArgumentsSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArgumentsSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArgumentsSuite.scala
new file mode 100644
index 0000000..33e7d69
--- /dev/null
+++ b/resource-managers/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherArgumentsSuite.scala
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.mesos
+
+import org.apache.spark.{SparkConf, SparkFunSuite}
+import org.apache.spark.deploy.TestPrematureExit
+
+class MesosClusterDispatcherArgumentsSuite extends SparkFunSuite
+ with TestPrematureExit {
+
+ test("test if spark config args are passed sucessfully") {
+ val args = Array[String]("--master", "mesos://localhost:5050", "--conf", "key1=value1",
+ "--conf", "spark.mesos.key2=value2", "--verbose")
+ val conf = new SparkConf()
+ new MesosClusterDispatcherArguments(args, conf)
+
+ assert(conf.getOption("key1").isEmpty)
+ assert(conf.get("spark.mesos.key2") == "value2")
+ }
+
+ test("test non conf settings") {
+ val masterUrl = "mesos://localhost:5050"
+ val port = "1212"
+ val zookeeperUrl = "zk://localhost:2181"
+ val host = "localhost"
+ val webUiPort = "2323"
+ val name = "myFramework"
+
+ val args1 = Array("--master", masterUrl, "--verbose", "--name", name)
+ val args2 = Array("-p", port, "-h", host, "-z", zookeeperUrl)
+ val args3 = Array("--webui-port", webUiPort)
+
+ val args = args1 ++ args2 ++ args3
+ val conf = new SparkConf()
+ val mesosDispClusterArgs = new MesosClusterDispatcherArguments(args, conf)
+
+ assert(mesosDispClusterArgs.verbose)
+ assert(mesosDispClusterArgs.confProperties.isEmpty)
+ assert(mesosDispClusterArgs.host == host)
+ assert(Option(mesosDispClusterArgs.masterUrl).isDefined)
+ assert(mesosDispClusterArgs.masterUrl == masterUrl.stripPrefix("mesos://"))
+ assert(Option(mesosDispClusterArgs.zookeeperUrl).isDefined)
+ assert(mesosDispClusterArgs.zookeeperUrl == Some(zookeeperUrl))
+ assert(mesosDispClusterArgs.name == name)
+ assert(mesosDispClusterArgs.webUiPort == webUiPort.toInt)
+ assert(mesosDispClusterArgs.port == port.toInt)
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherSuite.scala
new file mode 100644
index 0000000..7484e3b
--- /dev/null
+++ b/resource-managers/mesos/src/test/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcherSuite.scala
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.deploy.mesos
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.deploy.TestPrematureExit
+
+class MesosClusterDispatcherSuite extends SparkFunSuite
+ with TestPrematureExit{
+
+ test("prints usage on empty input") {
+ testPrematureExit(Array[String](),
+ "Usage: MesosClusterDispatcher", MesosClusterDispatcher)
+ }
+
+ test("prints usage with only --help") {
+ testPrematureExit(Array("--help"),
+ "Usage: MesosClusterDispatcher", MesosClusterDispatcher)
+ }
+
+ test("prints error with unrecognized options") {
+ testPrematureExit(Array("--blarg"), "Unrecognized option: '--blarg'", MesosClusterDispatcher)
+ testPrematureExit(Array("-bleg"), "Unrecognized option: '-bleg'", MesosClusterDispatcher)
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterManagerSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterManagerSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterManagerSuite.scala
new file mode 100644
index 0000000..a558554
--- /dev/null
+++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterManagerSuite.scala
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import org.apache.spark._
+import org.apache.spark.internal.config._
+
+class MesosClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
+ def testURL(masterURL: String, expectedClass: Class[_], coarse: Boolean) {
+ val conf = new SparkConf().set("spark.mesos.coarse", coarse.toString)
+ sc = new SparkContext("local", "test", conf)
+ val clusterManager = new MesosClusterManager()
+
+ assert(clusterManager.canCreate(masterURL))
+ val taskScheduler = clusterManager.createTaskScheduler(sc, masterURL)
+ val sched = clusterManager.createSchedulerBackend(sc, masterURL, taskScheduler)
+ assert(sched.getClass === expectedClass)
+ }
+
+ test("mesos fine-grained") {
+ testURL("mesos://localhost:1234", classOf[MesosFineGrainedSchedulerBackend], coarse = false)
+ }
+
+ test("mesos coarse-grained") {
+ testURL("mesos://localhost:1234", classOf[MesosCoarseGrainedSchedulerBackend], coarse = true)
+ }
+
+ test("mesos with zookeeper") {
+ testURL("mesos://zk://localhost:1234,localhost:2345",
+ classOf[MesosFineGrainedSchedulerBackend],
+ coarse = false)
+ }
+
+ test("mesos with i/o encryption throws error") {
+ val se = intercept[SparkException] {
+ val conf = new SparkConf().setAppName("test").set(IO_ENCRYPTION_ENABLED, true)
+ sc = new SparkContext("mesos", "test", conf)
+ }
+ assert(se.getCause().isInstanceOf[IllegalArgumentException])
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala
new file mode 100644
index 0000000..74e5ce2
--- /dev/null
+++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import java.util.{Collection, Collections, Date}
+
+import scala.collection.JavaConverters._
+
+import org.apache.mesos.Protos._
+import org.apache.mesos.Protos.Value.{Scalar, Type}
+import org.apache.mesos.SchedulerDriver
+import org.mockito.{ArgumentCaptor, Matchers}
+import org.mockito.Mockito._
+import org.scalatest.mock.MockitoSugar
+
+import org.apache.spark.{LocalSparkContext, SparkConf, SparkFunSuite}
+import org.apache.spark.deploy.Command
+import org.apache.spark.deploy.mesos.MesosDriverDescription
+
+class MesosClusterSchedulerSuite extends SparkFunSuite with LocalSparkContext with MockitoSugar {
+
+ private val command = new Command("mainClass", Seq("arg"), Map(), Seq(), Seq(), Seq())
+ private var driver: SchedulerDriver = _
+ private var scheduler: MesosClusterScheduler = _
+
+ private def setScheduler(sparkConfVars: Map[String, String] = null): Unit = {
+ val conf = new SparkConf()
+ conf.setMaster("mesos://localhost:5050")
+ conf.setAppName("spark mesos")
+
+ if (sparkConfVars != null) {
+ conf.setAll(sparkConfVars)
+ }
+
+ driver = mock[SchedulerDriver]
+ scheduler = new MesosClusterScheduler(
+ new BlackHoleMesosClusterPersistenceEngineFactory, conf) {
+ override def start(): Unit = { ready = true }
+ }
+ scheduler.start()
+ }
+
+ test("can queue drivers") {
+ setScheduler()
+
+ val response = scheduler.submitDriver(
+ new MesosDriverDescription("d1", "jar", 1000, 1, true,
+ command, Map[String, String](), "s1", new Date()))
+ assert(response.success)
+ val response2 =
+ scheduler.submitDriver(new MesosDriverDescription(
+ "d1", "jar", 1000, 1, true, command, Map[String, String](), "s2", new Date()))
+ assert(response2.success)
+ val state = scheduler.getSchedulerState()
+ val queuedDrivers = state.queuedDrivers.toList
+ assert(queuedDrivers(0).submissionId == response.submissionId)
+ assert(queuedDrivers(1).submissionId == response2.submissionId)
+ }
+
+ test("can kill queued drivers") {
+ setScheduler()
+
+ val response = scheduler.submitDriver(
+ new MesosDriverDescription("d1", "jar", 1000, 1, true,
+ command, Map[String, String](), "s1", new Date()))
+ assert(response.success)
+ val killResponse = scheduler.killDriver(response.submissionId)
+ assert(killResponse.success)
+ val state = scheduler.getSchedulerState()
+ assert(state.queuedDrivers.isEmpty)
+ }
+
+ test("can handle multiple roles") {
+ setScheduler()
+
+ val driver = mock[SchedulerDriver]
+ val response = scheduler.submitDriver(
+ new MesosDriverDescription("d1", "jar", 1200, 1.5, true,
+ command,
+ Map(("spark.mesos.executor.home", "test"), ("spark.app.name", "test")),
+ "s1",
+ new Date()))
+ assert(response.success)
+ val offer = Offer.newBuilder()
+ .addResources(
+ Resource.newBuilder().setRole("*")
+ .setScalar(Scalar.newBuilder().setValue(1).build()).setName("cpus").setType(Type.SCALAR))
+ .addResources(
+ Resource.newBuilder().setRole("*")
+ .setScalar(Scalar.newBuilder().setValue(1000).build())
+ .setName("mem")
+ .setType(Type.SCALAR))
+ .addResources(
+ Resource.newBuilder().setRole("role2")
+ .setScalar(Scalar.newBuilder().setValue(1).build()).setName("cpus").setType(Type.SCALAR))
+ .addResources(
+ Resource.newBuilder().setRole("role2")
+ .setScalar(Scalar.newBuilder().setValue(500).build()).setName("mem").setType(Type.SCALAR))
+ .setId(OfferID.newBuilder().setValue("o1").build())
+ .setFrameworkId(FrameworkID.newBuilder().setValue("f1").build())
+ .setSlaveId(SlaveID.newBuilder().setValue("s1").build())
+ .setHostname("host1")
+ .build()
+
+ val capture = ArgumentCaptor.forClass(classOf[Collection[TaskInfo]])
+
+ when(
+ driver.launchTasks(
+ Matchers.eq(Collections.singleton(offer.getId)),
+ capture.capture())
+ ).thenReturn(Status.valueOf(1))
+
+ scheduler.resourceOffers(driver, Collections.singletonList(offer))
+
+ val taskInfos = capture.getValue
+ assert(taskInfos.size() == 1)
+ val taskInfo = taskInfos.iterator().next()
+ val resources = taskInfo.getResourcesList
+ assert(scheduler.getResource(resources, "cpus") == 1.5)
+ assert(scheduler.getResource(resources, "mem") == 1200)
+ val resourcesSeq: Seq[Resource] = resources.asScala
+ val cpus = resourcesSeq.filter(_.getName.equals("cpus")).toList
+ assert(cpus.size == 2)
+ assert(cpus.exists(_.getRole().equals("role2")))
+ assert(cpus.exists(_.getRole().equals("*")))
+ val mem = resourcesSeq.filter(_.getName.equals("mem")).toList
+ assert(mem.size == 2)
+ assert(mem.exists(_.getRole().equals("role2")))
+ assert(mem.exists(_.getRole().equals("*")))
+
+ verify(driver, times(1)).launchTasks(
+ Matchers.eq(Collections.singleton(offer.getId)),
+ capture.capture()
+ )
+ }
+
+ test("escapes commandline args for the shell") {
+ setScheduler()
+
+ val conf = new SparkConf()
+ conf.setMaster("mesos://localhost:5050")
+ conf.setAppName("spark mesos")
+ val scheduler = new MesosClusterScheduler(
+ new BlackHoleMesosClusterPersistenceEngineFactory, conf) {
+ override def start(): Unit = { ready = true }
+ }
+ val escape = scheduler.shellEscape _
+ def wrapped(str: String): String = "\"" + str + "\""
+
+ // Wrapped in quotes
+ assert(escape("'should be left untouched'") === "'should be left untouched'")
+ assert(escape("\"should be left untouched\"") === "\"should be left untouched\"")
+
+ // Harmless
+ assert(escape("") === "")
+ assert(escape("harmless") === "harmless")
+ assert(escape("har-m.l3ss") === "har-m.l3ss")
+
+ // Special Chars escape
+ assert(escape("should escape this \" quote") === wrapped("should escape this \\\" quote"))
+ assert(escape("shouldescape\"quote") === wrapped("shouldescape\\\"quote"))
+ assert(escape("should escape this $ dollar") === wrapped("should escape this \\$ dollar"))
+ assert(escape("should escape this ` backtick") === wrapped("should escape this \\` backtick"))
+ assert(escape("""should escape this \ backslash""")
+ === wrapped("""should escape this \\ backslash"""))
+ assert(escape("""\"?""") === wrapped("""\\\"?"""))
+
+
+ // Special Chars no escape only wrap
+ List(" ", "'", "<", ">", "&", "|", "?", "*", ";", "!", "#", "(", ")").foreach(char => {
+ assert(escape(s"onlywrap${char}this") === wrapped(s"onlywrap${char}this"))
+ })
+ }
+
+ test("supports spark.mesos.driverEnv.*") {
+ setScheduler()
+
+ val mem = 1000
+ val cpu = 1
+
+ val response = scheduler.submitDriver(
+ new MesosDriverDescription("d1", "jar", mem, cpu, true,
+ command,
+ Map("spark.mesos.executor.home" -> "test",
+ "spark.app.name" -> "test",
+ "spark.mesos.driverEnv.TEST_ENV" -> "TEST_VAL"),
+ "s1",
+ new Date()))
+ assert(response.success)
+
+ val offer = Utils.createOffer("o1", "s1", mem, cpu)
+ scheduler.resourceOffers(driver, List(offer).asJava)
+ val tasks = Utils.verifyTaskLaunched(driver, "o1")
+ val env = tasks.head.getCommand.getEnvironment.getVariablesList.asScala.map(v =>
+ (v.getName, v.getValue)).toMap
+ assert(env.getOrElse("TEST_ENV", null) == "TEST_VAL")
+ }
+
+ test("supports spark.mesos.network.name") {
+ setScheduler()
+
+ val mem = 1000
+ val cpu = 1
+
+ val response = scheduler.submitDriver(
+ new MesosDriverDescription("d1", "jar", mem, cpu, true,
+ command,
+ Map("spark.mesos.executor.home" -> "test",
+ "spark.app.name" -> "test",
+ "spark.mesos.network.name" -> "test-network-name"),
+ "s1",
+ new Date()))
+
+ assert(response.success)
+
+ val offer = Utils.createOffer("o1", "s1", mem, cpu)
+ scheduler.resourceOffers(driver, List(offer).asJava)
+
+ val launchedTasks = Utils.verifyTaskLaunched(driver, "o1")
+ val networkInfos = launchedTasks.head.getContainer.getNetworkInfosList
+ assert(networkInfos.size == 1)
+ assert(networkInfos.get(0).getName == "test-network-name")
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala
new file mode 100644
index 0000000..a674da4
--- /dev/null
+++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala
@@ -0,0 +1,601 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import java.util.concurrent.TimeUnit
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable.ArrayBuffer
+import scala.concurrent.duration._
+import scala.concurrent.Promise
+import scala.reflect.ClassTag
+
+import org.apache.mesos.{Protos, Scheduler, SchedulerDriver}
+import org.apache.mesos.Protos._
+import org.mockito.Matchers
+import org.mockito.Matchers._
+import org.mockito.Mockito._
+import org.scalatest.concurrent.ScalaFutures
+import org.scalatest.mock.MockitoSugar
+import org.scalatest.BeforeAndAfter
+
+import org.apache.spark.{LocalSparkContext, SecurityManager, SparkConf, SparkContext, SparkFunSuite}
+import org.apache.spark.internal.config._
+import org.apache.spark.network.shuffle.mesos.MesosExternalShuffleClient
+import org.apache.spark.rpc.RpcEndpointRef
+import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RemoveExecutor
+import org.apache.spark.scheduler.TaskSchedulerImpl
+import org.apache.spark.scheduler.cluster.mesos.Utils._
+
+class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite
+ with LocalSparkContext
+ with MockitoSugar
+ with BeforeAndAfter
+ with ScalaFutures {
+
+ private var sparkConf: SparkConf = _
+ private var driver: SchedulerDriver = _
+ private var taskScheduler: TaskSchedulerImpl = _
+ private var backend: MesosCoarseGrainedSchedulerBackend = _
+ private var externalShuffleClient: MesosExternalShuffleClient = _
+ private var driverEndpoint: RpcEndpointRef = _
+ @volatile private var stopCalled = false
+
+ // All 'requests' to the scheduler run immediately on the same thread, so
+ // demand that all futures have their value available immediately.
+ implicit override val patienceConfig = PatienceConfig(timeout = Duration(0, TimeUnit.SECONDS))
+
+ test("mesos supports killing and limiting executors") {
+ setBackend()
+ sparkConf.set("spark.driver.host", "driverHost")
+ sparkConf.set("spark.driver.port", "1234")
+
+ val minMem = backend.executorMemory(sc)
+ val minCpu = 4
+ val offers = List(Resources(minMem, minCpu))
+
+ // launches a task on a valid offer
+ offerResources(offers)
+ verifyTaskLaunched(driver, "o1")
+
+ // kills executors
+ assert(backend.doRequestTotalExecutors(0).futureValue)
+ assert(backend.doKillExecutors(Seq("0")).futureValue)
+ val taskID0 = createTaskId("0")
+ verify(driver, times(1)).killTask(taskID0)
+
+ // doesn't launch a new task when requested executors == 0
+ offerResources(offers, 2)
+ verifyDeclinedOffer(driver, createOfferId("o2"))
+
+ // Launches a new task when requested executors is positive
+ backend.doRequestTotalExecutors(2)
+ offerResources(offers, 2)
+ verifyTaskLaunched(driver, "o2")
+ }
+
+ test("mesos supports killing and relaunching tasks with executors") {
+ setBackend()
+
+ // launches a task on a valid offer
+ val minMem = backend.executorMemory(sc) + 1024
+ val minCpu = 4
+ val offer1 = Resources(minMem, minCpu)
+ val offer2 = Resources(minMem, 1)
+ offerResources(List(offer1, offer2))
+ verifyTaskLaunched(driver, "o1")
+
+ // accounts for a killed task
+ val status = createTaskStatus("0", "s1", TaskState.TASK_KILLED)
+ backend.statusUpdate(driver, status)
+ verify(driver, times(1)).reviveOffers()
+
+ // Launches a new task on a valid offer from the same slave
+ offerResources(List(offer2))
+ verifyTaskLaunched(driver, "o2")
+ }
+
+ test("mesos supports spark.executor.cores") {
+ val executorCores = 4
+ setBackend(Map("spark.executor.cores" -> executorCores.toString))
+
+ val executorMemory = backend.executorMemory(sc)
+ val offers = List(Resources(executorMemory * 2, executorCores + 1))
+ offerResources(offers)
+
+ val taskInfos = verifyTaskLaunched(driver, "o1")
+ assert(taskInfos.length == 1)
+
+ val cpus = backend.getResource(taskInfos.head.getResourcesList, "cpus")
+ assert(cpus == executorCores)
+ }
+
+ test("mesos supports unset spark.executor.cores") {
+ setBackend()
+
+ val executorMemory = backend.executorMemory(sc)
+ val offerCores = 10
+ offerResources(List(Resources(executorMemory * 2, offerCores)))
+
+ val taskInfos = verifyTaskLaunched(driver, "o1")
+ assert(taskInfos.length == 1)
+
+ val cpus = backend.getResource(taskInfos.head.getResourcesList, "cpus")
+ assert(cpus == offerCores)
+ }
+
+ test("mesos does not acquire more than spark.cores.max") {
+ val maxCores = 10
+ setBackend(Map("spark.cores.max" -> maxCores.toString))
+
+ val executorMemory = backend.executorMemory(sc)
+ offerResources(List(Resources(executorMemory, maxCores + 1)))
+
+ val taskInfos = verifyTaskLaunched(driver, "o1")
+ assert(taskInfos.length == 1)
+
+ val cpus = backend.getResource(taskInfos.head.getResourcesList, "cpus")
+ assert(cpus == maxCores)
+ }
+
+ test("mesos does not acquire gpus if not specified") {
+ setBackend()
+
+ val executorMemory = backend.executorMemory(sc)
+ offerResources(List(Resources(executorMemory, 1, 1)))
+
+ val taskInfos = verifyTaskLaunched(driver, "o1")
+ assert(taskInfos.length == 1)
+
+ val gpus = backend.getResource(taskInfos.head.getResourcesList, "gpus")
+ assert(gpus == 0.0)
+ }
+
+
+ test("mesos does not acquire more than spark.mesos.gpus.max") {
+ val maxGpus = 5
+ setBackend(Map("spark.mesos.gpus.max" -> maxGpus.toString))
+
+ val executorMemory = backend.executorMemory(sc)
+ offerResources(List(Resources(executorMemory, 1, maxGpus + 1)))
+
+ val taskInfos = verifyTaskLaunched(driver, "o1")
+ assert(taskInfos.length == 1)
+
+ val gpus = backend.getResource(taskInfos.head.getResourcesList, "gpus")
+ assert(gpus == maxGpus)
+ }
+
+
+ test("mesos declines offers that violate attribute constraints") {
+ setBackend(Map("spark.mesos.constraints" -> "x:true"))
+ offerResources(List(Resources(backend.executorMemory(sc), 4)))
+ verifyDeclinedOffer(driver, createOfferId("o1"), true)
+ }
+
+ test("mesos declines offers with a filter when reached spark.cores.max") {
+ val maxCores = 3
+ setBackend(Map("spark.cores.max" -> maxCores.toString))
+
+ val executorMemory = backend.executorMemory(sc)
+ offerResources(List(
+ Resources(executorMemory, maxCores + 1),
+ Resources(executorMemory, maxCores + 1)))
+
+ verifyTaskLaunched(driver, "o1")
+ verifyDeclinedOffer(driver, createOfferId("o2"), true)
+ }
+
+ test("mesos assigns tasks round-robin on offers") {
+ val executorCores = 4
+ val maxCores = executorCores * 2
+ setBackend(Map("spark.executor.cores" -> executorCores.toString,
+ "spark.cores.max" -> maxCores.toString))
+
+ val executorMemory = backend.executorMemory(sc)
+ offerResources(List(
+ Resources(executorMemory * 2, executorCores * 2),
+ Resources(executorMemory * 2, executorCores * 2)))
+
+ verifyTaskLaunched(driver, "o1")
+ verifyTaskLaunched(driver, "o2")
+ }
+
+ test("mesos creates multiple executors on a single slave") {
+ val executorCores = 4
+ setBackend(Map("spark.executor.cores" -> executorCores.toString))
+
+ // offer with room for two executors
+ val executorMemory = backend.executorMemory(sc)
+ offerResources(List(Resources(executorMemory * 2, executorCores * 2)))
+
+ // verify two executors were started on a single offer
+ val taskInfos = verifyTaskLaunched(driver, "o1")
+ assert(taskInfos.length == 2)
+ }
+
+ test("mesos doesn't register twice with the same shuffle service") {
+ setBackend(Map("spark.shuffle.service.enabled" -> "true"))
+ val (mem, cpu) = (backend.executorMemory(sc), 4)
+
+ val offer1 = createOffer("o1", "s1", mem, cpu)
+ backend.resourceOffers(driver, List(offer1).asJava)
+ verifyTaskLaunched(driver, "o1")
+
+ val offer2 = createOffer("o2", "s1", mem, cpu)
+ backend.resourceOffers(driver, List(offer2).asJava)
+ verifyTaskLaunched(driver, "o2")
+
+ val status1 = createTaskStatus("0", "s1", TaskState.TASK_RUNNING)
+ backend.statusUpdate(driver, status1)
+
+ val status2 = createTaskStatus("1", "s1", TaskState.TASK_RUNNING)
+ backend.statusUpdate(driver, status2)
+ verify(externalShuffleClient, times(1))
+ .registerDriverWithShuffleService(anyString, anyInt, anyLong, anyLong)
+ }
+
+ test("Port offer decline when there is no appropriate range") {
+ setBackend(Map(BLOCK_MANAGER_PORT.key -> "30100"))
+ val offeredPorts = (31100L, 31200L)
+ val (mem, cpu) = (backend.executorMemory(sc), 4)
+
+ val offer1 = createOffer("o1", "s1", mem, cpu, Some(offeredPorts))
+ backend.resourceOffers(driver, List(offer1).asJava)
+ verify(driver, times(1)).declineOffer(offer1.getId)
+ }
+
+ test("Port offer accepted when ephemeral ports are used") {
+ setBackend()
+ val offeredPorts = (31100L, 31200L)
+ val (mem, cpu) = (backend.executorMemory(sc), 4)
+
+ val offer1 = createOffer("o1", "s1", mem, cpu, Some(offeredPorts))
+ backend.resourceOffers(driver, List(offer1).asJava)
+ verifyTaskLaunched(driver, "o1")
+ }
+
+ test("Port offer accepted with user defined port numbers") {
+ val port = 30100
+ setBackend(Map(BLOCK_MANAGER_PORT.key -> s"$port"))
+ val offeredPorts = (30000L, 31000L)
+ val (mem, cpu) = (backend.executorMemory(sc), 4)
+
+ val offer1 = createOffer("o1", "s1", mem, cpu, Some(offeredPorts))
+ backend.resourceOffers(driver, List(offer1).asJava)
+ val taskInfo = verifyTaskLaunched(driver, "o1")
+
+ val taskPortResources = taskInfo.head.getResourcesList.asScala.
+ find(r => r.getType == Value.Type.RANGES && r.getName == "ports")
+
+ val isPortInOffer = (r: Resource) => {
+ r.getRanges().getRangeList
+ .asScala.exists(range => range.getBegin == port && range.getEnd == port)
+ }
+ assert(taskPortResources.exists(isPortInOffer))
+ }
+
+ test("mesos kills an executor when told") {
+ setBackend()
+
+ val (mem, cpu) = (backend.executorMemory(sc), 4)
+
+ val offer1 = createOffer("o1", "s1", mem, cpu)
+ backend.resourceOffers(driver, List(offer1).asJava)
+ verifyTaskLaunched(driver, "o1")
+
+ backend.doKillExecutors(List("0"))
+ verify(driver, times(1)).killTask(createTaskId("0"))
+ }
+
+ test("weburi is set in created scheduler driver") {
+ setBackend()
+ val taskScheduler = mock[TaskSchedulerImpl]
+ when(taskScheduler.sc).thenReturn(sc)
+ val driver = mock[SchedulerDriver]
+ when(driver.start()).thenReturn(Protos.Status.DRIVER_RUNNING)
+ val securityManager = mock[SecurityManager]
+
+ val backend = new MesosCoarseGrainedSchedulerBackend(
+ taskScheduler, sc, "master", securityManager) {
+ override protected def createSchedulerDriver(
+ masterUrl: String,
+ scheduler: Scheduler,
+ sparkUser: String,
+ appName: String,
+ conf: SparkConf,
+ webuiUrl: Option[String] = None,
+ checkpoint: Option[Boolean] = None,
+ failoverTimeout: Option[Double] = None,
+ frameworkId: Option[String] = None): SchedulerDriver = {
+ markRegistered()
+ assert(webuiUrl.isDefined)
+ assert(webuiUrl.get.equals("http://webui"))
+ driver
+ }
+ }
+
+ backend.start()
+ }
+
+ test("honors unset spark.mesos.containerizer") {
+ setBackend(Map("spark.mesos.executor.docker.image" -> "test"))
+
+ val (mem, cpu) = (backend.executorMemory(sc), 4)
+
+ val offer1 = createOffer("o1", "s1", mem, cpu)
+ backend.resourceOffers(driver, List(offer1).asJava)
+
+ val taskInfos = verifyTaskLaunched(driver, "o1")
+ assert(taskInfos.head.getContainer.getType == ContainerInfo.Type.DOCKER)
+ }
+
+ test("honors spark.mesos.containerizer=\"mesos\"") {
+ setBackend(Map(
+ "spark.mesos.executor.docker.image" -> "test",
+ "spark.mesos.containerizer" -> "mesos"))
+
+ val (mem, cpu) = (backend.executorMemory(sc), 4)
+
+ val offer1 = createOffer("o1", "s1", mem, cpu)
+ backend.resourceOffers(driver, List(offer1).asJava)
+
+ val taskInfos = verifyTaskLaunched(driver, "o1")
+ assert(taskInfos.head.getContainer.getType == ContainerInfo.Type.MESOS)
+ }
+
+ test("docker settings are reflected in created tasks") {
+ setBackend(Map(
+ "spark.mesos.executor.docker.image" -> "some_image",
+ "spark.mesos.executor.docker.forcePullImage" -> "true",
+ "spark.mesos.executor.docker.volumes" -> "/host_vol:/container_vol:ro",
+ "spark.mesos.executor.docker.portmaps" -> "8080:80:tcp"
+ ))
+
+ val (mem, cpu) = (backend.executorMemory(sc), 4)
+
+ val offer1 = createOffer("o1", "s1", mem, cpu)
+ backend.resourceOffers(driver, List(offer1).asJava)
+
+ val launchedTasks = verifyTaskLaunched(driver, "o1")
+ assert(launchedTasks.size == 1)
+
+ val containerInfo = launchedTasks.head.getContainer
+ assert(containerInfo.getType == ContainerInfo.Type.DOCKER)
+
+ val volumes = containerInfo.getVolumesList.asScala
+ assert(volumes.size == 1)
+
+ val volume = volumes.head
+ assert(volume.getHostPath == "/host_vol")
+ assert(volume.getContainerPath == "/container_vol")
+ assert(volume.getMode == Volume.Mode.RO)
+
+ val dockerInfo = containerInfo.getDocker
+
+ val portMappings = dockerInfo.getPortMappingsList.asScala
+ assert(portMappings.size == 1)
+
+ val portMapping = portMappings.head
+ assert(portMapping.getHostPort == 8080)
+ assert(portMapping.getContainerPort == 80)
+ assert(portMapping.getProtocol == "tcp")
+ }
+
+ test("force-pull-image option is disabled by default") {
+ setBackend(Map(
+ "spark.mesos.executor.docker.image" -> "some_image"
+ ))
+
+ val (mem, cpu) = (backend.executorMemory(sc), 4)
+
+ val offer1 = createOffer("o1", "s1", mem, cpu)
+ backend.resourceOffers(driver, List(offer1).asJava)
+
+ val launchedTasks = verifyTaskLaunched(driver, "o1")
+ assert(launchedTasks.size == 1)
+
+ val containerInfo = launchedTasks.head.getContainer
+ assert(containerInfo.getType == ContainerInfo.Type.DOCKER)
+
+ val dockerInfo = containerInfo.getDocker
+
+ assert(dockerInfo.getImage == "some_image")
+ assert(!dockerInfo.getForcePullImage)
+ }
+
+ test("Do not call removeExecutor() after backend is stopped") {
+ setBackend()
+
+ // launches a task on a valid offer
+ val offers = List(Resources(backend.executorMemory(sc), 1))
+ offerResources(offers)
+ verifyTaskLaunched(driver, "o1")
+
+ // launches a thread simulating status update
+ val statusUpdateThread = new Thread {
+ override def run(): Unit = {
+ while (!stopCalled) {
+ Thread.sleep(100)
+ }
+
+ val status = createTaskStatus("0", "s1", TaskState.TASK_FINISHED)
+ backend.statusUpdate(driver, status)
+ }
+ }.start
+
+ backend.stop()
+ // Any method of the backend involving sending messages to the driver endpoint should not
+ // be called after the backend is stopped.
+ verify(driverEndpoint, never()).askWithRetry(isA(classOf[RemoveExecutor]))(any[ClassTag[_]])
+ }
+
+ test("mesos supports spark.executor.uri") {
+ val url = "spark.spark.spark.com"
+ setBackend(Map(
+ "spark.executor.uri" -> url
+ ), false)
+
+ val (mem, cpu) = (backend.executorMemory(sc), 4)
+
+ val offer1 = createOffer("o1", "s1", mem, cpu)
+ backend.resourceOffers(driver, List(offer1).asJava)
+
+ val launchedTasks = verifyTaskLaunched(driver, "o1")
+ assert(launchedTasks.head.getCommand.getUrisList.asScala(0).getValue == url)
+ }
+
+ test("mesos supports setting fetcher cache") {
+ val url = "spark.spark.spark.com"
+ setBackend(Map(
+ "spark.mesos.fetcherCache.enable" -> "true",
+ "spark.executor.uri" -> url
+ ), false)
+ val offers = List(Resources(backend.executorMemory(sc), 1))
+ offerResources(offers)
+ val launchedTasks = verifyTaskLaunched(driver, "o1")
+ val uris = launchedTasks.head.getCommand.getUrisList
+ assert(uris.size() == 1)
+ assert(uris.asScala.head.getCache)
+ }
+
+ test("mesos supports disabling fetcher cache") {
+ val url = "spark.spark.spark.com"
+ setBackend(Map(
+ "spark.mesos.fetcherCache.enable" -> "false",
+ "spark.executor.uri" -> url
+ ), false)
+ val offers = List(Resources(backend.executorMemory(sc), 1))
+ offerResources(offers)
+ val launchedTasks = verifyTaskLaunched(driver, "o1")
+ val uris = launchedTasks.head.getCommand.getUrisList
+ assert(uris.size() == 1)
+ assert(!uris.asScala.head.getCache)
+ }
+
+ test("mesos supports spark.mesos.network.name") {
+ setBackend(Map(
+ "spark.mesos.network.name" -> "test-network-name"
+ ))
+
+ val (mem, cpu) = (backend.executorMemory(sc), 4)
+
+ val offer1 = createOffer("o1", "s1", mem, cpu)
+ backend.resourceOffers(driver, List(offer1).asJava)
+
+ val launchedTasks = verifyTaskLaunched(driver, "o1")
+ val networkInfos = launchedTasks.head.getContainer.getNetworkInfosList
+ assert(networkInfos.size == 1)
+ assert(networkInfos.get(0).getName == "test-network-name")
+ }
+
+ private case class Resources(mem: Int, cpus: Int, gpus: Int = 0)
+
+ private def verifyDeclinedOffer(driver: SchedulerDriver,
+ offerId: OfferID,
+ filter: Boolean = false): Unit = {
+ if (filter) {
+ verify(driver, times(1)).declineOffer(Matchers.eq(offerId), anyObject[Filters])
+ } else {
+ verify(driver, times(1)).declineOffer(Matchers.eq(offerId))
+ }
+ }
+
+ private def offerResources(offers: List[Resources], startId: Int = 1): Unit = {
+ val mesosOffers = offers.zipWithIndex.map {case (offer, i) =>
+ createOffer(s"o${i + startId}", s"s${i + startId}", offer.mem, offer.cpus, None, offer.gpus)}
+
+ backend.resourceOffers(driver, mesosOffers.asJava)
+ }
+
+ private def createTaskStatus(taskId: String, slaveId: String, state: TaskState): TaskStatus = {
+ TaskStatus.newBuilder()
+ .setTaskId(TaskID.newBuilder().setValue(taskId).build())
+ .setSlaveId(SlaveID.newBuilder().setValue(slaveId).build())
+ .setState(state)
+ .build
+ }
+
+ private def createSchedulerBackend(
+ taskScheduler: TaskSchedulerImpl,
+ driver: SchedulerDriver,
+ shuffleClient: MesosExternalShuffleClient,
+ endpoint: RpcEndpointRef): MesosCoarseGrainedSchedulerBackend = {
+ val securityManager = mock[SecurityManager]
+
+ val backend = new MesosCoarseGrainedSchedulerBackend(
+ taskScheduler, sc, "master", securityManager) {
+ override protected def createSchedulerDriver(
+ masterUrl: String,
+ scheduler: Scheduler,
+ sparkUser: String,
+ appName: String,
+ conf: SparkConf,
+ webuiUrl: Option[String] = None,
+ checkpoint: Option[Boolean] = None,
+ failoverTimeout: Option[Double] = None,
+ frameworkId: Option[String] = None): SchedulerDriver = driver
+
+ override protected def getShuffleClient(): MesosExternalShuffleClient = shuffleClient
+
+ override protected def createDriverEndpointRef(
+ properties: ArrayBuffer[(String, String)]): RpcEndpointRef = endpoint
+
+ // override to avoid race condition with the driver thread on `mesosDriver`
+ override def startScheduler(newDriver: SchedulerDriver): Unit = {
+ mesosDriver = newDriver
+ }
+
+ override def stopExecutors(): Unit = {
+ stopCalled = true
+ }
+
+ markRegistered()
+ }
+ backend.start()
+ backend
+ }
+
+ private def setBackend(sparkConfVars: Map[String, String] = null,
+ setHome: Boolean = true) {
+ sparkConf = (new SparkConf)
+ .setMaster("local[*]")
+ .setAppName("test-mesos-dynamic-alloc")
+ .set("spark.mesos.driver.webui.url", "http://webui")
+
+ if (setHome) {
+ sparkConf.setSparkHome("/path")
+ }
+
+ if (sparkConfVars != null) {
+ sparkConf.setAll(sparkConfVars)
+ }
+
+ sc = new SparkContext(sparkConf)
+
+ driver = mock[SchedulerDriver]
+ when(driver.start()).thenReturn(Protos.Status.DRIVER_RUNNING)
+ taskScheduler = mock[TaskSchedulerImpl]
+ when(taskScheduler.sc).thenReturn(sc)
+ externalShuffleClient = mock[MesosExternalShuffleClient]
+ driverEndpoint = mock[RpcEndpointRef]
+ when(driverEndpoint.ask(any())(any())).thenReturn(Promise().future)
+
+ backend = createSchedulerBackend(taskScheduler, driver, externalShuffleClient, driverEndpoint)
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/81e5619c/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala
----------------------------------------------------------------------
diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala
new file mode 100644
index 0000000..1d7a86f
--- /dev/null
+++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosFineGrainedSchedulerBackendSuite.scala
@@ -0,0 +1,385 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.scheduler.cluster.mesos
+
+import java.nio.ByteBuffer
+import java.util.Arrays
+import java.util.Collection
+import java.util.Collections
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+
+import org.apache.mesos.{Protos, Scheduler, SchedulerDriver}
+import org.apache.mesos.Protos._
+import org.apache.mesos.Protos.Value.Scalar
+import org.mockito.{ArgumentCaptor, Matchers}
+import org.mockito.Matchers._
+import org.mockito.Mockito._
+import org.scalatest.mock.MockitoSugar
+
+import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
+import org.apache.spark.executor.MesosExecutorBackend
+import org.apache.spark.scheduler.{LiveListenerBus, SparkListenerExecutorAdded,
+ TaskDescription, TaskSchedulerImpl, WorkerOffer}
+import org.apache.spark.scheduler.cluster.ExecutorInfo
+
+class MesosFineGrainedSchedulerBackendSuite
+ extends SparkFunSuite with LocalSparkContext with MockitoSugar {
+
+ test("weburi is set in created scheduler driver") {
+ val conf = new SparkConf
+ conf.set("spark.mesos.driver.webui.url", "http://webui")
+ conf.set("spark.app.name", "name1")
+
+ val sc = mock[SparkContext]
+ when(sc.conf).thenReturn(conf)
+ when(sc.sparkUser).thenReturn("sparkUser1")
+ when(sc.appName).thenReturn("appName1")
+
+ val taskScheduler = mock[TaskSchedulerImpl]
+ val driver = mock[SchedulerDriver]
+ when(driver.start()).thenReturn(Protos.Status.DRIVER_RUNNING)
+
+ val backend = new MesosFineGrainedSchedulerBackend(taskScheduler, sc, "master") {
+ override protected def createSchedulerDriver(
+ masterUrl: String,
+ scheduler: Scheduler,
+ sparkUser: String,
+ appName: String,
+ conf: SparkConf,
+ webuiUrl: Option[String] = None,
+ checkpoint: Option[Boolean] = None,
+ failoverTimeout: Option[Double] = None,
+ frameworkId: Option[String] = None): SchedulerDriver = {
+ markRegistered()
+ assert(webuiUrl.isDefined)
+ assert(webuiUrl.get.equals("http://webui"))
+ driver
+ }
+ }
+
+ backend.start()
+ }
+
+ test("Use configured mesosExecutor.cores for ExecutorInfo") {
+ val mesosExecutorCores = 3
+ val conf = new SparkConf
+ conf.set("spark.mesos.mesosExecutor.cores", mesosExecutorCores.toString)
+
+ val listenerBus = mock[LiveListenerBus]
+ listenerBus.post(
+ SparkListenerExecutorAdded(anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty)))
+
+ val sc = mock[SparkContext]
+ when(sc.getSparkHome()).thenReturn(Option("/spark-home"))
+
+ when(sc.conf).thenReturn(conf)
+ when(sc.executorEnvs).thenReturn(new mutable.HashMap[String, String])
+ when(sc.executorMemory).thenReturn(100)
+ when(sc.listenerBus).thenReturn(listenerBus)
+ val taskScheduler = mock[TaskSchedulerImpl]
+ when(taskScheduler.CPUS_PER_TASK).thenReturn(2)
+
+ val mesosSchedulerBackend = new MesosFineGrainedSchedulerBackend(taskScheduler, sc, "master")
+
+ val resources = Arrays.asList(
+ mesosSchedulerBackend.createResource("cpus", 4),
+ mesosSchedulerBackend.createResource("mem", 1024))
+ // uri is null.
+ val (executorInfo, _) = mesosSchedulerBackend.createExecutorInfo(resources, "test-id")
+ val executorResources = executorInfo.getResourcesList
+ val cpus = executorResources.asScala.find(_.getName.equals("cpus")).get.getScalar.getValue
+
+ assert(cpus === mesosExecutorCores)
+ }
+
+ test("check spark-class location correctly") {
+ val conf = new SparkConf
+ conf.set("spark.mesos.executor.home", "/mesos-home")
+
+ val listenerBus = mock[LiveListenerBus]
+ listenerBus.post(
+ SparkListenerExecutorAdded(anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty)))
+
+ val sc = mock[SparkContext]
+ when(sc.getSparkHome()).thenReturn(Option("/spark-home"))
+
+ when(sc.conf).thenReturn(conf)
+ when(sc.executorEnvs).thenReturn(new mutable.HashMap[String, String])
+ when(sc.executorMemory).thenReturn(100)
+ when(sc.listenerBus).thenReturn(listenerBus)
+ val taskScheduler = mock[TaskSchedulerImpl]
+ when(taskScheduler.CPUS_PER_TASK).thenReturn(2)
+
+ val mesosSchedulerBackend = new MesosFineGrainedSchedulerBackend(taskScheduler, sc, "master")
+
+ val resources = Arrays.asList(
+ mesosSchedulerBackend.createResource("cpus", 4),
+ mesosSchedulerBackend.createResource("mem", 1024))
+ // uri is null.
+ val (executorInfo, _) = mesosSchedulerBackend.createExecutorInfo(resources, "test-id")
+ assert(executorInfo.getCommand.getValue ===
+ s" /mesos-home/bin/spark-class ${classOf[MesosExecutorBackend].getName}")
+
+ // uri exists.
+ conf.set("spark.executor.uri", "hdfs:///test-app-1.0.0.tgz")
+ val (executorInfo1, _) = mesosSchedulerBackend.createExecutorInfo(resources, "test-id")
+ assert(executorInfo1.getCommand.getValue ===
+ s"cd test-app-1*; ./bin/spark-class ${classOf[MesosExecutorBackend].getName}")
+ }
+
+ test("spark docker properties correctly populate the DockerInfo message") {
+ val taskScheduler = mock[TaskSchedulerImpl]
+
+ val conf = new SparkConf()
+ .set("spark.mesos.executor.docker.image", "spark/mock")
+ .set("spark.mesos.executor.docker.forcePullImage", "true")
+ .set("spark.mesos.executor.docker.volumes", "/a,/b:/b,/c:/c:rw,/d:ro,/e:/e:ro")
+ .set("spark.mesos.executor.docker.portmaps", "80:8080,53:53:tcp")
+
+ val listenerBus = mock[LiveListenerBus]
+ listenerBus.post(
+ SparkListenerExecutorAdded(anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty)))
+
+ val sc = mock[SparkContext]
+ when(sc.executorMemory).thenReturn(100)
+ when(sc.getSparkHome()).thenReturn(Option("/spark-home"))
+ when(sc.executorEnvs).thenReturn(new mutable.HashMap[String, String])
+ when(sc.conf).thenReturn(conf)
+ when(sc.listenerBus).thenReturn(listenerBus)
+
+ val backend = new MesosFineGrainedSchedulerBackend(taskScheduler, sc, "master")
+
+ val (execInfo, _) = backend.createExecutorInfo(
+ Arrays.asList(backend.createResource("cpus", 4)), "mockExecutor")
+ assert(execInfo.getContainer.getDocker.getImage.equals("spark/mock"))
+ assert(execInfo.getContainer.getDocker.getForcePullImage.equals(true))
+ val portmaps = execInfo.getContainer.getDocker.getPortMappingsList
+ assert(portmaps.get(0).getHostPort.equals(80))
+ assert(portmaps.get(0).getContainerPort.equals(8080))
+ assert(portmaps.get(0).getProtocol.equals("tcp"))
+ assert(portmaps.get(1).getHostPort.equals(53))
+ assert(portmaps.get(1).getContainerPort.equals(53))
+ assert(portmaps.get(1).getProtocol.equals("tcp"))
+ val volumes = execInfo.getContainer.getVolumesList
+ assert(volumes.get(0).getContainerPath.equals("/a"))
+ assert(volumes.get(0).getMode.equals(Volume.Mode.RW))
+ assert(volumes.get(1).getContainerPath.equals("/b"))
+ assert(volumes.get(1).getHostPath.equals("/b"))
+ assert(volumes.get(1).getMode.equals(Volume.Mode.RW))
+ assert(volumes.get(2).getContainerPath.equals("/c"))
+ assert(volumes.get(2).getHostPath.equals("/c"))
+ assert(volumes.get(2).getMode.equals(Volume.Mode.RW))
+ assert(volumes.get(3).getContainerPath.equals("/d"))
+ assert(volumes.get(3).getMode.equals(Volume.Mode.RO))
+ assert(volumes.get(4).getContainerPath.equals("/e"))
+ assert(volumes.get(4).getHostPath.equals("/e"))
+ assert(volumes.get(4).getMode.equals(Volume.Mode.RO))
+ }
+
+ test("mesos resource offers result in launching tasks") {
+ def createOffer(id: Int, mem: Int, cpu: Int): Offer = {
+ val builder = Offer.newBuilder()
+ builder.addResourcesBuilder()
+ .setName("mem")
+ .setType(Value.Type.SCALAR)
+ .setScalar(Scalar.newBuilder().setValue(mem))
+ builder.addResourcesBuilder()
+ .setName("cpus")
+ .setType(Value.Type.SCALAR)
+ .setScalar(Scalar.newBuilder().setValue(cpu))
+ builder.setId(OfferID.newBuilder().setValue(s"o${id.toString}").build())
+ .setFrameworkId(FrameworkID.newBuilder().setValue("f1"))
+ .setSlaveId(SlaveID.newBuilder().setValue(s"s${id.toString}"))
+ .setHostname(s"host${id.toString}").build()
+ }
+
+ val driver = mock[SchedulerDriver]
+ val taskScheduler = mock[TaskSchedulerImpl]
+
+ val listenerBus = mock[LiveListenerBus]
+ listenerBus.post(
+ SparkListenerExecutorAdded(anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty)))
+
+ val sc = mock[SparkContext]
+ when(sc.executorMemory).thenReturn(100)
+ when(sc.getSparkHome()).thenReturn(Option("/path"))
+ when(sc.executorEnvs).thenReturn(new mutable.HashMap[String, String])
+ when(sc.conf).thenReturn(new SparkConf)
+ when(sc.listenerBus).thenReturn(listenerBus)
+
+ val backend = new MesosFineGrainedSchedulerBackend(taskScheduler, sc, "master")
+
+ val minMem = backend.executorMemory(sc)
+ val minCpu = 4
+
+ val mesosOffers = new java.util.ArrayList[Offer]
+ mesosOffers.add(createOffer(1, minMem, minCpu))
+ mesosOffers.add(createOffer(2, minMem - 1, minCpu))
+ mesosOffers.add(createOffer(3, minMem, minCpu))
+
+ val expectedWorkerOffers = new ArrayBuffer[WorkerOffer](2)
+ expectedWorkerOffers += new WorkerOffer(
+ mesosOffers.get(0).getSlaveId.getValue,
+ mesosOffers.get(0).getHostname,
+ (minCpu - backend.mesosExecutorCores).toInt
+ )
+ expectedWorkerOffers += new WorkerOffer(
+ mesosOffers.get(2).getSlaveId.getValue,
+ mesosOffers.get(2).getHostname,
+ (minCpu - backend.mesosExecutorCores).toInt
+ )
+ val taskDesc = new TaskDescription(1L, 0, "s1", "n1", 0, ByteBuffer.wrap(new Array[Byte](0)))
+ when(taskScheduler.resourceOffers(expectedWorkerOffers)).thenReturn(Seq(Seq(taskDesc)))
+ when(taskScheduler.CPUS_PER_TASK).thenReturn(2)
+
+ val capture = ArgumentCaptor.forClass(classOf[Collection[TaskInfo]])
+ when(
+ driver.launchTasks(
+ Matchers.eq(Collections.singleton(mesosOffers.get(0).getId)),
+ capture.capture(),
+ any(classOf[Filters])
+ )
+ ).thenReturn(Status.valueOf(1))
+ when(driver.declineOffer(mesosOffers.get(1).getId)).thenReturn(Status.valueOf(1))
+ when(driver.declineOffer(mesosOffers.get(2).getId)).thenReturn(Status.valueOf(1))
+
+ backend.resourceOffers(driver, mesosOffers)
+
+ verify(driver, times(1)).launchTasks(
+ Matchers.eq(Collections.singleton(mesosOffers.get(0).getId)),
+ capture.capture(),
+ any(classOf[Filters])
+ )
+ verify(driver, times(1)).declineOffer(mesosOffers.get(1).getId)
+ verify(driver, times(1)).declineOffer(mesosOffers.get(2).getId)
+ assert(capture.getValue.size() === 1)
+ val taskInfo = capture.getValue.iterator().next()
+ assert(taskInfo.getName.equals("n1"))
+ val cpus = taskInfo.getResourcesList.get(0)
+ assert(cpus.getName.equals("cpus"))
+ assert(cpus.getScalar.getValue.equals(2.0))
+ assert(taskInfo.getSlaveId.getValue.equals("s1"))
+
+ // Unwanted resources offered on an existing node. Make sure they are declined
+ val mesosOffers2 = new java.util.ArrayList[Offer]
+ mesosOffers2.add(createOffer(1, minMem, minCpu))
+ reset(taskScheduler)
+ reset(driver)
+ when(taskScheduler.resourceOffers(any(classOf[IndexedSeq[WorkerOffer]]))).thenReturn(Seq(Seq()))
+ when(taskScheduler.CPUS_PER_TASK).thenReturn(2)
+ when(driver.declineOffer(mesosOffers2.get(0).getId)).thenReturn(Status.valueOf(1))
+
+ backend.resourceOffers(driver, mesosOffers2)
+ verify(driver, times(1)).declineOffer(mesosOffers2.get(0).getId)
+ }
+
+ test("can handle multiple roles") {
+ val driver = mock[SchedulerDriver]
+ val taskScheduler = mock[TaskSchedulerImpl]
+
+ val listenerBus = mock[LiveListenerBus]
+ listenerBus.post(
+ SparkListenerExecutorAdded(anyLong, "s1", new ExecutorInfo("host1", 2, Map.empty)))
+
+ val sc = mock[SparkContext]
+ when(sc.executorMemory).thenReturn(100)
+ when(sc.getSparkHome()).thenReturn(Option("/path"))
+ when(sc.executorEnvs).thenReturn(new mutable.HashMap[String, String])
+ when(sc.conf).thenReturn(new SparkConf)
+ when(sc.listenerBus).thenReturn(listenerBus)
+
+ val id = 1
+ val builder = Offer.newBuilder()
+ builder.addResourcesBuilder()
+ .setName("mem")
+ .setType(Value.Type.SCALAR)
+ .setRole("prod")
+ .setScalar(Scalar.newBuilder().setValue(500))
+ builder.addResourcesBuilder()
+ .setName("cpus")
+ .setRole("prod")
+ .setType(Value.Type.SCALAR)
+ .setScalar(Scalar.newBuilder().setValue(1))
+ builder.addResourcesBuilder()
+ .setName("mem")
+ .setRole("dev")
+ .setType(Value.Type.SCALAR)
+ .setScalar(Scalar.newBuilder().setValue(600))
+ builder.addResourcesBuilder()
+ .setName("cpus")
+ .setRole("dev")
+ .setType(Value.Type.SCALAR)
+ .setScalar(Scalar.newBuilder().setValue(2))
+ val offer = builder.setId(OfferID.newBuilder().setValue(s"o${id.toString}").build())
+ .setFrameworkId(FrameworkID.newBuilder().setValue("f1"))
+ .setSlaveId(SlaveID.newBuilder().setValue(s"s${id.toString}"))
+ .setHostname(s"host${id.toString}").build()
+
+ val mesosOffers = new java.util.ArrayList[Offer]
+ mesosOffers.add(offer)
+
+ val backend = new MesosFineGrainedSchedulerBackend(taskScheduler, sc, "master")
+
+ val expectedWorkerOffers = new ArrayBuffer[WorkerOffer](1)
+ expectedWorkerOffers += new WorkerOffer(
+ mesosOffers.get(0).getSlaveId.getValue,
+ mesosOffers.get(0).getHostname,
+ 2 // Deducting 1 for executor
+ )
+
+ val taskDesc = new TaskDescription(1L, 0, "s1", "n1", 0, ByteBuffer.wrap(new Array[Byte](0)))
+ when(taskScheduler.resourceOffers(expectedWorkerOffers)).thenReturn(Seq(Seq(taskDesc)))
+ when(taskScheduler.CPUS_PER_TASK).thenReturn(1)
+
+ val capture = ArgumentCaptor.forClass(classOf[Collection[TaskInfo]])
+ when(
+ driver.launchTasks(
+ Matchers.eq(Collections.singleton(mesosOffers.get(0).getId)),
+ capture.capture(),
+ any(classOf[Filters])
+ )
+ ).thenReturn(Status.valueOf(1))
+
+ backend.resourceOffers(driver, mesosOffers)
+
+ verify(driver, times(1)).launchTasks(
+ Matchers.eq(Collections.singleton(mesosOffers.get(0).getId)),
+ capture.capture(),
+ any(classOf[Filters])
+ )
+
+ assert(capture.getValue.size() === 1)
+ val taskInfo = capture.getValue.iterator().next()
+ assert(taskInfo.getName.equals("n1"))
+ assert(taskInfo.getResourcesCount === 1)
+ val cpusDev = taskInfo.getResourcesList.get(0)
+ assert(cpusDev.getName.equals("cpus"))
+ assert(cpusDev.getScalar.getValue.equals(1.0))
+ assert(cpusDev.getRole.equals("dev"))
+ val executorResources = taskInfo.getExecutor.getResourcesList.asScala
+ assert(executorResources.exists { r =>
+ r.getName.equals("mem") && r.getScalar.getValue.equals(484.0) && r.getRole.equals("prod")
+ })
+ assert(executorResources.exists { r =>
+ r.getName.equals("cpus") && r.getScalar.getValue.equals(1.0) && r.getRole.equals("prod")
+ })
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org