Posted to issues@spark.apache.org by "ASF GitHub Bot (JIRA)" <ji...@apache.org> on 2018/12/20 22:49:00 UTC

[jira] [Commented] (SPARK-23155) YARN-aggregated executor/driver logs appear unavailable when NM is down

    [ https://issues.apache.org/jira/browse/SPARK-23155?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16726287#comment-16726287 ] 

ASF GitHub Bot commented on SPARK-23155:
----------------------------------------

vanzin closed pull request #20326: [SPARK-23155][DEPLOY] log.server.url links in SHS
URL: https://github.com/apache/spark/pull/20326

As this is a pull request from a forked repository, GitHub hides the
original diff once it is closed; it is reproduced below for the sake of
provenance:

diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala
index 974697890dd03..79a81d3c74713 100644
--- a/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala
+++ b/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala
@@ -17,6 +17,7 @@
 package org.apache.spark.status.api.v1
 
 import java.io.OutputStream
+import java.net.URI
 import java.util.{List => JList}
 import java.util.zip.ZipOutputStream
 import javax.ws.rs._
@@ -49,7 +50,7 @@ private[v1] class AbstractApplicationResource extends BaseAppResource {
 
   @GET
   @Path("executors")
-  def executorList(): Seq[ExecutorSummary] = withUI(_.store.executorList(true))
+  def executorList(): Seq[ExecutorSummary] = fetchExecutors(true)
 
   @GET
   @Path("executors/{executorId}/threads")
@@ -76,7 +77,7 @@ private[v1] class AbstractApplicationResource extends BaseAppResource {
 
   @GET
   @Path("allexecutors")
-  def allExecutorList(): Seq[ExecutorSummary] = withUI(_.store.executorList(false))
+  def allExecutorList(): Seq[ExecutorSummary] = fetchExecutors(false)
 
   @Path("stages")
   def stages(): Class[StagesResource] = classOf[StagesResource]
@@ -160,6 +161,62 @@ private[v1] class AbstractApplicationResource extends BaseAppResource {
     classOf[OneApplicationAttemptResource]
   }
 
+  private def fetchExecutors(activeOnly: Boolean): Seq[ExecutorSummary] = {
+    withUI(ui => {
+      val tmpExecutorList = ui.store.executorList(activeOnly)
+      ui.yarnLogServerUrl.map(lurl =>
+        tmpExecutorList.map(withYarnLogServerLogs(toYarnLogServerUrl(lurl, ui.nmRpcPort)))
+      ).getOrElse(tmpExecutorList)
+    })
+  }
+
+  private def toYarnLogServerUrl(logServerUrl: String, nmPort: Int)(nmLogUrl: String): String = {
+    val containerSuffixPos = nmLogUrl.indexOf("container_")
+    if (containerSuffixPos >= 0) {
+      val nodeId = URI.create(nmLogUrl).getHost + ":" + nmPort
+      val containerSuffix = nmLogUrl.substring(containerSuffixPos)
+      val containerEndPos = containerSuffix.indexOf("/")
+      if (containerEndPos >= 0) {
+        val container = containerSuffix.substring(0, containerEndPos)
+        s"$logServerUrl/$nodeId/$container/$containerSuffix"
+      } else {
+        nmLogUrl
+      }
+    } else {
+      nmLogUrl
+    }
+  }
+
+  private def withYarnLogServerLogs(
+    logRewrite: String => String)(
+    info: ExecutorSummary): ExecutorSummary = {
+      new ExecutorSummary(
+        id = info.id,
+        hostPort = info.hostPort,
+        isActive = info.isActive,
+        rddBlocks = info.rddBlocks,
+        memoryUsed = info.memoryUsed,
+        diskUsed = info.diskUsed,
+        totalCores = info.totalCores,
+        maxTasks = info.maxTasks,
+        activeTasks = info.activeTasks,
+        failedTasks = info.failedTasks,
+        completedTasks = info.completedTasks,
+        totalTasks = info.totalTasks,
+        totalDuration = info.totalDuration,
+        totalGCTime = info.totalGCTime,
+        totalInputBytes = info.totalInputBytes,
+        totalShuffleRead = info.totalShuffleRead,
+        totalShuffleWrite = info.totalShuffleWrite,
+        isBlacklisted = info.isBlacklisted,
+        maxMemory = info.maxMemory,
+        addTime = info.addTime,
+        removeTime = info.removeTime,
+        removeReason = info.removeReason,
+        executorLogs = info.executorLogs.mapValues(logRewrite),
+        memoryMetrics = info.memoryMetrics
+      )
+  }
 }
 
 private[v1] class OneApplicationResource extends AbstractApplicationResource {
diff --git a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala
index b44ac0ea1febc..8acfe7161ff51 100644
--- a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala
+++ b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala
@@ -17,11 +17,12 @@
 
 package org.apache.spark.ui
 
-import java.util.{Date, List => JList, ServiceLoader}
+import java.util.Date
 
-import scala.collection.JavaConverters._
+import org.apache.hadoop.yarn.conf.YarnConfiguration
 
-import org.apache.spark.{JobExecutionStatus, SecurityManager, SparkConf, SparkContext}
+import org.apache.spark.{SecurityManager, SparkConf, SparkContext}
+import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.Logging
 import org.apache.spark.scheduler._
 import org.apache.spark.status.AppStatusStore
@@ -31,7 +32,6 @@ import org.apache.spark.ui.env.EnvironmentTab
 import org.apache.spark.ui.exec.ExecutorsTab
 import org.apache.spark.ui.jobs.{JobsTab, StagesTab}
 import org.apache.spark.ui.storage.StorageTab
-import org.apache.spark.util.Utils
 
 /**
  * Top level user interface for a Spark application.
@@ -52,6 +52,22 @@ private[spark] class SparkUI private (
 
   val killEnabled = sc.map(_.conf.getBoolean("spark.ui.killEnabled", true)).getOrElse(false)
 
+  private val yarnConf = SparkHadoopUtil.get.newConfiguration(conf)
+  private val portReg = "^.*:([0-9]+)$".r
+  private[spark] val nmRpcPort = Option(yarnConf.get(YarnConfiguration.NM_ADDRESS))
+    .map { case portReg(port) => port.toInt }
+    .getOrElse(0)
+
+  private val isHistoryUI = sc.isEmpty
+  private val useAggregatedLogs = isHistoryUI && nmRpcPort > 0 &&
+    yarnConf.getBoolean(YarnConfiguration.LOG_AGGREGATION_ENABLED, false)
+
+  private[spark] val yarnLogServerUrl = if (useAggregatedLogs) {
+      Option(yarnConf.get(YarnConfiguration.YARN_LOG_SERVER_URL))
+    } else {
+      None
+    }
+
   var appId: String = _
 
   private var streamingJobProgressListener: Option[SparkListener] = None
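
For illustration, here is a minimal, self-contained sketch of the URL
rewrite performed by toYarnLogServerUrl above. The transformation logic is
copied from the patch; the log server URL, NM port, host names, and
container ID are hypothetical values chosen for the example:

import java.net.URI

object YarnLogUrlRewriteExample {
  // Same transformation as toYarnLogServerUrl in the patch: rewrite an
  // NM container log URL into the aggregated log server's URL scheme,
  // or return it unchanged if no container ID can be located.
  def toYarnLogServerUrl(logServerUrl: String, nmPort: Int)(nmLogUrl: String): String = {
    val containerSuffixPos = nmLogUrl.indexOf("container_")
    if (containerSuffixPos >= 0) {
      val nodeId = URI.create(nmLogUrl).getHost + ":" + nmPort
      val containerSuffix = nmLogUrl.substring(containerSuffixPos)
      val containerEndPos = containerSuffix.indexOf("/")
      if (containerEndPos >= 0) {
        val container = containerSuffix.substring(0, containerEndPos)
        s"$logServerUrl/$nodeId/$container/$containerSuffix"
      } else {
        nmLogUrl
      }
    } else {
      nmLogUrl
    }
  }

  def main(args: Array[String]): Unit = {
    // Hypothetical values for illustration only.
    val logServerUrl = "http://historyserver.example.com:19888/jobhistory/logs"
    val nmRpcPort = 45454
    val nmLogUrl = "http://worker1.example.com:8042/node/containerlogs/" +
      "container_1516900607498_0001_01_000002/hadoop/stderr"

    println(toYarnLogServerUrl(logServerUrl, nmRpcPort)(nmLogUrl))
    // Prints:
    // http://historyserver.example.com:19888/jobhistory/logs/worker1.example.com:45454/container_1516900607498_0001_01_000002/container_1516900607498_0001_01_000002/hadoop/stderr
  }
}

Note that the container ID appears twice in the rewritten URL: the code
appends both the extracted container ID and the original suffix (which
itself starts with the container ID), matching the path layout the YARN
log server serves aggregated logs under.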


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> YARN-aggregated executor/driver logs appear unavailable when NM is down
> -----------------------------------------------------------------------
>
>                 Key: SPARK-23155
>                 URL: https://issues.apache.org/jira/browse/SPARK-23155
>             Project: Spark
>          Issue Type: Improvement
>          Components: Deploy
>    Affects Versions: 2.2.1
>            Reporter: Gera Shegalov
>            Priority: Major
>
> Unlike the MapReduce JobHistory Server, the Spark history server doesn't rewrite container log URLs to point to the aggregated yarn.log.server.url location; it relies on the NodeManager web UI to trigger a redirect. This fails when the NM is down. Note that the NM may be down permanently: after decommissioning in traditional environments, or in a cloud environment such as AWS EMR, where worker nodes are removed by autoscaling or the whole cluster is torn down after running a single job.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org