You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@openwhisk.apache.org by GitBox <gi...@apache.org> on 2018/10/26 06:33:20 UTC

[GitHub] chetanmeh closed pull request #4070: Diagnostic info + metrics for Docker command failures and timeouts

chetanmeh closed pull request #4070: Diagnostic info + metrics for Docker command failures and timeouts
URL: https://github.com/apache/incubator-openwhisk/pull/4070
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/common/scala/src/main/scala/whisk/common/Logging.scala b/common/scala/src/main/scala/whisk/common/Logging.scala
index 19642f9b97..d0457f879c 100644
--- a/common/scala/src/main/scala/whisk/common/Logging.scala
+++ b/common/scala/src/main/scala/whisk/common/Logging.scala
@@ -243,6 +243,7 @@ object LoggingMarkers {
   val finish = "finish"
   val error = "error"
   val count = "count"
+  val timeout = "timeout"
 
   private val controller = "controller"
   private val invoker = "invoker"
@@ -296,6 +297,8 @@ object LoggingMarkers {
   // Time in invoker
   val INVOKER_ACTIVATION = LogMarkerToken(invoker, activation, start)
   def INVOKER_DOCKER_CMD(cmd: String) = LogMarkerToken(invoker, "docker", start, Some(cmd), Map("cmd" -> cmd))
+  def INVOKER_DOCKER_CMD_TIMEOUT(cmd: String) =
+    LogMarkerToken(invoker, "docker", timeout, Some(cmd), Map("cmd" -> cmd))
   def INVOKER_RUNC_CMD(cmd: String) = LogMarkerToken(invoker, "runc", start, Some(cmd), Map("cmd" -> cmd))
   def INVOKER_KUBECTL_CMD(cmd: String) = LogMarkerToken(invoker, "kubectl", start, Some(cmd), Map("cmd" -> cmd))
   def INVOKER_CONTAINER_START(containerState: String) =
diff --git a/core/invoker/src/main/scala/whisk/core/containerpool/docker/DockerClient.scala b/core/invoker/src/main/scala/whisk/core/containerpool/docker/DockerClient.scala
index 8a87e379e8..371b38a40b 100644
--- a/core/invoker/src/main/scala/whisk/core/containerpool/docker/DockerClient.scala
+++ b/core/invoker/src/main/scala/whisk/core/containerpool/docker/DockerClient.scala
@@ -33,9 +33,7 @@ import scala.util.Success
 import scala.util.Try
 import akka.event.Logging.{ErrorLevel, InfoLevel}
 import pureconfig.loadConfigOrThrow
-import whisk.common.Logging
-import whisk.common.LoggingMarkers
-import whisk.common.TransactionId
+import whisk.common.{Logging, LoggingMarkers, MetricEmitter, TransactionId}
 import whisk.core.ConfigKeys
 import whisk.core.containerpool.ContainerId
 import whisk.core.containerpool.ContainerAddress
@@ -127,11 +125,11 @@ class DockerClient(dockerHost: Option[String] = None,
         .map(ContainerId.apply)
         .recoverWith {
           // https://docs.docker.com/v1.12/engine/reference/run/#/exit-status
-          // Exit code 125 means an error reported by the Docker daemon.
+          // Exit status 125 means an error reported by the Docker daemon.
           // Examples:
           // - Unrecognized option specified
           // - Not enough disk space
-          case pre: ProcessRunningException if pre.exitCode == 125 =>
+          case pre: ProcessUnsuccessfulException if pre.exitStatus == ExitStatus(125) =>
             Future.failed(
               DockerContainerId
                 .parse(pre.stdout)
@@ -189,6 +187,9 @@ class DockerClient(dockerHost: Option[String] = None,
       logLevel = InfoLevel)
     executeProcess(cmd, timeout).andThen {
       case Success(_) => transid.finished(this, start)
+      case Failure(pte: ProcessTimeoutException) =>
+        transid.failed(this, start, pte.getMessage, ErrorLevel)
+        MetricEmitter.emitCounterMetric(LoggingMarkers.INVOKER_DOCKER_CMD_TIMEOUT(args.head))
       case Failure(t) => transid.failed(this, start, t.getMessage, ErrorLevel)
     }
   }
diff --git a/core/invoker/src/main/scala/whisk/core/containerpool/docker/ProcessRunner.scala b/core/invoker/src/main/scala/whisk/core/containerpool/docker/ProcessRunner.scala
index b27e62fde9..f943b808e4 100644
--- a/core/invoker/src/main/scala/whisk/core/containerpool/docker/ProcessRunner.scala
+++ b/core/invoker/src/main/scala/whisk/core/containerpool/docker/ProcessRunner.scala
@@ -32,7 +32,7 @@ trait ProcessRunner {
    * Runs the specified command with arguments asynchronously and
    * capture stdout as well as stderr.
    *
-   * If not set to infinite, after timeout is reached the process is killed.
+   * If not set to infinite, after timeout is reached the process is terminated.
    *
    * Be cautious with the execution context you pass because the command
    * is blocking.
@@ -52,17 +52,87 @@ trait ProcessRunner {
         case _                 => None
       }
 
-      (process.exitValue(), out.mkString("\n"), err.mkString("\n"), scheduled)
+      (ExitStatus(process.exitValue()), out.mkString("\n"), err.mkString("\n"), scheduled)
     }).flatMap {
-      case (0, stdout, _, scheduled) =>
+      case (ExitStatus(0), stdout, _, scheduled) =>
         scheduled.foreach(_.cancel())
         Future.successful(stdout)
-      case (code, stdout, stderr, scheduled) =>
+      case (exitStatus, stdout, stderr, scheduled) =>
         scheduled.foreach(_.cancel())
-        Future.failed(ProcessRunningException(code, stdout, stderr))
+        timeout match {
+          case _: FiniteDuration if exitStatus.terminatedBySIGTERM =>
+            Future.failed(ProcessTimeoutException(timeout, exitStatus, stdout, stderr))
+          case _ => Future.failed(ProcessUnsuccessfulException(exitStatus, stdout, stderr))
+        }
+    }
+}
+
+object ExitStatus {
+  // Based on The Open Group Base Specifications Issue 7, 2018 edition:
+  // Shell & Utilities - Shell Command Language - 2.8.2 Exit Status for Commands
+  // http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_08_02
+  val STATUS_SUCCESSFUL = 0
+  val STATUS_NOT_EXECUTABLE = 126
+  val STATUS_NOT_FOUND = 127
+  // When a command is stopped by a signal, the exit status is 128 + signal numer
+  val STATUS_SIGNAL = 128
+
+  // Based on The Open Group Base Specifications Issue 7, 2018 edition:
+  // Shell & Utilities - Utilities - kill
+  // http://pubs.opengroup.org/onlinepubs/9699919799/utilities/kill.html
+  val SIGHUP = 1
+  val SIGINT = 2
+  val SIGQUIT = 3
+  val SIGABRT = 6
+  val SIGKILL = 9
+  val SIGALRM = 14
+  val SIGTERM = 15
+}
+
+case class ExitStatus(statusValue: Int) {
+
+  import ExitStatus._
+
+  override def toString(): String = {
+    def signalAsString(signal: Int): String = {
+      signal match {
+        case SIGHUP  => "SIGHUP"
+        case SIGINT  => "SIGINT"
+        case SIGQUIT => "SIGQUIT"
+        case SIGABRT => "SIGABRT"
+        case SIGKILL => "SIGKILL"
+        case SIGALRM => "SIGALRM"
+        case SIGTERM => "SIGTERM"
+        case _       => signal.toString
+      }
+    }
+
+    val detail = statusValue match {
+      case STATUS_SUCCESSFUL     => "successful"
+      case STATUS_NOT_EXECUTABLE => "not executable"
+      case STATUS_NOT_FOUND      => "not found"
+      case _ if statusValue >= ExitStatus.STATUS_SIGNAL =>
+        "terminated by signal " + signalAsString(statusValue - ExitStatus.STATUS_SIGNAL)
+      case _ => "unsuccessful"
     }
 
+    s"$statusValue ($detail)"
+  }
+
+  val successful = statusValue == ExitStatus.STATUS_SUCCESSFUL
+  val terminatedBySIGTERM = (statusValue - ExitStatus.STATUS_SIGNAL) == ExitStatus.SIGTERM
 }
 
-case class ProcessRunningException(exitCode: Int, stdout: String, stderr: String)
-    extends Exception(s"code: $exitCode ${if (exitCode == 143) "(killed)" else ""}, stdout: $stdout, stderr: $stderr")
+abstract class ProcessRunningException(info: String, val exitStatus: ExitStatus, val stdout: String, val stderr: String)
+    extends Exception(s"info: $info, code: $exitStatus, stdout: $stdout, stderr: $stderr")
+
+case class ProcessUnsuccessfulException(override val exitStatus: ExitStatus,
+                                        override val stdout: String,
+                                        override val stderr: String)
+    extends ProcessRunningException("command was unsuccessful", exitStatus, stdout, stderr)
+
+case class ProcessTimeoutException(timeout: Duration,
+                                   override val exitStatus: ExitStatus,
+                                   override val stdout: String,
+                                   override val stderr: String)
+    extends ProcessRunningException(s"command was terminated, took longer than $timeout", exitStatus, stdout, stderr)
diff --git a/docs/metrics.md b/docs/metrics.md
index 9cfd887e5d..4bf352063e 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -182,34 +182,40 @@ Metrics below are for invoker state as recorded within load balancer monitoring.
 
 Following metrics capture stats around various docker command executions.
 
-* Pause
+* pause
   * `openwhisk.counter.invoker_docker.pause_start`
   * `openwhisk.counter.invoker_docker.pause_error`
+  * `openwhisk.counter.invoker_docker.pause_timeout`
   * `openwhisk.histogram.invoker_docker.pause_finish`
   * `openwhisk.histogram.invoker_docker.pause_error`
-* Ps
+* ps
   * `openwhisk.counter.invoker_docker.ps_start`
   * `openwhisk.counter.invoker_docker.ps_error`
+  * `openwhisk.counter.invoker_docker.ps_timeout`
   * `openwhisk.histogram.invoker_docker.ps_finish`
   * `openwhisk.histogram.invoker_docker.ps_error`
 * pull
   * `openwhisk.counter.invoker_docker.pull_start`
   * `openwhisk.counter.invoker_docker.pull_error`
+  * `openwhisk.counter.invoker_docker.pull_timeout`
   * `openwhisk.histogram.invoker_docker.pull_finish`
   * `openwhisk.histogram.invoker_docker.pull_error`
 * rm
   * `openwhisk.counter.invoker_docker.rm_start`
   * `openwhisk.counter.invoker_docker.rm_error`
+  * `openwhisk.counter.invoker_docker.rm_timeout`
   * `openwhisk.histogram.invoker_docker.rm_finish`
   * `openwhisk.histogram.invoker_docker.rm_error`
 * run
   * `openwhisk.counter.invoker_docker.run_start`
   * `openwhisk.counter.invoker_docker.run_error`
+  * `openwhisk.counter.invoker_docker.run_timeout`
   * `openwhisk.histogram.invoker_docker.run_finish`
   * `openwhisk.histogram.invoker_docker.run_error`
 * unpause
   * `openwhisk.counter.invoker_docker.unpause_start`
   * `openwhisk.counter.invoker_docker.unpause_error`
+  * `openwhisk.counter.invoker_docker.unpause_timeout`
   * `openwhisk.histogram.invoker_docker.unpause_finish`
   * `openwhisk.histogram.invoker_docker.unpause_error`
 
diff --git a/tests/src/test/scala/whisk/core/containerpool/docker/test/DockerClientTests.scala b/tests/src/test/scala/whisk/core/containerpool/docker/test/DockerClientTests.scala
index 200bc993b5..66637f89de 100644
--- a/tests/src/test/scala/whisk/core/containerpool/docker/test/DockerClientTests.scala
+++ b/tests/src/test/scala/whisk/core/containerpool/docker/test/DockerClientTests.scala
@@ -18,7 +18,6 @@
 package whisk.core.containerpool.docker.test
 
 import akka.actor.ActorSystem
-
 import java.util.concurrent.Semaphore
 
 import scala.concurrent.Await
@@ -41,10 +40,7 @@ import whisk.common.LoggingMarkers.INVOKER_DOCKER_CMD
 import whisk.common.TransactionId
 import whisk.core.containerpool.ContainerAddress
 import whisk.core.containerpool.ContainerId
-import whisk.core.containerpool.docker.BrokenDockerContainer
-import whisk.core.containerpool.docker.DockerClient
-import whisk.core.containerpool.docker.DockerContainerId
-import whisk.core.containerpool.docker.ProcessRunningException
+import whisk.core.containerpool.docker._
 import whisk.utils.retry
 
 @RunWith(classOf[JUnitRunner])
@@ -246,7 +242,7 @@ class DockerClientTests
         runCmdCount += 1
         println(s"runCmdCount=${runCmdCount}, args.last=${args.last}")
         runCmdCount match {
-          case 1 => Future.failed(ProcessRunningException(1, "", ""))
+          case 1 => Future.failed(ProcessUnsuccessfulException(ExitStatus(1), "", ""))
           case 2 => Future.successful(secondContainerId.asString)
           case _ => Future.failed(new Throwable())
         }
@@ -337,11 +333,11 @@ class DockerClientTests
     runAndVerify(dc.pull("image"), "pull")
   }
 
-  it should "fail with BrokenDockerContainer when run returns with exit code 125 and a container ID" in {
+  it should "fail with BrokenDockerContainer when run returns with exit status 125 and a container ID" in {
     val dc = dockerClient {
       Future.failed(
-        ProcessRunningException(
-          exitCode = 125,
+        ProcessUnsuccessfulException(
+          exitStatus = ExitStatus(125),
           stdout = id.asString,
           stderr =
             """/usr/bin/docker: Error response from daemon: mkdir /var/run/docker.1.1/libcontainerd.1.1/55db56ee082239428b27d3728b4dd324c09068458aad9825727d5bfc1bba6d52: no space left on device."""))
@@ -350,19 +346,44 @@ class DockerClientTests
     bdc.id shouldBe id
   }
 
-  it should "fail with ProcessRunningException when run returns with exit code !=125 or no container ID" in {
+  it should "fail with ProcessRunningException when run returns with exit code !=125, no container ID or timeout" in {
     def runAndVerify(pre: ProcessRunningException, clue: String) = {
       val dc = dockerClient { Future.failed(pre) }
-      withClue(s"${clue} - exitCode = ${pre.exitCode}, stdout = '${pre.stdout}', stderr = '${pre.stderr}': ") {
+      withClue(s"${clue} - exitStatus = ${pre.exitStatus}, stdout = '${pre.stdout}', stderr = '${pre.stderr}': ") {
         the[ProcessRunningException] thrownBy await(dc.run("image", Seq.empty)) shouldBe pre
       }
     }
 
     Seq[(ProcessRunningException, String)](
-      (ProcessRunningException(126, id.asString, "Unknown command"), "Exit code not 125"),
-      (ProcessRunningException(125, "", "Unknown flag: --foo"), "No container ID"),
-      (ProcessRunningException(1, "", ""), "Exit code not 125 and no container ID")).foreach {
-      case (pre, clue) => runAndVerify(pre, clue)
+      (ProcessUnsuccessfulException(ExitStatus(127), id.asString, "Unknown command"), "Exit code not 125"),
+      (ProcessUnsuccessfulException(ExitStatus(125), "", "Unknown flag: --foo"), "No container ID"),
+      (ProcessUnsuccessfulException(ExitStatus(1), "", ""), "Exit code not 125 and no container ID"),
+      (ProcessTimeoutException(1.second, ExitStatus(125), id.asString, ""), "Timeout instead of unsuccessful command"))
+      .foreach {
+        case (pre, clue) => runAndVerify(pre, clue)
+      }
+  }
+
+  it should "fail with ProcessTimeoutException when command times out" in {
+    val expectedPTE =
+      ProcessTimeoutException(timeout = 10.seconds, exitStatus = ExitStatus(143), stdout = "stdout", stderr = "stderr")
+    val dc = dockerClient {
+      Future.failed(expectedPTE)
     }
+    Seq[(Future[_], String)](
+      (dc.run("image", Seq.empty), "run"),
+      (dc.inspectIPAddress(id, "network"), "inspectIPAddress"),
+      (dc.pause(id), "pause"),
+      (dc.unpause(id), "unpause"),
+      (dc.rm(id), "rm"),
+      (dc.ps(), "ps"),
+      (dc.pull("image"), "pull"),
+      (dc.isOomKilled(id), "isOomKilled"))
+      .foreach {
+        case (cmd, clue) =>
+          withClue(s"command '$clue' - ") {
+            the[ProcessTimeoutException] thrownBy await(cmd) shouldBe expectedPTE
+          }
+      }
   }
 }
diff --git a/tests/src/test/scala/whisk/core/containerpool/docker/test/DockerContainerTests.scala b/tests/src/test/scala/whisk/core/containerpool/docker/test/DockerContainerTests.scala
index 6730e8caf5..c0e5e376d8 100644
--- a/tests/src/test/scala/whisk/core/containerpool/docker/test/DockerContainerTests.scala
+++ b/tests/src/test/scala/whisk/core/containerpool/docker/test/DockerContainerTests.scala
@@ -214,7 +214,7 @@ class DockerContainerTests
       override def run(image: String,
                        args: Seq[String] = Seq.empty[String])(implicit transid: TransactionId): Future[ContainerId] = {
         runs += ((image, args))
-        Future.failed(ProcessRunningException(1, "", ""))
+        Future.failed(ProcessUnsuccessfulException(ExitStatus(1), "", ""))
       }
     }
     implicit val runc = stub[RuncApi]
diff --git a/tests/src/test/scala/whisk/core/containerpool/docker/test/ProcessRunnerTests.scala b/tests/src/test/scala/whisk/core/containerpool/docker/test/ProcessRunnerTests.scala
index 4abe61fd5a..4527373ff7 100644
--- a/tests/src/test/scala/whisk/core/containerpool/docker/test/ProcessRunnerTests.scala
+++ b/tests/src/test/scala/whisk/core/containerpool/docker/test/ProcessRunnerTests.scala
@@ -26,13 +26,12 @@ import org.scalatest.FlatSpec
 import org.scalatest.junit.JUnitRunner
 
 import scala.concurrent.ExecutionContext.Implicits.global
-import whisk.core.containerpool.docker.ProcessRunner
+import whisk.core.containerpool.docker._
 
 import scala.concurrent.ExecutionContext
 import scala.concurrent.duration._
 import scala.concurrent.Await
 import org.scalatest.Matchers
-import whisk.core.containerpool.docker.ProcessRunningException
 
 import scala.language.reflectiveCalls // Needed to invoke run() method of structural ProcessRunner extension
 
@@ -55,18 +54,53 @@ class ProcessRunnerTests extends FlatSpec with Matchers with WskActorSystem {
   }
 
   it should "run an external command unsuccessfully and capture its output" in {
-    val exitCode = 1
+    val exitStatus = ExitStatus(1)
     val stdout = "Output"
     val stderr = "Error"
 
-    val future = processRunner.run(Seq("/bin/sh", "-c", s"echo ${stdout}; echo ${stderr} 1>&2; exit ${exitCode}"))
+    val future =
+      processRunner.run(Seq("/bin/sh", "-c", s"echo ${stdout}; echo ${stderr} 1>&2; exit ${exitStatus.statusValue}"))
 
-    the[ProcessRunningException] thrownBy await(future) shouldBe ProcessRunningException(exitCode, stdout, stderr)
+    val exception = the[ProcessRunningException] thrownBy await(future)
+    exception shouldBe ProcessUnsuccessfulException(exitStatus, stdout, stderr)
+    exception.getMessage should startWith("info: command was unsuccessful")
   }
 
   it should "terminate an external command after the specified timeout is reached" in {
-    val future = processRunner.run(Seq("sleep", "1"), 100.milliseconds)
-    val exception = the[ProcessRunningException] thrownBy await(future)
-    exception.exitCode shouldBe 143
+    val timeout = 100.milliseconds
+    // Run "sleep" command for 1 second and make sure that stdout and stderr are dropped
+    val future = processRunner.run(Seq("/bin/sh", "-c", "sleep 1 1>/dev/null 2>/dev/null"), timeout)
+    val exception = the[ProcessTimeoutException] thrownBy await(future)
+    exception shouldBe ProcessTimeoutException(timeout, ExitStatus(143), "", "")
+    exception.getMessage should startWith(s"info: command was terminated, took longer than $timeout")
+  }
+
+  behavior of "ExitStatus"
+
+  it should "provide a proper textual representation" in {
+    Seq[(Int, String)](
+      (0, "successful"),
+      (1, "unsuccessful"),
+      (125, "unsuccessful"),
+      (126, "not executable"),
+      (127, "not found"),
+      (128, "terminated by signal 0"),
+      (129, "terminated by signal SIGHUP"),
+      (130, "terminated by signal SIGINT"),
+      (131, "terminated by signal SIGQUIT"),
+      (134, "terminated by signal SIGABRT"),
+      (137, "terminated by signal SIGKILL"),
+      (142, "terminated by signal SIGALRM"),
+      (143, "terminated by signal SIGTERM"),
+      (144, "terminated by signal 16")).foreach {
+      case (statusValue, detailText) =>
+        ExitStatus(statusValue).toString shouldBe s"$statusValue ($detailText)"
+    }
+  }
+
+  it should "properly classify exit status" in {
+    withClue("Exit status 0 is successful - ") { ExitStatus(0).successful shouldBe true }
+    withClue("Exit status 1 is not successful - ") { ExitStatus(1).successful shouldBe false }
+    withClue("Exit status 143 means terminated by SIGTERM - ") { ExitStatus(143).terminatedBySIGTERM shouldBe true }
   }
 }
diff --git a/tests/src/test/scala/whisk/core/containerpool/kubernetes/test/KubernetesContainerTests.scala b/tests/src/test/scala/whisk/core/containerpool/kubernetes/test/KubernetesContainerTests.scala
index 227d3c9089..573b581dfe 100644
--- a/tests/src/test/scala/whisk/core/containerpool/kubernetes/test/KubernetesContainerTests.scala
+++ b/tests/src/test/scala/whisk/core/containerpool/kubernetes/test/KubernetesContainerTests.scala
@@ -173,7 +173,7 @@ class KubernetesContainerTests
         env: Map[String, String] = Map.empty,
         labels: Map[String, String] = Map.empty)(implicit transid: TransactionId): Future[KubernetesContainer] = {
         runs += ((name, image, env, labels))
-        Future.failed(ProcessRunningException(1, "", ""))
+        Future.failed(ProcessUnsuccessfulException(ExitStatus(1), "", ""))
       }
     }
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services