You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by bm...@apache.org on 2015/09/15 02:26:54 UTC

[7/7] mesos git commit: Fixed the perf event isolator to continue sampling in the presence of failures.

Fixed the perf event isolator to continue sampling in the presence of failures.

Review: https://reviews.apache.org/r/38382


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/ddaa556d
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/ddaa556d
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/ddaa556d

Branch: refs/heads/master
Commit: ddaa556dde8f9937f89b9341d83b1ba1e9305ff4
Parents: 72f8452
Author: Benjamin Mahler <be...@gmail.com>
Authored: Mon Sep 14 14:45:47 2015 -0700
Committer: Benjamin Mahler <be...@gmail.com>
Committed: Mon Sep 14 17:05:29 2015 -0700

----------------------------------------------------------------------
 .../isolators/cgroups/perf_event.cpp            | 30 +++++++++-----------
 1 file changed, 14 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/ddaa556d/src/slave/containerizer/isolators/cgroups/perf_event.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/cgroups/perf_event.cpp b/src/slave/containerizer/isolators/cgroups/perf_event.cpp
index f6ab8bc..03035df 100644
--- a/src/slave/containerizer/isolators/cgroups/perf_event.cpp
+++ b/src/slave/containerizer/isolators/cgroups/perf_event.cpp
@@ -383,10 +383,8 @@ void CgroupsPerfEventIsolatorProcess::sample()
     }
   }
 
-  // The timeout includes an allowance of twice the process::reap
-  // interval to ensure we see the perf process exit. If the sample
-  // is not ready after the timeout something very unexpected has
-  // occurred so we discard it and halt all sampling.
+  // The discard timeout includes an allowance of twice the
+  // reaper interval to ensure we see the perf process exit.
   Duration timeout = flags.perf_duration + process::MAX_REAP_INTERVAL() * 2;
 
   perf::sample(events, cgroups, flags.perf_duration)
@@ -407,22 +405,22 @@ void CgroupsPerfEventIsolatorProcess::_sample(
     const Future<hashmap<string, PerfStatistics>>& statistics)
 {
   if (!statistics.isReady()) {
-    // Failure can occur for many reasons but all are unexpected and
-    // indicate something is not right so we'll stop sampling.
-    LOG(ERROR) << "Failed to get perf sample, sampling will be halted: "
+    // We continue sampling in case the failure is transient or due
+    // to a timeout. Since sampling is done on an interval, retrying
+    // is acceptable even if the failure is non-transient.
+    LOG(ERROR) << "Failed to get perf sample: "
                << (statistics.isFailed()
                    ? statistics.failure()
                    : "discarded due to timeout");
-    return;
-  }
-
-  // Store the latest statistics, note that cgroups added in the
-  // interim will be picked up by the next sample.
-  foreachvalue (Info* info, infos) {
-    CHECK_NOTNULL(info);
+  } else {
+    // Store the latest statistics. Note that cgroups added in the
+    // interim will be picked up by the next sample.
+    foreachvalue (Info* info, infos) {
+      CHECK_NOTNULL(info);
 
-    if (statistics->contains(info->cgroup)) {
-      info->statistics = statistics->get(info->cgroup).get();
+      if (statistics->contains(info->cgroup)) {
+        info->statistics = statistics->get(info->cgroup).get();
+      }
     }
   }