You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by gi...@apache.org on 2018/03/29 05:50:04 UTC
[3/5] mesos git commit: Added `--fetcher_stall_timeout` to abort
stalled artifact fetching.
Added `--fetcher_stall_timeout` to abort stalled artifact fetching.
This flag specifies a timeout for `mesos-fetcher` to wait before
aborting if the download speed keeps below 1 bytes/sec. This would avoid
containers to get stuck at FETCHING.
Review: https://reviews.apache.org/r/65856/
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/4971a277
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/4971a277
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/4971a277
Branch: refs/heads/master
Commit: 4971a277ddc943b8179cc71567e0412629ed0260
Parents: f7f3c22
Author: Chun-Hung Hsiao <ch...@apache.org>
Authored: Wed Mar 28 22:47:52 2018 -0700
Committer: Gilbert Song <so...@gmail.com>
Committed: Wed Mar 28 22:48:54 2018 -0700
----------------------------------------------------------------------
docs/configuration/agent.md | 12 ++++++++++
include/mesos/fetcher/fetcher.proto | 3 +++
src/launcher/fetcher.cpp | 40 +++++++++++++++++++++-----------
src/slave/constants.hpp | 3 +++
src/slave/containerizer/fetcher.cpp | 3 +++
src/slave/flags.cpp | 9 +++++++
src/slave/flags.hpp | 1 +
7 files changed, 58 insertions(+), 13 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/4971a277/docs/configuration/agent.md
----------------------------------------------------------------------
diff --git a/docs/configuration/agent.md b/docs/configuration/agent.md
index 18857e2..962211a 100644
--- a/docs/configuration/agent.md
+++ b/docs/configuration/agent.md
@@ -782,6 +782,18 @@ Size of the fetcher cache in Bytes. (default: 2GB)
</tr>
<tr>
<td>
+ --fetcher_stall_timeout=VALUE
+ </td>
+ <td>
+Amount of time for the fetcher to wait before considering a download
+being too slow and abort it when the download stalls (i.e., the speed
+keeps below one byte per second).
+<b>NOTE</b>: This feature only applies when downloading data from the net and
+does not apply to HDFS. (default: 1mins)
+ </td>
+</tr>
+<tr>
+ <td>
--frameworks_home=VALUE
</td>
<td>
http://git-wip-us.apache.org/repos/asf/mesos/blob/4971a277/include/mesos/fetcher/fetcher.proto
----------------------------------------------------------------------
diff --git a/include/mesos/fetcher/fetcher.proto b/include/mesos/fetcher/fetcher.proto
index 6a5d807..d668106 100644
--- a/include/mesos/fetcher/fetcher.proto
+++ b/include/mesos/fetcher/fetcher.proto
@@ -64,4 +64,7 @@ message FetcherInfo {
repeated Item items = 3;
optional string user = 4;
optional string frameworks_home = 5;
+
+ // Only applies when fetching artifacts from the net.
+ optional DurationInfo stall_timeout = 6;
}
http://git-wip-us.apache.org/repos/asf/mesos/blob/4971a277/src/launcher/fetcher.cpp
----------------------------------------------------------------------
diff --git a/src/launcher/fetcher.cpp b/src/launcher/fetcher.cpp
index 2f42fa2..8a8d7c3 100644
--- a/src/launcher/fetcher.cpp
+++ b/src/launcher/fetcher.cpp
@@ -167,7 +167,8 @@ static Try<string> downloadWithHadoopClient(
static Try<string> downloadWithNet(
const string& sourceUri,
- const string& destinationPath)
+ const string& destinationPath,
+ const Option<Duration>& stallTimeout)
{
// The net::download function only supports these protocols.
CHECK(strings::startsWith(sourceUri, "http://") ||
@@ -178,7 +179,7 @@ static Try<string> downloadWithNet(
LOG(INFO) << "Downloading resource from '" << sourceUri
<< "' to '" << destinationPath << "'";
- Try<int> code = net::download(sourceUri, destinationPath);
+ Try<int> code = net::download(sourceUri, destinationPath, stallTimeout);
if (code.isError()) {
return Error("Error downloading resource: " + code.error());
} else {
@@ -219,7 +220,8 @@ static Try<string> copyFile(
static Try<string> download(
const string& _sourceUri,
const string& destinationPath,
- const Option<string>& frameworksHome)
+ const Option<string>& frameworksHome,
+ const Option<Duration>& stallTimeout)
{
// Trim leading whitespace for 'sourceUri'.
const string sourceUri = strings::trim(_sourceUri, strings::PREFIX);
@@ -245,7 +247,7 @@ static Try<string> download(
// 2. Try to fetch URI using os::net / libcurl implementation.
// We consider http, https, ftp, ftps compatible with libcurl.
if (Fetcher::isNetUri(sourceUri)) {
- return downloadWithNet(sourceUri, destinationPath);
+ return downloadWithNet(sourceUri, destinationPath, stallTimeout);
}
// 3. Try to fetch the URI using hadoop client.
@@ -288,7 +290,8 @@ static Try<string> chmodExecutable(const string& filePath)
static Try<string> fetchBypassingCache(
const CommandInfo::URI& uri,
const string& sandboxDirectory,
- const Option<string>& frameworksHome)
+ const Option<string>& frameworksHome,
+ const Option<Duration>& stallTimeout)
{
LOG(INFO) << "Fetching directly into the sandbox directory";
@@ -317,7 +320,8 @@ static Try<string> fetchBypassingCache(
string path = path::join(sandboxDirectory, outputFile.get());
- Try<string> downloaded = download(uri.value(), path, frameworksHome);
+ Try<string> downloaded =
+ download(uri.value(), path, frameworksHome, stallTimeout);
if (downloaded.isError()) {
return Error(downloaded.error());
}
@@ -406,7 +410,8 @@ static Try<string> fetchThroughCache(
const FetcherInfo::Item& item,
const Option<string>& cacheDirectory,
const string& sandboxDirectory,
- const Option<string>& frameworksHome)
+ const Option<string>& frameworksHome,
+ const Option<Duration>& stallTimeout)
{
if (cacheDirectory.isNone() || cacheDirectory->empty()) {
return Error("Cache directory not specified");
@@ -430,7 +435,8 @@ static Try<string> fetchThroughCache(
Try<string> downloaded = download(
item.uri().value(),
path::join(cacheDirectory.get(), item.cache_filename()),
- frameworksHome);
+ frameworksHome,
+ stallTimeout);
if (downloaded.isError()) {
return Error(downloaded.error());
@@ -447,7 +453,8 @@ static Try<string> fetch(
const FetcherInfo::Item& item,
const Option<string>& cacheDirectory,
const string& sandboxDirectory,
- const Option<string>& frameworksHome)
+ const Option<string>& frameworksHome,
+ const Option<Duration>& stallTimeout)
{
LOG(INFO) << "Fetching URI '" << item.uri().value() << "'";
@@ -455,14 +462,16 @@ static Try<string> fetch(
return fetchBypassingCache(
item.uri(),
sandboxDirectory,
- frameworksHome);
+ frameworksHome,
+ stallTimeout);
}
return fetchThroughCache(
item,
cacheDirectory,
sandboxDirectory,
- frameworksHome);
+ frameworksHome,
+ stallTimeout);
}
@@ -593,10 +602,15 @@ int main(int argc, char* argv[])
? Option<string>::some(fetcherInfo->frameworks_home())
: Option<string>::none();
+ const Option<Duration> stallTimeout =
+ fetcherInfo->has_stall_timeout()
+ ? Nanoseconds(fetcherInfo->stall_timeout().nanoseconds())
+ : Option<Duration>::none();
+
// Fetch each URI to a local file and chmod if necessary.
foreach (const FetcherInfo::Item& item, fetcherInfo->items()) {
- Try<string> fetched =
- fetch(item, cacheDirectory, sandboxDirectory, frameworksHome);
+ Try<string> fetched = fetch(
+ item, cacheDirectory, sandboxDirectory, frameworksHome, stallTimeout);
if (fetched.isError()) {
EXIT(EXIT_FAILURE)
<< "Failed to fetch '" << item.uri().value() << "': " + fetched.error();
http://git-wip-us.apache.org/repos/asf/mesos/blob/4971a277/src/slave/constants.hpp
----------------------------------------------------------------------
diff --git a/src/slave/constants.hpp b/src/slave/constants.hpp
index f1fc2bf..d1d15c3 100644
--- a/src/slave/constants.hpp
+++ b/src/slave/constants.hpp
@@ -170,6 +170,9 @@ constexpr char EXECUTOR_HTTP_AUTHENTICATION_REALM[] = "mesos-agent-executor";
// Default maximum storage space to be used by the fetcher cache.
constexpr Bytes DEFAULT_FETCHER_CACHE_SIZE = Gigabytes(2);
+// Default timeout for the fetcher to wait when a net download stalls.
+constexpr Duration DEFAULT_FETCHER_STALL_TIMEOUT = Minutes(1);
+
// If no pings received within this timeout, then the slave will
// trigger a re-detection of the master to cause a re-registration.
Duration DEFAULT_MASTER_PING_TIMEOUT();
http://git-wip-us.apache.org/repos/asf/mesos/blob/4971a277/src/slave/containerizer/fetcher.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/fetcher.cpp b/src/slave/containerizer/fetcher.cpp
index f9ab554..7de57c2 100644
--- a/src/slave/containerizer/fetcher.cpp
+++ b/src/slave/containerizer/fetcher.cpp
@@ -561,6 +561,9 @@ Future<Nothing> FetcherProcess::__fetch(
info.set_frameworks_home(flags.frameworks_home);
}
+ info.mutable_stall_timeout()
+ ->set_nanoseconds(flags.fetcher_stall_timeout.ns());
+
return run(containerId, sandboxDirectory, user, info)
.repair(defer(self(), [=](const Future<Nothing>& future) {
++metrics.task_fetches_failed;
http://git-wip-us.apache.org/repos/asf/mesos/blob/4971a277/src/slave/flags.cpp
----------------------------------------------------------------------
diff --git a/src/slave/flags.cpp b/src/slave/flags.cpp
index 005ce7f..bdfc49a 100644
--- a/src/slave/flags.cpp
+++ b/src/slave/flags.cpp
@@ -248,6 +248,15 @@ mesos::internal::slave::Flags::Flags()
" each other when occupying a shared space (i.e. disk contention).",
path::join(os::temp(), "mesos", "fetch"));
+ add(&Flags::fetcher_stall_timeout,
+ "fetcher_stall_timeout",
+ "Amount of time for the fetcher to wait before considering a download\n"
+ "being too slow and abort it when the download stalls (i.e., the speed\n"
+ "keeps below one byte per second).\n"
+ "NOTE: This feature only applies when downloading data from the net and\n"
+ "does not apply to HDFS.",
+ DEFAULT_FETCHER_STALL_TIMEOUT);
+
add(&Flags::work_dir,
"work_dir",
"Path of the agent work directory. This is where executor sandboxes\n"
http://git-wip-us.apache.org/repos/asf/mesos/blob/4971a277/src/slave/flags.hpp
----------------------------------------------------------------------
diff --git a/src/slave/flags.hpp b/src/slave/flags.hpp
index 949a478..beae47f 100644
--- a/src/slave/flags.hpp
+++ b/src/slave/flags.hpp
@@ -65,6 +65,7 @@ public:
Option<std::string> attributes;
Bytes fetcher_cache_size;
std::string fetcher_cache_dir;
+ Duration fetcher_stall_timeout;
std::string work_dir;
std::string runtime_dir;
std::string launcher_dir;