You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ji...@apache.org on 2015/10/26 19:40:25 UTC
[01/12] mesos git commit: Relocated MesosContainerizer specific files
to the correct location.
Repository: mesos
Updated Branches:
refs/heads/master 974906bf0 -> 9a722d742
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/store.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/store.cpp b/src/slave/containerizer/provisioner/docker/store.cpp
deleted file mode 100644
index 5034013..0000000
--- a/src/slave/containerizer/provisioner/docker/store.cpp
+++ /dev/null
@@ -1,277 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "slave/containerizer/provisioner/docker/store.hpp"
-
-#include <list>
-#include <vector>
-
-#include <glog/logging.h>
-
-#include <stout/json.hpp>
-#include <stout/os.hpp>
-#include <stout/result.hpp>
-
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/dispatch.hpp>
-#include <process/subprocess.hpp>
-
-#include "common/status_utils.hpp"
-
-#include "slave/containerizer/provisioner/docker/metadata_manager.hpp"
-#include "slave/containerizer/provisioner/docker/paths.hpp"
-#include "slave/containerizer/provisioner/docker/puller.hpp"
-
-#include "slave/flags.hpp"
-
-using namespace process;
-
-using std::list;
-using std::pair;
-using std::string;
-using std::vector;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-// Libprocess actor behind the Docker `Store`. `Store::recover()` and
-// `Store::get()` dispatch onto this process.
-class StoreProcess : public Process<StoreProcess>
-{
-public:
-  StoreProcess(
-      const Flags& _flags,
-      const Owned<MetadataManager>& _metadataManager,
-      const Owned<Puller>& _puller)
-    : flags(_flags),
-      metadataManager(_metadataManager),
-      puller(_puller) {}
-
-  ~StoreProcess() {}
-
-  // Delegates recovery to the metadata manager.
-  Future<Nothing> recover();
-
-  // Resolves a Docker image to the rootfs paths of its layers.
-  Future<vector<string>> get(const mesos::Image& image);
-
-private:
-  // Continuations of `get()`.
-  Future<Image> _get(
-      const Image::Name& name,
-      const Option<Image>& image);
-
-  Future<vector<string>> __get(const Image& image);
-
-  // Helpers used while pulling an image that is not yet stored.
-  Future<vector<string>> moveLayers(
-      const string& staging,
-      const list<pair<string, string>>& layerPaths);
-
-  Future<Nothing> moveLayer(const pair<string, string>& layerPath);
-
-  Future<Image> storeImage(
-      const Image::Name& name,
-      const vector<string>& layerIds);
-
-  const Flags flags;
-  Owned<MetadataManager> metadataManager;
-  Owned<Puller> puller;
-};
-
-
-// Builds a Docker store: creates the puller, makes sure the store and
-// staging directories exist, creates the metadata manager and wraps
-// everything in a spawned StoreProcess.
-Try<Owned<slave::Store>> Store::create(const Flags& flags)
-{
-  Try<Owned<Puller>> puller = Puller::create(flags);
-  if (puller.isError()) {
-    return Error("Failed to create Docker puller: " + puller.error());
-  }
-
-  const string storeDir = flags.docker_store_dir;
-  if (!os::exists(storeDir)) {
-    Try<Nothing> created = os::mkdir(storeDir);
-    if (created.isError()) {
-      return Error(
-          "Failed to create Docker store directory: " + created.error());
-    }
-  }
-
-  const string stagingDir = paths::getStagingDir(storeDir);
-  if (!os::exists(stagingDir)) {
-    Try<Nothing> created = os::mkdir(stagingDir);
-    if (created.isError()) {
-      return Error(
-          "Failed to create Docker store staging directory: " +
-          created.error());
-    }
-  }
-
-  Try<Owned<MetadataManager>> metadataManager = MetadataManager::create(flags);
-  if (metadataManager.isError()) {
-    return Error(metadataManager.error());
-  }
-
-  Owned<StoreProcess> storeProcess(
-      new StoreProcess(flags, metadataManager.get(), puller.get()));
-
-  return Owned<slave::Store>(new Store(storeProcess));
-}
-
-
-// Takes the given process and spawns it; a null process aborts via
-// CHECK_NOTNULL.
-Store::Store(const Owned<StoreProcess>& _process)
-  : process(_process)
-{
-  process::spawn(CHECK_NOTNULL(process.get()));
-}
-
-
-// Terminates the backing process and waits for it before the store
-// goes away.
-Store::~Store()
-{
-  StoreProcess* backing = process.get();
-
-  process::terminate(backing);
-  process::wait(backing);
-}
-
-
-// Forwards to StoreProcess::recover on the backing process.
-Future<Nothing> Store::recover()
-{
-  return dispatch(
-      process.get(),
-      &StoreProcess::recover);
-}
-
-
-// Forwards to StoreProcess::get on the backing process.
-Future<vector<string>> Store::get(const mesos::Image& image)
-{
-  return dispatch(
-      process.get(),
-      &StoreProcess::get,
-      image);
-}
-
-
-// Entry point for image lookups. Rejects non-Docker images, then asks
-// the metadata manager for the image and continues in `_get` / `__get`.
-Future<vector<string>> StoreProcess::get(const mesos::Image& image)
-{
-  const bool isDocker = (image.type() == mesos::Image::DOCKER);
-  if (!isDocker) {
-    return Failure("Docker provisioner store only supports Docker images");
-  }
-
-  Image::Name name = parseImageName(image.docker().name());
-
-  return metadataManager->get(name)
-    .then(defer(self(), &Self::_get, name, lambda::_1))
-    .then(defer(self(), &Self::__get, lambda::_1));
-}
-
-
-// Continuation of `get()`.
-//
-// If the metadata manager already knows the image it is returned as is.
-// Otherwise the image is pulled into a fresh temporary staging
-// directory, its layers are moved into the store and the image is
-// recorded via `storeImage`. The staging directory is removed once the
-// chain completes, whatever the outcome.
-Future<Image> StoreProcess::_get(
-    const Image::Name& name,
-    const Option<Image>& image)
-{
-  if (image.isSome()) {
-    return image.get();
-  }
-
-  Try<string> staging =
-    os::mkdtemp(paths::getStagingTempDir(flags.docker_store_dir));
-
-  if (staging.isError()) {
-    // Include the underlying reason so the failure can be diagnosed.
-    return Failure(
-        "Failed to create a staging directory: " + staging.error());
-  }
-
-  return puller->pull(name, staging.get())
-    .then(defer(self(), &Self::moveLayers, staging.get(), lambda::_1))
-    .then(defer(self(), &Self::storeImage, name, lambda::_1))
-    .onAny([staging]() {
-      Try<Nothing> rmdir = os::rmdir(staging.get());
-      if (rmdir.isError()) {
-        LOG(WARNING) << "Failed to remove staging directory: " << rmdir.error();
-      }
-    });
-}
-
-
-// Maps every layer id of `image` to that layer's rootfs path inside
-// the store directory, preserving the order of `layer_ids()`.
-Future<vector<string>> StoreProcess::__get(const Image& image)
-{
-  vector<string> rootfsPaths;
-
-  foreach (const string& layerId, image.layer_ids()) {
-    const string rootfs =
-      paths::getImageLayerRootfsPath(flags.docker_store_dir, layerId);
-
-    rootfsPaths.push_back(rootfs);
-  }
-
-  return rootfsPaths;
-}
-
-
-// Recovery is handled entirely by the metadata manager.
-Future<Nothing> StoreProcess::recover()
-{
-  Future<Nothing> recovered = metadataManager->recover();
-  return recovered;
-}
-
-
-// Moves every (layer id, layer path) pair into the store and, once all
-// moves have completed, yields the layer ids in their original order.
-// The `staging` argument is not used by this function.
-Future<vector<string>> StoreProcess::moveLayers(
-    const string& staging,
-    const list<pair<string, string>>& layerPaths)
-{
-  list<Future<Nothing>> moves;
-
-  foreach (const auto& entry, layerPaths) {
-    Future<Nothing> moved = moveLayer(entry);
-    moves.push_back(moved);
-  }
-
-  return collect(moves)
-    .then([layerPaths]() {
-      vector<string> ids;
-
-      foreach (const auto& entry, layerPaths) {
-        ids.push_back(entry.first);
-      }
-
-      return ids;
-    });
-}
-
-
-// Records the image (name plus ordered layer ids) in the metadata
-// manager.
-Future<Image> StoreProcess::storeImage(
-    const Image::Name& name,
-    const vector<string>& layerIds)
-{
-  Future<Image> stored = metadataManager->put(name, layerIds);
-  return stored;
-}
-
-
-// Moves a single pulled layer into the store.
-//
-// `layerPath.first` is the layer id and `layerPath.second` is the path
-// the layer currently lives at. Fails if that path does not exist, if
-// the per-layer directory cannot be created, or if the rename fails.
-Future<Nothing> StoreProcess::moveLayer(const pair<string, string>& layerPath)
-{
-  const string& layerId = layerPath.first;
-  const string& source = layerPath.second;
-
-  if (!os::exists(source)) {
-    return Failure("Unable to find layer '" + layerId + "' in '" +
-                   source + "'");
-  }
-
-  const string target =
-    paths::getImageLayerRootfsPath(flags.docker_store_dir, layerId);
-
-  // The store keeps one directory per layer id, so a layer pulled
-  // earlier (e.g. as part of another image) may already be present.
-  // Renaming a directory onto an existing non-empty directory fails,
-  // so treat an existing rootfs as "already stored" instead of failing
-  // the whole pull.
-  if (os::exists(target)) {
-    return Nothing();
-  }
-
-  const string imageLayerPath =
-    paths::getImageLayerPath(flags.docker_store_dir, layerId);
-
-  if (!os::exists(imageLayerPath)) {
-    Try<Nothing> mkdir = os::mkdir(imageLayerPath);
-    if (mkdir.isError()) {
-      return Failure("Failed to create layer path in store for id '" +
-                     layerId + "': " + mkdir.error());
-    }
-  }
-
-  Try<Nothing> status = os::rename(source, target);
-
-  if (status.isError()) {
-    return Failure("Failed to move layer '" + layerId +
-                   "' to store directory: " + status.error());
-  }
-
-  return Nothing();
-}
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/store.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/store.hpp b/src/slave/containerizer/provisioner/docker/store.hpp
deleted file mode 100644
index b5e6c87..0000000
--- a/src/slave/containerizer/provisioner/docker/store.hpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_DOCKER_STORE_HPP__
-#define __PROVISIONER_DOCKER_STORE_HPP__
-
-#include <string>
-
-#include <stout/try.hpp>
-
-#include <process/future.hpp>
-
-#include "slave/containerizer/provisioner/store.hpp"
-
-#include "slave/flags.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-// Forward Declarations.
-class Puller;
-class StoreProcess;
-
-
-// Docker image store: fetches Docker images and stores them on disk.
-// The work is carried out by a StoreProcess owned by this object.
-class Store : public slave::Store
-{
-public:
-  static Try<process::Owned<slave::Store>> create(const Flags& flags);
-
-  ~Store();
-
-  process::Future<Nothing> recover();
-
-  process::Future<std::vector<std::string>> get(const mesos::Image& image);
-
-private:
-  // Instances are only created through `create()`.
-  explicit Store(const process::Owned<StoreProcess>& _process);
-
-  // Neither copyable nor assignable.
-  Store(const Store&) = delete;
-  Store& operator=(const Store&) = delete;
-
-  process::Owned<StoreProcess> process;
-};
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_DOCKER_STORE_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/token_manager.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/token_manager.cpp b/src/slave/containerizer/provisioner/docker/token_manager.cpp
deleted file mode 100644
index 18b29c3..0000000
--- a/src/slave/containerizer/provisioner/docker/token_manager.cpp
+++ /dev/null
@@ -1,362 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <process/defer.hpp>
-#include <process/dispatch.hpp>
-
-#include "slave/containerizer/provisioner/docker/token_manager.hpp"
-
-using std::hash;
-using std::string;
-using std::vector;
-
-using process::Clock;
-using process::Failure;
-using process::Future;
-using process::Owned;
-using process::Process;
-using process::Time;
-
-using process::http::Request;
-using process::http::Response;
-using process::http::URL;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-namespace registry {
-
-// Libprocess actor that fetches tokens from the realm URL and caches
-// them per (service, scope).
-class TokenManagerProcess : public Process<TokenManagerProcess>
-{
-public:
-  static Try<Owned<TokenManagerProcess>> create(const URL& realm);
-
-  Future<Token> getToken(
-      const string& service,
-      const string& scope,
-      const Option<string>& account);
-
-private:
-  static const string TOKEN_PATH_PREFIX;
-  static const Duration RESPONSE_TIMEOUT;
-
-  TokenManagerProcess(const URL& realm)
-    : realm_(realm) {}
-
-  // Neither copyable nor assignable.
-  TokenManagerProcess(const TokenManagerProcess&) = delete;
-  TokenManagerProcess& operator=(const TokenManagerProcess&) = delete;
-
-  Try<Token> getTokenFromResponse(const Response& response) const;
-
-  /**
-   * Key for the token cache.
-   */
-  struct TokenCacheKey
-  {
-    string service;
-    string scope;
-  };
-
-  // Combines the hashes of both key fields.
-  struct TokenCacheKeyHash
-  {
-    size_t operator()(const TokenCacheKey& key) const
-    {
-      const size_t serviceHash = hash<string>()(key.service);
-      const size_t scopeHash = hash<string>()(key.scope);
-
-      return (serviceHash ^ (scopeHash << 1));
-    }
-  };
-
-  // Two keys are equal when both service and scope match.
-  struct TokenCacheKeyEqual
-  {
-    bool operator()(
-        const TokenCacheKey& left,
-        const TokenCacheKey& right) const
-    {
-      if (left.service != right.service) {
-        return false;
-      }
-
-      return left.scope == right.scope;
-    }
-  };
-
-  using TokenCacheType = hashmap<
-    const TokenCacheKey,
-    Token,
-    TokenCacheKeyHash,
-    TokenCacheKeyEqual>;
-
-  const URL realm_;
-  TokenCacheType tokenCache_;
-};
-
-// Path set on the realm URL when requesting a token.
-const string TokenManagerProcess::TOKEN_PATH_PREFIX = "/v2/token/";
-
-// How long to wait for the response to a token request.
-const Duration TokenManagerProcess::RESPONSE_TIMEOUT = Seconds(10);
-
-
-// Plain member-wise constructor; validation happens in `create()`.
-Token::Token(
-    const string& _raw,
-    const JSON::Object& _header,
-    const JSON::Object& _claims,
-    const Option<Time>& _expiration,
-    const Option<Time>& _notBefore)
-  : raw(_raw),
-    header(_header),
-    claims(_claims),
-    expiration(_expiration),
-    notBefore(_notBefore)
-{
-}
-
-
-// Parses a raw JSON Web Token of the form "<header>.<claims>.<signature>".
-//
-// The header and claims segments are base64-decoded and parsed as JSON
-// objects; the optional "exp" and "nbf" claims are converted to times.
-// Returns an Error if the string does not have three segments, if a
-// segment cannot be decoded or parsed, or if the token has already
-// expired. The signature segment is not inspected.
-//
-// TODO(josephw): Parse this string with some protobufs.
-Try<Token> Token::create(const string& raw)
-{
-  auto decode = [](
-      const string& segment) -> Try<JSON::Object> {
-    string paddedSegment(segment);
-
-    // JWT segments use the URL-safe base64 alphabet (RFC 7515), so
-    // map '-' and '_' back to the standard alphabet before decoding.
-    for (size_t i = 0; i < paddedSegment.length(); i++) {
-      if (paddedSegment[i] == '-') {
-        paddedSegment[i] = '+';
-      } else if (paddedSegment[i] == '_') {
-        paddedSegment[i] = '/';
-      }
-    }
-
-    // JWT segments also omit the trailing '=' padding. A remainder of
-    // 2 needs two pad characters and a remainder of 3 needs one; a
-    // remainder of 1 can never be produced by a base64 encoder.
-    const auto remainder = paddedSegment.length() % 4;
-    if (remainder == 1) {
-      return Error("Invalid base64 segment length");
-    }
-
-    if (remainder != 0) {
-      paddedSegment.append(4 - remainder, '=');
-    }
-
-    Try<string> decoded = base64::decode(paddedSegment);
-    if (decoded.isError()) {
-      return Error(decoded.error());
-    }
-
-    return JSON::parse<JSON::Object>(decoded.get());
-  };
-
-  const vector<string> tokens = strings::tokenize(raw, ".");
-
-  if (tokens.size() != 3) {
-    return Error("Invalid raw token string");
-  }
-
-  Try<JSON::Object> header = decode(tokens[0]);
-  if (header.isError()) {
-    return Error("Failed to decode 'header' segment: " + header.error());
-  }
-
-  Try<JSON::Object> claims = decode(tokens[1]);
-  if (claims.isError()) {
-    return Error("Failed to decode 'claims' segment: " + claims.error());
-  }
-
-  Result<Time> expirationTime = getTimeValue(claims.get(), "exp");
-  if (expirationTime.isError()) {
-    return Error("Failed to decode expiration time: " + expirationTime.error());
-  }
-
-  Option<Time> expiration;
-  if (expirationTime.isSome()) {
-    expiration = expirationTime.get();
-  }
-
-  Result<Time> notBeforeTime = getTimeValue(claims.get(), "nbf");
-  if (notBeforeTime.isError()) {
-    return Error("Failed to decode not-before time: " + notBeforeTime.error());
-  }
-
-  Option<Time> notBefore;
-  if (notBeforeTime.isSome()) {
-    notBefore = notBeforeTime.get();
-  }
-
-  Token token(raw, header.get(), claims.get(), expiration, notBefore);
-
-  if (token.isExpired()) {
-    return Error("Token has expired");
-  }
-
-  // TODO(jojy): Add signature validation.
-  return token;
-}
-
-
-// Looks up the numeric claim `key` in `object` and converts it to a
-// Time.
-//
-// Returns None if the claim is absent, the Time if it is present and
-// convertible, and an Error if the lookup itself fails or the value
-// cannot be turned into a Time.
-Result<Time> Token::getTimeValue(const JSON::Object& object, const string& key)
-{
-  Result<JSON::Number> jsonValue = object.find<JSON::Number>(key);
-
-  // Surface lookup errors instead of treating them like a missing
-  // claim; otherwise a malformed "exp" would yield a token that never
-  // expires.
-  if (jsonValue.isError()) {
-    return Error(
-        "Failed to find '" + key + "' in claims: " + jsonValue.error());
-  }
-
-  if (jsonValue.isNone()) {
-    return None();
-  }
-
-  Try<Time> time = Time::create(jsonValue.get().as<double>());
-  if (time.isError()) {
-    return Error("Failed to decode time: " + time.error());
-  }
-
-  return time.get();
-}
-
-
-// A token without an expiration time never expires; otherwise it is
-// expired once the clock has reached the expiration time.
-bool Token::isExpired() const
-{
-  if (!expiration.isSome()) {
-    return false;
-  }
-
-  return (Clock::now() >= expiration.get());
-}
-
-
-// A token is valid when it has not expired and, if it carries a
-// not-before time, the clock has reached that time.
-bool Token::isValid() const
-{
-  // TODO(jojy): Add signature validation.
-  if (isExpired()) {
-    return false;
-  }
-
-  if (!notBefore.isSome()) {
-    return true;
-  }
-
-  return (Clock::now() >= notBefore.get());
-}
-
-
-// Creates the backing process for `realm` and wraps it in a
-// TokenManager; any error from the process factory is passed through.
-Try<Owned<TokenManager>> TokenManager::create(
-    const URL& realm)
-{
-  Try<Owned<TokenManagerProcess>> created =
-    TokenManagerProcess::create(realm);
-
-  if (created.isError()) {
-    return Error(created.error());
-  }
-
-  return Owned<TokenManager>(new TokenManager(created.get()));
-}
-
-
-// Keeps the given process and spawns it; a null process aborts via
-// CHECK_NOTNULL.
-TokenManager::TokenManager(Owned<TokenManagerProcess>& process)
-  : process_(process)
-{
-  TokenManagerProcess* backing = CHECK_NOTNULL(process_.get());
-  spawn(backing);
-}
-
-
-// Terminates the backing process and waits for it to finish.
-TokenManager::~TokenManager()
-{
-  TokenManagerProcess* backing = process_.get();
-
-  terminate(backing);
-  process::wait(backing);
-}
-
-
-// Forwards the request to TokenManagerProcess::getToken on the backing
-// process.
-Future<Token> TokenManager::getToken(
-    const string& service,
-    const string& scope,
-    const Option<string>& account)
-{
-  TokenManagerProcess* backing = process_.get();
-
-  return dispatch(
-      backing,
-      &TokenManagerProcess::getToken,
-      service,
-      scope,
-      account);
-}
-
-
-Try<Owned<TokenManagerProcess>> TokenManagerProcess::create(const URL& realm)
-{
- return Owned<TokenManagerProcess>(new TokenManagerProcess(realm));
-}
-
-
-// Extracts the token from a token-server response.
-//
-// The response body is parsed as a JSON object whose "token" string
-// field is handed to `Token::create`. Returns an Error if the body is
-// not a JSON object, if the "token" field is missing or cannot be read
-// as a string, or if the token itself is rejected.
-Try<Token> TokenManagerProcess::getTokenFromResponse(
-    const Response& response) const
-{
-  Try<JSON::Object> tokenJSON = JSON::parse<JSON::Object>(response.body);
-  if (tokenJSON.isError()) {
-    return Error(tokenJSON.error());
-  }
-
-  Result<JSON::String> tokenString =
-    tokenJSON.get().find<JSON::String>("token");
-
-  if (tokenString.isError()) {
-    return Error(tokenString.error());
-  }
-
-  // `find` yields None when the field is absent; calling get() on it
-  // would be invalid, so report it as an error instead.
-  if (tokenString.isNone()) {
-    return Error("Failed to find 'token' in the response");
-  }
-
-  Try<Token> result = Token::create(tokenString.get().value);
-  if (result.isError()) {
-    return Error(result.error());
-  }
-
-  return result.get();
-}
-
-
-// Returns a token for (service, scope).
-//
-// A cached token is returned while it is still valid. Otherwise a GET
-// request is sent to the realm URL (path TOKEN_PATH_PREFIX, query
-// parameters "service", "scope" and optionally "account"); the parsed
-// token replaces any cached entry for the same key. The request fails
-// if no response arrives within RESPONSE_TIMEOUT or if the response
-// cannot be parsed into a token.
-//
-// NOTE(review): `account` is not part of the cache key, so requests
-// that differ only in account share a cache entry - confirm this is
-// intended.
-Future<Token> TokenManagerProcess::getToken(
-    const string& service,
-    const string& scope,
-    const Option<string>& account)
-{
-  const TokenCacheKey tokenKey = {service, scope};
-
-  if (tokenCache_.contains(tokenKey)) {
-    Token token = tokenCache_.at(tokenKey);
-
-    if (token.isValid()) {
-      return token;
-    } else {
-      LOG(WARNING) << "Cached token was invalid. Will fetch once again";
-    }
-  }
-
-  URL tokenUrl = realm_;
-  tokenUrl.path = TOKEN_PATH_PREFIX;
-
-  tokenUrl.query = {
-    {"service", service},
-    {"scope", scope},
-  };
-
-  if (account.isSome()) {
-    tokenUrl.query.insert({"account", account.get()});
-  }
-
-  return process::http::get(tokenUrl, None())
-    .after(RESPONSE_TIMEOUT, [] (Future<Response> resp) -> Future<Response> {
-      resp.discard();
-      return Failure("Timeout waiting for response to token request");
-    })
-    .then(defer(self(), [this, tokenKey](
-        const Future<Response>& response) -> Future<Token> {
-      Try<Token> token = getTokenFromResponse(response.get());
-      if (token.isError()) {
-        return Failure(
-            "Failed to parse JSON Web Token object from response: " +
-            token.error());
-      }
-
-      // insert() leaves an existing entry untouched, so a stale token
-      // would stay cached forever and every later call would refetch.
-      // Drop any old entry first so the fresh token takes its place.
-      tokenCache_.erase(tokenKey);
-      tokenCache_.insert({tokenKey, token.get()});
-
-      return token.get();
-    }));
-}
-
-// TODO(jojy): Add implementation for basic authentication based getToken API.
-
-} // namespace registry {
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/token_manager.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/token_manager.hpp b/src/slave/containerizer/provisioner/docker/token_manager.hpp
deleted file mode 100644
index 2f4abff..0000000
--- a/src/slave/containerizer/provisioner/docker/token_manager.hpp
+++ /dev/null
@@ -1,179 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_DOCKER_TOKEN_MANAGER_HPP__
-#define __PROVISIONER_DOCKER_TOKEN_MANAGER_HPP__
-
-#include <functional>
-#include <string>
-
-#include <stout/base64.hpp>
-#include <stout/duration.hpp>
-#include <stout/hashmap.hpp>
-#include <stout/strings.hpp>
-
-#include <process/future.hpp>
-#include <process/http.hpp>
-#include <process/process.hpp>
-#include <process/time.hpp>
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-namespace registry {
-
-
-/**
- * Encapsulates JSON Web Token.
- *
- * Reference: https://tools.ietf.org/html/rfc7519.
- */
-struct Token
-{
- /**
- * Factory method for Token object.
- *
- * Parses the raw token string and validates for token's expiration.
- *
- * @returns Token if parsing and validation succeeds.
- * Error if parsing or validation fails.
- */
- static Try<Token> create(const std::string& rawString);
-
- /**
- * Compares token's expiration time (expressed in seconds) with current time.
- *
- * @returns True if the current time is at or past the token's expiration
- * time. False if the token has not expired yet or carries no
- * expiration time.
- */
- bool isExpired() const;
-
- /**
- * Validates the token if its "exp" "nbf" values are in range.
- *
- * @returns True if current time is within token's "exp" and "nbf" values.
- * False if current time is not within token's "exp" and "nbf"
- * values.
- */
- bool isValid() const;
-
- const std::string raw;
- const JSON::Object header;
- const JSON::Object claims;
- // TODO(jojy): Add signature information.
-
-private:
- Token(
- const std::string& raw,
- const JSON::Object& headerJson,
- const JSON::Object& claimsJson,
- const Option<process::Time>& expireTime,
- const Option<process::Time>& notBeforeTime);
-
- static Result<process::Time> getTimeValue(
- const JSON::Object& object,
- const std::string& key);
-
- const Option<process::Time> expiration;
- const Option<process::Time> notBefore;
-};
-
-
-// Forward declaration.
-class TokenManagerProcess;
-
-
-/**
- * Acquires and manages docker registry tokens. It keeps the tokens in its
- * cache to serve any future request for the same token.
- * The cache grows unbounded.
- * TODO(jojy): The cache can be optimized to prune based on the expiry time of
- * the token and server's issue time.
- */
-class TokenManager
-{
-public:
- /**
- * Factory method for creating TokenManager object.
- *
- * TokenManager and registry authorization realm has a 1:1 relationship.
- *
- * @param realm URL of the authorization server from where token will be
- * requested by this TokenManager.
- * @returns Owned<TokenManager> if success.
- * Error on failure.
- */
- static Try<process::Owned<TokenManager>> create(
- const process::http::URL& realm);
-
- /**
- * Returns JSON Web Token from cache or from remote server using "Basic
- * authorization".
- *
- * NOTE: token_manager.cpp does not define this overload yet (see the
- * TODO at the end of that file).
- *
- * @param service Name of the service that hosts the resource for which
- * token is being requested.
- * @param scope unique scope returned by the 401 Unauthorized response
- * from the registry.
- * @param account Name of the account which the client is acting as.
- * @param user base64 encoded userid for basic authorization.
- * @param password base64 encoded password for basic authorization.
- * @returns Token struct that encapsulates JSON Web Token.
- */
- process::Future<Token> getToken(
- const std::string& service,
- const std::string& scope,
- const Option<std::string>& account,
- const std::string& user,
- const Option<std::string>& password);
-
- /**
- * Returns JSON Web Token from cache or from remote server using "TLS/Cert"
- * based authorization.
- *
- * @param service Name of the service that hosts the resource for which
- * token is being requested.
- * @param scope unique scope returned by the 401 Unauthorized response
- * from the registry.
- * @param account Name of the account which the client is acting as.
- * @returns Token struct that encapsulates JSON Web Token.
- */
- process::Future<Token> getToken(
- const std::string& service,
- const std::string& scope,
- const Option<std::string>& account);
-
- ~TokenManager();
-
-private:
- TokenManager(process::Owned<TokenManagerProcess>& process);
-
- TokenManager(const TokenManager&) = delete;
- TokenManager& operator=(const TokenManager&) = delete;
-
- process::Owned<TokenManagerProcess> process_;
-};
-
-} // namespace registry {
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_DOCKER_TOKEN_MANAGER_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/paths.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/paths.cpp b/src/slave/containerizer/provisioner/paths.cpp
deleted file mode 100644
index 8d0f7d3..0000000
--- a/src/slave/containerizer/provisioner/paths.cpp
+++ /dev/null
@@ -1,192 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <list>
-
-#include <glog/logging.h>
-
-#include <mesos/type_utils.hpp>
-
-#include <stout/os.hpp>
-#include <stout/path.hpp>
-
-#include <stout/os/stat.hpp>
-
-#include "slave/paths.hpp"
-
-#include "slave/containerizer/provisioner/paths.hpp"
-
-using std::list;
-using std::string;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace provisioner {
-namespace paths {
-
-// <provisionerDir>/containers
-static string getContainersDir(const string& provisionerDir)
-{
-  const string containersDir = path::join(provisionerDir, "containers");
-  return containersDir;
-}
-
-
-// <containerDir>/backends
-static string getBackendsDir(const string& containerDir)
-{
-  const string backendsDir = path::join(containerDir, "backends");
-  return backendsDir;
-}
-
-
-// <backendsDir>/<backend>
-static string getBackendDir(const string& backendsDir, const string& backend)
-{
-  const string backendDir = path::join(backendsDir, backend);
-  return backendDir;
-}
-
-
-// <backendDir>/rootfses
-static string getRootfsesDir(const string& backendDir)
-{
-  const string rootfsesDir = path::join(backendDir, "rootfses");
-  return rootfsesDir;
-}
-
-
-// <rootfsesDir>/<rootfsId>
-static string getRootfsDir(const string& rootfsesDir, const string& rootfsId)
-{
-  const string rootfsDir = path::join(rootfsesDir, rootfsId);
-  return rootfsDir;
-}
-
-
-// <provisionerDir>/containers/<containerId>
-string getContainerDir(
-    const string& provisionerDir,
-    const ContainerID& containerId)
-{
-  const string containersDir = getContainersDir(provisionerDir);
-
-  return path::join(containersDir, containerId.value());
-}
-
-
-// Builds the rootfs path for a container by descending one directory
-// level at a time:
-//   <container dir>/backends/<backend>/rootfses/<rootfsId>
-string getContainerRootfsDir(
-    const string& provisionerDir,
-    const ContainerID& containerId,
-    const string& backend,
-    const string& rootfsId)
-{
-  const string containerDir = getContainerDir(provisionerDir, containerId);
-  const string backendsDir = getBackendsDir(containerDir);
-  const string backendDir = getBackendDir(backendsDir, backend);
-  const string rootfsesDir = getRootfsesDir(backendDir);
-
-  return getRootfsDir(rootfsesDir, rootfsId);
-}
-
-
-// Lists the container ids found under the provisioner's containers
-// directory. A missing directory yields an empty set; entries that are
-// not directories are skipped with a warning.
-Try<hashset<ContainerID>> listContainers(
-    const string& provisionerDir)
-{
-  hashset<ContainerID> containers;
-
-  const string containersDir = getContainersDir(provisionerDir);
-  if (!os::exists(containersDir)) {
-    // No container has been created yet.
-    return containers;
-  }
-
-  Try<list<string>> entries = os::ls(containersDir);
-  if (entries.isError()) {
-    return Error("Unable to list the containers directory: " +
-                 entries.error());
-  }
-
-  foreach (const string& entry, entries.get()) {
-    const string entryPath = path::join(containersDir, entry);
-
-    if (os::stat::isdir(entryPath)) {
-      ContainerID containerId;
-      containerId.set_value(entry);
-      containers.insert(containerId);
-    } else {
-      LOG(WARNING) << "Ignoring unexpected container entry at: "
-                   << entryPath;
-    }
-  }
-
-  return containers;
-}
-
-
-// Lists the rootfses provisioned for a container, grouped by backend.
-//
-// Returns a map of backend name -> set of rootfs ids. Entries that are
-// not directories are skipped with a warning, as are backends that end
-// up with no rootfs. Returns an Error if the backends directory or a
-// backend's rootfses directory cannot be listed.
-Try<hashmap<string, hashset<string>>> listContainerRootfses(
-    const string& provisionerDir,
-    const ContainerID& containerId)
-{
-  hashmap<string, hashset<string>> results;
-
-  string backendsDir = getBackendsDir(
-      getContainerDir(
-          provisionerDir,
-          containerId));
-
-  Try<list<string>> backends = os::ls(backendsDir);
-  if (backends.isError()) {
-    return Error("Unable to list the container directory: " + backends.error());
-  }
-
-  foreach (const string& backend, backends.get()) {
-    string backendDir = getBackendDir(backendsDir, backend);
-    if (!os::stat::isdir(backendDir)) {
-      LOG(WARNING) << "Ignoring unexpected backend entry at: " << backendDir;
-      continue;
-    }
-
-    const string rootfsesDir = getRootfsesDir(backendDir);
-
-    Try<list<string>> rootfses = os::ls(rootfsesDir);
-    if (rootfses.isError()) {
-      return Error("Unable to list the backend directory: " + rootfses.error());
-    }
-
-    hashset<string> backendResults;
-
-    foreach (const string& rootfsId, rootfses.get()) {
-      string rootfs = getRootfsDir(rootfsesDir, rootfsId);
-
-      if (!os::stat::isdir(rootfs)) {
-        // Log the offending entry itself, not its backend directory.
-        LOG(WARNING) << "Ignoring unexpected rootfs entry at: " << rootfs;
-        continue;
-      }
-
-      backendResults.insert(rootfsId);
-    }
-
-    if (backendResults.empty()) {
-      LOG(WARNING) << "Ignoring a backend directory with no rootfs in it: "
-                   << backendDir;
-      continue;
-    }
-
-    // The rootfs directory has passed validation.
-    results.put(backend, backendResults);
-  }
-
-  return results;
-}
-
-} // namespace paths {
-} // namespace provisioner {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/paths.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/paths.hpp b/src/slave/containerizer/provisioner/paths.hpp
deleted file mode 100644
index 7ebd36d..0000000
--- a/src/slave/containerizer/provisioner/paths.hpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_PATHS_HPP__
-#define __PROVISIONER_PATHS_HPP__
-
-#include <string>
-
-#include <mesos/mesos.hpp>
-
-#include <stout/hashmap.hpp>
-#include <stout/hashset.hpp>
-#include <stout/try.hpp>
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace provisioner {
-namespace paths {
-
-// The provisioner rootfs directory is as follows:
-// <work_dir> ('--work_dir' flag)
-// |-- provisioner
-// |-- containers
-// |-- <container_id>
-// |-- backends
-// |-- <backend> (copy, bind, etc.)
-// |-- rootfses
-// |-- <rootfs_id> (the rootfs)
-//
-// There can be multiple backends due to the change of backend flags.
-// Under each backend a rootfs is identified by the 'rootfs_id' which
-// is a UUID.
-
-std::string getContainerDir(
- const std::string& provisionerDir,
- const ContainerID& containerId);
-
-
-std::string getContainerRootfsDir(
- const std::string& provisionerDir,
- const ContainerID& containerId,
- const std::string& backend,
- const std::string& rootfsId);
-
-
-// Recursively "ls" the container directory and return a map of
-// backend -> {rootfsId, ...}
-Try<hashmap<std::string, hashset<std::string>>>
-listContainerRootfses(
- const std::string& provisionerDir,
- const ContainerID& containerId);
-
-
-// Return a set of container IDs.
-Try<hashset<ContainerID>> listContainers(
- const std::string& provisionerDir);
-
-} // namespace paths {
-} // namespace provisioner {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_PATHS_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/provisioner.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/provisioner.cpp b/src/slave/containerizer/provisioner/provisioner.cpp
deleted file mode 100644
index 30a85a5..0000000
--- a/src/slave/containerizer/provisioner/provisioner.cpp
+++ /dev/null
@@ -1,444 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <mesos/type_utils.hpp>
-
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/dispatch.hpp>
-#include <process/process.hpp>
-
-#include <process/metrics/counter.hpp>
-#include <process/metrics/metrics.hpp>
-
-#include <stout/foreach.hpp>
-#include <stout/hashmap.hpp>
-#include <stout/hashset.hpp>
-#include <stout/os.hpp>
-#include <stout/stringify.hpp>
-#include <stout/uuid.hpp>
-
-#include "slave/paths.hpp"
-
-#include "slave/containerizer/provisioner/backend.hpp"
-#include "slave/containerizer/provisioner/paths.hpp"
-#include "slave/containerizer/provisioner/provisioner.hpp"
-#include "slave/containerizer/provisioner/store.hpp"
-
-using namespace process;
-
-using std::list;
-using std::string;
-using std::vector;
-
-using mesos::slave::ContainerState;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-class ProvisionerProcess : public Process<ProvisionerProcess>
-{
-public:
- ProvisionerProcess(
- const Flags& flags,
- const string& rootDir,
- const hashmap<Image::Type, Owned<Store>>& stores,
- const hashmap<string, Owned<Backend>>& backends);
-
- Future<Nothing> recover(
- const list<ContainerState>& states,
- const hashset<ContainerID>& orphans);
-
- Future<string> provision(
- const ContainerID& containerId,
- const Image& image);
-
- Future<bool> destroy(const ContainerID& containerId);
-
-private:
- Future<string> _provision(
- const ContainerID& containerId,
- const vector<string>& layers);
-
- Future<bool> _destroy(const ContainerID& containerId);
-
- const Flags flags;
-
- // Absolute path to the provisioner root directory. It can be
- // derived from '--work_dir' but we keep a separate copy here
- // because we converted it into an absolute path so managed rootfs
- // paths match the ones in 'mountinfo' (important if mount-based
- // backends are used).
- const string rootDir;
-
- const hashmap<Image::Type, Owned<Store>> stores;
- const hashmap<string, Owned<Backend>> backends;
-
- struct Info
- {
- // Mappings: backend -> {rootfsId, ...}
- hashmap<string, hashset<string>> rootfses;
- };
-
- hashmap<ContainerID, Owned<Info>> infos;
-
- struct Metrics
- {
- Metrics();
- ~Metrics();
-
- process::metrics::Counter remove_container_errors;
- } metrics;
-};
-
-
-Try<Owned<Provisioner>> Provisioner::create(
- const Flags& flags,
- Fetcher* fetcher)
-{
- string _rootDir = slave::paths::getProvisionerDir(flags.work_dir);
-
- Try<Nothing> mkdir = os::mkdir(_rootDir);
- if (mkdir.isError()) {
- return Error(
- "Failed to create provisioner root directory '" +
- _rootDir + "': " + mkdir.error());
- }
-
- Result<string> rootDir = os::realpath(_rootDir);
- if (rootDir.isError()) {
- return Error(
- "Failed to resolve the realpath of provisioner root directory '" +
- _rootDir + "': " + rootDir.error());
- }
-
- CHECK_SOME(rootDir); // Can't be None since we just created it.
-
- Try<hashmap<Image::Type, Owned<Store>>> stores = Store::create(flags);
- if (stores.isError()) {
- return Error("Failed to create image stores: " + stores.error());
- }
-
- hashmap<string, Owned<Backend>> backends = Backend::create(flags);
- if (backends.empty()) {
- return Error("No usable provisioner backend created");
- }
-
- if (!backends.contains(flags.image_provisioner_backend)) {
- return Error(
- "The specified provisioner backend '" +
- flags.image_provisioner_backend + "' is unsupported");
- }
-
- return Owned<Provisioner>(new Provisioner(
- Owned<ProvisionerProcess>(new ProvisionerProcess(
- flags,
- rootDir.get(),
- stores.get(),
- backends))));
-}
-
-
-Provisioner::Provisioner(Owned<ProvisionerProcess> _process)
- : process(_process)
-{
- spawn(CHECK_NOTNULL(process.get()));
-}
-
-
-Provisioner::~Provisioner()
-{
- if (process.get() != NULL) {
- terminate(process.get());
- wait(process.get());
- }
-}
-
-
-Future<Nothing> Provisioner::recover(
- const list<ContainerState>& states,
- const hashset<ContainerID>& orphans)
-{
- return dispatch(
- CHECK_NOTNULL(process.get()),
- &ProvisionerProcess::recover,
- states,
- orphans);
-}
-
-
-Future<string> Provisioner::provision(
- const ContainerID& containerId,
- const Image& image)
-{
- return dispatch(
- CHECK_NOTNULL(process.get()),
- &ProvisionerProcess::provision,
- containerId,
- image);
-}
-
-
-Future<bool> Provisioner::destroy(const ContainerID& containerId)
-{
- return dispatch(
- CHECK_NOTNULL(process.get()),
- &ProvisionerProcess::destroy,
- containerId);
-}
-
-
-ProvisionerProcess::ProvisionerProcess(
- const Flags& _flags,
- const string& _rootDir,
- const hashmap<Image::Type, Owned<Store>>& _stores,
- const hashmap<string, Owned<Backend>>& _backends)
- : flags(_flags),
- rootDir(_rootDir),
- stores(_stores),
- backends(_backends) {}
-
-
-Future<Nothing> ProvisionerProcess::recover(
- const list<ContainerState>& states,
- const hashset<ContainerID>& orphans)
-{
- // Register living containers, including the ones that do not
- // provision images.
- hashset<ContainerID> alive;
- foreach (const ContainerState& state, states) {
- alive.insert(state.container_id());
- }
-
- // List provisioned containers; recover living ones; destroy unknown
- // orphans. Note that known orphan containers are recovered as well
- // and they will be destroyed by the containerizer using the normal
- // cleanup path. See MESOS-2367 for details.
- Try<hashset<ContainerID>> containers =
- provisioner::paths::listContainers(rootDir);
-
- if (containers.isError()) {
- return Failure(
- "Failed to list the containers managed by the provisioner: " +
- containers.error());
- }
-
- // Scan the list of containers, register all of them with 'infos'
- // but mark unknown orphans for immediate cleanup.
- hashset<ContainerID> unknownOrphans;
-
- foreach (const ContainerID& containerId, containers.get()) {
- Owned<Info> info = Owned<Info>(new Info());
-
- Try<hashmap<string, hashset<string>>> rootfses =
- provisioner::paths::listContainerRootfses(rootDir, containerId);
-
- if (rootfses.isError()) {
- return Failure(
- "Unable to list rootfses belonged to container " +
- stringify(containerId) + ": " + rootfses.error());
- }
-
- foreachkey (const string& backend, rootfses.get()) {
- if (!backends.contains(backend)) {
- return Failure(
- "Found rootfses managed by an unrecognized backend: " + backend);
- }
-
- info->rootfses.put(backend, rootfses.get()[backend]);
- }
-
- infos.put(containerId, info);
-
- if (alive.contains(containerId) || orphans.contains(containerId)) {
- LOG(INFO) << "Recovered container " << containerId;
- continue;
- } else {
- // For immediate cleanup below.
- unknownOrphans.insert(containerId);
- }
- }
-
- // Cleanup unknown orphan containers' rootfses.
- list<Future<bool>> cleanups;
- foreach (const ContainerID& containerId, unknownOrphans) {
- LOG(INFO) << "Cleaning up unknown orphan container " << containerId;
- cleanups.push_back(destroy(containerId));
- }
-
- Future<Nothing> cleanup = collect(cleanups)
- .then([]() -> Future<Nothing> { return Nothing(); });
-
- // Recover stores.
- list<Future<Nothing>> recovers;
- foreachvalue (const Owned<Store>& store, stores) {
- recovers.push_back(store->recover());
- }
-
- Future<Nothing> recover = collect(recovers)
- .then([]() -> Future<Nothing> { return Nothing(); });
-
- // A successful provisioner recovery depends on:
- // 1) Recovery of living containers and known orphans (done above).
- // 2) Successful cleanup of unknown orphans.
- // 3) Successful store recovery.
- //
- // TODO(jieyu): Do not recover 'store' before unknown orphans are
- // cleaned up. In the future, we may want to cleanup unused rootfses
- // in 'store', which might fail if there still exist unknown orphans
- // holding references to them.
- return collect(cleanup, recover)
- .then([=]() -> Future<Nothing> {
- LOG(INFO) << "Provisioner recovery complete";
- return Nothing();
- });
-}
-
-
-Future<string> ProvisionerProcess::provision(
- const ContainerID& containerId,
- const Image& image)
-{
- if (!stores.contains(image.type())) {
- return Failure(
- "Unsupported container image type: " +
- stringify(image.type()));
- }
-
- // Get and then provision image layers from the store.
- return stores.get(image.type()).get()->get(image)
- .then(defer(self(), &Self::_provision, containerId, lambda::_1));
-}
-
-
-Future<string> ProvisionerProcess::_provision(
- const ContainerID& containerId,
- const vector<string>& layers)
-{
- // TODO(jieyu): Choose a backend smartly. For instance, if there is
- // only one layer returned from the store. prefer to use bind
- // backend because it's the simplest.
- const string& backend = flags.image_provisioner_backend;
- CHECK(backends.contains(backend));
-
- string rootfsId = UUID::random().toString();
-
- string rootfs = provisioner::paths::getContainerRootfsDir(
- rootDir,
- containerId,
- backend,
- rootfsId);
-
- LOG(INFO) << "Provisioning image rootfs '" << rootfs
- << "' for container " << containerId;
-
- // NOTE: It's likely that the container ID already exists in 'infos'
- // because one container might provision multiple images.
- if (!infos.contains(containerId)) {
- infos.put(containerId, Owned<Info>(new Info()));
- }
-
- infos[containerId]->rootfses[backend].insert(rootfsId);
-
- return backends.get(backend).get()->provision(layers, rootfs)
- .then([rootfs]() -> Future<string> { return rootfs; });
-}
-
-
-Future<bool> ProvisionerProcess::destroy(const ContainerID& containerId)
-{
- if (!infos.contains(containerId)) {
- LOG(INFO) << "Ignoring destroy request for unknown container "
- << containerId;
-
- return false;
- }
-
- // Unregister the container first. If destroy() fails, we can rely
- // on recover() to retry it later.
- Owned<Info> info = infos[containerId];
- infos.erase(containerId);
-
- list<Future<bool>> futures;
- foreachkey (const string& backend, info->rootfses) {
- if (!backends.contains(backend)) {
- return Failure("Unknown backend '" + backend + "'");
- }
-
- foreach (const string& rootfsId, info->rootfses[backend]) {
- string rootfs = provisioner::paths::getContainerRootfsDir(
- rootDir,
- containerId,
- backend,
- rootfsId);
-
- LOG(INFO) << "Destroying container rootfs at '" << rootfs
- << "' for container " << containerId;
-
- futures.push_back(backends.get(backend).get()->destroy(rootfs));
- }
- }
-
- // TODO(xujyan): Revisit the usefulness of this return value.
- return collect(futures)
- .then(defer(self(), &ProvisionerProcess::_destroy, containerId));
-}
-
-
-Future<bool> ProvisionerProcess::_destroy(const ContainerID& containerId)
-{
- // This should be fairly cheap as the directory should only
- // contain a few empty sub-directories at this point.
- //
- // TODO(jieyu): Currently, it's possible that some directories
- // cannot be removed due to EBUSY. EBUSY is caused by the race
- // between cleaning up this container and new containers copying
- // the host mount table. It's OK to ignore them. The cleanup
- // will be retried during slave recovery.
- string containerDir =
- provisioner::paths::getContainerDir(rootDir, containerId);
-
- Try<Nothing> rmdir = os::rmdir(containerDir);
- if (rmdir.isError()) {
- LOG(ERROR) << "Failed to remove the provisioned container directory "
- << "at '" << containerDir << "': " << rmdir.error();
-
- ++metrics.remove_container_errors;
- }
-
- return true;
-}
-
-
-ProvisionerProcess::Metrics::Metrics()
- : remove_container_errors(
- "containerizer/mesos/provisioner/remove_container_errors")
-{
- process::metrics::add(remove_container_errors);
-}
-
-
-ProvisionerProcess::Metrics::~Metrics()
-{
- process::metrics::remove(remove_container_errors);
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/provisioner.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/provisioner.hpp b/src/slave/containerizer/provisioner/provisioner.hpp
deleted file mode 100644
index 912fc5a..0000000
--- a/src/slave/containerizer/provisioner/provisioner.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_HPP__
-#define __PROVISIONER_HPP__
-
-#include <list>
-
-#include <mesos/resources.hpp>
-
-#include <mesos/slave/isolator.hpp> // For ContainerState.
-
-#include <stout/nothing.hpp>
-#include <stout/try.hpp>
-
-#include <process/future.hpp>
-#include <process/owned.hpp>
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/fetcher.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// Forward declaration.
-class ProvisionerProcess;
-
-
-class Provisioner
-{
-public:
- // Create the provisioner based on the specified flags.
- static Try<process::Owned<Provisioner>> create(
- const Flags& flags,
- Fetcher* fetcher);
-
- // NOTE: Made 'virtual' for mocking and testing.
- virtual ~Provisioner();
-
- // Recover root filesystems for containers from the run states and
- // the orphan containers (known to the launcher but not known to the
- // slave) detected by the launcher. This function is also
- // responsible for cleaning up any intermediate artifacts (e.g.
- // directories) to not leak anything.
- virtual process::Future<Nothing> recover(
- const std::list<mesos::slave::ContainerState>& states,
- const hashset<ContainerID>& orphans);
-
- // Provision a root filesystem for the container using the specified
- // image and return the absolute path to the root filesystem.
- virtual process::Future<std::string> provision(
- const ContainerID& containerId,
- const Image& image);
-
- // Destroy a previously provisioned root filesystem. Assumes that
- // all references (e.g., mounts, open files) to the provisioned
- // filesystem have been removed. Return false if there is no
- // provisioned root filesystem for the given container.
- virtual process::Future<bool> destroy(const ContainerID& containerId);
-
-protected:
- Provisioner() {} // For creating mock object.
-
-private:
- explicit Provisioner(process::Owned<ProvisionerProcess> process);
-
- Provisioner(const Provisioner&) = delete; // Not copyable.
- Provisioner& operator=(const Provisioner&) = delete; // Not assignable.
-
- process::Owned<ProvisionerProcess> process;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/store.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/store.cpp b/src/slave/containerizer/provisioner/store.cpp
deleted file mode 100644
index a8bc302..0000000
--- a/src/slave/containerizer/provisioner/store.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <string>
-
-#include <mesos/type_utils.hpp>
-
-#include <stout/error.hpp>
-#include <stout/foreach.hpp>
-#include <stout/strings.hpp>
-
-#include "slave/containerizer/provisioner/store.hpp"
-
-#include "slave/containerizer/provisioner/appc/store.hpp"
-
-#include "slave/containerizer/provisioner/docker/store.hpp"
-
-using namespace process;
-
-using std::string;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-Try<hashmap<Image::Type, Owned<Store>>> Store::create(const Flags& flags)
-{
- if (flags.image_providers.isNone()) {
- return hashmap<Image::Type, Owned<Store>>();
- }
-
- hashmap<Image::Type, Try<Owned<Store>>(*)(const Flags&)> creators;
- creators.put(Image::APPC, &appc::Store::create);
- creators.put(Image::DOCKER, &docker::Store::create);
-
- hashmap<Image::Type, Owned<Store>> stores;
-
- foreach (const string& type,
- strings::tokenize(flags.image_providers.get(), ",")) {
- Image::Type imageType;
- if (!Image::Type_Parse(strings::upper(type), &imageType)) {
- return Error("Unknown image type '" + type + "'");
- }
-
- if (!creators.contains(imageType)) {
- return Error("Unsupported image type '" + type + "'");
- }
-
- Try<Owned<Store>> store = creators[imageType](flags);
- if (store.isError()) {
- return Error(
- "Failed to create store for image type '" +
- type + "': " + store.error());
- }
-
- stores.put(imageType, store.get());
- }
-
- return stores;
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/store.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/store.hpp b/src/slave/containerizer/provisioner/store.hpp
deleted file mode 100644
index cf3e7d7..0000000
--- a/src/slave/containerizer/provisioner/store.hpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_STORE_HPP__
-#define __PROVISIONER_STORE_HPP__
-
-#include <string>
-#include <vector>
-
-#include <mesos/mesos.hpp>
-
-#include <process/future.hpp>
-#include <process/owned.hpp>
-
-#include <stout/try.hpp>
-
-#include "slave/flags.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// An image store abstraction that "stores" images. It serves as a
-// read-through cache (cache misses are fetched remotely and
-// transparently) for images.
-class Store
-{
-public:
- static Try<hashmap<Image::Type, process::Owned<Store>>> create(
- const Flags& flags);
-
- virtual ~Store() {}
-
- virtual process::Future<Nothing> recover() = 0;
-
- // Get the specified image (and all its recursive dependencies) as a
- // list of rootfs layers in the topological order (dependencies go
- // before dependents in the list). The images required to build this
- // list are either retrieved from the local cache or fetched
- // remotely.
- //
- // NOTE: The returned list should not have duplicates. e.g., in the
- // following scenario the result should be [C, B, D, A] (B before D
- // in this example is decided by the order in which A specifies its
- // dependencies).
- //
- // A --> B --> C
- // | ^
- // |---> D ----|
- //
- // The returned future fails if the requested image or any of its
- // dependencies cannot be found or failed to be fetched.
- virtual process::Future<std::vector<std::string>> get(const Image& image) = 0;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_STORE_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/tests/containerizer/filesystem_isolator_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/filesystem_isolator_tests.cpp b/src/tests/containerizer/filesystem_isolator_tests.cpp
index 1e332e0..39008f6 100644
--- a/src/tests/containerizer/filesystem_isolator_tests.cpp
+++ b/src/tests/containerizer/filesystem_isolator_tests.cpp
@@ -42,12 +42,12 @@
#ifdef __linux__
#include "slave/containerizer/linux_launcher.hpp"
-#include "slave/containerizer/isolators/filesystem/linux.hpp"
+#include "slave/containerizer/mesos/isolators/filesystem/linux.hpp"
#endif
#include "slave/containerizer/mesos/containerizer.hpp"
-#include "slave/containerizer/provisioner/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/paths.hpp"
#include "tests/flags.hpp"
#include "tests/mesos.hpp"
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/tests/containerizer/isolator_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/isolator_tests.cpp b/src/tests/containerizer/isolator_tests.cpp
index 237f3f2..4e1e90b 100644
--- a/src/tests/containerizer/isolator_tests.cpp
+++ b/src/tests/containerizer/isolator_tests.cpp
@@ -50,13 +50,13 @@
#include "slave/slave.hpp"
#ifdef __linux__
-#include "slave/containerizer/isolators/cgroups/constants.hpp"
-#include "slave/containerizer/isolators/cgroups/cpushare.hpp"
-#include "slave/containerizer/isolators/cgroups/mem.hpp"
-#include "slave/containerizer/isolators/cgroups/perf_event.hpp"
-#include "slave/containerizer/isolators/filesystem/shared.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/constants.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/cpushare.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/mem.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/perf_event.hpp"
+#include "slave/containerizer/mesos/isolators/filesystem/shared.hpp"
#endif // __linux__
-#include "slave/containerizer/isolators/posix.hpp"
+#include "slave/containerizer/mesos/isolators/posix.hpp"
#include "slave/containerizer/launcher.hpp"
#ifdef __linux__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/tests/containerizer/port_mapping_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/port_mapping_tests.cpp b/src/tests/containerizer/port_mapping_tests.cpp
index feca204..ae2c0e6 100644
--- a/src/tests/containerizer/port_mapping_tests.cpp
+++ b/src/tests/containerizer/port_mapping_tests.cpp
@@ -56,7 +56,7 @@
#include "slave/flags.hpp"
#include "slave/slave.hpp"
-#include "slave/containerizer/isolators/network/port_mapping.hpp"
+#include "slave/containerizer/mesos/isolators/network/port_mapping.hpp"
#include "slave/containerizer/fetcher.hpp"
#include "slave/containerizer/launcher.hpp"
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/tests/containerizer/provisioner.hpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/provisioner.hpp b/src/tests/containerizer/provisioner.hpp
index 54aab5f..507e141 100644
--- a/src/tests/containerizer/provisioner.hpp
+++ b/src/tests/containerizer/provisioner.hpp
@@ -26,7 +26,7 @@
#include <stout/hashmap.hpp>
#include <stout/stringify.hpp>
-#include "slave/containerizer/provisioner/provisioner.hpp"
+#include "slave/containerizer/mesos/provisioner/provisioner.hpp"
#include "tests/containerizer/rootfs.hpp"
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/tests/containerizer/provisioner_appc_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/provisioner_appc_tests.cpp b/src/tests/containerizer/provisioner_appc_tests.cpp
index 3da759a..213e9c0 100644
--- a/src/tests/containerizer/provisioner_appc_tests.cpp
+++ b/src/tests/containerizer/provisioner_appc_tests.cpp
@@ -31,11 +31,11 @@
#include "slave/paths.hpp"
-#include "slave/containerizer/provisioner/paths.hpp"
-#include "slave/containerizer/provisioner/provisioner.hpp"
+#include "slave/containerizer/mesos/provisioner/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/provisioner.hpp"
-#include "slave/containerizer/provisioner/appc/spec.hpp"
-#include "slave/containerizer/provisioner/appc/store.hpp"
+#include "slave/containerizer/mesos/provisioner/appc/spec.hpp"
+#include "slave/containerizer/mesos/provisioner/appc/store.hpp"
#include "tests/utils.hpp"
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/tests/containerizer/provisioner_backend_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/provisioner_backend_tests.cpp b/src/tests/containerizer/provisioner_backend_tests.cpp
index b72edc6..3347d58 100644
--- a/src/tests/containerizer/provisioner_backend_tests.cpp
+++ b/src/tests/containerizer/provisioner_backend_tests.cpp
@@ -29,8 +29,8 @@
#include "linux/fs.hpp"
#endif // __linux__
-#include "slave/containerizer/provisioner/backends/bind.hpp"
-#include "slave/containerizer/provisioner/backends/copy.hpp"
+#include "slave/containerizer/mesos/provisioner/backends/bind.hpp"
+#include "slave/containerizer/mesos/provisioner/backends/copy.hpp"
#include "tests/flags.hpp"
#include "tests/utils.hpp"
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/tests/containerizer/provisioner_docker_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/provisioner_docker_tests.cpp b/src/tests/containerizer/provisioner_docker_tests.cpp
index 01d3025..cf8aac3 100644
--- a/src/tests/containerizer/provisioner_docker_tests.cpp
+++ b/src/tests/containerizer/provisioner_docker_tests.cpp
@@ -37,12 +37,12 @@
#include <process/ssl/gtest.hpp>
-#include "slave/containerizer/provisioner/docker/metadata_manager.hpp"
-#include "slave/containerizer/provisioner/docker/paths.hpp"
-#include "slave/containerizer/provisioner/docker/registry_client.hpp"
-#include "slave/containerizer/provisioner/docker/spec.hpp"
-#include "slave/containerizer/provisioner/docker/store.hpp"
-#include "slave/containerizer/provisioner/docker/token_manager.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/metadata_manager.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/registry_client.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/spec.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/store.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/token_manager.hpp"
#include "tests/mesos.hpp"
#include "tests/utils.hpp"
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/tests/disk_quota_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/disk_quota_tests.cpp b/src/tests/disk_quota_tests.cpp
index 1ed1342..47a0640 100644
--- a/src/tests/disk_quota_tests.cpp
+++ b/src/tests/disk_quota_tests.cpp
@@ -40,7 +40,7 @@
#include "slave/containerizer/fetcher.hpp"
-#include "slave/containerizer/isolators/posix/disk.hpp"
+#include "slave/containerizer/mesos/isolators/posix/disk.hpp"
#include "slave/containerizer/mesos/containerizer.hpp"
[03/12] mesos git commit: Relocated MesosContainerizer specific files
to the correct location.
Posted by ji...@apache.org.
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/token_manager.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/token_manager.cpp b/src/slave/containerizer/mesos/provisioner/docker/token_manager.cpp
new file mode 100644
index 0000000..ad5b63f
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/token_manager.cpp
@@ -0,0 +1,362 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+
+#include "slave/containerizer/mesos/provisioner/docker/token_manager.hpp"
+
+using std::hash;
+using std::string;
+using std::vector;
+
+using process::Clock;
+using process::Failure;
+using process::Future;
+using process::Owned;
+using process::Process;
+using process::Time;
+
+using process::http::Request;
+using process::http::Response;
+using process::http::URL;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+namespace registry {
+
+// Libprocess actor backing 'TokenManager'. It requests tokens from a
+// single authorization realm and remembers them per (service, scope).
+class TokenManagerProcess : public Process<TokenManagerProcess>
+{
+public:
+  static Try<Owned<TokenManagerProcess>> create(const URL& realm);
+
+  Future<Token> getToken(
+      const string& service,
+      const string& scope,
+      const Option<string>& account);
+
+private:
+  static const string TOKEN_PATH_PREFIX;
+  static const Duration RESPONSE_TIMEOUT;
+
+  TokenManagerProcess(const URL& realm)
+    : realm_(realm) {}
+
+  TokenManagerProcess(const TokenManagerProcess&) = delete;
+  TokenManagerProcess& operator=(const TokenManagerProcess&) = delete;
+
+  Try<Token> getTokenFromResponse(const Response& response) const;
+
+  // Identifies a cached token by the service and scope it was
+  // requested for.
+  struct TokenCacheKey
+  {
+    string service;
+    string scope;
+  };
+
+  // Combines the hashes of both key fields.
+  struct TokenCacheKeyHash
+  {
+    size_t operator()(const TokenCacheKey& key) const
+    {
+      const size_t serviceHash = hash<string>()(key.service);
+      const size_t scopeHash = hash<string>()(key.scope);
+
+      return serviceHash ^ (scopeHash << 1);
+    }
+  };
+
+  // Two keys are equal when both of their fields match.
+  struct TokenCacheKeyEqual
+  {
+    bool operator()(
+        const TokenCacheKey& left,
+        const TokenCacheKey& right) const
+    {
+      if (left.service != right.service) {
+        return false;
+      }
+
+      return left.scope == right.scope;
+    }
+  };
+
+  using TokenCacheType = hashmap<
+      const TokenCacheKey,
+      Token,
+      TokenCacheKeyHash,
+      TokenCacheKeyEqual>;
+
+  const URL realm_;
+  TokenCacheType tokenCache_;
+};
+
+
+const string TokenManagerProcess::TOKEN_PATH_PREFIX = "/v2/token/";
+const Duration TokenManagerProcess::RESPONSE_TIMEOUT = Seconds(10);
+
+
+// Stores the raw token string together with its already decoded
+// header, claims and optional "exp" / "nbf" times.
+Token::Token(
+    const string& rawToken,
+    const JSON::Object& headerJson,
+    const JSON::Object& claimsJson,
+    const Option<Time>& expireTime,
+    const Option<Time>& notBeforeTime)
+  : raw(rawToken),
+    header(headerJson),
+    claims(claimsJson),
+    expiration(expireTime),
+    notBefore(notBeforeTime) {}
+
+
+// TODO(josephw): Parse this string with some protobufs.
+//
+// Parses a raw token of the form "<header>.<claims>.<signature>".
+// The header and claims segments are base64 decoded into JSON objects
+// and the optional "exp" and "nbf" claims are extracted. Returns an
+// Error if the string does not have three segments, if a segment
+// cannot be decoded, or if the token has already expired.
+Try<Token> Token::create(const string& raw)
+{
+  // Decodes a single base64 encoded segment into a JSON object.
+  auto decode = [](
+      const string& segment) -> Try<JSON::Object> {
+    // The segment may arrive without its trailing '=' characters, so
+    // pad it up to the next multiple of 4: a remainder of 2 needs
+    // "==" and a remainder of 3 needs "=".
+    //
+    // NOTE(review): RFC 7515 specifies the base64url alphabet for
+    // these segments; whether 'base64::decode' accepts '-' and '_'
+    // is not visible here and should be confirmed.
+    const size_t remainder = segment.length() % 4;
+    string paddedSegment(segment);
+
+    if (remainder != 0) {
+      paddedSegment.append(4 - remainder, '=');
+    }
+
+    Try<string> decoded = base64::decode(paddedSegment);
+    if (decoded.isError()) {
+      return Error(decoded.error());
+    }
+
+    return JSON::parse<JSON::Object>(decoded.get());
+  };
+
+  const vector<string> tokens = strings::tokenize(raw, ".");
+
+  if (tokens.size() != 3) {
+    return Error("Invalid raw token string");
+  }
+
+  Try<JSON::Object> header = decode(tokens[0]);
+  if (header.isError()) {
+    return Error("Failed to decode 'header' segment: " + header.error());
+  }
+
+  Try<JSON::Object> claims = decode(tokens[1]);
+  if (claims.isError()) {
+    return Error("Failed to decode 'claims' segment: " + claims.error());
+  }
+
+  Result<Time> expirationTime = getTimeValue(claims.get(), "exp");
+  if (expirationTime.isError()) {
+    return Error("Failed to decode expiration time: " + expirationTime.error());
+  }
+
+  Option<Time> expiration;
+  if (expirationTime.isSome()) {
+    expiration = expirationTime.get();
+  }
+
+  Result<Time> notBeforeTime = getTimeValue(claims.get(), "nbf");
+  if (notBeforeTime.isError()) {
+    return Error("Failed to decode not-before time: " + notBeforeTime.error());
+  }
+
+  Option<Time> notBefore;
+  if (notBeforeTime.isSome()) {
+    notBefore = notBeforeTime.get();
+  }
+
+  Token token(raw, header.get(), claims.get(), expiration, notBefore);
+
+  // Only expiration is enforced here; a token whose "nbf" lies in the
+  // future is still returned (see 'isValid').
+  if (token.isExpired()) {
+    return Error("Token has expired");
+  }
+
+  // TODO(jojy): Add signature validation.
+  return token;
+}
+
+
+// Looks up the numeric value stored under 'key' in 'object' and
+// converts it into a Time.
+//
+// Returns None if the key is absent, the Time if it is present and
+// can be converted, and an Error if the lookup itself fails or the
+// number cannot be converted.
+Result<Time> Token::getTimeValue(const JSON::Object& object, const string& key)
+{
+  Result<JSON::Number> jsonValue = object.find<JSON::Number>(key);
+
+  // Do not treat a failed lookup as "not provided": the caller
+  // distinguishes errors from absent values.
+  if (jsonValue.isError()) {
+    return Error(
+        "Failed to find '" + key + "': " + jsonValue.error());
+  }
+
+  if (jsonValue.isNone()) {
+    return None();
+  }
+
+  Try<Time> time = Time::create(jsonValue.get().as<double>());
+  if (time.isError()) {
+    return Error("Failed to decode time: " + time.error());
+  }
+
+  return time.get();
+}
+
+
+// A token without an expiration time never expires; otherwise it is
+// expired once the clock has reached that time.
+bool Token::isExpired() const
+{
+  if (expiration.isNone()) {
+    return false;
+  }
+
+  return Clock::now() >= expiration.get();
+}
+
+
+// A token is valid when it has not expired and, if it carries a
+// not-before time, the clock has reached that time.
+bool Token::isValid() const
+{
+  // TODO(jojy): Add signature validation.
+  if (isExpired()) {
+    return false;
+  }
+
+  if (notBefore.isNone()) {
+    return true;
+  }
+
+  return Clock::now() >= notBefore.get();
+}
+
+
+// Creates the backing process for 'realm' and wraps it in a
+// TokenManager; any creation error is forwarded unchanged.
+Try<Owned<TokenManager>> TokenManager::create(
+    const URL& realm)
+{
+  Try<Owned<TokenManagerProcess>> actor = TokenManagerProcess::create(realm);
+
+  if (actor.isSome()) {
+    return Owned<TokenManager>(new TokenManager(actor.get()));
+  }
+
+  return Error(actor.error());
+}
+
+
+// Takes the backing process and spawns it.
+TokenManager::TokenManager(Owned<TokenManagerProcess>& process)
+  : process_(process)
+{
+  TokenManagerProcess* const actor = CHECK_NOTNULL(process_.get());
+  spawn(actor);
+}
+
+
+// Terminates the backing process and waits for it to finish.
+TokenManager::~TokenManager()
+{
+  TokenManagerProcess* const actor = process_.get();
+
+  terminate(actor);
+  process::wait(actor);
+}
+
+
+// Forwards the request to the backing process.
+Future<Token> TokenManager::getToken(
+    const string& service,
+    const string& scope,
+    const Option<string>& account)
+{
+  TokenManagerProcess* const actor = process_.get();
+
+  return dispatch(
+      actor, &TokenManagerProcess::getToken, service, scope, account);
+}
+
+
+// Allocates a process bound to the given authorization realm.
+Try<Owned<TokenManagerProcess>> TokenManagerProcess::create(const URL& realm)
+{
+  Owned<TokenManagerProcess> actor(new TokenManagerProcess(realm));
+
+  return actor;
+}
+
+
+// Extracts the "token" string from the JSON body of 'response' and
+// parses it into a Token. Returns an Error if the body is not a JSON
+// object, if it has no "token" string, or if the token cannot be
+// parsed.
+Try<Token> TokenManagerProcess::getTokenFromResponse(
+    const Response& response) const
+{
+  Try<JSON::Object> tokenJSON = JSON::parse<JSON::Object>(response.body);
+  if (tokenJSON.isError()) {
+    return Error(tokenJSON.error());
+  }
+
+  Result<JSON::String> tokenString =
+    tokenJSON.get().find<JSON::String>("token");
+
+  if (tokenString.isError()) {
+    return Error(tokenString.error());
+  }
+
+  // A body without a "token" field must not reach 'get()' below.
+  if (tokenString.isNone()) {
+    return Error("Failed to find 'token' in the response");
+  }
+
+  Try<Token> result = Token::create(tokenString.get().value);
+  if (result.isError()) {
+    return Error(result.error());
+  }
+
+  return result.get();
+}
+
+
+// Returns a token for the given service and scope.
+//
+// A cached token is returned as long as it is still valid. Otherwise
+// a new token is requested from the realm, using 'service', 'scope'
+// and, if provided, 'account' as query parameters. The returned
+// future fails if no response arrives within RESPONSE_TIMEOUT or if
+// the response cannot be parsed into a token.
+Future<Token> TokenManagerProcess::getToken(
+    const string& service,
+    const string& scope,
+    const Option<string>& account)
+{
+  const TokenCacheKey tokenKey = {service, scope};
+
+  if (tokenCache_.contains(tokenKey)) {
+    Token token = tokenCache_.at(tokenKey);
+
+    if (token.isValid()) {
+      return token;
+    }
+
+    LOG(WARNING) << "Cached token was invalid. Will fetch once again";
+
+    // Drop the unusable entry so it does not linger in the cache.
+    tokenCache_.erase(tokenKey);
+  }
+
+  URL tokenUrl = realm_;
+  tokenUrl.path = TOKEN_PATH_PREFIX;
+
+  tokenUrl.query = {
+    {"service", service},
+    {"scope", scope},
+  };
+
+  if (account.isSome()) {
+    tokenUrl.query.insert({"account", account.get()});
+  }
+
+  return process::http::get(tokenUrl, None())
+    .after(RESPONSE_TIMEOUT, [] (Future<Response> resp) -> Future<Response> {
+      resp.discard();
+      return Failure("Timeout waiting for response to token request");
+    })
+    .then(defer(self(), [this, tokenKey](
+        const Future<Response>& response) -> Future<Token> {
+      Try<Token> token = getTokenFromResponse(response.get());
+      if (token.isError()) {
+        return Failure(
+            "Failed to parse JSON Web Token object from response: " +
+            token.error());
+      }
+
+      // 'insert' leaves an existing element untouched, so erase first
+      // to make sure the freshly fetched token is the one kept (e.g.
+      // when another request for the same key completed meanwhile).
+      tokenCache_.erase(tokenKey);
+      tokenCache_.insert({tokenKey, token.get()});
+
+      return token.get();
+    }));
+}
+
+// TODO(jojy): Add implementation for basic authentication based getToken API.
+
+} // namespace registry {
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/token_manager.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/token_manager.hpp b/src/slave/containerizer/mesos/provisioner/docker/token_manager.hpp
new file mode 100644
index 0000000..2f4abff
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/token_manager.hpp
@@ -0,0 +1,179 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_DOCKER_TOKEN_MANAGER_HPP__
+#define __PROVISIONER_DOCKER_TOKEN_MANAGER_HPP__
+
+#include <functional>
+#include <string>
+
+#include <stout/base64.hpp>
+#include <stout/duration.hpp>
+#include <stout/hashmap.hpp>
+#include <stout/strings.hpp>
+
+#include <process/future.hpp>
+#include <process/http.hpp>
+#include <process/process.hpp>
+#include <process/time.hpp>
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+namespace registry {
+
+
+/**
+ * Encapsulates JSON Web Token.
+ *
+ * Reference: https://tools.ietf.org/html/rfc7519.
+ */
+struct Token
+{
+ /**
+ * Factory method for Token object.
+ *
+ * Parses the raw token string and validates for token's expiration.
+ *
+ * @param rawString Raw token consisting of three '.' separated
+ * segments (header, claims, signature).
+ * @returns Token if parsing and validation succeeds.
+ * Error if parsing or validation fails.
+ */
+ static Try<Token> create(const std::string& rawString);
+
+ /**
+ * Compares token's expiration time(expressed in seconds) with current time.
+ *
+ * @returns True if token's expiration time is less than or equal to
+ * current time, i.e. the token has expired.
+ * False if token's expiration time is greater than current time
+ * or if the token has no expiration time.
+ */
+ bool isExpired() const;
+
+ /**
+ * Validates the token if its "exp" "nbf" values are in range.
+ *
+ * @returns True if the token has not expired and current time is not
+ * before its "nbf" value (if it has one).
+ * False if the token has expired or current time is before its
+ * "nbf" value.
+ */
+ bool isValid() const;
+
+ const std::string raw;
+ const JSON::Object header;
+ const JSON::Object claims;
+ // TODO(jojy): Add signature information.
+
+private:
+ Token(
+ const std::string& raw,
+ const JSON::Object& headerJson,
+ const JSON::Object& claimsJson,
+ const Option<process::Time>& expireTime,
+ const Option<process::Time>& notBeforeTime);
+
+ static Result<process::Time> getTimeValue(
+ const JSON::Object& object,
+ const std::string& key);
+
+ const Option<process::Time> expiration;
+ const Option<process::Time> notBefore;
+};
+
+
+// Forward declaration.
+class TokenManagerProcess;
+
+
+/**
+ * Acquires and manages docker registry tokens. It keeps the tokens in its
+ * cache to serve any future request for the same token.
+ * The cache grows unbounded.
+ * TODO(jojy): The cache can be optimized to prune based on the expiry time of
+ * the token and server's issue time.
+ */
+class TokenManager
+{
+public:
+ /**
+ * Factory method for creating TokenManager object.
+ *
+ * TokenManager and registry authorization realm has a 1:1 relationship.
+ *
+ * @param realm URL of the authorization server from where token will be
+ * requested by this TokenManager.
+ * @returns Owned<TokenManager> if success.
+ * Error on failure.
+ */
+ static Try<process::Owned<TokenManager>> create(
+ const process::http::URL& realm);
+
+ /**
+ * Returns JSON Web Token from cache or from remote server using "Basic
+ * authorization".
+ *
+ * NOTE: token_manager.cpp currently only carries a TODO for the
+ * implementation of this overload.
+ *
+ * @param service Name of the service that hosts the resource for which
+ * token is being requested.
+ * @param scope unique scope returned by the 401 Unauthorized response
+ * from the registry.
+ * @param account Name of the account which the client is acting as.
+ * @param user base64 encoded userid for basic authorization.
+ * @param password base64 encoded password for basic authorization.
+ * @returns Token struct that encapsulates JSON Web Token.
+ */
+ process::Future<Token> getToken(
+ const std::string& service,
+ const std::string& scope,
+ const Option<std::string>& account,
+ const std::string& user,
+ const Option<std::string>& password);
+
+ /**
+ * Returns JSON Web Token from cache or from remote server using "TLS/Cert"
+ * based authorization.
+ *
+ * @param service Name of the service that hosts the resource for which
+ * token is being requested.
+ * @param scope unique scope returned by the 401 Unauthorized response
+ * from the registry.
+ * @param account Name of the account which the client is acting as.
+ * @returns Token struct that encapsulates JSON Web Token.
+ */
+ process::Future<Token> getToken(
+ const std::string& service,
+ const std::string& scope,
+ const Option<std::string>& account);
+
+ ~TokenManager();
+
+private:
+ TokenManager(process::Owned<TokenManagerProcess>& process);
+
+ TokenManager(const TokenManager&) = delete;
+ TokenManager& operator=(const TokenManager&) = delete;
+
+ process::Owned<TokenManagerProcess> process_;
+};
+
+} // namespace registry {
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_DOCKER_TOKEN_MANAGER_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/paths.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/paths.cpp b/src/slave/containerizer/mesos/provisioner/paths.cpp
new file mode 100644
index 0000000..b019eb9
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/paths.cpp
@@ -0,0 +1,192 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <list>
+
+#include <glog/logging.h>
+
+#include <mesos/type_utils.hpp>
+
+#include <stout/os.hpp>
+#include <stout/path.hpp>
+
+#include <stout/os/stat.hpp>
+
+#include "slave/paths.hpp"
+
+#include "slave/containerizer/mesos/provisioner/paths.hpp"
+
+using std::list;
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace provisioner {
+namespace paths {
+
+// <provisioner_dir>/containers
+static string getContainersDir(const string& provisionerDir)
+{
+  return path::join(provisionerDir, "containers");
+}
+
+
+// <container_dir>/backends
+static string getBackendsDir(const string& containerDir)
+{
+  return path::join(containerDir, "backends");
+}
+
+
+// <backends_dir>/<backend>
+static string getBackendDir(const string& backendsDir, const string& name)
+{
+  return path::join(backendsDir, name);
+}
+
+
+// <backend_dir>/rootfses
+static string getRootfsesDir(const string& backendDir)
+{
+  return path::join(backendDir, "rootfses");
+}
+
+
+// <rootfses_dir>/<rootfs_id>
+static string getRootfsDir(const string& rootfsesDir, const string& rootfsId)
+{
+  return path::join(rootfsesDir, rootfsId);
+}
+
+
+// Returns <provisioner_dir>/containers/<container_id>.
+string getContainerDir(
+    const string& provisionerDir,
+    const ContainerID& containerId)
+{
+  const string containersDir = getContainersDir(provisionerDir);
+
+  return path::join(containersDir, containerId.value());
+}
+
+
+// Returns the rootfs directory of a container for the given backend
+// and rootfs ID by descending one level of the layout at a time.
+string getContainerRootfsDir(
+    const string& provisionerDir,
+    const ContainerID& containerId,
+    const string& backend,
+    const string& rootfsId)
+{
+  const string containerDir = getContainerDir(provisionerDir, containerId);
+  const string backendsDir = getBackendsDir(containerDir);
+  const string backendDir = getBackendDir(backendsDir, backend);
+  const string rootfsesDir = getRootfsesDir(backendDir);
+
+  return getRootfsDir(rootfsesDir, rootfsId);
+}
+
+
+// Returns the IDs of all containers that have a directory below the
+// provisioner's containers directory. Entries that are not
+// directories are skipped with a warning.
+Try<hashset<ContainerID>> listContainers(
+    const string& provisionerDir)
+{
+  hashset<ContainerID> containerIds;
+
+  const string containersDir = getContainersDir(provisionerDir);
+  if (!os::exists(containersDir)) {
+    // No container has been created yet.
+    return containerIds;
+  }
+
+  Try<list<string>> entries = os::ls(containersDir);
+  if (entries.isError()) {
+    return Error("Unable to list the containers directory: " +
+                 entries.error());
+  }
+
+  foreach (const string& entry, entries.get()) {
+    const string containerPath = path::join(containersDir, entry);
+
+    if (os::stat::isdir(containerPath)) {
+      ContainerID containerId;
+      containerId.set_value(entry);
+      containerIds.insert(containerId);
+    } else {
+      LOG(WARNING) << "Ignoring unexpected container entry at: "
+                   << containerPath;
+    }
+  }
+
+  return containerIds;
+}
+
+
+// Lists the rootfses provisioned for a container, grouped by backend.
+//
+// Walks <container_dir>/backends/<backend>/rootfses/<rootfs_id> and
+// returns a map of backend -> {rootfsId, ...}. Entries that are not
+// directories and backends without any rootfs are skipped with a
+// warning. Returns an Error if a directory cannot be listed.
+Try<hashmap<string, hashset<string>>> listContainerRootfses(
+    const string& provisionerDir,
+    const ContainerID& containerId)
+{
+  hashmap<string, hashset<string>> results;
+
+  string backendsDir = getBackendsDir(
+      getContainerDir(
+          provisionerDir,
+          containerId));
+
+  Try<list<string>> backends = os::ls(backendsDir);
+  if (backends.isError()) {
+    return Error("Unable to list the container directory: " + backends.error());
+  }
+
+  foreach (const string& backend, backends.get()) {
+    string backendDir = getBackendDir(backendsDir, backend);
+    if (!os::stat::isdir(backendDir)) {
+      LOG(WARNING) << "Ignoring unexpected backend entry at: " << backendDir;
+      continue;
+    }
+
+    const string rootfsesDir = getRootfsesDir(backendDir);
+
+    Try<list<string>> rootfses = os::ls(rootfsesDir);
+    if (rootfses.isError()) {
+      return Error("Unable to list the backend directory: " + rootfses.error());
+    }
+
+    hashset<string> backendResults;
+
+    foreach (const string& rootfsId, rootfses.get()) {
+      string rootfs = getRootfsDir(rootfsesDir, rootfsId);
+
+      if (!os::stat::isdir(rootfs)) {
+        // Report the offending entry itself, not its backend directory.
+        LOG(WARNING) << "Ignoring unexpected rootfs entry at: " << rootfs;
+        continue;
+      }
+
+      backendResults.insert(rootfsId);
+    }
+
+    if (backendResults.empty()) {
+      LOG(WARNING) << "Ignoring a backend directory with no rootfs in it: "
+                   << backendDir;
+      continue;
+    }
+
+    // The rootfs directory has passed validation.
+    results.put(backend, backendResults);
+  }
+
+  return results;
+}
+
+} // namespace paths {
+} // namespace provisioner {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/paths.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/paths.hpp b/src/slave/containerizer/mesos/provisioner/paths.hpp
new file mode 100644
index 0000000..7ebd36d
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/paths.hpp
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_PATHS_HPP__
+#define __PROVISIONER_PATHS_HPP__
+
+#include <string>
+
+#include <mesos/mesos.hpp>
+
+#include <stout/hashmap.hpp>
+#include <stout/hashset.hpp>
+#include <stout/try.hpp>
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace provisioner {
+namespace paths {
+
+// The provisioner rootfs directory is as follows:
+// <work_dir> ('--work_dir' flag)
+// |-- provisioner
+// |-- containers
+// |-- <container_id>
+// |-- backends
+// |-- <backend> (copy, bind, etc.)
+// |-- rootfses
+// |-- <rootfs_id> (the rootfs)
+//
+// There can be multiple backends due to the change of backend flags.
+// Under each backend a rootfs is identified by the 'rootfs_id' which
+// is a UUID.
+
+// Return the directory of the given container, i.e.
+// <provisionerDir>/containers/<container_id>.
+std::string getContainerDir(
+ const std::string& provisionerDir,
+ const ContainerID& containerId);
+
+
+// Return the directory of the rootfs identified by 'rootfsId' that
+// was provisioned for the given container using 'backend'.
+std::string getContainerRootfsDir(
+ const std::string& provisionerDir,
+ const ContainerID& containerId,
+ const std::string& backend,
+ const std::string& rootfsId);
+
+
+// Recursively "ls" the container directory and return a map of
+// backend -> {rootfsId, ...}. Return an Error if a directory cannot
+// be listed.
+Try<hashmap<std::string, hashset<std::string>>>
+listContainerRootfses(
+ const std::string& provisionerDir,
+ const ContainerID& containerId);
+
+
+// Return a set of container IDs found under the provisioner
+// directory. The set is empty if no container has been created yet.
+Try<hashset<ContainerID>> listContainers(
+ const std::string& provisionerDir);
+
+} // namespace paths {
+} // namespace provisioner {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_PATHS_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/provisioner.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/provisioner.cpp b/src/slave/containerizer/mesos/provisioner/provisioner.cpp
new file mode 100644
index 0000000..e99c1c9
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/provisioner.cpp
@@ -0,0 +1,444 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <mesos/type_utils.hpp>
+
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+#include <process/process.hpp>
+
+#include <process/metrics/counter.hpp>
+#include <process/metrics/metrics.hpp>
+
+#include <stout/foreach.hpp>
+#include <stout/hashmap.hpp>
+#include <stout/hashset.hpp>
+#include <stout/os.hpp>
+#include <stout/stringify.hpp>
+#include <stout/uuid.hpp>
+
+#include "slave/paths.hpp"
+
+#include "slave/containerizer/mesos/provisioner/backend.hpp"
+#include "slave/containerizer/mesos/provisioner/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/provisioner.hpp"
+#include "slave/containerizer/mesos/provisioner/store.hpp"
+
+using namespace process;
+
+using std::list;
+using std::string;
+using std::vector;
+
+using mesos::slave::ContainerState;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// The process behind 'Provisioner'. It keeps track of the rootfses
+// provisioned for each container and delegates the actual work to
+// the image stores and provisioner backends.
+class ProvisionerProcess : public Process<ProvisionerProcess>
+{
+public:
+ ProvisionerProcess(
+ const Flags& flags,
+ const string& rootDir,
+ const hashmap<Image::Type, Owned<Store>>& stores,
+ const hashmap<string, Owned<Backend>>& backends);
+
+ // Re-registers the containers found on disk, cleans up unknown
+ // orphans and recovers the stores.
+ Future<Nothing> recover(
+ const list<ContainerState>& states,
+ const hashset<ContainerID>& orphans);
+
+ // Provisions a rootfs for the container from the given image and
+ // returns the path to it.
+ Future<string> provision(
+ const ContainerID& containerId,
+ const Image& image);
+
+ // Destroys all rootfses provisioned for the container. Returns
+ // false if the container is unknown to the provisioner.
+ Future<bool> destroy(const ContainerID& containerId);
+
+private:
+ // Continuation of 'provision' once the store returned the layers.
+ Future<string> _provision(
+ const ContainerID& containerId,
+ const vector<string>& layers);
+
+ // Continuation of 'destroy' once the backends destroyed the
+ // rootfses; removes the container directory.
+ Future<bool> _destroy(const ContainerID& containerId);
+
+ const Flags flags;
+
+ // Absolute path to the provisioner root directory. It can be
+ // derived from '--work_dir' but we keep a separate copy here
+ // because we converted it into an absolute path so managed rootfs
+ // paths match the ones in 'mountinfo' (important if mount-based
+ // backends are used).
+ const string rootDir;
+
+ const hashmap<Image::Type, Owned<Store>> stores;
+ const hashmap<string, Owned<Backend>> backends;
+
+ struct Info
+ {
+ // Mappings: backend -> {rootfsId, ...}
+ hashmap<string, hashset<string>> rootfses;
+ };
+
+ // Per-container bookkeeping of provisioned rootfses.
+ hashmap<ContainerID, Owned<Info>> infos;
+
+ struct Metrics
+ {
+ Metrics();
+ ~Metrics();
+
+ // Counts failures to remove a provisioned container directory.
+ process::metrics::Counter remove_container_errors;
+ } metrics;
+};
+
+
+// Creates a provisioner: prepares and resolves the provisioner root
+// directory, creates the image stores and backends, and verifies that
+// the backend selected by the flags is available.
+Try<Owned<Provisioner>> Provisioner::create(
+    const Flags& flags,
+    Fetcher* fetcher)
+{
+  const string provisionerDir =
+    slave::paths::getProvisionerDir(flags.work_dir);
+
+  Try<Nothing> mkdir = os::mkdir(provisionerDir);
+  if (mkdir.isError()) {
+    return Error(
+        "Failed to create provisioner root directory '" +
+        provisionerDir + "': " + mkdir.error());
+  }
+
+  Result<string> rootDir = os::realpath(provisionerDir);
+  if (rootDir.isError()) {
+    return Error(
+        "Failed to resolve the realpath of provisioner root directory '" +
+        provisionerDir + "': " + rootDir.error());
+  }
+
+  CHECK_SOME(rootDir); // Can't be None since we just created it.
+
+  Try<hashmap<Image::Type, Owned<Store>>> stores = Store::create(flags);
+  if (stores.isError()) {
+    return Error("Failed to create image stores: " + stores.error());
+  }
+
+  hashmap<string, Owned<Backend>> backends = Backend::create(flags);
+  if (backends.empty()) {
+    return Error("No usable provisioner backend created");
+  }
+
+  const bool supported = backends.contains(flags.image_provisioner_backend);
+  if (!supported) {
+    return Error(
+        "The specified provisioner backend '" +
+        flags.image_provisioner_backend + "' is unsupported");
+  }
+
+  Owned<ProvisionerProcess> provisionerProcess(
+      new ProvisionerProcess(flags, rootDir.get(), stores.get(), backends));
+
+  return Owned<Provisioner>(new Provisioner(provisionerProcess));
+}
+
+
+// Takes the backing process and spawns it.
+Provisioner::Provisioner(Owned<ProvisionerProcess> _process)
+  : process(_process)
+{
+  ProvisionerProcess* const actor = CHECK_NOTNULL(process.get());
+  spawn(actor);
+}
+
+
+// Terminates the backing process (if any) and waits for it to finish.
+Provisioner::~Provisioner()
+{
+  ProvisionerProcess* const actor = process.get();
+  if (actor == NULL) {
+    return;
+  }
+
+  terminate(actor);
+  wait(actor);
+}
+
+
+// Forwards to ProvisionerProcess::recover.
+Future<Nothing> Provisioner::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  ProvisionerProcess* const actor = CHECK_NOTNULL(process.get());
+
+  return dispatch(actor, &ProvisionerProcess::recover, states, orphans);
+}
+
+
+// Forwards to ProvisionerProcess::provision.
+Future<string> Provisioner::provision(
+    const ContainerID& containerId,
+    const Image& image)
+{
+  ProvisionerProcess* const actor = CHECK_NOTNULL(process.get());
+
+  return dispatch(actor, &ProvisionerProcess::provision, containerId, image);
+}
+
+
+// Forwards to ProvisionerProcess::destroy.
+Future<bool> Provisioner::destroy(const ContainerID& containerId)
+{
+  ProvisionerProcess* const actor = CHECK_NOTNULL(process.get());
+
+  return dispatch(actor, &ProvisionerProcess::destroy, containerId);
+}
+
+
+// Copies the flags, the resolved root directory, the stores and the
+// backends into the process.
+ProvisionerProcess::ProvisionerProcess(
+    const Flags& flags_,
+    const string& rootDir_,
+    const hashmap<Image::Type, Owned<Store>>& stores_,
+    const hashmap<string, Owned<Backend>>& backends_)
+  : flags(flags_),
+    rootDir(rootDir_),
+    stores(stores_),
+    backends(backends_) {}
+
+
+// Recovers the provisioner state from disk.
+//
+// Every container found under the provisioner root directory is
+// registered in 'infos'. Containers that are neither alive (in
+// 'states') nor known orphans are destroyed right away. The stores
+// are recovered as well. Fails if the containers or their rootfses
+// cannot be listed, or if a rootfs belongs to an unrecognized backend.
+Future<Nothing> ProvisionerProcess::recover(
+ const list<ContainerState>& states,
+ const hashset<ContainerID>& orphans)
+{
+ // Register living containers, including the ones that do not
+ // provision images.
+ hashset<ContainerID> alive;
+ foreach (const ContainerState& state, states) {
+ alive.insert(state.container_id());
+ }
+
+ // List provisioned containers; recover living ones; destroy unknown
+ // orphans. Note that known orphan containers are recovered as well
+ // and they will be destroyed by the containerizer using the normal
+ // cleanup path. See MESOS-2367 for details.
+ Try<hashset<ContainerID>> containers =
+ provisioner::paths::listContainers(rootDir);
+
+ if (containers.isError()) {
+ return Failure(
+ "Failed to list the containers managed by the provisioner: " +
+ containers.error());
+ }
+
+ // Scan the list of containers, register all of them with 'infos'
+ // but mark unknown orphans for immediate cleanup.
+ hashset<ContainerID> unknownOrphans;
+
+ foreach (const ContainerID& containerId, containers.get()) {
+ Owned<Info> info = Owned<Info>(new Info());
+
+ Try<hashmap<string, hashset<string>>> rootfses =
+ provisioner::paths::listContainerRootfses(rootDir, containerId);
+
+ if (rootfses.isError()) {
+ return Failure(
+ "Unable to list rootfses belonged to container " +
+ stringify(containerId) + ": " + rootfses.error());
+ }
+
+ foreachkey (const string& backend, rootfses.get()) {
+ if (!backends.contains(backend)) {
+ return Failure(
+ "Found rootfses managed by an unrecognized backend: " + backend);
+ }
+
+ info->rootfses.put(backend, rootfses.get()[backend]);
+ }
+
+ // Unknown orphans are registered too because 'destroy' below
+ // ignores containers that are not in 'infos'.
+ infos.put(containerId, info);
+
+ if (alive.contains(containerId) || orphans.contains(containerId)) {
+ LOG(INFO) << "Recovered container " << containerId;
+ continue;
+ } else {
+ // For immediate cleanup below.
+ unknownOrphans.insert(containerId);
+ }
+ }
+
+ // Cleanup unknown orphan containers' rootfses.
+ list<Future<bool>> cleanups;
+ foreach (const ContainerID& containerId, unknownOrphans) {
+ LOG(INFO) << "Cleaning up unknown orphan container " << containerId;
+ cleanups.push_back(destroy(containerId));
+ }
+
+ Future<Nothing> cleanup = collect(cleanups)
+ .then([]() -> Future<Nothing> { return Nothing(); });
+
+ // Recover stores.
+ list<Future<Nothing>> recovers;
+ foreachvalue (const Owned<Store>& store, stores) {
+ recovers.push_back(store->recover());
+ }
+
+ Future<Nothing> recover = collect(recovers)
+ .then([]() -> Future<Nothing> { return Nothing(); });
+
+ // A successful provisioner recovery depends on:
+ // 1) Recovery of living containers and known orphans (done above).
+ // 2) Successful cleanup of unknown orphans.
+ // 3) Successful store recovery.
+ //
+ // TODO(jieyu): Do not recover 'store' before unknown orphans are
+ // cleaned up. In the future, we may want to cleanup unused rootfses
+ // in 'store', which might fail if there still exist unknown orphans
+ // holding references to them.
+ return collect(cleanup, recover)
+ .then([=]() -> Future<Nothing> {
+ LOG(INFO) << "Provisioner recovery complete";
+ return Nothing();
+ });
+}
+
+
+// Asks the store matching the image type for the image layers and
+// continues with '_provision' once they are available. Fails for
+// image types without a store.
+Future<string> ProvisionerProcess::provision(
+    const ContainerID& containerId,
+    const Image& image)
+{
+  const Image::Type type = image.type();
+
+  if (!stores.contains(type)) {
+    return Failure(
+        "Unsupported container image type: " +
+        stringify(image.type()));
+  }
+
+  // Get and then provision image layers from the store.
+  Owned<Store> store = stores.get(type).get();
+
+  return store->get(image)
+    .then(defer(self(), &Self::_provision, containerId, lambda::_1));
+}
+
+
+// Provisions the given layers into a new rootfs directory for the
+// container using the backend selected by the flags, records the
+// rootfs in 'infos' and returns its path.
+Future<string> ProvisionerProcess::_provision(
+    const ContainerID& containerId,
+    const vector<string>& layers)
+{
+  // TODO(jieyu): Choose a backend smartly. For instance, if there is
+  // only one layer returned from the store. prefer to use bind
+  // backend because it's the simplest.
+  const string& backend = flags.image_provisioner_backend;
+  CHECK(backends.contains(backend));
+
+  const string rootfsId = UUID::random().toString();
+
+  const string rootfs = provisioner::paths::getContainerRootfsDir(
+      rootDir, containerId, backend, rootfsId);
+
+  LOG(INFO) << "Provisioning image rootfs '" << rootfs
+            << "' for container " << containerId;
+
+  // NOTE: It's likely that the container ID already exists in 'infos'
+  // because one container might provision multiple images.
+  if (!infos.contains(containerId)) {
+    infos.put(containerId, Owned<Info>(new Info()));
+  }
+
+  infos[containerId]->rootfses[backend].insert(rootfsId);
+
+  Owned<Backend> provisionerBackend = backends.get(backend).get();
+
+  return provisionerBackend->provision(layers, rootfs)
+    .then([rootfs]() -> Future<string> { return rootfs; });
+}
+
+
+// Destroys every rootfs recorded for the container through the
+// backend that provisioned it and continues with '_destroy'. Returns
+// false for containers the provisioner does not know about.
+Future<bool> ProvisionerProcess::destroy(const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    LOG(INFO) << "Ignoring destroy request for unknown container "
+              << containerId;
+
+    return false;
+  }
+
+  // Unregister the container first. If destroy() fails, we can rely
+  // on recover() to retry it later.
+  Owned<Info> info = infos[containerId];
+  infos.erase(containerId);
+
+  list<Future<bool>> futures;
+  foreachpair (const string& backend,
+               const hashset<string>& rootfsIds,
+               info->rootfses) {
+    if (!backends.contains(backend)) {
+      return Failure("Unknown backend '" + backend + "'");
+    }
+
+    Owned<Backend> provisionerBackend = backends.get(backend).get();
+
+    foreach (const string& rootfsId, rootfsIds) {
+      const string rootfs = provisioner::paths::getContainerRootfsDir(
+          rootDir, containerId, backend, rootfsId);
+
+      LOG(INFO) << "Destroying container rootfs at '" << rootfs
+                << "' for container " << containerId;
+
+      futures.push_back(provisionerBackend->destroy(rootfs));
+    }
+  }
+
+  // TODO(xujyan): Revisit the usefulness of this return value.
+  return collect(futures)
+    .then(defer(self(), &ProvisionerProcess::_destroy, containerId));
+}
+
+
+// Removes the container's directory below the provisioner root. A
+// failure is logged and counted but does not fail the destroy.
+Future<bool> ProvisionerProcess::_destroy(const ContainerID& containerId)
+{
+  // This should be fairly cheap as the directory should only
+  // contain a few empty sub-directories at this point.
+  //
+  // TODO(jieyu): Currently, it's possible that some directories
+  // cannot be removed due to EBUSY. EBUSY is caused by the race
+  // between cleaning up this container and new containers copying
+  // the host mount table. It's OK to ignore them. The cleanup
+  // will be retried during slave recovery.
+  const string containerDir =
+    provisioner::paths::getContainerDir(rootDir, containerId);
+
+  Try<Nothing> rmdir = os::rmdir(containerDir);
+  if (rmdir.isSome()) {
+    return true;
+  }
+
+  LOG(ERROR) << "Failed to remove the provisioned container directory "
+             << "at '" << containerDir << "': " << rmdir.error();
+
+  ++metrics.remove_container_errors;
+
+  return true;
+}
+
+
+// Registers the counter with the metrics library.
+ProvisionerProcess::Metrics::Metrics()
+  : remove_container_errors(
+      "containerizer/mesos/provisioner/remove_container_errors")
+{
+  process::metrics::add(remove_container_errors);
+}
+
+
+// Unregisters the counter again.
+ProvisionerProcess::Metrics::~Metrics()
+{
+  process::metrics::remove(remove_container_errors);
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/provisioner.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/provisioner.hpp b/src/slave/containerizer/mesos/provisioner/provisioner.hpp
new file mode 100644
index 0000000..912fc5a
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/provisioner.hpp
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_HPP__
+#define __PROVISIONER_HPP__
+
+#include <list>
+#include <string>
+
+#include <mesos/resources.hpp>
+
+#include <mesos/slave/isolator.hpp> // For ContainerState.
+
+#include <stout/hashset.hpp>
+#include <stout/nothing.hpp>
+#include <stout/try.hpp>
+
+#include <process/future.hpp>
+#include <process/owned.hpp>
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/fetcher.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Forward declaration.
+class ProvisionerProcess;
+
+
+// Interface for provisioning and destroying container root
+// filesystems. Instances are obtained through 'create()' and own a
+// 'ProvisionerProcess'. The class is neither copyable nor assignable,
+// and its operations are 'virtual' so that tests can substitute a
+// mock (which uses the protected default constructor).
+class Provisioner
+{
+public:
+  // Create the provisioner based on the specified flags.
+  static Try<process::Owned<Provisioner>> create(
+      const Flags& flags,
+      Fetcher* fetcher);
+
+  // NOTE: Made 'virtual' for mocking and testing.
+  virtual ~Provisioner();
+
+  // Recover root filesystems for containers from the run states and
+  // the orphan containers (known to the launcher but not known to the
+  // slave) detected by the launcher. This function is also
+  // responsible for cleaning up any intermediate artifacts (e.g.
+  // directories) to not leak anything.
+  virtual process::Future<Nothing> recover(
+      const std::list<mesos::slave::ContainerState>& states,
+      const hashset<ContainerID>& orphans);
+
+  // Provision a root filesystem for the container using the specified
+  // image and return the absolute path to the root filesystem.
+  virtual process::Future<std::string> provision(
+      const ContainerID& containerId,
+      const Image& image);
+
+  // Destroy a previously provisioned root filesystem. Assumes that
+  // all references (e.g., mounts, open files) to the provisioned
+  // filesystem have been removed. Return false if there is no
+  // provisioned root filesystem for the given container.
+  virtual process::Future<bool> destroy(const ContainerID& containerId);
+
+protected:
+  Provisioner() {} // For creating mock object.
+
+private:
+  explicit Provisioner(process::Owned<ProvisionerProcess> process);
+
+  Provisioner(const Provisioner&) = delete; // Not copyable.
+  Provisioner& operator=(const Provisioner&) = delete; // Not assignable.
+
+  // Left default-constructed when the protected default constructor
+  // is used.
+  process::Owned<ProvisionerProcess> process;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/store.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/store.cpp b/src/slave/containerizer/mesos/provisioner/store.cpp
new file mode 100644
index 0000000..4b8abe6
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/store.cpp
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <string>
+
+#include <mesos/type_utils.hpp>
+
+#include <stout/error.hpp>
+#include <stout/foreach.hpp>
+#include <stout/strings.hpp>
+
+#include "slave/containerizer/mesos/provisioner/store.hpp"
+
+#include "slave/containerizer/mesos/provisioner/appc/store.hpp"
+
+#include "slave/containerizer/mesos/provisioner/docker/store.hpp"
+
+using namespace process;
+
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Creates one image store for every image type listed in the
+// comma-separated '--image_providers' flag.
+//
+// Returns an empty map when the flag is not set. Returns an Error
+// when a listed type is not a known 'Image::Type', when no store
+// factory is registered for it, or when the factory itself fails.
+Try<hashmap<Image::Type, Owned<Store>>> Store::create(const Flags& flags)
+{
+  if (flags.image_providers.isNone()) {
+    return hashmap<Image::Type, Owned<Store>>();
+  }
+
+  // Store factories keyed by the image type they serve.
+  hashmap<Image::Type, Try<Owned<Store>>(*)(const Flags&)> creators;
+  creators.put(Image::APPC, &appc::Store::create);
+  creators.put(Image::DOCKER, &docker::Store::create);
+
+  hashmap<Image::Type, Owned<Store>> stores;
+
+  foreach (const string& token,
+           strings::tokenize(flags.image_providers.get(), ",")) {
+    // Tolerate whitespace around the separators (e.g. "appc, docker");
+    // without trimming, " docker" would be rejected as an unknown type.
+    const string type = strings::trim(token);
+
+    // 'tokenize' already drops empty tokens; treat tokens that were
+    // only whitespace the same way.
+    if (type.empty()) {
+      continue;
+    }
+
+    // The protobuf enum parser matches the upper-case enum names, so
+    // the flag value is accepted case-insensitively.
+    Image::Type imageType;
+    if (!Image::Type_Parse(strings::upper(type), &imageType)) {
+      return Error("Unknown image type '" + type + "'");
+    }
+
+    if (!creators.contains(imageType)) {
+      return Error("Unsupported image type '" + type + "'");
+    }
+
+    Try<Owned<Store>> store = creators[imageType](flags);
+    if (store.isError()) {
+      return Error(
+          "Failed to create store for image type '" +
+          type + "': " + store.error());
+    }
+
+    stores.put(imageType, store.get());
+  }
+
+  return stores;
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/store.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/store.hpp b/src/slave/containerizer/mesos/provisioner/store.hpp
new file mode 100644
index 0000000..cf3e7d7
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/store.hpp
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_STORE_HPP__
+#define __PROVISIONER_STORE_HPP__
+
+#include <string>
+#include <vector>
+
+#include <mesos/mesos.hpp>
+
+#include <process/future.hpp>
+#include <process/owned.hpp>
+
+#include <stout/hashmap.hpp>
+#include <stout/nothing.hpp>
+#include <stout/try.hpp>
+
+#include "slave/flags.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// An image store abstraction that "stores" images. It serves as a
+// read-through cache (cache misses are fetched remotely and
+// transparently) for images.
+class Store
+{
+public:
+  // Create the stores selected by 'flags', keyed by the image type
+  // each one serves.
+  static Try<hashmap<Image::Type, process::Owned<Store>>> create(
+      const Flags& flags);
+
+  virtual ~Store() {}
+
+  virtual process::Future<Nothing> recover() = 0;
+
+  // Get the specified image (and all its recursive dependencies) as a
+  // list of rootfs layers in the topological order (dependencies go
+  // before dependents in the list). The images required to build this
+  // list are either retrieved from the local cache or fetched
+  // remotely.
+  //
+  // NOTE: The returned list should not have duplicates. e.g., in the
+  // following scenario the result should be [C, B, D, A] (B before D
+  // in this example is decided by the order in which A specifies its
+  // dependencies).
+  //
+  //   A --> B --> C
+  //   |           ^
+  //   |---> D ----|
+  //
+  // The returned future fails if the requested image or any of its
+  // dependencies cannot be found or failed to be fetched.
+  virtual process::Future<std::vector<std::string>> get(const Image& image) = 0;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_STORE_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/appc/paths.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/appc/paths.cpp b/src/slave/containerizer/provisioner/appc/paths.cpp
deleted file mode 100644
index 8817c0f..0000000
--- a/src/slave/containerizer/provisioner/appc/paths.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <list>
-
-#include <glog/logging.h>
-
-#include <stout/path.hpp>
-
-#include "slave/containerizer/provisioner/appc/paths.hpp"
-
-using std::list;
-using std::string;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace appc {
-namespace paths {
-
-string getStagingDir(const string& storeDir)
-{
- return path::join(storeDir, "staging");
-}
-
-
-string getImagesDir(const string& storeDir)
-{
- return path::join(storeDir, "images");
-}
-
-
-string getImagePath(const string& storeDir, const string& imageId)
-{
- return path::join(getImagesDir(storeDir), imageId);
-}
-
-
-string getImageRootfsPath(
- const string& storeDir,
- const string& imageId)
-{
- return path::join(getImagePath(storeDir, imageId), "rootfs");
-}
-
-
-string getImageRootfsPath(const string& imagePath)
-{
- return path::join(imagePath, "rootfs");
-}
-
-
-string getImageManifestPath(
- const string& storeDir,
- const string& imageId)
-{
- return path::join(getImagePath(storeDir, imageId), "manifest");
-}
-
-
-string getImageManifestPath(const string& imagePath)
-{
- return path::join(imagePath, "manifest");
-}
-
-} // namespace paths {
-} // namespace appc {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/appc/paths.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/appc/paths.hpp b/src/slave/containerizer/provisioner/appc/paths.hpp
deleted file mode 100644
index 7c36d67..0000000
--- a/src/slave/containerizer/provisioner/appc/paths.hpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_APPC_PATHS_HPP__
-#define __PROVISIONER_APPC_PATHS_HPP__
-
-#include <string>
-
-#include <mesos/mesos.hpp>
-
-#include <stout/hashmap.hpp>
-#include <stout/try.hpp>
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace appc {
-namespace paths {
-
-// The appc store file system layout is as follows:
-//
-// <store_dir> ('--appc_store_dir' flag)
-// |--staging (contains temp directories for staging downloads)
-// |
-// |--images (stores validated images)
-// |--<image_id> (in the form of "sha512-<128_character_hash_sum>")
-// |--manifest
-// |--rootfs
-// |--... (according to the ACI spec)
-//
-// TODO(xujyan): The staging directory is unused for now (it's
-// externally managed) but implemented to illustrate the need for a
-// separate 'images' directory. Complete the layout diagram when the
-// staging directory is utilized by the provisioner.
-
-std::string getStagingDir(const std::string& storeDir);
-
-
-std::string getImagesDir(const std::string& storeDir);
-
-
-std::string getImagePath(
- const std::string& storeDir,
- const std::string& imageId);
-
-
-std::string getImageRootfsPath(
- const std::string& storeDir,
- const std::string& imageId);
-
-
-std::string getImageRootfsPath(const std::string& imagePath);
-
-
-std::string getImageManifestPath(
- const std::string& storeDir,
- const std::string& imageId);
-
-
-std::string getImageManifestPath(const std::string& imagePath);
-
-} // namespace paths {
-} // namespace appc {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_APPC_PATHS_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/appc/spec.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/appc/spec.cpp b/src/slave/containerizer/provisioner/appc/spec.cpp
deleted file mode 100644
index bbe523d..0000000
--- a/src/slave/containerizer/provisioner/appc/spec.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stout/os/stat.hpp>
-#include <stout/protobuf.hpp>
-#include <stout/strings.hpp>
-
-#include "slave/containerizer/provisioner/appc/paths.hpp"
-#include "slave/containerizer/provisioner/appc/spec.hpp"
-
-using std::string;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace appc {
-namespace spec {
-
-Option<Error> validateManifest(const AppcImageManifest& manifest)
-{
- // TODO(idownes): Validate that required fields are present when
- // this cannot be expressed in the protobuf specification, e.g.,
- // repeated fields with >= 1.
- // TODO(xujyan): More thorough type validation:
- // https://github.com/appc/spec/blob/master/spec/types.md
- if (manifest.ackind() != "ImageManifest") {
- return Error("Incorrect acKind field: " + manifest.ackind());
- }
-
- return None();
-}
-
-
-Option<Error> validateImageID(const string& imageId)
-{
- if (!strings::startsWith(imageId, "sha512-")) {
- return Error("Image ID needs to start with sha512-");
- }
-
- string hash = strings::remove(imageId, "sha512-", strings::PREFIX);
- if (hash.length() != 128) {
- return Error("Invalid hash length for: " + hash);
- }
-
- return None();
-}
-
-
-Option<Error> validateLayout(const string& imagePath)
-{
- if (!os::stat::isdir(paths::getImageRootfsPath(imagePath))) {
- return Error("No rootfs directory found in image layout");
- }
-
- if (!os::stat::isfile(paths::getImageManifestPath(imagePath))) {
- return Error("No manifest found in image layout");
- }
-
- return None();
-}
-
-
-Try<AppcImageManifest> parse(const string& value)
-{
- Try<JSON::Object> json = JSON::parse<JSON::Object>(value);
- if (json.isError()) {
- return Error("JSON parse failed: " + json.error());
- }
-
- Try<AppcImageManifest> manifest =
- protobuf::parse<AppcImageManifest>(json.get());
-
- if (manifest.isError()) {
- return Error("Protobuf parse failed: " + manifest.error());
- }
-
- Option<Error> error = validateManifest(manifest.get());
- if (error.isSome()) {
- return Error("Schema validation failed: " + error.get().message);
- }
-
- return manifest.get();
-}
-
-} // namespace spec {
-} // namespace appc {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/appc/spec.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/appc/spec.hpp b/src/slave/containerizer/provisioner/appc/spec.hpp
deleted file mode 100644
index 2bc8c6f..0000000
--- a/src/slave/containerizer/provisioner/appc/spec.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_APPC_SPEC_HPP__
-#define __PROVISIONER_APPC_SPEC_HPP__
-
-#include <string>
-
-#include <stout/error.hpp>
-#include <stout/option.hpp>
-
-#include <mesos/mesos.hpp>
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace appc {
-namespace spec {
-
-// Validate if the specified image manifest conforms to the Appc spec.
-Option<Error> validateManifest(const AppcImageManifest& manifest);
-
-// Validate if the specified image ID conforms to the Appc spec.
-Option<Error> validateImageID(const std::string& imageId);
-
-// Validate if the specified image has the disk layout that conforms
-// to the Appc spec.
-Option<Error> validateLayout(const std::string& imagePath);
-
-// Parse the AppcImageManifest in the specified JSON string.
-Try<AppcImageManifest> parse(const std::string& value);
-
-} // namespace spec {
-} // namespace appc {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_APPC_SPEC_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/appc/store.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/appc/store.cpp b/src/slave/containerizer/provisioner/appc/store.cpp
deleted file mode 100644
index a5ef4ea..0000000
--- a/src/slave/containerizer/provisioner/appc/store.cpp
+++ /dev/null
@@ -1,288 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <list>
-
-#include <glog/logging.h>
-
-#include <process/defer.hpp>
-#include <process/dispatch.hpp>
-
-#include <stout/check.hpp>
-#include <stout/hashmap.hpp>
-#include <stout/os.hpp>
-#include <stout/path.hpp>
-
-#include "slave/containerizer/provisioner/appc/paths.hpp"
-#include "slave/containerizer/provisioner/appc/spec.hpp"
-#include "slave/containerizer/provisioner/appc/store.hpp"
-
-using namespace process;
-
-using std::list;
-using std::string;
-using std::vector;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace appc {
-
-// Defines a locally cached image (which has passed validation).
-struct CachedImage
-{
- static Try<CachedImage> create(const string& imagePath);
-
- CachedImage(
- const AppcImageManifest& _manifest,
- const string& _id,
- const string& _path)
- : manifest(_manifest), id(_id), path(_path) {}
-
- string rootfs() const
- {
- return path::join(path, "rootfs");
- }
-
- const AppcImageManifest manifest;
-
- // Image ID of the format "sha512-value" where "value" is the hex
- // encoded string of the sha512 digest of the uncompressed tar file
- // of the image.
- const string id;
-
- // Absolute path to the extracted image.
- const string path;
-};
-
-
-Try<CachedImage> CachedImage::create(const string& imagePath)
-{
- Option<Error> error = spec::validateLayout(imagePath);
- if (error.isSome()) {
- return Error("Invalid image layout: " + error.get().message);
- }
-
- string imageId = Path(imagePath).basename();
-
- error = spec::validateImageID(imageId);
- if (error.isSome()) {
- return Error("Invalid image ID: " + error.get().message);
- }
-
- Try<string> read = os::read(paths::getImageManifestPath(imagePath));
- if (read.isError()) {
- return Error("Failed to read manifest: " + read.error());
- }
-
- Try<AppcImageManifest> manifest = spec::parse(read.get());
- if (manifest.isError()) {
- return Error("Failed to parse manifest: " + manifest.error());
- }
-
- return CachedImage(manifest.get(), imageId, imagePath);
-}
-
-
-// Helper that implements this:
-// https://github.com/appc/spec/blob/master/spec/aci.md#dependency-matching
-static bool matches(Image::Appc requirements, const CachedImage& candidate)
-{
- // The name must match.
- if (candidate.manifest.name() != requirements.name()) {
- return false;
- }
-
- // If an id is specified the candidate must match.
- if (requirements.has_id() && (candidate.id != requirements.id())) {
- return false;
- }
-
- // Extract labels for easier comparison, this also weeds out duplicates.
- // TODO(xujyan): Detect duplicate labels in image manifest validation
- // and Image::Appc validation.
- hashmap<string, string> requiredLabels;
- foreach (const Label& label, requirements.labels().labels()) {
- requiredLabels[label.key()] = label.value();
- }
-
- hashmap<string, string> candidateLabels;
- foreach (const AppcImageManifest::Label& label,
- candidate.manifest.labels()) {
- candidateLabels[label.name()] = label.value();
- }
-
- // Any label specified must be present and match in the candidate.
- foreachpair (const string& name,
- const string& value,
- requiredLabels) {
- if (!candidateLabels.contains(name) ||
- candidateLabels.get(name).get() != value) {
- return false;
- }
- }
-
- return true;
-}
-
-
-class StoreProcess : public Process<StoreProcess>
-{
-public:
- StoreProcess(const string& rootDir);
-
- ~StoreProcess() {}
-
- Future<Nothing> recover();
-
- Future<vector<string>> get(const Image& image);
-
-private:
- // Absolute path to the root directory of the store as defined by
- // --appc_store_dir.
- const string rootDir;
-
- // Mappings: name -> id -> image.
- hashmap<string, hashmap<string, CachedImage>> images;
-};
-
-
-Try<Owned<slave::Store>> Store::create(const Flags& flags)
-{
- Try<Nothing> mkdir = os::mkdir(paths::getImagesDir(flags.appc_store_dir));
- if (mkdir.isError()) {
- return Error("Failed to create the images directory: " + mkdir.error());
- }
-
- // Make sure the root path is canonical so all image paths derived
- // from it are canonical too.
- Result<string> rootDir = os::realpath(flags.appc_store_dir);
- if (!rootDir.isSome()) {
- // The above mkdir call recursively creates the store directory
- // if necessary so it cannot be None here.
- CHECK_ERROR(rootDir);
-
- return Error(
- "Failed to get the realpath of the store root directory: " +
- rootDir.error());
- }
-
- return Owned<slave::Store>(new Store(
- Owned<StoreProcess>(new StoreProcess(rootDir.get()))));
-}
-
-
-Store::Store(Owned<StoreProcess> _process)
- : process(_process)
-{
- spawn(CHECK_NOTNULL(process.get()));
-}
-
-
-Store::~Store()
-{
- terminate(process.get());
- wait(process.get());
-}
-
-
-Future<Nothing> Store::recover()
-{
- return dispatch(process.get(), &StoreProcess::recover);
-}
-
-
-Future<vector<string>> Store::get(const Image& image)
-{
- return dispatch(process.get(), &StoreProcess::get, image);
-}
-
-
-StoreProcess::StoreProcess(const string& _rootDir) : rootDir(_rootDir) {}
-
-
-Future<Nothing> StoreProcess::recover()
-{
- // Recover everything in the store.
- Try<list<string>> imageIds = os::ls(paths::getImagesDir(rootDir));
- if (imageIds.isError()) {
- return Failure(
- "Failed to list images under '" +
- paths::getImagesDir(rootDir) + "': " +
- imageIds.error());
- }
-
- foreach (const string& imageId, imageIds.get()) {
- string path = paths::getImagePath(rootDir, imageId);
- if (!os::stat::isdir(path)) {
- LOG(WARNING) << "Unexpected entry in storage: " << imageId;
- continue;
- }
-
- Try<CachedImage> image = CachedImage::create(path);
- if (image.isError()) {
- LOG(WARNING) << "Unexpected entry in storage: " << image.error();
- continue;
- }
-
- LOG(INFO) << "Restored image '" << image.get().manifest.name() << "'";
-
- images[image.get().manifest.name()].put(image.get().id, image.get());
- }
-
- return Nothing();
-}
-
-
-Future<vector<string>> StoreProcess::get(const Image& image)
-{
- if (image.type() != Image::APPC) {
- return Failure("Not an Appc image: " + stringify(image.type()));
- }
-
- const Image::Appc& appc = image.appc();
-
- if (!images.contains(appc.name())) {
- return Failure("No Appc image named '" + appc.name() + "' can be found");
- }
-
- // Get local candidates.
- vector<CachedImage> candidates;
- foreach (const CachedImage& candidate, images[appc.name()].values()) {
- // The first match is returned.
- // TODO(xujyan): Some tie-breaking rules are necessary.
- if (matches(appc, candidate)) {
- LOG(INFO) << "Found match for Appc image '" << appc.name()
- << "' in the store";
-
- // The Appc store current doesn't support dependencies and this
- // is enforced by manifest validation: if the image's manifest
- // contains dependencies it would fail the validation and
- // wouldn't be stored in the store.
- return vector<string>({candidate.rootfs()});
- }
- }
-
- return Failure("No Appc image named '" + appc.name() +
- "' can match the requirements");
-}
-
-} // namespace appc {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/appc/store.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/appc/store.hpp b/src/slave/containerizer/provisioner/appc/store.hpp
deleted file mode 100644
index e845519..0000000
--- a/src/slave/containerizer/provisioner/appc/store.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_APPC_STORE_HPP__
-#define __PROVISIONER_APPC_STORE_HPP__
-
-#include "slave/containerizer/provisioner/store.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace appc {
-
-// Forward declaration.
-class StoreProcess;
-
-
-class Store : public slave::Store
-{
-public:
- static Try<process::Owned<slave::Store>> create(const Flags& flags);
-
- ~Store();
-
- virtual process::Future<Nothing> recover();
-
- // TODO(xujyan): Fetching remotely is not implemented for now and
- // until then the future fails directly if the image is not in the
- // local cache.
- // TODO(xujyan): The store currently doesn't support images that
- // have dependencies and we should add it later.
- virtual process::Future<std::vector<std::string>> get(const Image& image);
-
-private:
- Store(process::Owned<StoreProcess> process);
-
- Store(const Store&) = delete; // Not copyable.
- Store& operator=(const Store&) = delete; // Not assignable.
-
- process::Owned<StoreProcess> process;
-};
-
-} // namespace appc {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_APPC_STORE_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/backend.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/backend.cpp b/src/slave/containerizer/provisioner/backend.cpp
deleted file mode 100644
index b5d9670..0000000
--- a/src/slave/containerizer/provisioner/backend.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <glog/logging.h>
-
-#include <stout/os.hpp>
-
-#include "slave/containerizer/provisioner/backend.hpp"
-
-#include "slave/containerizer/provisioner/backends/bind.hpp"
-#include "slave/containerizer/provisioner/backends/copy.hpp"
-
-using namespace process;
-
-using std::string;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-hashmap<string, Owned<Backend>> Backend::create(const Flags& flags)
-{
- hashmap<string, Try<Owned<Backend>>(*)(const Flags&)> creators;
-
-#ifdef __linux__
- creators.put("bind", &BindBackend::create);
-#endif // __linux__
- creators.put("copy", &CopyBackend::create);
-
- hashmap<string, Owned<Backend>> backends;
-
- foreachkey (const string& name, creators) {
- Try<Owned<Backend>> backend = creators[name](flags);
- if (backend.isError()) {
- LOG(WARNING) << "Failed to create '" << name << "' backend: "
- << backend.error();
- continue;
- }
- backends.put(name, backend.get());
- }
-
- return backends;
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/backend.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/backend.hpp b/src/slave/containerizer/provisioner/backend.hpp
deleted file mode 100644
index 1c80b79..0000000
--- a/src/slave/containerizer/provisioner/backend.hpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_BACKEND_HPP__
-#define __PROVISIONER_BACKEND_HPP__
-
-#include <string>
-#include <vector>
-
-#include <process/future.hpp>
-#include <process/owned.hpp>
-
-#include <stout/hashmap.hpp>
-#include <stout/try.hpp>
-
-#include "slave/flags.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// Provision a root filesystem for a container.
-class Backend
-{
-public:
- virtual ~Backend() {}
-
- // Return a map of all supported backends keyed by their names. Note
- // that Backends that failed to be created due to incorrect flags are
- // simply not added to the result.
- static hashmap<std::string, process::Owned<Backend>> create(
- const Flags& flags);
-
- // Provision a root filesystem for a container into the specified 'rootfs'
- // directory by applying the specified list of root filesystem layers in
- // the list order, i.e., files in a layer can overwrite/shadow those from
- // another layer earlier in the list.
- virtual process::Future<Nothing> provision(
- const std::vector<std::string>& layers,
- const std::string& rootfs) = 0;
-
- // Destroy the root filesystem provisioned at the specified 'rootfs'
- // directory. Return false if there is no provisioned root filesystem
- // to destroy for the given directory.
- virtual process::Future<bool> destroy(const std::string& rootfs) = 0;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_BACKEND_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/backends/bind.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/backends/bind.cpp b/src/slave/containerizer/provisioner/backends/bind.cpp
deleted file mode 100644
index 1fe1746..0000000
--- a/src/slave/containerizer/provisioner/backends/bind.cpp
+++ /dev/null
@@ -1,250 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <errno.h>
-#include <stdio.h>
-#include <unistd.h>
-
-#include <process/dispatch.hpp>
-#include <process/process.hpp>
-
-#include <process/metrics/counter.hpp>
-#include <process/metrics/metrics.hpp>
-
-#include <stout/foreach.hpp>
-#include <stout/os.hpp>
-
-#include "linux/fs.hpp"
-
-#include "slave/containerizer/provisioner/backends/bind.hpp"
-
-using namespace process;
-
-using std::string;
-using std::vector;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-class BindBackendProcess : public Process<BindBackendProcess>
-{
-public:
- Future<Nothing> provision(const vector<string>& layers, const string& rootfs);
-
- Future<bool> destroy(const string& rootfs);
-
- struct Metrics
- {
- Metrics();
- ~Metrics();
-
- process::metrics::Counter remove_rootfs_errors;
- } metrics;
-};
-
-
-Try<Owned<Backend>> BindBackend::create(const Flags&)
-{
- Result<string> user = os::user();
- if (!user.isSome()) {
- return Error("Failed to determine user: " +
- (user.isError() ? user.error() : "username not found"));
- }
-
- if (user.get() != "root") {
- return Error("BindBackend requires root privileges");
- }
-
- return Owned<Backend>(new BindBackend(
- Owned<BindBackendProcess>(new BindBackendProcess())));
-}
-
-
-BindBackend::~BindBackend()
-{
- terminate(process.get());
- wait(process.get());
-}
-
-
-BindBackend::BindBackend(Owned<BindBackendProcess> _process)
- : process(_process)
-{
- spawn(CHECK_NOTNULL(process.get()));
-}
-
-
-Future<Nothing> BindBackend::provision(
- const vector<string>& layers,
- const string& rootfs)
-{
- return dispatch(
- process.get(), &BindBackendProcess::provision, layers, rootfs);
-}
-
-
-Future<bool> BindBackend::destroy(const string& rootfs)
-{
- return dispatch(process.get(), &BindBackendProcess::destroy, rootfs);
-}
-
-
-Future<Nothing> BindBackendProcess::provision(
- const vector<string>& layers,
- const string& rootfs)
-{
- if (layers.size() > 1) {
- return Failure(
- "Multiple layers are not supported by the bind backend");
- }
-
- if (layers.size() == 0) {
- return Failure("No filesystem layer provided");
- }
-
- Try<Nothing> mkdir = os::mkdir(rootfs);
- if (mkdir.isError()) {
- return Failure("Failed to create container rootfs at " + rootfs);
- }
-
- // TODO(xujyan): Use MS_REC? Does any provisioner use mounts within
- // its image store in a single layer?
- Try<Nothing> mount = fs::mount(
- layers.front(),
- rootfs,
- None(),
- MS_BIND,
- NULL);
-
- if (mount.isError()) {
- return Failure(
- "Failed to bind mount rootfs '" + layers.front() +
- "' to '" + rootfs + "': " + mount.error());
- }
-
- // And remount it read-only.
- mount = fs::mount(
- None(), // Ignored.
- rootfs,
- None(),
- MS_BIND | MS_RDONLY | MS_REMOUNT,
- NULL);
-
- if (mount.isError()) {
- return Failure(
- "Failed to remount rootfs '" + rootfs + "' read-only: " +
- mount.error());
- }
-
- // Mark the mount as shared+slave.
- mount = fs::mount(
- None(),
- rootfs,
- None(),
- MS_SLAVE,
- NULL);
-
- if (mount.isError()) {
- return Failure(
- "Failed to mark mount '" + rootfs +
- "' as a slave mount: " + mount.error());
- }
-
- mount = fs::mount(
- None(),
- rootfs,
- None(),
- MS_SHARED,
- NULL);
-
- if (mount.isError()) {
- return Failure(
- "Failed to mark mount '" + rootfs +
- "' as a shared mount: " + mount.error());
- }
-
- return Nothing();
-}
-
-
-Future<bool> BindBackendProcess::destroy(const string& rootfs)
-{
- Try<fs::MountInfoTable> mountTable = fs::MountInfoTable::read();
-
- if (mountTable.isError()) {
- return Failure("Failed to read mount table: " + mountTable.error());
- }
-
- foreach (const fs::MountInfoTable::Entry& entry, mountTable.get().entries) {
- // TODO(xujyan): If MS_REC was used in 'provision()' we would need
- // to check `strings::startsWith(entry.target, rootfs)` here to
- // unmount all nested mounts.
- if (entry.target == rootfs) {
- // NOTE: This would fail if the rootfs is still in use.
- Try<Nothing> unmount = fs::unmount(entry.target);
- if (unmount.isError()) {
- return Failure(
- "Failed to destroy bind-mounted rootfs '" + rootfs + "': " +
- unmount.error());
- }
-
- // TODO(jieyu): If 'rmdir' here returns EBUSY, we still returns
- // a success. This is currently possible because the parent
- // mount of 'rootfs' might not be a shared mount. Thus,
- // containers in different mount namespaces might hold extra
- // references to this mount. It is OK to ignore the EBUSY error
- // because the provisioner will later try to delete all the
- // rootfses for the terminated containers.
- if (::rmdir(rootfs.c_str()) != 0) {
- string message =
- "Failed to remove rootfs mount point '" + rootfs +
- "': " + strerror(errno);
-
- if (errno == EBUSY) {
- LOG(ERROR) << message;
- ++metrics.remove_rootfs_errors;
- } else {
- return Failure(message);
- }
- }
-
- return true;
- }
- }
-
- return false;
-}
-
-
-BindBackendProcess::Metrics::Metrics()
- : remove_rootfs_errors(
- "containerizer/mesos/provisioner/bind/remove_rootfs_errors")
-{
- process::metrics::add(remove_rootfs_errors);
-}
-
-
-BindBackendProcess::Metrics::~Metrics()
-{
- process::metrics::remove(remove_rootfs_errors);
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/backends/bind.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/backends/bind.hpp b/src/slave/containerizer/provisioner/backends/bind.hpp
deleted file mode 100644
index 1685938..0000000
--- a/src/slave/containerizer/provisioner/backends/bind.hpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_BACKENDS_BIND_HPP__
-#define __PROVISIONER_BACKENDS_BIND_HPP__
-
-#include "slave/containerizer/provisioner/backend.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// Forward declaration.
-class BindBackendProcess;
-
-
-// This is a specialized backend that may be useful for deployments
-// using large (multi-GB) single-layer images *and* where more recent
-// kernel features such as overlayfs are not available (overlayfs-based
-// backend tracked by MESOS-2971). For small images (10's to 100's of MB)
-// the copy backend may be sufficient. NOTE:
-// 1) BindBackend supports only a single layer. Multi-layer images will
-// fail to provision and the container will fail to launch!
-// 2) The filesystem is read-only because all containers using this
-// image share the source. Select writable areas can be achieved by
-// mounting read-write volumes to places like /tmp, /var/tmp,
-// /home, etc. using the ContainerInfo. These can be relative to
-// the executor work directory.
-// N.B. Since the filesystem is read-only, '--sandbox_directory' must
-// already exist within the filesystem because the filesystem isolator
-// is unable to create it!
-// 3) It's fast because the bind mount requires (nearly) zero IO.
-class BindBackend : public Backend
-{
-public:
- virtual ~BindBackend();
-
- // BindBackend doesn't use any flag.
- static Try<process::Owned<Backend>> create(const Flags&);
-
- virtual process::Future<Nothing> provision(
- const std::vector<std::string>& layers,
- const std::string& rootfs);
-
- virtual process::Future<bool> destroy(const std::string& rootfs);
-
-private:
- explicit BindBackend(process::Owned<BindBackendProcess> process);
-
- BindBackend(const BindBackend&); // Not copyable.
- BindBackend& operator=(const BindBackend&); // Not assignable.
-
- process::Owned<BindBackendProcess> process;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_BACKENDS_BIND_HPP__
[11/12] mesos git commit: Relocated MesosContainerizer specific files
to the correct location.
Posted by ji...@apache.org.
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/cgroups/perf_event.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/cgroups/perf_event.hpp b/src/slave/containerizer/isolators/cgroups/perf_event.hpp
deleted file mode 100644
index 1f722ef..0000000
--- a/src/slave/containerizer/isolators/cgroups/perf_event.hpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PERF_EVENT_ISOLATOR_HPP__
-#define __PERF_EVENT_ISOLATOR_HPP__
-
-#include <set>
-
-#include <process/time.hpp>
-
-#include <stout/hashmap.hpp>
-#include <stout/nothing.hpp>
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-class CgroupsPerfEventIsolatorProcess : public MesosIsolatorProcess
-{
-public:
- static Try<mesos::slave::Isolator*> create(const Flags& flags);
-
- virtual ~CgroupsPerfEventIsolatorProcess();
-
- virtual process::Future<Nothing> recover(
- const std::list<mesos::slave::ContainerState>& states,
- const hashset<ContainerID>& orphans);
-
- virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
- const ContainerID& containerId,
- const ExecutorInfo& executorInfo,
- const std::string& directory,
- const Option<std::string>& user);
-
- virtual process::Future<Nothing> isolate(
- const ContainerID& containerId,
- pid_t pid);
-
- virtual process::Future<mesos::slave::ContainerLimitation> watch(
- const ContainerID& containerId);
-
- virtual process::Future<Nothing> update(
- const ContainerID& containerId,
- const Resources& resources);
-
- virtual process::Future<ResourceStatistics> usage(
- const ContainerID& containerId);
-
- virtual process::Future<Nothing> cleanup(
- const ContainerID& containerId);
-
-protected:
- virtual void initialize();
-
-private:
- CgroupsPerfEventIsolatorProcess(
- const Flags& _flags,
- const std::string& _hierarchy,
- const std::set<std::string>& _events)
- : flags(_flags),
- hierarchy(_hierarchy),
- events(_events) {}
-
- void sample();
-
- void _sample(
- const process::Time& next,
- const process::Future<hashmap<std::string, PerfStatistics>>& statistics);
-
- virtual process::Future<Nothing> _cleanup(const ContainerID& containerId);
-
- struct Info
- {
- Info(const ContainerID& _containerId, const std::string& _cgroup)
- : containerId(_containerId), cgroup(_cgroup), destroying(false)
- {
- // Ensure the initial statistics include the required fields.
- // Note the duration is set to zero to indicate no sampling has
- // taken place. This empty sample will be returned from usage()
- // until the first true sample is obtained.
- statistics.set_timestamp(process::Clock::now().secs());
- statistics.set_duration(Seconds(0).secs());
- }
-
- const ContainerID containerId;
- const std::string cgroup;
- PerfStatistics statistics;
- // Mark a container when we start destruction so we stop sampling it.
- bool destroying;
- };
-
- const Flags flags;
-
- // The path to the cgroups subsystem hierarchy root.
- const std::string hierarchy;
-
- // Set of events to sample.
- std::set<std::string> events;
-
- // TODO(jieyu): Use Owned<Info>.
- hashmap<ContainerID, Info*> infos;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PERF_EVENT_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/filesystem/linux.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/filesystem/linux.cpp b/src/slave/containerizer/isolators/filesystem/linux.cpp
deleted file mode 100644
index 8823b78..0000000
--- a/src/slave/containerizer/isolators/filesystem/linux.cpp
+++ /dev/null
@@ -1,893 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <list>
-#include <sstream>
-#include <string>
-
-#include <glog/logging.h>
-
-#include <process/collect.hpp>
-
-#include <process/metrics/metrics.hpp>
-
-#include <stout/error.hpp>
-#include <stout/foreach.hpp>
-#include <stout/os.hpp>
-#include <stout/path.hpp>
-#include <stout/stringify.hpp>
-#include <stout/strings.hpp>
-
-#include <stout/os/shell.hpp>
-
-#include "linux/fs.hpp"
-#include "linux/ns.hpp"
-
-#include "slave/paths.hpp"
-
-#include "slave/containerizer/isolators/filesystem/linux.hpp"
-
-using namespace process;
-
-using std::list;
-using std::ostringstream;
-using std::string;
-
-using mesos::slave::ContainerState;
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::Isolator;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-Try<Isolator*> LinuxFilesystemIsolatorProcess::create(
- const Flags& flags,
- const Owned<Provisioner>& provisioner)
-{
- Result<string> user = os::user();
- if (!user.isSome()) {
- return Error("Failed to determine user: " +
- (user.isError() ? user.error() : "username not found"));
- }
-
- if (user.get() != "root") {
- return Error("LinuxFilesystemIsolator requires root privileges");
- }
-
- // Make slave's work_dir a shared mount so that when forking a child
- // process (with a new mount namespace), the child process does not
- // hold extra references to container's work directory mounts and
- // provisioner mounts (e.g., when using the bind backend) because
- // cleanup operations within work_dir can be propagted to all
- // container namespaces. See MESOS-3483 for more details.
- LOG(INFO) << "Making '" << flags.work_dir << "' a shared mount";
-
- Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
- if (table.isError()) {
- return Error("Failed to get mount table: " + table.error());
- }
-
- Option<fs::MountInfoTable::Entry> workDirMount;
- foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
- // TODO(jieyu): Make sure 'flags.work_dir' is a canonical path.
- if (entry.target == flags.work_dir) {
- workDirMount = entry;
- break;
- }
- }
-
- // Do a self bind mount if needed. If the mount already exists, make
- // sure it is a shared mount of its own peer group.
- if (workDirMount.isNone()) {
- // NOTE: Instead of using fs::mount to perform the bind mount, we
- // use the shell command here because the syscall 'mount' does not
- // update the mount table (i.e., /etc/mtab). In other words, the
- // mount will not be visible if the operator types command
- // 'mount'. Since this mount will still be presented after all
- // containers and the slave are stopped, it's better to make it
- // visible. It's OK to use the blocking os::shell here because
- // 'create' will only be invoked during initialization.
- Try<string> mount = os::shell(
- "mount --bind %s %s && "
- "mount --make-slave %s && "
- "mount --make-shared %s",
- flags.work_dir.c_str(),
- flags.work_dir.c_str(),
- flags.work_dir.c_str(),
- flags.work_dir.c_str());
-
- if (mount.isError()) {
- return Error(
- "Failed to self bind mount '" + flags.work_dir +
- "' and make it a shared mount: " + mount.error());
- }
- } else {
- if (workDirMount.get().shared().isNone()) {
- // This is the case where the work directory mount is not a
- // shared mount yet (possibly due to slave crash while preparing
- // the work directory mount). It's safe to re-do the following.
- Try<string> mount = os::shell(
- "mount --make-slave %s && "
- "mount --make-shared %s",
- flags.work_dir.c_str(),
- flags.work_dir.c_str());
-
- if (mount.isError()) {
- return Error(
- "Failed to self bind mount '" + flags.work_dir +
- "' and make it a shared mount: " + mount.error());
- }
- } else {
- // We need to make sure that the shared mount is in its own peer
- // group. To check that, we need to get the parent mount.
- foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
- if (entry.id == workDirMount.get().parent) {
- // If the work directory mount and its parent mount are in
- // the same peer group, we need to re-do the following
- // commands so that they are in different peer groups.
- if (entry.shared() == workDirMount.get().shared()) {
- Try<string> mount = os::shell(
- "mount --make-slave %s && "
- "mount --make-shared %s",
- flags.work_dir.c_str(),
- flags.work_dir.c_str());
-
- if (mount.isError()) {
- return Error(
- "Failed to self bind mount '" + flags.work_dir +
- "' and make it a shared mount: " + mount.error());
- }
- }
-
- break;
- }
- }
- }
- }
-
- Owned<MesosIsolatorProcess> process(
- new LinuxFilesystemIsolatorProcess(flags, provisioner));
-
- return new MesosIsolator(process);
-}
-
-
-LinuxFilesystemIsolatorProcess::LinuxFilesystemIsolatorProcess(
- const Flags& _flags,
- const Owned<Provisioner>& _provisioner)
- : flags(_flags),
- provisioner(_provisioner),
- metrics(PID<LinuxFilesystemIsolatorProcess>(this)) {}
-
-
-LinuxFilesystemIsolatorProcess::~LinuxFilesystemIsolatorProcess() {}
-
-
-Future<Nothing> LinuxFilesystemIsolatorProcess::recover(
- const list<ContainerState>& states,
- const hashset<ContainerID>& orphans)
-{
- // Read the mount table in the host mount namespace to recover paths
- // to containers' work directories if their root filesystems are
- // changed. Method 'cleanup()' relies on this information to clean
- // up mounts in the host mount namespace for each container.
- Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
- if (table.isError()) {
- return Failure("Failed to get mount table: " + table.error());
- }
-
- foreach (const ContainerState& state, states) {
- Owned<Info> info(new Info(state.directory()));
-
- foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
- if (entry.root == info->directory) {
- info->sandbox = entry.target;
- break;
- }
- }
-
- infos.put(state.container_id(), info);
- }
-
- // Recover both known and unknown orphans by scanning the mount
- // table and finding those mounts whose roots are under slave's
- // sandbox root directory. Those mounts are container's work
- // directory mounts. Mounts from unknown orphans will be cleaned up
- // immediately. Mounts from known orphans will be cleaned up when
- // those known orphan containers are being destroyed by the slave.
- hashset<ContainerID> unknownOrphans;
-
- string sandboxRootDir = paths::getSandboxRootDir(flags.work_dir);
-
- foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
- if (!strings::startsWith(entry.root, sandboxRootDir)) {
- continue;
- }
-
- // TODO(jieyu): Here, we retrieve the container ID by taking the
- // basename of 'entry.root'. This assumes that the slave's sandbox
- // root directory are organized according to the comments in the
- // beginning of slave/paths.hpp.
- ContainerID containerId;
- containerId.set_value(Path(entry.root).basename());
-
- if (infos.contains(containerId)) {
- continue;
- }
-
- Owned<Info> info(new Info(entry.root));
-
- if (entry.root != entry.target) {
- info->sandbox = entry.target;
- }
-
- infos.put(containerId, info);
-
- // Remember all the unknown orphan containers.
- if (!orphans.contains(containerId)) {
- unknownOrphans.insert(containerId);
- }
- }
-
- // Cleanup mounts from unknown orphans.
- list<Future<Nothing>> futures;
- foreach (const ContainerID& containerId, unknownOrphans) {
- futures.push_back(cleanup(containerId));
- }
-
- return collect(futures)
- .then(defer(PID<LinuxFilesystemIsolatorProcess>(this),
- &LinuxFilesystemIsolatorProcess::_recover,
- states,
- orphans));
-}
-
-
-Future<Nothing> LinuxFilesystemIsolatorProcess::_recover(
- const list<ContainerState>& states,
- const hashset<ContainerID>& orphans)
-{
- return provisioner->recover(states, orphans)
- .then([]() -> Future<Nothing> { return Nothing(); });
-}
-
-
-Future<Option<ContainerPrepareInfo>> LinuxFilesystemIsolatorProcess::prepare(
- const ContainerID& containerId,
- const ExecutorInfo& executorInfo,
- const string& directory,
- const Option<string>& user)
-{
- if (infos.contains(containerId)) {
- return Failure("Container has already been prepared");
- }
-
- infos.put(containerId, Owned<Info>(new Info(directory)));
-
- if (!executorInfo.has_container()) {
- return __prepare(containerId, executorInfo, directory, user, None());
- }
-
- // Provision the root filesystem if needed.
- CHECK_EQ(executorInfo.container().type(), ContainerInfo::MESOS);
-
- if (!executorInfo.container().mesos().has_image()) {
- return _prepare(containerId, executorInfo, directory, user, None());
- }
-
- const Image& image = executorInfo.container().mesos().image();
-
- return provisioner->provision(containerId, image)
- .then(defer(PID<LinuxFilesystemIsolatorProcess>(this),
- &LinuxFilesystemIsolatorProcess::_prepare,
- containerId,
- executorInfo,
- directory,
- user,
- lambda::_1));
-}
-
-
-Future<Option<ContainerPrepareInfo>> LinuxFilesystemIsolatorProcess::_prepare(
- const ContainerID& containerId,
- const ExecutorInfo& executorInfo,
- const string& directory,
- const Option<string>& user,
- const Option<string>& rootfs)
-{
- CHECK(executorInfo.has_container());
- CHECK_EQ(executorInfo.container().type(), ContainerInfo::MESOS);
-
- // We will provision the images specified in ContainerInfo::volumes
- // as well. We will mutate ContainerInfo::volumes to include the
- // paths to the provisioned root filesystems (by setting the
- // 'host_path') if the volume specifies an image as the source.
- Owned<ExecutorInfo> _executorInfo(new ExecutorInfo(executorInfo));
- list<Future<Nothing>> futures;
-
- for (int i = 0; i < _executorInfo->container().volumes_size(); i++) {
- Volume* volume = _executorInfo->mutable_container()->mutable_volumes(i);
-
- if (!volume->has_image()) {
- continue;
- }
-
- const Image& image = volume->image();
-
- futures.push_back(
- provisioner->provision(containerId, image)
- .then([volume](const string& path) -> Future<Nothing> {
- volume->set_host_path(path);
- return Nothing();
- }));
- }
-
- return collect(futures)
- .then([=]() -> Future<Option<ContainerPrepareInfo>> {
- return __prepare(containerId, *_executorInfo, directory, user, rootfs);
- });
-}
-
-
-Future<Option<ContainerPrepareInfo>> LinuxFilesystemIsolatorProcess::__prepare(
- const ContainerID& containerId,
- const ExecutorInfo& executorInfo,
- const string& directory,
- const Option<string>& user,
- const Option<string>& rootfs)
-{
- CHECK(infos.contains(containerId));
-
- const Owned<Info>& info = infos[containerId];
-
- ContainerPrepareInfo prepareInfo;
- prepareInfo.set_namespaces(CLONE_NEWNS);
-
- if (rootfs.isSome()) {
- // If the container changes its root filesystem, we need to mount
- // the container's work directory into its root filesystem
- // (creating it if needed) so that the executor and the task can
- // access the work directory.
- //
- // NOTE: The mount of the work directory must be a shared mount in
- // the host filesystem so that any mounts underneath it will
- // propagate into the container's mount namespace. This is how we
- // can update persistent volumes for the container.
-
- // This is the mount point of the work directory in the root filesystem.
- const string sandbox = path::join(rootfs.get(), flags.sandbox_directory);
-
- // Save the path 'sandbox' which will be used in 'cleanup()'.
- info->sandbox = sandbox;
-
- if (!os::exists(sandbox)) {
- Try<Nothing> mkdir = os::mkdir(sandbox);
- if (mkdir.isError()) {
- return Failure(
- "Failed to create sandbox mount point at '" +
- sandbox + "': " + mkdir.error());
- }
- }
-
- LOG(INFO) << "Bind mounting work directory from '" << directory
- << "' to '" << sandbox << "' for container " << containerId;
-
- Try<Nothing> mount = fs::mount(
- directory,
- sandbox,
- None(),
- MS_BIND,
- NULL);
-
- if (mount.isError()) {
- return Failure(
- "Failed to mount work directory '" + directory +
- "' to '" + sandbox + "': " + mount.error());
- }
-
- mount = fs::mount(
- None(),
- sandbox,
- None(),
- MS_SLAVE,
- NULL);
-
- if (mount.isError()) {
- return Failure(
- "Failed to mark sandbox '" + sandbox +
- "' as a slave mount: " + mount.error());
- }
-
- mount = fs::mount(
- None(),
- sandbox,
- None(),
- MS_SHARED,
- NULL);
-
- if (mount.isError()) {
- return Failure(
- "Failed to mark sandbox '" + sandbox +
- "' as a shared mount: " + mount.error());
- }
-
- prepareInfo.set_rootfs(rootfs.get());
- }
-
- // Prepare the commands that will be run in the container's mount
- // namespace right after forking the executor process. We use these
- // commands to mount those volumes specified in the container info
- // so that they don't pollute the host mount namespace.
- Try<string> _script = script(containerId, executorInfo, directory, rootfs);
- if (_script.isError()) {
- return Failure("Failed to generate isolation script: " + _script.error());
- }
-
- CommandInfo* command = prepareInfo.add_commands();
- command->set_value(_script.get());
-
- return update(containerId, executorInfo.resources())
- .then([prepareInfo]() -> Future<Option<ContainerPrepareInfo>> {
- return prepareInfo;
- });
-}
-
-
-// Builds the shell script that is run in the container's mount
-// namespace right after the executor process is forked. The script
-// stops mount propagation back to the host, lazily unmounts the
-// mounts under the slave work directory that belong to other
-// containers, and bind mounts every volume listed in the executor's
-// container info.
-//
-// Returns an error if a volume has no 'host_path', if both of its
-// paths are relative, if an absolute source or target path does not
-// exist, or if a relative source/target directory cannot be created.
-Try<string> LinuxFilesystemIsolatorProcess::script(
-    const ContainerID& containerId,
-    const ExecutorInfo& executorInfo,
-    const string& directory,
-    const Option<string>& rootfs)
-{
-  // Quotes 'path' so that the shell treats it as a single literal
-  // word. Volume paths are supplied by the framework, so an embedded
-  // single quote must not be able to terminate the quoting and inject
-  // commands into the script. Inside single quotes the only special
-  // character is the single quote itself, which is emitted as '\''.
-  auto quote = [](const string& path) -> string {
-    return "'" + strings::replace(path, "'", "'\\''") + "'";
-  };
-
-  ostringstream out;
-  out << "#!/bin/sh\n";
-  out << "set -x -e\n";
-
-  // Make sure mounts in the container mount namespace do not
-  // propagate back to the host mount namespace.
-  out << "mount --make-rslave /\n";
-
-  // Try to unmount work directory mounts and persistent volume mounts
-  // for other containers to release the extra references to them.
-  // NOTE:
-  // 1) This doesn't completely eliminate the race condition between
-  //    this container copying mount table and other containers being
-  //    cleaned up. This is instead a best-effort attempt.
-  // 2) This script assumes that all the mounts the container needs
-  //    under the slave work directory have its container ID in the
-  //    path either for the mount source (e.g. sandbox self-bind mount)
-  //    or the mount target (e.g. mounting sandbox into new rootfs).
-  //
-  // TODO(xujyan): This command may fail if --work_dir is not specified
-  // with a real path as real paths are used in the mount table. It
-  // doesn't work when the paths contain reserved characters such as
-  // spaces either because such characters in mount info are encoded
-  // in the escaped form (i.e. '\0xx').
-  out << "grep -E '" << flags.work_dir << "/.+' /proc/self/mountinfo | "
-      << "grep -v '" << containerId.value() << "' | "
-      << "cut -d' ' -f5 | " // '-f5' is the mount target. See MountInfoTable.
-      << "xargs --no-run-if-empty umount -l || "
-      << "true \n"; // We mask errors in this command.
-
-  if (!executorInfo.has_container()) {
-    return out.str();
-  }
-
-  foreach (const Volume& volume, executorInfo.container().volumes()) {
-    if (!volume.has_host_path()) {
-      return Error("A volume misses 'host_path'");
-    }
-
-    // If both 'host_path' and 'container_path' are relative paths,
-    // return an error because the user can just directly access the
-    // volume in the work directory.
-    if (!strings::startsWith(volume.host_path(), "/") &&
-        !strings::startsWith(volume.container_path(), "/")) {
-      return Error(
-          "Both 'host_path' and 'container_path' of a volume are relative");
-    }
-
-    // Determine the source of the mount.
-    string source;
-
-    if (strings::startsWith(volume.host_path(), "/")) {
-      source = volume.host_path();
-
-      // An absolute path must already exist.
-      if (!os::exists(source)) {
-        return Error("Absolute host path does not exist");
-      }
-    } else {
-      // Path is interpreted as relative to the work directory.
-      source = path::join(directory, volume.host_path());
-
-      // TODO(jieyu): We need to check that source resolves under the
-      // work directory because a user can potentially use a container
-      // path like '../../abc'.
-
-      if (!os::exists(source)) {
-        Try<Nothing> mkdir = os::mkdir(source);
-        if (mkdir.isError()) {
-          return Error(
-              "Failed to create the source of the mount at '" +
-              source + "': " + mkdir.error());
-        }
-
-        // TODO(idownes): Consider setting ownership and mode.
-      }
-    }
-
-    // Determine the target of the mount.
-    string target;
-
-    if (strings::startsWith(volume.container_path(), "/")) {
-      if (rootfs.isSome()) {
-        target = path::join(rootfs.get(), volume.container_path());
-      } else {
-        target = volume.container_path();
-      }
-
-      // An absolute path must already exist. This is because we want
-      // to avoid creating mount points outside the work directory in
-      // the host filesystem or in the container filesystem root.
-      if (!os::exists(target)) {
-        return Error("Absolute container path does not exist");
-      }
-
-      // TODO(jieyu): We need to check that target resolves under
-      // 'rootfs' because a user can potentially use a container path
-      // like '/../../abc'.
-    } else {
-      if (rootfs.isSome()) {
-        target = path::join(rootfs.get(),
-                            flags.sandbox_directory,
-                            volume.container_path());
-      } else {
-        target = path::join(directory, volume.container_path());
-      }
-
-      // TODO(jieyu): We need to check that target resolves under the
-      // sandbox because a user can potentially use a container path
-      // like '../../abc'.
-
-      if (!os::exists(target)) {
-        Try<Nothing> mkdir = os::mkdir(target);
-        if (mkdir.isError()) {
-          return Error(
-              "Failed to create the target of the mount at '" +
-              target + "': " + mkdir.error());
-        }
-      }
-    }
-
-    // TODO(jieyu): Consider the mode in the volume.
-    out << "mount -n --rbind " << quote(source) << " " << quote(target)
-        << "\n";
-  }
-
-  return out.str();
-}
-
-
-Future<Nothing> LinuxFilesystemIsolatorProcess::isolate(
-    const ContainerID& containerId,
-    pid_t pid)
-{
-  // Nothing to do here: the isolation takes effect when the mount
-  // namespace is unshared.
-  return Nothing();
-}
-
-
-Future<ContainerLimitation> LinuxFilesystemIsolatorProcess::watch(
-    const ContainerID& containerId)
-{
-  // This isolator never reports a limitation, so hand back a
-  // default-constructed future.
-  return Future<ContainerLimitation>();
-}
-
-
-// Reconciles the persistent volume mounts of a container with the
-// given resources: volumes that are no longer present are unmounted
-// and their mount points removed, newly added volumes are chowned to
-// the owner of the container's work directory and bind mounted.
-Future<Nothing> LinuxFilesystemIsolatorProcess::update(
-    const ContainerID& containerId,
-    const Resources& resources)
-{
-  // Persistent volumes are mounted in the host namespace; we rely on
-  // mount propagation for them to be visible inside the container.
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  const Owned<Info>& info = infos[containerId];
-
-  // All persistent volume mount points live directly under this
-  // directory: the sandbox mount point in the new root filesystem if
-  // the container has one, the container's work directory otherwise.
-  const string mountRoot =
-    info->sandbox.isSome() ? info->sandbox.get() : info->directory;
-
-  Resources previous = info->resources;
-
-  // Step 1: drop the persistent volumes that are no longer needed.
-  foreach (const Resource& volume, previous.persistentVolumes()) {
-    // This is enforced by the master.
-    CHECK(volume.disk().has_volume());
-
-    // Absolute and nested container paths are not handled.
-    const string& containerPath = volume.disk().volume().container_path();
-    if (strings::contains(containerPath, "/")) {
-      LOG(WARNING) << "Skipping updating mount for persistent volume "
-                   << volume << " of container " << containerId
-                   << " because the container path '" << containerPath
-                   << "' contains slash";
-      continue;
-    }
-
-    // Still requested, so the mount stays.
-    if (resources.contains(volume)) {
-      continue;
-    }
-
-    const string target = path::join(mountRoot, containerPath);
-
-    LOG(INFO) << "Removing mount '" << target << "' for persistent volume "
-              << volume << " of container " << containerId;
-
-    // The unmount will fail if the task/executor is still using files
-    // or directories under 'target'.
-    Try<Nothing> unmount = fs::unmount(target);
-    if (unmount.isError()) {
-      return Failure(
-          "Failed to unmount unneeded persistent volume at '" +
-          target + "': " + unmount.error());
-    }
-
-    // NOTE: This is a non-recursive rmdir.
-    Try<Nothing> rmdir = os::rmdir(target, false);
-    if (rmdir.isError()) {
-      return Failure(
-          "Failed to remove persistent volume mount point at '" +
-          target + "': " + rmdir.error());
-    }
-  }
-
-  // Step 2: mount the persistent volumes that were added.
-  foreach (const Resource& volume, resources.persistentVolumes()) {
-    // This is enforced by the master.
-    CHECK(volume.disk().has_volume());
-
-    // Absolute and nested container paths are not handled.
-    const string& containerPath = volume.disk().volume().container_path();
-    if (strings::contains(containerPath, "/")) {
-      LOG(WARNING) << "Skipping updating mount for persistent volume "
-                   << volume << " of container " << containerId
-                   << " because the container path '" << containerPath
-                   << "' contains slash";
-      continue;
-    }
-
-    // Already tracked, so nothing to do for this volume.
-    if (previous.contains(volume)) {
-      continue;
-    }
-
-    // The source of the mount is the volume's directory under the
-    // slave work directory.
-    string source = paths::getPersistentVolumePath(
-        flags.work_dir,
-        volume.role(),
-        volume.disk().persistence().id());
-
-    // Make the persistent volume owned by whoever owns the sandbox
-    // directory.
-    //
-    // NOTE: Currently, persistent volumes in Mesos are exclusive,
-    // meaning that if a persistent volume is used by one task or
-    // executor, it cannot be concurrently used by other task or
-    // executor. But if we allow multiple executors to use same
-    // persistent volume at the same time in the future, the ownership
-    // of the persistent volume may conflict here.
-    //
-    // TODO(haosdent): Consider letting the frameworks specify the
-    // user/group of the persistent volumes.
-    struct stat s;
-    if (::stat(info->directory.c_str(), &s) < 0) {
-      return Failure(
-          "Failed to get ownership for '" + info->directory +
-          "': " + strerror(errno));
-    }
-
-    LOG(INFO) << "Changing the ownership of the persistent volume at '"
-              << source << "' with uid " << s.st_uid
-              << " and gid " << s.st_gid;
-
-    Try<Nothing> chown = os::chown(s.st_uid, s.st_gid, source, true);
-    if (chown.isError()) {
-      return Failure(
-          "Failed to change the ownership of the persistent volume at '" +
-          source + "' with uid " + stringify(s.st_uid) +
-          " and gid " + stringify(s.st_gid) + ": " + chown.error());
-    }
-
-    const string target = path::join(mountRoot, containerPath);
-
-    if (os::exists(target)) {
-      // NOTE: This is possible because 'info->resources' will be
-      // reset when slave restarts and recovers. When the slave calls
-      // 'containerizer->update' after the executor re-registers,
-      // we'll try to re-mount all the already mounted volumes.
-
-      // TODO(jieyu): Check the source of the mount matches the entry
-      // with the same target in the mount table if one can be found.
-      // If not, mount the persistent volume as we did below. This is
-      // possible because the slave could crash after it unmounts the
-      // volume but before it is able to delete the mount point.
-      continue;
-    }
-
-    Try<Nothing> mkdir = os::mkdir(target);
-    if (mkdir.isError()) {
-      return Failure(
-          "Failed to create persistent volume mount point at '" +
-          target + "': " + mkdir.error());
-    }
-
-    LOG(INFO) << "Mounting '" << source << "' to '" << target
-              << "' for persistent volume " << volume
-              << " of container " << containerId;
-
-    Try<Nothing> mount = fs::mount(source, target, None(), MS_BIND, NULL);
-    if (mount.isError()) {
-      return Failure(
-          "Failed to mount persistent volume from '" +
-          source + "' to '" + target + "': " + mount.error());
-    }
-  }
-
-  // Remember the new resources for the next update.
-  info->resources = resources;
-
-  return Nothing();
-}
-
-
-Future<ResourceStatistics> LinuxFilesystemIsolatorProcess::usage(
-    const ContainerID& containerId)
-{
-  // This isolator gathers no usage, so the statistics are returned
-  // default-constructed.
-  return ResourceStatistics();
-}
-
-
-// Forgets the container, unmounts everything that is mounted under
-// its sandbox/work directory in the host mount namespace (and the
-// sandbox/work directory mount itself, if present), and finally asks
-// the provisioner to destroy the container's root filesystems.
-// Unknown containers are ignored.
-Future<Nothing> LinuxFilesystemIsolatorProcess::cleanup(
-    const ContainerID& containerId)
-{
-  if (!infos.contains(containerId)) {
-    VLOG(1) << "Ignoring cleanup request for unknown container: "
-            << containerId;
-
-    return Nothing();
-  }
-
-  const Owned<Info>& info = infos[containerId];
-
-  // NOTE: We don't need to cleanup mounts in the container's mount
-  // namespace because it's done automatically by the kernel when the
-  // mount namespace is destroyed after the last process terminates.
-
-  // The path to the container's work directory, which is the parent
-  // of all the persistent volume mounts. It is copied here because
-  // 'info' is erased right below.
-  const string sandbox =
-    info->sandbox.isSome() ? info->sandbox.get() : info->directory;
-
-  infos.erase(containerId);
-
-  // Cleanup the mounts for this container in the host mount
-  // namespace, including container's work directory and all the
-  // persistent volume mounts.
-  Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
-  if (table.isError()) {
-    return Failure("Failed to get mount table: " + table.error());
-  }
-
-  bool sandboxMountExists = false;
-
-  foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
-    // NOTE: All persistent volumes are mounted at targets under the
-    // container's work directory. We unmount all the persistent
-    // volumes before unmounting the sandbox/work directory mount.
-    if (entry.target == sandbox) {
-      sandboxMountExists = true;
-      continue;
-    }
-
-    if (!strings::startsWith(entry.target, sandbox)) {
-      continue;
-    }
-
-    LOG(INFO) << "Unmounting volume '" << entry.target
-              << "' for container " << containerId;
-
-    Try<Nothing> unmount = fs::unmount(entry.target);
-    if (unmount.isError()) {
-      return Failure(
-          "Failed to unmount volume '" + entry.target +
-          "': " + unmount.error());
-    }
-  }
-
-  if (sandboxMountExists) {
-    LOG(INFO) << "Unmounting sandbox/work directory '" << sandbox
-              << "' for container " << containerId;
-
-    Try<Nothing> unmount = fs::unmount(sandbox);
-    if (unmount.isError()) {
-      return Failure(
-          "Failed to unmount sandbox/work directory '" + sandbox +
-          "': " + unmount.error());
-    }
-  } else {
-    // This could happen if the container was not launched by this
-    // isolator (e.g., slaves prior to 0.25.0), or the container did
-    // not specify a root filesystem.
-    LOG(INFO) << "Ignoring unmounting sandbox/work directory"
-              << " for container " << containerId;
-  }
-
-  // Destroy the provisioned root filesystems.
-  return provisioner->destroy(containerId)
-    .then([]() -> Future<Nothing> { return Nothing(); });
-}
-
-
-// Sets up the 'containers_new_rootfs' gauge, whose value is obtained
-// by deferring to '_containers_new_rootfs' on the isolator process,
-// and registers it with the metrics library.
-LinuxFilesystemIsolatorProcess::Metrics::Metrics(
-    const PID<LinuxFilesystemIsolatorProcess>& isolator)
-  : containers_new_rootfs(
-        "containerizer/mesos/filesystem/containers_new_rootfs",
-        defer(
-            isolator,
-            &LinuxFilesystemIsolatorProcess::_containers_new_rootfs))
-{
-  process::metrics::add(containers_new_rootfs);
-}
-
-
-// Unregisters the gauge that the constructor added.
-LinuxFilesystemIsolatorProcess::Metrics::~Metrics()
-{
-  process::metrics::remove(containers_new_rootfs);
-}
-
-
-// Gauge callback: the number of tracked containers whose 'sandbox'
-// field is set.
-double LinuxFilesystemIsolatorProcess::_containers_new_rootfs()
-{
-  double total = 0.0;
-
-  foreachvalue (const Owned<Info>& info, infos) {
-    total += info->sandbox.isSome() ? 1.0 : 0.0;
-  }
-
-  return total;
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/filesystem/linux.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/filesystem/linux.hpp b/src/slave/containerizer/isolators/filesystem/linux.hpp
deleted file mode 100644
index 93e85f2..0000000
--- a/src/slave/containerizer/isolators/filesystem/linux.hpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __LINUX_FILESYSTEM_ISOLATOR_HPP__
-#define __LINUX_FILESYSTEM_ISOLATOR_HPP__
-
-#include <mesos/mesos.hpp>
-#include <mesos/resources.hpp>
-
-#include <process/owned.hpp>
-#include <process/pid.hpp>
-
-#include <process/metrics/gauge.hpp>
-
-#include <stout/hashmap.hpp>
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-#include "slave/containerizer/provisioner/provisioner.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// The filesystem isolator on Linux that is responsible for preparing
-// the root filesystems and volumes (e.g., persistent volumes) for
-// containers. It relies on Linux mount namespace to prevent mounts of
-// a container from being propagated to the host mount table.
-class LinuxFilesystemIsolatorProcess : public MesosIsolatorProcess
-{
-public:
-  // Factory; instances cannot be constructed directly because the
-  // constructor is private.
-  static Try<mesos::slave::Isolator*> create(
-      const Flags& flags,
-      const process::Owned<Provisioner>& provisioner);
-
-  virtual ~LinuxFilesystemIsolatorProcess();
-
-  virtual process::Future<Nothing> recover(
-      const std::list<mesos::slave::ContainerState>& states,
-      const hashset<ContainerID>& orphans);
-
-  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& user);
-
-  // No-op: isolation happens when the mount namespace is unshared.
-  virtual process::Future<Nothing> isolate(
-      const ContainerID& containerId,
-      pid_t pid);
-
-  // No-op: returns a default-constructed future.
-  virtual process::Future<mesos::slave::ContainerLimitation> watch(
-      const ContainerID& containerId);
-
-  // Unmounts the persistent volumes that are no longer part of
-  // 'resources' and mounts the ones that were added.
-  virtual process::Future<Nothing> update(
-      const ContainerID& containerId,
-      const Resources& resources);
-
-  // No-op: no usage is gathered.
-  virtual process::Future<ResourceStatistics> usage(
-      const ContainerID& containerId);
-
-  // Unmounts the container's mounts in the host mount namespace and
-  // destroys its provisioned root filesystems.
-  virtual process::Future<Nothing> cleanup(
-      const ContainerID& containerId);
-
-private:
-  LinuxFilesystemIsolatorProcess(
-      const Flags& flags,
-      const process::Owned<Provisioner>& provisioner);
-
-  // Internal helpers; judging by their names these are the
-  // continuation steps of 'recover' and 'prepare'.
-  process::Future<Nothing> _recover(
-      const std::list<mesos::slave::ContainerState>& states,
-      const hashset<ContainerID>& orphans);
-
-  process::Future<Option<mesos::slave::ContainerPrepareInfo>> _prepare(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& user,
-      const Option<std::string>& rootfs);
-
-  process::Future<Option<mesos::slave::ContainerPrepareInfo>> __prepare(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& user,
-      const Option<std::string>& rootfs);
-
-  // Generates the shell script that mounts the container's volumes
-  // inside the container's mount namespace.
-  Try<std::string> script(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& rootfs);
-
-  const Flags flags;
-  const process::Owned<Provisioner> provisioner;
-
-  // Per-container state tracked by this isolator.
-  struct Info
-  {
-    // 'explicit' so that a string is never silently converted into an
-    // 'Info'.
-    explicit Info(const std::string& _directory) : directory(_directory) {}
-
-    // The container's work directory.
-    const std::string directory;
-
-    // The absolute path to the container's work directory mount point
-    // in the new root filesystem if the container changes its root
-    // filesystem (i.e., '<rootfs>/<flags.sandbox_directory>'). If the
-    // container does not specify a root filesystem, this field will
-    // not be set.
-    Option<std::string> sandbox;
-
-    // Track resources so we can unmount unneeded persistent volumes.
-    Resources resources;
-  };
-
-  hashmap<ContainerID, process::Owned<Info>> infos;
-
-  // Holds the isolator's gauge; it is added to the metrics library on
-  // construction and removed again on destruction.
-  struct Metrics
-  {
-    explicit Metrics(
-        const process::PID<LinuxFilesystemIsolatorProcess>& isolator);
-    ~Metrics();
-
-    process::metrics::Gauge containers_new_rootfs;
-  } metrics;
-
-  // Gauge callback: counts the tracked containers that have 'sandbox'
-  // set.
-  double _containers_new_rootfs();
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __LINUX_FILESYSTEM_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/filesystem/posix.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/filesystem/posix.cpp b/src/slave/containerizer/isolators/filesystem/posix.cpp
deleted file mode 100644
index eec510c..0000000
--- a/src/slave/containerizer/isolators/filesystem/posix.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <list>
-#include <string>
-
-#include <stout/fs.hpp>
-#include <stout/os.hpp>
-#include <stout/path.hpp>
-
-#include "slave/paths.hpp"
-
-#include "slave/containerizer/isolators/filesystem/posix.hpp"
-
-using namespace process;
-
-using std::list;
-using std::string;
-
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::ContainerState;
-using mesos::slave::Isolator;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// Keeps a copy of the slave flags; no other state is initialized.
-PosixFilesystemIsolatorProcess::PosixFilesystemIsolatorProcess(
-    const Flags& _flags)
-  : flags(_flags)
-{
-}
-
-
-// Nothing to release explicitly.
-PosixFilesystemIsolatorProcess::~PosixFilesystemIsolatorProcess()
-{
-}
-
-
-// Factory: wraps a new isolator process in a 'MesosIsolator'. The
-// returned pointer is allocated with 'new'.
-Try<Isolator*> PosixFilesystemIsolatorProcess::create(const Flags& flags)
-{
-  process::Owned<MesosIsolatorProcess> isolatorProcess(
-      new PosixFilesystemIsolatorProcess(flags));
-
-  return new MesosIsolator(isolatorProcess);
-}
-
-
-// Starts tracking every recovered container with its work directory.
-// The 'orphans' argument is not used.
-Future<Nothing> PosixFilesystemIsolatorProcess::recover(
-    const list<ContainerState>& states,
-    const hashset<ContainerID>& orphans)
-{
-  foreach (const ContainerState& recovered, states) {
-    Owned<Info> info(new Info(recovered.directory()));
-    infos.put(recovered.container_id(), info);
-  }
-
-  return Nothing();
-}
-
-
-// Starts tracking the container and links its persistent volumes
-// into the work directory via 'update'. Fails if the container is
-// already known or asks for its own root filesystem. On success no
-// prepare info is returned (i.e., 'None').
-Future<Option<ContainerPrepareInfo>> PosixFilesystemIsolatorProcess::prepare(
-    const ContainerID& containerId,
-    const ExecutorInfo& executorInfo,
-    const string& directory,
-    const Option<string>& user)
-{
-  if (infos.contains(containerId)) {
-    return Failure("Container has already been prepared");
-  }
-
-  // A container that changes its filesystem root is rejected because
-  // the symlinks would become invalid in the new root.
-  if (executorInfo.has_container()) {
-    const ContainerInfo& container = executorInfo.container();
-
-    CHECK_EQ(container.type(), ContainerInfo::MESOS);
-
-    if (container.mesos().has_image()) {
-      return Failure("Container root filesystems not supported");
-    }
-
-    // TODO(jieyu): Also return a failure if there exists images in
-    // the specified volumes.
-  }
-
-  Owned<Info> info(new Info(directory));
-  infos.put(containerId, info);
-
-  Future<Nothing> updated = update(containerId, executorInfo.resources());
-
-  return updated
-    .then([]() -> Future<Option<ContainerPrepareInfo>> { return None(); });
-}
-
-
-Future<Nothing> PosixFilesystemIsolatorProcess::isolate(
-    const ContainerID& containerId,
-    pid_t pid)
-{
-  // Nothing to do for this isolator.
-  return Nothing();
-}
-
-
-Future<ContainerLimitation> PosixFilesystemIsolatorProcess::watch(
-    const ContainerID& containerId)
-{
-  // This isolator never reports a limitation, so hand back a
-  // default-constructed future.
-  return Future<ContainerLimitation>();
-}
-
-
-// Reconciles the persistent volume symlinks in the container's work
-// directory with the given resources: symlinks of volumes that are no
-// longer present are removed, and newly added volumes are chowned to
-// the owner of the work directory and symlinked into it.
-//
-// Fails if the container is unknown, or if any of the filesystem
-// operations fails, or if an already existing symlink resolves to a
-// different path than the volume it is supposed to point to.
-Future<Nothing> PosixFilesystemIsolatorProcess::update(
-    const ContainerID& containerId,
-    const Resources& resources)
-{
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  const Owned<Info>& info = infos[containerId];
-
-  // TODO(jieyu): Currently, we only allow non-nested relative
-  // container paths for volumes. This is enforced by the master. For
-  // those volumes, we create symlinks in the executor directory.
-  Resources current = info->resources;
-
-  // We first remove unneeded persistent volumes.
-  foreach (const Resource& resource, current.persistentVolumes()) {
-    // This is enforced by the master.
-    CHECK(resource.disk().has_volume());
-
-    // Ignore absolute and nested paths.
-    const string& containerPath = resource.disk().volume().container_path();
-    if (strings::contains(containerPath, "/")) {
-      LOG(WARNING) << "Skipping updating symlink for persistent volume "
-                   << resource << " of container " << containerId
-                   << " because the container path '" << containerPath
-                   << "' contains slash";
-      continue;
-    }
-
-    if (resources.contains(resource)) {
-      continue;
-    }
-
-    string link = path::join(info->directory, containerPath);
-
-    LOG(INFO) << "Removing symlink '" << link << "' for persistent volume "
-              << resource << " of container " << containerId;
-
-    Try<Nothing> rm = os::rm(link);
-    if (rm.isError()) {
-      // Surface the underlying error so the failure can be diagnosed.
-      return Failure(
-          "Failed to remove the symlink for the unneeded "
-          "persistent volume at '" + link + "': " + rm.error());
-    }
-  }
-
-  // We then link additional persistent volumes.
-  foreach (const Resource& resource, resources.persistentVolumes()) {
-    // This is enforced by the master.
-    CHECK(resource.disk().has_volume());
-
-    // Ignore absolute and nested paths.
-    const string& containerPath = resource.disk().volume().container_path();
-    if (strings::contains(containerPath, "/")) {
-      LOG(WARNING) << "Skipping updating symlink for persistent volume "
-                   << resource << " of container " << containerId
-                   << " because the container path '" << containerPath
-                   << "' contains slash";
-      continue;
-    }
-
-    if (current.contains(resource)) {
-      continue;
-    }
-
-    string original = paths::getPersistentVolumePath(
-        flags.work_dir,
-        resource.role(),
-        resource.disk().persistence().id());
-
-    // Set the ownership of the persistent volume to match that of the
-    // sandbox directory.
-    //
-    // NOTE: Currently, persistent volumes in Mesos are exclusive,
-    // meaning that if a persistent volume is used by one task or
-    // executor, it cannot be concurrently used by other task or
-    // executor. But if we allow multiple executors to use same
-    // persistent volume at the same time in the future, the ownership
-    // of the persistent volume may conflict here.
-    //
-    // TODO(haosdent): Consider letting the frameworks specify the
-    // user/group of the persistent volumes.
-    struct stat s;
-    if (::stat(info->directory.c_str(), &s) < 0) {
-      return Failure(
-          "Failed to get ownership for '" + info->directory +
-          "': " + strerror(errno));
-    }
-
-    LOG(INFO) << "Changing the ownership of the persistent volume at '"
-              << original << "' with uid " << s.st_uid
-              << " and gid " << s.st_gid;
-
-    Try<Nothing> chown = os::chown(s.st_uid, s.st_gid, original, true);
-    if (chown.isError()) {
-      return Failure(
-          "Failed to change the ownership of the persistent volume at '" +
-          original + "' with uid " + stringify(s.st_uid) +
-          " and gid " + stringify(s.st_gid) + ": " + chown.error());
-    }
-
-    string link = path::join(info->directory, containerPath);
-
-    if (os::exists(link)) {
-      // NOTE: This is possible because 'info->resources' will be
-      // reset when slave restarts and recovers. When the slave calls
-      // 'containerizer->update' after the executor re-registers,
-      // we'll try to relink all the already symlinked volumes.
-      Result<string> realpath = os::realpath(link);
-      if (!realpath.isSome()) {
-        return Failure(
-            "Failed to get the realpath of symlink '" + link + "': " +
-            (realpath.isError() ? realpath.error() : "No such directory"));
-      }
-
-      // A sanity check to make sure the target of the symlink does
-      // not change. In fact, this is not supposed to happen.
-      // NOTE: Here, we compare the realpaths because 'original' might
-      // contain symbolic links.
-      Result<string> _original = os::realpath(original);
-      if (!_original.isSome()) {
-        return Failure(
-            "Failed to get the realpath of volume '" + original + "': " +
-            (_original.isError() ? _original.error() : "No such directory"));
-      }
-
-      if (realpath.get() != _original.get()) {
-        // 'realpath' is where the existing symlink resolves to and
-        // '_original' is the volume we were asked to link.
-        return Failure(
-            "The existing symlink '" + link + "' points to '" +
-            realpath.get() + "' and the new target is '" +
-            _original.get() + "'");
-      }
-    } else {
-      LOG(INFO) << "Adding symlink from '" << original << "' to '"
-                << link << "' for persistent volume " << resource
-                << " of container " << containerId;
-
-      Try<Nothing> symlink = ::fs::symlink(original, link);
-      if (symlink.isError()) {
-        // Surface the underlying error so the failure can be diagnosed.
-        return Failure(
-            "Failed to symlink persistent volume from '" +
-            original + "' to '" + link + "': " + symlink.error());
-      }
-    }
-  }
-
-  // Store the updated resources.
-  info->resources = resources;
-
-  return Nothing();
-}
-
-
-Future<ResourceStatistics> PosixFilesystemIsolatorProcess::usage(
-    const ContainerID& containerId)
-{
-  // This isolator gathers no usage, so the statistics are returned
-  // default-constructed.
-  return ResourceStatistics();
-}
-
-
-// Stops tracking the container; nothing on disk is touched.
-Future<Nothing> PosixFilesystemIsolatorProcess::cleanup(
-    const ContainerID& containerId)
-{
-  // No explicit cleanup of the persistent volume symlinks is needed:
-  // they are removed when the work directory is GC'ed.
-  infos.erase(containerId);
-
-  return Nothing();
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/filesystem/posix.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/filesystem/posix.hpp b/src/slave/containerizer/isolators/filesystem/posix.hpp
deleted file mode 100644
index 61b81dd..0000000
--- a/src/slave/containerizer/isolators/filesystem/posix.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __POSIX_FILESYSTEM_ISOLATOR_HPP__
-#define __POSIX_FILESYSTEM_ISOLATOR_HPP__
-
-#include <mesos/resources.hpp>
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// A filesystem isolator that exposes persistent volumes to a
-// container by symlinking them into the container's work directory.
-// Containers that specify their own root filesystem image are
-// rejected in 'prepare'.
-class PosixFilesystemIsolatorProcess : public MesosIsolatorProcess
-{
-public:
-  // Factory; instances cannot be constructed directly because the
-  // constructor is private.
-  static Try<mesos::slave::Isolator*> create(const Flags& flags);
-
-  virtual ~PosixFilesystemIsolatorProcess();
-
-  // Starts tracking the recovered containers.
-  virtual process::Future<Nothing> recover(
-      const std::list<mesos::slave::ContainerState>& states,
-      const hashset<ContainerID>& orphans);
-
-  // Starts tracking the container and links its persistent volumes.
-  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& user);
-
-  // No-op.
-  virtual process::Future<Nothing> isolate(
-      const ContainerID& containerId,
-      pid_t pid);
-
-  // No-op: returns a default-constructed future.
-  virtual process::Future<mesos::slave::ContainerLimitation> watch(
-      const ContainerID& containerId);
-
-  // Removes the symlinks of persistent volumes that are no longer
-  // part of 'resources' and adds symlinks for the new ones.
-  virtual process::Future<Nothing> update(
-      const ContainerID& containerId,
-      const Resources& resources);
-
-  // No-op: no usage is gathered.
-  virtual process::Future<ResourceStatistics> usage(
-      const ContainerID& containerId);
-
-  // Stops tracking the container.
-  virtual process::Future<Nothing> cleanup(
-      const ContainerID& containerId);
-
-private:
-  // 'explicit' so that 'Flags' is never silently converted into an
-  // isolator process.
-  explicit PosixFilesystemIsolatorProcess(const Flags& flags);
-
-  const Flags flags;
-
-  // Per-container state tracked by this isolator.
-  struct Info
-  {
-    explicit Info(const std::string& _directory)
-      : directory(_directory) {}
-
-    // The container's work directory, in which the symlinks live.
-    const std::string directory;
-
-    // Track resources so we can unlink unneeded persistent volumes.
-    Resources resources;
-  };
-
-  hashmap<ContainerID, process::Owned<Info>> infos;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __POSIX_FILESYSTEM_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/filesystem/shared.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/filesystem/shared.cpp b/src/slave/containerizer/isolators/filesystem/shared.cpp
deleted file mode 100644
index 73804ca..0000000
--- a/src/slave/containerizer/isolators/filesystem/shared.cpp
+++ /dev/null
@@ -1,268 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <set>
-
-#include "linux/ns.hpp"
-
-#include "slave/containerizer/isolators/filesystem/shared.hpp"
-
-using namespace process;
-
-using std::list;
-using std::set;
-using std::string;
-
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::ContainerState;
-using mesos::slave::Isolator;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-SharedFilesystemIsolatorProcess::SharedFilesystemIsolatorProcess(
- const Flags& _flags)
- : flags(_flags) {}
-
-
-SharedFilesystemIsolatorProcess::~SharedFilesystemIsolatorProcess() {}
-
-
-Try<Isolator*> SharedFilesystemIsolatorProcess::create(const Flags& flags)
-{
- Result<string> user = os::user();
- if (!user.isSome()) {
- return Error("Failed to determine user: " +
- (user.isError() ? user.error() : "username not found"));
- }
-
- if (user.get() != "root") {
- return Error("SharedFilesystemIsolator requires root privileges");
- }
-
- process::Owned<MesosIsolatorProcess> process(
- new SharedFilesystemIsolatorProcess(flags));
-
- return new MesosIsolator(process);
-}
-
-
-Future<Nothing> SharedFilesystemIsolatorProcess::recover(
- const list<ContainerState>& states,
- const hashset<ContainerID>& orphans)
-{
- // There is nothing to recover because we do not keep any state and
- // do not monitor filesystem usage or perform any action on cleanup.
- return Nothing();
-}
-
-
-Future<Option<ContainerPrepareInfo>> SharedFilesystemIsolatorProcess::prepare(
- const ContainerID& containerId,
- const ExecutorInfo& executorInfo,
- const string& directory,
- const Option<string>& user)
-{
- if (executorInfo.has_container() &&
- executorInfo.container().type() != ContainerInfo::MESOS) {
- return Failure("Can only prepare filesystem for a MESOS container");
- }
-
- LOG(INFO) << "Preparing shared filesystem for container: "
- << stringify(containerId);
-
- if (!executorInfo.has_container()) {
- // We don't consider this an error, there's just nothing to do so
- // we return None.
-
- return None();
- }
-
- // We don't support mounting to a container path which is a parent
- // to another container path as this can mask entries. We'll keep
- // track of all container paths so we can check this.
- set<string> containerPaths;
- containerPaths.insert(directory);
-
- ContainerPrepareInfo prepareInfo;
- prepareInfo.set_namespaces(CLONE_NEWNS);
-
- foreach (const Volume& volume, executorInfo.container().volumes()) {
- // Because the filesystem is shared we require the container path
- // already exist, otherwise containers can create arbitrary paths
- // outside their sandbox.
- if (!os::exists(volume.container_path())) {
- return Failure("Volume with container path '" +
- volume.container_path() +
- "' must exist on host for shared filesystem isolator");
- }
-
- // Host path must be provided.
- if (!volume.has_host_path()) {
- return Failure("Volume with container path '" +
- volume.container_path() +
- "' must specify host path for shared filesystem isolator");
- }
-
- // Check we won't mask another volume.
- // NOTE: Assuming here that the container path is absolute, see
- // Volume protobuf.
- // TODO(idownes): This test is unnecessarily strict and could be
- // relaxed if mounts could be re-ordered.
- foreach (const string& containerPath, containerPaths) {
- if (strings::startsWith(volume.container_path(), containerPath)) {
- return Failure("Cannot mount volume to '" +
- volume.container_path() +
- "' because it is under volume '" +
- containerPath +
- "'");
- }
-
- if (strings::startsWith(containerPath, volume.container_path())) {
- return Failure("Cannot mount volume to '" +
- containerPath +
- "' because it is under volume '" +
- volume.container_path() +
- "'");
- }
- }
- containerPaths.insert(volume.container_path());
-
- // A relative host path will be created in the container's work
- // directory, otherwise check it already exists.
- string hostPath;
- if (!strings::startsWith(volume.host_path(), "/")) {
- hostPath = path::join(directory, volume.host_path());
-
- // Do not support any relative components in the resulting path.
- // There should not be any links in the work directory to
- // resolve.
- if (strings::contains(hostPath, "/./") ||
- strings::contains(hostPath, "/../")) {
- return Failure("Relative host path '" +
- hostPath +
- "' cannot contain relative components");
- }
-
- Try<Nothing> mkdir = os::mkdir(hostPath, true);
- if (mkdir.isError()) {
- return Failure("Failed to create host_path '" +
- hostPath +
- "' for mount to '" +
- volume.container_path() +
- "': " +
- mkdir.error());
- }
-
- // Set the ownership and permissions to match the container path
- // as these are inherited from host path on bind mount.
- struct stat stat;
- if (::stat(volume.container_path().c_str(), &stat) < 0) {
- return Failure("Failed to get permissions on '" +
- volume.container_path() + "'" +
- ": " + strerror(errno));
- }
-
- Try<Nothing> chmod = os::chmod(hostPath, stat.st_mode);
- if (chmod.isError()) {
- return Failure("Failed to chmod hostPath '" +
- hostPath +
- "': " +
- chmod.error());
- }
-
- Try<Nothing> chown = os::chown(stat.st_uid, stat.st_gid, hostPath, false);
- if (chown.isError()) {
- return Failure("Failed to chown hostPath '" +
- hostPath +
- "': " +
- chown.error());
- }
- } else {
- hostPath = volume.host_path();
-
- if (!os::exists(hostPath)) {
- return Failure("Volume with container path '" +
- volume.container_path() +
- "' must have host path '" +
- hostPath +
- "' present on host for shared filesystem isolator");
- }
- }
-
- prepareInfo.add_commands()->set_value(
- "mount -n --bind " + hostPath + " " + volume.container_path());
- }
-
- return prepareInfo;
-}
-
-
-Future<Nothing> SharedFilesystemIsolatorProcess::isolate(
- const ContainerID& containerId,
- pid_t pid)
-{
- // No-op, isolation happens when unsharing the mount namespace.
-
- return Nothing();
-}
-
-
-Future<ContainerLimitation> SharedFilesystemIsolatorProcess::watch(
- const ContainerID& containerId)
-{
- // No-op, for now.
-
- return Future<ContainerLimitation>();
-}
-
-
-Future<Nothing> SharedFilesystemIsolatorProcess::update(
- const ContainerID& containerId,
- const Resources& resources)
-{
- // No-op, nothing enforced.
-
- return Nothing();
-}
-
-
-Future<ResourceStatistics> SharedFilesystemIsolatorProcess::usage(
- const ContainerID& containerId)
-{
- // No-op, no usage gathered.
-
- return ResourceStatistics();
-}
-
-
-Future<Nothing> SharedFilesystemIsolatorProcess::cleanup(
- const ContainerID& containerId)
-{
- // Cleanup of mounts is done automatically done by the kernel when
- // the mount namespace is destroyed after the last process
- // terminates.
-
- return Nothing();
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/filesystem/shared.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/filesystem/shared.hpp b/src/slave/containerizer/isolators/filesystem/shared.hpp
deleted file mode 100644
index 3a2f7db..0000000
--- a/src/slave/containerizer/isolators/filesystem/shared.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __SHARED_FILESYSTEM_ISOLATOR_HPP__
-#define __SHARED_FILESYSTEM_ISOLATOR_HPP__
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// This isolator is to be used when all containers share the host's
-// filesystem. It supports creating mounting "volumes" from the host
-// into each container's mount namespace. In particular, this can be
-// used to give each container a "private" system directory, such as
-// /tmp and /var/tmp.
-class SharedFilesystemIsolatorProcess : public MesosIsolatorProcess
-{
-public:
- static Try<mesos::slave::Isolator*> create(const Flags& flags);
-
- virtual ~SharedFilesystemIsolatorProcess();
-
- virtual process::Future<Nothing> recover(
- const std::list<mesos::slave::ContainerState>& states,
- const hashset<ContainerID>& orphans);
-
- virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
- const ContainerID& containerId,
- const ExecutorInfo& executorInfo,
- const std::string& directory,
- const Option<std::string>& user);
-
- virtual process::Future<Nothing> isolate(
- const ContainerID& containerId,
- pid_t pid);
-
- virtual process::Future<mesos::slave::ContainerLimitation> watch(
- const ContainerID& containerId);
-
- virtual process::Future<Nothing> update(
- const ContainerID& containerId,
- const Resources& resources);
-
- virtual process::Future<ResourceStatistics> usage(
- const ContainerID& containerId);
-
- virtual process::Future<Nothing> cleanup(
- const ContainerID& containerId);
-
-private:
- SharedFilesystemIsolatorProcess(const Flags& flags);
-
- const Flags flags;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __SHARED_FILESYSTEM_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/namespaces/pid.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/namespaces/pid.cpp b/src/slave/containerizer/isolators/namespaces/pid.cpp
deleted file mode 100644
index a9823e0..0000000
--- a/src/slave/containerizer/isolators/namespaces/pid.cpp
+++ /dev/null
@@ -1,266 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <sys/mount.h>
-
-#include <list>
-#include <set>
-#include <string>
-
-#include <stout/os.hpp>
-
-#include <stout/os/exists.hpp>
-#include <stout/os/ls.hpp>
-#include <stout/os/stat.hpp>
-
-#include "linux/fs.hpp"
-#include "linux/ns.hpp"
-
-#include "slave/containerizer/isolators/namespaces/pid.hpp"
-
-using namespace process;
-
-using std::list;
-using std::set;
-using std::string;
-
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::ContainerState;
-using mesos::slave::Isolator;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// The root directory where we bind mount all the namespace handles.
-static const char PID_NS_BIND_MOUNT_ROOT[] = "/var/run/mesos/pidns";
-
-
-// The empty directory that we'll use to mask the namespace handles
-// inside each container. This mount ensures they cannot determine the
-// namespace of another container.
-static const char PID_NS_BIND_MOUNT_MASK_DIR[] = "/var/empty/mesos";
-
-
-// Helper to construct the path to a pid's namespace file.
-inline string nsProcFile(pid_t pid)
-{
- return path::join("/proc", stringify(pid), "ns", "pid");
-}
-
-
-// Helper to construct the path to the additional reference created
-// for a container's pid namespace.
-inline string nsExtraReference(const ContainerID& containerId)
-{
- return path::join(PID_NS_BIND_MOUNT_ROOT, stringify(containerId));
-}
-
-
-Try<Isolator*> NamespacesPidIsolatorProcess::create(const Flags& flags)
-{
- // Check for root permission.
- if (geteuid() != 0) {
- return Error("The pid namespace isolator requires root permissions");
- }
-
- // Verify that pid namespaces are available on this kernel.
- if (ns::namespaces().count("pid") == 0) {
- return Error("Pid namespaces are not supported by this kernel");
- }
-
- // Create the directory where bind mounts of the pid namespace will
- // be placed.
- Try<Nothing> mkdir = os::mkdir(PID_NS_BIND_MOUNT_ROOT);
- if (mkdir.isError()) {
- return Error(
- "Failed to create the bind mount root directory at " +
- string(PID_NS_BIND_MOUNT_ROOT) + ": " + mkdir.error());
- }
-
- // Create the empty directory that will be used to mask the bind
- // mounts inside each container.
- mkdir = os::mkdir(PID_NS_BIND_MOUNT_MASK_DIR);
- if (mkdir.isError()) {
- return Error(
- "Failed to create the bind mount mask direcrory at " +
- string(PID_NS_BIND_MOUNT_MASK_DIR) + ": " + mkdir.error());
- }
-
- return new MesosIsolator(Owned<MesosIsolatorProcess>(
- new NamespacesPidIsolatorProcess()));
-}
-
-
-Result<ino_t> NamespacesPidIsolatorProcess::getNamespace(
- const ContainerID& containerId)
-{
- const string target = nsExtraReference(containerId);
-
- if (os::exists(target)) {
- return os::stat::inode(target);
- }
-
- return None();
-}
-
-
-Future<Nothing> NamespacesPidIsolatorProcess::recover(
- const list<ContainerState>& states,
- const hashset<ContainerID>& orphans)
-{
- hashset<ContainerID> recovered;
- foreach (const ContainerState& state, states) {
- recovered.insert(state.container_id());
- }
-
- // Clean up any unknown orphaned bind mounts and empty files. Known
- // orphan bind mounts and empty files will be destroyed by the
- // containerizer using the normal cleanup path. See MESOS-2367 for
- // details.
- Try<list<string>> entries = os::ls(PID_NS_BIND_MOUNT_ROOT);
- if (entries.isError()) {
- return Failure("Failed to list existing containers in '" +
- string(PID_NS_BIND_MOUNT_ROOT) + "': " + entries.error());
- }
-
- foreach (const string& entry, entries.get()) {
- ContainerID containerId;
- containerId.set_value(entry);
-
- if (!recovered.contains(containerId) && !orphans.contains(containerId)) {
- cleanup(containerId);
- }
- }
-
- return Nothing();
-}
-
-
-Future<Option<ContainerPrepareInfo>> NamespacesPidIsolatorProcess::prepare(
- const ContainerID& containerId,
- const ExecutorInfo& executorInfo,
- const string& directory,
- const Option<string>& user)
-{
- ContainerPrepareInfo prepareInfo;
- prepareInfo.set_namespaces(CLONE_NEWPID | CLONE_NEWNS);
-
- // Mask the bind mount root directory in each container so
- // containers cannot see the namespace bind mount of other
- // containers.
- prepareInfo.add_commands()->set_value(
- "mount -n --bind " + string(PID_NS_BIND_MOUNT_MASK_DIR) +
- " " + string(PID_NS_BIND_MOUNT_ROOT));
-
- // Mount /proc for the container's pid namespace to show the
- // container's pids (and other /proc files), not the parent's. We
- // first recursively make the mount private because /proc is usually
- // marked explicitly as shared (see /proc/self/mountinfo) and
- // changes would propagate to the parent's /proc mount otherwise. We
- // then mount /proc with the standard options. This technique was
- // taken from unshare.c in utils-linux for --mount-proc. We use the
- // -n flag so the mount is not added to the mtab where it will not
- // be correctly removed with the namespace terminates.
- prepareInfo.add_commands()->set_value(
- "mount none /proc --make-private -o rec");
- prepareInfo.add_commands()->set_value(
- "mount -n -t proc proc /proc -o nosuid,noexec,nodev");
-
- return prepareInfo;
-}
-
-
-Future<Nothing> NamespacesPidIsolatorProcess::isolate(
- const ContainerID& containerId,
- pid_t pid)
-{
- const string source = nsProcFile(pid);
- const string target = nsExtraReference(containerId);
-
- // Create a bind mount of the pid namespace so we can control the
- // lifetime of the pid namespace. This lets us identify the
- // container's pid namespace, even if the leading pid has exited.
- Try<Nothing> touch = os::touch(target);
- if (touch.isError()) {
- return Failure("Failed to create bind mount target: " + touch.error());
- }
-
- Try<Nothing> mount = fs::mount(source, target, None(), MS_BIND, NULL);
- if (mount.isError()) {
- return Failure(
- "Failed to mount pid namespace handle from " +
- source + " to " + target + ": " + mount.error());
- }
-
- return Nothing();
-}
-
-
-Future<ContainerLimitation> NamespacesPidIsolatorProcess::watch(
- const ContainerID& containerId)
-{
- return Future<ContainerLimitation>();
-}
-
-
-Future<Nothing> NamespacesPidIsolatorProcess::update(
- const ContainerID& containerId,
- const Resources& resources)
-{
- return Nothing();
-}
-
-
-Future<ResourceStatistics> NamespacesPidIsolatorProcess::usage(
- const ContainerID& containerId)
-{
- return ResourceStatistics();
-}
-
-
-// An old glibc might not have this symbol.
-#ifndef MNT_DETACH
-#define MNT_DETACH 2
-#endif
-
-
-Future<Nothing> NamespacesPidIsolatorProcess::cleanup(
- const ContainerID& containerId)
-{
- const string target = nsExtraReference(containerId);
-
- if (os::exists(target)) {
- // We don't expect anyone to have a reference to target but do a
- // lazy umount in case. We do not want to force the umount; it
- // will not cause an issue if this umount is delayed.
- Try<Nothing> unmount = fs::unmount(target, MNT_DETACH);
-
- // This will fail if the unmount hasn't completed yet but this
- // only leaks a uniquely named empty file that will cleaned up as
- // an orphan on recovery.
- os::rm(target);
- }
-
- return Nothing();
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/namespaces/pid.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/namespaces/pid.hpp b/src/slave/containerizer/isolators/namespaces/pid.hpp
deleted file mode 100644
index 87270d0..0000000
--- a/src/slave/containerizer/isolators/namespaces/pid.hpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NAMESPACES_PID_ISOLATOR_HPP__
-#define __NAMESPACES_PID_ISOLATOR_HPP__
-
-#include <sys/types.h>
-
-#include <string>
-
-#include <stout/result.hpp>
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// This isolator itself does not specify the necessary clone() flags
-// (see the LinuxLauncher for that) but it is used to keep track of a
-// container's pid namespace through a bind mount and exposed by
-// getNamespace().
-class NamespacesPidIsolatorProcess : public MesosIsolatorProcess
-{
-public:
- static Try<mesos::slave::Isolator*> create(const Flags& flags);
-
- // Return the pid namespace of the container. Returns None if the
- // container was not created in a separate pid namespace, i.e.,
- // processes are in the same namespace as the slave. This is used by
- // the LinuxLauncher to determine if it can kill the leading process
- // in the container and let the kernel kill the remaining processes.
- // A container may not have a pid namespace if it was created
- // without the namespaces/pid isolator and the slave was
- // subsequently restarted with namespaces/pid enabled.
- static Result<ino_t> getNamespace(const ContainerID& container);
-
- NamespacesPidIsolatorProcess() {}
-
- virtual ~NamespacesPidIsolatorProcess() {}
-
- virtual process::Future<Nothing> recover(
- const std::list<mesos::slave::ContainerState>& states,
- const hashset<ContainerID>& orphans);
-
- virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
- const ContainerID& containerId,
- const ExecutorInfo& executorInfo,
- const std::string& directory,
- const Option<std::string>& user);
-
- virtual process::Future<Nothing> isolate(
- const ContainerID& containerId,
- pid_t pid);
-
- virtual process::Future<mesos::slave::ContainerLimitation> watch(
- const ContainerID& containerId);
-
- virtual process::Future<Nothing> update(
- const ContainerID& containerId,
- const Resources& resources);
-
- virtual process::Future<ResourceStatistics> usage(
- const ContainerID& containerId);
-
- virtual process::Future<Nothing> cleanup(
- const ContainerID& containerId);
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __NAMESPACES_PID_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/network/helper.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/network/helper.cpp b/src/slave/containerizer/isolators/network/helper.cpp
deleted file mode 100644
index e5fb99e..0000000
--- a/src/slave/containerizer/isolators/network/helper.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stout/none.hpp>
-#include <stout/subcommand.hpp>
-
-#include "slave/containerizer/isolators/network/port_mapping.hpp"
-
-using namespace mesos::internal::slave;
-
-
-int main(int argc, char** argv)
-{
- return Subcommand::dispatch(
- None(),
- argc,
- argv,
- new PortMappingUpdate(),
- new PortMappingStatistics());
-}
[08/12] mesos git commit: Relocated MesosContainerizer specific files
to the correct location.
Posted by ji...@apache.org.
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/cgroups/mem.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/mem.cpp b/src/slave/containerizer/mesos/isolators/cgroups/mem.cpp
new file mode 100644
index 0000000..fa45439
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/cgroups/mem.cpp
@@ -0,0 +1,733 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+#include <list>
+#include <vector>
+
+#include <mesos/type_utils.hpp>
+#include <mesos/values.hpp>
+
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/pid.hpp>
+
+#include <stout/bytes.hpp>
+#include <stout/check.hpp>
+#include <stout/error.hpp>
+#include <stout/foreach.hpp>
+#include <stout/hashset.hpp>
+#include <stout/lambda.hpp>
+#include <stout/os.hpp>
+#include <stout/path.hpp>
+#include <stout/stringify.hpp>
+#include <stout/try.hpp>
+
+#include "common/protobuf_utils.hpp"
+
+#include "slave/containerizer/mesos/isolators/cgroups/constants.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/mem.hpp"
+
+using namespace process;
+
+using cgroups::memory::pressure::Level;
+using cgroups::memory::pressure::Counter;
+
+using std::list;
+using std::ostringstream;
+using std::set;
+using std::string;
+using std::vector;
+
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+static const vector<Level> levels()
+{
+ return {Level::LOW, Level::MEDIUM, Level::CRITICAL};
+}
+
+
+CgroupsMemIsolatorProcess::CgroupsMemIsolatorProcess(
+ const Flags& _flags,
+ const string& _hierarchy,
+ const bool _limitSwap)
+ : flags(_flags),
+ hierarchy(_hierarchy),
+ limitSwap(_limitSwap) {}
+
+
+CgroupsMemIsolatorProcess::~CgroupsMemIsolatorProcess() {}
+
+
+Try<Isolator*> CgroupsMemIsolatorProcess::create(const Flags& flags)
+{
+ Try<string> hierarchy = cgroups::prepare(
+ flags.cgroups_hierarchy,
+ "memory",
+ flags.cgroups_root);
+
+ if (hierarchy.isError()) {
+ return Error("Failed to create memory cgroup: " + hierarchy.error());
+ }
+
+ // Ensure that no other subsystem is attached to the hierarchy.
+ Try<set<string>> subsystems = cgroups::subsystems(hierarchy.get());
+ if (subsystems.isError()) {
+ return Error(
+ "Failed to get the list of attached subsystems for hierarchy " +
+ hierarchy.get());
+ } else if (subsystems.get().size() != 1) {
+ return Error(
+ "Unexpected subsystems found attached to the hierarchy " +
+ hierarchy.get());
+ }
+
+ // Make sure the kernel OOM-killer is enabled.
+ // The Mesos OOM handler, as implemented, is not capable of handling
+ // the oom condition by itself safely given the limitations Linux
+ // imposes on this code path.
+ Try<Nothing> enable = cgroups::memory::oom::killer::enable(
+ hierarchy.get(), flags.cgroups_root);
+
+ if (enable.isError()) {
+ return Error(enable.error());
+ }
+
+ // Test if memory pressure listening is enabled. We test that on the
+ // root cgroup. We rely on 'Counter::create' to test if memory
+ // pressure listening is enabled or not. The created counters will
+ // be destroyed immediately.
+ foreach (Level level, levels()) {
+ Try<Owned<Counter>> counter = Counter::create(
+ hierarchy.get(),
+ flags.cgroups_root,
+ level);
+
+ if (counter.isError()) {
+ return Error("Failed to listen on " + stringify(level) +
+ " memory events: " + counter.error());
+ }
+ }
+
+ // Determine whether to limit swap or not.
+ bool limitSwap = false;
+
+ if (flags.cgroups_limit_swap) {
+ Result<Bytes> check = cgroups::memory::memsw_limit_in_bytes(
+ hierarchy.get(), flags.cgroups_root);
+
+ if (check.isError()) {
+ return Error(
+ "Failed to read 'memory.memsw.limit_in_bytes': " +
+ check.error());
+ } else if (check.isNone()) {
+ return Error("'memory.memsw.limit_in_bytes' is not available");
+ }
+
+ limitSwap = true;
+ }
+
+ process::Owned<MesosIsolatorProcess> process(
+ new CgroupsMemIsolatorProcess(flags, hierarchy.get(), limitSwap));
+
+ return new MesosIsolator(process);
+}
+
+
+Future<Nothing> CgroupsMemIsolatorProcess::recover(
+ const list<ContainerState>& states,
+ const hashset<ContainerID>& orphans)
+{
+ foreach (const ContainerState& state, states) {
+ const ContainerID& containerId = state.container_id();
+ const string cgroup = path::join(flags.cgroups_root, containerId.value());
+
+ Try<bool> exists = cgroups::exists(hierarchy, cgroup);
+ if (exists.isError()) {
+ foreachvalue (Info* info, infos) {
+ delete info;
+ }
+ infos.clear();
+ return Failure("Failed to check cgroup for container '" +
+ stringify(containerId) + "'");
+ }
+
+ if (!exists.get()) {
+ VLOG(1) << "Couldn't find cgroup for container " << containerId;
+ // This may occur if the executor has exited and the isolator
+ // has destroyed the cgroup but the slave dies before noticing
+ // this. This will be detected when the containerizer tries to
+ // monitor the executor's pid.
+ continue;
+ }
+
+ infos[containerId] = new Info(containerId, cgroup);
+
+ oomListen(containerId);
+ pressureListen(containerId);
+ }
+
+ // Remove orphan cgroups.
+ Try<vector<string>> cgroups = cgroups::get(hierarchy, flags.cgroups_root);
+ if (cgroups.isError()) {
+ foreachvalue (Info* info, infos) {
+ delete info;
+ }
+ infos.clear();
+ return Failure(cgroups.error());
+ }
+
+ foreach (const string& cgroup, cgroups.get()) {
+ // Ignore the slave cgroup (see the --slave_subsystems flag).
+ // TODO(idownes): Remove this when the cgroups layout is updated,
+ // see MESOS-1185.
+ if (cgroup == path::join(flags.cgroups_root, "slave")) {
+ continue;
+ }
+
+ ContainerID containerId;
+ containerId.set_value(Path(cgroup).basename());
+
+ if (infos.contains(containerId)) {
+ continue;
+ }
+
+ // Known orphan cgroups will be destroyed by the containerizer
+ // using the normal cleanup path. See MESOS-2367 for details.
+ if (orphans.contains(containerId)) {
+ infos[containerId] = new Info(containerId, cgroup);
+ continue;
+ }
+
+ LOG(INFO) << "Removing unknown orphaned cgroup '" << cgroup << "'";
+
+ // We don't wait on the destroy as we don't want to block recovery.
+ cgroups::destroy(hierarchy, cgroup, cgroups::DESTROY_TIMEOUT);
+ }
+
+ return Nothing();
+}
+
+
+// Sets up memory isolation for a new container: records an Info entry,
+// creates the container's cgroup (which must not exist yet), optionally
+// chowns the cgroup directory to 'user', starts the OOM and memory
+// pressure listeners and finally applies the executor's resources via
+// 'update'. Resolves to None on success. 'directory' is not used here.
+Future<Option<ContainerPrepareInfo>> CgroupsMemIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user)
+{
+  if (infos.contains(containerId)) {
+    return Failure("Container has already been prepared");
+  }
+
+  // TODO(bmahler): Don't insert into 'infos' unless we create the
+  // cgroup successfully. It's safe for now because 'cleanup' gets
+  // called if we return a Failure, but cleanup will fail because the
+  // cgroup does not exist when cgroups::destroy is called.
+  Info* info = new Info(
+      containerId, path::join(flags.cgroups_root, containerId.value()));
+
+  infos[containerId] = info;
+
+  // The cgroup has to be created by this call; finding one already in
+  // place is treated as an error.
+  Try<bool> present = cgroups::exists(hierarchy, info->cgroup);
+  if (present.isError()) {
+    return Failure("Failed to prepare isolator: " + present.error());
+  }
+
+  if (present.get()) {
+    return Failure("Failed to prepare isolator: cgroup already exists");
+  }
+
+  Try<Nothing> created = cgroups::create(hierarchy, info->cgroup);
+  if (created.isError()) {
+    return Failure("Failed to prepare isolator: " + created.error());
+  }
+
+  // Give the cgroup directory to the executor's user so it can create
+  // nested cgroups. The chown is deliberately non-recursive: the
+  // control files stay owned by the slave user and therefore cannot be
+  // modified by the executor.
+  if (user.isSome()) {
+    const string cgroupPath = path::join(hierarchy, info->cgroup);
+
+    Try<Nothing> owned = os::chown(user.get(), cgroupPath, false);
+    if (owned.isError()) {
+      return Failure("Failed to prepare isolator: " + owned.error());
+    }
+  }
+
+  oomListen(containerId);
+  pressureListen(containerId);
+
+  return update(containerId, executorInfo.resources())
+    .then([]() -> Future<Option<ContainerPrepareInfo>> {
+      return None();
+    });
+}
+
+
+// Records the container's pid (which must not have been set before)
+// and moves that process into the container's memory cgroup.
+Future<Nothing> CgroupsMemIsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  // A container is only ever isolated once.
+  CHECK_NONE(info->pid);
+  info->pid = pid;
+
+  Try<Nothing> assigned = cgroups::assign(hierarchy, info->cgroup, pid);
+  if (!assigned.isError()) {
+    return Nothing();
+  }
+
+  return Failure(
+      "Failed to assign container '" + stringify(info->containerId) +
+      "' to its own cgroup '" + path::join(hierarchy, info->cgroup) +
+      "' : " + assigned.error());
+}
+
+
+// Returns the future backing the container's 'limitation' promise; the
+// promise is set by 'oom' when an out-of-memory event is handled.
+Future<ContainerLimitation> CgroupsMemIsolatorProcess::watch(
+    const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  return info->limitation.future();
+}
+
+
+// Applies the memory resource in 'resources' to the container's cgroup.
+// The soft limit is always written. The hard limit (and, when limitSwap
+// is set, the memory+swap limit) is only written on the first update or
+// when the limit is being raised.
+Future<Nothing> CgroupsMemIsolatorProcess::update(
+    const ContainerID& containerId,
+    const Resources& resources)
+{
+  if (resources.mem().isNone()) {
+    return Failure("No memory resource given");
+  }
+
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  // The new limit is never allowed to drop below MIN_MEMORY.
+  Bytes requested = resources.mem().get();
+  Bytes limit = std::max(requested, MIN_MEMORY);
+
+  // The soft limit is set unconditionally.
+  Try<Nothing> softWrite =
+    cgroups::memory::soft_limit_in_bytes(hierarchy, info->cgroup, limit);
+
+  if (softWrite.isError()) {
+    return Failure(
+        "Failed to set 'memory.soft_limit_in_bytes': " + softWrite.error());
+  }
+
+  LOG(INFO) << "Updated 'memory.soft_limit_in_bytes' to " << limit
+            << " for container " << containerId;
+
+  // Fetch the hard limit currently in effect.
+  // NOTE: If limitSwap is (has been) used then both limit_in_bytes
+  // and memsw.limit_in_bytes will always be set to the same value.
+  Try<Bytes> currentLimit =
+    cgroups::memory::limit_in_bytes(hierarchy, info->cgroup);
+
+  if (currentLimit.isError()) {
+    return Failure(
+        "Failed to read 'memory.limit_in_bytes': " + currentLimit.error());
+  }
+
+  // The hard limit can be changed safely on the first update
+  // (info->pid.isNone()) or when it is being raised. Lowering
+  // 'memory.limit_in_bytes' could induce an OOM if too much memory is
+  // already in use, so a reduced reservation only touches the soft
+  // limit. This is probably okay if the machine has available
+  // resources.
+  // TODO(benh): Introduce a MemoryWatcherProcess which monitors the
+  // discrepancy between usage and soft limit and introduces a "manual
+  // oom" if necessary.
+  const bool adjustHardLimit =
+    info->pid.isNone() || limit > currentLimit.get();
+
+  if (!adjustHardLimit) {
+    return Nothing();
+  }
+
+  // limit_in_bytes is always written first; memsw.limit_in_bytes
+  // follows only when limitSwap is true.
+  Try<Nothing> hardWrite =
+    cgroups::memory::limit_in_bytes(hierarchy, info->cgroup, limit);
+
+  if (hardWrite.isError()) {
+    return Failure(
+        "Failed to set 'memory.limit_in_bytes': " + hardWrite.error());
+  }
+
+  LOG(INFO) << "Updated 'memory.limit_in_bytes' to " << limit
+            << " for container " << containerId;
+
+  if (!limitSwap) {
+    return Nothing();
+  }
+
+  Try<bool> swapWrite =
+    cgroups::memory::memsw_limit_in_bytes(hierarchy, info->cgroup, limit);
+
+  if (swapWrite.isError()) {
+    return Failure(
+        "Failed to set 'memory.memsw.limit_in_bytes': " + swapWrite.error());
+  }
+
+  LOG(INFO) << "Updated 'memory.memsw.limit_in_bytes' to " << limit
+            << " for container " << containerId;
+
+  return Nothing();
+}
+
+
+// Collects memory statistics for a container: total usage, total
+// memory+swap usage (only when limitSwap is set), selected 'memory.stat'
+// counters and, asynchronously via '_usage', the memory pressure
+// counters.
+Future<ResourceStatistics> CgroupsMemIsolatorProcess::usage(
+    const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  ResourceStatistics result;
+
+  // usage_in_bytes is reported as the total because the rss from
+  // memory.stat is wrong in two dimensions:
+  //   1. It does not include child cgroups.
+  //   2. It does not include any file backed pages.
+  Try<Bytes> total = cgroups::memory::usage_in_bytes(hierarchy, info->cgroup);
+  if (total.isError()) {
+    return Failure("Failed to parse memory.usage_in_bytes: " + total.error());
+  }
+
+  result.set_mem_total_bytes(total.get().bytes());
+
+  if (limitSwap) {
+    Try<Bytes> totalWithSwap =
+      cgroups::memory::memsw_usage_in_bytes(hierarchy, info->cgroup);
+
+    if (totalWithSwap.isError()) {
+      return Failure(
+          "Failed to parse memory.memsw.usage_in_bytes: " +
+          totalWithSwap.error());
+    }
+
+    result.set_mem_total_memsw_bytes(totalWithSwap.get().bytes());
+  }
+
+  // TODO(bmahler): Add namespacing to cgroups to enforce the expected
+  // structure, e.g, cgroups::memory::stat.
+  Try<hashmap<string, uint64_t>> stat =
+    cgroups::stat(hierarchy, info->cgroup, "memory.stat");
+
+  if (stat.isError()) {
+    return Failure("Failed to read memory.stat: " + stat.error());
+  }
+
+  const hashmap<string, uint64_t>& counters = stat.get();
+
+  const Option<uint64_t> cache = counters.get("total_cache");
+  if (cache.isSome()) {
+    // TODO(chzhcn): mem_file_bytes is deprecated in 0.23.0 and will
+    // be removed in 0.24.0.
+    result.set_mem_file_bytes(cache.get());
+
+    result.set_mem_cache_bytes(cache.get());
+  }
+
+  const Option<uint64_t> rss = counters.get("total_rss");
+  if (rss.isSome()) {
+    // TODO(chzhcn): mem_anon_bytes is deprecated in 0.23.0 and will
+    // be removed in 0.24.0.
+    result.set_mem_anon_bytes(rss.get());
+
+    result.set_mem_rss_bytes(rss.get());
+  }
+
+  const Option<uint64_t> mappedFile = counters.get("total_mapped_file");
+  if (mappedFile.isSome()) {
+    result.set_mem_mapped_file_bytes(mappedFile.get());
+  }
+
+  const Option<uint64_t> swap = counters.get("total_swap");
+  if (swap.isSome()) {
+    result.set_mem_swap_bytes(swap.get());
+  }
+
+  const Option<uint64_t> unevictable = counters.get("total_unevictable");
+  if (unevictable.isSome()) {
+    result.set_mem_unevictable_bytes(unevictable.get());
+  }
+
+  // Request a reading from every pressure counter. 'levels' and
+  // 'values' are filled in lock step so '_usage' can pair them up.
+  list<Level> levels;
+  list<Future<uint64_t>> values;
+  foreachpair (Level level,
+               const Owned<Counter>& counter,
+               info->pressureCounters) {
+    levels.push_back(level);
+    values.push_back(counter->value());
+  }
+
+  return await(values)
+    .then(defer(PID<CgroupsMemIsolatorProcess>(this),
+                &CgroupsMemIsolatorProcess::_usage,
+                containerId,
+                result,
+                levels,
+                lambda::_1));
+}
+
+
+// Continuation of 'usage': merges the pressure counter readings into
+// 'result'. 'levels' and 'values' are walked in parallel, so the i-th
+// value belongs to the i-th level. Readings that are not ready are
+// logged and left out of the result.
+Future<ResourceStatistics> CgroupsMemIsolatorProcess::_usage(
+    const ContainerID& containerId,
+    ResourceStatistics result,
+    const list<Level>& levels,
+    const list<Future<uint64_t>>& values)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  list<Level>::const_iterator level = levels.begin();
+  foreach (const Future<uint64_t>& value, values) {
+    if (!value.isReady()) {
+      LOG(ERROR) << "Failed to listen on " << stringify(*level)
+                 << " pressure events for container " << containerId << ": "
+                 << (value.isFailed() ? value.failure() : "discarded");
+    } else {
+      switch (*level) {
+        case Level::LOW:
+          result.set_mem_low_pressure_counter(value.get());
+          break;
+        case Level::MEDIUM:
+          result.set_mem_medium_pressure_counter(value.get());
+          break;
+        case Level::CRITICAL:
+          result.set_mem_critical_pressure_counter(value.get());
+          break;
+      }
+    }
+
+    ++level;
+  }
+
+  return result;
+}
+
+
+// Stops the OOM listener (if still pending) and destroys the
+// container's cgroup; the bookkeeping is released in '_cleanup' once
+// the destroy has finished. Unknown containers are ignored.
+Future<Nothing> CgroupsMemIsolatorProcess::cleanup(
+    const ContainerID& containerId)
+{
+  // Cleanup can be requested more than once during test clean up.
+  if (!infos.contains(containerId)) {
+    VLOG(1) << "Ignoring cleanup request for unknown container: "
+            << containerId;
+    return Nothing();
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  if (info->oomNotifier.isPending()) {
+    info->oomNotifier.discard();
+  }
+
+  Future<Nothing> destroyed =
+    cgroups::destroy(hierarchy, info->cgroup, cgroups::DESTROY_TIMEOUT);
+
+  return destroyed
+    .onAny(defer(PID<CgroupsMemIsolatorProcess>(this),
+                 &CgroupsMemIsolatorProcess::_cleanup,
+                 containerId,
+                 lambda::_1));
+}
+
+
+// Runs after the cgroup destroy started by 'cleanup' has completed.
+// The container's Info is freed and removed only when the destroy
+// succeeded; otherwise the entry is kept and a Failure is returned.
+Future<Nothing> CgroupsMemIsolatorProcess::_cleanup(
+    const ContainerID& containerId,
+    const Future<Nothing>& future)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  if (!future.isReady()) {
+    const string reason = future.isFailed() ? future.failure() : "discarded";
+
+    return Failure(
+        "Failed to clean up container " + stringify(containerId) +
+        " : " + reason);
+  }
+
+  delete info;
+  infos.erase(containerId);
+
+  return Nothing();
+}
+
+
+// Starts listening for OOM events on the container's cgroup and stores
+// the resulting future in the container's Info so that 'cleanup' can
+// discard it. The container must already be present in 'infos'.
+void CgroupsMemIsolatorProcess::oomListen(
+    const ContainerID& containerId)
+{
+  CHECK(infos.contains(containerId));
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  info->oomNotifier = cgroups::memory::oom::listen(hierarchy, info->cgroup);
+
+  // If the listening fails immediately, something very wrong
+  // happened. Therefore, we report a fatal error here.
+  if (info->oomNotifier.isFailed()) {
+    LOG(FATAL) << "Failed to listen for OOM events for container "
+               << containerId << ": "
+               << info->oomNotifier.failure();
+  }
+
+  LOG(INFO) << "Started listening for OOM events for container "
+            << containerId;
+
+  // Register with onAny rather than onReady: 'oomWaited' distinguishes
+  // the discarded, failed and ready outcomes, and with onReady the
+  // first two could never be reported.
+  info->oomNotifier.onAny(defer(
+      PID<CgroupsMemIsolatorProcess>(this),
+      &CgroupsMemIsolatorProcess::oomWaited,
+      containerId,
+      lambda::_1));
+}
+
+
+// Callback for the OOM notifier future: logs a discarded or failed
+// notifier, and otherwise treats the completion as an out-of-memory
+// event and hands it to 'oom'.
+void CgroupsMemIsolatorProcess::oomWaited(
+    const ContainerID& containerId,
+    const Future<Nothing>& future)
+{
+  if (future.isDiscarded()) {
+    LOG(INFO) << "Discarded OOM notifier for container "
+              << containerId;
+    return;
+  }
+
+  if (future.isFailed()) {
+    LOG(ERROR) << "Listening on OOM events failed for container "
+               << containerId << ": " << future.failure();
+    return;
+  }
+
+  // The notifier fired: an out-of-memory event happened.
+  LOG(INFO) << "OOM notifier is triggered for container " << containerId;
+  oom(containerId);
+}
+
+
+// Handles an out-of-memory event for a container: gathers diagnostics
+// (the configured limit, the maximum usage and the raw 'memory.stat'
+// contents), logs them, and fulfils the container's 'limitation'
+// promise with a memory ContainerLimitation carrying that message.
+// Each diagnostic read is best effort: a failed read is logged and
+// simply left out of the message.
+void CgroupsMemIsolatorProcess::oom(const ContainerID& containerId)
+{
+ if (!infos.contains(containerId)) {
+ // It is likely that process exited is executed before this
+ // function (e.g. The kill and OOM events happen at the same
+ // time, and the process exit event arrives first.) Therefore, we
+ // should not report a fatal error here.
+ LOG(INFO) << "OOM detected for an already terminated executor";
+ return;
+ }
+
+ Info* info = CHECK_NOTNULL(infos[containerId]);
+
+ LOG(INFO) << "OOM detected for container " << containerId;
+
+ // Construct a "message" string to describe why the isolator
+ // destroyed the executor's cgroup (in order to assist in
+ // debugging).
+ ostringstream message;
+ message << "Memory limit exceeded: ";
+
+ // Output the requested memory limit.
+ // NOTE: If limitSwap is (has been) used then both limit_in_bytes
+ // and memsw.limit_in_bytes will always be set to the same value.
+ Try<Bytes> limit = cgroups::memory::limit_in_bytes(hierarchy, info->cgroup);
+
+ if (limit.isError()) {
+ LOG(ERROR) << "Failed to read 'memory.limit_in_bytes': "
+ << limit.error();
+ } else {
+ message << "Requested: " << limit.get() << " ";
+ }
+
+ // Output the maximum memory usage.
+ Try<Bytes> usage = cgroups::memory::max_usage_in_bytes(
+ hierarchy, info->cgroup);
+
+ if (usage.isError()) {
+ LOG(ERROR) << "Failed to read 'memory.max_usage_in_bytes': "
+ << usage.error();
+ } else {
+ message << "Maximum Used: " << usage.get() << "\n";
+ }
+
+ // Output 'memory.stat' of the cgroup to help with debugging.
+ // NOTE: With Kernel OOM-killer enabled these stats may not reflect
+ // memory state at time of OOM.
+ Try<string> read = cgroups::read(hierarchy, info->cgroup, "memory.stat");
+ if (read.isError()) {
+ LOG(ERROR) << "Failed to read 'memory.stat': " << read.error();
+ } else {
+ message << "\nMEMORY STATISTICS: \n" << read.get() << "\n";
+ }
+
+ LOG(INFO) << strings::trim(message.str()); // Trim the extra '\n' at the end.
+
+ // The limitation reports the maximum usage in megabytes under the
+ // "*" role, or 0 when the maximum usage could not be read above.
+ // TODO(jieyu): This is not accurate if the memory resource is from
+ // a non-star role or spans roles (e.g., "*" and "role"). Ideally,
+ // we should save the resources passed in and report it here.
+ Resources mem = Resources::parse(
+ "mem",
+ stringify(usage.isSome() ? usage.get().megabytes() : 0),
+ "*").get();
+
+ // Note that the untrimmed message is what goes into the limitation.
+ info->limitation.set(
+ protobuf::slave::createContainerLimitation(
+ mem,
+ message.str(),
+ TaskStatus::REASON_CONTAINER_LIMITATION_MEMORY));
+}
+
+
+// Creates one memory pressure counter per pressure level for the
+// container's cgroup and stores the successfully created ones in the
+// container's Info. A level whose counter cannot be created is logged
+// and skipped. The container must already be present in 'infos'.
+void CgroupsMemIsolatorProcess::pressureListen(
+    const ContainerID& containerId)
+{
+  CHECK(infos.contains(containerId));
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  foreach (Level level, levels()) {
+    Try<Owned<Counter>> counter =
+      Counter::create(hierarchy, info->cgroup, level);
+
+    if (counter.isError()) {
+      LOG(ERROR) << "Failed to listen on " << level << " memory pressure "
+                 << "events for container " << containerId << ": "
+                 << counter.error();
+      continue;
+    }
+
+    info->pressureCounters[level] = counter.get();
+
+    LOG(INFO) << "Started listening on " << level << " memory pressure "
+              << "events for container " << containerId;
+  }
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/cgroups/mem.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/mem.hpp b/src/slave/containerizer/mesos/isolators/cgroups/mem.hpp
new file mode 100644
index 0000000..47f73c3
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/cgroups/mem.hpp
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __MEM_ISOLATOR_HPP__
+#define __MEM_ISOLATOR_HPP__
+
+#include <sys/types.h>
+
+#include <process/future.hpp>
+#include <process/owned.hpp>
+
+#include <stout/hashmap.hpp>
+#include <stout/nothing.hpp>
+#include <stout/option.hpp>
+
+#include "linux/cgroups.hpp"
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Isolator process that manages one memory cgroup per container: it
+// applies soft and hard memory limits, reports memory statistics, and
+// listens for OOM and memory pressure events on the container's cgroup.
+class CgroupsMemIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+ // Factory; the constructor is private.
+ static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+ virtual ~CgroupsMemIsolatorProcess();
+
+ virtual process::Future<Nothing> recover(
+ const std::list<mesos::slave::ContainerState>& states,
+ const hashset<ContainerID>& orphans);
+
+ // Creates the container's cgroup, starts the OOM and pressure
+ // listeners and applies the executor's memory resources.
+ virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+ const ContainerID& containerId,
+ const ExecutorInfo& executorInfo,
+ const std::string& directory,
+ const Option<std::string>& user);
+
+ // Assigns 'pid' to the container's cgroup.
+ virtual process::Future<Nothing> isolate(
+ const ContainerID& containerId,
+ pid_t pid);
+
+ // Returns the future of the container's 'limitation' promise.
+ virtual process::Future<mesos::slave::ContainerLimitation> watch(
+ const ContainerID& containerId);
+
+ // Updates the cgroup's memory limits from 'resources'.
+ virtual process::Future<Nothing> update(
+ const ContainerID& containerId,
+ const Resources& resources);
+
+ // Reports memory usage, 'memory.stat' counters and pressure counters.
+ virtual process::Future<ResourceStatistics> usage(
+ const ContainerID& containerId);
+
+ // Destroys the container's cgroup and releases its Info.
+ virtual process::Future<Nothing> cleanup(
+ const ContainerID& containerId);
+
+private:
+ CgroupsMemIsolatorProcess(
+ const Flags& flags,
+ const std::string& hierarchy,
+ bool limitSwap);
+
+ // Continuation of 'usage' that merges the pressure counter readings
+ // ('values', paired by position with 'levels') into 'result'.
+ process::Future<ResourceStatistics> _usage(
+ const ContainerID& containerId,
+ ResourceStatistics result,
+ const std::list<cgroups::memory::pressure::Level>& levels,
+ const std::list<process::Future<uint64_t>>& values);
+
+ // Continuation of 'cleanup' that runs once the cgroup destroy has
+ // completed; 'future' is the outcome of that destroy.
+ process::Future<Nothing> _cleanup(
+ const ContainerID& containerId,
+ const process::Future<Nothing>& future);
+
+ // Per-container state tracked by the isolator.
+ struct Info
+ {
+ Info(const ContainerID& _containerId, const std::string& _cgroup)
+ : containerId(_containerId), cgroup(_cgroup) {}
+
+ const ContainerID containerId;
+ const std::string cgroup;
+
+ // Set by 'isolate'; still None before the first 'update'.
+ Option<pid_t> pid;
+
+ // Fulfilled when an OOM is handled; exposed through 'watch'.
+ process::Promise<mesos::slave::ContainerLimitation> limitation;
+
+ // Used to cancel the OOM listening.
+ process::Future<Nothing> oomNotifier;
+
+ // One counter per pressure level for which listening succeeded.
+ hashmap<cgroups::memory::pressure::Level,
+ process::Owned<cgroups::memory::pressure::Counter>>
+ pressureCounters;
+ };
+
+ // Start listening on OOM events. This function will create an
+ // eventfd and start polling on it.
+ void oomListen(const ContainerID& containerId);
+
+ // This function is invoked when the polling on eventfd has a
+ // result.
+ void oomWaited(
+ const ContainerID& containerId,
+ const process::Future<Nothing>& future);
+
+ // This function is invoked when the OOM event happens.
+ void oom(const ContainerID& containerId);
+
+ // Start listening on memory pressure events.
+ void pressureListen(const ContainerID& containerId);
+
+ const Flags flags;
+
+ // The path to the cgroups subsystem hierarchy root.
+ const std::string hierarchy;
+
+ // When true, the memory+swap limit and usage are managed and
+ // reported in addition to the plain memory ones.
+ const bool limitSwap;
+
+ // Owning raw pointers, keyed by container; entries are deleted
+ // explicitly when a container is cleaned up.
+ // TODO(bmahler): Use Owned<Info>.
+ hashmap<ContainerID, Info*> infos;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __MEM_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/cgroups/perf_event.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/perf_event.cpp b/src/slave/containerizer/mesos/isolators/cgroups/perf_event.cpp
new file mode 100644
index 0000000..323f6ff
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/cgroups/perf_event.cpp
@@ -0,0 +1,435 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+#include <vector>
+
+#include <google/protobuf/descriptor.h>
+#include <google/protobuf/message.h>
+
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/delay.hpp>
+#include <process/io.hpp>
+#include <process/pid.hpp>
+#include <process/reap.hpp>
+#include <process/subprocess.hpp>
+
+#include <stout/bytes.hpp>
+#include <stout/check.hpp>
+#include <stout/error.hpp>
+#include <stout/foreach.hpp>
+#include <stout/hashset.hpp>
+#include <stout/lambda.hpp>
+#include <stout/os.hpp>
+#include <stout/path.hpp>
+#include <stout/stringify.hpp>
+#include <stout/try.hpp>
+
+#include "linux/cgroups.hpp"
+#include "linux/perf.hpp"
+
+#include "slave/containerizer/mesos/isolators/cgroups/perf_event.hpp"
+
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+using std::list;
+using std::set;
+using std::string;
+using std::vector;
+
+using process::Clock;
+using process::Failure;
+using process::Future;
+using process::PID;
+using process::Time;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Factory for the perf event isolator. Validates that perf is
+// supported, that the sampling duration does not exceed the sampling
+// interval and that the configured events are valid, then prepares the
+// 'perf_event' cgroup hierarchy and wraps a new isolator process in a
+// MesosIsolator.
+Try<Isolator*> CgroupsPerfEventIsolatorProcess::create(const Flags& flags)
+{
+  LOG(INFO) << "Creating PerfEvent isolator";
+
+  if (!perf::supported()) {
+    return Error("Perf is not supported");
+  }
+
+  if (flags.perf_duration > flags.perf_interval) {
+    return Error(
+        "Sampling perf for duration (" + stringify(flags.perf_duration) +
+        ") > interval (" + stringify(flags.perf_interval) +
+        ") is not supported.");
+  }
+
+  if (flags.perf_events.isNone()) {
+    return Error("No perf events specified.");
+  }
+
+  // The events flag is a comma separated list; collect it into a set.
+  const vector<string> tokens =
+    strings::tokenize(flags.perf_events.get(), ",");
+
+  set<string> events;
+  foreach (const string& token, tokens) {
+    events.insert(token);
+  }
+
+  if (!perf::valid(events)) {
+    return Error("Failed to create PerfEvent isolator, invalid events: " +
+                 stringify(events));
+  }
+
+  Try<string> hierarchy = cgroups::prepare(
+      flags.cgroups_hierarchy,
+      "perf_event",
+      flags.cgroups_root);
+
+  if (hierarchy.isError()) {
+    return Error("Failed to create perf_event cgroup: " + hierarchy.error());
+  }
+
+  LOG(INFO) << "PerfEvent isolator will profile for " << flags.perf_duration
+            << " every " << flags.perf_interval
+            << " for events: " << stringify(events);
+
+  process::Owned<MesosIsolatorProcess> process(
+      new CgroupsPerfEventIsolatorProcess(flags, hierarchy.get(), events));
+
+  return new MesosIsolator(process);
+}
+
+
+// Empty destructor.
+// NOTE(review): the Info objects held in 'infos' are raw pointers and
+// are not deleted here -- confirm they are always released through
+// cleanup before the process is destroyed.
+CgroupsPerfEventIsolatorProcess::~CgroupsPerfEventIsolatorProcess() {}
+
+
+// Kicks off the sampling loop: 'sample' collects a perf sample and
+// '_sample' re-schedules 'sample' with a delay, so this single call
+// keeps sampling going.
+void CgroupsPerfEventIsolatorProcess::initialize()
+{
+ // Start sampling.
+ sample();
+}
+
+
+// Rebuilds the isolator's state after a restart. An Info entry is
+// created for every recovered container whose perf_event cgroup still
+// exists. The cgroups under the cgroups root are then scanned: known
+// orphans get an Info entry so the containerizer can clean them up
+// through the normal path, and any other unknown cgroup is destroyed.
+// On error all Info entries created so far are released.
+Future<Nothing> CgroupsPerfEventIsolatorProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  foreach (const ContainerState& state, states) {
+    const ContainerID& containerId = state.container_id();
+    const string cgroup = path::join(flags.cgroups_root, containerId.value());
+
+    Try<bool> exists = cgroups::exists(hierarchy, cgroup);
+    if (exists.isError()) {
+      foreachvalue (Info* info, infos) {
+        delete info;
+      }
+
+      infos.clear();
+
+      // Include the underlying error so the cause is not lost.
+      return Failure("Failed to check cgroup " + cgroup +
+                     " for container '" + stringify(containerId) + "': " +
+                     exists.error());
+    }
+
+    if (!exists.get()) {
+      // This may occur if the executor is exiting and the isolator has
+      // destroyed the cgroup but the slave dies before noticing this. This
+      // will be detected when the containerizer tries to monitor the
+      // executor's pid.
+      // NOTE: This could also occur if this isolator is now enabled for a
+      // container that was started without this isolator. For this
+      // particular isolator it is acceptable to continue running this
+      // container without a perf_event cgroup because we don't ever
+      // query it and the destroy will succeed immediately.
+      VLOG(1) << "Couldn't find perf event cgroup for container " << containerId
+              << ", perf statistics will not be available";
+      continue;
+    }
+
+    infos[containerId] = new Info(containerId, cgroup);
+  }
+
+  // Remove orphan cgroups.
+  Try<vector<string>> cgroups = cgroups::get(hierarchy, flags.cgroups_root);
+  if (cgroups.isError()) {
+    foreachvalue (Info* info, infos) {
+      delete info;
+    }
+    infos.clear();
+    return Failure(cgroups.error());
+  }
+
+  foreach (const string& cgroup, cgroups.get()) {
+    // Ignore the slave cgroup (see the --slave_subsystems flag).
+    // TODO(idownes): Remove this when the cgroups layout is updated,
+    // see MESOS-1185.
+    if (cgroup == path::join(flags.cgroups_root, "slave")) {
+      continue;
+    }
+
+    // The container id is the last path component of the cgroup.
+    ContainerID containerId;
+    containerId.set_value(Path(cgroup).basename());
+
+    if (infos.contains(containerId)) {
+      continue;
+    }
+
+    // Known orphan cgroups will be destroyed by the containerizer
+    // using the normal cleanup path. See details in MESOS-2367.
+    if (orphans.contains(containerId)) {
+      infos[containerId] = new Info(containerId, cgroup);
+      continue;
+    }
+
+    LOG(INFO) << "Removing unknown orphaned cgroup '" << cgroup << "'";
+
+    // We don't wait on the destroy as we don't want to block recovery.
+    cgroups::destroy(hierarchy, cgroup, cgroups::DESTROY_TIMEOUT);
+  }
+
+  return Nothing();
+}
+
+
+// Creates the perf_event cgroup for a new container, records it in
+// 'infos' and, when 'user' is given, chowns the cgroup directory to
+// that user. Fails if the container was already prepared or if the
+// cgroup already exists. 'executorInfo' and 'directory' are not used
+// by this isolator. Always resolves to None on success.
+Future<Option<ContainerPrepareInfo>> CgroupsPerfEventIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user)
+{
+  if (infos.contains(containerId)) {
+    return Failure("Container has already been prepared");
+  }
+
+  LOG(INFO) << "Preparing perf event cgroup for " << containerId;
+
+  Info* info = new Info(
+      containerId,
+      path::join(flags.cgroups_root, containerId.value()));
+
+  infos[containerId] = CHECK_NOTNULL(info);
+
+  // Create a cgroup for this container.
+  Try<bool> exists = cgroups::exists(hierarchy, info->cgroup);
+
+  if (exists.isError()) {
+    return Failure("Failed to prepare isolator: " + exists.error());
+  }
+
+  if (exists.get()) {
+    return Failure("Failed to prepare isolator: cgroup already exists");
+  }
+
+  // Past this point the cgroup is known not to exist, so it can be
+  // created without re-checking.
+  Try<Nothing> create = cgroups::create(hierarchy, info->cgroup);
+  if (create.isError()) {
+    return Failure("Failed to prepare isolator: " + create.error());
+  }
+
+  // Chown the cgroup so the executor can create nested cgroups. Do
+  // not recurse so the control files are still owned by the slave
+  // user and thus cannot be changed by the executor.
+  if (user.isSome()) {
+    Try<Nothing> chown = os::chown(
+        user.get(),
+        path::join(hierarchy, info->cgroup),
+        false);
+    if (chown.isError()) {
+      return Failure("Failed to prepare isolator: " + chown.error());
+    }
+  }
+
+  return None();
+}
+
+
+// Moves the process 'pid' into the container's perf_event cgroup.
+Future<Nothing> CgroupsPerfEventIsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  Try<Nothing> assigned = cgroups::assign(hierarchy, info->cgroup, pid);
+  if (!assigned.isError()) {
+    return Nothing();
+  }
+
+  return Failure(
+      "Failed to assign container '" + stringify(info->containerId) +
+      "' to its own cgroup '" + path::join(hierarchy, info->cgroup) +
+      "' : " + assigned.error());
+}
+
+
+// This isolator imposes no limits, so it never reports a limitation:
+// the returned future is default constructed and nothing in this
+// function ever completes it.
+Future<ContainerLimitation> CgroupsPerfEventIsolatorProcess::watch(
+ const ContainerID& containerId)
+{
+ // No resources are limited.
+ return Future<ContainerLimitation>();
+}
+
+
+// No-op: both arguments are ignored and the call succeeds immediately,
+// even for containers unknown to this isolator.
+Future<Nothing> CgroupsPerfEventIsolatorProcess::update(
+ const ContainerID& containerId,
+ const Resources& resources)
+{
+ // Nothing to update.
+ return Nothing();
+}
+
+
+// Returns the most recently stored perf statistics for a container.
+// The values are whatever '_sample' last recorded; no sampling happens
+// here.
+Future<ResourceStatistics> CgroupsPerfEventIsolatorProcess::usage(
+    const ContainerID& containerId)
+{
+  // For a container we don't know about, hand back an empty
+  // ResourceStatistics, i.e., one without PerfStatistics.
+  if (!infos.contains(containerId)) {
+    return ResourceStatistics();
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  ResourceStatistics statistics;
+  statistics.mutable_perf()->CopyFrom(info->statistics);
+
+  return statistics;
+}
+
+
+// Marks the container as being destroyed (so 'sample' skips it),
+// destroys its perf_event cgroup and, once that succeeded, releases
+// the bookkeeping in '_cleanup'.
+Future<Nothing> CgroupsPerfEventIsolatorProcess::cleanup(
+    const ContainerID& containerId)
+{
+  // Repeated clean up attempts (during test cleanup) can reach us for
+  // containers we no longer know about; tolerate them.
+  if (!infos.contains(containerId)) {
+    VLOG(1) << "Ignoring cleanup request for unknown container: "
+            << containerId;
+    return Nothing();
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  info->destroying = true;
+
+  Future<Nothing> destroyed = cgroups::destroy(hierarchy, info->cgroup);
+
+  return destroyed
+    .then(defer(PID<CgroupsPerfEventIsolatorProcess>(this),
+                &CgroupsPerfEventIsolatorProcess::_cleanup,
+                containerId));
+}
+
+
+// Continuation of 'cleanup': frees and forgets the container's Info if
+// it is still tracked. Always succeeds.
+Future<Nothing> CgroupsPerfEventIsolatorProcess::_cleanup(
+    const ContainerID& containerId)
+{
+  if (infos.contains(containerId)) {
+    delete infos[containerId];
+    infos.erase(containerId);
+  }
+
+  return Nothing();
+}
+
+
+// Timeout handler passed to 'after' in 'sample': logs that the perf
+// sample of length 'duration' did not complete within 'timeout',
+// requests a discard of the sample future and returns that same future.
+Future<hashmap<string, PerfStatistics>> discardSample(
+ Future<hashmap<string, PerfStatistics>> future,
+ const Duration& duration,
+ const Duration& timeout)
+{
+ LOG(ERROR) << "Perf sample of " << stringify(duration)
+ << " failed to complete within " << stringify(timeout)
+ << "; sampling will be halted";
+
+ future.discard();
+
+ return future;
+}
+
+
+// Starts one perf sample covering every tracked cgroup that is not
+// being destroyed. The sample is discarded via 'discardSample' if it
+// overruns the timeout, and in every case '_sample' is invoked with the
+// outcome and the time at which the next sample is due.
+void CgroupsPerfEventIsolatorProcess::sample()
+{
+  // Skip cgroups that are being destroyed: destroyal is asynchronous,
+  // so 'perf stat' may fail if the cgroup is destroyed before running
+  // perf.
+  set<string> targets;
+
+  foreachvalue (Info* info, infos) {
+    CHECK_NOTNULL(info);
+
+    if (info->destroying) {
+      continue;
+    }
+
+    targets.insert(info->cgroup);
+  }
+
+  // Allow twice the reaper interval on top of the sampling duration
+  // so that we are sure to see the perf process exit before giving up.
+  Duration timeout = flags.perf_duration + process::MAX_REAP_INTERVAL() * 2;
+
+  perf::sample(events, targets, flags.perf_duration)
+    .after(timeout,
+           lambda::bind(&discardSample,
+                        lambda::_1,
+                        flags.perf_duration,
+                        timeout))
+    .onAny(defer(PID<CgroupsPerfEventIsolatorProcess>(this),
+                 &CgroupsPerfEventIsolatorProcess::_sample,
+                 Clock::now() + flags.perf_interval,
+                 lambda::_1));
+}
+
+
+// Completion handler for a perf sample: on success stores the new
+// statistics for every tracked cgroup present in the sample, on failure
+// logs the reason. Either way the next call to 'sample' is scheduled
+// for time 'next'.
+void CgroupsPerfEventIsolatorProcess::_sample(
+    const Time& next,
+    const Future<hashmap<string, PerfStatistics>>& statistics)
+{
+  if (statistics.isReady()) {
+    // Keep the latest statistics. Cgroups added while the sample was
+    // running are not part of it; they will be picked up by the next
+    // sample.
+    foreachvalue (Info* info, infos) {
+      CHECK_NOTNULL(info);
+
+      const Option<PerfStatistics> latest = statistics->get(info->cgroup);
+      if (latest.isSome()) {
+        info->statistics = latest.get();
+      }
+    }
+  } else {
+    // The failure may be transient or caused by a timeout, so sampling
+    // simply continues. Since sampling is done on an interval, that
+    // should be ok even if the failure is not transient.
+    LOG(ERROR) << "Failed to get perf sample: "
+               << (statistics.isFailed()
+                   ? statistics.failure()
+                   : "discarded due to timeout");
+  }
+
+  // Schedule sample for the next time.
+  delay(next - Clock::now(),
+        PID<CgroupsPerfEventIsolatorProcess>(this),
+        &CgroupsPerfEventIsolatorProcess::sample);
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/cgroups/perf_event.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/perf_event.hpp b/src/slave/containerizer/mesos/isolators/cgroups/perf_event.hpp
new file mode 100644
index 0000000..1f722ef
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/cgroups/perf_event.hpp
@@ -0,0 +1,127 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PERF_EVENT_ISOLATOR_HPP__
+#define __PERF_EVENT_ISOLATOR_HPP__
+
+#include <set>
+
+#include <process/time.hpp>
+
+#include <stout/hashmap.hpp>
+#include <stout/nothing.hpp>
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Isolator process that keeps, for every tracked container, the most
+// recent perf statistics of the container's cgroup (see 'Info' and
+// '_sample()'). Instances are obtained through 'create()'; the
+// constructor is private.
+class CgroupsPerfEventIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+ // Factory: returns the isolator on success or an error otherwise.
+ static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+ virtual ~CgroupsPerfEventIsolatorProcess();
+
+ // The methods below implement the isolator interface; their
+ // definitions are not part of this header.
+ virtual process::Future<Nothing> recover(
+ const std::list<mesos::slave::ContainerState>& states,
+ const hashset<ContainerID>& orphans);
+
+ virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+ const ContainerID& containerId,
+ const ExecutorInfo& executorInfo,
+ const std::string& directory,
+ const Option<std::string>& user);
+
+ virtual process::Future<Nothing> isolate(
+ const ContainerID& containerId,
+ pid_t pid);
+
+ virtual process::Future<mesos::slave::ContainerLimitation> watch(
+ const ContainerID& containerId);
+
+ virtual process::Future<Nothing> update(
+ const ContainerID& containerId,
+ const Resources& resources);
+
+ virtual process::Future<ResourceStatistics> usage(
+ const ContainerID& containerId);
+
+ virtual process::Future<Nothing> cleanup(
+ const ContainerID& containerId);
+
+protected:
+ virtual void initialize();
+
+private:
+ // Only reachable through 'create()'. Copies the flags, the cgroups
+ // hierarchy path and the set of events into the members below.
+ CgroupsPerfEventIsolatorProcess(
+ const Flags& _flags,
+ const std::string& _hierarchy,
+ const std::set<std::string>& _events)
+ : flags(_flags),
+ hierarchy(_hierarchy),
+ events(_events) {}
+
+ // Starts one sampling round; '_sample()' is its continuation and
+ // receives the time at which the next round should start together
+ // with the (possibly failed or discarded) per-cgroup statistics.
+ void sample();
+
+ void _sample(
+ const process::Time& next,
+ const process::Future<hashmap<std::string, PerfStatistics>>& statistics);
+
+ // NOTE(review): presumably the continuation of 'cleanup()'; the
+ // definition is not visible here -- confirm.
+ virtual process::Future<Nothing> _cleanup(const ContainerID& containerId);
+
+ // Per-container bookkeeping: the container's ID, its cgroup, the
+ // latest perf statistics and whether destruction has started.
+ struct Info
+ {
+ Info(const ContainerID& _containerId, const std::string& _cgroup)
+ : containerId(_containerId), cgroup(_cgroup), destroying(false)
+ {
+ // Ensure the initial statistics include the required fields.
+ // Note the duration is set to zero to indicate no sampling has
+ // taken place. This empty sample will be returned from usage()
+ // until the first true sample is obtained.
+ statistics.set_timestamp(process::Clock::now().secs());
+ statistics.set_duration(Seconds(0).secs());
+ }
+
+ const ContainerID containerId;
+ const std::string cgroup;
+ // Latest sample for 'cgroup'; overwritten by '_sample()'.
+ PerfStatistics statistics;
+ // Mark a container when we start destruction so we stop sampling it.
+ bool destroying;
+ };
+
+ const Flags flags;
+
+ // The path to the cgroups subsystem hierarchy root.
+ const std::string hierarchy;
+
+ // Set of events to sample.
+ std::set<std::string> events;
+
+ // Tracked containers, keyed by container ID. The values are raw
+ // pointers (see the TODO below).
+ // TODO(jieyu): Use Owned<Info>.
+ hashmap<ContainerID, Info*> infos;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PERF_EVENT_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/filesystem/linux.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/filesystem/linux.cpp b/src/slave/containerizer/mesos/isolators/filesystem/linux.cpp
new file mode 100644
index 0000000..a126cd6
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/filesystem/linux.cpp
@@ -0,0 +1,893 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <list>
+#include <sstream>
+#include <string>
+
+#include <glog/logging.h>
+
+#include <process/collect.hpp>
+
+#include <process/metrics/metrics.hpp>
+
+#include <stout/error.hpp>
+#include <stout/foreach.hpp>
+#include <stout/os.hpp>
+#include <stout/path.hpp>
+#include <stout/stringify.hpp>
+#include <stout/strings.hpp>
+
+#include <stout/os/shell.hpp>
+
+#include "linux/fs.hpp"
+#include "linux/ns.hpp"
+
+#include "slave/paths.hpp"
+
+#include "slave/containerizer/mesos/isolators/filesystem/linux.hpp"
+
+using namespace process;
+
+using std::list;
+using std::ostringstream;
+using std::string;
+
+using mesos::slave::ContainerState;
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::Isolator;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Creates the Linux filesystem isolator.
+//
+// Requires root privileges. Before constructing the isolator process,
+// this makes sure that 'flags.work_dir' is a mount point that is a
+// shared mount in a peer group of its own:
+//   - if the work directory is not a mount point yet, it is self bind
+//     mounted and then marked slave + shared;
+//   - if it is a mount point but not shared, it is marked slave +
+//     shared;
+//   - if it is shared but in the same peer group as its parent mount,
+//     it is marked slave + shared again to split it off.
+//
+// Returns the new isolator, or an Error if the user cannot be
+// determined or is not root, the mount table cannot be read, or any
+// of the 'mount' commands fails.
+Try<Isolator*> LinuxFilesystemIsolatorProcess::create(
+    const Flags& flags,
+    const Owned<Provisioner>& provisioner)
+{
+  Result<string> user = os::user();
+  if (!user.isSome()) {
+    return Error("Failed to determine user: " +
+                 (user.isError() ? user.error() : "username not found"));
+  }
+
+  if (user.get() != "root") {
+    return Error("LinuxFilesystemIsolator requires root privileges");
+  }
+
+  // Make slave's work_dir a shared mount so that when forking a child
+  // process (with a new mount namespace), the child process does not
+  // hold extra references to container's work directory mounts and
+  // provisioner mounts (e.g., when using the bind backend) because
+  // cleanup operations within work_dir can be propagated to all
+  // container namespaces. See MESOS-3483 for more details.
+  LOG(INFO) << "Making '" << flags.work_dir << "' a shared mount";
+
+  Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
+  if (table.isError()) {
+    return Error("Failed to get mount table: " + table.error());
+  }
+
+  Option<fs::MountInfoTable::Entry> workDirMount;
+  foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
+    // TODO(jieyu): Make sure 'flags.work_dir' is a canonical path.
+    if (entry.target == flags.work_dir) {
+      workDirMount = entry;
+      break;
+    }
+  }
+
+  // Do a self bind mount if needed. If the mount already exists, make
+  // sure it is a shared mount of its own peer group.
+  if (workDirMount.isNone()) {
+    // NOTE: Instead of using fs::mount to perform the bind mount, we
+    // use the shell command here because the syscall 'mount' does not
+    // update the mount table (i.e., /etc/mtab). In other words, the
+    // mount will not be visible if the operator types command
+    // 'mount'. Since this mount will still be presented after all
+    // containers and the slave are stopped, it's better to make it
+    // visible. It's OK to use the blocking os::shell here because
+    // 'create' will only be invoked during initialization.
+    Try<string> mount = os::shell(
+        "mount --bind %s %s && "
+        "mount --make-slave %s && "
+        "mount --make-shared %s",
+        flags.work_dir.c_str(),
+        flags.work_dir.c_str(),
+        flags.work_dir.c_str(),
+        flags.work_dir.c_str());
+
+    if (mount.isError()) {
+      return Error(
+          "Failed to self bind mount '" + flags.work_dir +
+          "' and make it a shared mount: " + mount.error());
+    }
+  } else {
+    if (workDirMount.get().shared().isNone()) {
+      // This is the case where the work directory mount is not a
+      // shared mount yet (possibly due to slave crash while preparing
+      // the work directory mount). It's safe to re-do the following.
+      Try<string> mount = os::shell(
+          "mount --make-slave %s && "
+          "mount --make-shared %s",
+          flags.work_dir.c_str(),
+          flags.work_dir.c_str());
+
+      // No bind mount is attempted in this branch, so the message
+      // only mentions the propagation change.
+      if (mount.isError()) {
+        return Error(
+            "Failed to make '" + flags.work_dir +
+            "' a shared mount: " + mount.error());
+      }
+    } else {
+      // We need to make sure that the shared mount is in its own peer
+      // group. To check that, we need to get the parent mount.
+      foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
+        if (entry.id == workDirMount.get().parent) {
+          // If the work directory mount and its parent mount are in
+          // the same peer group, we need to re-do the following
+          // commands so that they are in different peer groups.
+          if (entry.shared() == workDirMount.get().shared()) {
+            Try<string> mount = os::shell(
+                "mount --make-slave %s && "
+                "mount --make-shared %s",
+                flags.work_dir.c_str(),
+                flags.work_dir.c_str());
+
+            // As above: only the propagation change was attempted.
+            if (mount.isError()) {
+              return Error(
+                  "Failed to make '" + flags.work_dir +
+                  "' a shared mount in its own peer group: " +
+                  mount.error());
+            }
+          }
+
+          break;
+        }
+      }
+    }
+  }
+
+  Owned<MesosIsolatorProcess> process(
+      new LinuxFilesystemIsolatorProcess(flags, provisioner));
+
+  return new MesosIsolator(process);
+}
+
+
+// Stores the flags and the provisioner, and constructs the metrics
+// member with the PID of this process.
+LinuxFilesystemIsolatorProcess::LinuxFilesystemIsolatorProcess(
+ const Flags& _flags,
+ const Owned<Provisioner>& _provisioner)
+ : flags(_flags),
+ provisioner(_provisioner),
+ metrics(PID<LinuxFilesystemIsolatorProcess>(this)) {}
+
+
+// Nothing to do explicitly; members are released by their own
+// destructors.
+LinuxFilesystemIsolatorProcess::~LinuxFilesystemIsolatorProcess() {}
+
+
+// Rebuilds 'infos' from the checkpointed container states and from
+// the host mount table, cleans up the mounts of unknown orphans, and
+// then hands over to '_recover()'.
+//
+// 'states' are the containers known to the slave; 'orphans' are the
+// orphan containers the slave already knows about. Any container
+// found in the mount table that is in neither 'states' nor 'orphans'
+// is treated as an unknown orphan.
+//
+// Returns a Failure if the mount table cannot be read.
+Future<Nothing> LinuxFilesystemIsolatorProcess::recover(
+ const list<ContainerState>& states,
+ const hashset<ContainerID>& orphans)
+{
+ // Read the mount table in the host mount namespace to recover paths
+ // to containers' work directories if their root filesystems are
+ // changed. Method 'cleanup()' relies on this information to clean
+ // up mounts in the host mount namespace for each container.
+ Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
+ if (table.isError()) {
+ return Failure("Failed to get mount table: " + table.error());
+ }
+
+ foreach (const ContainerState& state, states) {
+ Owned<Info> info(new Info(state.directory()));
+
+ // The first mount whose root is the container's work directory
+ // gives us the sandbox mount point; if there is none,
+ // 'info->sandbox' is left unset.
+ foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
+ if (entry.root == info->directory) {
+ info->sandbox = entry.target;
+ break;
+ }
+ }
+
+ infos.put(state.container_id(), info);
+ }
+
+ // Recover both known and unknown orphans by scanning the mount
+ // table and finding those mounts whose roots are under slave's
+ // sandbox root directory. Those mounts are container's work
+ // directory mounts. Mounts from unknown orphans will be cleaned up
+ // immediately. Mounts from known orphans will be cleaned up when
+ // those known orphan containers are being destroyed by the slave.
+ hashset<ContainerID> unknownOrphans;
+
+ string sandboxRootDir = paths::getSandboxRootDir(flags.work_dir);
+
+ foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
+ if (!strings::startsWith(entry.root, sandboxRootDir)) {
+ continue;
+ }
+
+ // TODO(jieyu): Here, we retrieve the container ID by taking the
+ // basename of 'entry.root'. This assumes that the slave's sandbox
+ // root directory are organized according to the comments in the
+ // beginning of slave/paths.hpp.
+ ContainerID containerId;
+ containerId.set_value(Path(entry.root).basename());
+
+ // Already recovered, either from 'states' above or from an
+ // earlier mount table entry of the same container.
+ if (infos.contains(containerId)) {
+ continue;
+ }
+
+ Owned<Info> info(new Info(entry.root));
+
+ // A mount whose target differs from its root means the work
+ // directory was mounted somewhere else; remember that location.
+ if (entry.root != entry.target) {
+ info->sandbox = entry.target;
+ }
+
+ infos.put(containerId, info);
+
+ // Remember all the unknown orphan containers.
+ if (!orphans.contains(containerId)) {
+ unknownOrphans.insert(containerId);
+ }
+ }
+
+ // Cleanup mounts from unknown orphans.
+ list<Future<Nothing>> futures;
+ foreach (const ContainerID& containerId, unknownOrphans) {
+ futures.push_back(cleanup(containerId));
+ }
+
+ // Continue with '_recover()' once all orphan cleanups are done.
+ return collect(futures)
+ .then(defer(PID<LinuxFilesystemIsolatorProcess>(this),
+ &LinuxFilesystemIsolatorProcess::_recover,
+ states,
+ orphans));
+}
+
+
+// Second stage of recovery: lets the provisioner recover using the
+// same container states and orphans, and reduces whatever it yields
+// to a plain 'Nothing'.
+Future<Nothing> LinuxFilesystemIsolatorProcess::_recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  Future<Nothing> recovered = provisioner->recover(states, orphans)
+    .then([]() -> Future<Nothing> {
+      return Nothing();
+    });
+
+  return recovered;
+}
+
+
+// Entry point for preparing a container. Registers the container in
+// 'infos' and then dispatches on what the executor asked for:
+//   - no ContainerInfo at all: go straight to '__prepare()';
+//   - ContainerInfo without an image: '_prepare()' without a rootfs;
+//   - ContainerInfo with an image: provision the image first and pass
+//     the resulting rootfs to '_prepare()'.
+//
+// Fails if the container has already been prepared.
+Future<Option<ContainerPrepareInfo>> LinuxFilesystemIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user)
+{
+  if (infos.contains(containerId)) {
+    return Failure("Container has already been prepared");
+  }
+
+  Owned<Info> info(new Info(directory));
+  infos.put(containerId, info);
+
+  if (!executorInfo.has_container()) {
+    return __prepare(containerId, executorInfo, directory, user, None());
+  }
+
+  const ContainerInfo& container = executorInfo.container();
+
+  // Only Mesos containers are expected to reach this isolator.
+  CHECK_EQ(container.type(), ContainerInfo::MESOS);
+
+  if (container.mesos().has_image()) {
+    // Provision the root filesystem, then continue in '_prepare()'
+    // with the provisioned rootfs as the last argument.
+    return provisioner->provision(containerId, container.mesos().image())
+      .then(defer(PID<LinuxFilesystemIsolatorProcess>(this),
+                  &LinuxFilesystemIsolatorProcess::_prepare,
+                  containerId,
+                  executorInfo,
+                  directory,
+                  user,
+                  lambda::_1));
+  }
+
+  return _prepare(containerId, executorInfo, directory, user, None());
+}
+
+
+// Second stage of 'prepare()': provisions every image that is used as
+// the source of a volume in ContainerInfo::volumes and then continues
+// with '__prepare()'.
+//
+// 'rootfs' is the provisioned root filesystem of the container, if it
+// has one. The ExecutorInfo handed to '__prepare()' is a copy of
+// 'executorInfo' in which each image volume has its 'host_path' set
+// to the path of the provisioned root filesystem.
+Future<Option<ContainerPrepareInfo>> LinuxFilesystemIsolatorProcess::_prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user,
+    const Option<string>& rootfs)
+{
+  CHECK(executorInfo.has_container());
+  CHECK_EQ(executorInfo.container().type(), ContainerInfo::MESOS);
+
+  // We will provision the images specified in ContainerInfo::volumes
+  // as well. We will mutate ContainerInfo::volumes to include the
+  // paths to the provisioned root filesystems (by setting the
+  // 'host_path') if the volume specifies an image as the source.
+  //
+  // The copy is heap allocated and shared through 'Owned' so that the
+  // 'volume' pointers captured below stay valid until the final
+  // continuation has run.
+  Owned<ExecutorInfo> _executorInfo(new ExecutorInfo(executorInfo));
+  list<Future<Nothing>> futures;
+
+  for (int i = 0; i < _executorInfo->container().volumes_size(); i++) {
+    Volume* volume = _executorInfo->mutable_container()->mutable_volumes(i);
+
+    if (!volume->has_image()) {
+      continue;
+    }
+
+    const Image& image = volume->image();
+
+    futures.push_back(
+        provisioner->provision(containerId, image)
+          .then([volume](const string& path) -> Future<Nothing> {
+            volume->set_host_path(path);
+            return Nothing();
+          }));
+  }
+
+  // NOTE: '__prepare()' reads and modifies 'infos', so it must run in
+  // the execution context of this process. Without 'defer' the
+  // continuation would be invoked by whoever completes the collected
+  // future.
+  return collect(futures)
+    .then(defer(
+        PID<LinuxFilesystemIsolatorProcess>(this),
+        [=]() -> Future<Option<ContainerPrepareInfo>> {
+          return __prepare(
+              containerId, *_executorInfo, directory, user, rootfs);
+        }));
+}
+
+
+// Final stage of 'prepare()'. Builds the ContainerPrepareInfo for the
+// container: requests a new mount namespace, and, if the container
+// has its own root filesystem ('rootfs' is set), bind mounts the work
+// directory into it and marks that mount slave and then shared. It
+// then generates the script that mounts the container's volumes and
+// finally calls 'update()' with the executor's resources.
+//
+// Returns a Failure if the sandbox mount point cannot be created, if
+// any of the mount calls fails, or if the script cannot be generated.
+Future<Option<ContainerPrepareInfo>> LinuxFilesystemIsolatorProcess::__prepare(
+ const ContainerID& containerId,
+ const ExecutorInfo& executorInfo,
+ const string& directory,
+ const Option<string>& user,
+ const Option<string>& rootfs)
+{
+ CHECK(infos.contains(containerId));
+
+ const Owned<Info>& info = infos[containerId];
+
+ ContainerPrepareInfo prepareInfo;
+ prepareInfo.set_namespaces(CLONE_NEWNS);
+
+ if (rootfs.isSome()) {
+ // If the container changes its root filesystem, we need to mount
+ // the container's work directory into its root filesystem
+ // (creating it if needed) so that the executor and the task can
+ // access the work directory.
+ //
+ // NOTE: The mount of the work directory must be a shared mount in
+ // the host filesystem so that any mounts underneath it will
+ // propagate into the container's mount namespace. This is how we
+ // can update persistent volumes for the container.
+
+ // This is the mount point of the work directory in the root filesystem.
+ const string sandbox = path::join(rootfs.get(), flags.sandbox_directory);
+
+ // Save the path 'sandbox' which will be used in 'cleanup()'.
+ info->sandbox = sandbox;
+
+ if (!os::exists(sandbox)) {
+ Try<Nothing> mkdir = os::mkdir(sandbox);
+ if (mkdir.isError()) {
+ return Failure(
+ "Failed to create sandbox mount point at '" +
+ sandbox + "': " + mkdir.error());
+ }
+ }
+
+ LOG(INFO) << "Bind mounting work directory from '" << directory
+ << "' to '" << sandbox << "' for container " << containerId;
+
+ // Step 1: bind mount the work directory onto the sandbox path.
+ Try<Nothing> mount = fs::mount(
+ directory,
+ sandbox,
+ None(),
+ MS_BIND,
+ NULL);
+
+ if (mount.isError()) {
+ return Failure(
+ "Failed to mount work directory '" + directory +
+ "' to '" + sandbox + "': " + mount.error());
+ }
+
+ // Step 2: mark the new mount as a slave mount.
+ mount = fs::mount(
+ None(),
+ sandbox,
+ None(),
+ MS_SLAVE,
+ NULL);
+
+ if (mount.isError()) {
+ return Failure(
+ "Failed to mark sandbox '" + sandbox +
+ "' as a slave mount: " + mount.error());
+ }
+
+ // Step 3: mark it as a shared mount (see the NOTE above).
+ mount = fs::mount(
+ None(),
+ sandbox,
+ None(),
+ MS_SHARED,
+ NULL);
+
+ if (mount.isError()) {
+ return Failure(
+ "Failed to mark sandbox '" + sandbox +
+ "' as a shared mount: " + mount.error());
+ }
+
+ prepareInfo.set_rootfs(rootfs.get());
+ }
+
+ // Prepare the commands that will be run in the container's mount
+ // namespace right after forking the executor process. We use these
+ // commands to mount those volumes specified in the container info
+ // so that they don't pollute the host mount namespace.
+ Try<string> _script = script(containerId, executorInfo, directory, rootfs);
+ if (_script.isError()) {
+ return Failure("Failed to generate isolation script: " + _script.error());
+ }
+
+ CommandInfo* command = prepareInfo.add_commands();
+ command->set_value(_script.get());
+
+ // Apply the executor's resources through 'update()' and return the
+ // prepare info once that has completed.
+ return update(containerId, executorInfo.resources())
+ .then([prepareInfo]() -> Future<Option<ContainerPrepareInfo>> {
+ return prepareInfo;
+ });
+}
+
+
+// Generates the shell script that is added as a command to the
+// container's ContainerPrepareInfo by '__prepare()'.
+//
+// The script first marks '/' as a recursive slave mount, then makes a
+// best-effort attempt to lazily unmount everything under the slave's
+// work directory that does not mention this container's ID, and
+// finally emits one 'mount -n --rbind' per volume in the executor's
+// ContainerInfo.
+//
+// While building the script, this function also creates missing
+// mount sources and targets for relative paths on the filesystem.
+//
+// Returns an Error if a volume has no 'host_path', if both paths of a
+// volume are relative, if an absolute host or container path does not
+// exist, or if a directory cannot be created.
+Try<string> LinuxFilesystemIsolatorProcess::script(
+ const ContainerID& containerId,
+ const ExecutorInfo& executorInfo,
+ const string& directory,
+ const Option<string>& rootfs)
+{
+ ostringstream out;
+ out << "#!/bin/sh\n";
+ out << "set -x -e\n";
+
+ // Make sure mounts in the container mount namespace do not
+ // propagate back to the host mount namespace.
+ out << "mount --make-rslave /\n";
+
+ // Try to unmount work directory mounts and persistent volume mounts
+ // for other containers to release the extra references to them.
+ // NOTE:
+ // 1) This doesn't completely eliminate the race condition between
+ // this container copying mount table and other containers being
+ // cleaned up. This is instead a best-effort attempt.
+ // 2) This script assumes that all the mounts the container needs
+ // under the slave work directory have its container ID in the
+ // path either for the mount source (e.g. sandbox self-bind mount)
+ // or the mount target (e.g. mounting sandbox into new rootfs).
+ //
+ // TODO(xujyan): This command may fail if --work_dir is not specified
+ // with a real path as real paths are used in the mount table. It
+ // doesn't work when the paths contain reserved characters such as
+ // spaces either because such characters in mount info are encoded
+ // in the escaped form (i.e. '\0xx').
+ out << "grep -E '" << flags.work_dir << "/.+' /proc/self/mountinfo | "
+ << "grep -v '" << containerId.value() << "' | "
+ << "cut -d' ' -f5 | " // '-f5' is the mount target. See MountInfoTable.
+ << "xargs --no-run-if-empty umount -l || "
+ << "true \n"; // We mask errors in this command.
+
+ // Without a ContainerInfo there are no volumes to mount.
+ if (!executorInfo.has_container()) {
+ return out.str();
+ }
+
+ foreach (const Volume& volume, executorInfo.container().volumes()) {
+ if (!volume.has_host_path()) {
+ return Error("A volume misses 'host_path'");
+ }
+
+ // If both 'host_path' and 'container_path' are relative paths,
+ // return an error because the user can just directly access the
+ // volume in the work directory.
+ if (!strings::startsWith(volume.host_path(), "/") &&
+ !strings::startsWith(volume.container_path(), "/")) {
+ return Error(
+ "Both 'host_path' and 'container_path' of a volume are relative");
+ }
+
+ // Determine the source of the mount.
+ string source;
+
+ if (strings::startsWith(volume.host_path(), "/")) {
+ source = volume.host_path();
+
+ // An absolute path must already exist.
+ if (!os::exists(source)) {
+ return Error("Absolute host path does not exist");
+ }
+ } else {
+ // Path is interpreted as relative to the work directory.
+ source = path::join(directory, volume.host_path());
+
+ // TODO(jieyu): We need to check that source resolves under the
+ // work directory because a user can potentially use a container
+ // path like '../../abc'.
+
+ if (!os::exists(source)) {
+ Try<Nothing> mkdir = os::mkdir(source);
+ if (mkdir.isError()) {
+ return Error(
+ "Failed to create the source of the mount at '" +
+ source + "': " + mkdir.error());
+ }
+
+ // TODO(idownes): Consider setting ownership and mode.
+ }
+ }
+
+ // Determine the target of the mount.
+ string target;
+
+ if (strings::startsWith(volume.container_path(), "/")) {
+ if (rootfs.isSome()) {
+ target = path::join(rootfs.get(), volume.container_path());
+ } else {
+ target = volume.container_path();
+ }
+
+ // An absolute path must already exist. This is because we want
+ // to avoid creating mount points outside the work directory in
+ // the host filesystem or in the container filesystem root.
+ if (!os::exists(target)) {
+ return Error("Absolute container path does not exist");
+ }
+
+ // TODO(jieyu): We need to check that target resolves under
+ // 'rootfs' because a user can potentially use a container path
+ // like '/../../abc'.
+ } else {
+ if (rootfs.isSome()) {
+ target = path::join(rootfs.get(),
+ flags.sandbox_directory,
+ volume.container_path());
+ } else {
+ target = path::join(directory, volume.container_path());
+ }
+
+ // TODO(jieyu): We need to check that target resolves under the
+ // sandbox because a user can potentially use a container path
+ // like '../../abc'.
+
+ if (!os::exists(target)) {
+ Try<Nothing> mkdir = os::mkdir(target);
+ if (mkdir.isError()) {
+ return Error(
+ "Failed to create the target of the mount at '" +
+ target + "': " + mkdir.error());
+ }
+ }
+ }
+
+ // NOTE(review): 'source' and 'target' are placed inside single
+ // quotes without any escaping, so a path that itself contains a
+ // single quote would presumably break the generated command --
+ // confirm whether such paths can reach this point.
+ // TODO(jieyu): Consider the mode in the volume.
+ out << "mount -n --rbind '" << source << "' '" << target << "'\n";
+ }
+
+ return out.str();
+}
+
+
+// Part of the isolator interface. Both arguments are unused and the
+// call always succeeds immediately.
+Future<Nothing> LinuxFilesystemIsolatorProcess::isolate(
+ const ContainerID& containerId,
+ pid_t pid)
+{
+ // No-op, isolation happens when unsharing the mount namespace.
+ return Nothing();
+}
+
+
+// Part of the isolator interface. This isolator never reports a
+// limitation: it returns a default-constructed future.
+// NOTE(review): presumably that future stays pending forever --
+// confirm against the libprocess Future default constructor.
+Future<ContainerLimitation> LinuxFilesystemIsolatorProcess::watch(
+ const ContainerID& containerId)
+{
+ // No-op.
+ return Future<ContainerLimitation>();
+}
+
+
+// Brings the persistent volume mounts of a container in line with
+// 'resources'.
+//
+// Persistent volumes that are in the container's current resources
+// but not in 'resources' are unmounted and their mount points
+// removed. Persistent volumes that are in 'resources' but not in the
+// current resources get their ownership changed to match the work
+// directory and are bind mounted, unless the mount point already
+// exists. Volumes whose container path contains a slash are skipped
+// with a warning in both passes.
+//
+// On success 'resources' becomes the container's current resources.
+// Returns a Failure for an unknown container or if any unmount,
+// rmdir, stat, chown, mkdir or mount step fails; in that case the
+// current resources are left unchanged.
+Future<Nothing> LinuxFilesystemIsolatorProcess::update(
+ const ContainerID& containerId,
+ const Resources& resources)
+{
+ // Mount persistent volumes. We do this in the host namespace and
+ // rely on mount propagation for them to be visible inside the
+ // container.
+ if (!infos.contains(containerId)) {
+ return Failure("Unknown container");
+ }
+
+ const Owned<Info>& info = infos[containerId];
+
+ Resources current = info->resources;
+
+ // We first remove unneeded persistent volumes.
+ foreach (const Resource& resource, current.persistentVolumes()) {
+ // This is enforced by the master.
+ CHECK(resource.disk().has_volume());
+
+ // Ignore absolute and nested paths.
+ const string& containerPath = resource.disk().volume().container_path();
+ if (strings::contains(containerPath, "/")) {
+ LOG(WARNING) << "Skipping updating mount for persistent volume "
+ << resource << " of container " << containerId
+ << " because the container path '" << containerPath
+ << "' contains slash";
+ continue;
+ }
+
+ // Still wanted: keep the mount.
+ if (resources.contains(resource)) {
+ continue;
+ }
+
+ // Determine the target of the mount.
+ string target;
+
+ if (info->sandbox.isSome()) {
+ target = path::join(info->sandbox.get(), containerPath);
+ } else {
+ target = path::join(info->directory, containerPath);
+ }
+
+ LOG(INFO) << "Removing mount '" << target << "' for persistent volume "
+ << resource << " of container " << containerId;
+
+ // The unmount will fail if the task/executor is still using files
+ // or directories under 'target'.
+ Try<Nothing> unmount = fs::unmount(target);
+ if (unmount.isError()) {
+ return Failure(
+ "Failed to unmount unneeded persistent volume at '" +
+ target + "': " + unmount.error());
+ }
+
+ // NOTE: This is a non-recursive rmdir.
+ Try<Nothing> rmdir = os::rmdir(target, false);
+ if (rmdir.isError()) {
+ return Failure(
+ "Failed to remove persistent volume mount point at '" +
+ target + "': " + rmdir.error());
+ }
+ }
+
+ // We then mount new persistent volumes.
+ foreach (const Resource& resource, resources.persistentVolumes()) {
+ // This is enforced by the master.
+ CHECK(resource.disk().has_volume());
+
+ // Ignore absolute and nested paths.
+ const string& containerPath = resource.disk().volume().container_path();
+ if (strings::contains(containerPath, "/")) {
+ LOG(WARNING) << "Skipping updating mount for persistent volume "
+ << resource << " of container " << containerId
+ << " because the container path '" << containerPath
+ << "' contains slash";
+ continue;
+ }
+
+ // Already part of the current resources: nothing to do.
+ if (current.contains(resource)) {
+ continue;
+ }
+
+ // Determine the source of the mount.
+ string source = paths::getPersistentVolumePath(
+ flags.work_dir,
+ resource.role(),
+ resource.disk().persistence().id());
+
+ // Set the ownership of the persistent volume to match that of the
+ // sandbox directory.
+ //
+ // NOTE: Currently, persistent volumes in Mesos are exclusive,
+ // meaning that if a persistent volume is used by one task or
+ // executor, it cannot be concurrently used by other task or
+ // executor. But if we allow multiple executors to use same
+ // persistent volume at the same time in the future, the ownership
+ // of the persistent volume may conflict here.
+ //
+ // TODO(haosdent): Consider letting the frameworks specify the
+ // user/group of the persistent volumes.
+ struct stat s;
+ if (::stat(info->directory.c_str(), &s) < 0) {
+ return Failure(
+ "Failed to get ownership for '" + info->directory +
+ "': " + strerror(errno));
+ }
+
+ LOG(INFO) << "Changing the ownership of the persistent volume at '"
+ << source << "' with uid " << s.st_uid
+ << " and gid " << s.st_gid;
+
+ // NOTE(review): the trailing 'true' is presumably the recursive
+ // flag of os::chown -- confirm.
+ Try<Nothing> chown = os::chown(s.st_uid, s.st_gid, source, true);
+ if (chown.isError()) {
+ return Failure(
+ "Failed to change the ownership of the persistent volume at '" +
+ source + "' with uid " + stringify(s.st_uid) +
+ " and gid " + stringify(s.st_gid) + ": " + chown.error());
+ }
+
+ // Determine the target of the mount.
+ string target;
+
+ if (info->sandbox.isSome()) {
+ target = path::join(info->sandbox.get(), containerPath);
+ } else {
+ target = path::join(info->directory, containerPath);
+ }
+
+ if (os::exists(target)) {
+ // NOTE: This is possible because 'info->resources' will be
+ // reset when slave restarts and recovers. When the slave calls
+ // 'containerizer->update' after the executor re-registers,
+ // we'll try to re-mount all the already mounted volumes.
+
+ // TODO(jieyu): Check the source of the mount matches the entry
+ // with the same target in the mount table if one can be found.
+ // If not, mount the persistent volume as we did below. This is
+ // possible because the slave could crash after it unmounts the
+ // volume but before it is able to delete the mount point.
+ } else {
+ Try<Nothing> mkdir = os::mkdir(target);
+ if (mkdir.isError()) {
+ return Failure(
+ "Failed to create persistent volume mount point at '" +
+ target + "': " + mkdir.error());
+ }
+
+ LOG(INFO) << "Mounting '" << source << "' to '" << target
+ << "' for persistent volume " << resource
+ << " of container " << containerId;
+
+ Try<Nothing> mount = fs::mount(source, target, None(), MS_BIND, NULL);
+ if (mount.isError()) {
+ return Failure(
+ "Failed to mount persistent volume from '" +
+ source + "' to '" + target + "': " + mount.error());
+ }
+ }
+ }
+
+ // Store the new resources;
+ info->resources = resources;
+
+ return Nothing();
+}
+
+
+// Part of the isolator interface. The argument is unused and the
+// result is always a default-constructed ResourceStatistics.
+Future<ResourceStatistics> LinuxFilesystemIsolatorProcess::usage(
+ const ContainerID& containerId)
+{
+ // No-op, no usage gathered.
+ return ResourceStatistics();
+}
+
+
+// Releases everything this isolator holds for a container: forgets
+// the container, unmounts (in the host mount namespace) every mount
+// underneath the container's sandbox/work directory and then the
+// sandbox/work directory mount itself if there is one, and finally
+// asks the provisioner to destroy the provisioned root filesystems.
+//
+// A request for an unknown container is ignored. Returns a Failure if
+// the mount table cannot be read or an unmount fails; the container
+// has already been forgotten at that point.
+Future<Nothing> LinuxFilesystemIsolatorProcess::cleanup(
+    const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    VLOG(1) << "Ignoring cleanup request for unknown container: "
+            << containerId;
+
+    return Nothing();
+  }
+
+  const Owned<Info>& info = infos[containerId];
+
+  // NOTE: We don't need to cleanup mounts in the container's mount
+  // namespace because it's done automatically by the kernel when the
+  // mount namespace is destroyed after the last process terminates.
+
+  // The path to the container's work directory which is the parent of
+  // all the persistent volume mounts.
+  string sandbox;
+
+  if (info->sandbox.isSome()) {
+    sandbox = info->sandbox.get();
+  } else {
+    sandbox = info->directory;
+  }
+
+  // NOTE: 'info' must not be used past this point.
+  infos.erase(containerId);
+
+  // Prefix that identifies mounts strictly underneath the sandbox.
+  // The trailing separator prevents a sibling path that merely starts
+  // with the same characters (e.g., '<sandbox>-old') from being
+  // mistaken for a mount of this container.
+  const string sandboxPrefix =
+    strings::endsWith(sandbox, "/") ? sandbox : sandbox + "/";
+
+  // Cleanup the mounts for this container in the host mount
+  // namespace, including container's work directory and all the
+  // persistent volume mounts.
+  Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
+  if (table.isError()) {
+    return Failure("Failed to get mount table: " + table.error());
+  }
+
+  bool sandboxMountExists = false;
+
+  foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
+    // NOTE: All persistent volumes are mounted at targets under the
+    // container's work directory. We unmount all the persistent
+    // volumes before unmounting the sandbox/work directory mount.
+    if (entry.target == sandbox) {
+      sandboxMountExists = true;
+    } else if (strings::startsWith(entry.target, sandboxPrefix)) {
+      LOG(INFO) << "Unmounting volume '" << entry.target
+                << "' for container " << containerId;
+
+      Try<Nothing> unmount = fs::unmount(entry.target);
+      if (unmount.isError()) {
+        return Failure(
+            "Failed to unmount volume '" + entry.target +
+            "': " + unmount.error());
+      }
+    }
+  }
+
+  if (!sandboxMountExists) {
+    // This could happen if the container was not launched by this
+    // isolator (e.g., slaves prior to 0.25.0), or the container did
+    // not specify a root filesystem.
+    LOG(INFO) << "Ignoring unmounting sandbox/work directory"
+              << " for container " << containerId;
+  } else {
+    LOG(INFO) << "Unmounting sandbox/work directory '" << sandbox
+              << "' for container " << containerId;
+
+    Try<Nothing> unmount = fs::unmount(sandbox);
+    if (unmount.isError()) {
+      return Failure(
+          "Failed to unmount sandbox/work directory '" + sandbox +
+          "': " + unmount.error());
+    }
+  }
+
+  // Destroy the provisioned root filesystems.
+  return provisioner->destroy(containerId)
+    .then([]() -> Future<Nothing> { return Nothing(); });
+}
+
+
+// Sets up the 'containers_new_rootfs' metric so that its value is
+// obtained by deferring to '_containers_new_rootfs()' on the given
+// isolator, and adds it to the process metrics.
+LinuxFilesystemIsolatorProcess::Metrics::Metrics(
+ const PID<LinuxFilesystemIsolatorProcess>& isolator)
+ : containers_new_rootfs(
+ "containerizer/mesos/filesystem/containers_new_rootfs",
+ defer(isolator, &LinuxFilesystemIsolatorProcess::_containers_new_rootfs))
+{
+ process::metrics::add(containers_new_rootfs);
+}
+
+
+// Removes the metric that the constructor added.
+LinuxFilesystemIsolatorProcess::Metrics::~Metrics()
+{
+ process::metrics::remove(containers_new_rootfs);
+}
+
+
+// Value of the 'containers_new_rootfs' metric: the number of tracked
+// containers that have a sandbox mount point recorded.
+double LinuxFilesystemIsolatorProcess::_containers_new_rootfs()
+{
+  double total = 0.0;
+
+  foreachvalue (const Owned<Info>& info, infos) {
+    total += info->sandbox.isSome() ? 1.0 : 0.0;
+  }
+
+  return total;
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/filesystem/linux.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/filesystem/linux.hpp b/src/slave/containerizer/mesos/isolators/filesystem/linux.hpp
new file mode 100644
index 0000000..cd99966
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/filesystem/linux.hpp
@@ -0,0 +1,149 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LINUX_FILESYSTEM_ISOLATOR_HPP__
+#define __LINUX_FILESYSTEM_ISOLATOR_HPP__
+
+#include <mesos/mesos.hpp>
+#include <mesos/resources.hpp>
+
+#include <process/owned.hpp>
+#include <process/pid.hpp>
+
+#include <process/metrics/gauge.hpp>
+
+#include <stout/hashmap.hpp>
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+#include "slave/containerizer/mesos/provisioner/provisioner.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// The filesystem isolator on Linux that is responsible for preparing
+// the root filesystems and volumes (e.g., persistent volumes) for
+// containers. It relies on Linux mount namespace to prevent mounts of
+// a container from being propagated to the host mount table.
+class LinuxFilesystemIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+ // Factory function. The constructor is private, so instances are
+ // obtained through this function.
+ static Try<mesos::slave::Isolator*> create(
+ const Flags& flags,
+ const process::Owned<Provisioner>& provisioner);
+
+ virtual ~LinuxFilesystemIsolatorProcess();
+
+ // NOTE(review): the virtual methods below presumably implement the
+ // interface declared by MesosIsolatorProcess; confirm against
+ // "slave/containerizer/isolator.hpp".
+ virtual process::Future<Nothing> recover(
+ const std::list<mesos::slave::ContainerState>& states,
+ const hashset<ContainerID>& orphans);
+
+ virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+ const ContainerID& containerId,
+ const ExecutorInfo& executorInfo,
+ const std::string& directory,
+ const Option<std::string>& user);
+
+ virtual process::Future<Nothing> isolate(
+ const ContainerID& containerId,
+ pid_t pid);
+
+ virtual process::Future<mesos::slave::ContainerLimitation> watch(
+ const ContainerID& containerId);
+
+ virtual process::Future<Nothing> update(
+ const ContainerID& containerId,
+ const Resources& resources);
+
+ virtual process::Future<ResourceStatistics> usage(
+ const ContainerID& containerId);
+
+ virtual process::Future<Nothing> cleanup(
+ const ContainerID& containerId);
+
+private:
+ LinuxFilesystemIsolatorProcess(
+ const Flags& flags,
+ const process::Owned<Provisioner>& provisioner);
+
+ // NOTE(review): '_recover', '_prepare' and '__prepare' look like
+ // continuations of 'recover' and 'prepare' (the latter two take an
+ // additional 'rootfs' argument); confirm against linux.cpp.
+ process::Future<Nothing> _recover(
+ const std::list<mesos::slave::ContainerState>& states,
+ const hashset<ContainerID>& orphans);
+
+ process::Future<Option<mesos::slave::ContainerPrepareInfo>> _prepare(
+ const ContainerID& containerId,
+ const ExecutorInfo& executorInfo,
+ const std::string& directory,
+ const Option<std::string>& user,
+ const Option<std::string>& rootfs);
+
+ process::Future<Option<mesos::slave::ContainerPrepareInfo>> __prepare(
+ const ContainerID& containerId,
+ const ExecutorInfo& executorInfo,
+ const std::string& directory,
+ const Option<std::string>& user,
+ const Option<std::string>& rootfs);
+
+ Try<std::string> script(
+ const ContainerID& containerId,
+ const ExecutorInfo& executorInfo,
+ const std::string& directory,
+ const Option<std::string>& rootfs);
+
+ const Flags flags;
+ const process::Owned<Provisioner> provisioner;
+
+ // Per-container state, stored in 'infos' keyed by ContainerID.
+ struct Info
+ {
+ Info(const std::string& _directory) : directory(_directory) {}
+
+ const std::string directory;
+
+ // The absolute path to the container's work directory mount point
+ // in the new root filesystem if the container changes its root
+ // filesystem (i.e., '<rootfs>/<flags.sandbox_directory>'). If the
+ // container does not specify a root filesystem, this field will
+ // not be set.
+ Option<std::string> sandbox;
+
+ // Track resources so we can unmount unneeded persistent volumes.
+ Resources resources;
+ };
+
+ hashmap<ContainerID, process::Owned<Info>> infos;
+
+ // Holds the metrics exposed by this isolator. The gauge is added to
+ // 'process::metrics' in the constructor and removed in the
+ // destructor.
+ struct Metrics
+ {
+ explicit Metrics(
+ const process::PID<LinuxFilesystemIsolatorProcess>& isolator);
+ ~Metrics();
+
+ process::metrics::Gauge containers_new_rootfs;
+ } metrics;
+
+ // Value of the 'containers_new_rootfs' gauge: the number of entries
+ // in 'infos' whose 'sandbox' is set.
+ double _containers_new_rootfs();
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __LINUX_FILESYSTEM_ISOLATOR_HPP__
[02/12] mesos git commit: Relocated MesosContainerizer specific files
to the correct location.
Posted by ji...@apache.org.
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/backends/copy.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/backends/copy.cpp b/src/slave/containerizer/provisioner/backends/copy.cpp
deleted file mode 100644
index 92fb098..0000000
--- a/src/slave/containerizer/provisioner/backends/copy.cpp
+++ /dev/null
@@ -1,203 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <list>
-
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/dispatch.hpp>
-#include <process/io.hpp>
-#include <process/process.hpp>
-#include <process/subprocess.hpp>
-
-
-#include <stout/foreach.hpp>
-#include <stout/os.hpp>
-
-#include "common/status_utils.hpp"
-
-#include "slave/containerizer/provisioner/backends/copy.hpp"
-
-
-using namespace process;
-
-using std::string;
-using std::list;
-using std::vector;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// The process that performs the work for CopyBackend; the CopyBackend
-// methods dispatch to it.
-class CopyBackendProcess : public Process<CopyBackendProcess>
-{
-public:
- // Creates 'rootfs' and copies each of 'layers' into it, in order.
- Future<Nothing> provision(const vector<string>& layers, const string& rootfs);
-
- // Removes 'rootfs' by running 'rm -rf' on it.
- Future<bool> destroy(const string& rootfs);
-
-private:
- // Copies a single layer into 'rootfs' by running 'cp'.
- Future<Nothing> _provision(string layer, const string& rootfs);
-};
-
-
-// Builds a CopyBackend around a newly allocated CopyBackendProcess.
-// The flags are not used.
-Try<Owned<Backend>> CopyBackend::create(const Flags&)
-{
-  Owned<CopyBackendProcess> backendProcess(new CopyBackendProcess());
-
-  return Owned<Backend>(new CopyBackend(backendProcess));
-}
-
-
-// Terminates the underlying process and waits for it before the
-// backend goes away.
-CopyBackend::~CopyBackend()
-{
-  CopyBackendProcess* actor = process.get();
-
-  terminate(actor);
-  wait(actor);
-}
-
-
-// Takes shared ownership of '_process' and spawns it. The pointer is
-// checked to be non-null first.
-CopyBackend::CopyBackend(Owned<CopyBackendProcess> _process)
-  : process(_process)
-{
-  CopyBackendProcess* actor = CHECK_NOTNULL(process.get());
-
-  spawn(actor);
-}
-
-
-// Forwards the request to CopyBackendProcess::provision.
-Future<Nothing> CopyBackend::provision(
-    const vector<string>& layers,
-    const string& rootfs)
-{
-  CopyBackendProcess* actor = process.get();
-
-  return dispatch(actor, &CopyBackendProcess::provision, layers, rootfs);
-}
-
-
-// Forwards the request to CopyBackendProcess::destroy.
-Future<bool> CopyBackend::destroy(const string& rootfs)
-{
-  CopyBackendProcess* actor = process.get();
-
-  return dispatch(actor, &CopyBackendProcess::destroy, rootfs);
-}
-
-
-// Provisions 'rootfs' by copying every layer into it, in order.
-//
-// Fails if no layers are given, if 'rootfs' already exists, or if the
-// directory cannot be created. Otherwise one copy per layer is
-// chained onto the previous one, so a layer is only copied after the
-// preceding copy has completed. The returned future is satisfied once
-// all of the chained futures are.
-Future<Nothing> CopyBackendProcess::provision(
-    const vector<string>& layers,
-    const string& rootfs)
-{
-  if (layers.empty()) {
-    return Failure("No filesystem layers provided");
-  }
-
-  if (os::exists(rootfs)) {
-    return Failure("Rootfs is already provisioned");
-  }
-
-  Try<Nothing> mkdir = os::mkdir(rootfs);
-  if (mkdir.isError()) {
-    return Failure("Failed to create rootfs directory: " + mkdir.error());
-  }
-
-  // Seed the chain with an already satisfied future so that the first
-  // layer has something to be chained onto.
-  list<Future<Nothing>> futures{Nothing()};
-
-  // Iterate by reference: the deferred call keeps its own copy of the
-  // bound arguments, so copying each layer path into the loop
-  // variable first is unnecessary.
-  foreach (const string& layer, layers) {
-    futures.push_back(
-        futures.back().then(
-            defer(self(), &Self::_provision, layer, rootfs)));
-  }
-
-  return collect(futures)
-    .then([]() -> Future<Nothing> { return Nothing(); });
-}
-
-
-// Copies the contents of 'layer' into 'rootfs' by running 'cp'.
-//
-// 'layer' is taken by value because the OS X branch below appends a
-// trailing slash to it. The returned future fails if the subprocess
-// cannot be created, cannot be reaped, or reports a non-zero status.
-Future<Nothing> CopyBackendProcess::_provision(
- string layer,
- const string& rootfs)
-{
- VLOG(1) << "Copying layer path '" << layer << "' to rootfs '" << rootfs
- << "'";
-
-#ifdef __APPLE__
- if (!strings::endsWith(layer, "/")) {
- layer += "/";
- }
-
- // OSX cp doesn't support -T flag, but supports source trailing
- // slash so we only copy the content but not the folder.
- vector<string> args{"cp", "-a", layer, rootfs};
-#else
- vector<string> args{"cp", "-aT", layer, rootfs};
-#endif // __APPLE__
-
- // The last stream is a pipe so that the output of 'cp' can be read
- // through 'cp.err()' below; the other two use '/dev/null'.
- Try<Subprocess> s = subprocess(
- "cp",
- args,
- Subprocess::PATH("/dev/null"),
- Subprocess::PATH("/dev/null"),
- Subprocess::PIPE());
-
- if (s.isError()) {
- return Failure("Failed to create 'cp' subprocess: " + s.error());
- }
-
- Subprocess cp = s.get();
-
- // 'cp' is captured by copy so that its pipe can still be read inside
- // the continuation.
- return cp.status()
- .then([cp](const Option<int>& status) -> Future<Nothing> {
- if (status.isNone()) {
- return Failure("Failed to reap subprocess to copy image");
- } else if (status.get() != 0) {
- // Include whatever 'cp' wrote to the pipe in the failure.
- return io::read(cp.err().get())
- .then([](const string& err) -> Future<Nothing> {
- return Failure("Failed to copy layer: " + err);
- });
- }
-
- return Nothing();
- });
-}
-
-
-// Removes 'rootfs' by running 'rm -rf' on it. The subprocess writes
-// to this process's stdout and stderr. The returned future is 'true'
-// on a zero status and fails otherwise.
-Future<bool> CopyBackendProcess::destroy(const string& rootfs)
-{
-  const vector<string> command = {"rm", "-rf", rootfs};
-
-  Try<Subprocess> rm = subprocess(
-      "rm",
-      command,
-      Subprocess::PATH("/dev/null"),
-      Subprocess::FD(STDOUT_FILENO),
-      Subprocess::FD(STDERR_FILENO));
-
-  if (rm.isError()) {
-    return Failure("Failed to create 'rm' subprocess: " + rm.error());
-  }
-
-  return rm.get().status()
-    .then([](const Option<int>& exited) -> Future<bool> {
-      if (exited.isNone()) {
-        return Failure("Failed to reap subprocess to destroy rootfs");
-      }
-
-      if (exited.get() != 0) {
-        return Failure("Failed to destroy rootfs, exit status: " +
-                       WSTRINGIFY(exited.get()));
-      }
-
-      return true;
-    });
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/backends/copy.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/backends/copy.hpp b/src/slave/containerizer/provisioner/backends/copy.hpp
deleted file mode 100644
index 7a5aaa4..0000000
--- a/src/slave/containerizer/provisioner/backends/copy.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_BACKENDS_COPY_HPP__
-#define __PROVISIONER_BACKENDS_COPY_HPP__
-
-#include "slave/containerizer/provisioner/backend.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// Forward declaration.
-class CopyBackendProcess;
-
-
-// The backend implementation that copies the layers to the target.
-// NOTE: Using this backend currently has a few implications:
-// 1) The disk space used by the provisioned rootfs is not counted
-// towards either the usage by the executor/task or the store
-// cache, which can interfere with the slave host's disk space
-// allocation.
-// 2) The task can write unrestrictedly into the provisioned rootfs
-// which is not accounted for (in terms of disk usage) either.
-class CopyBackend : public Backend
-{
-public:
- virtual ~CopyBackend();
-
- // CopyBackend doesn't use any flag.
- static Try<process::Owned<Backend>> create(const Flags&);
-
- // Provisions a rootfs given the layers' paths and target rootfs
- // path.
- virtual process::Future<Nothing> provision(
- const std::vector<std::string>& layers,
- const std::string& rootfs);
-
- // Destroys the rootfs at the given path.
- virtual process::Future<bool> destroy(const std::string& rootfs);
-
-private:
- // Private: instances are obtained through 'create'.
- explicit CopyBackend(process::Owned<CopyBackendProcess> process);
-
- CopyBackend(const CopyBackend&); // Not copyable.
- CopyBackend& operator=(const CopyBackend&); // Not assignable.
-
- // The process that the public methods dispatch to.
- process::Owned<CopyBackendProcess> process;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_BACKENDS_COPY_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/local_puller.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/local_puller.cpp b/src/slave/containerizer/provisioner/docker/local_puller.cpp
deleted file mode 100644
index 74d0e1e..0000000
--- a/src/slave/containerizer/provisioner/docker/local_puller.cpp
+++ /dev/null
@@ -1,355 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <list>
-#include <vector>
-
-#include <glog/logging.h>
-
-#include <stout/json.hpp>
-#include <stout/os.hpp>
-#include <stout/result.hpp>
-
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/dispatch.hpp>
-#include <process/subprocess.hpp>
-
-#include "common/status_utils.hpp"
-
-#include "slave/containerizer/provisioner/docker/local_puller.hpp"
-#include "slave/containerizer/provisioner/docker/paths.hpp"
-#include "slave/containerizer/provisioner/docker/store.hpp"
-
-using namespace process;
-
-using std::list;
-using std::pair;
-using std::string;
-using std::vector;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-// The process that performs the work for LocalPuller.
-class LocalPullerProcess : public process::Process<LocalPullerProcess>
-{
-public:
- LocalPullerProcess(const Flags& _flags) : flags(_flags) {}
-
- ~LocalPullerProcess() {}
-
- // Looks up the archive for 'name' under
- // 'flags.docker_local_archives_dir', untars it into 'directory' and
- // then continues with 'putImage'.
- process::Future<list<pair<string, string>>> pull(
- const Image::Name& name,
- const string& directory);
-
-private:
- // Runs 'tar' to extract 'tarPath' into 'directory'.
- process::Future<Nothing> untarImage(
- const std::string& tarPath,
- const std::string& directory);
-
- // Resolves the layer id for 'name' from the untarred archive,
- // collects that layer and its ancestors, and hands them to
- // 'putLayers'.
- process::Future<list<pair<string, string>>> putImage(
- const Image::Name& name,
- const std::string& directory);
-
- // Calls 'putLayer' for every id and collects the results.
- process::Future<list<pair<string, string>>> putLayers(
- const std::string& directory,
- const std::vector<std::string>& layerIds);
-
- // Untars a single layer into its rootfs directory and returns the
- // (layer id, rootfs path) pair.
- process::Future<pair<string, string>> putLayer(
- const std::string& directory,
- const std::string& layerId);
-
- const Flags flags;
-};
-
-
-// Creates the underlying LocalPullerProcess and spawns it.
-LocalPuller::LocalPuller(const Flags& flags)
-  : process(new LocalPullerProcess(flags))
-{
-  process::spawn(process.get());
-}
-
-
-// Terminates the underlying process and waits for it.
-LocalPuller::~LocalPuller()
-{
-  LocalPullerProcess* actor = process.get();
-
-  process::terminate(actor);
-  process::wait(actor);
-}
-
-
-// Forwards the request to LocalPullerProcess::pull.
-Future<list<pair<string, string>>> LocalPuller::pull(
- const Image::Name& name,
- const string& directory)
-{
- return dispatch(process.get(), &LocalPullerProcess::pull, name, directory);
-}
-
-
-// Pulls image 'name' from the local archives directory.
-//
-// Fails if no archive exists for the image. Otherwise the archive is
-// untarred into 'directory' and processing continues in 'putImage'.
-Future<list<pair<string, string>>> LocalPullerProcess::pull(
-    const Image::Name& name,
-    const string& directory)
-{
-  const string image = stringify(name);
-
-  const string tarPath =
-    paths::getImageArchiveTarPath(flags.docker_local_archives_dir, image);
-
-  if (os::exists(tarPath)) {
-    return untarImage(tarPath, directory)
-      .then(defer(self(), &Self::putImage, name, directory));
-  }
-
-  return Failure(
-      "Failed to find archive for image '" + image +
-      "' at '" + tarPath + "'");
-}
-
-
-// Extracts the archive at 'tarPath' into 'directory' by running 'tar'.
-// The returned future fails if the subprocess cannot be created,
-// cannot be reaped, did not exit normally, or exited non-zero.
-Future<Nothing> LocalPullerProcess::untarImage(
- const string& tarPath,
- const string& directory)
-{
- VLOG(1) << "Untarring image from '" << tarPath
- << "' to '" << directory << "'";
-
- // Untar the image archive at 'tarPath' into 'directory'.
- // TODO(tnachen): Terminate tar process when slave exits.
- const vector<string> argv = {
- "tar",
- "-C",
- directory,
- "-x",
- "-f",
- tarPath
- };
-
- // All three standard streams of 'tar' use '/dev/null'.
- Try<Subprocess> s = subprocess(
- "tar",
- argv,
- Subprocess::PATH("/dev/null"),
- Subprocess::PATH("/dev/null"),
- Subprocess::PATH("/dev/null"));
-
- if (s.isError()) {
- return Failure("Failed to create tar subprocess: " + s.error());
- }
-
- return s.get().status()
- .then([tarPath](const Option<int>& status) -> Future<Nothing> {
- if (status.isNone()) {
- return Failure("Failed to reap status for tar subprocess in " +
- tarPath);
- }
- if (!WIFEXITED(status.get()) || WEXITSTATUS(status.get()) != 0) {
- return Failure("Untar image failed with exit code: " +
- WSTRINGIFY(status.get()));
- }
-
- return Nothing();
- });
-}
-
-
-// Reads the manifest of layer 'layerId' under 'directory' and returns
-// the value of its "parent" field. Returns None if the field is
-// absent or empty, and an Error if the manifest cannot be read or
-// parsed or the field lookup fails.
-static Result<string> getParentId(
-    const string& directory,
-    const string& layerId)
-{
-  const string manifestPath =
-    paths::getImageArchiveLayerManifestPath(directory, layerId);
-
-  Try<string> manifest = os::read(manifestPath);
-  if (manifest.isError()) {
-    return Error("Failed to read manifest: " + manifest.error());
-  }
-
-  Try<JSON::Object> json = JSON::parse<JSON::Object>(manifest.get());
-  if (json.isError()) {
-    return Error("Failed to parse manifest: " + json.error());
-  }
-
-  Result<JSON::String> parentId = json.get().find<JSON::String>("parent");
-
-  if (parentId.isError()) {
-    return Error("Failed to read parent of layer: " + parentId.error());
-  }
-
-  // A missing or empty "parent" both mean that there is no parent.
-  if (parentId.isNone() || parentId.get() == "") {
-    return None();
-  }
-
-  return parentId.get().value;
-}
-
-
-// Resolves image 'name' inside the untarred archive at 'directory'
-// and extracts all of its layers.
-//
-// The repositories JSON of the archive maps the repository and tag to
-// the id of the image's top layer. Starting from that layer, the
-// "parent" links in the layer manifests are followed to build the
-// list of layer ids, ordered from the layer without a parent to the
-// top layer, which is then handed to 'putLayers'.
-//
-// Fails if the repositories JSON or a manifest cannot be read or
-// parsed, if the repository or tag is not found, or if the parent
-// links form a cycle.
-Future<list<pair<string, string>>> LocalPullerProcess::putImage(
-    const Image::Name& name,
-    const string& directory)
-{
-  Try<string> value =
-    os::read(paths::getImageArchiveRepositoriesPath(directory));
-
-  if (value.isError()) {
-    return Failure("Failed to read repository JSON: " + value.error());
-  }
-
-  Try<JSON::Object> json = JSON::parse<JSON::Object>(value.get());
-  if (json.isError()) {
-    return Failure("Failed to parse JSON: " + json.error());
-  }
-
-  Result<JSON::Object> repositoryValue =
-    json.get().find<JSON::Object>(name.repository());
-
-  if (repositoryValue.isError()) {
-    return Failure("Failed to find repository: " + repositoryValue.error());
-  } else if (repositoryValue.isNone()) {
-    return Failure("Repository '" + name.repository() + "' is not found");
-  }
-
-  const JSON::Object repositoryJson = repositoryValue.get();
-
-  // We don't use JSON find here because a tag might contain a '.'.
-  std::map<string, JSON::Value>::const_iterator entry =
-    repositoryJson.values.find(name.tag());
-
-  if (entry == repositoryJson.values.end()) {
-    return Failure("Tag '" + name.tag() + "' is not found");
-  } else if (!entry->second.is<JSON::String>()) {
-    return Failure("Tag JSON value expected to be JSON::String");
-  }
-
-  const string layerId = entry->second.as<JSON::String>().value;
-
-  // The manifest of the top layer is read and parsed here only to
-  // fail early with a direct error message; the parsed value itself
-  // is not used.
-  Try<string> manifest =
-    os::read(paths::getImageArchiveLayerManifestPath(directory, layerId));
-
-  if (manifest.isError()) {
-    return Failure("Failed to read manifest: " + manifest.error());
-  }
-
-  Try<JSON::Object> manifestJson = JSON::parse<JSON::Object>(manifest.get());
-  if (manifestJson.isError()) {
-    return Failure("Failed to parse manifest: " + manifestJson.error());
-  }
-
-  vector<string> layerIds;
-  layerIds.push_back(layerId);
-
-  Result<string> parentId = getParentId(directory, layerId);
-  while (parentId.isSome()) {
-    // Guard against an archive whose "parent" links form a cycle;
-    // without this check the loop would never terminate and
-    // 'layerIds' would grow without bound.
-    foreach (const string& seen, layerIds) {
-      if (seen == parentId.get()) {
-        return Failure(
-            "Detected a cycle in the parent chain of layer '" + layerId +
-            "' at layer '" + parentId.get() + "'");
-      }
-    }
-
-    // Parents go in front so that the list ends with the top layer.
-    layerIds.insert(layerIds.begin(), parentId.get());
-    parentId = getParentId(directory, parentId.get());
-  }
-
-  if (parentId.isError()) {
-    return Failure("Failed to find parent layer id of layer '" + layerId +
-                   "': " + parentId.error());
-  }
-
-  return putLayers(directory, layerIds);
-}
-
-
-// Starts 'putLayer' for each of 'layerIds', in order, and returns a
-// future for the collected (layer id, rootfs path) pairs.
-Future<list<pair<string, string>>> LocalPullerProcess::putLayers(
-    const string& directory,
-    const vector<string>& layerIds)
-{
-  list<Future<pair<string, string>>> pending;
-
-  foreach (const string& id, layerIds) {
-    Future<pair<string, string>> layer = putLayer(directory, id);
-    pending.push_back(layer);
-  }
-
-  return collect(pending);
-}
-
-
-// Extracts the tar of layer 'layerId' (located under 'directory')
-// into the layer's rootfs directory and returns the (layer id, rootfs
-// path) pair.
-//
-// A rootfs directory left over from an earlier attempt is removed
-// first. The returned future fails if the directory cannot be
-// prepared, if 'tar' cannot be created or reaped, if it did not exit
-// normally with status zero, or if the rootfs path is missing
-// afterwards.
-Future<pair<string, string>> LocalPullerProcess::putLayer(
-    const string& directory,
-    const string& layerId)
-{
-  // We untar the layer from source into a directory, then move the
-  // layer into store. We do this instead of untarring directly to
-  // store to make sure we don't end up with partially untarred layer
-  // rootfs.
-
-  const string localRootfsPath =
-    paths::getImageArchiveLayerRootfsPath(directory, layerId);
-
-  // Image layer has been untarred but is not present in the store directory.
-  if (os::exists(localRootfsPath)) {
-    LOG(WARNING) << "Image layer '" << layerId << "' rootfs present at '"
-                 << localRootfsPath << "' but not in store directory. "
-                 << "Removing staged rootfs and untarring layer again.";
-
-    Try<Nothing> rmdir = os::rmdir(localRootfsPath);
-    if (rmdir.isError()) {
-      return Failure("Failed to remove incomplete staged rootfs for layer '" +
-                     layerId + "': " + rmdir.error());
-    }
-  }
-
-  Try<Nothing> mkdir = os::mkdir(localRootfsPath);
-  if (mkdir.isError()) {
-    return Failure("Failed to create rootfs path '" + localRootfsPath +
-                   "': " + mkdir.error());
-  }
-
-  // Untar directory/id/layer.tar into directory/id/rootfs.
-  // The tar file will be removed when the staging directory is
-  // removed.
-  const vector<string> argv = {
-    "tar",
-    "-C",
-    localRootfsPath,
-    "-x",
-    "-f",
-    paths::getImageArchiveLayerTarPath(directory, layerId)
-  };
-
-  Try<Subprocess> s = subprocess(
-      "tar",
-      argv,
-      Subprocess::PATH("/dev/null"),
-      Subprocess::PATH("/dev/null"),
-      Subprocess::PATH("/dev/null"));
-
-  if (s.isError()) {
-    return Failure("Failed to create tar subprocess: " + s.error());
-  }
-
-  return s.get().status()
-    .then([directory, layerId](
-        const Option<int>& status) -> Future<pair<string, string>> {
-      if (status.isNone()) {
-        return Failure("Failed to reap subprocess to untar image");
-      } else if (!WIFEXITED(status.get()) || WEXITSTATUS(status.get()) != 0) {
-        return Failure("Untar failed with exit code: " +
-                       WSTRINGIFY(status.get()));
-      }
-
-      const string rootfsPath =
-        paths::getImageArchiveLayerRootfsPath(directory, layerId);
-
-      if (!os::exists(rootfsPath)) {
-        return Failure("Failed to find the rootfs path after extracting layer"
-                       " '" + layerId + "'");
-      }
-
-      return pair<string, string>(layerId, rootfsPath);
-    });
-}
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/local_puller.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/local_puller.hpp b/src/slave/containerizer/provisioner/docker/local_puller.hpp
deleted file mode 100644
index 4574e8a..0000000
--- a/src/slave/containerizer/provisioner/docker/local_puller.hpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_DOCKER_LOCAL_PULLER_HPP__
-#define __PROVISIONER_DOCKER_LOCAL_PULLER_HPP__
-
-#include "slave/containerizer/provisioner/store.hpp"
-
-#include "slave/containerizer/provisioner/docker/message.hpp"
-#include "slave/containerizer/provisioner/docker/puller.hpp"
-
-#include "slave/flags.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-// Forward declaration.
-class LocalPullerProcess;
-
-
-/**
- * LocalPuller assumes Docker images are stored in a local directory
- * (configured with flags.docker_local_archives_dir), with all the
- * images saved as tars with file names in the form of <repo>:<tag>.tar.
- */
-class LocalPuller : public Puller
-{
-public:
- explicit LocalPuller(const Flags& flags);
-
- ~LocalPuller();
-
- // Pulls the image 'name' using 'directory' as the location the
- // archive is untarred into. The result is a list of string pairs,
- // one per layer (layer id and rootfs path, per local_puller.cpp).
- process::Future<std::list<std::pair<std::string, std::string>>> pull(
- const Image::Name& name,
- const std::string& directory);
-
-private:
- LocalPuller& operator=(const LocalPuller&) = delete; // Not assignable.
- LocalPuller(const LocalPuller&) = delete; // Not copyable.
-
- // The process that 'pull' dispatches to.
- process::Owned<LocalPullerProcess> process;
-};
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_DOCKER_LOCAL_PULLER_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/message.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/message.hpp b/src/slave/containerizer/provisioner/docker/message.hpp
deleted file mode 100644
index 466e6f8..0000000
--- a/src/slave/containerizer/provisioner/docker/message.hpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __MESSAGES_DOCKER_PROVISIONER_HPP__
-#define __MESSAGES_DOCKER_PROVISIONER_HPP__
-
-#include <stout/strings.hpp>
-
-// ONLY USEFUL AFTER RUNNING PROTOC.
-#include "slave/containerizer/provisioner/docker/message.pb.h"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-// Docker expects the image to be specified on the command line as:
-// [REGISTRY_HOST[:REGISTRY_PORT]/]REPOSITORY[:TAG|@TYPE:DIGEST]
-//
-// This format is inherently ambiguous when dealing with repository
-// names that include forward slashes. To disambiguate, the docker
-// code looks for '.', or ':', or 'localhost' to decide if the
-// first component is a registry or a repository name. For more
-// detail, drill into the implementation of docker pull.
-//
-// TODO(bmahler): We currently store the digest as a tag, does
-// that make sense?
-//
-// TODO(bmahler): Validate based on docker's validation logic
-// and return a Try here.
-inline Image::Name parseImageName(std::string s)
-{
- Image::Name name;
-
- // Extract the digest.
- if (strings::contains(s, "@")) {
- std::vector<std::string> split = strings::split(s, "@");
-
- // Everything before the first '@' remains the name to parse; the
- // component after it is stored as the tag.
- s = split[0];
- name.set_tag(split[1]);
- }
-
- // Remove the tag. We need to watch out for a
- // host:port registry, which also contains ':'.
- if (strings::contains(s, ":")) {
- std::vector<std::string> split = strings::split(s, ":");
-
- // The tag must be the last component. If a slash is
- // present there is a registry port and no tag.
- if (!strings::contains(split.back(), "/")) {
- name.set_tag(split.back());
- split.pop_back();
-
- s = strings::join(":", split);
- }
- }
-
- // Default to the 'latest' tag when omitted.
- if (name.tag().empty()) {
- name.set_tag("latest");
- }
-
- // Extract the registry and repository. The first component can
- // either be the registry, or the first part of the repository!
- // We resolve this ambiguity using the same hacks used in the
- // docker code ('.', ':', 'localhost' indicate a registry).
- std::vector<std::string> split = strings::split(s, "/", 2);
-
- if (split.size() == 1) {
- // No slash at all: the whole string is the repository.
- name.set_repository(s);
- } else if (strings::contains(split[0], ".") ||
- strings::contains(split[0], ":") ||
- split[0] == "localhost") {
- name.set_registry(split[0]);
- name.set_repository(split[1]);
- } else {
- // The first component does not look like a registry, so the
- // slash is part of the repository name.
- name.set_repository(s);
- }
-
- return name;
-}
-
-
-// Writes 'name' as '[<registry>/]<repository>:<tag>'; the registry
-// prefix is only written when the registry field is set.
-inline std::ostream& operator<<(
-    std::ostream& stream,
-    const Image::Name& name)
-{
-  if (name.has_registry()) {
-    stream << name.registry() << "/";
-  }
-
-  return stream << name.repository() << ":" << name.tag();
-}
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __MESSAGES_DOCKER_PROVISIONER_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/message.proto
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/message.proto b/src/slave/containerizer/provisioner/docker/message.proto
deleted file mode 100644
index c33e0c5..0000000
--- a/src/slave/containerizer/provisioner/docker/message.proto
+++ /dev/null
@@ -1,100 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import "mesos/mesos.proto";
-
-package mesos.internal.slave.docker;
-
-/**
- * A Docker Image name and the layer ids of the layers that comprise the image.
- * The layerIds are ordered, with the root layer id (no parent layer id) first
- * and the leaf layer id last.
- */
-message Image {
- // Structured image name: an optional registry plus a required
- // repository and tag.
- message Name {
- // Registry the image belongs to; may be left unset.
- optional string registry = 1;
- required string repository = 2;
-
- // TODO(bmahler): This may hold a tag or a digest, split these?
- required string tag = 3;
- }
-
- // The name this image is known by.
- required Name name = 1;
-
- // The order of the layers represents the dependency between layers.
- repeated string layer_ids = 2;
-}
-
-
-// A plain collection of Image entries.
-message Images {
- repeated Image images = 1;
-}
-
-
-/**
- * Protobuf for the Docker image manifest JSON schema:
- * https://github.com/docker/distribution/blob/master/docs/spec/manifest-v2-1.md
- *
- * Field names deliberately use the camelCase spelling of the JSON schema
- * (e.g. 'fsLayers', 'schemaVersion') rather than protobuf snake_case.
- */
-message DockerImageManifest {
- required string name = 1;
- required string tag = 2;
- required string architecture = 3;
-
- // One entry per filesystem layer referenced by the manifest.
- message FsLayers {
- required string blobSum = 1;
- }
-
- repeated FsLayers fsLayers = 4;
-
- // One entry per history record in the manifest.
- message History {
- // Layer id together with the id of its parent layer.
- message V1Compatibility {
- required string id = 1;
- required string parent = 2;
- }
-
- required V1Compatibility v1Compatibility = 1;
- }
-
- repeated History history = 5;
- required uint32 schemaVersion = 6;
-
- // One entry per signature attached to the manifest.
- message Signatures {
-
- //JOSE (A JSON Web Signature).
- message Header {
-
- //JSON Web Key.
- message Jwk {
- required string crv = 1;
- required string kid = 2;
- required string kty = 3;
- required string x = 4;
- required string y = 5;
- }
-
- optional Jwk jwk = 1;
- required string alg = 2;
- }
-
- required Header header = 1;
- required string signature = 2;
- required string protected = 3;
- }
-
- repeated Signatures signatures = 7;
-}
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/metadata_manager.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/metadata_manager.cpp b/src/slave/containerizer/provisioner/docker/metadata_manager.cpp
deleted file mode 100644
index 2b2de52..0000000
--- a/src/slave/containerizer/provisioner/docker/metadata_manager.cpp
+++ /dev/null
@@ -1,232 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "slave/containerizer/provisioner/docker/metadata_manager.hpp"
-
-#include <vector>
-
-#include <glog/logging.h>
-
-#include <stout/foreach.hpp>
-#include <stout/hashset.hpp>
-#include <stout/os.hpp>
-#include <stout/protobuf.hpp>
-
-#include <process/defer.hpp>
-#include <process/dispatch.hpp>
-#include <process/owned.hpp>
-
-#include "common/status_utils.hpp"
-
-#include "slave/containerizer/provisioner/docker/paths.hpp"
-#include "slave/containerizer/provisioner/docker/message.hpp"
-
-#include "slave/state.hpp"
-
-using namespace process;
-
-using std::list;
-using std::string;
-using std::vector;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-// Libprocess process that owns the in-memory table of stored Docker
-// images and checkpoints it to disk. MetadataManager forwards its
-// recover/put/get calls to this process via dispatch.
-class MetadataManagerProcess : public process::Process<MetadataManagerProcess>
-{
-public:
- MetadataManagerProcess(const Flags& _flags) : flags(_flags) {}
-
- ~MetadataManagerProcess() {}
-
- // Loads the checkpointed images from the stored images file under
- // 'flags.docker_store_dir', skipping images whose layers are missing.
- Future<Nothing> recover();
-
- // Stores an Image built from 'name' and 'layerIds' and persists the
- // table; fails if persisting fails.
- Future<Image> put(
- const Image::Name& name,
- const std::vector<std::string>& layerIds);
-
- // Returns the stored Image for 'name', or None if it is not stored.
- Future<Option<Image>> get(const Image::Name& name);
-
- // TODO(chenlily): Implement removal of unreferenced images.
-
-private:
- // Write out metadata manager state to persistent store.
- Try<Nothing> persist();
-
- const Flags flags;
-
- // This is a lookup table for images that are stored in memory. It is keyed
- // by the stringified name of the Image.
- // For example, "ubuntu:14.04" -> ubuntu:14.04 Image.
- hashmap<std::string, Image> storedImages;
-};
-
-
-// Factory: builds the backing MetadataManagerProcess from 'flags' and
-// wraps it in a MetadataManager (whose constructor spawns the process).
-Try<Owned<MetadataManager>> MetadataManager::create(const Flags& flags)
-{
-  Owned<MetadataManagerProcess> backend(new MetadataManagerProcess(flags));
-
-  MetadataManager* manager = new MetadataManager(backend);
-
-  return Owned<MetadataManager>(manager);
-}
-
-
-// Keeps a handle on the backing process and spawns it. The process
-// pointer must not be null (enforced by CHECK_NOTNULL).
-MetadataManager::MetadataManager(Owned<MetadataManagerProcess> process)
-  : process(process)
-{
-  MetadataManagerProcess* spawned = CHECK_NOTNULL(process.get());
-
-  process::spawn(spawned);
-}
-
-
-// Terminates the backing process and blocks until it has exited.
-MetadataManager::~MetadataManager()
-{
-  MetadataManagerProcess* backend = process.get();
-
-  process::terminate(backend);
-  process::wait(backend);
-}
-
-
-// Forwards to MetadataManagerProcess::recover() on the backing process.
-Future<Nothing> MetadataManager::recover()
-{
-  MetadataManagerProcess* backend = process.get();
-
-  return process::dispatch(backend, &MetadataManagerProcess::recover);
-}
-
-
-// Forwards to MetadataManagerProcess::put() on the backing process.
-Future<Image> MetadataManager::put(
-    const Image::Name& name,
-    const vector<string>& layerIds)
-{
-  MetadataManagerProcess* backend = process.get();
-
-  return dispatch(backend, &MetadataManagerProcess::put, name, layerIds);
-}
-
-
-// Forwards to MetadataManagerProcess::get() on the backing process.
-Future<Option<Image>> MetadataManager::get(const Image::Name& name)
-{
-  MetadataManagerProcess* backend = process.get();
-
-  return dispatch(backend, &MetadataManagerProcess::get, name);
-}
-
-
-// Builds an Image from 'name' and the ordered 'layerIds', records it in
-// the in-memory table under the stringified name (replacing any existing
-// entry) and persists the table. Returns the stored Image, or a Failure
-// if the table could not be persisted.
-Future<Image> MetadataManagerProcess::put(
-    const Image::Name& name,
-    const vector<string>& layerIds)
-{
-  Image image;
-  image.mutable_name()->CopyFrom(name);
-
-  for (const string& layerId : layerIds) {
-    image.add_layer_ids(layerId);
-  }
-
-  storedImages[stringify(name)] = image;
-
-  Try<Nothing> persisted = persist();
-  if (persisted.isError()) {
-    return Failure(
-        "Failed to save state of Docker images: " + persisted.error());
-  }
-
-  return image;
-}
-
-
-// Looks up the image stored under the stringified 'name'. Yields None
-// when no such image is in the in-memory table.
-Future<Option<Image>> MetadataManagerProcess::get(
-    const Image::Name& name)
-{
-  const string key = stringify(name);
-
-  if (storedImages.contains(key)) {
-    return storedImages[key];
-  }
-
-  return None();
-}
-
-
-// Checkpoints every image in the in-memory table, as a single Images
-// message, to the stored images file under 'flags.docker_store_dir'.
-Try<Nothing> MetadataManagerProcess::persist()
-{
-  Images snapshot;
-
-  for (const auto& entry : storedImages) {
-    snapshot.add_images()->CopyFrom(entry.second);
-  }
-
-  const string path = paths::getStoredImagesPath(flags.docker_store_dir);
-
-  Try<Nothing> checkpointed = state::checkpoint(path, snapshot);
-  if (checkpointed.isError()) {
-    return Error("Failed to perform checkpoint: " + checkpointed.error());
-  }
-
-  return Nothing();
-}
-
-
-// Rebuilds the in-memory image table from the checkpointed stored images
-// file under 'flags.docker_store_dir'.
-//
-// - If the file does not exist there is nothing to recover.
-// - If the file cannot be read, a Failure is returned.
-// - If the file is empty, recovery succeeds with no images loaded.
-// - An image is skipped (with a warning) when the rootfs of any of its
-//   layers is missing on disk, or when an image with the same name has
-//   already been loaded.
-Future<Nothing> MetadataManagerProcess::recover()
-{
-  const string storedImagesPath =
-    paths::getStoredImagesPath(flags.docker_store_dir);
-
-  if (!os::exists(storedImagesPath)) {
-    LOG(INFO) << "No images to load from disk. Docker provisioner image "
-              << "storage path '" << storedImagesPath << "' does not exist";
-    return Nothing();
-  }
-
-  Result<Images> images = ::protobuf::read<Images>(storedImagesPath);
-  if (images.isError()) {
-    return Failure("Failed to read protobuf for Docker provisioner image: " +
-                   images.error());
-  }
-
-  // An empty file yields None rather than an error; calling get() on it
-  // would abort, so treat it as "nothing to recover".
-  if (images.isNone()) {
-    LOG(WARNING) << "Unexpected empty images file '" << storedImagesPath
-                 << "'";
-    return Nothing();
-  }
-
-  // Iterate by reference to avoid copying every Image and layer id.
-  foreach (const Image& image, images.get().images()) {
-    vector<string> missingLayerIds;
-    foreach (const string& layerId, image.layer_ids()) {
-      const string rootfsPath =
-        paths::getImageLayerRootfsPath(flags.docker_store_dir, layerId);
-
-      if (!os::exists(rootfsPath)) {
-        missingLayerIds.push_back(layerId);
-      }
-    }
-
-    if (!missingLayerIds.empty()) {
-      LOG(WARNING) << "Skipped loading image '" << stringify(image.name())
-                   << "' due to missing layers: " << stringify(missingLayerIds);
-      continue;
-    }
-
-    const string imageName = stringify(image.name());
-    if (storedImages.contains(imageName)) {
-      LOG(WARNING) << "Found duplicate image in recovery for image name '"
-                   << imageName << "'";
-    } else {
-      storedImages[imageName] = image;
-    }
-  }
-
-  LOG(INFO) << "Loaded " << storedImages.size() << " Docker images";
-
-  return Nothing();
-}
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/metadata_manager.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/metadata_manager.hpp b/src/slave/containerizer/provisioner/docker/metadata_manager.hpp
deleted file mode 100644
index 885080d..0000000
--- a/src/slave/containerizer/provisioner/docker/metadata_manager.hpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_DOCKER_METADATA_MANAGER_HPP__
-#define __PROVISIONER_DOCKER_METADATA_MANAGER_HPP__
-
-#include <list>
-#include <string>
-
-#include <stout/hashmap.hpp>
-#include <stout/json.hpp>
-#include <stout/option.hpp>
-#include <stout/protobuf.hpp>
-#include <stout/try.hpp>
-
-#include <process/future.hpp>
-#include <process/owned.hpp>
-#include <process/process.hpp>
-
-#include "slave/containerizer/provisioner/provisioner.hpp"
-
-#include "slave/containerizer/provisioner/docker/message.hpp"
-
-#include "slave/flags.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-// Forward Declaration.
-class MetadataManagerProcess;
-
-/**
- * The MetadataManager tracks the Docker images cached by the
- * provisioner that are stored on disk. It keeps track of the layers
- * that Docker images are composed of and recovers Image objects
- * upon initialization by checking for dependent layers stored on disk.
- * Currently, image layers are stored indefinitely, with no garbage
- * collection of unreferenced image layers.
- *
- * Instances are obtained through create(); the class is neither
- * copyable nor assignable.
- */
-class MetadataManager
-{
-public:
- static Try<process::Owned<MetadataManager>> create(const Flags& flags);
-
- ~MetadataManager();
-
- /**
- * Recover all stored Images and their layer references.
- */
- process::Future<Nothing> recover();
-
- /**
- * Create an Image, put it in metadata manager and persist the reference
- * store state to disk.
- *
- * @param name the name of the Docker image to place in the reference
- * store.
- * @param layerIds the list of layer ids that comprise the Docker image in
- * order where the root layer's id (no parent layer) is first
- * and the leaf layer's id is last.
- * @return the Image that was stored.
- */
- process::Future<Image> put(
- const Image::Name& name,
- const std::vector<std::string>& layerIds);
-
- /**
- * Retrieve Image based on image name if it is among the Images
- * stored in memory.
- *
- * @param name the name of the Docker image to retrieve
- * @return the Image if it is stored, otherwise None.
- */
- process::Future<Option<Image>> get(const Image::Name& name);
-
-private:
- explicit MetadataManager(process::Owned<MetadataManagerProcess> process);
-
- MetadataManager(const MetadataManager&); // Not copyable.
- MetadataManager& operator=(const MetadataManager&); // Not assignable.
-
- // The process that all public calls are forwarded to.
- process::Owned<MetadataManagerProcess> process;
-};
-
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_DOCKER_METADATA_MANAGER_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/paths.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/paths.cpp b/src/slave/containerizer/provisioner/docker/paths.cpp
deleted file mode 100644
index 5733fb7..0000000
--- a/src/slave/containerizer/provisioner/docker/paths.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "slave/containerizer/provisioner/docker/paths.hpp"
-
-#include <stout/path.hpp>
-
-using std::string;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-namespace paths {
-
-// <storeDir>/staging
-string getStagingDir(const string& storeDir)
-{
-  return path::join(storeDir, "staging");
-}
-
-
-// <storeDir>/staging/XXXXXX
-// NOTE(review): the "XXXXXX" suffix looks like a mkdtemp-style template;
-// confirm against the caller.
-string getStagingTempDir(const string& storeDir)
-{
-  const string stagingDir = getStagingDir(storeDir);
-
-  return path::join(stagingDir, "XXXXXX");
-}
-
-
-// <discoveryDir>/<name>.tar
-string getImageArchiveTarPath(
-    const string& discoveryDir,
-    const string& name)
-{
-  const string tarName = name + ".tar";
-
-  return path::join(discoveryDir, tarName);
-}
-
-
-// <archivePath>/repositories
-string getImageArchiveRepositoriesPath(const string& archivePath)
-{
-  return path::join(archivePath, "repositories");
-}
-
-
-// <archivePath>/<layerId>
-string getImageArchiveLayerPath(
-    const string& archivePath,
-    const string& layerId)
-{
-  return path::join(archivePath, layerId);
-}
-
-
-// <archivePath>/<layerId>/json
-string getImageArchiveLayerManifestPath(
-    const string& archivePath,
-    const string& layerId)
-{
-  const string layerDir = getImageArchiveLayerPath(archivePath, layerId);
-
-  return path::join(layerDir, "json");
-}
-
-
-// <archivePath>/<layerId>/layer.tar
-string getImageArchiveLayerTarPath(
-    const string& archivePath,
-    const string& layerId)
-{
-  const string layerDir = getImageArchiveLayerPath(archivePath, layerId);
-
-  return path::join(layerDir, "layer.tar");
-}
-
-
-// <archivePath>/<layerId>/rootfs
-string getImageArchiveLayerRootfsPath(
-    const string& archivePath,
-    const string& layerId)
-{
-  const string layerDir = getImageArchiveLayerPath(archivePath, layerId);
-
-  return path::join(layerDir, "rootfs");
-}
-
-
-// <storeDir>/layers/<layerId>
-string getImageLayerPath(
-    const string& storeDir,
-    const string& layerId)
-{
-  return path::join(storeDir, "layers", layerId);
-}
-
-
-// <storeDir>/layers/<layerId>/rootfs
-string getImageLayerRootfsPath(
-    const string& storeDir,
-    const string& layerId)
-{
-  const string layerDir = getImageLayerPath(storeDir, layerId);
-
-  return path::join(layerDir, "rootfs");
-}
-
-
-// <storeDir>/storedImages
-string getStoredImagesPath(const string& storeDir)
-{
-  return path::join(storeDir, "storedImages");
-}
-
-} // namespace paths {
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/paths.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/paths.hpp b/src/slave/containerizer/provisioner/docker/paths.hpp
deleted file mode 100644
index 18beb2e..0000000
--- a/src/slave/containerizer/provisioner/docker/paths.hpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_DOCKER_PATHS_HPP__
-#define __PROVISIONER_DOCKER_PATHS_HPP__
-
-#include <list>
-#include <string>
-
-#include <mesos/mesos.hpp>
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-namespace paths {
-
-/**
- * The Docker store file system layout is as follows:
- * Image store dir ('--docker_store_dir' slave flag)
- * |--staging
- * |-- <temp_dir_archive>
- * |-- <layer_id>
- * |-- rootfs
- * |--layers
- * |--<layer_id>
- * |--rootfs
- * |--storedImages (file holding the cached images)
- */
-
-// Returns the 'staging' directory under the store directory.
-std::string getStagingDir(const std::string& storeDir);
-
-
-// Returns a 'XXXXXX' path template under the staging directory.
-std::string getStagingTempDir(const std::string& storeDir);
-
-
-// Returns the path of the '<name>.tar' archive in the discovery directory.
-std::string getImageArchiveTarPath(
- const std::string& discoveryDir,
- const std::string& name);
-
-
-// Returns the path of the 'repositories' entry inside an archive.
-std::string getImageArchiveRepositoriesPath(const std::string& archivePath);
-
-
-// Returns the directory of the given layer inside an archive.
-std::string getImageArchiveLayerPath(
- const std::string& archivePath,
- const std::string& layerId);
-
-
-// Returns the path of the layer's 'json' entry inside an archive.
-std::string getImageArchiveLayerManifestPath(
- const std::string& archivePath,
- const std::string& layerId);
-
-
-// Returns the path of the layer's 'layer.tar' inside an archive.
-std::string getImageArchiveLayerTarPath(
- const std::string& archivePath,
- const std::string& layerId);
-
-
-// Returns the path of the layer's 'rootfs' inside an archive.
-std::string getImageArchiveLayerRootfsPath(
- const std::string& archivePath,
- const std::string& layerId);
-
-
-// Returns the directory of the given layer under the store's 'layers'.
-std::string getImageLayerPath(
- const std::string& storeDir,
- const std::string& layerId);
-
-
-// Returns the 'rootfs' of the given layer under the store's 'layers'.
-std::string getImageLayerRootfsPath(
- const std::string& storeDir,
- const std::string& layerId);
-
-
-// Returns the path of the 'storedImages' file in the store directory.
-std::string getStoredImagesPath(const std::string& storeDir);
-
-} // namespace paths {
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_DOCKER_PATHS_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/puller.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/puller.cpp b/src/slave/containerizer/provisioner/docker/puller.cpp
deleted file mode 100644
index cb05324..0000000
--- a/src/slave/containerizer/provisioner/docker/puller.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "slave/containerizer/provisioner/docker/puller.hpp"
-
-#include "slave/containerizer/provisioner/docker/local_puller.hpp"
-
-using std::string;
-
-using process::Owned;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-// Factory: selects the Puller implementation named by the
-// 'docker_puller' flag. Only "local" is supported; any other value
-// yields an Error.
-Try<Owned<Puller>> Puller::create(const Flags& flags)
-{
-  const string kind = flags.docker_puller;
-
-  if (kind != "local") {
-    return Error("Unknown or unsupported docker puller: " + kind);
-  }
-
-  return Owned<Puller>(new LocalPuller(flags));
-}
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/puller.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/puller.hpp b/src/slave/containerizer/provisioner/docker/puller.hpp
deleted file mode 100644
index 105b4e7..0000000
--- a/src/slave/containerizer/provisioner/docker/puller.hpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_DOCKER_PULLER_HPP__
-#define __PROVISIONER_DOCKER_PULLER_HPP__
-
-#include <list>
-#include <utility>
-
-#include <stout/try.hpp>
-
-#include <process/future.hpp>
-#include <process/owned.hpp>
-
-#include "slave/containerizer/provisioner/docker/message.hpp"
-
-#include "slave/flags.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-// Abstract interface for pulling Docker image layers. Concrete pullers
-// are obtained through create().
-class Puller
-{
-public:
- static Try<process::Owned<Puller>> create(const Flags& flags);
-
- virtual ~Puller() {}
-
- /**
- * Pull the layers of a Docker image into the specified directory, and
- * return the list of layer ids in that image in the right
- * dependency order, each paired with the directory where
- * the puller put that layer's changeset.
- *
- * @param name The name of the image.
- * @param directory The target directory to store the layers.
- * @return list of layers mapped to their local directories, ordered by
- * dependency.
- */
- virtual process::Future<std::list<std::pair<std::string, std::string>>> pull(
- const docker::Image::Name& name,
- const std::string& directory) = 0;
-};
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-
-#endif // __PROVISIONER_DOCKER_PULLER_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/registry_client.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/registry_client.cpp b/src/slave/containerizer/provisioner/docker/registry_client.cpp
deleted file mode 100644
index 24aa95c..0000000
--- a/src/slave/containerizer/provisioner/docker/registry_client.cpp
+++ /dev/null
@@ -1,641 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <vector>
-
-#include <process/defer.hpp>
-#include <process/dispatch.hpp>
-#include <process/http.hpp>
-#include <process/io.hpp>
-
-#include <stout/os.hpp>
-
-#include "slave/containerizer/provisioner/docker/registry_client.hpp"
-#include "slave/containerizer/provisioner/docker/token_manager.hpp"
-
-using std::string;
-using std::vector;
-
-using process::Failure;
-using process::Future;
-using process::Owned;
-using process::Process;
-
-using process::http::Request;
-using process::http::Response;
-using process::http::URL;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-namespace registry {
-
-// Shorthand aliases for types nested in RegistryClient.
-using FileSystemLayerInfo = RegistryClient::FileSystemLayerInfo;
-
-using ManifestResponse = RegistryClient::ManifestResponse;
-
-// Timeout applied by getManifest() and getBlob() when the caller passes none.
-const Duration RegistryClient::DEFAULT_MANIFEST_TIMEOUT_SECS = Seconds(10);
-
-// Maximum size applied by getBlob() when the caller passes none.
-const size_t RegistryClient::DEFAULT_MANIFEST_MAXSIZE_BYTES = 4096;
-
-static const uint16_t DEFAULT_SSL_PORT = 443;
-
-// Libprocess process behind RegistryClient: RegistryClient dispatches
-// its getManifest() and getBlob() calls to this process.
-class RegistryClientProcess : public Process<RegistryClientProcess>
-{
-public:
- // Creates a TokenManager for 'authServer' and a process that talks to
- // 'registry'. Returns an Error if the token manager cannot be created.
- static Try<Owned<RegistryClientProcess>> create(
- const URL& registry,
- const URL& authServer,
- const Option<RegistryClient::Credentials>& creds);
-
- Future<RegistryClient::ManifestResponse> getManifest(
- const string& path,
- const Option<string>& tag,
- const Duration& timeout);
-
- Future<size_t> getBlob(
- const string& path,
- const Option<string>& digest,
- const Path& filePath,
- const Duration& timeout,
- size_t maxSize);
-
-private:
- RegistryClientProcess(
- const URL& registryServer,
- const Owned<TokenManager>& tokenManager,
- const Option<RegistryClient::Credentials>& creds);
-
- // Issues an HTTP GET for 'url', failing if no response arrives within
- // 'timeout'. When 'resend' is set, a "401 Unauthorized" response leads
- // to a token request followed by another GET. 'lastResponse' is the
- // status of the previous response; receiving the same status again
- // fails the request instead of retrying.
- Future<Response> doHttpGet(
- const URL& url,
- const Option<process::http::Headers>& headers,
- const Duration& timeout,
- bool resend,
- const Option<string>& lastResponse) const;
-
- // Parses the "WWW-Authenticate" Bearer header of 'httpResponse' into
- // key/value attributes.
- Try<process::http::Headers> getAuthenticationAttributes(
- const Response& httpResponse) const;
-
- const URL registryServer_;
- Owned<TokenManager> tokenManager_;
- const Option<RegistryClient::Credentials> credentials_;
-
- // Not copyable, not assignable.
- RegistryClientProcess(const RegistryClientProcess&) = delete;
- RegistryClientProcess& operator = (const RegistryClientProcess&) = delete;
-};
-
-
-// Factory: creates the backing RegistryClientProcess and wraps it in a
-// RegistryClient (whose constructor spawns the process).
-//
-// @param registryServer URL of the Docker registry.
-// @param authServer URL of the authentication (token) server.
-// @param creds Optional credentials.
-// @return the client, or an Error if the backing process could not be
-//     created.
-Try<Owned<RegistryClient>> RegistryClient::create(
-    const URL& registryServer,
-    const URL& authServer,
-    const Option<Credentials>& creds)
-{
-  // NOTE: Both callees below take the registry server first and the
-  // auth server second; passing them the other way round makes the
-  // token manager talk to the registry and requests go to the auth
-  // server.
-  Try<Owned<RegistryClientProcess>> process =
-    RegistryClientProcess::create(registryServer, authServer, creds);
-
-  if (process.isError()) {
-    return Error(process.error());
-  }
-
-  return Owned<RegistryClient>(
-      new RegistryClient(registryServer, authServer, creds, process.get()));
-}
-
-
-// Records the server URLs and credentials, keeps a handle on the backing
-// process and spawns it. The process pointer must not be null.
-RegistryClient::RegistryClient(
-    const URL& registryServer,
-    const URL& authServer,
-    const Option<Credentials>& creds,
-    const Owned<RegistryClientProcess>& process)
-  : registryServer_(registryServer),
-    authServer_(authServer),
-    credentials_(creds),
-    process_(process)
-{
-  RegistryClientProcess* spawned = CHECK_NOTNULL(process_.get());
-
-  spawn(spawned);
-}
-
-
-// Terminates the backing process and blocks until it has exited.
-RegistryClient::~RegistryClient()
-{
-  RegistryClientProcess* backend = process_.get();
-
-  terminate(backend);
-  process::wait(backend);
-}
-
-
-// Forwards to RegistryClientProcess::getManifest() on the backing
-// process, substituting DEFAULT_MANIFEST_TIMEOUT_SECS when no timeout
-// is given.
-Future<ManifestResponse> RegistryClient::getManifest(
-    const string& _path,
-    const Option<string>& _tag,
-    const Option<Duration>& _timeout)
-{
-  const Duration effectiveTimeout =
-    _timeout.isSome() ? _timeout.get() : DEFAULT_MANIFEST_TIMEOUT_SECS;
-
-  RegistryClientProcess* backend = process_.get();
-
-  return dispatch(
-      backend,
-      &RegistryClientProcess::getManifest,
-      _path,
-      _tag,
-      effectiveTimeout);
-}
-
-
-// Forwards to RegistryClientProcess::getBlob() on the backing process,
-// substituting DEFAULT_MANIFEST_TIMEOUT_SECS and
-// DEFAULT_MANIFEST_MAXSIZE_BYTES for an absent timeout or maximum size.
-Future<size_t> RegistryClient::getBlob(
-    const string& _path,
-    const Option<string>& _digest,
-    const Path& _filePath,
-    const Option<Duration>& _timeout,
-    const Option<size_t>& _maxSize)
-{
-  const Duration effectiveTimeout =
-    _timeout.isSome() ? _timeout.get() : DEFAULT_MANIFEST_TIMEOUT_SECS;
-
-  const size_t effectiveMaxSize =
-    _maxSize.isSome() ? _maxSize.get() : DEFAULT_MANIFEST_MAXSIZE_BYTES;
-
-  RegistryClientProcess* backend = process_.get();
-
-  return dispatch(
-      backend,
-      &RegistryClientProcess::getBlob,
-      _path,
-      _digest,
-      _filePath,
-      effectiveTimeout,
-      effectiveMaxSize);
-}
-
-
-// Factory: creates a TokenManager for 'authServer' and, on success, a
-// process bound to 'registryServer'. Returns an Error if the token
-// manager cannot be created.
-Try<Owned<RegistryClientProcess>> RegistryClientProcess::create(
-    const URL& registryServer,
-    const URL& authServer,
-    const Option<RegistryClient::Credentials>& creds)
-{
-  Try<Owned<TokenManager>> tokenManager = TokenManager::create(authServer);
-
-  if (tokenManager.isSome()) {
-    return Owned<RegistryClientProcess>(
-        new RegistryClientProcess(registryServer, tokenManager.get(), creds));
-  }
-
-  return Error("Failed to create token manager: " + tokenManager.error());
-}
-
-
-// Stores the registry URL, the token manager and the optional
-// credentials; does no other work.
-RegistryClientProcess::RegistryClientProcess(
-    const URL& registryServer,
-    const Owned<TokenManager>& tokenMgr,
-    const Option<RegistryClient::Credentials>& creds)
-  : registryServer_(registryServer),
-    tokenManager_(tokenMgr),
-    credentials_(creds)
-{
-}
-
-
-// Extracts the authentication attributes from the "WWW-Authenticate"
-// header of 'httpResponse'.
-//
-// The header value must consist of exactly two space-separated tokens,
-// the first being "Bearer". The second token is split on ',' into
-// parameters, and each parameter is tokenized on '=' and '"' into a
-// key and a value. Returns an Error if the header is absent, is not of
-// that shape, or any parameter does not yield exactly a key and a value.
-Try<process::http::Headers> RegistryClientProcess::getAuthenticationAttributes(
-    const Response& httpResponse) const
-{
-  auto header = httpResponse.headers.find("WWW-Authenticate");
-  if (header == httpResponse.headers.end()) {
-    return Error("Failed to find WWW-Authenticate header value");
-  }
-
-  const string& authString = header->second;
-
-  const vector<string> challenge = strings::tokenize(authString, " ");
-  if (!(challenge.size() == 2 && challenge[0] == "Bearer")) {
-    // TODO(jojy): Look at various possibilities of auth response. We currently
-    // assume that the string will have realm information.
-    return Error("Invalid authentication header value: " + authString);
-  }
-
-  const vector<string> authParams = strings::tokenize(challenge[1], ",");
-
-  process::http::Headers authAttributes;
-
-  foreach (const string& param, authParams) {
-    const vector<string> keyValue = strings::tokenize(param, "=\"");
-
-    if (keyValue.size() != 2) {
-      return Error(
-          "Failed to get authentication attribute from response parameter " +
-          param);
-    }
-
-    authAttributes.insert({keyValue[0], keyValue[1]});
-  }
-
-  return authAttributes;
-}
-
-
-// Issues an HTTP GET against `url` and drives the registry handshake based
-// on the status of the response:
-//   - "200 OK": the response is returned as is.
-//   - "400 Bad Request": fails with the messages found in the "errors"
-//     array of the JSON body.
-//   - "401 Unauthorized": requests a token for the advertised "service" and
-//     "scope" and re-issues the request with a bearer token.
-//   - "307 Temporary Redirect": re-issues the request against the URL in
-//     the "Location" header, with `resend` cleared.
-//   - Anything else fails the future.
-//
-// `resend` states whether a follow-up request may be issued at all.
-// `lastResponseStatus` is the status that led to this request; receiving
-// the same status again fails the future instead of recursing. The HTTP
-// request and the token request are each bounded by `timeout`.
-Future<Response> RegistryClientProcess::doHttpGet(
-    const URL& url,
-    const Option<process::http::Headers>& headers,
-    const Duration& timeout,
-    bool resend,
-    const Option<string>& lastResponseStatus) const
-{
-  return process::http::get(url, headers)
-    .after(timeout, [](
-        const Future<Response>& httpResponseFuture) -> Future<Response> {
-      return Failure("Response timeout");
-    })
-    .then(defer(self(), [=](
-        const Response& httpResponse) -> Future<Response> {
-      VLOG(1) << "Response status: " + httpResponse.status;
-
-      // Set the future if we get an OK response.
-      if (httpResponse.status == "200 OK") {
-        return httpResponse;
-      } else if (httpResponse.status == "400 Bad Request") {
-        Try<JSON::Object> errorResponse =
-          JSON::parse<JSON::Object>(httpResponse.body);
-
-        if (errorResponse.isError()) {
-          return Failure("Failed to parse bad request response JSON: " +
-                         errorResponse.error());
-        }
-
-        std::ostringstream out;
-        bool first = true;
-        Result<JSON::Array> errorObjects =
-          errorResponse.get().find<JSON::Array>("errors");
-
-        if (errorObjects.isError()) {
-          return Failure("Failed to find 'errors' in bad request response: " +
-                         errorObjects.error());
-        } else if (errorObjects.isNone()) {
-          return Failure("Errors not found in bad request response");
-        }
-
-        foreach (const JSON::Value& error, errorObjects.get().values) {
-          // `as<>()` must only be used on a value of the requested type.
-          if (!error.is<JSON::Object>()) {
-            return Failure(
-                "Failed to parse bad request error as a JSON object");
-          }
-
-          Result<JSON::String> message =
-            error.as<JSON::Object>().find<JSON::String>("message");
-
-          if (message.isError()) {
-            return Failure("Failed to parse bad request error message: " +
-                           message.error());
-          } else if (message.isNone()) {
-            continue;
-          }
-
-          // Messages are joined with ", ".
-          if (first) {
-            out << message.get().value;
-            first = false;
-          } else {
-            out << ", " << message.get().value;
-          }
-        }
-
-        return Failure("Received Bad request, errors: [" + out.str() + "]");
-      }
-
-      // Prevent infinite recursion.
-      if (lastResponseStatus.isSome() &&
-          (lastResponseStatus.get() == httpResponse.status)) {
-        return Failure("Invalid response: " + httpResponse.status);
-      }
-
-      // If resend is not set, we don't try again and stop here.
-      if (!resend) {
-        return Failure("Bad response: " + httpResponse.status);
-      }
-
-      // Handle 401 Unauthorized.
-      if (httpResponse.status == "401 Unauthorized") {
-        Try<process::http::Headers> authAttributes =
-          getAuthenticationAttributes(httpResponse);
-
-        if (authAttributes.isError()) {
-          return Failure(
-              "Failed to get authentication attributes: " +
-              authAttributes.error());
-        }
-
-        // `at()` throws on a missing key, so make sure the challenge
-        // carries both attributes before reading them.
-        if (!authAttributes.get().contains("service") ||
-            !authAttributes.get().contains("scope")) {
-          return Failure(
-              "Failed to find 'service' and 'scope' in authentication "
-              "attributes");
-        }
-
-        // TODO(jojy): Currently only handling TLS/cert authentication.
-        Future<Token> tokenResponse = tokenManager_->getToken(
-            authAttributes.get().at("service"),
-            authAttributes.get().at("scope"),
-            None());
-
-        return tokenResponse
-          .after(timeout, [=](
-              Future<Token> tokenResponse) -> Future<Token> {
-            tokenResponse.discard();
-            return Failure("Token response timeout");
-          })
-          .then(defer(self(), [=](
-              const Future<Token>& tokenResponse) {
-            // Send request with acquired token.
-            process::http::Headers authHeaders = {
-              {"Authorization", "Bearer " + tokenResponse.get().raw}
-            };
-
-            return doHttpGet(
-                url,
-                authHeaders,
-                timeout,
-                true,
-                httpResponse.status);
-          }));
-      } else if (httpResponse.status == "307 Temporary Redirect") {
-        // Handle redirect.
-
-        // TODO(jojy): Add redirect functionality in http::get.
-
-        auto toURL = [](
-            const string& urlString) -> Try<URL> {
-          // TODO(jojy): Need to add functionality to URL class that parses a
-          // string to its URL components. For now, assuming:
-          //  - scheme is https
-          //  - path always ends with /
-
-          static const string schemePrefix = "https://";
-
-          // The scheme is stripped by length below, so it has to be a
-          // prefix and not merely occur somewhere in the string.
-          if (!strings::startsWith(urlString, schemePrefix)) {
-            return Error(
-                "Failed to find expected token '" + schemePrefix +
-                "' in redirect url");
-          }
-
-          const string schemeSuffix = urlString.substr(schemePrefix.length());
-
-          const vector<string> components =
-            strings::tokenize(schemeSuffix, "/");
-
-          // Nothing is tokenized when no host follows the scheme.
-          if (components.empty()) {
-            return Error(
-                "Failed to find host in redirect url '" + urlString + "'");
-          }
-
-          const string path = schemeSuffix.substr(components[0].length());
-
-          const vector<string> addrComponents =
-            strings::tokenize(components[0], ":");
-
-          uint16_t port = DEFAULT_SSL_PORT;
-          string domain = components[0];
-
-          // Parse the port.
-          if (addrComponents.size() == 2) {
-            domain = addrComponents[0];
-
-            Try<uint16_t> tryPort = numify<uint16_t>(addrComponents[1]);
-            if (tryPort.isError()) {
-              return Error(
-                  "Failed to parse location: " + urlString + " for port.");
-            }
-
-            port = tryPort.get();
-          }
-
-          return URL("https", domain, port, path);
-        };
-
-        if (httpResponse.headers.find("Location") ==
-            httpResponse.headers.end()) {
-          return Failure(
-              "Invalid redirect response: 'Location' not found in headers.");
-        }
-
-        const string& location = httpResponse.headers.at("Location");
-        Try<URL> tryUrl = toURL(location);
-        if (tryUrl.isError()) {
-          return Failure(
-              "Failed to parse '" + location + "': " + tryUrl.error());
-        }
-
-        // `resend` is cleared so that the redirected request is the last
-        // one issued.
-        return doHttpGet(
-            tryUrl.get(),
-            headers,
-            timeout,
-            false,
-            httpResponse.status);
-      } else {
-        return Failure("Invalid response: " + httpResponse.status);
-      }
-    }));
-}
-
-
-// Fetches the manifest of repository `path` at `tag` ("latest" when no tag
-// is given) from the registry server and converts it to a
-// ManifestResponse.
-//
-// The returned future fails when `path` or the tag contains a space, when
-// the HTTP exchange fails, or when the response lacks the
-// "Docker-Content-Digest" header or any of the "name", "fsLayers",
-// "history", "blobSum", "v1Compatibility" or "id" entries.
-Future<ManifestResponse> RegistryClientProcess::getManifest(
-    const string& path,
-    const Option<string>& tag,
-    const Duration& timeout)
-{
-  if (strings::contains(path, " ")) {
-    return Failure("Invalid repository path: " + path);
-  }
-
-  string repoTag = tag.getOrElse("latest");
-  if (strings::contains(repoTag, " ")) {
-    return Failure("Invalid repository tag: " + repoTag);
-  }
-
-  URL manifestURL(registryServer_);
-  manifestURL.path =
-    "v2/" + path + "/manifests/" + repoTag;
-
-  // Converts the raw HTTP response into a ManifestResponse.
-  auto getManifestResponse = [](
-      const Response& httpResponse) -> Try<ManifestResponse> {
-    if (!httpResponse.headers.contains("Docker-Content-Digest")) {
-      return Error("Docker-Content-Digest header missing in response");
-    }
-
-    Try<JSON::Object> responseJSON =
-      JSON::parse<JSON::Object>(httpResponse.body);
-
-    if (responseJSON.isError()) {
-      return Error(responseJSON.error());
-    }
-
-    // NOTE: The lookups below test `!isSome()` rather than `isNone()` so
-    // that a lookup error is reported as well, instead of reaching `get()`
-    // on a Result that holds no value.
-    Result<JSON::String> name = responseJSON.get().find<JSON::String>("name");
-    if (!name.isSome()) {
-      return Error("Failed to find \"name\" in manifest response");
-    }
-
-    Result<JSON::Array> fsLayers =
-      responseJSON.get().find<JSON::Array>("fsLayers");
-
-    if (!fsLayers.isSome()) {
-      return Error("Failed to find \"fsLayers\" in manifest response");
-    }
-
-    Result<JSON::Array> historyArray =
-      responseJSON.get().find<JSON::Array>("history");
-
-    if (!historyArray.isSome()) {
-      return Error("Failed to find \"history\" in manifest response");
-    }
-
-    // The two arrays are walked in lockstep below, so they have to be of
-    // equal length.
-    if (historyArray.get().values.size() != fsLayers.get().values.size()) {
-      return Error(
-          "\"history\" and \"fsLayers\" array count mismatch "
-          "in manifest response");
-    }
-
-    vector<FileSystemLayerInfo> fsLayerInfoList;
-    size_t index = 0;
-
-    foreach (const JSON::Value& layer, fsLayers.get().values) {
-      if (!layer.is<JSON::Object>()) {
-        return Error(
-            "Failed to parse layer as a JSON object for index: " +
-            stringify(index));
-      }
-
-      const JSON::Object& layerInfoJSON = layer.as<JSON::Object>();
-
-      // Get blobsum for layer.
-      const Result<JSON::String> blobSumInfo =
-        layerInfoJSON.find<JSON::String>("blobSum");
-
-      if (!blobSumInfo.isSome()) {
-        return Error("Failed to find \"blobSum\" in manifest response");
-      }
-
-      // Get history for layer.
-      if (!historyArray.get().values[index].is<JSON::Object>()) {
-        return Error(
-            "Failed to parse history as a JSON object for index: " +
-            stringify(index));
-      }
-      const JSON::Object& historyObj =
-        historyArray.get().values[index].as<JSON::Object>();
-
-      // Get layer id. The "v1Compatibility" entry is a string that holds a
-      // JSON document of its own, so it is parsed a second time.
-      const Result<JSON::String> v1CompatibilityJSON =
-        historyObj.find<JSON::String>("v1Compatibility");
-
-      if (!v1CompatibilityJSON.isSome()) {
-        return Error(
-            "Failed to obtain layer v1 compability json in manifest for layer: "
-            + stringify(index));
-      }
-
-      Try<JSON::Object> v1CompatibilityObj =
-        JSON::parse<JSON::Object>(v1CompatibilityJSON.get().value);
-
-      if (!v1CompatibilityObj.isSome()) {
-        return Error(
-            "Failed to parse v1 compability json in manifest for layer: "
-            + stringify(index));
-      }
-
-      const Result<JSON::String> id =
-        v1CompatibilityObj.get().find<JSON::String>("id");
-
-      if (!id.isSome()) {
-        return Error(
-            "Failed to find \"id\" in manifest for layer: " + stringify(index));
-      }
-
-      fsLayerInfoList.emplace_back(
-          FileSystemLayerInfo{
-            blobSumInfo.get().value,
-            id.get().value,
-          });
-
-      index++;
-    }
-
-    return ManifestResponse {
-      name.get().value,
-      httpResponse.headers.at("Docker-Content-Digest"),
-      fsLayerInfoList,
-    };
-  };
-
-  return doHttpGet(manifestURL, None(), timeout, true, None())
-    .then([getManifestResponse] (
-        const Response& response) -> Future<ManifestResponse> {
-      Try<ManifestResponse> manifestResponse = getManifestResponse(response);
-
-      if (manifestResponse.isError()) {
-        return Failure(
-            "Failed to parse manifest response: " + manifestResponse.error());
-      }
-
-      return manifestResponse.get();
-    });
-}
-
-
-// Downloads the blob `digest` of repository `path` from the registry
-// server and writes it to `filePath`, creating the parent directory of
-// `filePath` if needed. The future holds the size of the response body in
-// bytes.
-//
-// NOTE(review): `maxSize` is not enforced yet (see the TODO below), and a
-// missing `digest` yields a URL that ends in "/blobs/" - confirm that
-// callers always provide one.
-Future<size_t> RegistryClientProcess::getBlob(
-    const string& path,
-    const Option<string>& digest,
-    const Path& filePath,
-    const Duration& timeout,
-    size_t maxSize)
-{
-  // Reject a malformed repository path before touching the filesystem, so
-  // that an invalid request does not leave a directory behind.
-  if (strings::contains(path, " ")) {
-    return Failure("Invalid repository path: " + path);
-  }
-
-  // TODO(jojy): Return more state, for example - if the directory is new.
-  // TODO(jojy): This currently leaves a residue in failure cases. Would be
-  // ideal if we can completely rollback.
-  Try<Nothing> mkdir = os::mkdir(filePath.dirname(), true);
-  if (mkdir.isError()) {
-    return Failure(
-        "Failed to create directory to download blob: " + mkdir.error());
-  }
-
-  URL blobURL(registryServer_);
-  blobURL.path =
-    "v2/" + path + "/blobs/" + digest.getOrElse("");
-
-  // Writes the response body to `filePath`, replacing any existing content.
-  auto saveBlob = [filePath](
-      const Response& httpResponse) -> Future<size_t> {
-    // TODO(jojy): Add verification step.
-    // TODO(jojy): Add check for max size.
-    size_t size = httpResponse.body.length();
-    Try<int> fd = os::open(
-        filePath.value,
-        O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC,
-        S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
-
-    if (fd.isError()) {
-      return Failure("Failed to open file '" + filePath.value + "': " +
-                     fd.error());
-    }
-
-    // The descriptor is closed once the write has finished, whatever its
-    // outcome.
-    return process::io::write(fd.get(), httpResponse.body)
-      .then([size](const Future<Nothing>&) { return size; })
-      .onAny([fd]() { os::close(fd.get()); } );
-  };
-
-  return doHttpGet(blobURL, None(), timeout, true, None())
-    .then([saveBlob](const Response& response) { return saveBlob(response); });
-}
-
-} // namespace registry {
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/registry_client.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/registry_client.hpp b/src/slave/containerizer/provisioner/docker/registry_client.hpp
deleted file mode 100644
index 1d3377e..0000000
--- a/src/slave/containerizer/provisioner/docker/registry_client.hpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_DOCKER_REGISTRY_CLIENT_HPP__
-#define __PROVISIONER_DOCKER_REGISTRY_CLIENT_HPP__
-
-#include <string>
-#include <vector>
-
-#include <stout/duration.hpp>
-#include <stout/hashmap.hpp>
-#include <stout/json.hpp>
-#include <stout/path.hpp>
-
-#include <process/future.hpp>
-#include <process/http.hpp>
-#include <process/process.hpp>
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-namespace registry {
-
-// Forward declarations.
-class RegistryClientProcess;
-
-
-// Client for fetching image manifests and blobs from a Docker registry
-// server. Instances are obtained through create(); the class is neither
-// copyable nor assignable.
-class RegistryClient
-{
-public:
- /**
- * Encapsulates information about a file system layer.
- */
- struct FileSystemLayerInfo {
- // TODO(jojy): This string currently also includes the checksum type. Need
- // to separate this into checksum method and checksum.
- const std::string checksumInfo;
- const std::string layerId;
- };
-
- /**
- * Encapsulates the response of a "GET Manifest" request.
- *
- * Reference: https://docs.docker.com/registry/spec/api
- */
- struct ManifestResponse {
- const std::string name;
- const std::string digest;
- const std::vector<FileSystemLayerInfo> fsLayerInfoList;
- };
-
- /**
- * Encapsulates auth credentials for the client sessions.
- * TODO(jojy): Secure heap to protect the credentials.
- */
- struct Credentials {
- /**
- * UserId for basic authentication.
- */
- const Option<std::string> userId;
- /**
- * Password for basic authentication.
- */
- const Option<std::string> password;
- /**
- * Account for fetching data from registry.
- */
- const Option<std::string> account;
- };
-
- /**
- * Factory method for creating RegistryClient objects.
- *
- * @param registryServer URL of docker registry server.
- * @param authServer URL of authorization server.
- * @param credentials credentials for client session (optional).
- * @return RegistryClient on success.
- * Error on failure.
- */
- static Try<process::Owned<RegistryClient>> create(
- const process::http::URL& registryServer,
- const process::http::URL& authServer,
- const Option<Credentials>& credentials);
-
- /**
- * Fetches manifest for a repository from the client's remote registry server.
- *
- * @param path path of the repository on the registry.
- * @param tag unique tag that identifies the repository. Will default to
- * latest.
- * @param timeout Maximum time after which the request will time out and
- * return a failure. Will default to RESPONSE_TIMEOUT
- * (NOTE(review): no constant of that name is declared in this class,
- * only DEFAULT_MANIFEST_TIMEOUT_SECS - confirm which one applies).
- * @return ManifestResponse on success.
- * Failure on process failure.
- */
- process::Future<ManifestResponse> getManifest(
- const std::string& path,
- const Option<std::string>& tag,
- const Option<Duration>& timeout);
-
- /**
- * Fetches blob for a repository from the client's remote registry server.
- *
- * @param path path of the repository on the registry.
- * @param digest digest of the blob (from manifest).
- * @param filePath file path to store the fetched blob.
- * @param timeout Maximum time after which the request will time out and
- * return a failure. Will default to RESPONSE_TIMEOUT
- * (NOTE(review): see getManifest - confirm the constant).
- * @param maxSize Maximum size of the response that is acceptable. Will
- * default to MAX_RESPONSE_SIZE (NOTE(review): only
- * DEFAULT_MANIFEST_MAXSIZE_BYTES is declared in this class - confirm).
- * @return size of downloaded blob on success.
- * Failure in case of any errors.
- */
- process::Future<size_t> getBlob(
- const std::string& path,
- const Option<std::string>& digest,
- const Path& filePath,
- const Option<Duration>& timeout,
- const Option<size_t>& maxSize);
-
- ~RegistryClient();
-
-private:
- // Private so that instances can only be obtained through create().
- RegistryClient(
- const process::http::URL& registryServer,
- const process::http::URL& authServer,
- const Option<Credentials>& credentials,
- const process::Owned<RegistryClientProcess>& process);
-
- static const Duration DEFAULT_MANIFEST_TIMEOUT_SECS;
- static const size_t DEFAULT_MANIFEST_MAXSIZE_BYTES;
-
- const process::http::URL registryServer_;
- const process::http::URL authServer_;
- const Option<Credentials> credentials_;
- process::Owned<RegistryClientProcess> process_;
-
- // Not copyable, not assignable.
- RegistryClient(const RegistryClient&) = delete;
- RegistryClient& operator=(const RegistryClient&) = delete;
-};
-
-} // namespace registry {
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_DOCKER_REGISTRY_CLIENT_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/spec.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/spec.cpp b/src/slave/containerizer/provisioner/docker/spec.cpp
deleted file mode 100644
index 5d6893d..0000000
--- a/src/slave/containerizer/provisioner/docker/spec.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stout/foreach.hpp>
-#include <stout/json.hpp>
-#include <stout/protobuf.hpp>
-#include <stout/strings.hpp>
-
-#include "slave/containerizer/provisioner/docker/spec.hpp"
-
-using std::string;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-namespace spec {
-
-// Validate if the specified image manifest conforms to the Docker spec.
-// Checks `manifest` against the parts of the Docker image manifest spec
-// that are enforced here. Returns None() when every check passes and the
-// first violation found otherwise.
-Option<Error> validateManifest(const DockerImageManifest& manifest)
-{
-  const int layerCount = manifest.fslayers_size();
-  const int historyCount = manifest.history_size();
-
-  // Each of the following repeated fields is required to be non-empty.
-  if (layerCount <= 0) {
-    return Error("FsLayers field must have at least one blobSum");
-  }
-
-  if (historyCount <= 0) {
-    return Error("History field must have at least one v1Compatibility");
-  }
-
-  if (manifest.signatures_size() <= 0) {
-    return Error("Signatures field must have at least one signature");
-  }
-
-  // Layers and history entries have to pair up one to one.
-  if (layerCount != historyCount) {
-    return Error("Size of blobSum and v1Compatibility must be equal");
-  }
-
-  // Every blobSum has to contain a ':'.
-  foreach (const docker::DockerImageManifest::FsLayers& layer,
-           manifest.fslayers()) {
-    if (!strings::contains(layer.blobsum(), ":")) {
-      return Error("Incorrect blobSum format");
-    }
-  }
-
-  return None();
-}
-
-
-// Converts `json` into a DockerImageManifest protobuf and validates the
-// result. Returns an Error when either the conversion or the validation
-// fails.
-Try<docker::DockerImageManifest> parse(const JSON::Object& json)
-{
-  Try<docker::DockerImageManifest> parsed =
-    protobuf::parse<docker::DockerImageManifest>(json);
-
-  if (parsed.isError()) {
-    return Error("Protobuf parse failed: " + parsed.error());
-  }
-
-  const Option<Error> violation = validateManifest(parsed.get());
-  if (violation.isNone()) {
-    return parsed.get();
-  }
-
-  return Error("Docker Image Manifest Validation failed: " +
-               violation.get().message);
-}
-
-} // namespace spec {
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/spec.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/spec.hpp b/src/slave/containerizer/provisioner/docker/spec.hpp
deleted file mode 100644
index 199db54..0000000
--- a/src/slave/containerizer/provisioner/docker/spec.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_DOCKER_SPEC_HPP__
-#define __PROVISIONER_DOCKER_SPEC_HPP__
-
-#include <stout/error.hpp>
-#include <stout/json.hpp>
-#include <stout/option.hpp>
-
-#include <mesos/mesos.hpp>
-
-#include "slave/containerizer/provisioner/docker/message.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-namespace spec {
-
-// Validates that the specified image manifest conforms to the Docker spec.
-// Returns None() if the manifest is valid, otherwise an Error describing
-// the violation.
-Option<Error> validateManifest(const docker::DockerImageManifest& manifest);
-
-// TODO(Gilbert): add validations here, e.g., Manifest, Blob, Layout, ImageID.
-
-// Parses a DockerImageManifest from the specified JSON object. Returns an
-// Error if the object cannot be converted.
-// NOTE(review): the definition is expected to run validateManifest() on the
-// result as well - confirm against spec.cpp.
-Try<docker::DockerImageManifest> parse(const JSON::Object& json);
-
-} // namespace spec {
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_DOCKER_SPEC_HPP__
[04/12] mesos git commit: Relocated MesosContainerizer specific files
to the correct location.
Posted by ji...@apache.org.
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/local_puller.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/local_puller.cpp b/src/slave/containerizer/mesos/provisioner/docker/local_puller.cpp
new file mode 100644
index 0000000..f314f20
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/local_puller.cpp
@@ -0,0 +1,355 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <list>
+#include <vector>
+
+#include <glog/logging.h>
+
+#include <stout/json.hpp>
+#include <stout/os.hpp>
+#include <stout/result.hpp>
+
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+#include <process/subprocess.hpp>
+
+#include "common/status_utils.hpp"
+
+#include "slave/containerizer/mesos/provisioner/docker/local_puller.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/store.hpp"
+
+using namespace process;
+
+using std::list;
+using std::pair;
+using std::string;
+using std::vector;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+// Process that performs the work behind LocalPuller: it extracts an image
+// archive into a directory and unpacks each layer of the image there.
+class LocalPullerProcess : public process::Process<LocalPullerProcess>
+{
+public:
+  // `explicit` keeps a Flags value from converting into a process
+  // implicitly, in line with the LocalPuller constructor.
+  explicit LocalPullerProcess(const Flags& _flags) : flags(_flags) {}
+
+  ~LocalPullerProcess() {}
+
+  // Extracts the archive of image `name` into `directory` and unpacks its
+  // layers. The future holds one (layer id, rootfs path) pair per layer,
+  // ordered from the outermost ancestor down to the layer the tag refers
+  // to.
+  process::Future<list<pair<string, string>>> pull(
+      const Image::Name& name,
+      const string& directory);
+
+private:
+  // Extracts the archive at `tarPath` into `directory`.
+  process::Future<Nothing> untarImage(
+      const std::string& tarPath,
+      const std::string& directory);
+
+  // Resolves the layer that `name` refers to in the extracted archive,
+  // follows its chain of parents and unpacks every layer on that chain.
+  process::Future<list<pair<string, string>>> putImage(
+      const Image::Name& name,
+      const std::string& directory);
+
+  // Unpacks each of `layerIds` and collects the results in the same order.
+  process::Future<list<pair<string, string>>> putLayers(
+      const std::string& directory,
+      const std::vector<std::string>& layerIds);
+
+  // Unpacks the tar of a single layer into the rootfs directory of that
+  // layer and returns the (layer id, rootfs path) pair.
+  process::Future<pair<string, string>> putLayer(
+      const std::string& directory,
+      const std::string& layerId);
+
+  const Flags flags;
+};
+
+
+// Creates the backing LocalPullerProcess and spawns it.
+LocalPuller::LocalPuller(const Flags& flags)
+  : process(new LocalPullerProcess(flags))
+{
+  process::spawn(process.get());
+}
+
+
+// Shuts down the backing process spawned by the constructor.
+LocalPuller::~LocalPuller()
+{
+ // Terminate first, then wait on the same process; the order of these two
+ // calls is significant.
+ process::terminate(process.get());
+ process::wait(process.get());
+}
+
+
+// Hands the request over to the backing LocalPullerProcess via dispatch and
+// returns the future of that call.
+Future<list<pair<string, string>>> LocalPuller::pull(
+    const Image::Name& name,
+    const string& directory)
+{
+  Future<list<pair<string, string>>> layers =
+    dispatch(process.get(), &LocalPullerProcess::pull, name, directory);
+
+  return layers;
+}
+
+
+// Locates the archive of image `name` under flags.docker_local_archives_dir,
+// extracts it into `directory` and continues with putImage(). Fails right
+// away when the archive does not exist.
+Future<list<pair<string, string>>> LocalPullerProcess::pull(
+    const Image::Name& name,
+    const string& directory)
+{
+  const string imageName = stringify(name);
+
+  const string archive = paths::getImageArchiveTarPath(
+      flags.docker_local_archives_dir,
+      imageName);
+
+  if (os::exists(archive)) {
+    return untarImage(archive, directory)
+      .then(defer(self(), &Self::putImage, name, directory));
+  }
+
+  return Failure("Failed to find archive for image '" + imageName +
+                 "' at '" + archive + "'");
+}
+
+
+// Extracts the archive at `tarPath` into `directory` by running `tar` as a
+// subprocess. The future fails when the subprocess cannot be created, when
+// its status cannot be reaped, or when tar does not exit with status 0.
+Future<Nothing> LocalPullerProcess::untarImage(
+    const string& tarPath,
+    const string& directory)
+{
+  VLOG(1) << "Untarring image from '" << tarPath
+          << "' to '" << directory << "'";
+
+  // Runs `tar -C <directory> -x -f <tarPath>`.
+  // TODO(tnachen): Terminate tar process when slave exits.
+  const vector<string> command = {
+    "tar", "-C", directory, "-x", "-f", tarPath
+  };
+
+  // Each of the three stream arguments is pointed at /dev/null.
+  Try<Subprocess> tar = subprocess(
+      "tar",
+      command,
+      Subprocess::PATH("/dev/null"),
+      Subprocess::PATH("/dev/null"),
+      Subprocess::PATH("/dev/null"));
+
+  if (tar.isError()) {
+    return Failure("Failed to create tar subprocess: " + tar.error());
+  }
+
+  return tar.get().status()
+    .then([tarPath](const Option<int>& status) -> Future<Nothing> {
+      if (status.isNone()) {
+        return Failure("Failed to reap status for tar subprocess in " +
+                       tarPath);
+      }
+
+      const bool succeeded =
+        WIFEXITED(status.get()) && WEXITSTATUS(status.get()) == 0;
+
+      if (!succeeded) {
+        return Failure("Untar image failed with exit code: " +
+                       WSTRINGIFY(status.get()));
+      }
+
+      return Nothing();
+    });
+}
+
+
+// Reads the manifest of layer `layerId` under `directory` and returns the
+// value of its "parent" field. Yields None() when the field is absent or
+// empty, and an Error when the manifest cannot be read or parsed or the
+// lookup itself fails.
+static Result<string> getParentId(
+    const string& directory,
+    const string& layerId)
+{
+  Try<string> contents =
+    os::read(paths::getImageArchiveLayerManifestPath(directory, layerId));
+
+  if (contents.isError()) {
+    return Error("Failed to read manifest: " + contents.error());
+  }
+
+  Try<JSON::Object> layerJson = JSON::parse<JSON::Object>(contents.get());
+  if (layerJson.isError()) {
+    return Error("Failed to parse manifest: " + layerJson.error());
+  }
+
+  Result<JSON::String> parent = layerJson.get().find<JSON::String>("parent");
+  if (parent.isError()) {
+    return Error("Failed to read parent of layer: " + parent.error());
+  }
+
+  // A layer without a parent either omits the field or leaves it empty.
+  if (parent.isNone() || parent.get() == "") {
+    return None();
+  }
+
+  return parent.get().value;
+}
+
+
+// Resolves the layer that `name` refers to in the extracted archive under
+// `directory`, follows the chain of parent layers and unpacks every layer
+// on that chain through putLayers().
+//
+// The layer id is looked up by repository and tag in the repositories file
+// of the archive. The resulting list is ordered from the outermost
+// ancestor down to the layer the tag refers to.
+Future<list<pair<string, string>>> LocalPullerProcess::putImage(
+    const Image::Name& name,
+    const string& directory)
+{
+  Try<string> value =
+    os::read(paths::getImageArchiveRepositoriesPath(directory));
+
+  if (value.isError()) {
+    return Failure("Failed to read repository JSON: " + value.error());
+  }
+
+  Try<JSON::Object> json = JSON::parse<JSON::Object>(value.get());
+  if (json.isError()) {
+    return Failure("Failed to parse JSON: " + json.error());
+  }
+
+  Result<JSON::Object> repositoryValue =
+    json.get().find<JSON::Object>(name.repository());
+
+  if (repositoryValue.isError()) {
+    return Failure("Failed to find repository: " + repositoryValue.error());
+  } else if (repositoryValue.isNone()) {
+    return Failure("Repository '" + name.repository() + "' is not found");
+  }
+
+  // `repositoryValue` outlives this reference, so no copy is needed.
+  const JSON::Object& repositoryJson = repositoryValue.get();
+
+  // We don't use JSON find here because a tag might contain a '.'.
+  std::map<string, JSON::Value>::const_iterator entry =
+    repositoryJson.values.find(name.tag());
+
+  if (entry == repositoryJson.values.end()) {
+    return Failure("Tag '" + name.tag() + "' is not found");
+  } else if (!entry->second.is<JSON::String>()) {
+    return Failure("Tag JSON value expected to be JSON::String");
+  }
+
+  const string layerId = entry->second.as<JSON::String>().value;
+
+  // The manifest of the tagged layer is read and parsed up front so that a
+  // missing or malformed manifest is reported before any layer is unpacked.
+  Try<string> manifest =
+    os::read(paths::getImageArchiveLayerManifestPath(directory, layerId));
+
+  if (manifest.isError()) {
+    return Failure("Failed to read manifest: " + manifest.error());
+  }
+
+  Try<JSON::Object> manifestJson = JSON::parse<JSON::Object>(manifest.get());
+  if (manifestJson.isError()) {
+    return Failure("Failed to parse manifest: " + manifestJson.error());
+  }
+
+  vector<string> layerIds;
+  layerIds.push_back(layerId);
+
+  Result<string> parentId = getParentId(directory, layerId);
+  while (parentId.isSome()) {
+    // A malformed archive could link its layers in a cycle, in which case
+    // this walk would never terminate.
+    foreach (const string& seen, layerIds) {
+      if (seen == parentId.get()) {
+        return Failure("Found cyclic parent reference to layer '" +
+                       parentId.get() + "' in image archive");
+      }
+    }
+
+    // Parents go to the front so that ancestors precede their children.
+    layerIds.insert(layerIds.begin(), parentId.get());
+    parentId = getParentId(directory, parentId.get());
+  }
+
+  if (parentId.isError()) {
+    return Failure("Failed to find parent layer id of layer '" + layerId +
+                   "': " + parentId.error());
+  }
+
+  return putLayers(directory, layerIds);
+}
+
+
+// Starts putLayer() for every id in `layerIds` and collects the results
+// into a single future, in the order of `layerIds`.
+Future<list<pair<string, string>>> LocalPullerProcess::putLayers(
+    const string& directory,
+    const vector<string>& layerIds)
+{
+  list<Future<pair<string, string>>> pending;
+
+  for (const string& id : layerIds) {
+    pending.push_back(putLayer(directory, id));
+  }
+
+  return collect(pending);
+}
+
+
+// Unpacks the tar of layer `layerId` from the extracted archive under
+// `directory` into the rootfs directory of that layer. The future holds
+// the (layer id, rootfs path) pair.
+//
+// A rootfs directory that already exists is treated as left over from an
+// earlier attempt: it is removed and the layer is unpacked again.
+Future<pair<string, string>> LocalPullerProcess::putLayer(
+    const string& directory,
+    const string& layerId)
+{
+  // We untar the layer from source into a directory, then move the
+  // layer into store. We do this instead of untarring directly to
+  // store to make sure we don't end up with partially untarred layer
+  // rootfs.
+  // NOTE(review): the move into the store is not part of this function;
+  // presumably the caller performs it - confirm.
+
+  const string localRootfsPath =
+    paths::getImageArchiveLayerRootfsPath(directory, layerId);
+
+  // Image layer has been untarred but is not present in the store directory.
+  if (os::exists(localRootfsPath)) {
+    LOG(WARNING) << "Image layer '" << layerId << "' rootfs present at '"
+                 << localRootfsPath << "' but not in store directory. "
+                 << "Removing staged rootfs and untarring layer again.";
+
+    Try<Nothing> rmdir = os::rmdir(localRootfsPath);
+    if (rmdir.isError()) {
+      return Failure("Failed to remove incomplete staged rootfs for layer '" +
+                     layerId + "': " + rmdir.error());
+    }
+  }
+
+  Try<Nothing> mkdir = os::mkdir(localRootfsPath);
+  if (mkdir.isError()) {
+    return Failure("Failed to create rootfs path '" + localRootfsPath +
+                   "': " + mkdir.error());
+  }
+
+  // Untar directory/id/layer.tar into directory/id/rootfs.
+  // The tar file will be removed when the staging directory is
+  // removed.
+  const vector<string> argv = {
+    "tar",
+    "-C",
+    localRootfsPath,
+    "-x",
+    "-f",
+    paths::getImageArchiveLayerTarPath(directory, layerId)
+  };
+
+  Try<Subprocess> s = subprocess(
+      "tar",
+      argv,
+      Subprocess::PATH("/dev/null"),
+      Subprocess::PATH("/dev/null"),
+      Subprocess::PATH("/dev/null"));
+
+  if (s.isError()) {
+    return Failure("Failed to create tar subprocess: " + s.error());
+  }
+
+  return s.get().status()
+    .then([directory, layerId](
+        const Option<int>& status) -> Future<pair<string, string>> {
+      if (status.isNone()) {
+        return Failure("Failed to reap subprocess to untar image");
+      } else if (!WIFEXITED(status.get()) || WEXITSTATUS(status.get()) != 0) {
+        return Failure("Untar failed with exit code: " +
+                       WSTRINGIFY(status.get()));
+      }
+
+      const string rootfsPath =
+        paths::getImageArchiveLayerRootfsPath(directory, layerId);
+
+      if (!os::exists(rootfsPath)) {
+        return Failure("Failed to find the rootfs path after extracting layer"
+                       " '" + layerId + "'");
+      }
+
+      return pair<string, string>(layerId, rootfsPath);
+    });
+}
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/local_puller.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/local_puller.hpp b/src/slave/containerizer/mesos/provisioner/docker/local_puller.hpp
new file mode 100644
index 0000000..87d8002
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/local_puller.hpp
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_DOCKER_LOCAL_PULLER_HPP__
+#define __PROVISIONER_DOCKER_LOCAL_PULLER_HPP__
+
+#include "slave/containerizer/mesos/provisioner/store.hpp"
+
+#include "slave/containerizer/mesos/provisioner/docker/message.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/puller.hpp"
+
+#include "slave/flags.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+// Forward declaration.
+class LocalPullerProcess;
+
+
+/**
+ * LocalPuller assumes Docker images are stored in a local directory
+ * (configured with flags.docker_local_archives_dir), with all the
+ * images saved as tars with file names in the form of <repo>:<tag>.tar.
+ */
+class LocalPuller : public Puller
+{
+public:
+ explicit LocalPuller(const Flags& flags);
+
+ ~LocalPuller();
+
+ process::Future<std::list<std::pair<std::string, std::string>>> pull(
+ const Image::Name& name,
+ const std::string& directory);
+
+private:
+ LocalPuller& operator=(const LocalPuller&) = delete; // Not assignable.
+ LocalPuller(const LocalPuller&) = delete; // Not copyable.
+
+ process::Owned<LocalPullerProcess> process;
+};
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_DOCKER_LOCAL_PULLER_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/message.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/message.hpp b/src/slave/containerizer/mesos/provisioner/docker/message.hpp
new file mode 100644
index 0000000..bb5248c
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/message.hpp
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __MESSAGES_DOCKER_PROVISIONER_HPP__
+#define __MESSAGES_DOCKER_PROVISIONER_HPP__
+
+#include <stout/strings.hpp>
+
+// ONLY USEFUL AFTER RUNNING PROTOC.
+#include "slave/containerizer/mesos/provisioner/docker/message.pb.h"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+// Docker expects the image to be specified on the command line as:
+//   [REGISTRY_HOST[:REGISTRY_PORT]/]REPOSITORY[:TAG|@TYPE:DIGEST]
+//
+// This format is inherently ambiguous when dealing with repository
+// names that include forward slashes. To disambiguate, the docker
+// code looks for '.', or ':', or 'localhost' to decide if the
+// first component is a registry or a repository name. For more
+// detail, drill into the implementation of docker pull.
+//
+// The returned name always carries a tag: the digest if one was
+// given, otherwise the explicit tag, otherwise 'latest'. When both
+// a tag and a digest are present the digest is kept (it is the more
+// specific reference) and the tag is only stripped from the
+// repository.
+//
+// TODO(bmahler): We currently store the digest as a tag, does
+// that make sense?
+//
+// TODO(bmahler): Validate based on docker's validation logic
+// and return a Try here.
+inline Image::Name parseImageName(std::string s)
+{
+  Image::Name name;
+
+  // Extract the digest.
+  if (strings::contains(s, "@")) {
+    std::vector<std::string> split = strings::split(s, "@");
+
+    s = split[0];
+    name.set_tag(split[1]);
+  }
+
+  // Remove the tag. We need to watch out for a
+  // host:port registry, which also contains ':'.
+  if (strings::contains(s, ":")) {
+    std::vector<std::string> split = strings::split(s, ":");
+
+    // The tag must be the last component. If a slash is
+    // present there is a registry port and no tag.
+    if (!strings::contains(split.back(), "/")) {
+      // Do not let the tag overwrite a digest extracted above.
+      // The tag is still removed from `s` below so that it does
+      // not end up as part of the repository.
+      if (name.tag().empty()) {
+        name.set_tag(split.back());
+      }
+
+      split.pop_back();
+
+      s = strings::join(":", split);
+    }
+  }
+
+  // Default to the 'latest' tag when omitted.
+  if (name.tag().empty()) {
+    name.set_tag("latest");
+  }
+
+  // Extract the registry and repository. The first component can
+  // either be the registry, or the first part of the repository!
+  // We resolve this ambiguity using the same hacks used in the
+  // docker code ('.', ':', 'localhost' indicate a registry).
+  std::vector<std::string> split = strings::split(s, "/", 2);
+
+  if (split.size() == 1) {
+    name.set_repository(s);
+  } else if (strings::contains(split[0], ".") ||
+             strings::contains(split[0], ":") ||
+             split[0] == "localhost") {
+    name.set_registry(split[0]);
+    name.set_repository(split[1]);
+  } else {
+    name.set_repository(s);
+  }
+
+  return name;
+}
+
+
+inline std::ostream& operator<<(
+ std::ostream& stream,
+ const Image::Name& name)
+{
+ if (name.has_registry()) {
+ return stream << name.registry() << "/" << name.repository() << ":"
+ << name.tag();
+ }
+
+ return stream << name.repository() << ":" << name.tag();
+}
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __MESSAGES_DOCKER_PROVISIONER_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/message.proto
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/message.proto b/src/slave/containerizer/mesos/provisioner/docker/message.proto
new file mode 100644
index 0000000..c33e0c5
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/message.proto
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import "mesos/mesos.proto";
+
+package mesos.internal.slave.docker;
+
+/**
+ * A Docker Image name and the layer ids of the layers that comprise the image.
+ * The layerIds are ordered, with the root layer id (no parent layer id) first
+ * and the leaf layer id last.
+ */
+message Image {
+ message Name {
+ optional string registry = 1;
+ required string repository = 2;
+
+ // TODO(bmahler): This may hold a tag or a digest, split these?
+ required string tag = 3;
+ }
+
+ required Name name = 1;
+
+ // The order of the layers represents the dependency between layers.
+ repeated string layer_ids = 2;
+}
+
+
+message Images {
+ repeated Image images = 1;
+}
+
+
+/**
+* Protobuf for the Docker image manifest JSON schema:
+* https://github.com/docker/distribution/blob/master/docs/spec/manifest-v2-1.md
+*/
+message DockerImageManifest {
+ required string name = 1;
+ required string tag = 2;
+ required string architecture = 3;
+
+ message FsLayers {
+ required string blobSum = 1;
+ }
+
+ repeated FsLayers fsLayers = 4;
+
+ message History {
+ message V1Compatibility {
+ required string id = 1;
+ required string parent = 2;
+ }
+
+ required V1Compatibility v1Compatibility = 1;
+ }
+
+ repeated History history = 5;
+ required uint32 schemaVersion = 6;
+
+ message Signatures {
+
+ //JOSE (A JSON Web Signature).
+ message Header {
+
+ //JSON Web Key.
+ message Jwk {
+ required string crv = 1;
+ required string kid = 2;
+ required string kty = 3;
+ required string x = 4;
+ required string y = 5;
+ }
+
+ optional Jwk jwk = 1;
+ required string alg = 2;
+ }
+
+ required Header header = 1;
+ required string signature = 2;
+ required string protected = 3;
+ }
+
+ repeated Signatures signatures = 7;
+}
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/metadata_manager.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/metadata_manager.cpp b/src/slave/containerizer/mesos/provisioner/docker/metadata_manager.cpp
new file mode 100644
index 0000000..af6f5b8
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/metadata_manager.cpp
@@ -0,0 +1,232 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "slave/containerizer/mesos/provisioner/docker/metadata_manager.hpp"
+
+#include <vector>
+
+#include <glog/logging.h>
+
+#include <stout/foreach.hpp>
+#include <stout/hashset.hpp>
+#include <stout/os.hpp>
+#include <stout/protobuf.hpp>
+
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+#include <process/owned.hpp>
+
+#include "common/status_utils.hpp"
+
+#include "slave/containerizer/mesos/provisioner/docker/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/message.hpp"
+
+#include "slave/state.hpp"
+
+using namespace process;
+
+using std::list;
+using std::string;
+using std::vector;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+// Process that holds the in-memory table of stored Docker images and
+// writes that table out to the persistent store whenever an image is
+// put.
+class MetadataManagerProcess : public process::Process<MetadataManagerProcess>
+{
+public:
+  // `explicit` so that a Flags object is never implicitly converted
+  // into a process.
+  explicit MetadataManagerProcess(const Flags& _flags) : flags(_flags) {}
+
+  ~MetadataManagerProcess() {}
+
+  Future<Nothing> recover();
+
+  Future<Image> put(
+      const Image::Name& name,
+      const std::vector<std::string>& layerIds);
+
+  Future<Option<Image>> get(const Image::Name& name);
+
+  // TODO(chenlily): Implement removal of unreferenced images.
+
+private:
+  // Write out metadata manager state to persistent store.
+  Try<Nothing> persist();
+
+  const Flags flags;
+
+  // This is a lookup table for images that are stored in memory. It is keyed
+  // by the stringified name of the Image.
+  // For example, "ubuntu:14.04" -> ubuntu:14.04 Image.
+  hashmap<std::string, Image> storedImages;
+};
+
+
+Try<Owned<MetadataManager>> MetadataManager::create(const Flags& flags)
+{
+ Owned<MetadataManagerProcess> process(new MetadataManagerProcess(flags));
+
+ return Owned<MetadataManager>(new MetadataManager(process));
+}
+
+
+MetadataManager::MetadataManager(Owned<MetadataManagerProcess> process)
+ : process(process)
+{
+ process::spawn(CHECK_NOTNULL(process.get()));
+}
+
+
+MetadataManager::~MetadataManager()
+{
+ process::terminate(process.get());
+ process::wait(process.get());
+}
+
+
+Future<Nothing> MetadataManager::recover()
+{
+ return process::dispatch(process.get(), &MetadataManagerProcess::recover);
+}
+
+
+Future<Image> MetadataManager::put(
+ const Image::Name& name,
+ const vector<string>& layerIds)
+{
+ return dispatch(
+ process.get(),
+ &MetadataManagerProcess::put,
+ name,
+ layerIds);
+}
+
+
+Future<Option<Image>> MetadataManager::get(const Image::Name& name)
+{
+ return dispatch(process.get(), &MetadataManagerProcess::get, name);
+}
+
+
+Future<Image> MetadataManagerProcess::put(
+ const Image::Name& name,
+ const vector<string>& layerIds)
+{
+ const string imageName = stringify(name);
+
+ Image dockerImage;
+ dockerImage.mutable_name()->CopyFrom(name);
+ foreach (const string& layerId, layerIds) {
+ dockerImage.add_layer_ids(layerId);
+ }
+
+ storedImages[imageName] = dockerImage;
+
+ Try<Nothing> status = persist();
+ if (status.isError()) {
+ return Failure("Failed to save state of Docker images: " + status.error());
+ }
+
+ return dockerImage;
+}
+
+
+Future<Option<Image>> MetadataManagerProcess::get(
+    const Image::Name& name)
+{
+  // Images are keyed by their stringified name.
+  const auto found = storedImages.find(stringify(name));
+
+  if (found != storedImages.end()) {
+    return found->second;
+  }
+
+  // Not an error: no image is stored under this name.
+  return None();
+}
+
+
+Try<Nothing> MetadataManagerProcess::persist()
+{
+ Images images;
+
+ foreachvalue (const Image& image, storedImages) {
+ images.add_images()->CopyFrom(image);
+ }
+
+ Try<Nothing> status = state::checkpoint(
+ paths::getStoredImagesPath(flags.docker_store_dir), images);
+ if (status.isError()) {
+ return Error("Failed to perform checkpoint: " + status.error());
+ }
+
+ return Nothing();
+}
+
+
+// Rebuilds `storedImages` from the stored images file under
+// `flags.docker_store_dir`. A missing file is not an error. An image
+// is skipped (with a warning) when the rootfs of any of its layers
+// is missing on disk; for duplicate names the first entry is kept.
+Future<Nothing> MetadataManagerProcess::recover()
+{
+  string storedImagesPath = paths::getStoredImagesPath(flags.docker_store_dir);
+
+  if (!os::exists(storedImagesPath)) {
+    LOG(INFO) << "No images to load from disk. Docker provisioner image "
+              << "storage path '" << storedImagesPath << "' does not exist";
+    return Nothing();
+  }
+
+  Result<Images> images = ::protobuf::read<Images>(storedImagesPath);
+  if (images.isError()) {
+    return Failure("Failed to read protobuf for Docker provisioner image: " +
+                   images.error());
+  }
+
+  // The file exists but no message could be read from it (presumably
+  // because it is empty). There is nothing to recover, and
+  // `images.get()` must not be called on a Result that is None.
+  if (images.isNone()) {
+    LOG(WARNING) << "No images to load from disk. Docker provisioner image "
+                 << "storage path '" << storedImagesPath << "' is empty";
+    return Nothing();
+  }
+
+  // Iterate by reference to avoid copying every Image and layer id.
+  foreach (const Image& image, images.get().images()) {
+    vector<string> missingLayerIds;
+    foreach (const string& layerId, image.layer_ids()) {
+      const string rootfsPath =
+        paths::getImageLayerRootfsPath(flags.docker_store_dir, layerId);
+
+      if (!os::exists(rootfsPath)) {
+        missingLayerIds.push_back(layerId);
+      }
+    }
+
+    if (!missingLayerIds.empty()) {
+      LOG(WARNING) << "Skipped loading image '" << stringify(image.name())
+                   << "' due to missing layers: " << stringify(missingLayerIds);
+      continue;
+    }
+
+    const string imageName = stringify(image.name());
+    if (storedImages.contains(imageName)) {
+      LOG(WARNING) << "Found duplicate image in recovery for image name '"
+                   << imageName << "'";
+    } else {
+      storedImages[imageName] = image;
+    }
+  }
+
+  LOG(INFO) << "Loaded " << storedImages.size() << " Docker images";
+
+  return Nothing();
+}
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/metadata_manager.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/metadata_manager.hpp b/src/slave/containerizer/mesos/provisioner/docker/metadata_manager.hpp
new file mode 100644
index 0000000..dbae8d8
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/metadata_manager.hpp
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_DOCKER_METADATA_MANAGER_HPP__
+#define __PROVISIONER_DOCKER_METADATA_MANAGER_HPP__
+
+#include <list>
+#include <string>
+
+#include <stout/hashmap.hpp>
+#include <stout/json.hpp>
+#include <stout/option.hpp>
+#include <stout/protobuf.hpp>
+#include <stout/try.hpp>
+
+#include <process/future.hpp>
+#include <process/owned.hpp>
+#include <process/process.hpp>
+
+#include "slave/containerizer/mesos/provisioner/provisioner.hpp"
+
+#include "slave/containerizer/mesos/provisioner/docker/message.hpp"
+
+#include "slave/flags.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+// Forward Declaration.
+class MetadataManagerProcess;
+
+/**
+ * The MetadataManager tracks the Docker images cached by the
+ * provisioner that are stored on disk. It keeps track of the layers
+ * that Docker images are composed of and recovers Image objects
+ * upon initialization by checking for dependent layers stored on disk.
+ * Currently, image layers are stored indefinitely, with no garbage
+ * collection of unreferenced image layers.
+ */
+class MetadataManager
+{
+public:
+  static Try<process::Owned<MetadataManager>> create(const Flags& flags);
+
+  ~MetadataManager();
+
+  /**
+   * Recover all stored Images and their layer references.
+   */
+  process::Future<Nothing> recover();
+
+  /**
+   * Create an Image, put it in metadata manager and persist the reference
+   * store state to disk.
+   *
+   * @param name     the name of the Docker image to place in the reference
+   *                 store.
+   * @param layerIds the list of layer ids that comprise the Docker image in
+   *                 order where the root layer's id (no parent layer) is first
+   *                 and the leaf layer's id is last.
+   * @return the Image that was created.
+   */
+  process::Future<Image> put(
+      const Image::Name& name,
+      const std::vector<std::string>& layerIds);
+
+  /**
+   * Retrieve Image based on image name if it is among the Images
+   * stored in memory.
+   *
+   * @param name the name of the Docker image to retrieve
+   * @return the stored Image, or None if there is none for that name.
+   */
+  process::Future<Option<Image>> get(const Image::Name& name);
+
+private:
+  explicit MetadataManager(process::Owned<MetadataManagerProcess> process);
+
+  // Deleted rather than merely left undefined, so that an accidental
+  // copy is rejected at compile time instead of at link time.
+  MetadataManager(const MetadataManager&) = delete; // Not copyable.
+  MetadataManager& operator=(const MetadataManager&) = delete; // Not assignable.
+
+  process::Owned<MetadataManagerProcess> process;
+};
+
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_DOCKER_METADATA_MANAGER_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/paths.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/paths.cpp b/src/slave/containerizer/mesos/provisioner/docker/paths.cpp
new file mode 100644
index 0000000..e3392ea
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/paths.cpp
@@ -0,0 +1,115 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "slave/containerizer/mesos/provisioner/docker/paths.hpp"
+
+#include <stout/path.hpp>
+
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+namespace paths {
+
+string getStagingDir(const string& storeDir)
+{
+ return path::join(storeDir, "staging");
+}
+
+
+string getStagingTempDir(const string& storeDir)
+{
+ return path::join(getStagingDir(storeDir), "XXXXXX");
+}
+
+
+string getImageArchiveTarPath(
+ const string& discoveryDir,
+ const string& name)
+{
+ return path::join(discoveryDir, name + ".tar");
+}
+
+
+string getImageArchiveRepositoriesPath(const string& archivePath)
+{
+ return path::join(archivePath, "repositories");
+}
+
+
+std::string getImageArchiveLayerPath(
+ const string& archivePath,
+ const string& layerId)
+{
+ return path::join(archivePath, layerId);
+}
+
+
+string getImageArchiveLayerManifestPath(
+ const string& archivePath,
+ const string& layerId)
+{
+ return path::join(getImageArchiveLayerPath(archivePath, layerId), "json");
+}
+
+
+string getImageArchiveLayerTarPath(
+ const string& archivePath,
+ const string& layerId)
+{
+ return path::join(
+ getImageArchiveLayerPath(archivePath, layerId), "layer.tar");
+}
+
+
+string getImageArchiveLayerRootfsPath(
+ const string& archivePath,
+ const string& layerId)
+{
+ return path::join(getImageArchiveLayerPath(archivePath, layerId), "rootfs");
+}
+
+
+string getImageLayerPath(
+ const string& storeDir,
+ const string& layerId)
+{
+ return path::join(storeDir, "layers", layerId);
+}
+
+
+string getImageLayerRootfsPath(
+ const string& storeDir,
+ const string& layerId)
+{
+ return path::join(getImageLayerPath(storeDir, layerId), "rootfs");
+}
+
+
+string getStoredImagesPath(const string& storeDir)
+{
+ return path::join(storeDir, "storedImages");
+}
+
+} // namespace paths {
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/paths.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/paths.hpp b/src/slave/containerizer/mesos/provisioner/docker/paths.hpp
new file mode 100644
index 0000000..18beb2e
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/paths.hpp
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_DOCKER_PATHS_HPP__
+#define __PROVISIONER_DOCKER_PATHS_HPP__
+
+#include <list>
+#include <string>
+
+#include <mesos/mesos.hpp>
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+namespace paths {
+
+/**
+ * The Docker store file system layout is as follows:
+ * Image store dir ('--docker_store_dir' slave flag)
+ * |--staging
+ * |-- <temp_dir_archive>
+ * |-- <layer_id>
+ * |-- rootfs
+ * |--layers
+ * |--<layer_id>
+ * |--rootfs
+ * |--storedImages (file holding metadata on cached images)
+ */
+
+std::string getStagingDir(const std::string& storeDir);
+
+
+std::string getStagingTempDir(const std::string& storeDir);
+
+
+std::string getImageArchiveTarPath(
+ const std::string& discoveryDir,
+ const std::string& name);
+
+
+std::string getImageArchiveRepositoriesPath(const std::string& archivePath);
+
+
+std::string getImageArchiveLayerPath(
+ const std::string& archivePath,
+ const std::string& layerId);
+
+
+std::string getImageArchiveLayerManifestPath(
+ const std::string& archivePath,
+ const std::string& layerId);
+
+
+std::string getImageArchiveLayerTarPath(
+ const std::string& archivePath,
+ const std::string& layerId);
+
+
+std::string getImageArchiveLayerRootfsPath(
+ const std::string& archivePath,
+ const std::string& layerId);
+
+
+std::string getImageLayerPath(
+ const std::string& storeDir,
+ const std::string& layerId);
+
+
+std::string getImageLayerRootfsPath(
+ const std::string& storeDir,
+ const std::string& layerId);
+
+
+std::string getStoredImagesPath(const std::string& storeDir);
+
+} // namespace paths {
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_DOCKER_PATHS_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/puller.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/puller.cpp b/src/slave/containerizer/mesos/provisioner/docker/puller.cpp
new file mode 100644
index 0000000..f61f9e5
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/puller.cpp
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "slave/containerizer/mesos/provisioner/docker/puller.hpp"
+
+#include "slave/containerizer/mesos/provisioner/docker/local_puller.hpp"
+
+using std::string;
+
+using process::Owned;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+Try<Owned<Puller>> Puller::create(const Flags& flags)
+{
+ const string puller = flags.docker_puller;
+
+ if (puller == "local") {
+ return Owned<Puller>(new LocalPuller(flags));
+ }
+
+ return Error("Unknown or unsupported docker puller: " + puller);
+}
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/puller.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/puller.hpp b/src/slave/containerizer/mesos/provisioner/docker/puller.hpp
new file mode 100644
index 0000000..8010b8a
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/puller.hpp
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_DOCKER_PULLER_HPP__
+#define __PROVISIONER_DOCKER_PULLER_HPP__
+
+#include <list>
+#include <utility>
+
+#include <stout/try.hpp>
+
+#include <process/future.hpp>
+#include <process/owned.hpp>
+
+#include "slave/containerizer/mesos/provisioner/docker/message.hpp"
+
+#include "slave/flags.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+class Puller
+{
+public:
+ static Try<process::Owned<Puller>> create(const Flags& flags);
+
+ virtual ~Puller() {}
+
+ /**
+ * Pull a Docker image's layers into the specified directory, and
+ * return the list of layer ids in that image in the right
+ * dependency order, and also return the directory where
+ * the puller puts its changeset.
+ *
+ * @param name The name of the image.
+ * @param directory The target directory to store the layers.
+ * @return list of layers mapped to their local directories, ordered by
+ * dependency.
+ */
+ virtual process::Future<std::list<std::pair<std::string, std::string>>> pull(
+ const docker::Image::Name& name,
+ const std::string& directory) = 0;
+};
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+
+#endif // __PROVISIONER_DOCKER_PULLER_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/registry_client.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/registry_client.cpp b/src/slave/containerizer/mesos/provisioner/docker/registry_client.cpp
new file mode 100644
index 0000000..5a01f1b
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/registry_client.cpp
@@ -0,0 +1,641 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vector>
+
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+#include <process/http.hpp>
+#include <process/io.hpp>
+
+#include <stout/os.hpp>
+
+#include "slave/containerizer/mesos/provisioner/docker/registry_client.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/token_manager.hpp"
+
+using std::string;
+using std::vector;
+
+using process::Failure;
+using process::Future;
+using process::Owned;
+using process::Process;
+
+using process::http::Request;
+using process::http::Response;
+using process::http::URL;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+namespace registry {
+
+// Shorthand aliases for the nested RegistryClient types used in this file.
+using FileSystemLayerInfo = RegistryClient::FileSystemLayerInfo;
+
+using ManifestResponse = RegistryClient::ManifestResponse;
+
+// Timeout applied by RegistryClient::getManifest and
+// RegistryClient::getBlob when the caller does not supply one.
+const Duration RegistryClient::DEFAULT_MANIFEST_TIMEOUT_SECS = Seconds(10);
+
+// Value substituted for 'maxSize' by RegistryClient::getBlob when the
+// caller does not supply one.
+const size_t RegistryClient::DEFAULT_MANIFEST_MAXSIZE_BYTES = 4096;
+
+// Port used for a redirect target whose 'Location' value carries no
+// explicit port.
+static const uint16_t DEFAULT_SSL_PORT = 443;
+
+// Process that performs the HTTP exchanges with the registry on behalf
+// of RegistryClient, which dispatches its requests here.
+class RegistryClientProcess : public Process<RegistryClientProcess>
+{
+public:
+ // Creates a token manager for 'authServer' and, on success, a process
+ // bound to 'registry'. Returns an error if the token manager cannot be
+ // created.
+ static Try<Owned<RegistryClientProcess>> create(
+ const URL& registry,
+ const URL& authServer,
+ const Option<RegistryClient::Credentials>& creds);
+
+ // Requests "v2/<path>/manifests/<tag>" from the registry ('tag'
+ // defaults to "latest") and parses the response body into a
+ // ManifestResponse.
+ Future<RegistryClient::ManifestResponse> getManifest(
+ const string& path,
+ const Option<string>& tag,
+ const Duration& timeout);
+
+ // Requests "v2/<path>/blobs/<digest>" from the registry, writes the
+ // response body to 'filePath' and returns the length of that body.
+ // NOTE: 'maxSize' is accepted but not yet checked (see the TODO in the
+ // definition).
+ Future<size_t> getBlob(
+ const string& path,
+ const Option<string>& digest,
+ const Path& filePath,
+ const Duration& timeout,
+ size_t maxSize);
+
+private:
+ RegistryClientProcess(
+ const URL& registryServer,
+ const Owned<TokenManager>& tokenManager,
+ const Option<RegistryClient::Credentials>& creds);
+
+ // Issues a GET for 'url' and handles the "200 OK", "400 Bad Request",
+ // "401 Unauthorized" and "307 Temporary Redirect" statuses. When
+ // 'resend' is false, a status other than 200/400 fails immediately.
+ // 'lastResponse' holds the status of the previous attempt (if any) and
+ // is used to stop the retry chain when the same status is seen twice.
+ Future<Response> doHttpGet(
+ const URL& url,
+ const Option<process::http::Headers>& headers,
+ const Duration& timeout,
+ bool resend,
+ const Option<string>& lastResponse) const;
+
+ // Extracts the key/value parameters of a "Bearer" 'WWW-Authenticate'
+ // header from the given response.
+ Try<process::http::Headers> getAuthenticationAttributes(
+ const Response& httpResponse) const;
+
+ const URL registryServer_;
+ Owned<TokenManager> tokenManager_;
+ const Option<RegistryClient::Credentials> credentials_;
+
+ // Not copyable.
+ RegistryClientProcess(const RegistryClientProcess&) = delete;
+ RegistryClientProcess& operator = (const RegistryClientProcess&) = delete;
+};
+
+
+// Factory for RegistryClient objects.
+//
+// Creates the backing RegistryClientProcess first; if that fails (e.g.,
+// the token manager cannot be created) the error is propagated and no
+// client is constructed.
+//
+// 'registryServer' is the URL of the Docker registry and 'authServer'
+// the URL of the authorization server.
+Try<Owned<RegistryClient>> RegistryClient::create(
+    const URL& registryServer,
+    const URL& authServer,
+    const Option<Credentials>& creds)
+{
+  // NOTE: Both RegistryClientProcess::create and the RegistryClient
+  // constructor take the registry URL first and the authorization server
+  // URL second. Forwarding them in the opposite order makes the process
+  // send registry requests to the authorization server and vice versa.
+  Try<Owned<RegistryClientProcess>> process =
+    RegistryClientProcess::create(registryServer, authServer, creds);
+
+  if (process.isError()) {
+    return Error(process.error());
+  }
+
+  return Owned<RegistryClient>(
+      new RegistryClient(registryServer, authServer, creds, process.get()));
+}
+
+
+// Records the connection parameters and starts the backing process.
+RegistryClient::RegistryClient(
+    const URL& registryServer,
+    const URL& authServer,
+    const Option<Credentials>& creds,
+    const Owned<RegistryClientProcess>& process)
+  : registryServer_(registryServer),
+    authServer_(authServer),
+    credentials_(creds),
+    process_(process)
+{
+  // A client without a backing process is a programming error.
+  RegistryClientProcess* backend = CHECK_NOTNULL(process_.get());
+
+  spawn(backend);
+}
+
+
+// Stops the backing process and blocks until it has exited.
+RegistryClient::~RegistryClient()
+{
+  RegistryClientProcess* backend = process_.get();
+
+  terminate(backend);
+  process::wait(backend);
+}
+
+
+// Resolves the optional timeout to a concrete value and hands the
+// request over to the backing process.
+Future<ManifestResponse> RegistryClient::getManifest(
+    const string& _path,
+    const Option<string>& _tag,
+    const Option<Duration>& _timeout)
+{
+  Duration effectiveTimeout = DEFAULT_MANIFEST_TIMEOUT_SECS;
+  if (_timeout.isSome()) {
+    effectiveTimeout = _timeout.get();
+  }
+
+  return dispatch(
+      process_.get(),
+      &RegistryClientProcess::getManifest,
+      _path,
+      _tag,
+      effectiveTimeout);
+}
+
+
+// Resolves the optional timeout and size limit to concrete values and
+// hands the request over to the backing process.
+Future<size_t> RegistryClient::getBlob(
+    const string& _path,
+    const Option<string>& _digest,
+    const Path& _filePath,
+    const Option<Duration>& _timeout,
+    const Option<size_t>& _maxSize)
+{
+  Duration effectiveTimeout = DEFAULT_MANIFEST_TIMEOUT_SECS;
+  if (_timeout.isSome()) {
+    effectiveTimeout = _timeout.get();
+  }
+
+  size_t effectiveMaxSize = DEFAULT_MANIFEST_MAXSIZE_BYTES;
+  if (_maxSize.isSome()) {
+    effectiveMaxSize = _maxSize.get();
+  }
+
+  return dispatch(
+      process_.get(),
+      &RegistryClientProcess::getBlob,
+      _path,
+      _digest,
+      _filePath,
+      effectiveTimeout,
+      effectiveMaxSize);
+}
+
+
+// Builds the token manager for the authorization server and, if that
+// succeeds, the process that talks to the registry server.
+Try<Owned<RegistryClientProcess>> RegistryClientProcess::create(
+    const URL& registryServer,
+    const URL& authServer,
+    const Option<RegistryClient::Credentials>& creds)
+{
+  Try<Owned<TokenManager>> manager = TokenManager::create(authServer);
+
+  if (!manager.isError()) {
+    return Owned<RegistryClientProcess>(
+        new RegistryClientProcess(registryServer, manager.get(), creds));
+  }
+
+  return Error("Failed to create token manager: " + manager.error());
+}
+
+
+// Stores the registry URL, the token manager and the credentials; no
+// other work is done at construction time.
+RegistryClientProcess::RegistryClientProcess(
+    const URL& registryServer,
+    const Owned<TokenManager>& tokenManager,
+    const Option<RegistryClient::Credentials>& creds)
+  : registryServer_(registryServer),
+    tokenManager_(tokenManager),
+    credentials_(creds)
+{
+}
+
+
+// Parses the 'WWW-Authenticate' header of the given response into its
+// key/value parameters. The header value is expected to consist of the
+// word "Bearer", a single space and a comma separated list of
+// key="value" parameters; anything else yields an error.
+Try<process::http::Headers> RegistryClientProcess::getAuthenticationAttributes(
+    const Response& httpResponse) const
+{
+  if (!httpResponse.headers.contains("WWW-Authenticate")) {
+    return Error("Failed to find WWW-Authenticate header value");
+  }
+
+  const string& challenge = httpResponse.headers.at("WWW-Authenticate");
+
+  // TODO(jojy): Look at various possibilities of auth response. We currently
+  // assume that the string will have realm information.
+  const vector<string> challengeParts = strings::tokenize(challenge, " ");
+
+  const bool isBearer =
+    (challengeParts.size() == 2) && (challengeParts[0] == "Bearer");
+
+  if (!isBearer) {
+    return Error("Invalid authentication header value: " + challenge);
+  }
+
+  const vector<string> params = strings::tokenize(challengeParts[1], ",");
+
+  process::http::Headers attributes;
+
+  foreach (const string& param, params) {
+    // Both '=' and '"' act as delimiters, so key="value" splits into
+    // exactly the key and the unquoted value.
+    const vector<string> keyValue = strings::tokenize(param, "=\"");
+
+    if (keyValue.size() != 2) {
+      return Error(
+          "Failed to get authentication attribute from response parameter " +
+          param);
+    }
+
+    attributes.insert({keyValue[0], keyValue[1]});
+  }
+
+  return attributes;
+}
+
+
+// Issues an HTTP GET for 'url' and resolves the response as follows:
+//   - "200 OK": the response is returned as is.
+//   - "400 Bad Request": fails with the messages collected from the
+//     "errors" array of the JSON body.
+//   - "401 Unauthorized" (only if 'resend'): acquires a token for the
+//     'service' and 'scope' named in the 'WWW-Authenticate' header and
+//     re-issues the request with an 'Authorization: Bearer' header.
+//   - "307 Temporary Redirect" (only if 'resend'): re-issues the request
+//     against the 'Location' target, with 'resend' disabled.
+//   - anything else: fails.
+//
+// 'lastResponseStatus' is the status of the previous attempt in a retry
+// chain; seeing the same status twice in a row fails the request so that
+// the chain cannot recurse forever. Each GET and the token acquisition
+// are bounded by 'timeout'.
+//
+// NOTE: Everything in the response originates from the remote server, so
+// it is validated before use rather than trusted.
+Future<Response> RegistryClientProcess::doHttpGet(
+    const URL& url,
+    const Option<process::http::Headers>& headers,
+    const Duration& timeout,
+    bool resend,
+    const Option<string>& lastResponseStatus) const
+{
+  return process::http::get(url, headers)
+    .after(timeout, [](
+        const Future<Response>& httpResponseFuture) -> Future<Response> {
+      return Failure("Response timeout");
+    })
+    .then(defer(self(), [=](
+        const Response& httpResponse) -> Future<Response> {
+      VLOG(1) << "Response status: " + httpResponse.status;
+
+      // Set the future if we get a OK response.
+      if (httpResponse.status == "200 OK") {
+        return httpResponse;
+      } else if (httpResponse.status == "400 Bad Request") {
+        Try<JSON::Object> errorResponse =
+          JSON::parse<JSON::Object>(httpResponse.body);
+
+        if (errorResponse.isError()) {
+          return Failure("Failed to parse bad request response JSON: " +
+                         errorResponse.error());
+        }
+
+        std::ostringstream out;
+        bool first = true;
+        Result<JSON::Array> errorObjects =
+          errorResponse.get().find<JSON::Array>("errors");
+
+        if (errorObjects.isError()) {
+          return Failure("Failed to find 'errors' in bad request response: " +
+                         errorObjects.error());
+        } else if (errorObjects.isNone()) {
+          return Failure("Errors not found in bad request response");
+        }
+
+        foreach (const JSON::Value& error, errorObjects.get().values) {
+          // 'as' does not tolerate a type mismatch, so make sure the
+          // entry really is an object before converting it.
+          if (!error.is<JSON::Object>()) {
+            return Failure(
+                "Failed to parse bad request error: entry is not an object");
+          }
+
+          Result<JSON::String> message =
+            error.as<JSON::Object>().find<JSON::String>("message");
+
+          if (message.isError()) {
+            return Failure("Failed to parse bad request error message: " +
+                           message.error());
+          } else if (message.isNone()) {
+            continue;
+          }
+
+          if (first) {
+            out << message.get().value;
+            first = false;
+          } else {
+            out << ", " << message.get().value;
+          }
+        }
+
+        return Failure("Received Bad request, errors: [" + out.str() + "]");
+      }
+
+      // Prevent infinite recursion.
+      if (lastResponseStatus.isSome() &&
+          (lastResponseStatus.get() == httpResponse.status)) {
+        return Failure("Invalid response: " + httpResponse.status);
+      }
+
+      // If resend is not set, we dont try again and stop here.
+      if (!resend) {
+        return Failure("Bad response: " + httpResponse.status);
+      }
+
+      // Handle 401 Unauthorized.
+      if (httpResponse.status == "401 Unauthorized") {
+        Try<process::http::Headers> authAttributes =
+          getAuthenticationAttributes(httpResponse);
+
+        if (authAttributes.isError()) {
+          return Failure(
+              "Failed to get authentication attributes: " +
+              authAttributes.error());
+        }
+
+        // The challenge is not guaranteed to name both attributes, and
+        // 'at' throws for a missing key, so check for them explicitly.
+        const process::http::Headers& attributes = authAttributes.get();
+
+        if (!attributes.contains("service")) {
+          return Failure(
+              "Failed to get authentication attributes: "
+              "'service' not found in WWW-Authenticate header");
+        }
+
+        if (!attributes.contains("scope")) {
+          return Failure(
+              "Failed to get authentication attributes: "
+              "'scope' not found in WWW-Authenticate header");
+        }
+
+        // TODO(jojy): Currently only handling TLS/cert authentication.
+        Future<Token> tokenResponse = tokenManager_->getToken(
+            attributes.at("service"),
+            attributes.at("scope"),
+            None());
+
+        return tokenResponse
+          .after(timeout, [=](
+              Future<Token> tokenResponse) -> Future<Token> {
+            tokenResponse.discard();
+            return Failure("Token response timeout");
+          })
+          .then(defer(self(), [=](
+              const Future<Token>& tokenResponse) {
+            // Send request with acquired token.
+            process::http::Headers authHeaders = {
+              {"Authorization", "Bearer " + tokenResponse.get().raw}
+            };
+
+            return doHttpGet(
+                url,
+                authHeaders,
+                timeout,
+                true,
+                httpResponse.status);
+          }));
+      } else if (httpResponse.status == "307 Temporary Redirect") {
+        // Handle redirect.
+
+        // TODO(jojy): Add redirect functionality in http::get.
+
+        auto toURL = [](
+            const string& urlString) -> Try<URL> {
+          // TODO(jojy): Need to add functionality to URL class that parses a
+          // string to its URL components. For now, assuming:
+          //  - scheme is https
+          //  - path always ends with /
+
+          static const string schemePrefix = "https://";
+
+          // The scheme has to be at the very beginning: the prefix is
+          // stripped by length below, which would cut into the host if
+          // the scheme appeared anywhere else in the string.
+          if (!strings::startsWith(urlString, schemePrefix)) {
+            return Error(
+                "Failed to find expected token '" + schemePrefix +
+                "' in redirect url");
+          }
+
+          const string schemeSuffix = urlString.substr(schemePrefix.length());
+
+          const vector<string> components =
+            strings::tokenize(schemeSuffix, "/");
+
+          // Nothing but the scheme (or only slashes after it) leaves no
+          // host component to index into.
+          if (components.empty()) {
+            return Error(
+                "Failed to find host in redirect url: " + urlString);
+          }
+
+          const string path = schemeSuffix.substr(components[0].length());
+
+          const vector<string> addrComponents =
+            strings::tokenize(components[0], ":");
+
+          uint16_t port = DEFAULT_SSL_PORT;
+          string domain = components[0];
+
+          // Parse the port.
+          if (addrComponents.size() == 2) {
+            domain = addrComponents[0];
+
+            Try<uint16_t> tryPort = numify<uint16_t>(addrComponents[1]);
+            if (tryPort.isError()) {
+              return Error(
+                  "Failed to parse location: " + urlString + " for port.");
+            }
+
+            port = tryPort.get();
+          }
+
+          return URL("https", domain, port, path);
+        };
+
+        if (httpResponse.headers.find("Location") ==
+            httpResponse.headers.end()) {
+          return Failure(
+              "Invalid redirect response: 'Location' not found in headers.");
+        }
+
+        const string& location = httpResponse.headers.at("Location");
+        Try<URL> tryUrl = toURL(location);
+        if (tryUrl.isError()) {
+          return Failure(
+              "Failed to parse '" + location + "': " + tryUrl.error());
+        }
+
+        // A redirect is followed at most once: 'resend' is disabled for
+        // the follow-up request.
+        return doHttpGet(
+            tryUrl.get(),
+            headers,
+            timeout,
+            false,
+            httpResponse.status);
+      } else {
+        return Failure("Invalid response: " + httpResponse.status);
+      }
+    }));
+}
+
+
+// Fetches "v2/<path>/manifests/<tag>" from the registry server and
+// converts the response into a ManifestResponse.
+//
+// 'path' and 'tag' must not contain spaces; 'tag' defaults to "latest".
+// The request fails if the 'Docker-Content-Digest' header is missing, if
+// the body is not a JSON object, or if any of "name", "fsLayers",
+// "history" (and, per layer, "blobSum", "v1Compatibility" and its "id")
+// is missing or malformed.
+Future<ManifestResponse> RegistryClientProcess::getManifest(
+    const string& path,
+    const Option<string>& tag,
+    const Duration& timeout)
+{
+  if (strings::contains(path, " ")) {
+    return Failure("Invalid repository path: " + path);
+  }
+
+  string repoTag = tag.getOrElse("latest");
+  if (strings::contains(repoTag, " ")) {
+    return Failure("Invalid repository tag: " + repoTag);
+  }
+
+  URL manifestURL(registryServer_);
+  manifestURL.path =
+    "v2/" + path + "/manifests/" + repoTag;
+
+  // NOTE: 'find' yields an error (rather than none) when the field is
+  // present but has an unexpected type. Since the body is supplied by the
+  // remote server, every lookup below is checked with '!isSome()' so that
+  // 'get()' is never reached on an error result.
+  auto getManifestResponse = [](
+      const Response& httpResponse) -> Try<ManifestResponse> {
+    if (!httpResponse.headers.contains("Docker-Content-Digest")) {
+      return Error("Docker-Content-Digest header missing in response");
+    }
+
+    Try<JSON::Object> responseJSON =
+      JSON::parse<JSON::Object>(httpResponse.body);
+
+    if (responseJSON.isError()) {
+      return Error(responseJSON.error());
+    }
+
+    Result<JSON::String> name = responseJSON.get().find<JSON::String>("name");
+    if (!name.isSome()) {
+      return Error("Failed to find \"name\" in manifest response");
+    }
+
+    Result<JSON::Array> fsLayers =
+      responseJSON.get().find<JSON::Array>("fsLayers");
+
+    if (!fsLayers.isSome()) {
+      return Error("Failed to find \"fsLayers\" in manifest response");
+    }
+
+    Result<JSON::Array> historyArray =
+      responseJSON.get().find<JSON::Array>("history");
+
+    if (!historyArray.isSome()) {
+      return Error("Failed to find \"history\" in manifest response");
+    }
+
+    // The two arrays are walked in lockstep below, so they have to be of
+    // equal length.
+    if (historyArray.get().values.size() != fsLayers.get().values.size()) {
+      return Error(
+          "\"history\" and \"fsLayers\" array count mismatch "
+          "in manifest response");
+    }
+
+    vector<FileSystemLayerInfo> fsLayerInfoList;
+    size_t index = 0;
+
+    foreach (const JSON::Value& layer, fsLayers.get().values) {
+      if (!layer.is<JSON::Object>()) {
+        return Error(
+            "Failed to parse layer as a JSON object for index: " +
+            stringify(index));
+      }
+
+      const JSON::Object& layerInfoJSON = layer.as<JSON::Object>();
+
+      // Get blobsum for layer.
+      const Result<JSON::String> blobSumInfo =
+        layerInfoJSON.find<JSON::String>("blobSum");
+
+      if (!blobSumInfo.isSome()) {
+        return Error("Failed to find \"blobSum\" in manifest response");
+      }
+
+      // Get history for layer.
+      if (!historyArray.get().values[index].is<JSON::Object>()) {
+        return Error(
+            "Failed to parse history as a JSON object for index: " +
+            stringify(index));
+      }
+      const JSON::Object& historyObj =
+        historyArray.get().values[index].as<JSON::Object>();
+
+      // Get layer id. It is stored inside "v1Compatibility", which is
+      // itself a JSON document encoded as a string.
+      const Result<JSON::String> v1CompatibilityJSON =
+        historyObj.find<JSON::String>("v1Compatibility");
+
+      if (!v1CompatibilityJSON.isSome()) {
+        return Error(
+            "Failed to obtain layer v1 compability json in manifest for layer: "
+            + stringify(index));
+      }
+
+      Try<JSON::Object> v1CompatibilityObj =
+        JSON::parse<JSON::Object>(v1CompatibilityJSON.get().value);
+
+      if (!v1CompatibilityObj.isSome()) {
+        return Error(
+            "Failed to parse v1 compability json in manifest for layer: "
+            + stringify(index));
+      }
+
+      const Result<JSON::String> id =
+        v1CompatibilityObj.get().find<JSON::String>("id");
+
+      if (!id.isSome()) {
+        return Error(
+            "Failed to find \"id\" in manifest for layer: " + stringify(index));
+      }
+
+      fsLayerInfoList.emplace_back(
+          FileSystemLayerInfo{
+            blobSumInfo.get().value,
+            id.get().value,
+          });
+
+      index++;
+    }
+
+    return ManifestResponse {
+      name.get().value,
+      httpResponse.headers.at("Docker-Content-Digest"),
+      fsLayerInfoList,
+    };
+  };
+
+  return doHttpGet(manifestURL, None(), timeout, true, None())
+    .then([getManifestResponse] (
+        const Response& response) -> Future<ManifestResponse> {
+      Try<ManifestResponse> manifestResponse = getManifestResponse(response);
+
+      if (manifestResponse.isError()) {
+        return Failure(
+            "Failed to parse manifest response: " + manifestResponse.error());
+      }
+
+      return manifestResponse.get();
+    });
+}
+
+
+// Fetches "v2/<path>/blobs/<digest>" from the registry server, writes
+// the response body to 'filePath' (creating its parent directory if
+// needed) and returns the length of that body.
+//
+// 'path' must not contain spaces. A missing 'digest' is treated as an
+// empty string. 'maxSize' is currently not enforced (see TODO below).
+Future<size_t> RegistryClientProcess::getBlob(
+    const string& path,
+    const Option<string>& digest,
+    const Path& filePath,
+    const Duration& timeout,
+    size_t maxSize)
+{
+  // Validate the input before touching the filesystem, so that a request
+  // that is rejected up front does not leave a directory behind.
+  if (strings::contains(path, " ")) {
+    return Failure("Invalid repository path: " + path);
+  }
+
+  // TODO(jojy): Return more state, for example - if the directory is new.
+  // TODO(jojy): This currently leaves a residue in failure cases. Would be
+  // ideal if we can completely rollback.
+  Try<Nothing> mkdir = os::mkdir(filePath.dirname(), true);
+  if (mkdir.isError()) {
+    return Failure(
+        "Failed to create directory to download blob: " + mkdir.error());
+  }
+
+  URL blobURL(registryServer_);
+  blobURL.path =
+    "v2/" + path + "/blobs/" + digest.getOrElse("");
+
+  auto saveBlob = [filePath](
+      const Response& httpResponse) -> Future<size_t> {
+    // TODO(jojy): Add verification step.
+    // TODO(jojy): Add check for max size.
+    size_t size = httpResponse.body.length();
+    Try<int> fd = os::open(
+        filePath.value,
+        O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC,
+        S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+
+    if (fd.isError()) {
+      return Failure("Failed to open file '" + filePath.value + "': " +
+                     fd.error());
+    }
+
+    // The descriptor is closed once the write has completed, whether it
+    // succeeded or not.
+    return process::io::write(fd.get(), httpResponse.body)
+      .then([size](const Future<Nothing>&) { return size; })
+      .onAny([fd]() { os::close(fd.get()); } );
+  };
+
+  return doHttpGet(blobURL, None(), timeout, true, None())
+    .then([saveBlob](const Response& response) { return saveBlob(response); });
+}
+
+} // namespace registry {
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/registry_client.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/registry_client.hpp b/src/slave/containerizer/mesos/provisioner/docker/registry_client.hpp
new file mode 100644
index 0000000..1d3377e
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/registry_client.hpp
@@ -0,0 +1,164 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_DOCKER_REGISTRY_CLIENT_HPP__
+#define __PROVISIONER_DOCKER_REGISTRY_CLIENT_HPP__
+
+#include <string>
+#include <vector>
+
+#include <stout/duration.hpp>
+#include <stout/hashmap.hpp>
+#include <stout/json.hpp>
+#include <stout/path.hpp>
+
+#include <process/future.hpp>
+#include <process/http.hpp>
+#include <process/process.hpp>
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+namespace registry {
+
+// Forward declarations.
+class RegistryClientProcess;
+
+
+/**
+ * Client for fetching image manifests and blobs from a Docker registry.
+ * Requests are dispatched to a RegistryClientProcess that is spawned on
+ * construction and terminated on destruction.
+ */
+class RegistryClient
+{
+public:
+ /**
+ * Encapsulates information about a file system layer.
+ */
+ struct FileSystemLayerInfo {
+ // TODO(jojy): This string includes the checksum type also now. Need to
+ // separate this into checksum method and checksum.
+ const std::string checksumInfo;
+ const std::string layerId;
+ };
+
+ /**
+ * Encapsulates response of "GET Manifest" request.
+ *
+ * Reference: https://docs.docker.com/registry/spec/api
+ */
+ struct ManifestResponse {
+ const std::string name;
+ const std::string digest;
+ const std::vector<FileSystemLayerInfo> fsLayerInfoList;
+ };
+
+ /**
+ * Encapsulates auth credentials for the client sessions.
+ * TODO(jojy): Secure heap to protect the credentials.
+ */
+ struct Credentials {
+ /**
+ * UserId for basic authentication.
+ */
+ const Option<std::string> userId;
+ /**
+ * Password for basic authentication.
+ */
+ const Option<std::string> password;
+ /**
+ * Account for fetching data from registry.
+ */
+ const Option<std::string> account;
+ };
+
+ /**
+ * Factory method for creating RegistryClient objects.
+ *
+ * @param registryServer URL of docker registry server.
+ * @param authServer URL of authorization server.
+ * @param credentials credentials for client session (optional).
+ * @return RegistryClient on Success.
+ * Error on failure.
+ */
+ static Try<process::Owned<RegistryClient>> create(
+ const process::http::URL& registryServer,
+ const process::http::URL& authServer,
+ const Option<Credentials>& credentials);
+
+ /**
+ * Fetches manifest for a repository from the client's remote registry server.
+ *
+ * @param path path of the repository on the registry.
+ * @param tag unique tag that identifies the repository. Will default to
+ * latest.
+ * @param timeout Maximum time after which the request will time out and
+ * return a failure. Will default to DEFAULT_MANIFEST_TIMEOUT_SECS.
+ * @return ManifestResponse on success.
+ * Failure on process failure.
+ */
+ process::Future<ManifestResponse> getManifest(
+ const std::string& path,
+ const Option<std::string>& tag,
+ const Option<Duration>& timeout);
+
+ /**
+ * Fetches blob for a repository from the client's remote registry server.
+ *
+ * @param path path of the repository on the registry.
+ * @param digest digest of the blob (from manifest).
+ * @param filePath file path to store the fetched blob.
+ * @param timeout Maximum time after which the request will time out and
+ * return a failure. Will default to DEFAULT_MANIFEST_TIMEOUT_SECS.
+ * @param maxSize Maximum size of the response that is acceptable. Will
+ * default to DEFAULT_MANIFEST_MAXSIZE_BYTES.
+ * @return size of downloaded blob on success.
+ * Failure in case of any errors.
+ */
+ process::Future<size_t> getBlob(
+ const std::string& path,
+ const Option<std::string>& digest,
+ const Path& filePath,
+ const Option<Duration>& timeout,
+ const Option<size_t>& maxSize);
+
+ ~RegistryClient();
+
+private:
+ // Use create() to obtain an instance.
+ RegistryClient(
+ const process::http::URL& registryServer,
+ const process::http::URL& authServer,
+ const Option<Credentials>& credentials,
+ const process::Owned<RegistryClientProcess>& process);
+
+ // Defaults substituted when 'timeout' / 'maxSize' are not supplied.
+ static const Duration DEFAULT_MANIFEST_TIMEOUT_SECS;
+ static const size_t DEFAULT_MANIFEST_MAXSIZE_BYTES;
+
+ const process::http::URL registryServer_;
+ const process::http::URL authServer_;
+ const Option<Credentials> credentials_;
+ process::Owned<RegistryClientProcess> process_;
+
+ // Not copyable.
+ RegistryClient(const RegistryClient&) = delete;
+ RegistryClient& operator=(const RegistryClient&) = delete;
+};
+
+} // namespace registry {
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_DOCKER_REGISTRY_CLIENT_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/spec.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/spec.cpp b/src/slave/containerizer/mesos/provisioner/docker/spec.cpp
new file mode 100644
index 0000000..2703b5d
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/spec.cpp
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stout/foreach.hpp>
+#include <stout/json.hpp>
+#include <stout/protobuf.hpp>
+#include <stout/strings.hpp>
+
+#include "slave/containerizer/mesos/provisioner/docker/spec.hpp"
+
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+namespace spec {
+
+// Checks the structural requirements on a Docker image manifest.
+// Returns the first violation found as an Error, or None if the
+// manifest passes every check.
+Option<Error> validateManifest(const DockerImageManifest& manifest)
+{
+  const int layerCount = manifest.fslayers_size();
+  const int historyCount = manifest.history_size();
+
+  // Each of the following repeated fields is required to be non-empty.
+  if (layerCount < 1) {
+    return Error("FsLayers field must have at least one blobSum");
+  }
+
+  if (historyCount < 1) {
+    return Error("History field must have at least one v1Compatibility");
+  }
+
+  if (manifest.signatures_size() < 1) {
+    return Error("Signatures field must have at least one signature");
+  }
+
+  // There has to be exactly one history entry per layer.
+  if (layerCount != historyCount) {
+    return Error("Size of blobSum and v1Compatibility must be equal");
+  }
+
+  // Every blobSum is required to contain a ':'.
+  foreach (const docker::DockerImageManifest::FsLayers& layer,
+           manifest.fslayers()) {
+    if (!strings::contains(layer.blobsum(), ":")) {
+      return Error("Incorrect blobSum format");
+    }
+  }
+
+  return None();
+}
+
+
+// Converts the JSON object into a DockerImageManifest and validates the
+// result. Fails if either the conversion or the validation fails.
+Try<docker::DockerImageManifest> parse(const JSON::Object& json)
+{
+  Try<docker::DockerImageManifest> parsed =
+    protobuf::parse<docker::DockerImageManifest>(json);
+
+  if (parsed.isError()) {
+    return Error("Protobuf parse failed: " + parsed.error());
+  }
+
+  const Option<Error> violation = validateManifest(parsed.get());
+  if (violation.isNone()) {
+    return parsed.get();
+  }
+
+  return Error(
+      "Docker Image Manifest Validation failed: " + violation.get().message);
+}
+
+} // namespace spec {
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/spec.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/spec.hpp b/src/slave/containerizer/mesos/provisioner/docker/spec.hpp
new file mode 100644
index 0000000..96e8d6d
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/spec.hpp
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_DOCKER_SPEC_HPP__
+#define __PROVISIONER_DOCKER_SPEC_HPP__
+
+#include <stout/error.hpp>
+#include <stout/json.hpp>
+#include <stout/option.hpp>
+
+#include <mesos/mesos.hpp>
+
+#include "slave/containerizer/mesos/provisioner/docker/message.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+namespace spec {
+
+// Validates that the specified image manifest conforms to the Docker
+// spec: 'fsLayers', 'history' and 'signatures' must each have at least
+// one entry, 'fsLayers' and 'history' must have the same number of
+// entries, and every blobSum must contain a ':'. Returns None if the
+// manifest is valid and the first violation found otherwise.
+Option<Error> validateManifest(const docker::DockerImageManifest& manifest);
+
+// TODO(Gilbert): add validations here, e.g., Manifest, Blob, Layout, ImageID.
+
+// Parses the DockerImageManifest from the specified JSON object and
+// validates it with validateManifest(). Returns an error if either the
+// parsing or the validation fails.
+Try<docker::DockerImageManifest> parse(const JSON::Object& json);
+
+} // namespace spec {
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_DOCKER_SPEC_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/store.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/store.cpp b/src/slave/containerizer/mesos/provisioner/docker/store.cpp
new file mode 100644
index 0000000..bb02d65
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/store.cpp
@@ -0,0 +1,277 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "slave/containerizer/mesos/provisioner/docker/store.hpp"
+
+#include <list>
+#include <vector>
+
+#include <glog/logging.h>
+
+#include <stout/json.hpp>
+#include <stout/os.hpp>
+#include <stout/result.hpp>
+
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+#include <process/subprocess.hpp>
+
+#include "common/status_utils.hpp"
+
+#include "slave/containerizer/mesos/provisioner/docker/metadata_manager.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/puller.hpp"
+
+#include "slave/flags.hpp"
+
+using namespace process;
+
+using std::list;
+using std::pair;
+using std::string;
+using std::vector;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+// Libprocess actor that implements the Docker image store. The public
+// 'Store' wrapper in this file dispatches 'recover' and 'get' onto it.
+class StoreProcess : public Process<StoreProcess>
+{
+public:
+  StoreProcess(
+      const Flags& _flags,
+      const Owned<MetadataManager>& _metadataManager,
+      const Owned<Puller>& _puller)
+    : flags(_flags),
+      metadataManager(_metadataManager),
+      puller(_puller) {}
+
+  ~StoreProcess() {}
+
+  // Delegates recovery to the metadata manager.
+  Future<Nothing> recover();
+
+  // Resolves a Docker image to the rootfs directories of its layers,
+  // pulling the image first if the metadata manager does not know it.
+  Future<vector<string>> get(const mesos::Image& image);
+
+private:
+  // Continuation of 'get': returns the known image if there is one,
+  // otherwise pulls it into a staging directory and stores it.
+  Future<Image> _get(
+      const Image::Name& name,
+      const Option<Image>& image);
+
+  // Continuation of '_get': maps the layer ids of 'image' to the
+  // rootfs paths of those layers inside the store directory.
+  Future<vector<string>> __get(const Image& image);
+
+  // Moves every (layer id, staged path) pair into the store and
+  // yields the layer ids in the order they were given.
+  Future<vector<string>> moveLayers(
+      const string& staging,
+      const list<pair<string, string>>& layerPaths);
+
+  // Records the image and its layer ids with the metadata manager.
+  Future<Image> storeImage(
+      const Image::Name& name,
+      const vector<string>& layerIds);
+
+  // Moves a single (layer id, staged path) pair into the store.
+  Future<Nothing> moveLayer(const pair<string, string>& layerPath);
+
+  const Flags flags;
+  Owned<MetadataManager> metadataManager;
+  Owned<Puller> puller;
+};
+
+
+// Creates a Docker store: builds the puller, makes sure the store
+// directory and its staging directory exist, builds the metadata
+// manager and hands all of it to a new StoreProcess.
+Try<Owned<slave::Store>> Store::create(const Flags& flags)
+{
+  Try<Owned<Puller>> puller = Puller::create(flags);
+  if (puller.isError()) {
+    return Error("Failed to create Docker puller: " + puller.error());
+  }
+
+  const string& storeDir = flags.docker_store_dir;
+
+  if (!os::exists(storeDir)) {
+    Try<Nothing> created = os::mkdir(storeDir);
+    if (created.isError()) {
+      return Error(
+          "Failed to create Docker store directory: " + created.error());
+    }
+  }
+
+  // The staging directory lives under the store directory, so it is
+  // only looked at once the store directory is known to exist.
+  const string stagingDir = paths::getStagingDir(storeDir);
+
+  if (!os::exists(stagingDir)) {
+    Try<Nothing> created = os::mkdir(stagingDir);
+    if (created.isError()) {
+      return Error(
+          "Failed to create Docker store staging directory: " +
+          created.error());
+    }
+  }
+
+  Try<Owned<MetadataManager>> metadataManager = MetadataManager::create(flags);
+  if (metadataManager.isError()) {
+    return Error(metadataManager.error());
+  }
+
+  Owned<StoreProcess> storeProcess(
+      new StoreProcess(flags, metadataManager.get(), puller.get()));
+
+  return Owned<slave::Store>(new Store(storeProcess));
+}
+
+
+// Takes ownership of the process and spawns it right away.
+Store::Store(const Owned<StoreProcess>& _process) : process(_process)
+{
+  // A null process is a programming error, hence the CHECK.
+  StoreProcess* actor = CHECK_NOTNULL(process.get());
+  process::spawn(actor);
+}
+
+
+// Terminates the backing process and waits for it before returning.
+Store::~Store()
+{
+  StoreProcess* actor = process.get();
+
+  process::terminate(actor);
+  process::wait(actor);
+}
+
+
+// Forwards the call to StoreProcess::recover on the backing process.
+Future<Nothing> Store::recover()
+{
+  StoreProcess* actor = process.get();
+  return dispatch(actor, &StoreProcess::recover);
+}
+
+
+// Forwards the call to StoreProcess::get on the backing process.
+Future<vector<string>> Store::get(const mesos::Image& image)
+{
+  StoreProcess* actor = process.get();
+  return dispatch(actor, &StoreProcess::get, image);
+}
+
+
+// Entry point of the lookup chain: rejects non-Docker images, then
+// asks the metadata manager for the image and continues with '_get'
+// and '__get' on this process.
+Future<vector<string>> StoreProcess::get(const mesos::Image& image)
+{
+  const bool isDocker = image.type() == mesos::Image::DOCKER;
+  if (!isDocker) {
+    return Failure("Docker provisioner store only supports Docker images");
+  }
+
+  Image::Name name = parseImageName(image.docker().name());
+
+  return metadataManager->get(name)
+    .then(defer(self(), &Self::_get, name, lambda::_1))
+    .then(defer(self(), &Self::__get, lambda::_1));
+}
+
+
+// Continuation of 'get'. If the metadata manager already knows the
+// image it is returned as is. Otherwise the image is pulled into a
+// fresh staging directory, its layers are moved into the store and
+// the image is recorded. The staging directory is removed once that
+// chain has completed, whatever its outcome.
+Future<Image> StoreProcess::_get(
+    const Image::Name& name,
+    const Option<Image>& image)
+{
+  if (image.isSome()) {
+    return image.get();
+  }
+
+  Try<string> staging =
+    os::mkdtemp(paths::getStagingTempDir(flags.docker_store_dir));
+
+  if (staging.isError()) {
+    // Include the underlying error; without it the cause of the
+    // failure cannot be diagnosed from the message.
+    return Failure(
+        "Failed to create a staging directory: " + staging.error());
+  }
+
+  return puller->pull(name, staging.get())
+    .then(defer(self(), &Self::moveLayers, staging.get(), lambda::_1))
+    .then(defer(self(), &Self::storeImage, name, lambda::_1))
+    .onAny([staging]() {
+      // Best-effort cleanup: a failure here is only logged.
+      Try<Nothing> rmdir = os::rmdir(staging.get());
+      if (rmdir.isError()) {
+        LOG(WARNING) << "Failed to remove staging directory: " << rmdir.error();
+      }
+    });
+}
+
+
+// Maps each layer id of 'image' to the rootfs path of that layer
+// under the store directory, preserving the order of the layer ids.
+Future<vector<string>> StoreProcess::__get(const Image& image)
+{
+  vector<string> rootfsPaths;
+
+  for (const string& layerId : image.layer_ids()) {
+    const string rootfs =
+      paths::getImageLayerRootfsPath(flags.docker_store_dir, layerId);
+
+    rootfsPaths.push_back(rootfs);
+  }
+
+  return rootfsPaths;
+}
+
+
+// Recovery is delegated entirely to the metadata manager.
+Future<Nothing> StoreProcess::recover()
+{
+  Future<Nothing> recovered = metadataManager->recover();
+  return recovered;
+}
+
+
+// Moves every (layer id, staged path) pair into the store and, once
+// all moves have succeeded, yields the layer ids in input order.
+// NOTE: 'staging' is accepted but not used by this function.
+Future<vector<string>> StoreProcess::moveLayers(
+    const string& staging,
+    const list<pair<string, string>>& layerPaths)
+{
+  list<Future<Nothing>> moves;
+  for (const pair<string, string>& entry : layerPaths) {
+    moves.push_back(moveLayer(entry));
+  }
+
+  return collect(moves)
+    .then([layerPaths]() {
+      vector<string> ids;
+      for (const pair<string, string>& entry : layerPaths) {
+        ids.push_back(entry.first);
+      }
+
+      return ids;
+    });
+}
+
+
+// Records the image name together with its layer ids by handing
+// both to the metadata manager.
+Future<Image> StoreProcess::storeImage(
+    const Image::Name& name,
+    const vector<string>& layerIds)
+{
+  Future<Image> stored = metadataManager->put(name, layerIds);
+  return stored;
+}
+
+
+// Moves one staged layer into the store. 'layerPath.first' is the
+// layer id and 'layerPath.second' the path the layer was staged at.
+// The layer directory is created in the store if it is missing and
+// the staged path is then renamed to the layer's rootfs path.
+Future<Nothing> StoreProcess::moveLayer(const pair<string, string>& layerPath)
+{
+  const string& layerId = layerPath.first;
+  const string& source = layerPath.second;
+
+  if (!os::exists(source)) {
+    return Failure("Unable to find layer '" + layerId + "' in '" +
+                   source + "'");
+  }
+
+  const string layerDir =
+    paths::getImageLayerPath(flags.docker_store_dir, layerId);
+
+  if (!os::exists(layerDir)) {
+    Try<Nothing> created = os::mkdir(layerDir);
+    if (created.isError()) {
+      return Failure("Failed to create layer path in store for id '" +
+                     layerId + "': " + created.error());
+    }
+  }
+
+  const string rootfs =
+    paths::getImageLayerRootfsPath(flags.docker_store_dir, layerId);
+
+  Try<Nothing> moved = os::rename(source, rootfs);
+  if (moved.isError()) {
+    return Failure("Failed to move layer '" + layerId +
+                   "' to store directory: " + moved.error());
+  }
+
+  return Nothing();
+}
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/store.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/store.hpp b/src/slave/containerizer/mesos/provisioner/docker/store.hpp
new file mode 100644
index 0000000..95e46b9
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/store.hpp
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_DOCKER_STORE_HPP__
+#define __PROVISIONER_DOCKER_STORE_HPP__
+
+#include <string>
+
+#include <stout/try.hpp>
+
+#include <process/future.hpp>
+
+#include "slave/containerizer/mesos/provisioner/store.hpp"
+
+#include "slave/flags.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+// Forward Declarations.
+class Puller;
+class StoreProcess;
+
+
+// Store fetches the Docker images and stores them on disk. Instances
+// are obtained through 'create'; the constructor is private and the
+// class can be neither copied nor assigned.
+class Store : public slave::Store
+{
+public:
+  static Try<process::Owned<slave::Store>> create(const Flags& flags);
+
+  ~Store();
+
+  process::Future<Nothing> recover();
+
+  process::Future<std::vector<std::string>> get(const mesos::Image& image);
+
+private:
+  explicit Store(const process::Owned<StoreProcess>& _process);
+
+  Store(const Store&) = delete; // Not copyable.
+  Store& operator=(const Store&) = delete; // Not assignable.
+
+  // The process that does the actual work of this store.
+  process::Owned<StoreProcess> process;
+};
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_DOCKER_STORE_HPP__
[10/12] mesos git commit: Relocated MesosContainerizer specific files
to the correct location.
Posted by ji...@apache.org.
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/network/port_mapping.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/network/port_mapping.cpp b/src/slave/containerizer/isolators/network/port_mapping.cpp
deleted file mode 100644
index e6bb75e..0000000
--- a/src/slave/containerizer/isolators/network/port_mapping.cpp
+++ /dev/null
@@ -1,3792 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <limits.h>
-#include <string.h>
-#include <unistd.h>
-
-#include <iostream>
-#include <vector>
-
-#include <glog/logging.h>
-
-#include <mesos/mesos.hpp>
-
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/io.hpp>
-#include <process/pid.hpp>
-#include <process/subprocess.hpp>
-
-#include <stout/error.hpp>
-#include <stout/foreach.hpp>
-#include <stout/fs.hpp>
-#include <stout/hashset.hpp>
-#include <stout/json.hpp>
-#include <stout/lambda.hpp>
-#include <stout/mac.hpp>
-#include <stout/multihashmap.hpp>
-#include <stout/numify.hpp>
-#include <stout/os.hpp>
-#include <stout/option.hpp>
-#include <stout/path.hpp>
-#include <stout/protobuf.hpp>
-#include <stout/result.hpp>
-#include <stout/stringify.hpp>
-#include <stout/strings.hpp>
-#include <stout/utils.hpp>
-
-#include <stout/os/exists.hpp>
-#include <stout/os/stat.hpp>
-
-#include "common/status_utils.hpp"
-
-#include "linux/fs.hpp"
-#include "linux/ns.hpp"
-
-#include "linux/routing/route.hpp"
-#include "linux/routing/utils.hpp"
-
-#include "linux/routing/diagnosis/diagnosis.hpp"
-
-#include "linux/routing/filter/basic.hpp"
-#include "linux/routing/filter/icmp.hpp"
-#include "linux/routing/filter/ip.hpp"
-
-#include "linux/routing/handle.hpp"
-
-#include "linux/routing/link/link.hpp"
-
-#include "linux/routing/queueing/fq_codel.hpp"
-#include "linux/routing/queueing/htb.hpp"
-#include "linux/routing/queueing/ingress.hpp"
-#include "linux/routing/queueing/statistics.hpp"
-
-#include "mesos/resources.hpp"
-
-#include "slave/constants.hpp"
-
-#include "slave/containerizer/isolators/network/port_mapping.hpp"
-
-using namespace mesos::internal;
-
-using namespace process;
-
-using namespace routing;
-using namespace routing::filter;
-using namespace routing::queueing;
-using namespace routing::queueing::statistics;
-
-using std::cerr;
-using std::cout;
-using std::dec;
-using std::endl;
-using std::hex;
-using std::list;
-using std::ostringstream;
-using std::set;
-using std::sort;
-using std::string;
-using std::vector;
-
-using filter::ip::PortRange;
-
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::ContainerState;
-using mesos::slave::Isolator;
-
-// An old glibc might not have this symbol.
-#ifndef MNT_DETACH
-#define MNT_DETACH 2
-#endif
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// The minimum number of ephemeral ports a container should have.
-static const uint16_t MIN_EPHEMERAL_PORTS_SIZE = 16;
-
-// Linux traffic control is a combination of queueing disciplines,
-// filters and classes organized as a tree for the ingress (tx) and
-// egress (rx) flows for each interface. Each container provides two
-// networking interfaces, a virtual eth0 and a loopback interface. The
-// flow of packets from the external network to container is shown
-// below:
-//
-// +----------------------+----------------------+
-// | Container |
-// |----------------------|----------------------|
-// | eth0 | lo |
-// +----------------------+----------------------+
-// ^ | ^ |
-// [3] | | [4] | |
-// | | [7] +-----------+ [10]
-// | |
-// | | [8] +-----------+ [9]
-// [2] | | [5] | |
-// | v v v
-// +----------------------+----------------------+
-// | veth0 | lo |
-// +----------------------|----------------------+
-// | Host |
-// |----------------------|----------------------|
-// | eth0 |
-// +----------------------+----------------------|
-// ^ |
-// [1] | | [6]
-// | v
-//
-// Traffic flowing from outside the network into a container enters
-// the system via the host ingress interface [1] and is routed based
-// on destination port to the outbound interface for the matching
-// container [2], which forwards the packet to the container's inbound
-// virtual interface. Outbound traffic destined for the external
-// network flows along the reverse path [4,5,6]. Loopback traffic is
-// directed to the corresponding Ethernet interface, either [7,10] or
-// [8,9] where the same destination port routing can be applied as to
-// external traffic. We use traffic control filters at several of the
-// interfaces to create these packet paths.
-//
-// Linux provides only a very simple topology for ingress interfaces.
-// A root is provided on a fixed handle (handle::INGRESS_ROOT) under
-// which a single qdisc can be installed, with handle ingress::HANDLE.
-// Traffic control filters can then be attached to the ingress qdisc.
-// We install one or more ingress filters on the host eth0 [1] to
-// direct traffic to the correct container, and on the container
-// virtual eth0 [5] to direct traffic to other containers or out of
-// the box. Since we know the ip port assignments for each container,
-// we can direct traffic directly to the appropriate container.
-// However, for ICMP and ARP traffic where no equivalent to a port
-// exists, we send a copy of the packet to every container and rely on
-// the network stack to drop unexpected packets.
-//
-// We install a Hierarchical Token Bucket (HTB) qdisc and class to
-// limit the outbound traffic bandwidth as the egress qdisc inside the
-// container [4] and then add a fq_codel qdisc to limit head of line
-// blocking on the egress filter. The egress traffic control chain is
-// thus:
-//
-// root device: handle::EGRESS_ROOT ->
-// htb egress qdisc: CONTAINER_TX_HTB_HANDLE ->
-// htb rate limiting class: CONTAINER_TX_HTB_CLASS_ID ->
-// buffer-bloat reduction: FQ_CODEL
-constexpr Handle CONTAINER_TX_HTB_HANDLE = Handle(1, 0);
-constexpr Handle CONTAINER_TX_HTB_CLASS_ID =
- Handle(CONTAINER_TX_HTB_HANDLE, 1);
-
-
-// Finally we create a second fq_codel qdisc on the public interface
-// of the host [6] to reduce performance interference between
-// containers. We create independent flows for each container, and
-// one for the host, which ensures packets from each container are
-// guaranteed fair access to the host interface. This egress traffic
-// control chain for the host interface is thus:
-//
-// root device: handle::EGRESS_ROOT ->
-// buffer-bloat reduction: FQ_CODEL
-constexpr Handle HOST_TX_FQ_CODEL_HANDLE = Handle(1, 0);
-
-
-// The primary priority used by each type of filter.
-static const uint8_t ARP_FILTER_PRIORITY = 1;
-static const uint8_t ICMP_FILTER_PRIORITY = 2;
-static const uint8_t IP_FILTER_PRIORITY = 3;
-static const uint8_t DEFAULT_FILTER_PRIORITY = 4;
-
-
-// The secondary priorities used by filters.
-static const uint8_t HIGH = 1;
-static const uint8_t NORMAL = 2;
-static const uint8_t LOW = 3;
-
-
-// We assign a separate flow on host eth0 egress for each container
-// (See MESOS-2422 for details). Host egress traffic is assigned to a
-// reserved flow (HOST_FLOWID). ARP and ICMP traffic from containers
-// are not heavy, so they can share the same flow.
-static const uint16_t HOST_FLOWID = 1;
-static const uint16_t ARP_FLOWID = 2;
-static const uint16_t ICMP_FLOWID = 2;
-static const uint16_t CONTAINER_MIN_FLOWID = 3;
-
-
-// The well known ports, i.e. the interval [0, 1024). Used for sanity
-// check.
-static Interval<uint16_t> WELL_KNOWN_PORTS()
-{
-  const auto lower = Bound<uint16_t>::closed(0);
-  const auto upper = Bound<uint16_t>::open(1024);
-
-  return (lower, upper);
-}
-
-
-/////////////////////////////////////////////////
-// Helper functions for the isolator.
-/////////////////////////////////////////////////
-
-// Given an integer x, find the largest integer t such that t <= x and
-// t is aligned to power of 2. For x == 0 the result is 0.
-static uint32_t roundDownToPowerOfTwo(uint32_t x)
-{
-  // Smear the most significant set bit into every lower position,
-  // turning 00001XXX into 00001111. Doubling the shift each round
-  // (1, 2, 4, 8, 16) covers all 32 bits; the zeros to the left of
-  // the most significant set bit are never turned into ones.
-  for (uint32_t shift = 1; shift < 32; shift <<= 1) {
-    x |= x >> shift;
-  }
-
-  // 00001111 - 00000111 leaves only the most significant bit.
-  return x - (x >> 1);
-}
-
-
-// Returns the name of the host end of the virtual ethernet pair for a
-// given container: the veth prefix followed by the pid. The kernel
-// restricts link name to 16 characters or less, so the container ID
-// cannot be part of the device name. The pid of the executor process
-// forked by the slave is used instead; that is safe because two
-// active containers cannot have the same pid for the executor
-// process.
-static string veth(pid_t pid)
-{
-  const string suffix = stringify(pid);
-  return PORT_MAPPING_VETH_PREFIX() + suffix;
-}
-
-
-// Extracts the pid from the given veth name. Returns None if the
-// name does not start with the veth prefix or if what follows the
-// prefix cannot be parsed as a pid.
-static Option<pid_t> getPidFromVeth(const string& veth)
-{
-  if (!strings::startsWith(veth, PORT_MAPPING_VETH_PREFIX())) {
-    return None();
-  }
-
-  const string suffix =
-    strings::remove(veth, PORT_MAPPING_VETH_PREFIX(), strings::PREFIX);
-
-  Try<pid_t> pid = numify<pid_t>(suffix);
-  if (!pid.isSome()) {
-    return None();
-  }
-
-  return pid.get();
-}
-
-
-// Extracts the container ID from the symlink that points to the
-// network namespace handle. The following is the layout of the bind
-// mount root and bind mount symlink root:
-// <PORT_MAPPING_BIND_MOUNT_ROOT()>
-// |--- 3945 (pid) <-|
-// |
-// <PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT()> |
-// |--- ecf293e7-e6e8-4cbc-aaee-4d6c958aa276 --|
-// (symlink: container ID -> pid)
-static Try<ContainerID> getContainerIdFromSymlink(const string& symlink)
-{
-  const bool isLink = os::stat::islink(symlink);
-  if (!isLink) {
-    return Error("Not a symlink");
-  }
-
-  // The symlink is named after the container ID, so its basename is
-  // the ID value.
-  ContainerID containerId;
-  containerId.set_value(Path(symlink).basename());
-
-  return containerId;
-}
-
-
-// Extracts the pid from the network namespace handle, whose basename
-// is expected to be the pid. A symlink is an error; a basename that
-// is not a number yields None, since such a handle was clearly not
-// created by us.
-static Result<pid_t> getPidFromNamespaceHandle(const string& handle)
-{
-  if (os::stat::islink(handle)) {
-    return Error("Not expecting a symlink");
-  }
-
-  Try<pid_t> pid = numify<pid_t>(Path(handle).basename());
-
-  if (pid.isError()) {
-    return None();
-  }
-
-  return pid.get();
-}
-
-
-// Extracts the pid from the symlink that points to the network
-// namespace handle. Returns an error if the path is not a symlink or
-// cannot be followed, and None if it's a dangling symlink.
-static Result<pid_t> getPidFromSymlink(const string& symlink)
-{
-  if (!os::stat::islink(symlink)) {
-    return Error("Not a symlink");
-  }
-
-  Result<string> target = os::realpath(symlink);
-
-  if (target.isError()) {
-    return Error("Failed to follow the symlink: " + target.error());
-  }
-
-  if (target.isNone()) {
-    // This is a dangling symlink.
-    return None();
-  }
-
-  return getPidFromNamespaceHandle(target.get());
-}
-
-
-// Path of the symlink for a container: the stringified container ID
-// under the bind mount symlink root.
-static string getSymlinkPath(const ContainerID& containerId)
-{
-  const string name = stringify(containerId);
-  return path::join(PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT(), name);
-}
-
-
-// Path of the network namespace handle for a pid: the stringified
-// pid under the bind mount root.
-static string getNamespaceHandlePath(pid_t pid)
-{
-  const string name = stringify(pid);
-  return path::join(PORT_MAPPING_BIND_MOUNT_ROOT(), name);
-}
-
-
-// Converts from value ranges to interval set. Every range becomes a
-// closed interval [begin, end] of 16-bit values.
-static IntervalSet<uint16_t> getIntervalSet(const Value::Ranges& ranges)
-{
-  IntervalSet<uint16_t> ports;
-
-  foreach (const Value::Range& range, ranges.range()) {
-    ports += (Bound<uint16_t>::closed(range.begin()),
-              Bound<uint16_t>::closed(range.end()));
-  }
-
-  return ports;
-}
-
-/////////////////////////////////////////////////
-// Implementation for PortMappingUpdate.
-/////////////////////////////////////////////////
-
-const char* PortMappingUpdate::NAME = "update";
-
-
-// Registers the command line flags understood by the 'update'
-// subcommand. None of them is given a default value here; missing
-// values are rejected in PortMappingUpdate::execute.
-PortMappingUpdate::Flags::Flags()
-{
-  add(&eth0_name,
-      "eth0_name",
-      "The name of the public network interface (e.g., eth0)");
-
-  add(&lo_name,
-      "lo_name",
-      "The name of the loopback network interface (e.g., lo)");
-
-  add(&pid,
-      "pid",
-      "The pid of the process whose namespaces we will enter");
-
-  add(&ports_to_add,
-      "ports_to_add",
-      "A collection of port ranges (formatted as a JSON object)\n"
-      "for which to add IP filters. E.g.,\n"
-      "--ports_to_add={\"range\":[{\"begin\":4,\"end\":8}]}");
-
-  add(&ports_to_remove,
-      "ports_to_remove",
-      "A collection of port ranges (formatted as a JSON object)\n"
-      "for which to remove IP filters. E.g.,\n"
-      "--ports_to_remove={\"range\":[{\"begin\":4,\"end\":8}]}");
-}
-
-
-// The following two helper functions allow us to convert from a
-// collection of port ranges to a JSON object and vice versa. They
-// will be used for the port mapping update operation.
-template <typename Iterable>
-JSON::Object json(const Iterable& ranges)
-{
-  // Mirror every port range into a Value::Range of a Value::Ranges
-  // message, then let the protobuf-to-JSON conversion do the rest.
-  Value::Ranges values;
-
-  foreach (const PortRange& range, ranges) {
-    Value::Range* value = values.add_range();
-    value->set_begin(range.begin());
-    value->set_end(range.end());
-  }
-
-  return JSON::Protobuf(values);
-}
-
-
-// Parses a JSON object holding a Value::Ranges message into port
-// ranges. Fails if the JSON cannot be parsed or if any of the ranges
-// is not a valid port range.
-static Try<vector<PortRange>> parse(const JSON::Object& object)
-{
-  Try<Value::Ranges> parsing = protobuf::parse<Value::Ranges>(object);
-  if (parsing.isError()) {
-    return Error("Failed to parse JSON: " + parsing.error());
-  }
-
-  const Value::Ranges values = parsing.get();
-
-  vector<PortRange> ranges;
-  foreach (const Value::Range& value, values.range()) {
-    Try<PortRange> range = PortRange::fromBeginEnd(value.begin(), value.end());
-    if (range.isError()) {
-      return Error("Invalid port range: " + range.error());
-    }
-
-    ranges.push_back(range.get());
-  }
-
-  return ranges;
-}
-
-
-// Helper function to set up IP filters inside the container for a
-// given port range. Two filters are created: a terminal filter on
-// 'lo' and a redirect filter (for the loopback IP) from 'eth0' to
-// 'lo'. Finding either filter already in place is an error.
-static Try<Nothing> addContainerIPFilters(
-    const PortRange& range,
-    const string& eth0,
-    const string& lo)
-{
-  // Add an IP packet filter on lo such that local traffic inside a
-  // container will not be redirected to eth0.
-  Try<bool> loTerminal = filter::ip::create(
-      lo,
-      ingress::HANDLE,
-      ip::Classifier(None(), None(), None(), range),
-      Priority(IP_FILTER_PRIORITY, HIGH),
-      action::Terminal());
-
-  if (loTerminal.isError()) {
-    return Error(
-        "Failed to create an IP packet filter on " + lo +
-        " which stops packets from being sent to " + eth0 +
-        ": " + loTerminal.error());
-  }
-
-  if (!loTerminal.get()) {
-    return Error(
-        "The IP packet filter on " + lo +
-        " which stops packets from being sent to " +
-        eth0 + " already exists");
-  }
-
-  // Add an IP packet filter (for loopback IP) from eth0 to lo to
-  // redirect all loopback IP traffic to lo.
-  Try<bool> eth0ToLoLoopback = filter::ip::create(
-      eth0,
-      ingress::HANDLE,
-      ip::Classifier(
-          None(),
-          net::IPNetwork::LOOPBACK_V4().address(),
-          None(),
-          range),
-      Priority(IP_FILTER_PRIORITY, NORMAL),
-      action::Redirect(lo));
-
-  if (eth0ToLoLoopback.isError()) {
-    return Error(
-        "Failed to create an IP packet filter (for loopback IP) from " +
-        eth0 + " to " + lo + ": " + eth0ToLoLoopback.error());
-  }
-
-  if (!eth0ToLoLoopback.get()) {
-    return Error(
-        "The IP packet filter (for loopback IP) from " +
-        eth0 + " to " + lo + " already exists");
-  }
-
-  return Nothing();
-}
-
-
-// Helper function to remove IP filters inside the container for a
-// given port range: the terminal filter on 'lo' and the redirect
-// filter (for the loopback IP) from 'eth0' to 'lo'. Finding either
-// filter missing is an error.
-static Try<Nothing> removeContainerIPFilters(
-    const PortRange& range,
-    const string& eth0,
-    const string& lo)
-{
-  // Remove the 'terminal' IP packet filter on lo.
-  Try<bool> loTerminal = filter::ip::remove(
-      lo,
-      ingress::HANDLE,
-      ip::Classifier(None(), None(), None(), range));
-
-  if (loTerminal.isError()) {
-    return Error(
-        "Failed to remove the IP packet filter on " + lo +
-        " which stops packets from being sent to " + eth0 +
-        ": " + loTerminal.error());
-  }
-
-  if (!loTerminal.get()) {
-    return Error(
-        "The IP packet filter on " + lo +
-        " which stops packets from being sent to " + eth0 +
-        " does not exist");
-  }
-
-  // Remove the IP packet filter (for loopback IP) from eth0 to lo.
-  Try<bool> eth0ToLoLoopback = filter::ip::remove(
-      eth0,
-      ingress::HANDLE,
-      ip::Classifier(
-          None(),
-          net::IPNetwork::LOOPBACK_V4().address(),
-          None(),
-          range));
-
-  if (eth0ToLoLoopback.isError()) {
-    return Error(
-        "Failed to remove the IP packet filter (for loopback IP) from " +
-        eth0 + " to " + lo + ": " + eth0ToLoLoopback.error());
-  }
-
-  if (!eth0ToLoLoopback.get()) {
-    return Error(
-        "The IP packet filter (for loopback IP) from " +
-        eth0 + " to " + lo + " does not exist");
-  }
-
-  return Nothing();
-}
-
-
-// Runs the 'update' subcommand: validates the flags, parses the port
-// ranges to add and to remove, enters the network namespace of the
-// given pid and then adds and removes the container IP filters.
-// Returns 0 on success (or after printing the usage) and 1 on any
-// error, with a message written to stderr.
-int PortMappingUpdate::execute()
-{
- if (flags.help) {
- cerr << "Usage: " << name() << " [OPTIONS]" << endl << endl
- << "Supported options:" << endl
- << flags.usage();
- return 0;
- }
-
- // All of eth0_name, lo_name and pid are required.
- if (flags.eth0_name.isNone()) {
- cerr << "The public interface name (e.g., eth0) is not specified" << endl;
- return 1;
- }
-
- if (flags.lo_name.isNone()) {
- cerr << "The loopback interface name (e.g., lo) is not specified" << endl;
- return 1;
- }
-
- if (flags.pid.isNone()) {
- cerr << "The pid is not specified" << endl;
- return 1;
- }
-
- // At least one of the two port collections has to be given.
- if (flags.ports_to_add.isNone() && flags.ports_to_remove.isNone()) {
- cerr << "Nothing to update" << endl;
- return 1;
- }
-
- Option<vector<PortRange>> portsToAdd;
- Option<vector<PortRange>> portsToRemove;
-
- // Both collections are parsed before the namespace is entered, so
- // a malformed flag is reported without anything having changed.
- if (flags.ports_to_add.isSome()) {
- Try<vector<PortRange>> parsing = parse(flags.ports_to_add.get());
- if (parsing.isError()) {
- cerr << "Parsing 'ports_to_add' failed: " << parsing.error() << endl;
- return 1;
- }
- portsToAdd = parsing.get();
- }
-
- if (flags.ports_to_remove.isSome()) {
- Try<vector<PortRange>> parsing = parse(flags.ports_to_remove.get());
- if (parsing.isError()) {
- cerr << "Parsing 'ports_to_remove' failed: " << parsing.error() << endl;
- return 1;
- }
- portsToRemove = parsing.get();
- }
-
- // Enter the network namespace.
- Try<Nothing> setns = ns::setns(flags.pid.get(), "net");
- if (setns.isError()) {
- cerr << "Failed to enter the network namespace of pid " << flags.pid.get()
- << ": " << setns.error() << endl;
- return 1;
- }
-
- // Update IP packet filters.
- const string eth0 = flags.eth0_name.get();
- const string lo = flags.lo_name.get();
-
- // Additions are applied before removals. The first failure aborts
- // the command; filters changed before it are not rolled back.
- if (portsToAdd.isSome()) {
- foreach (const PortRange& range, portsToAdd.get()) {
- Try<Nothing> add = addContainerIPFilters(range, eth0, lo);
- if (add.isError()) {
- cerr << "Failed to add IP filters: " << add.error() << endl;
- return 1;
- }
- }
- }
-
- if (portsToRemove.isSome()) {
- foreach (const PortRange& range, portsToRemove.get()) {
- Try<Nothing> remove = removeContainerIPFilters(range, eth0, lo);
- if (remove.isError()) {
- cerr << "Failed to remove IP filters: " << remove.error() << endl;
- return 1;
- }
- }
- }
-
- return 0;
-}
-
-/////////////////////////////////////////////////
-// Implementation for PortMappingStatistics.
-/////////////////////////////////////////////////
-
-const char* PortMappingStatistics::NAME = "statistics";
-
-
-// Registers the command line flags understood by the 'statistics'
-// subcommand. The two socket statistics switches default to false;
-// eth0_name and pid are given no default value here.
-PortMappingStatistics::Flags::Flags()
-{
-  add(&eth0_name,
-      "eth0_name",
-      "The name of the public network interface (e.g., eth0)");
-
-  add(&pid,
-      "pid",
-      "The pid of the process whose namespaces we will enter");
-
-  add(&enable_socket_statistics_summary,
-      "enable_socket_statistics_summary",
-      "Whether to collect socket statistics summary for this container\n",
-      false);
-
-  add(&enable_socket_statistics_details,
-      "enable_socket_statistics_details",
-      "Whether to collect socket statistics details (e.g., TCP RTT)\n"
-      "for this container.",
-      false);
-}
-
-
-// A helper that copies the traffic control statistics from the
-// statistics hashmap into the ResourceStatistics protocol buffer.
-// A new traffic control entry with the given id is appended to
-// 'result'; a field of that entry is set only if the hashmap has a
-// value for the corresponding key.
-static void addTrafficControlStatistics(
-    const string& id,
-    const hashmap<string, uint64_t>& statistics,
-    ResourceStatistics* result)
-{
-  TrafficControlStatistics* entry =
-    result->add_net_traffic_control_statistics();
-
-  entry->set_id(id);
-
-  // TODO(pbrett) Use protobuf reflection here.
-  if (statistics.contains(BACKLOG)) {
-    entry->set_backlog(statistics.at(BACKLOG));
-  }
-
-  if (statistics.contains(BYTES)) {
-    entry->set_bytes(statistics.at(BYTES));
-  }
-
-  if (statistics.contains(DROPS)) {
-    entry->set_drops(statistics.at(DROPS));
-  }
-
-  if (statistics.contains(OVERLIMITS)) {
-    entry->set_overlimits(statistics.at(OVERLIMITS));
-  }
-
-  if (statistics.contains(PACKETS)) {
-    entry->set_packets(statistics.at(PACKETS));
-  }
-
-  if (statistics.contains(QLEN)) {
-    entry->set_qlen(statistics.at(QLEN));
-  }
-
-  if (statistics.contains(RATE_BPS)) {
-    entry->set_ratebps(statistics.at(RATE_BPS));
-  }
-
-  if (statistics.contains(RATE_PPS)) {
-    entry->set_ratepps(statistics.at(RATE_PPS));
-  }
-
-  if (statistics.contains(REQUEUES)) {
-    entry->set_requeues(statistics.at(REQUEUES));
-  }
-}
-
-
-// Collects network statistics from inside the network namespace of
-// the process given by the 'pid' flag and prints them to stdout as a
-// JSON encoded ResourceStatistics.
-//
-// Returns 0 after printing the usage (when 'help' is set) or after
-// printing the statistics. Returns 1 if a required flag is missing,
-// if the network namespace cannot be entered, or if the requested
-// socket statistics cannot be read. Failures to read the traffic
-// control statistics are only reported on stderr.
-int PortMappingStatistics::execute()
-{
-  if (flags.help) {
-    cerr << "Usage: " << name() << " [OPTIONS]" << endl << endl
-         << "Supported options:" << endl
-         << flags.usage();
-    return 0;
-  }
-
-  if (flags.pid.isNone()) {
-    cerr << "The pid is not specified" << endl;
-    return 1;
-  }
-
-  if (flags.eth0_name.isNone()) {
-    cerr << "The public interface name (e.g., eth0) is not specified" << endl;
-    return 1;
-  }
-
-  // Switch into the network namespace of the given pid. This could
-  // fail if the executor exits before this function is invoked, so
-  // we bail out without logging to avoid spurious logging.
-  Try<Nothing> entered = ns::setns(flags.pid.get(), "net");
-  if (entered.isError()) {
-    return 1;
-  }
-
-  ResourceStatistics result;
-
-  // NOTE: The timestamp is a dummy value since this field will be
-  // cleared before the result is sent to the containerizer.
-  result.set_timestamp(0);
-
-  if (flags.enable_socket_statistics_summary) {
-    // Socket statistics summary.
-    //
-    // The number of ACTIVE and TIME_WAIT TCP connections is read from
-    // /proc/net/sockstat (/proc/net/sockstat6 for IPV6). This is not
-    // as expensive in the kernel because only counter values are
-    // accessed instead of a dump of all the sockets. Example output:
-    //
-    //   $ cat /proc/net/sockstat
-    //   sockets: used 1391
-    //   TCP: inuse 33 orphan 0 tw 0 alloc 37 mem 6
-    //   UDP: inuse 15 mem 7
-    //   UDPLITE: inuse 0
-    //   RAW: inuse 0
-    //   FRAG: inuse 0 memory 0
-    Try<string> sockstat = os::read("/proc/net/sockstat");
-    if (sockstat.isError()) {
-      cerr << "Failed to read /proc/net/sockstat: "
-           << sockstat.error() << endl;
-      return 1;
-    }
-
-    foreach (const string& line, strings::tokenize(sockstat.get(), "\n")) {
-      if (!strings::startsWith(line, "TCP")) {
-        continue;
-      }
-
-      const vector<string> fields = strings::tokenize(line, " ");
-      for (size_t i = 0; i < fields.size(); i++) {
-        const bool isInuse = (fields[i] == "inuse");
-        const bool isTimeWait = (fields[i] == "tw");
-
-        if (!isInuse && !isTimeWait) {
-          continue;
-        }
-
-        // Both keys must be followed by their value. Be a bit
-        // forgiving here since the /proc file output format can
-        // change, though not very likely.
-        if (i + 1 >= fields.size()) {
-          cerr << "Unexpected output from /proc/net/sockstat" << endl;
-          continue;
-        }
-
-        Try<size_t> count = numify<size_t>(fields[i + 1]);
-
-        if (isInuse) {
-          // Number of active TCP connections.
-          if (count.isError()) {
-            cerr << "Failed to parse the number of tcp connections in use: "
-                 << count.error() << endl;
-          } else {
-            result.set_net_tcp_active_connections(count.get());
-          }
-        } else {
-          // Number of TIME_WAIT TCP connections.
-          if (count.isError()) {
-            cerr << "Failed to parse the number of tcp connections in"
-                 << " TIME_WAIT: " << count.error() << endl;
-          } else {
-            result.set_net_tcp_time_wait_connections(count.get());
-          }
-        }
-      }
-    }
-  }
-
-  if (flags.enable_socket_statistics_details) {
-    // Socket statistics details.
-    //
-    // NOTE: If the underlying library uses the older version of
-    // kernel API, the family argument passed in may not be honored.
-    Try<vector<diagnosis::socket::Info>> sockets =
-      diagnosis::socket::infos(AF_INET, diagnosis::socket::state::ALL);
-
-    if (sockets.isError()) {
-      cerr << "Failed to retrieve the socket information" << endl;
-      return 1;
-    }
-
-    vector<uint32_t> rtts;
-    foreach (const diagnosis::socket::Info& info, sockets.get()) {
-      // We double check on family regardless, and skip the sockets
-      // that carry no TCP information.
-      if (info.family != AF_INET || info.tcpInfo.isNone()) {
-        continue;
-      }
-
-      // We consider all sockets that have non-zero rtt value.
-      const uint32_t rtt = info.tcpInfo.get().tcpi_rtt;
-      if (rtt != 0) {
-        rtts.push_back(rtt);
-      }
-    }
-
-    // Only report the percentiles when we have results.
-    if (!rtts.empty()) {
-      std::sort(rtts.begin(), rtts.end());
-
-      // NOTE: The size of rtts is usually within 1 million so we
-      // don't need to worry about overflow here.
-      // TODO(jieyu): Right now, we choose to use "Nearest rank" for
-      // simplicity. Consider directly using the Statistics abstraction
-      // which computes "Linear interpolation between closest ranks".
-      // http://en.wikipedia.org/wiki/Percentile
-      auto nearestRank = [&rtts](size_t percent) {
-        return rtts[rtts.size() * percent / 100];
-      };
-
-      result.set_net_tcp_rtt_microsecs_p50(nearestRank(50));
-      result.set_net_tcp_rtt_microsecs_p90(nearestRank(90));
-      result.set_net_tcp_rtt_microsecs_p95(nearestRank(95));
-      result.set_net_tcp_rtt_microsecs_p99(nearestRank(99));
-    }
-  }
-
-  // Collect traffic statistics for the container from the container
-  // virtual interface and export them in JSON.
-  const string& eth0 = flags.eth0_name.get();
-
-  // Overlimits are reported on the HTB qdisc at the egress root.
-  //
-  // NOTE: A 'None' result is deliberately not treated as an error.
-  // Traffic control statistics are only available when the container
-  // is created on a slave when the egress rate limit is on (i.e.,
-  // egress_rate_limit_per_container flag is set). We can't just test
-  // for that flag here however, since the slave may have been
-  // restarted with different flags since the container was created.
-  // It is also possible that isolator statistics are unavailable
-  // because the container is in the process of being created or
-  // destroyed.
-  Result<hashmap<string, uint64_t>> htbStatistics =
-    htb::statistics(eth0, EGRESS_ROOT);
-
-  if (htbStatistics.isError()) {
-    cerr << "Failed to get htb qdisc statistics on " << eth0
-         << " in namespace " << flags.pid.get() << endl;
-  } else if (htbStatistics.isSome()) {
-    addTrafficControlStatistics(
-        NET_ISOLATOR_BW_LIMIT,
-        htbStatistics.get(),
-        &result);
-  }
-
-  // Drops due to the bandwidth limit should be reported at the leaf.
-  // See the note on missing network isolator statistics above.
-  Result<hashmap<string, uint64_t>> fqCodelStatistics =
-    fq_codel::statistics(eth0, CONTAINER_TX_HTB_CLASS_ID);
-
-  if (fqCodelStatistics.isError()) {
-    cerr << "Failed to get fq_codel qdisc statistics on " << eth0
-         << " in namespace " << flags.pid.get() << endl;
-  } else if (fqCodelStatistics.isSome()) {
-    addTrafficControlStatistics(
-        NET_ISOLATOR_BLOAT_REDUCTION,
-        fqCodelStatistics.get(),
-        &result);
-  }
-
-  cout << stringify(JSON::Protobuf(result));
-  return 0;
-}
-
-
-/////////////////////////////////////////////////
-// Implementation for the isolator.
-/////////////////////////////////////////////////
-
-// Constructs every 'port_mapping/...' counter and registers it with
-// the process metrics registry.
-//
-// NOTE: Every counter initialized below must also be passed to
-// process::metrics::add() here and to process::metrics::remove() in
-// the destructor, otherwise it is never exported. Keep the three
-// lists in sync when adding a counter.
-PortMappingIsolatorProcess::Metrics::Metrics()
-  : adding_eth0_ip_filters_errors(
-        "port_mapping/adding_eth0_ip_filters_errors"),
-    adding_eth0_ip_filters_already_exist(
-        "port_mapping/adding_eth0_ip_filters_already_exist"),
-    adding_eth0_egress_filters_errors(
-        "port_mapping/adding_eth0_egress_filters_errors"),
-    adding_eth0_egress_filters_already_exist(
-        "port_mapping/adding_eth0_egress_filters_already_exist"),
-    adding_lo_ip_filters_errors(
-        "port_mapping/adding_lo_ip_filters_errors"),
-    adding_lo_ip_filters_already_exist(
-        "port_mapping/adding_lo_ip_filters_already_exist"),
-    adding_veth_ip_filters_errors(
-        "port_mapping/adding_veth_ip_filters_errors"),
-    adding_veth_ip_filters_already_exist(
-        "port_mapping/adding_veth_ip_filters_already_exist"),
-    adding_veth_icmp_filters_errors(
-        "port_mapping/adding_veth_icmp_filters_errors"),
-    adding_veth_icmp_filters_already_exist(
-        "port_mapping/adding_veth_icmp_filters_already_exist"),
-    adding_veth_arp_filters_errors(
-        "port_mapping/adding_veth_arp_filters_errors"),
-    adding_veth_arp_filters_already_exist(
-        "port_mapping/adding_veth_arp_filters_already_exist"),
-    adding_eth0_icmp_filters_errors(
-        "port_mapping/adding_eth0_icmp_filters_errors"),
-    adding_eth0_icmp_filters_already_exist(
-        "port_mapping/adding_eth0_icmp_filters_already_exist"),
-    adding_eth0_arp_filters_errors(
-        "port_mapping/adding_eth0_arp_filters_errors"),
-    adding_eth0_arp_filters_already_exist(
-        "port_mapping/adding_eth0_arp_filters_already_exist"),
-    removing_eth0_ip_filters_errors(
-        "port_mapping/removing_eth0_ip_filters_errors"),
-    removing_eth0_ip_filters_do_not_exist(
-        "port_mapping/removing_eth0_ip_filters_do_not_exist"),
-    removing_eth0_egress_filters_errors(
-        "port_mapping/removing_eth0_egress_filters_errors"),
-    removing_eth0_egress_filters_do_not_exist(
-        "port_mapping/removing_eth0_egress_filters_do_not_exist"),
-    removing_lo_ip_filters_errors(
-        "port_mapping/removing_lo_ip_filters_errors"),
-    removing_lo_ip_filters_do_not_exist(
-        "port_mapping/removing_lo_ip_filters_do_not_exist"),
-    removing_veth_ip_filters_errors(
-        "port_mapping/removing_veth_ip_filters_errors"),
-    removing_veth_ip_filters_do_not_exist(
-        "port_mapping/removing_veth_ip_filters_do_not_exist"),
-    removing_eth0_icmp_filters_errors(
-        "port_mapping/removing_eth0_icmp_filters_errors"),
-    removing_eth0_icmp_filters_do_not_exist(
-        "port_mapping/removing_eth0_icmp_filters_do_not_exist"),
-    removing_eth0_arp_filters_errors(
-        "port_mapping/removing_eth0_arp_filters_errors"),
-    removing_eth0_arp_filters_do_not_exist(
-        "port_mapping/removing_eth0_arp_filters_do_not_exist"),
-    updating_eth0_icmp_filters_errors(
-        "port_mapping/updating_eth0_icmp_filters_errors"),
-    updating_eth0_icmp_filters_already_exist(
-        "port_mapping/updating_eth0_icmp_filters_already_exist"),
-    updating_eth0_icmp_filters_do_not_exist(
-        "port_mapping/updating_eth0_icmp_filters_do_not_exist"),
-    updating_eth0_arp_filters_errors(
-        "port_mapping/updating_eth0_arp_filters_errors"),
-    updating_eth0_arp_filters_already_exist(
-        "port_mapping/updating_eth0_arp_filters_already_exist"),
-    updating_eth0_arp_filters_do_not_exist(
-        "port_mapping/updating_eth0_arp_filters_do_not_exist"),
-    updating_container_ip_filters_errors(
-        "port_mapping/updating_container_ip_filters_errors")
-{
-  process::metrics::add(adding_eth0_ip_filters_errors);
-  process::metrics::add(adding_eth0_ip_filters_already_exist);
-  process::metrics::add(adding_eth0_egress_filters_errors);
-  process::metrics::add(adding_eth0_egress_filters_already_exist);
-  process::metrics::add(adding_lo_ip_filters_errors);
-  process::metrics::add(adding_lo_ip_filters_already_exist);
-  process::metrics::add(adding_veth_ip_filters_errors);
-  process::metrics::add(adding_veth_ip_filters_already_exist);
-  process::metrics::add(adding_veth_icmp_filters_errors);
-  process::metrics::add(adding_veth_icmp_filters_already_exist);
-  process::metrics::add(adding_veth_arp_filters_errors);
-  process::metrics::add(adding_veth_arp_filters_already_exist);
-  process::metrics::add(adding_eth0_icmp_filters_errors);
-  process::metrics::add(adding_eth0_icmp_filters_already_exist);
-  process::metrics::add(adding_eth0_arp_filters_errors);
-  process::metrics::add(adding_eth0_arp_filters_already_exist);
-  process::metrics::add(removing_eth0_ip_filters_errors);
-  process::metrics::add(removing_eth0_ip_filters_do_not_exist);
-  process::metrics::add(removing_eth0_egress_filters_errors);
-  process::metrics::add(removing_eth0_egress_filters_do_not_exist);
-  process::metrics::add(removing_lo_ip_filters_errors);
-  process::metrics::add(removing_lo_ip_filters_do_not_exist);
-  process::metrics::add(removing_veth_ip_filters_errors);
-  process::metrics::add(removing_veth_ip_filters_do_not_exist);
-  process::metrics::add(removing_eth0_icmp_filters_errors);
-  process::metrics::add(removing_eth0_icmp_filters_do_not_exist);
-  process::metrics::add(removing_eth0_arp_filters_errors);
-  process::metrics::add(removing_eth0_arp_filters_do_not_exist);
-  process::metrics::add(updating_eth0_icmp_filters_errors);
-  process::metrics::add(updating_eth0_icmp_filters_already_exist);
-  process::metrics::add(updating_eth0_icmp_filters_do_not_exist);
-  process::metrics::add(updating_eth0_arp_filters_errors);
-  process::metrics::add(updating_eth0_arp_filters_already_exist);
-  process::metrics::add(updating_eth0_arp_filters_do_not_exist);
-  process::metrics::add(updating_container_ip_filters_errors);
-}
-
-
-// Unregisters every counter that the constructor registered.
-PortMappingIsolatorProcess::Metrics::~Metrics()
-{
-  process::metrics::remove(adding_eth0_ip_filters_errors);
-  process::metrics::remove(adding_eth0_ip_filters_already_exist);
-  process::metrics::remove(adding_eth0_egress_filters_errors);
-  process::metrics::remove(adding_eth0_egress_filters_already_exist);
-  process::metrics::remove(adding_lo_ip_filters_errors);
-  process::metrics::remove(adding_lo_ip_filters_already_exist);
-  process::metrics::remove(adding_veth_ip_filters_errors);
-  process::metrics::remove(adding_veth_ip_filters_already_exist);
-  process::metrics::remove(adding_veth_icmp_filters_errors);
-  process::metrics::remove(adding_veth_icmp_filters_already_exist);
-  process::metrics::remove(adding_veth_arp_filters_errors);
-  process::metrics::remove(adding_veth_arp_filters_already_exist);
-  process::metrics::remove(adding_eth0_icmp_filters_errors);
-  process::metrics::remove(adding_eth0_icmp_filters_already_exist);
-  process::metrics::remove(adding_eth0_arp_filters_errors);
-  process::metrics::remove(adding_eth0_arp_filters_already_exist);
-  process::metrics::remove(removing_eth0_ip_filters_errors);
-  process::metrics::remove(removing_eth0_ip_filters_do_not_exist);
-  process::metrics::remove(removing_eth0_egress_filters_errors);
-  process::metrics::remove(removing_eth0_egress_filters_do_not_exist);
-  process::metrics::remove(removing_lo_ip_filters_errors);
-  process::metrics::remove(removing_lo_ip_filters_do_not_exist);
-  process::metrics::remove(removing_veth_ip_filters_errors);
-  process::metrics::remove(removing_veth_ip_filters_do_not_exist);
-  process::metrics::remove(removing_eth0_icmp_filters_errors);
-  process::metrics::remove(removing_eth0_icmp_filters_do_not_exist);
-  process::metrics::remove(removing_eth0_arp_filters_errors);
-  process::metrics::remove(removing_eth0_arp_filters_do_not_exist);
-  process::metrics::remove(updating_eth0_icmp_filters_errors);
-  process::metrics::remove(updating_eth0_icmp_filters_already_exist);
-  process::metrics::remove(updating_eth0_icmp_filters_do_not_exist);
-  process::metrics::remove(updating_eth0_arp_filters_errors);
-  process::metrics::remove(updating_eth0_arp_filters_already_exist);
-  process::metrics::remove(updating_eth0_arp_filters_do_not_exist);
-  process::metrics::remove(updating_container_ip_filters_errors);
-}
-
-
-// Creates the port mapping network isolator.
-//
-// The steps are, in order:
-//   1. Check the environment: root permission, network namespace
-//      support, the routing library and the 'tc' and 'ip' commands.
-//   2. Parse and validate the 'ports' and 'ephemeral_ports'
-//      resources and the 'ephemeral_ports_per_container' flag.
-//   3. Determine the public (eth0) and loopback (lo) interfaces and
-//      validate the optional egress rate limit.
-//   4. Prepare the host: MAC and MTU of lo, qdiscs on eth0 and lo,
-//      /proc/sys/net settings on lo, and the bind mount root.
-//   5. Construct the isolator process.
-//
-// Returns an Error describing the first step that failed. NOTE:
-// steps that already modified the host are not rolled back.
-Try<Isolator*> PortMappingIsolatorProcess::create(const Flags& flags)
-{
-  // Check for root permission.
-  if (geteuid() != 0) {
-    return Error("Using network isolator requires root permissions");
-  }
-
-  // Verify that the network namespace is available by checking the
-  // existence of the network namespace handle of the current process.
-  if (ns::namespaces().count("net") == 0) {
-    return Error(
-        "Using network isolator requires network namespace. "
-        "Make sure your kernel is newer than 3.4");
-  }
-
-  // Check the routing library.
-  Try<Nothing> check = routing::check();
-  if (check.isError()) {
-    return Error(
-        "Routing library check failed: " +
-        check.error());
-  }
-
-  // Check the availability of a few Linux commands that we will use.
-  // We use the blocking os::shell here because 'create' will only be
-  // invoked during initialization.
-  Try<string> checkCommandTc = os::shell("tc filter show");
-  if (checkCommandTc.isError()) {
-    return Error("Check command 'tc' failed: " + checkCommandTc.error());
-  }
-
-  Try<string> checkCommandIp = os::shell("ip link show");
-  if (checkCommandIp.isError()) {
-    return Error("Check command 'ip' failed: " + checkCommandIp.error());
-  }
-
-  Try<Resources> resources = Resources::parse(
-      flags.resources.getOrElse(""),
-      flags.default_role);
-
-  if (resources.isError()) {
-    return Error("Failed to parse --resources: " + resources.error());
-  }
-
-  // Get 'ports' resource from 'resources' flag. These ports will be
-  // treated as non-ephemeral ports.
-  IntervalSet<uint16_t> nonEphemeralPorts;
-  if (resources.get().ports().isSome()) {
-    nonEphemeralPorts = getIntervalSet(resources.get().ports().get());
-  }
-
-  // Get 'ephemeral_ports' resource from 'resources' flag. These ports
-  // will be allocated to each container as ephemeral ports.
-  IntervalSet<uint16_t> ephemeralPorts;
-  if (resources.get().ephemeral_ports().isSome()) {
-    ephemeralPorts = getIntervalSet(resources.get().ephemeral_ports().get());
-  }
-
-  // Each container requires at least one ephemeral port for slave
-  // executor communication. If no 'ephemeral_ports' resource is
-  // found, we will return error.
-  if (ephemeralPorts.empty()) {
-    return Error("Ephemeral ports are not specified");
-  }
-
-  // Sanity check to make sure that the ephemeral ports specified do
-  // not intersect with the specified non-ephemeral ports.
-  if (ephemeralPorts.intersects(nonEphemeralPorts)) {
-    return Error(
-        "The specified ephemeral ports " + stringify(ephemeralPorts) +
-        " intersect with the specified non-ephemeral ports " +
-        stringify(nonEphemeralPorts));
-  }
-
-  // This is a sanity check to make sure that the ephemeral ports
-  // specified do not intersect with the well known ports.
-  if (ephemeralPorts.intersects(WELL_KNOWN_PORTS())) {
-    return Error(
-        "The specified ephemeral ports " + stringify(ephemeralPorts) +
-        " intersect with well known ports " + stringify(WELL_KNOWN_PORTS()));
-  }
-
-  // Obtain the host ephemeral port range by reading the proc file
-  // system ('ip_local_port_range').
-  Try<string> value = os::read("/proc/sys/net/ipv4/ip_local_port_range");
-  if (value.isError()) {
-    return Error("Failed to read host ip_local_port_range: " + value.error());
-  }
-
-  vector<string> split = strings::split(strings::trim(value.get()), "\t");
-  if (split.size() != 2) {
-    return Error(
-        "Unexpected format from host ip_local_port_range: " + value.get());
-  }
-
-  Try<uint16_t> begin = numify<uint16_t>(split[0]);
-  if (begin.isError()) {
-    return Error(
-        "Failed to parse the begin of host ip_local_port_range: " + split[0]);
-  }
-
-  Try<uint16_t> end = numify<uint16_t>(split[1]);
-  if (end.isError()) {
-    return Error(
-        "Failed to parse the end of host ip_local_port_range: " + split[1]);
-  }
-
-  // NOTE: The comma between the two bounds is what combines them
-  // into an Interval; the parentheses are required.
-  Interval<uint16_t> hostEphemeralPorts =
-    (Bound<uint16_t>::closed(begin.get()),
-     Bound<uint16_t>::closed(end.get()));
-
-  // Sanity check to make sure the specified ephemeral ports do not
-  // intersect with the ephemeral ports used by the host.
-  if (ephemeralPorts.intersects(hostEphemeralPorts)) {
-    return Error(
-        "The specified ephemeral ports " + stringify(ephemeralPorts) +
-        " intersect with the ephemeral ports used by the host " +
-        stringify(hostEphemeralPorts));
-  }
-
-  // TODO(chzhcn): Cross check ephemeral ports with used ports on the
-  // host (e.g., using port scan).
-
-  // Initialize the ephemeral ports allocator.
-
-  // In theory, any positive integer can be broken up into a few
-  // numbers that are power of 2 aligned. We choose to not allow this
-  // for now so that each container has a fixed (one) number of
-  // filters for ephemeral ports. This makes it easy to debug and
-  // infer performance.
-  if (roundDownToPowerOfTwo(flags.ephemeral_ports_per_container) !=
-      flags.ephemeral_ports_per_container) {
-    return Error(
-        "The number of ephemeral ports for each container (" +
-        stringify(flags.ephemeral_ports_per_container) +
-        ") is not a power of 2");
-  }
-
-  if (ephemeralPorts.size() < flags.ephemeral_ports_per_container) {
-    return Error(
-        "Network Isolator is given ephemeral ports of size: " +
-        stringify(ephemeralPorts.size()) + ", but asked to allocate " +
-        stringify(flags.ephemeral_ports_per_container) +
-        " ephemeral ports for a container");
-  }
-
-  if (flags.ephemeral_ports_per_container < MIN_EPHEMERAL_PORTS_SIZE) {
-    return Error(
-        "Each container has only " +
-        stringify(flags.ephemeral_ports_per_container) +
-        " ephemeral ports. The minimum required is: " +
-        stringify(MIN_EPHEMERAL_PORTS_SIZE));
-  }
-
-  Owned<EphemeralPortsAllocator> ephemeralPortsAllocator(
-      new EphemeralPortsAllocator(
-        ephemeralPorts,
-        flags.ephemeral_ports_per_container));
-
-  // Get the name of the public interface (e.g., eth0). If it is not
-  // specified, try to derive its name from the routing library.
-  Result<string> eth0 = link::eth0();
-  if (flags.eth0_name.isSome()) {
-    eth0 = flags.eth0_name.get();
-
-    // Check if the given public interface exists.
-    Try<bool> hostEth0Exists = link::exists(eth0.get());
-    if (hostEth0Exists.isError()) {
-      return Error(
-          "Failed to check if " + eth0.get() + " exists: " +
-          hostEth0Exists.error());
-    } else if (!hostEth0Exists.get()) {
-      return Error("The public interface " + eth0.get() + " does not exist");
-    }
-  } else if (!eth0.isSome()) {
-    // eth0 is not specified in the flag and we did not get a valid
-    // eth0 from the library. NOTE: 'eth0' is either an error or None
-    // here and error() must only be called in the former case.
-    return Error(
-        "Network Isolator failed to find a public interface: " +
-        (eth0.isError() ? eth0.error() : "no public interface was found"));
-  }
-
-  LOG(INFO) << "Using " << eth0.get() << " as the public interface";
-
-  // Get the name of the loopback interface. If it is not specified,
-  // try to derive its name based on the loopback IP address.
-  Result<string> lo = link::lo();
-  if (flags.lo_name.isSome()) {
-    lo = flags.lo_name;
-
-    // Check if the given loopback interface exists.
-    Try<bool> hostLoExists = link::exists(lo.get());
-    if (hostLoExists.isError()) {
-      return Error(
-          "Failed to check if " + lo.get() + " exists: " +
-          hostLoExists.error());
-    } else if (!hostLoExists.get()) {
-      return Error("The loopback interface " + lo.get() + " does not exist");
-    }
-  } else if (!lo.isSome()) {
-    // lo is not specified in the flag and we did not get a valid
-    // lo from the library. NOTE: 'lo' is either an error or None
-    // here and error() must only be called in the former case.
-    return Error(
-        "Network Isolator failed to find a loopback interface: " +
-        (lo.isError() ? lo.error() : "no loopback interface was found"));
-  }
-
-  LOG(INFO) << "Using " << lo.get() << " as the loopback interface";
-
-  // If egress rate limit is provided, do a sanity check that it is
-  // not greater than the host physical link speed.
-  Option<Bytes> egressRateLimitPerContainer;
-  if (flags.egress_rate_limit_per_container.isSome()) {
-    // Read host physical link speed from /sys/class/net/eth0/speed.
-    // This value is in MBits/s.
-    const string speedPath =
-      path::join("/sys/class/net", eth0.get(), "speed");
-
-    Try<string> speed = os::read(speedPath);
-    if (speed.isError()) {
-      return Error("Failed to read " + speedPath + ": " + speed.error());
-    }
-
-    // The content of the file is not under our control, so report an
-    // unparsable value as an error instead of aborting the process.
-    Try<uint64_t> hostLinkSpeed =
-      numify<uint64_t>(strings::trim(speed.get()));
-
-    if (hostLinkSpeed.isError()) {
-      return Error(
-          "Failed to parse the link speed of " + eth0.get() +
-          ": " + hostLinkSpeed.error());
-    }
-
-    // It could be possible that the nic driver doesn't support
-    // reporting physical link speed. In that case, report error.
-    if (hostLinkSpeed.get() == 0xFFFFFFFF) {
-      return Error(
-          "Network Isolator failed to determine link speed for " + eth0.get());
-    }
-
-    // Convert host link speed to Bytes/s for comparison.
-    if (hostLinkSpeed.get() * 1000000 / 8 <
-        flags.egress_rate_limit_per_container.get().bytes()) {
-      return Error(
-          "The given egress traffic limit for containers " +
-          stringify(flags.egress_rate_limit_per_container.get().bytes()) +
-          " Bytes/s is greater than the host link speed " +
-          stringify(hostLinkSpeed.get() * 1000000 / 8) + " Bytes/s");
-    }
-
-    if (flags.egress_rate_limit_per_container.get() != Bytes(0)) {
-      egressRateLimitPerContainer = flags.egress_rate_limit_per_container.get();
-    } else {
-      LOG(WARNING) << "Ignoring the given zero egress rate limit";
-    }
-  }
-
-  // Get the host IP network, MAC and default gateway.
-  Result<net::IPNetwork> hostIPNetwork =
-    net::IPNetwork::fromLinkDevice(eth0.get(), AF_INET);
-
-  if (!hostIPNetwork.isSome()) {
-    return Error(
-        "Failed to get the public IP network of " + eth0.get() + ": " +
-        (hostIPNetwork.isError() ?
-            hostIPNetwork.error() :
-            "does not have an IPv4 network"));
-  }
-
-  Result<net::MAC> hostMAC = net::mac(eth0.get());
-  if (!hostMAC.isSome()) {
-    return Error(
-        "Failed to get the MAC address of " + eth0.get() + ": " +
-        (hostMAC.isError() ? hostMAC.error() : "does not have a MAC address"));
-  }
-
-  Result<net::IP> hostDefaultGateway = route::defaultGateway();
-  if (!hostDefaultGateway.isSome()) {
-    return Error(
-        "Failed to get the default gateway of the host: " +
-        (hostDefaultGateway.isError() ? hostDefaultGateway.error()
-        : "The default gateway of the host does not exist"));
-  }
-
-  // Set the MAC address of the host loopback interface (lo) so that
-  // it matches that of the host public interface (eth0). A fairly
-  // recent kernel patch is needed for this operation to succeed:
-  // https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/:
-  // 25f929fbff0d1bcebf2e92656d33025cd330cbf8
-  Try<bool> setHostLoMAC = link::setMAC(lo.get(), hostMAC.get());
-  if (setHostLoMAC.isError()) {
-    return Error(
-        "Failed to set the MAC address of " + lo.get() +
-        ": " + setHostLoMAC.error());
-  }
-
-  // Set the MTU of the host loopback interface (lo) so that it
-  // matches that of the host public interface (eth0).
-  Result<unsigned int> hostEth0MTU = link::mtu(eth0.get());
-  if (hostEth0MTU.isError()) {
-    return Error(
-        "Failed to get the MTU of " + eth0.get() +
-        ": " + hostEth0MTU.error());
-  }
-
-  // The host public interface should exist since we just checked it.
-  CHECK_SOME(hostEth0MTU);
-
-  Try<bool> setHostLoMTU = link::setMTU(lo.get(), hostEth0MTU.get());
-  if (setHostLoMTU.isError()) {
-    return Error(
-        "Failed to set the MTU of " + lo.get() +
-        ": " + setHostLoMTU.error());
-  }
-
-  // Prepare the ingress queueing disciplines on host public interface
-  // (eth0) and host loopback interface (lo).
-  Try<bool> createHostEth0IngressQdisc = ingress::create(eth0.get());
-  if (createHostEth0IngressQdisc.isError()) {
-    return Error(
-        "Failed to create the ingress qdisc on " + eth0.get() +
-        ": " + createHostEth0IngressQdisc.error());
-  }
-
-  set<uint16_t> freeFlowIds;
-  if (flags.egress_unique_flow_per_container) {
-    // Prepare a fq_codel queueing discipline on host public interface
-    // (eth0) for egress flow classification.
-    //
-    // TODO(cwang): Maybe we can continue when some other egress qdisc
-    // exists because this is not a necessary qdisc for network
-    // isolation, but we don't want inconsistency, so we just fail in
-    // this case. See details in MESOS-2370.
-    Try<bool> createHostEth0EgressQdisc = fq_codel::create(
-        eth0.get(),
-        EGRESS_ROOT,
-        HOST_TX_FQ_CODEL_HANDLE);
-    if (createHostEth0EgressQdisc.isError()) {
-      return Error(
-          "Failed to create the egress qdisc on " + eth0.get() +
-          ": " + createHostEth0EgressQdisc.error());
-    }
-
-    // TODO(cwang): Make sure DEFAULT_FLOWS is large enough so that
-    // it's unlikely to run out of free flow IDs.
-    for (uint16_t i = CONTAINER_MIN_FLOWID; i < fq_codel::DEFAULT_FLOWS; i++) {
-      freeFlowIds.insert(i);
-    }
-  }
-
-  Try<bool> createHostLoQdisc = ingress::create(lo.get());
-  if (createHostLoQdisc.isError()) {
-    return Error(
-        "Failed to create the ingress qdisc on " + lo.get() +
-        ": " + createHostLoQdisc.error());
-  }
-
-  // Enable 'route_localnet' on host loopback interface (lo). This
-  // enables the use of 127.0.0.1/8 for local routing purpose. This
-  // feature only exists on kernel 3.6 or newer.
-  const string loRouteLocalnet =
-    path::join("/proc/sys/net/ipv4/conf", lo.get(), "route_localnet");
-
-  if (!os::exists(loRouteLocalnet)) {
-    // TODO(jieyu): Consider supporting running the isolator if this
-    // feature is not available. We need to conditionally disable
-    // routing for 127.0.0.1/8, and ask the tasks to use the public IP
-    // for container to container and container to host communication.
-    return Error("The kernel does not support 'route_localnet'");
-  }
-
-  Try<Nothing> write = os::write(loRouteLocalnet, "1");
-  if (write.isError()) {
-    return Error(
-        "Failed to enable route_localnet for " + lo.get() +
-        ": " + write.error());
-  }
-
-  // We disable 'rp_filter' and 'send_redirects' for host loopback
-  // interface (lo) to work around a kernel bug, which was only
-  // recently addressed in upstream in the following 3 commits.
-  // https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/:
-  // 6a662719c9868b3d6c7d26b3a085f0cd3cc15e64
-  // 0d5edc68739f1c1e0519acbea1d3f0c1882a15d7
-  // e374c618b1465f0292047a9f4c244bd71ab5f1f0
-  // The workaround ensures packets don't get dropped at lo.
-  write = os::write("/proc/sys/net/ipv4/conf/all/rp_filter", "0");
-  if (write.isError()) {
-    return Error(
-        "Failed to disable rp_filter for all: " + write.error());
-  }
-
-  write = os::write(path::join(
-      "/proc/sys/net/ipv4/conf", lo.get(), "rp_filter"), "0");
-  if (write.isError()) {
-    return Error(
-        "Failed to disable rp_filter for " + lo.get() +
-        ": " + write.error());
-  }
-
-  write = os::write("/proc/sys/net/ipv4/conf/all/send_redirects", "0");
-  if (write.isError()) {
-    return Error(
-        "Failed to disable send_redirects for all: " + write.error());
-  }
-
-  write = os::write(path::join(
-      "/proc/sys/net/ipv4/conf", lo.get(), "send_redirects"), "0");
-  if (write.isError()) {
-    return Error(
-        "Failed to disable send_redirects for " + lo.get() +
-        ": " + write.error());
-  }
-
-  // We need to enable accept_local on host loopback interface (lo)
-  // for kernels older than 3.6. Refer to the following:
-  // https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/:
-  // 7a9bc9b81a5bc6e44ebc80ef781332e4385083f2
-  // https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt
-  write = os::write(path::join(
-      "/proc/sys/net/ipv4/conf", lo.get(), "accept_local"), "1");
-  if (write.isError()) {
-    return Error(
-        "Failed to enable accept_local for " + lo.get() +
-        ": " + write.error());
-  }
-
-  // Reading host network configurations. Each container will match
-  // these configurations.
-  hashset<string> procs;
-
-  // TODO(jieyu): The following is a partial list of all the
-  // configurations. In the future, we may want to expose these
-  // configurations using ContainerInfo.
-
-  // The kernel will use a default value for the following
-  // configurations inside a container. Therefore, we need to set them
-  // in the container to match that on the host.
-  procs.insert("/proc/sys/net/core/somaxconn");
-
-  // As of kernel 3.10, the following configurations are shared
-  // between host and containers, and therefore are not required to be
-  // set in containers. We keep them here just in case the kernel
-  // changes in the future.
-  procs.insert("/proc/sys/net/core/netdev_max_backlog");
-  procs.insert("/proc/sys/net/core/rmem_max");
-  procs.insert("/proc/sys/net/core/wmem_max");
-  procs.insert("/proc/sys/net/ipv4/tcp_keepalive_time");
-  procs.insert("/proc/sys/net/ipv4/tcp_keepalive_intvl");
-  procs.insert("/proc/sys/net/ipv4/tcp_keepalive_probes");
-  procs.insert("/proc/sys/net/ipv4/tcp_max_syn_backlog");
-  procs.insert("/proc/sys/net/ipv4/tcp_rmem");
-  procs.insert("/proc/sys/net/ipv4/tcp_retries2");
-  procs.insert("/proc/sys/net/ipv4/tcp_synack_retries");
-  procs.insert("/proc/sys/net/ipv4/tcp_wmem");
-  procs.insert("/proc/sys/net/ipv4/neigh/default/gc_thresh1");
-  procs.insert("/proc/sys/net/ipv4/neigh/default/gc_thresh2");
-  procs.insert("/proc/sys/net/ipv4/neigh/default/gc_thresh3");
-
-  // NOTE: Configurations that cannot be read on the host are
-  // silently skipped.
-  hashmap<string, string> hostNetworkConfigurations;
-  foreach (const string& proc, procs) {
-    Try<string> value = os::read(proc);
-    if (value.isSome()) {
-      LOG(INFO) << proc << " = '" << strings::trim(value.get()) << "'";
-      hostNetworkConfigurations[proc] = strings::trim(value.get());
-    }
-  }
-
-  // Self bind mount PORT_MAPPING_BIND_MOUNT_ROOT(). Since we use a
-  // new mount namespace for each container, for this mount point, we
-  // set '--make-rshared' on the host and set '--make-rslave' inside
-  // each container. This is important because when we unmount the
-  // network namespace handles on the host, those handles will be
-  // unmounted in the containers as well, but NOT vice versa.
-
-  // We first create the bind mount directory if it does not exist.
-  Try<Nothing> mkdir = os::mkdir(PORT_MAPPING_BIND_MOUNT_ROOT());
-  if (mkdir.isError()) {
-    return Error(
-        "Failed to create the bind mount root directory at " +
-        PORT_MAPPING_BIND_MOUNT_ROOT() + ": " + mkdir.error());
-  }
-
-  // Now, check '/proc/mounts' to see if
-  // PORT_MAPPING_BIND_MOUNT_ROOT() has already been self mounted.
-  Try<fs::MountTable> mountTable = fs::MountTable::read("/proc/mounts");
-  if (mountTable.isError()) {
-    return Error(
-        "Failed to read the mount table at '/proc/mounts': " +
-        mountTable.error());
-  }
-
-  Option<fs::MountTable::Entry> bindMountRoot;
-  foreach (const fs::MountTable::Entry& entry, mountTable.get().entries) {
-    if (entry.dir == PORT_MAPPING_BIND_MOUNT_ROOT()) {
-      bindMountRoot = entry;
-    }
-  }
-
-  // Self bind mount PORT_MAPPING_BIND_MOUNT_ROOT().
-  if (bindMountRoot.isNone()) {
-    // NOTE: Instead of using fs::mount to perform the bind mount, we
-    // use the shell command here because the syscall 'mount' does not
-    // update the mount table (i.e., /etc/mtab), which could cause
-    // issues for the shell command 'mount --make-rslave' inside the
-    // container. It's OK to use the blocking os::shell here because
-    // 'create' will only be invoked during initialization.
-    Try<string> mount = os::shell(
-        "mount --bind %s %s",
-        PORT_MAPPING_BIND_MOUNT_ROOT().c_str(),
-        PORT_MAPPING_BIND_MOUNT_ROOT().c_str());
-
-    if (mount.isError()) {
-      return Error(
-          "Failed to self bind mount '" + PORT_MAPPING_BIND_MOUNT_ROOT() +
-          "': " + mount.error());
-    }
-  }
-
-  // Mark the mount point PORT_MAPPING_BIND_MOUNT_ROOT() as
-  // recursively shared.
-  Try<string> mountShared = os::shell(
-      "mount --make-rshared %s",
-      PORT_MAPPING_BIND_MOUNT_ROOT().c_str());
-
-  if (mountShared.isError()) {
-    return Error(
-        "Failed to mark '" + PORT_MAPPING_BIND_MOUNT_ROOT() +
-        "' as recursively shared: " + mountShared.error());
-  }
-
-  // Create the network namespace handle symlink directory if it does
-  // not exist. It is used to host network namespace handle symlinks
-  // whose basename is a container ID. This allows us to recover
-  // container IDs for orphan containers (i.e., not known by the
-  // slave). This is introduced in 0.23.0.
-  mkdir = os::mkdir(PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT());
-  if (mkdir.isError()) {
-    return Error(
-        "Failed to create the bind mount symlink root directory at " +
-        PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT() + ": " + mkdir.error());
-  }
-
-  return new MesosIsolator(Owned<MesosIsolatorProcess>(
-      new PortMappingIsolatorProcess(
-          flags,
-          eth0.get(),
-          lo.get(),
-          hostMAC.get(),
-          hostIPNetwork.get(),
-          hostEth0MTU.get(),
-          hostDefaultGateway.get(),
-          hostNetworkConfigurations,
-          egressRateLimitPerContainer,
-          nonEphemeralPorts,
-          ephemeralPortsAllocator,
-          freeFlowIds)));
-}
-
-
-// Recovers the state of the isolator, e.g., after a slave restart.
-//
-// The recovery proceeds in the following steps:
-//   1) Collect the pids encoded in the names of the veth links that
-//      are present on the host.
-//   2) Remove (best effort) the network namespace handles under the
-//      bind mount root whose pid has no associated veth.
-//   3) Scan the bind mount symlink root to build a mapping from pids
-//      to container IDs, removing (best effort) stale and duplicated
-//      symlinks along the way.
-//   4) Recover an 'Info' for each container in 'states' that has a
-//      veth. Containers without a veth are recorded as unmanaged.
-//   5) Recover all remaining pids as orphans. An orphan whose
-//      container ID can be derived from a unique symlink and is in
-//      'orphans' is kept in 'infos'; every other orphan is cleaned
-//      up immediately.
-//
-// Returns a Failure if the links or the bind mount directories
-// cannot be listed, if a handle or symlink cannot be parsed, or if
-// recovering (or cleaning up) any container fails.
-Future<Nothing> PortMappingIsolatorProcess::recover(
-    const list<ContainerState>& states,
-    const hashset<ContainerID>& orphans)
-{
-  // Extract pids from virtual device names (veth). This tells us
-  // about all the potential live containers on this slave.
-  Try<set<string>> links = net::links();
-  if (links.isError()) {
-    return Failure("Failed to get all the links: " + links.error());
-  }
-
-  hashset<pid_t> pids;
-  foreach (const string& name, links.get()) {
-    Option<pid_t> pid = getPidFromVeth(name);
-    // Not all links follow the naming: mesos{pid}, so we simply
-    // continue, e.g., eth0.
-    if (pid.isNone()) {
-      continue;
-    } else if (pids.contains(pid.get())) {
-      return Failure("Two virtual devices have the same name '" + name + "'");
-    }
-
-    pids.insert(pid.get());
-  }
-
-  // Scan the bind mount root to cleanup all stale network namespace
-  // handles that do not have an active veth associated with.
-  Try<list<string>> entries = os::ls(PORT_MAPPING_BIND_MOUNT_ROOT());
-  if (entries.isError()) {
-    return Failure(
-        "Failed to list bind mount root '" +
-        PORT_MAPPING_BIND_MOUNT_ROOT() +
-        "': " + entries.error());
-  }
-
-  foreach (const string& entry, entries.get()) {
-    const string path = path::join(PORT_MAPPING_BIND_MOUNT_ROOT(), entry);
-
-    // NOTE: We expect all regular files whose names are numbers under
-    // the bind mount root are network namespace handles.
-    Result<pid_t> pid = getPidFromNamespaceHandle(path);
-    if (pid.isError()) {
-      return Failure(
-          "Failed to get pid from network namespace handle '" +
-          path + "': " + pid.error());
-    } else if (pid.isNone()) {
-      // We ignore files that are clearly not network namespace
-      // handles created by us. It's likely that those are created by
-      // users or other tools.
-      LOG(WARNING) << "Unrecognized network namespace handle '" << path << "'";
-      continue;
-    }
-
-    // We cleanup the network namespace handle if the associated
-    // containers have clearly exited (i.e., the veth has gone). The
-    // cleanup here is best effort.
-    if (!pids.contains(pid.get())) {
-      LOG(INFO) << "Removing stale network namespace handle '" << path << "'";
-
-      Try<Nothing> unmount = fs::unmount(path, MNT_DETACH);
-      if (unmount.isError()) {
-        LOG(WARNING) << "Failed to unmount stale network namespace handle '"
-                     << path << "': " << unmount.error();
-      }
-
-      Try<Nothing> rm = os::rm(path);
-      if (rm.isError()) {
-        LOG(WARNING) << "Failed to remove stale network namespace handle '"
-                     << path << "': " << rm.error();
-      }
-    }
-  }
-
-  // Scan the bind mount symlink root for container IDs. This allows us
-  // to recover container IDs for orphan containers (i.e., not known
-  // by the slave). This is introduced in 0.23.0.
-  entries = os::ls(PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT());
-  if (entries.isError()) {
-    return Failure(
-        "Failed to list bind mount symlink root '" +
-        PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT() +
-        "': " + entries.error());
-  }
-
-  // This map stores the mapping between pids and container IDs
-  // recovered from the bind mount root that have valid veth links. We
-  // use a multihashmap here because multiple container IDs can map to
-  // the same pid if the removal of a symlink fails in '_cleanup()'
-  // and the pid is reused by a new container.
-  multihashmap<pid_t, ContainerID> linkers;
-
-  foreach (const string& entry, entries.get()) {
-    const string path =
-      path::join(PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT(), entry);
-
-    // We only create symlinks in this directory and assume
-    // non-symlink files are created by other users or tools,
-    // therefore will be ignored.
-    if (!os::stat::islink(path)) {
-      LOG(WARNING) << "Ignored non-symlink file '" << path
-                   << "' under bind mount symlink root '"
-                   << PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT() << "'";
-      continue;
-    }
-
-    // NOTE: We expect all symlinks under the bind mount symlink root
-    // to be container ID symlinks.
-
-    Try<ContainerID> containerId = getContainerIdFromSymlink(path);
-    if (containerId.isError()) {
-      return Failure(
-          "Failed to get container ID from network namespace handle symlink '" +
-          path + "': " + containerId.error());
-    }
-
-    Result<pid_t> pid = getPidFromSymlink(path);
-    if (pid.isError()) {
-      return Failure(
-          "Failed to get pid from network namespace handle symlink '" + path +
-          "': " + pid.error());
-    }
-
-    // We remove the symlink if it's dangling or the associated
-    // containers have clearly exited (i.e., the veth has gone). The
-    // cleanup here is best effort.
-    if (pid.isNone() || !pids.contains(pid.get())) {
-      LOG(INFO) << "Removing stale network namespace handle symlink '"
-                << path << "'";
-
-      Try<Nothing> rm = os::rm(path);
-      if (rm.isError()) {
-        LOG(WARNING) << "Failed to remove stale network namespace handle "
-                     << "symlink '" << path << "': " << rm.error();
-      }
-    } else {
-      LOG(INFO) << "Discovered network namespace handle symlink "
-                << containerId.get() << " -> " << pid.get();
-
-      linkers.put(pid.get(), containerId.get());
-    }
-  }
-
-  // If multiple container IDs point to the same pid, we remove both
-  // symlinks for safety (as if we cannot derive the container ID for
-  // orphans, which is OK because it'll be treated the same as those
-  // containers that are created by older (pre 0.23.0) versions). Note
-  // that it's possible that multiple container IDs map to the same
-  // pid if the removal of a symlink fails in '_cleanup()' and the pid
-  // is reused by a new container.
-  foreach (pid_t pid, linkers.keys()) {
-    list<ContainerID> containerIds = linkers.get(pid);
-    if (containerIds.size() > 1) {
-      foreach (const ContainerID& containerId, containerIds) {
-        const string linker = getSymlinkPath(containerId);
-
-        LOG(WARNING) << "Removing duplicated network namespace handle symlink '"
-                     << linker << "'";
-
-        Try<Nothing> rm = os::rm(linker);
-        if (rm.isError()) {
-          LOG(WARNING) << "Failed to remove duplicated network namespace "
-                       << "handle symlink '" << linker << "': " << rm.error();
-        }
-      }
-
-      linkers.remove(pid);
-    }
-  }
-
-  // Now, actually recover the isolator from slave's state.
-  foreach (const ContainerState& state, states) {
-    const ContainerID& containerId = state.container_id();
-    pid_t pid = state.pid();
-
-    VLOG(1) << "Recovering network isolator for container "
-            << containerId << " with pid " << pid;
-
-    if (!pids.contains(pid)) {
-      // There are two possible cases here:
-      //
-      // 1) The container was launched by the slave with network
-      //    isolation disabled, so the pid could not be found in the
-      //    device names in the system.
-      //
-      // 2) The container was launched by the slave with network
-      //    isolation enabled, but veth is removed (because the
-      //    corresponding container is destroyed), but the slave
-      //    restarts before it is able to write the sentinel file.
-      //
-      // In both cases, we treat the container as unmanaged. For case
-      // (2), it's safe to do so because the container has already
-      // been destroyed.
-      VLOG(1) << "Skipped recovery for container " << containerId
-              << " with pid " << pid << " as either it was not managed by "
-              << "the network isolator or it has already been destroyed";
-
-      unmanaged.insert(containerId);
-      continue;
-    }
-
-    Try<Info*> recover = _recover(pid);
-    if (recover.isError()) {
-      // Release everything recovered so far before bailing out.
-      foreachvalue (Info* info, infos) {
-        delete info;
-      }
-
-      return Failure(
-          "Failed to recover container " + stringify(containerId) +
-          " with pid " + stringify(pid) + ": " + recover.error());
-    }
-
-    infos[containerId] = recover.get();
-
-    // Remove the successfully recovered pid.
-    pids.erase(pid);
-  }
-
-  // Recover orphans. Known orphans will be destroyed by containerizer
-  // using the normal cleanup path (refer to MESOS-2367 for details).
-  // Unknown orphans will be cleaned up immediately. The recovery will
-  // fail if there is some unknown orphan that cannot be cleaned up.
-  vector<Info*> unknownOrphans;
-
-  // At this point 'pids' only contains the pids that have a veth but
-  // were not matched by any container in 'states'.
-  foreach (pid_t pid, pids) {
-    Try<Info*> recover = _recover(pid);
-    if (recover.isError()) {
-      foreachvalue (Info* info, infos) {
-        delete info;
-      }
-      foreach (Info* info, unknownOrphans) {
-        delete info;
-      }
-
-      return Failure(
-          "Failed to recover orphaned container with pid " +
-          stringify(pid) + ": " + recover.error());
-    }
-
-    // The container ID of an orphan is only trusted if exactly one
-    // symlink points to its pid.
-    if (linkers.get(pid).size() == 1) {
-      const ContainerID containerId = linkers.get(pid).front();
-      CHECK(!infos.contains(containerId));
-
-      if (orphans.contains(containerId)) {
-        infos[containerId] = recover.get();
-        continue;
-      }
-    }
-
-    unknownOrphans.push_back(recover.get());
-  }
-
-  foreach (Info* info, unknownOrphans) {
-    CHECK_SOME(info->pid);
-    pid_t pid = info->pid.get();
-
-    Option<ContainerID> containerId;
-    if (linkers.get(pid).size() == 1) {
-      containerId = linkers.get(pid).front();
-    }
-
-    // NOTE: If 'infos' is empty (means there is no regular container
-    // or known orphan), the '_cleanup' below will remove the ICMP and
-    // ARP packet filters on host eth0. This will cause subsequent
-    // calls to '_cleanup' for unknown orphans to fail. However, this
-    // is OK because when slave restarts and tries to recover again,
-    // it'll try to remove the remaining unknown orphans.
-    // TODO(jieyu): Consider call '_cleanup' for all the unknown
-    // orphans before returning even if error occurs.
-    Try<Nothing> cleanup = _cleanup(info, containerId);
-    if (cleanup.isError()) {
-      foreachvalue (Info* info, infos) {
-        delete info;
-      }
-
-      // TODO(jieyu): Also delete 'info' in unknownOrphans. Notice
-      // that some 'info' in unknownOrphans might have already been
-      // deleted in '_cleanup' above.
-
-      return Failure(
-          "Failed to cleanup orphaned container with pid " +
-          stringify(pid) + ": " + cleanup.error());
-    }
-  }
-
-  // TODO(cwang): Consider removing unrecognized flow classifiers from
-  // host eth0 egress.
-
-  LOG(INFO) << "Network isolator recovery complete";
-
-  return Nothing();
-}
-
-
-// Rebuilds the 'Info' of the container whose veth is named after
-// 'pid' from the IP filters found on that veth and, when
-// 'egress_unique_flow_per_container' is set, from the flow
-// classifiers found on host eth0 egress. As side effects, the
-// recovered ephemeral port range is marked as allocated in the
-// ephemeral ports allocator and the recovered flow ID (if any) is
-// removed from 'freeFlowIds'.
-//
-// Returns an Error if the filters cannot be read or if they are not
-// consistent with the layout expected here. On success, the returned
-// 'Info' is allocated with 'new'.
-Try<PortMappingIsolatorProcess::Info*>
-PortMappingIsolatorProcess::_recover(pid_t pid)
-{
-  // NOTE: The port ranges assigned to a container are recovered from
-  // its veth only. This is the reason why filters have to be added to
-  // veth before they are added to host eth0 and host lo, and why they
-  // have to be removed from host eth0 and host lo before they are
-  // removed from veth.
-  Result<vector<ip::Classifier>> classifiers =
-    ip::classifiers(veth(pid), ingress::HANDLE);
-
-  if (classifiers.isError()) {
-    return Error(
-        "Failed to get all the IP filters on " + veth(pid) +
-        ": " + classifiers.error());
-  }
-
-  if (classifiers.isNone()) {
-    return Error(
-        "Failed to get all the IP filters on " + veth(pid) +
-        ": link does not exist");
-  }
-
-  // Port range to flow ID mapping built from host eth0 egress. It
-  // stays empty if unique flows per container are not enabled.
-  hashmap<PortRange, uint16_t> flowIds;
-
-  if (flags.egress_unique_flow_per_container) {
-    Result<vector<filter::Filter<ip::Classifier>>> egressFilters =
-      ip::filters(eth0, HOST_TX_FQ_CODEL_HANDLE);
-
-    if (egressFilters.isError()) {
-      return Error(
-          "Failed to get all the IP flow classifiers on " + eth0 +
-          ": " + egressFilters.error());
-    }
-
-    if (egressFilters.isNone()) {
-      return Error(
-          "Failed to get all the IP flow classifiers on " + eth0 +
-          ": link does not exist");
-    }
-
-    for (const filter::Filter<ip::Classifier>& egressFilter :
-         egressFilters.get()) {
-      const Option<PortRange> range = egressFilter.classifier.sourcePorts;
-      const Option<Handle> handle = egressFilter.classid;
-
-      if (range.isNone()) {
-        return Error("Missing source ports for filters on egress of " + eth0);
-      }
-
-      if (handle.isNone()) {
-        return Error("Missing classid for filters on egress of " + eth0);
-      }
-
-      if (flowIds.contains(range.get())) {
-        return Error(
-            "Duplicated port range " + stringify(range.get()) +
-            " detected on egress of " + eth0);
-      }
-
-      flowIds[range.get()] = handle.get().secondary();
-    }
-  }
-
-  IntervalSet<uint16_t> recoveredNonEphemeral;
-  IntervalSet<uint16_t> recoveredEphemeral;
-  Option<uint16_t> flowId;
-
-  for (const ip::Classifier& classifier : classifiers.get()) {
-    const Option<PortRange> source = classifier.sourcePorts;
-    const Option<PortRange> destination = classifier.destinationPorts;
-
-    // The IP filters we install on veth match on source ports only.
-    if (source.isNone() || destination.isSome()) {
-      return Error("Unexpected IP filter detected on " + veth(pid));
-    }
-
-    if (!flowIds.contains(source.get())) {
-      if (flowId.isSome()) {
-        // Some port ranges of this container are assigned to a flow
-        // while this one is not. This could happen if the slave
-        // crashes while those filters are being created. It is OK
-        // because packets go to the host flow by default.
-        LOG(WARNING) << "Container port range " << source.get()
-                     << " does not have flow id " << flowId.get()
-                     << " assigned";
-      }
-    } else if (flowId.isNone()) {
-      flowId = flowIds.get(source.get());
-    } else if (flowId != flowIds.get(source.get())) {
-      return Error(
-          "A container is associated with multiple flows "
-          "on egress of " + eth0);
-    }
-
-    Interval<uint16_t> interval =
-      (Bound<uint16_t>::closed(source.get().begin()),
-       Bound<uint16_t>::closed(source.get().end()));
-
-    if (managedNonEphemeralPorts.contains(interval)) {
-      recoveredNonEphemeral += interval;
-      continue;
-    }
-
-    if (!ephemeralPortsAllocator->isManaged(interval)) {
-      return Error("Unexpected IP filter detected on " + veth(pid));
-    }
-
-    // Seeing the same ephemeral port range twice is expected because
-    // there are two IP filters for it (one for eth0 and one for lo).
-    // Two distinct but intersecting ranges are never expected.
-    if (!recoveredEphemeral.contains(interval) &&
-        recoveredEphemeral.intersects(interval)) {
-      return Error("Unexpected intersected ephemeral port ranges");
-    }
-
-    recoveredEphemeral += interval;
-  }
-
-  Info* info = NULL;
-
-  if (!recoveredEphemeral.empty()) {
-    if (recoveredEphemeral.intervalCount() != 1) {
-      return Error("Each container should have only one ephemeral port range");
-    }
-
-    // Let the allocator know that this ephemeral port range is taken.
-    ephemeralPortsAllocator->allocate(*recoveredEphemeral.begin());
-
-    info = new Info(recoveredNonEphemeral, *recoveredEphemeral.begin(), pid);
-
-    VLOG(1) << "Recovered network isolator for container with pid " << pid
-            << " non-ephemeral port ranges " << recoveredNonEphemeral
-            << " and ephemeral port range " << *recoveredEphemeral.begin();
-  } else {
-    // NOTE: The slave may crash in the middle of 'isolate()' and leave
-    // a partially isolated container behind. An Info struct is still
-    // created for such a container so that the 'cleanup' function can
-    // clean it up later.
-    LOG(WARNING) << "No ephemeral ports found for container with pid "
-                 << stringify(pid) << ". This could happen if slave crashes "
-                 << "while isolating a container";
-
-    info = new Info(recoveredNonEphemeral, Interval<uint16_t>(), pid);
-  }
-
-  if (flowId.isSome()) {
-    freeFlowIds.erase(flowId.get());
-    info->flowId = flowId.get();
-  }
-
-  return CHECK_NOTNULL(info);
-}
-
-
-// Prepares the network isolation for a new container.
-//
-// Validates that the non-ephemeral ports requested in the executor's
-// resources are managed by the slave, allocates an ephemeral port
-// range for the container, and records both in a new 'Info' stored
-// in 'infos'. The returned ContainerPrepareInfo carries the command
-// produced by 'scripts()' and asks for new network and mount
-// namespaces.
-//
-// Returns a Failure if the container is unmanaged, has already been
-// prepared, asks for non-ephemeral ports that are not managed by the
-// slave, or if no ephemeral ports can be allocated. The 'directory'
-// and 'user' parameters are not used here.
-Future<Option<ContainerPrepareInfo>> PortMappingIsolatorProcess::prepare(
-    const ContainerID& containerId,
-    const ExecutorInfo& executorInfo,
-    const string& directory,
-    const Option<string>& user)
-{
-  if (unmanaged.contains(containerId)) {
-    return Failure("Asked to prepare an unmanaged container");
-  }
-
-  if (infos.contains(containerId)) {
-    return Failure("Container has already been prepared");
-  }
-
-  Resources resources(executorInfo.resources());
-
-  IntervalSet<uint16_t> nonEphemeralPorts;
-
-  if (resources.ports().isSome()) {
-    nonEphemeralPorts = getIntervalSet(resources.ports().get());
-
-    // Sanity check to make sure that the assigned non-ephemeral ports
-    // for the container are part of the non-ephemeral ports specified
-    // by the slave.
-    if (!managedNonEphemeralPorts.contains(nonEphemeralPorts)) {
-      return Failure(
-          "Some non-ephemeral ports specified in " +
-          stringify(nonEphemeralPorts) +
-          " are not managed by the slave");
-    }
-  }
-
-  // TODO(jieyu): For now, we simply ignore the 'ephemeral_ports'
-  // specified in the executor info. However, this behavior needs to
-  // be changed once the master can make default allocations for
-  // ephemeral ports.
-  if (resources.ephemeral_ports().isSome()) {
-    LOG(WARNING) << "Ignoring the specified ephemeral_ports '"
-                 << resources.ephemeral_ports().get()
-                 << "' for container " << containerId
-                 << " of executor " << executorInfo.executor_id();
-  }
-
-  // Allocate the ephemeral ports used by this container.
-  Try<Interval<uint16_t>> ephemeralPorts = ephemeralPortsAllocator->allocate();
-  if (ephemeralPorts.isError()) {
-    return Failure(
-        "Failed to allocate ephemeral ports: " + ephemeralPorts.error());
-  }
-
-  // NOTE: The pid of the container is not known yet, so the 'Info' is
-  // created without one. It is filled in later by 'isolate()'.
-  infos[containerId] = new Info(nonEphemeralPorts, ephemeralPorts.get());
-
-  LOG(INFO) << "Using non-ephemeral ports " << nonEphemeralPorts
-            << " and ephemeral ports " << ephemeralPorts.get()
-            << " for container " << containerId << " of executor "
-            << executorInfo.executor_id();
-
-  ContainerPrepareInfo prepareInfo;
-  prepareInfo.add_commands()->set_value(scripts(infos[containerId]));
-
-  // NOTE: the port mapping isolator itself doesn't require mount
-  // namespace. However, if mount namespace is enabled because of
-  // other isolators, we need to set mount sharing accordingly for
-  // PORT_MAPPING_BIND_MOUNT_ROOT to avoid races described in
-  // MESOS-1558. So we turn on mount namespace here for consistency.
-  prepareInfo.set_namespaces(CLONE_NEWNET | CLONE_NEWNS);
-
-  return prepareInfo;
-}
-
-
-Future<Nothing> PortMappingIsolatorProcess::isolate(
- const ContainerID& containerId,
- pid_t pid)
-{
- if (unmanaged.contains(containerId)) {
- return Failure("Asked to isolate an unmanaged container");
- }
-
- if (!infos.contains(containerId)) {
- return Failure("Unknown container");
- }
-
- Info* info = CHECK_NOTNULL(infos[containerId]);
-
- if (info->pid.isSome()) {
- return Failure("The container has already been isolated");
- }
-
- info->pid = pid;
-
- if (flags.egress_unique_flow_per_container) {
- info->flowId = getNextFlowId();
- }
-
- // Bind mount the network namespace handle of the process 'pid' to a
- // directory to hold an extra reference to the network namespace
- // which will be released in 'cleanup'. By holding the extra
- // reference, the network namespace will not be destroyed even if
- // the process 'pid' is gone, which allows us to explicitly control
- // the network namespace life cycle.
- const string source = path::join("/proc", stringify(pid), "ns", "net");
- const string target = getNamespaceHandlePath(pid);
-
- Try<Nothing> touch = os::touch(target);
- if (touch.isError()) {
- return Failure("Failed to create the bind mount point: " + touch.error());
- }
-
- Try<Nothing> mount = fs::mount(source, target, None(), MS_BIND, NULL);
- if (mount.isError()) {
- return Failure(
- "Failed to mount the network namespace handle from '" +
- source + "' to '" + target + "': " + mount.error());
- }
-
- LOG(INFO) << "Bind mounted '" << source << "' to '" << target
- << "' for container " << containerId;
-
- // Since 0.23.0, we create a symlink to the network namespace handle
- // using the container ID. This serves two purposes. First, it
- // allows us to recover the container ID later when slave restarts
- // even if slave's checkpointed meta data is deleted. Second, it
- // makes the debugging easier. See MESOS-2528 for details.
- const string linker = getSymlinkPath(containerId);
- Try<Nothing> symlink = ::fs::symlink(target, linker);
- if (symlink.isError()) {
- return Failure(
- "Failed to symlink the network namespace handle '" +
- linker + "' -> '" + target + "': " + symlink.error());
- }
-
- LOG(INFO) << "Created network namespace handle symlink '"
- << linker << "' -> '" << target << "'";
-
- // Create a virtual ethernet pair for this container.
- Try<bool> createVethPair = link::create(veth(pid), eth0, pid);
- if (createVethPair.isError()) {
- return Failure(
- "Failed to create virtual ethernet pair: " +
- createVethPair.error());
- }
-
- // Disable IPv6 for veth as IPv6 packets won't be forwarded anyway.
- const string disableIPv6 =
- path::join("/proc/sys/net/ipv6/conf", veth(pid), "disable_ipv6");
-
- if (os::exists(disableIPv6)) {
- Try<Nothing> write = os::write(disableIPv6, "1");
- if (write.isError()) {
- return Failure(
- "Failed to disable IPv6 for " + veth(pid) +
- ": " + write.error());
- }
- }
-
- // Sets the MAC address of veth to match the MAC address of the host
- // public interface (eth0).
- Try<bool> setVethMAC = link::setMAC(veth(pid), hostMAC);
- if (setVethMAC.isError()) {
- return Failure(
- "Failed to set the MAC address of " + veth(pid) +
- ": " + setVethMAC.error());
- }
-
- // Prepare the ingress queueing disciplines on veth.
- Try<bool> createQdisc = ingress::create(veth(pid));
- if (createQdisc.isError()) {
- return Failure(
- "Failed to create the ingress qdisc on " + veth(pid) +
- ": " + createQdisc.error());
- }
-
- // Veth device should exist since we just created it.
- CHECK(createQdisc.get());
-
- // For each port range, add a set of IP packet filters to properly
- // redirect IP traffic to/from containers.
- foreach (const PortRange& range,
- getPortRanges(info->nonEphemeralPorts + info->ephemeralPorts)) {
- if (info->flowId.isSome()) {
- LOG(INFO) << "Adding IP packet filters with ports " << range
- << " with flow ID " << info->flowId.get()
- << " for container " << containerId;
- } else {
- LOG(INFO) << "Adding IP packet filters with ports " << range
- << " for container " << containerId;
- }
-
- Try<Nothing> add = addHostIPFilters(range, info->flowId, veth(pid));
- if (add.isError()) {
- return Failure(
- "Failed to add IP packet filter with ports " +
- stringify(range) + " for container with pid " +
- stringify(pid) + ": " + add.error());
- }
- }
-
- // Relay ICMP packets from veth of the container to host eth0.
- Try<bool> icmpVethToEth0 = filter::icmp::create(
- veth(pid),
- ingress::HANDLE,
- icmp::Classifier(None()),
- Priority(ICMP_FILTER_PRIORITY, NORMAL),
- action::Redirect(eth0));
-
- if (icmpVethToEth0.isError()) {
- ++metrics.adding_veth_icmp_filters_errors;
-
- return Failure(
- "Failed to create an ICMP packet filter from " + veth(pid) +
- " to host " + eth0 + ": " + icmpVethToEth0.error());
- } else if (!icmpVethToEth0.get()) {
- ++metrics.adding_veth_icmp_filters_already_exist;
-
- return Failure(
- "The ICMP packet filter from " + veth(pid) +
- " to host " + eth0 + " already exists");
- }
-
- // Relay ARP packets from veth of the container to host eth0.
- Try<bool> arpVethToEth0 = filter::basic::create(
- veth(pid),
- ingress::HANDLE,
- ETH_P_ARP,
- Priority(ARP_FILTER_PRIORITY, NORMAL),
- action::Redirect(eth0));
-
- if (arpVethToEth0.isError()) {
- ++metrics.adding_veth_arp_filters_errors;
-
- return Failure(
- "Failed to create an ARP packet filter from " + veth(pid) +
- " to host " + eth0 + ": " + arpVethToEth0.error());
- } else if (!arpVethToEth0.get()) {
- ++metrics.adding_veth_arp_filters_already_exist;
-
- return Failure(
- "The ARP packet filter from " + veth(pid) +
- " to host " + eth0 + " already exists");
- }
-
- // Setup filters for ICMP and ARP packets. We mirror ICMP and ARP
- // packets from host eth0 to veths of all the containers. We also
- // setup flow classifiers for host eth0 egress.
- set<string> targets;
- foreachvalue (Info* info, infos) {
- if (info->pid.isSome()) {
- targets.insert(veth(info->pid.get()));
- }
- }
-
- if (targets.size() == 1) {
- // We just create the first container in which case we should
- // create filters for ICMP and ARP packets.
-
- // Create a new ICMP filter on host eth0 ingress for mirroring
- // packets from host eth0 to veth.
- Try<bool> icmpEth0ToVeth = filter::icmp::create(
- eth0,
- ingress::HANDLE,
- icmp::Classifier(hostIPNetwork.address()),
- Priority(ICMP_FILTER_PRIORITY, NORMAL),
- action::Mirror(targets));
-
- if (icmpEth0ToVeth.isError()) {
- ++metrics.adding_eth0_icmp_filters_errors;
-
- return Failure(
- "Failed to create an ICMP packet filter from host " + eth0 +
- " to " + veth(pid) + ": " + icmpEth0ToVeth.error());
- } else if (!icmpEth0ToVeth.get()) {
- ++metrics.adding_eth0_icmp_filters_already_exist;
-
- return Failure(
- "The ICMP packet filter on host " + eth0 + " already exists");
- }
-
- // Create a new ARP filter on host eth0 ingress for mirroring
- // packets from host eth0 to veth.
- Try<bool> arpEth0ToVeth = filter::basic::create(
- eth0,
- ingress::HANDLE,
- ETH_P_ARP,
- Priority(ARP_FILTER_PRIORITY, NORMAL),
- action::Mirror(targets));
-
- if (arpEth0ToVeth.isError()) {
- ++metrics.adding_eth0_arp_filters_errors;
-
- return Failure(
- "Failed to create an ARP packet filter from host " + eth0 +
- " to " + veth(pid) + ": " + arpEth0ToVeth.error());
- } else if (!arpEth0ToVeth.get()) {
- ++metrics.adding_eth0_arp_filters_already_exist;
-
- return Failure(
- "The ARP packet filter on host " + eth0 + " already exists");
- }
-
- if (flags.egress_unique_flow_per_container) {
- // Create a new ICMP filter on host eth0 egress for classifying
- // packets into a reserved flow.
- Try<bool> icmpEth0Egress = filter::icmp::create(
- eth0,
- HOST_TX_FQ_CODEL_HANDLE,
- icmp::Classifier(None()),
- Priority(ICMP_FILTER_PRIORITY, NORMAL),
- Handle(HOST_TX_FQ_CODEL_HANDLE, ICMP_FLOWID));
-
- if (icmpEth0Egress.isError()) {
- ++metrics.adding_eth0_egress_filters_errors;
-
- return Failure(
- "Failed to create the ICMP flow classifier on host " +
- eth0 + ": " + icmpEth0Egress.error());
- } else if (!icmpEth0Egress.get()) {
- ++metrics.adding_eth0_egress_filters_already_exist;
-
- return Failure(
- "The ICMP flow classifier on host " + eth0 + " already exists");
- }
-
- // Create a new ARP filter on host eth0 egress for classifying
- // packets into a reserved flow.
- Try<bool> arpEth0Egress = filter::basic::create(
- eth0,
- HOST_TX_FQ_CODEL_HANDLE,
- ETH_P_ARP,
- Priority(ARP_FILTER_PRIORITY, NORMAL),
- Handle(HOST_TX_FQ_CODEL_HANDLE, ARP_FLOWID));
-
- if (arpEth0Egress.isError()) {
- ++metrics.adding_eth0_egress_filters_errors;
-
- return Failure(
- "Failed to create the ARP flow classifier on host " +
- eth0 + ": " + arpEth0Egress.error());
- } else if (!arpEth0Egress.get()) {
- ++metrics.adding_eth0_egress_filters_already_exist;
-
- return Failure(
- "The ARP flow classifier on host " + eth0 + " already exists");
- }
-
- // Rest of the host packets go to a reserved flow.
- Try<bool> defaultEth0Egress = filter::basic::create(
- eth0,
- HOST_TX_FQ_CODEL_HANDLE,
- ETH_P_ALL,
- Priority(DEFAULT_FILTER_PRIORITY, NORMAL),
- Handle(HOST_TX_FQ_CODEL_HANDLE, HOST_FLOWID));
-
- if (defaultEth0Egress.isError()) {
- ++metrics.adding_eth0_egress_fi
<TRUNCATED>
[06/12] mesos git commit: Relocated MesosContainerizer specific files
to the correct location.
Posted by ji...@apache.org.
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/network/port_mapping.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/network/port_mapping.cpp b/src/slave/containerizer/mesos/isolators/network/port_mapping.cpp
new file mode 100644
index 0000000..565f9cc
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/network/port_mapping.cpp
@@ -0,0 +1,3792 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <limits.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <vector>
+
+#include <glog/logging.h>
+
+#include <mesos/mesos.hpp>
+
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/io.hpp>
+#include <process/pid.hpp>
+#include <process/subprocess.hpp>
+
+#include <stout/error.hpp>
+#include <stout/foreach.hpp>
+#include <stout/fs.hpp>
+#include <stout/hashset.hpp>
+#include <stout/json.hpp>
+#include <stout/lambda.hpp>
+#include <stout/mac.hpp>
+#include <stout/multihashmap.hpp>
+#include <stout/numify.hpp>
+#include <stout/os.hpp>
+#include <stout/option.hpp>
+#include <stout/path.hpp>
+#include <stout/protobuf.hpp>
+#include <stout/result.hpp>
+#include <stout/stringify.hpp>
+#include <stout/strings.hpp>
+#include <stout/utils.hpp>
+
+#include <stout/os/exists.hpp>
+#include <stout/os/stat.hpp>
+
+#include "common/status_utils.hpp"
+
+#include "linux/fs.hpp"
+#include "linux/ns.hpp"
+
+#include "linux/routing/route.hpp"
+#include "linux/routing/utils.hpp"
+
+#include "linux/routing/diagnosis/diagnosis.hpp"
+
+#include "linux/routing/filter/basic.hpp"
+#include "linux/routing/filter/icmp.hpp"
+#include "linux/routing/filter/ip.hpp"
+
+#include "linux/routing/handle.hpp"
+
+#include "linux/routing/link/link.hpp"
+
+#include "linux/routing/queueing/fq_codel.hpp"
+#include "linux/routing/queueing/htb.hpp"
+#include "linux/routing/queueing/ingress.hpp"
+#include "linux/routing/queueing/statistics.hpp"
+
+#include "mesos/resources.hpp"
+
+#include "slave/constants.hpp"
+
+#include "slave/containerizer/mesos/isolators/network/port_mapping.hpp"
+
+using namespace mesos::internal;
+
+using namespace process;
+
+using namespace routing;
+using namespace routing::filter;
+using namespace routing::queueing;
+using namespace routing::queueing::statistics;
+
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::endl;
+using std::hex;
+using std::list;
+using std::ostringstream;
+using std::set;
+using std::sort;
+using std::string;
+using std::vector;
+
+using filter::ip::PortRange;
+
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+// An old glibc might not have this symbol.
+#ifndef MNT_DETACH
+#define MNT_DETACH 2
+#endif
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// The minimum number of ephemeral ports a container should have.
+static const uint16_t MIN_EPHEMERAL_PORTS_SIZE = 16;
+
+// Linux traffic control is a combination of queueing disciplines,
+// filters and classes organized as a tree for the ingress (rx) and
+// egress (tx) flows for each interface. Each container provides two
+// networking interfaces, a virtual eth0 and a loopback interface. The
+// flow of packets from the external network to container is shown
+// below:
+//
+// +----------------------+----------------------+
+// | Container |
+// |----------------------|----------------------|
+// | eth0 | lo |
+// +----------------------+----------------------+
+// ^ | ^ |
+// [3] | | [4] | |
+// | | [7] +-----------+ [10]
+// | |
+// | | [8] +-----------+ [9]
+// [2] | | [5] | |
+// | v v v
+// +----------------------+----------------------+
+// | veth0 | lo |
+// +----------------------|----------------------+
+// | Host |
+// |----------------------|----------------------|
+// | eth0 |
+// +----------------------+----------------------|
+// ^ |
+// [1] | | [6]
+// | v
+//
+// Traffic flowing from outside the network into a container enters
+// the system via the host ingress interface [1] and is routed based
+// on destination port to the outbound interface for the matching
+// container [2], which forwards the packet to the container's inbound
+// virtual interface. Outbound traffic destined for the external
+// network flows along the reverse path [4,5,6]. Loopback traffic is
+// directed to the corresponding Ethernet interface, either [7,10] or
+// [8,9] where the same destination port routing can be applied as to
+// external traffic. We use traffic control filters at several of the
+// interfaces to create these packet paths.
+//
+// Linux provides only a very simple topology for ingress interfaces.
+// A root is provided on a fixed handle (handle::INGRESS_ROOT) under
+// which a single qdisc can be installed, with handle ingress::HANDLE.
+// Traffic control filters can then be attached to the ingress qdisc.
+// We install one or more ingress filters on the host eth0 [1] to
+// direct traffic to the correct container, and on the container
+// virtual eth0 [5] to direct traffic to other containers or out of
+// the box. Since we know the ip port assignments for each container,
+// we can direct traffic directly to the appropriate container.
+// However, for ICMP and ARP traffic where no equivalent to a port
+// exists, we send a copy of the packet to every container and rely on
+// the network stack to drop unexpected packets.
+//
+// We install a Hierarchical Token Bucket (HTB) qdisc and class to
+// limit the outbound traffic bandwidth as the egress qdisc inside the
+// container [4] and then add a fq_codel qdisc to limit head of line
+// blocking on the egress filter. The egress traffic control chain is
+// thus:
+//
+// root device: handle::EGRESS_ROOT ->
+// htb egress qdisc: CONTAINER_TX_HTB_HANDLE ->
+// htb rate limiting class: CONTAINER_TX_HTB_CLASS_ID ->
+// buffer-bloat reduction: FQ_CODEL
+constexpr Handle CONTAINER_TX_HTB_HANDLE = Handle(1, 0);
+constexpr Handle CONTAINER_TX_HTB_CLASS_ID =
+ Handle(CONTAINER_TX_HTB_HANDLE, 1);
+
+
+// Finally we create a second fq_codel qdisc on the public interface
+// of the host [6] to reduce performance interference between
+// containers. We create independent flows for each container, and
+// one for the host, which ensures packets from each container are
+// guaranteed fair access to the host interface. This egress traffic
+// control chain for the host interface is thus:
+//
+// root device: handle::EGRESS_ROOT ->
+// buffer-bloat reduction: FQ_CODEL
+constexpr Handle HOST_TX_FQ_CODEL_HANDLE = Handle(1, 0);
+
+
+// The primary priority used by each type of filter.
+static const uint8_t ARP_FILTER_PRIORITY = 1;
+static const uint8_t ICMP_FILTER_PRIORITY = 2;
+static const uint8_t IP_FILTER_PRIORITY = 3;
+static const uint8_t DEFAULT_FILTER_PRIORITY = 4;
+
+
+// The secondary priorities used by filters.
+static const uint8_t HIGH = 1;
+static const uint8_t NORMAL = 2;
+static const uint8_t LOW = 3;
+
+
+// We assign a separate flow on host eth0 egress for each container
+// (See MESOS-2422 for details). Host egress traffic is assigned to a
+// reserved flow (HOST_FLOWID). ARP and ICMP traffic from containers
+// are not heavy, so they can share the same flow.
+static const uint16_t HOST_FLOWID = 1;
+static const uint16_t ARP_FLOWID = 2;
+static const uint16_t ICMP_FLOWID = 2;
+static const uint16_t CONTAINER_MIN_FLOWID = 3;
+
+
+// The well known ports, i.e., the interval [0, 1024). Used for
+// sanity check.
+static Interval<uint16_t> WELL_KNOWN_PORTS()
+{
+ Bound<uint16_t> lower = Bound<uint16_t>::closed(0);
+ Bound<uint16_t> upper = Bound<uint16_t>::open(1024);
+
+ return (lower, upper);
+}
+
+
+/////////////////////////////////////////////////
+// Helper functions for the isolator.
+/////////////////////////////////////////////////
+
+// Returns the largest power of 2 that is less than or equal to x.
+// Returns 0 if x is 0.
+static uint32_t roundDownToPowerOfTwo(uint32_t x)
+{
+ // Copy the most significant set bit into every lower position,
+ // i.e., (in binary) turn 00001XXX into 00001111. The shift doubles
+ // on each round (1, 2, 4, 8, 16) so that all 32 bits are covered.
+ // Note that the 0s left to the most significant set bit are never
+ // turned to 1s.
+ for (unsigned int shift = 1; shift < 32; shift <<= 1) {
+ x |= (x >> shift);
+ }
+
+ // (In binary) 00001111 - 00000111 = 00001000.
+ return x - (x >> 1);
+}
+
+
+// Builds the name of the host end of the virtual ethernet pair for a
+// given container: PORT_MAPPING_VETH_PREFIX() followed by the pid.
+// The container ID cannot be used in the device name because the
+// kernel restricts link names to 16 characters or less. The pid of
+// the executor process forked by the slave is used instead, which is
+// safe because two active containers cannot have the same pid for
+// the executor process.
+static string veth(pid_t pid)
+{
+ const string suffix = stringify(pid);
+
+ return PORT_MAPPING_VETH_PREFIX() + suffix;
+}
+
+
+// Recovers the pid from a veth name produced by 'veth()'. Returns
+// None if the name does not start with PORT_MAPPING_VETH_PREFIX() or
+// if the remainder is not a valid pid.
+static Option<pid_t> getPidFromVeth(const string& veth)
+{
+ const string prefix = PORT_MAPPING_VETH_PREFIX();
+
+ if (!strings::startsWith(veth, prefix)) {
+ return None();
+ }
+
+ Try<pid_t> parsed =
+ numify<pid_t>(strings::remove(veth, prefix, strings::PREFIX));
+
+ if (parsed.isError()) {
+ return None();
+ }
+
+ return parsed.get();
+}
+
+
+// Derives the container ID from the symlink that points to the
+// network namespace handle; the ID is the basename of the symlink.
+// Returns an error if the given path is not a symlink. The layout of
+// the bind mount root and the bind mount symlink root is as follows:
+// <PORT_MAPPING_BIND_MOUNT_ROOT()>
+// |--- 3945 (pid) <-|
+// |
+// <PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT()> |
+// |--- ecf293e7-e6e8-4cbc-aaee-4d6c958aa276 --|
+// (symlink: container ID -> pid)
+static Try<ContainerID> getContainerIdFromSymlink(const string& symlink)
+{
+ if (!os::stat::islink(symlink)) {
+ return Error("Not a symlink");
+ }
+
+ ContainerID result;
+ result.set_value(Path(symlink).basename());
+
+ return result;
+}
+
+
+// Parses the pid out of the basename of a network namespace handle.
+// Returns an error if the handle is a symlink, and None if the
+// basename is not a valid pid (i.e., the handle is clearly not
+// created by us).
+static Result<pid_t> getPidFromNamespaceHandle(const string& handle)
+{
+ if (os::stat::islink(handle)) {
+ return Error("Not expecting a symlink");
+ }
+
+ Try<pid_t> parsed = numify<pid_t>(Path(handle).basename());
+
+ if (parsed.isSome()) {
+ return parsed.get();
+ }
+
+ return None();
+}
+
+
+// Follows the symlink that points to the network namespace handle
+// and parses the pid out of the handle it resolves to. Returns an
+// error if the given path is not a symlink or cannot be followed,
+// and None if it is a dangling symlink.
+static Result<pid_t> getPidFromSymlink(const string& symlink)
+{
+ if (!os::stat::islink(symlink)) {
+ return Error("Not a symlink");
+ }
+
+ Result<string> resolved = os::realpath(symlink);
+
+ if (resolved.isNone()) {
+ // This is a dangling symlink.
+ return None();
+ }
+
+ if (resolved.isError()) {
+ return Error("Failed to follow the symlink: " + resolved.error());
+ }
+
+ return getPidFromNamespaceHandle(resolved.get());
+}
+
+
+// Returns the path of the symlink for the given container, i.e., the
+// stringified container ID under
+// PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT().
+static string getSymlinkPath(const ContainerID& containerId)
+{
+ const string name = stringify(containerId);
+
+ return path::join(PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT(), name);
+}
+
+
+// Returns the path of the network namespace handle for the given
+// pid, i.e., the stringified pid under
+// PORT_MAPPING_BIND_MOUNT_ROOT().
+static string getNamespaceHandlePath(pid_t pid)
+{
+ const string name = stringify(pid);
+
+ return path::join(PORT_MAPPING_BIND_MOUNT_ROOT(), name);
+}
+
+
+// Builds an interval set out of the given value ranges. Each range
+// contributes the closed interval [begin, end].
+static IntervalSet<uint16_t> getIntervalSet(const Value::Ranges& ranges)
+{
+ IntervalSet<uint16_t> result;
+
+ foreach (const Value::Range& range, ranges.range()) {
+ Bound<uint16_t> lower = Bound<uint16_t>::closed(range.begin());
+ Bound<uint16_t> upper = Bound<uint16_t>::closed(range.end());
+
+ result += (lower, upper);
+ }
+
+ return result;
+}
+
+/////////////////////////////////////////////////
+// Implementation for PortMappingUpdate.
+/////////////////////////////////////////////////
+
+const char* PortMappingUpdate::NAME = "update";
+
+
+// Registers the flags accepted by the port mapping update operation:
+// the names of the public and loopback interfaces, the pid whose
+// namespaces will be entered, and the port ranges (as JSON objects)
+// for which IP filters should be added and/or removed.
+PortMappingUpdate::Flags::Flags()
+{
+ add(&eth0_name,
+ "eth0_name",
+ "The name of the public network interface (e.g., eth0)");
+
+ add(&lo_name,
+ "lo_name",
+ "The name of the loopback network interface (e.g., lo)");
+
+ add(&pid,
+ "pid",
+ "The pid of the process whose namespaces we will enter");
+
+ add(&ports_to_add,
+ "ports_to_add",
+ "A collection of port ranges (formatted as a JSON object)\n"
+ "for which to add IP filters. E.g.,\n"
+ "--ports_to_add={\"range\":[{\"begin\":4,\"end\":8}]}");
+
+ add(&ports_to_remove,
+ "ports_to_remove",
+ "A collection of port ranges (formatted as a JSON object)\n"
+ "for which to remove IP filters. E.g.,\n"
+ "--ports_to_remove={\"range\":[{\"begin\":4,\"end\":8}]}");
+}
+
+
+// The following two helper functions allow us to convert from a
+// collection of port ranges to a JSON object and vice versa. They
+// will be used for the port mapping update operation.
+// Serializes a collection of port ranges into a JSON object by way
+// of a Value::Ranges protobuf.
+template <typename Iterable>
+JSON::Object json(const Iterable& ranges)
+{
+ Value::Ranges values;
+
+ foreach (const PortRange& range, ranges) {
+ Value::Range* value = values.add_range();
+ value->set_begin(range.begin());
+ value->set_end(range.end());
+ }
+
+ return JSON::Protobuf(values);
+}
+
+
+// Converts a JSON object (a serialized Value::Ranges) back into a
+// collection of port ranges. Returns an error if the JSON cannot be
+// parsed or if any of the ranges is not a valid port range.
+static Try<vector<PortRange>> parse(const JSON::Object& object)
+{
+ Try<Value::Ranges> parsing = protobuf::parse<Value::Ranges>(object);
+ if (parsing.isError()) {
+ return Error("Failed to parse JSON: " + parsing.error());
+ }
+
+ const Value::Ranges& values = parsing.get();
+
+ vector<PortRange> result;
+ foreach (const Value::Range& value, values.range()) {
+ Try<PortRange> converted =
+ PortRange::fromBeginEnd(value.begin(), value.end());
+
+ if (converted.isError()) {
+ return Error("Invalid port range: " + converted.error());
+ }
+
+ result.push_back(converted.get());
+ }
+
+ return result;
+}
+
+
+// Installs the IP filters needed inside the container for a given
+// port range: a 'terminal' filter on lo and a 'redirect' filter (for
+// loopback IP) from eth0 to lo. Returns an error if either filter
+// cannot be created or already exists.
+static Try<Nothing> addContainerIPFilters(
+ const PortRange& range,
+ const string& eth0,
+ const string& lo)
+{
+ // The filter on lo makes sure that local traffic inside a
+ // container will not be redirected to eth0.
+ Try<bool> terminal = filter::ip::create(
+ lo,
+ ingress::HANDLE,
+ ip::Classifier(None(), None(), None(), range),
+ Priority(IP_FILTER_PRIORITY, HIGH),
+ action::Terminal());
+
+ if (terminal.isError()) {
+ return Error(
+ "Failed to create an IP packet filter on " + lo +
+ " which stops packets from being sent to " + eth0 +
+ ": " + terminal.error());
+ }
+
+ if (!terminal.get()) {
+ return Error(
+ "The IP packet filter on " + lo +
+ " which stops packets from being sent to " +
+ eth0 + " already exists");
+ }
+
+ // The filter on eth0 redirects all loopback IP traffic to lo.
+ Try<bool> redirect = filter::ip::create(
+ eth0,
+ ingress::HANDLE,
+ ip::Classifier(
+ None(),
+ net::IPNetwork::LOOPBACK_V4().address(),
+ None(),
+ range),
+ Priority(IP_FILTER_PRIORITY, NORMAL),
+ action::Redirect(lo));
+
+ if (redirect.isError()) {
+ return Error(
+ "Failed to create an IP packet filter (for loopback IP) from " +
+ eth0 + " to " + lo + ": " + redirect.error());
+ }
+
+ if (!redirect.get()) {
+ return Error(
+ "The IP packet filter (for loopback IP) from " +
+ eth0 + " to " + lo + " already exists");
+ }
+
+ return Nothing();
+}
+
+
+// Removes the IP filters inside the container for a given port
+// range: the 'terminal' filter on lo and the 'redirect' filter (for
+// loopback IP) from eth0 to lo. Returns an error if either filter
+// cannot be removed or does not exist.
+static Try<Nothing> removeContainerIPFilters(
+ const PortRange& range,
+ const string& eth0,
+ const string& lo)
+{
+ // Start with the 'terminal' IP packet filter on lo.
+ Try<bool> terminal = filter::ip::remove(
+ lo,
+ ingress::HANDLE,
+ ip::Classifier(None(), None(), None(), range));
+
+ if (terminal.isError()) {
+ return Error(
+ "Failed to remove the IP packet filter on " + lo +
+ " which stops packets from being sent to " + eth0 +
+ ": " + terminal.error());
+ }
+
+ if (!terminal.get()) {
+ return Error(
+ "The IP packet filter on " + lo +
+ " which stops packets from being sent to " + eth0 +
+ " does not exist");
+ }
+
+ // Then the IP packet filter (for loopback IP) from eth0 to lo.
+ Try<bool> redirect = filter::ip::remove(
+ eth0,
+ ingress::HANDLE,
+ ip::Classifier(
+ None(),
+ net::IPNetwork::LOOPBACK_V4().address(),
+ None(),
+ range));
+
+ if (redirect.isError()) {
+ return Error(
+ "Failed to remove the IP packet filter (for loopback IP) from " +
+ eth0 + " to " + lo + ": " + redirect.error());
+ }
+
+ if (!redirect.get()) {
+ return Error(
+ "The IP packet filter (for loopback IP) from " +
+ eth0 + " to " + lo + " does not exist");
+ }
+
+ return Nothing();
+}
+
+
+// Validates the flags, enters the network namespace of the given pid
+// and then adds and/or removes the container IP filters for the
+// requested port ranges. Returns 0 on success (or when only the
+// usage is printed) and 1 on any failure; diagnostics are written to
+// stderr.
+int PortMappingUpdate::execute()
+{
+ if (flags.help) {
+ cerr << "Usage: " << name() << " [OPTIONS]" << endl << endl
+ << "Supported options:" << endl
+ << flags.usage();
+ return 0;
+ }
+
+ if (flags.eth0_name.isNone()) {
+ cerr << "The public interface name (e.g., eth0) is not specified" << endl;
+ return 1;
+ }
+
+ if (flags.lo_name.isNone()) {
+ cerr << "The loopback interface name (e.g., lo) is not specified" << endl;
+ return 1;
+ }
+
+ if (flags.pid.isNone()) {
+ cerr << "The pid is not specified" << endl;
+ return 1;
+ }
+
+ const bool adding = flags.ports_to_add.isSome();
+ const bool removing = flags.ports_to_remove.isSome();
+
+ if (!adding && !removing) {
+ cerr << "Nothing to update" << endl;
+ return 1;
+ }
+
+ // Parse both collections up front so that nothing is changed if
+ // either of them is malformed. A collection stays empty if the
+ // corresponding flag is not specified.
+ vector<PortRange> portsToAdd;
+ vector<PortRange> portsToRemove;
+
+ if (adding) {
+ Try<vector<PortRange>> parsed = parse(flags.ports_to_add.get());
+ if (parsed.isError()) {
+ cerr << "Parsing 'ports_to_add' failed: " << parsed.error() << endl;
+ return 1;
+ }
+ portsToAdd = parsed.get();
+ }
+
+ if (removing) {
+ Try<vector<PortRange>> parsed = parse(flags.ports_to_remove.get());
+ if (parsed.isError()) {
+ cerr << "Parsing 'ports_to_remove' failed: " << parsed.error() << endl;
+ return 1;
+ }
+ portsToRemove = parsed.get();
+ }
+
+ // Enter the network namespace.
+ Try<Nothing> entered = ns::setns(flags.pid.get(), "net");
+ if (entered.isError()) {
+ cerr << "Failed to enter the network namespace of pid " << flags.pid.get()
+ << ": " << entered.error() << endl;
+ return 1;
+ }
+
+ // Update IP packet filters: additions first, then removals.
+ const string eth0 = flags.eth0_name.get();
+ const string lo = flags.lo_name.get();
+
+ foreach (const PortRange& range, portsToAdd) {
+ Try<Nothing> added = addContainerIPFilters(range, eth0, lo);
+ if (added.isError()) {
+ cerr << "Failed to add IP filters: " << added.error() << endl;
+ return 1;
+ }
+ }
+
+ foreach (const PortRange& range, portsToRemove) {
+ Try<Nothing> removed = removeContainerIPFilters(range, eth0, lo);
+ if (removed.isError()) {
+ cerr << "Failed to remove IP filters: " << removed.error() << endl;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/////////////////////////////////////////////////
+// Implementation for PortMappingStatistics.
+/////////////////////////////////////////////////
+
+const char* PortMappingStatistics::NAME = "statistics";
+
+
+// Registers the flags accepted by the port mapping statistics
+// operation: the name of the public interface, the pid whose
+// namespaces will be entered, and two switches (both defaulting to
+// false) that enable the collection of socket statistics.
+PortMappingStatistics::Flags::Flags()
+{
+ add(&eth0_name,
+ "eth0_name",
+ "The name of the public network interface (e.g., eth0)");
+
+ add(&pid,
+ "pid",
+ "The pid of the process whose namespaces we will enter");
+
+ add(&enable_socket_statistics_summary,
+ "enable_socket_statistics_summary",
+ "Whether to collect socket statistics summary for this container\n",
+ false);
+
+ add(&enable_socket_statistics_details,
+ "enable_socket_statistics_details",
+ "Whether to collect socket statistics details (e.g., TCP RTT)\n"
+ "for this container.",
+ false);
+}
+
+
+// Appends a new TrafficControlStatistics entry with the given id to
+// 'result' and fills in every field for which the statistics hashmap
+// has a value. Fields without a value in the hashmap are left unset.
+static void addTrafficControlStatistics(
+ const string& id,
+ const hashmap<string, uint64_t>& statistics,
+ ResourceStatistics* result)
+{
+ TrafficControlStatistics* entry =
+ result->add_net_traffic_control_statistics();
+
+ entry->set_id(id);
+
+ // TODO(pbrett) Use protobuf reflection here.
+ const Option<uint64_t> backlog = statistics.get(BACKLOG);
+ if (backlog.isSome()) {
+ entry->set_backlog(backlog.get());
+ }
+
+ const Option<uint64_t> bytes = statistics.get(BYTES);
+ if (bytes.isSome()) {
+ entry->set_bytes(bytes.get());
+ }
+
+ const Option<uint64_t> drops = statistics.get(DROPS);
+ if (drops.isSome()) {
+ entry->set_drops(drops.get());
+ }
+
+ const Option<uint64_t> overlimits = statistics.get(OVERLIMITS);
+ if (overlimits.isSome()) {
+ entry->set_overlimits(overlimits.get());
+ }
+
+ const Option<uint64_t> packets = statistics.get(PACKETS);
+ if (packets.isSome()) {
+ entry->set_packets(packets.get());
+ }
+
+ const Option<uint64_t> qlen = statistics.get(QLEN);
+ if (qlen.isSome()) {
+ entry->set_qlen(qlen.get());
+ }
+
+ const Option<uint64_t> rateBps = statistics.get(RATE_BPS);
+ if (rateBps.isSome()) {
+ entry->set_ratebps(rateBps.get());
+ }
+
+ const Option<uint64_t> ratePps = statistics.get(RATE_PPS);
+ if (ratePps.isSome()) {
+ entry->set_ratepps(ratePps.get());
+ }
+
+ const Option<uint64_t> requeues = statistics.get(REQUEUES);
+ if (requeues.isSome()) {
+ entry->set_requeues(requeues.get());
+ }
+}
+
+
+// Enters the network namespace of the given pid, collects the
+// enabled socket statistics and the traffic control statistics of
+// the given interface, and prints them to stdout as a JSON formatted
+// ResourceStatistics. Returns 0 on success (or when only the usage
+// is printed) and 1 on failure. Failing to get the traffic control
+// statistics is reported on stderr but is not treated as a failure.
+int PortMappingStatistics::execute()
+{
+ if (flags.help) {
+ cerr << "Usage: " << name() << " [OPTIONS]" << endl << endl
+ << "Supported options:" << endl
+ << flags.usage();
+ return 0;
+ }
+
+ if (flags.pid.isNone()) {
+ cerr << "The pid is not specified" << endl;
+ return 1;
+ }
+
+ if (flags.eth0_name.isNone()) {
+ cerr << "The public interface name (e.g., eth0) is not specified" << endl;
+ return 1;
+ }
+
+ // Enter the network namespace.
+ Try<Nothing> setns = ns::setns(flags.pid.get(), "net");
+ if (setns.isError()) {
+ // This could happen if the executor exits before this function is
+ // invoked. We do not log here to avoid spurious logging.
+ return 1;
+ }
+
+ ResourceStatistics result;
+
+ // NOTE: We use a dummy value here since this field will be cleared
+ // before the result is sent to the containerizer.
+ result.set_timestamp(0);
+
+ if (flags.enable_socket_statistics_summary) {
+ // Collections for socket statistics summary are below.
+
+ // For TCP, get the number of ACTIVE and TIME_WAIT connections,
+ // from reading /proc/net/sockstat (/proc/net/sockstat6 for IPV6).
+ // This is not as expensive in the kernel because only counter
+ // values are accessed instead of a dump of all the sockets.
+ // Example output:
+
+ // $ cat /proc/net/sockstat
+ // sockets: used 1391
+ // TCP: inuse 33 orphan 0 tw 0 alloc 37 mem 6
+ // UDP: inuse 15 mem 7
+ // UDPLITE: inuse 0
+ // RAW: inuse 0
+ // FRAG: inuse 0 memory 0
+
+ Try<string> value = os::read("/proc/net/sockstat");
+ if (value.isError()) {
+ cerr << "Failed to read /proc/net/sockstat: " << value.error() << endl;
+ return 1;
+ }
+
+ foreach (const string& line, strings::tokenize(value.get(), "\n")) {
+ if (!strings::startsWith(line, "TCP")) {
+ continue;
+ }
+
+ vector<string> tokens = strings::tokenize(line, " ");
+ for (size_t i = 0; i < tokens.size(); i++) {
+ if (tokens[i] == "inuse") {
+ if (i + 1 >= tokens.size()) {
+ cerr << "Unexpected output from /proc/net/sockstat" << endl;
+ // Be a bit forgiving here since the /proc file output
+ // format can change, though not very likely.
+ continue;
+ }
+
+ // Set number of active TCP connections.
+ Try<size_t> inuse = numify<size_t>(tokens[i+1]);
+ if (inuse.isError()) {
+ cerr << "Failed to parse the number of tcp connections in use: "
+ << inuse.error() << endl;
+ continue;
+ }
+
+ result.set_net_tcp_active_connections(inuse.get());
+ } else if (tokens[i] == "tw") {
+ if (i + 1 >= tokens.size()) {
+ cerr << "Unexpected output from /proc/net/sockstat" << endl;
+ // Be a bit forgiving here since the /proc file output
+ // format can change, though not very likely.
+ continue;
+ }
+
+ // Set number of TIME_WAIT TCP connections.
+ Try<size_t> tw = numify<size_t>(tokens[i+1]);
+ if (tw.isError()) {
+ cerr << "Failed to parse the number of tcp connections in"
+ << " TIME_WAIT: " << tw.error() << endl;
+ continue;
+ }
+
+ result.set_net_tcp_time_wait_connections(tw.get());
+ }
+ }
+ }
+ }
+
+ if (flags.enable_socket_statistics_details) {
+ // Collections for socket statistics details are below.
+
+ // NOTE: If the underlying library uses the older version of
+ // kernel API, the family argument passed in may not be honored.
+ Try<vector<diagnosis::socket::Info>> infos =
+ diagnosis::socket::infos(AF_INET, diagnosis::socket::state::ALL);
+
+ if (infos.isError()) {
+ cerr << "Failed to retrieve the socket information: "
+ << infos.error() << endl;
+ return 1;
+ }
+
+ vector<uint32_t> RTTs;
+ foreach (const diagnosis::socket::Info& info, infos.get()) {
+ // We double check on family regardless.
+ if (info.family != AF_INET) {
+ continue;
+ }
+
+ // We consider all sockets that have non-zero rtt value.
+ if (info.tcpInfo.isSome() && info.tcpInfo.get().tcpi_rtt != 0) {
+ RTTs.push_back(info.tcpInfo.get().tcpi_rtt);
+ }
+ }
+
+ // Only print to stdout when we have results.
+ if (RTTs.size() > 0) {
+ std::sort(RTTs.begin(), RTTs.end());
+
+ // NOTE: The size of RTTs is usually within 1 million so we
+ // don't need to worry about overflow here.
+ // TODO(jieyu): Right now, we choose to use "Nearest rank" for
+ // simplicity. Consider directly using the Statistics abstraction
+ // which computes "Linear interpolation between closest ranks".
+ // http://en.wikipedia.org/wiki/Percentile
+ size_t p50 = RTTs.size() * 50 / 100;
+ size_t p90 = RTTs.size() * 90 / 100;
+ size_t p95 = RTTs.size() * 95 / 100;
+ size_t p99 = RTTs.size() * 99 / 100;
+
+ result.set_net_tcp_rtt_microsecs_p50(RTTs[p50]);
+ result.set_net_tcp_rtt_microsecs_p90(RTTs[p90]);
+ result.set_net_tcp_rtt_microsecs_p95(RTTs[p95]);
+ result.set_net_tcp_rtt_microsecs_p99(RTTs[p99]);
+ }
+ }
+
+ // Collect traffic statistics for the container from the container
+ // virtual interface and export them in JSON.
+ const string& eth0 = flags.eth0_name.get();
+
+ // Overlimits are reported on the HTB qdisc at the egress root.
+ Result<hashmap<string, uint64_t>> statistics =
+ htb::statistics(eth0, EGRESS_ROOT);
+
+ if (statistics.isSome()) {
+ addTrafficControlStatistics(
+ NET_ISOLATOR_BW_LIMIT,
+ statistics.get(),
+ &result);
+ } else if (statistics.isNone()) {
+ // Traffic control statistics are only available when the
+ // container is created on a slave when the egress rate limit is
+ // on (i.e., egress_rate_limit_per_container flag is set). We
+ // can't just test for that flag here however, since the slave may
+ // have been restarted with different flags since the container
+ // was created. It is also possible that isolator statistics are
+ // unavailable because the container is in the process of being
+ // created or destroyed. Hence we do not report a lack of network
+ // statistics as an error.
+ } else if (statistics.isError()) {
+ cerr << "Failed to get htb qdisc statistics on " << eth0
+ << " in namespace " << flags.pid.get()
+ << ": " << statistics.error() << endl;
+ }
+
+ // Drops due to the bandwidth limit should be reported at the leaf.
+ statistics = fq_codel::statistics(eth0, CONTAINER_TX_HTB_CLASS_ID);
+ if (statistics.isSome()) {
+ addTrafficControlStatistics(
+ NET_ISOLATOR_BLOAT_REDUCTION,
+ statistics.get(),
+ &result);
+ } else if (statistics.isNone()) {
+ // See discussion on network isolator statistics above.
+ } else if (statistics.isError()) {
+ cerr << "Failed to get fq_codel qdisc statistics on " << eth0
+ << " in namespace " << flags.pid.get()
+ << ": " << statistics.error() << endl;
+ }
+
+ cout << stringify(JSON::Protobuf(result));
+ return 0;
+}
+
+
+/////////////////////////////////////////////////
+// Implementation for the isolator.
+/////////////////////////////////////////////////
+
+// Initializes every counter with its metric key (all keys live under
+// the "port_mapping/" prefix) and then registers the counters via
+// process::metrics::add().
+PortMappingIsolatorProcess::Metrics::Metrics()
+ : adding_eth0_ip_filters_errors(
+ "port_mapping/adding_eth0_ip_filters_errors"),
+ adding_eth0_ip_filters_already_exist(
+ "port_mapping/adding_eth0_ip_filters_already_exist"),
+ adding_eth0_egress_filters_errors(
+ "port_mapping/adding_eth0_egress_filters_errors"),
+ adding_eth0_egress_filters_already_exist(
+ "port_mapping/adding_eth0_egress_filters_already_exist"),
+ adding_lo_ip_filters_errors(
+ "port_mapping/adding_lo_ip_filters_errors"),
+ adding_lo_ip_filters_already_exist(
+ "port_mapping/adding_lo_ip_filters_already_exist"),
+ adding_veth_ip_filters_errors(
+ "port_mapping/adding_veth_ip_filters_errors"),
+ adding_veth_ip_filters_already_exist(
+ "port_mapping/adding_veth_ip_filters_already_exist"),
+ adding_veth_icmp_filters_errors(
+ "port_mapping/adding_veth_icmp_filters_errors"),
+ adding_veth_icmp_filters_already_exist(
+ "port_mapping/adding_veth_icmp_filters_already_exist"),
+ adding_veth_arp_filters_errors(
+ "port_mapping/adding_veth_arp_filters_errors"),
+ adding_veth_arp_filters_already_exist(
+ "port_mapping/adding_veth_arp_filters_already_exist"),
+ adding_eth0_icmp_filters_errors(
+ "port_mapping/adding_eth0_icmp_filters_errors"),
+ adding_eth0_icmp_filters_already_exist(
+ "port_mapping/adding_eth0_icmp_filters_already_exist"),
+ adding_eth0_arp_filters_errors(
+ "port_mapping/adding_eth0_arp_filters_errors"),
+ adding_eth0_arp_filters_already_exist(
+ "port_mapping/adding_eth0_arp_filters_already_exist"),
+ removing_eth0_ip_filters_errors(
+ "port_mapping/removing_eth0_ip_filters_errors"),
+ removing_eth0_ip_filters_do_not_exist(
+ "port_mapping/removing_eth0_ip_filters_do_not_exist"),
+ removing_eth0_egress_filters_errors(
+ "port_mapping/removing_eth0_egress_filters_errors"),
+ removing_eth0_egress_filters_do_not_exist(
+ "port_mapping/removing_eth0_egress_filters_do_not_exist"),
+ removing_lo_ip_filters_errors(
+ "port_mapping/removing_lo_ip_filters_errors"),
+ removing_lo_ip_filters_do_not_exist(
+ "port_mapping/removing_lo_ip_filters_do_not_exist"),
+ removing_veth_ip_filters_errors(
+ "port_mapping/removing_veth_ip_filters_errors"),
+ removing_veth_ip_filters_do_not_exist(
+ "port_mapping/removing_veth_ip_filters_do_not_exist"),
+ removing_eth0_icmp_filters_errors(
+ "port_mapping/removing_eth0_icmp_filters_errors"),
+ removing_eth0_icmp_filters_do_not_exist(
+ "port_mapping/removing_eth0_icmp_filters_do_not_exist"),
+ removing_eth0_arp_filters_errors(
+ "port_mapping/removing_eth0_arp_filters_errors"),
+ removing_eth0_arp_filters_do_not_exist(
+ "port_mapping/removing_eth0_arp_filters_do_not_exist"),
+ updating_eth0_icmp_filters_errors(
+ "port_mapping/updating_eth0_icmp_filters_errors"),
+ updating_eth0_icmp_filters_already_exist(
+ "port_mapping/updating_eth0_icmp_filters_already_exist"),
+ updating_eth0_icmp_filters_do_not_exist(
+ "port_mapping/updating_eth0_icmp_filters_do_not_exist"),
+ updating_eth0_arp_filters_errors(
+ "port_mapping/updating_eth0_arp_filters_errors"),
+ updating_eth0_arp_filters_already_exist(
+ "port_mapping/updating_eth0_arp_filters_already_exist"),
+ updating_eth0_arp_filters_do_not_exist(
+ "port_mapping/updating_eth0_arp_filters_do_not_exist"),
+ updating_container_ip_filters_errors(
+ "port_mapping/updating_container_ip_filters_errors")
+{
+ // NOTE(review): The four '*_eth0_egress_filters_*' counters that
+ // are initialized above are not passed to process::metrics::add()
+ // below, so they do not appear to be registered. Registering them
+ // would also need matching process::metrics::remove() calls in the
+ // destructor -- TODO confirm and fix both together.
+ process::metrics::add(adding_eth0_ip_filters_errors);
+ process::metrics::add(adding_eth0_ip_filters_already_exist);
+ process::metrics::add(adding_lo_ip_filters_errors);
+ process::metrics::add(adding_lo_ip_filters_already_exist);
+ process::metrics::add(adding_veth_ip_filters_errors);
+ process::metrics::add(adding_veth_ip_filters_already_exist);
+ process::metrics::add(adding_veth_icmp_filters_errors);
+ process::metrics::add(adding_veth_icmp_filters_already_exist);
+ process::metrics::add(adding_veth_arp_filters_errors);
+ process::metrics::add(adding_veth_arp_filters_already_exist);
+ process::metrics::add(adding_eth0_icmp_filters_errors);
+ process::metrics::add(adding_eth0_icmp_filters_already_exist);
+ process::metrics::add(adding_eth0_arp_filters_errors);
+ process::metrics::add(adding_eth0_arp_filters_already_exist);
+ process::metrics::add(removing_eth0_ip_filters_errors);
+ process::metrics::add(removing_eth0_ip_filters_do_not_exist);
+ process::metrics::add(removing_lo_ip_filters_errors);
+ process::metrics::add(removing_lo_ip_filters_do_not_exist);
+ process::metrics::add(removing_veth_ip_filters_errors);
+ process::metrics::add(removing_veth_ip_filters_do_not_exist);
+ process::metrics::add(removing_eth0_icmp_filters_errors);
+ process::metrics::add(removing_eth0_icmp_filters_do_not_exist);
+ process::metrics::add(removing_eth0_arp_filters_errors);
+ process::metrics::add(removing_eth0_arp_filters_do_not_exist);
+ process::metrics::add(updating_eth0_icmp_filters_errors);
+ process::metrics::add(updating_eth0_icmp_filters_already_exist);
+ process::metrics::add(updating_eth0_icmp_filters_do_not_exist);
+ process::metrics::add(updating_eth0_arp_filters_errors);
+ process::metrics::add(updating_eth0_arp_filters_already_exist);
+ process::metrics::add(updating_eth0_arp_filters_do_not_exist);
+ process::metrics::add(updating_container_ip_filters_errors);
+}
+
+
+// Removes from the process metrics registry every counter that the
+// constructor added, one 'process::metrics::remove' call per counter,
+// listed in the same order as the corresponding 'add' calls.
+PortMappingIsolatorProcess::Metrics::~Metrics()
+{
+ process::metrics::remove(adding_eth0_ip_filters_errors);
+ process::metrics::remove(adding_eth0_ip_filters_already_exist);
+ process::metrics::remove(adding_lo_ip_filters_errors);
+ process::metrics::remove(adding_lo_ip_filters_already_exist);
+ process::metrics::remove(adding_veth_ip_filters_errors);
+ process::metrics::remove(adding_veth_ip_filters_already_exist);
+ process::metrics::remove(adding_veth_icmp_filters_errors);
+ process::metrics::remove(adding_veth_icmp_filters_already_exist);
+ process::metrics::remove(adding_veth_arp_filters_errors);
+ process::metrics::remove(adding_veth_arp_filters_already_exist);
+ process::metrics::remove(adding_eth0_icmp_filters_errors);
+ process::metrics::remove(adding_eth0_icmp_filters_already_exist);
+ process::metrics::remove(adding_eth0_arp_filters_errors);
+ process::metrics::remove(adding_eth0_arp_filters_already_exist);
+ process::metrics::remove(removing_eth0_ip_filters_errors);
+ process::metrics::remove(removing_eth0_ip_filters_do_not_exist);
+ process::metrics::remove(removing_lo_ip_filters_errors);
+ process::metrics::remove(removing_lo_ip_filters_do_not_exist);
+ process::metrics::remove(removing_veth_ip_filters_errors);
+ process::metrics::remove(removing_veth_ip_filters_do_not_exist);
+ process::metrics::remove(removing_eth0_icmp_filters_errors);
+ process::metrics::remove(removing_eth0_icmp_filters_do_not_exist);
+ process::metrics::remove(removing_eth0_arp_filters_errors);
+ process::metrics::remove(removing_eth0_arp_filters_do_not_exist);
+ process::metrics::remove(updating_eth0_icmp_filters_errors);
+ process::metrics::remove(updating_eth0_icmp_filters_already_exist);
+ process::metrics::remove(updating_eth0_icmp_filters_do_not_exist);
+ process::metrics::remove(updating_eth0_arp_filters_errors);
+ process::metrics::remove(updating_eth0_arp_filters_already_exist);
+ process::metrics::remove(updating_eth0_arp_filters_do_not_exist);
+ process::metrics::remove(updating_container_ip_filters_errors);
+}
+
+
+// Validates the host environment and the relevant slave flags, applies
+// the host-side network configuration the isolator relies on (lo MAC
+// and MTU, ingress/egress qdiscs, a few '/proc/sys/net' knobs and the
+// shared bind mount root), and on success returns a new isolator that
+// wraps a PortMappingIsolatorProcess. The first check or setup step
+// that fails is reported as an Error.
+//
+// NOTE: Blocking calls (e.g., os::shell) are used here because
+// 'create' will only be invoked during initialization.
+Try<Isolator*> PortMappingIsolatorProcess::create(const Flags& flags)
+{
+  // Check for root permission.
+  if (geteuid() != 0) {
+    return Error("Using network isolator requires root permissions");
+  }
+
+  // Verify that the network namespace is available by checking the
+  // existence of the network namespace handle of the current process.
+  if (ns::namespaces().count("net") == 0) {
+    return Error(
+        "Using network isolator requires network namespace. "
+        "Make sure your kernel is newer than 3.4");
+  }
+
+  // Check the routing library.
+  Try<Nothing> check = routing::check();
+  if (check.isError()) {
+    return Error(
+        "Routing library check failed: " +
+        check.error());
+  }
+
+  // Check the availability of a few Linux commands that we will use.
+  // We use the blocking os::shell here because 'create' will only be
+  // invoked during initialization.
+  Try<string> checkCommandTc = os::shell("tc filter show");
+  if (checkCommandTc.isError()) {
+    return Error("Check command 'tc' failed: " + checkCommandTc.error());
+  }
+
+  Try<string> checkCommandIp = os::shell("ip link show");
+  if (checkCommandIp.isError()) {
+    return Error("Check command 'ip' failed: " + checkCommandIp.error());
+  }
+
+  Try<Resources> resources = Resources::parse(
+      flags.resources.getOrElse(""),
+      flags.default_role);
+
+  if (resources.isError()) {
+    return Error("Failed to parse --resources: " + resources.error());
+  }
+
+  // Get 'ports' resource from 'resources' flag. These ports will be
+  // treated as non-ephemeral ports.
+  IntervalSet<uint16_t> nonEphemeralPorts;
+  if (resources.get().ports().isSome()) {
+    nonEphemeralPorts = getIntervalSet(resources.get().ports().get());
+  }
+
+  // Get 'ephemeral_ports' resource from 'resources' flag. These ports
+  // will be allocated to each container as ephemeral ports.
+  IntervalSet<uint16_t> ephemeralPorts;
+  if (resources.get().ephemeral_ports().isSome()) {
+    ephemeralPorts = getIntervalSet(resources.get().ephemeral_ports().get());
+  }
+
+  // Each container requires at least one ephemeral port for slave
+  // executor communication. If no 'ephemeral_ports' resource is
+  // found, we will return error.
+  if (ephemeralPorts.empty()) {
+    return Error("Ephemeral ports are not specified");
+  }
+
+  // Sanity check to make sure that the ephemeral ports specified do
+  // not intersect with the specified non-ephemeral ports.
+  if (ephemeralPorts.intersects(nonEphemeralPorts)) {
+    return Error(
+        "The specified ephemeral ports " + stringify(ephemeralPorts) +
+        " intersect with the specified non-ephemeral ports " +
+        stringify(nonEphemeralPorts));
+  }
+
+  // This is a sanity check to make sure that the ephemeral ports
+  // specified do not intersect with the well known ports.
+  if (ephemeralPorts.intersects(WELL_KNOWN_PORTS())) {
+    return Error(
+        "The specified ephemeral ports " + stringify(ephemeralPorts) +
+        " intersect with well known ports " + stringify(WELL_KNOWN_PORTS()));
+  }
+
+  // Obtain the host ephemeral port range by reading the proc file
+  // system ('ip_local_port_range').
+  Try<string> value = os::read("/proc/sys/net/ipv4/ip_local_port_range");
+  if (value.isError()) {
+    return Error("Failed to read host ip_local_port_range: " + value.error());
+  }
+
+  vector<string> split = strings::split(strings::trim(value.get()), "\t");
+  if (split.size() != 2) {
+    return Error(
+        "Unexpected format from host ip_local_port_range: " + value.get());
+  }
+
+  Try<uint16_t> begin = numify<uint16_t>(split[0]);
+  if (begin.isError()) {
+    return Error(
+        "Failed to parse the begin of host ip_local_port_range: " + split[0]);
+  }
+
+  Try<uint16_t> end = numify<uint16_t>(split[1]);
+  if (end.isError()) {
+    return Error(
+        "Failed to parse the end of host ip_local_port_range: " + split[1]);
+  }
+
+  Interval<uint16_t> hostEphemeralPorts =
+    (Bound<uint16_t>::closed(begin.get()),
+     Bound<uint16_t>::closed(end.get()));
+
+  // Sanity check to make sure the specified ephemeral ports do not
+  // intersect with the ephemeral ports used by the host.
+  if (ephemeralPorts.intersects(hostEphemeralPorts)) {
+    return Error(
+        "The specified ephemeral ports " + stringify(ephemeralPorts) +
+        " intersect with the ephemeral ports used by the host " +
+        stringify(hostEphemeralPorts));
+  }
+
+  // TODO(chzhcn): Cross check ephemeral ports with used ports on the
+  // host (e.g., using port scan).
+
+  // Initialize the ephemeral ports allocator.
+
+  // In theory, any positive integer can be broken up into a few
+  // numbers that are power of 2 aligned. We choose to not allow this
+  // for now so that each container has a fixed (one) number of
+  // filters for ephemeral ports. This makes it easy to debug and
+  // infer performance.
+  if (roundDownToPowerOfTwo(flags.ephemeral_ports_per_container) !=
+      flags.ephemeral_ports_per_container) {
+    return Error(
+        "The number of ephemeral ports for each container (" +
+        stringify(flags.ephemeral_ports_per_container) +
+        ") is not a power of 2");
+  }
+
+  if (ephemeralPorts.size() < flags.ephemeral_ports_per_container) {
+    return Error(
+        "Network Isolator is given ephemeral ports of size: " +
+        stringify(ephemeralPorts.size()) + ", but asked to allocate " +
+        stringify(flags.ephemeral_ports_per_container) +
+        " ephemeral ports for a container");
+  }
+
+  if (flags.ephemeral_ports_per_container < MIN_EPHEMERAL_PORTS_SIZE) {
+    return Error(
+        "Each container has only " +
+        stringify(flags.ephemeral_ports_per_container) +
+        " ephemeral ports. The minimum required is: " +
+        stringify(MIN_EPHEMERAL_PORTS_SIZE));
+  }
+
+  Owned<EphemeralPortsAllocator> ephemeralPortsAllocator(
+      new EphemeralPortsAllocator(
+        ephemeralPorts,
+        flags.ephemeral_ports_per_container));
+
+  // Get the name of the public interface (e.g., eth0). If it is not
+  // specified, try to derive its name from the routing library.
+  Result<string> eth0 = link::eth0();
+  if (flags.eth0_name.isSome()) {
+    eth0 = flags.eth0_name.get();
+
+    // Check if the given public interface exists.
+    Try<bool> hostEth0Exists = link::exists(eth0.get());
+    if (hostEth0Exists.isError()) {
+      return Error(
+          "Failed to check if " + eth0.get() + " exists: " +
+          hostEth0Exists.error());
+    } else if (!hostEth0Exists.get()) {
+      return Error("The public interface " + eth0.get() + " does not exist");
+    }
+  } else if (!eth0.isSome()) {
+    // eth0 is not specified in the flag and we did not get a valid
+    // eth0 from the library. NOTE: The result may be None rather than
+    // an Error, so 'error()' must only be called in the Error case.
+    return Error(
+        "Network Isolator failed to find a public interface: " +
+        (eth0.isError() ? eth0.error()
+                        : "does not have a public interface"));
+  }
+
+  LOG(INFO) << "Using " << eth0.get() << " as the public interface";
+
+  // Get the name of the loopback interface. If it is not specified,
+  // try to derive its name based on the loopback IP address.
+  Result<string> lo = link::lo();
+  if (flags.lo_name.isSome()) {
+    lo = flags.lo_name.get();
+
+    // Check if the given loopback interface exists.
+    Try<bool> hostLoExists = link::exists(lo.get());
+    if (hostLoExists.isError()) {
+      return Error(
+          "Failed to check if " + lo.get() + " exists: " +
+          hostLoExists.error());
+    } else if (!hostLoExists.get()) {
+      return Error("The loopback interface " + lo.get() + " does not exist");
+    }
+  } else if (!lo.isSome()) {
+    // lo is not specified in the flag and we did not get a valid
+    // lo from the library. NOTE: The result may be None rather than
+    // an Error, so 'error()' must only be called in the Error case.
+    return Error(
+        "Network Isolator failed to find a loopback interface: " +
+        (lo.isError() ? lo.error()
+                      : "does not have a loopback interface"));
+  }
+
+  LOG(INFO) << "Using " << lo.get() << " as the loopback interface";
+
+  // If egress rate limit is provided, do a sanity check that it is
+  // not greater than the host physical link speed.
+  Option<Bytes> egressRateLimitPerContainer;
+  if (flags.egress_rate_limit_per_container.isSome()) {
+    // Read host physical link speed from /sys/class/net/eth0/speed.
+    // This value is in MBits/s.
+    const string eth0SpeedPath =
+      path::join("/sys/class/net", eth0.get(), "speed");
+
+    Try<string> value = os::read(eth0SpeedPath);
+    if (value.isError()) {
+      return Error(
+          "Failed to read " + eth0SpeedPath + ": " + value.error());
+    }
+
+    Try<uint64_t> hostLinkSpeed = numify<uint64_t>(strings::trim(value.get()));
+    CHECK_SOME(hostLinkSpeed);
+
+    // It could be possible that the nic driver doesn't support
+    // reporting physical link speed. In that case, report error.
+    if (hostLinkSpeed.get() == 0xFFFFFFFF) {
+      return Error(
+          "Network Isolator failed to determine link speed for " + eth0.get());
+    }
+
+    // Convert host link speed to Bytes/s for comparison.
+    if (hostLinkSpeed.get() * 1000000 / 8 <
+        flags.egress_rate_limit_per_container.get().bytes()) {
+      return Error(
+          "The given egress traffic limit for containers " +
+          stringify(flags.egress_rate_limit_per_container.get().bytes()) +
+          " Bytes/s is greater than the host link speed " +
+          stringify(hostLinkSpeed.get() * 1000000 / 8) + " Bytes/s");
+    }
+
+    if (flags.egress_rate_limit_per_container.get() != Bytes(0)) {
+      egressRateLimitPerContainer = flags.egress_rate_limit_per_container.get();
+    } else {
+      LOG(WARNING) << "Ignoring the given zero egress rate limit";
+    }
+  }
+
+  // Get the host IP network, MAC and default gateway.
+  Result<net::IPNetwork> hostIPNetwork =
+    net::IPNetwork::fromLinkDevice(eth0.get(), AF_INET);
+
+  if (!hostIPNetwork.isSome()) {
+    return Error(
+        "Failed to get the public IP network of " + eth0.get() + ": " +
+        (hostIPNetwork.isError() ?
+            hostIPNetwork.error() :
+            "does not have an IPv4 network"));
+  }
+
+  Result<net::MAC> hostMAC = net::mac(eth0.get());
+  if (!hostMAC.isSome()) {
+    return Error(
+        "Failed to get the MAC address of " + eth0.get() + ": " +
+        (hostMAC.isError() ? hostMAC.error() : "does not have a MAC address"));
+  }
+
+  Result<net::IP> hostDefaultGateway = route::defaultGateway();
+  if (!hostDefaultGateway.isSome()) {
+    return Error(
+        "Failed to get the default gateway of the host: " +
+        (hostDefaultGateway.isError() ? hostDefaultGateway.error()
+        : "The default gateway of the host does not exist"));
+  }
+
+  // Set the MAC address of the host loopback interface (lo) so that
+  // it matches that of the host public interface (eth0). A fairly
+  // recent kernel patch is needed for this operation to succeed:
+  // https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/:
+  // 25f929fbff0d1bcebf2e92656d33025cd330cbf8
+  Try<bool> setHostLoMAC = link::setMAC(lo.get(), hostMAC.get());
+  if (setHostLoMAC.isError()) {
+    return Error(
+        "Failed to set the MAC address of " + lo.get() +
+        ": " + setHostLoMAC.error());
+  }
+
+  // Set the MTU of the host loopback interface (lo) so that it
+  // matches that of the host public interface (eth0).
+  Result<unsigned int> hostEth0MTU = link::mtu(eth0.get());
+  if (hostEth0MTU.isError()) {
+    return Error(
+        "Failed to get the MTU of " + eth0.get() +
+        ": " + hostEth0MTU.error());
+  }
+
+  // The host public interface should exist since we just checked it.
+  CHECK_SOME(hostEth0MTU);
+
+  Try<bool> setHostLoMTU = link::setMTU(lo.get(), hostEth0MTU.get());
+  if (setHostLoMTU.isError()) {
+    return Error(
+        "Failed to set the MTU of " + lo.get() +
+        ": " + setHostLoMTU.error());
+  }
+
+  // Prepare the ingress queueing disciplines on host public interface
+  // (eth0) and host loopback interface (lo).
+  Try<bool> createHostEth0IngressQdisc = ingress::create(eth0.get());
+  if (createHostEth0IngressQdisc.isError()) {
+    return Error(
+        "Failed to create the ingress qdisc on " + eth0.get() +
+        ": " + createHostEth0IngressQdisc.error());
+  }
+
+  set<uint16_t> freeFlowIds;
+  if (flags.egress_unique_flow_per_container) {
+    // Prepare a fq_codel queueing discipline on host public interface
+    // (eth0) for egress flow classification.
+    //
+    // TODO(cwang): Maybe we can continue when some other egress qdisc
+    // exists because this is not a necessary qdisc for network
+    // isolation, but we don't want inconsistency, so we just fail in
+    // this case. See details in MESOS-2370.
+    Try<bool> createHostEth0EgressQdisc = fq_codel::create(
+        eth0.get(),
+        EGRESS_ROOT,
+        HOST_TX_FQ_CODEL_HANDLE);
+    if (createHostEth0EgressQdisc.isError()) {
+      return Error(
+          "Failed to create the egress qdisc on " + eth0.get() +
+          ": " + createHostEth0EgressQdisc.error());
+    }
+
+    // TODO(cwang): Make sure DEFAULT_FLOWS is large enough so that
+    // it's unlikely to run out of free flow IDs.
+    for (uint16_t i = CONTAINER_MIN_FLOWID; i < fq_codel::DEFAULT_FLOWS; i++) {
+      freeFlowIds.insert(i);
+    }
+  }
+
+  Try<bool> createHostLoQdisc = ingress::create(lo.get());
+  if (createHostLoQdisc.isError()) {
+    return Error(
+        "Failed to create the ingress qdisc on " + lo.get() +
+        ": " + createHostLoQdisc.error());
+  }
+
+  // Enable 'route_localnet' on host loopback interface (lo). This
+  // enables the use of 127.0.0.1/8 for local routing purpose. This
+  // feature only exists on kernel 3.6 or newer.
+  const string loRouteLocalnet =
+    path::join("/proc/sys/net/ipv4/conf", lo.get(), "route_localnet");
+
+  if (!os::exists(loRouteLocalnet)) {
+    // TODO(jieyu): Consider supporting running the isolator if this
+    // feature is not available. We need to conditionally disable
+    // routing for 127.0.0.1/8, and ask the tasks to use the public IP
+    // for container to container and container to host communication.
+    return Error("The kernel does not support 'route_localnet'");
+  }
+
+  Try<Nothing> write = os::write(loRouteLocalnet, "1");
+  if (write.isError()) {
+    return Error(
+        "Failed to enable route_localnet for " + lo.get() +
+        ": " + write.error());
+  }
+
+  // We disable 'rp_filter' and 'send_redirects' for host loopback
+  // interface (lo) to work around a kernel bug, which was only
+  // recently addressed in upstream in the following 3 commits.
+  // https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/:
+  // 6a662719c9868b3d6c7d26b3a085f0cd3cc15e64
+  // 0d5edc68739f1c1e0519acbea1d3f0c1882a15d7
+  // e374c618b1465f0292047a9f4c244bd71ab5f1f0
+  // The workaround ensures packets don't get dropped at lo.
+  write = os::write("/proc/sys/net/ipv4/conf/all/rp_filter", "0");
+  if (write.isError()) {
+    return Error(
+        "Failed to disable rp_filter for all: " + write.error());
+  }
+
+  write = os::write(path::join(
+      "/proc/sys/net/ipv4/conf", lo.get(), "rp_filter"), "0");
+  if (write.isError()) {
+    return Error(
+        "Failed to disable rp_filter for " + lo.get() +
+        ": " + write.error());
+  }
+
+  write = os::write("/proc/sys/net/ipv4/conf/all/send_redirects", "0");
+  if (write.isError()) {
+    return Error(
+        "Failed to disable send_redirects for all: " + write.error());
+  }
+
+  write = os::write(path::join(
+      "/proc/sys/net/ipv4/conf", lo.get(), "send_redirects"), "0");
+  if (write.isError()) {
+    return Error(
+        "Failed to disable send_redirects for " + lo.get() +
+        ": " + write.error());
+  }
+
+  // We need to enable accept_local on host loopback interface (lo)
+  // for kernels older than 3.6. Refer to the following:
+  // https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/:
+  // 7a9bc9b81a5bc6e44ebc80ef781332e4385083f2
+  // https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt
+  write = os::write(path::join(
+      "/proc/sys/net/ipv4/conf", lo.get(), "accept_local"), "1");
+  if (write.isError()) {
+    return Error(
+        "Failed to enable accept_local for " + lo.get() +
+        ": " + write.error());
+  }
+
+  // Reading host network configurations. Each container will match
+  // these configurations.
+  hashset<string> procs;
+
+  // TODO(jieyu): The following is a partial list of all the
+  // configurations. In the future, we may want to expose these
+  // configurations using ContainerInfo.
+
+  // The kernel will use a default value for the following
+  // configurations inside a container. Therefore, we need to set them
+  // in the container to match that on the host.
+  procs.insert("/proc/sys/net/core/somaxconn");
+
+  // As of kernel 3.10, the following configurations are shared
+  // between host and containers, and therefore are not required to be
+  // set in containers. We keep them here just in case the kernel
+  // changes in the future.
+  procs.insert("/proc/sys/net/core/netdev_max_backlog");
+  procs.insert("/proc/sys/net/core/rmem_max");
+  procs.insert("/proc/sys/net/core/wmem_max");
+  procs.insert("/proc/sys/net/ipv4/tcp_keepalive_time");
+  procs.insert("/proc/sys/net/ipv4/tcp_keepalive_intvl");
+  procs.insert("/proc/sys/net/ipv4/tcp_keepalive_probes");
+  procs.insert("/proc/sys/net/ipv4/tcp_max_syn_backlog");
+  procs.insert("/proc/sys/net/ipv4/tcp_rmem");
+  procs.insert("/proc/sys/net/ipv4/tcp_retries2");
+  procs.insert("/proc/sys/net/ipv4/tcp_synack_retries");
+  procs.insert("/proc/sys/net/ipv4/tcp_wmem");
+  procs.insert("/proc/sys/net/ipv4/neigh/default/gc_thresh1");
+  procs.insert("/proc/sys/net/ipv4/neigh/default/gc_thresh2");
+  procs.insert("/proc/sys/net/ipv4/neigh/default/gc_thresh3");
+
+  // NOTE: Entries that cannot be read are silently skipped, so the
+  // resulting map only contains the configurations found on the host.
+  hashmap<string, string> hostNetworkConfigurations;
+  foreach (const string& proc, procs) {
+    Try<string> value = os::read(proc);
+    if (value.isSome()) {
+      LOG(INFO) << proc << " = '" << strings::trim(value.get()) << "'";
+      hostNetworkConfigurations[proc] = strings::trim(value.get());
+    }
+  }
+
+  // Self bind mount PORT_MAPPING_BIND_MOUNT_ROOT(). Since we use a
+  // new mount namespace for each container, for this mount point, we
+  // set '--make-rshared' on the host and set '--make-rslave' inside
+  // each container. This is important because when we unmount the
+  // network namespace handles on the host, those handles will be
+  // unmounted in the containers as well, but NOT vice versa.
+
+  // We first create the bind mount directory if it does not exist.
+  Try<Nothing> mkdir = os::mkdir(PORT_MAPPING_BIND_MOUNT_ROOT());
+  if (mkdir.isError()) {
+    return Error(
+        "Failed to create the bind mount root directory at " +
+        PORT_MAPPING_BIND_MOUNT_ROOT() + ": " + mkdir.error());
+  }
+
+  // Now, check '/proc/mounts' to see if
+  // PORT_MAPPING_BIND_MOUNT_ROOT() has already been self mounted.
+  Try<fs::MountTable> mountTable = fs::MountTable::read("/proc/mounts");
+  if (mountTable.isError()) {
+    return Error(
+        "Failed to read the mount table at '/proc/mounts': " +
+        mountTable.error());
+  }
+
+  Option<fs::MountTable::Entry> bindMountRoot;
+  foreach (const fs::MountTable::Entry& entry, mountTable.get().entries) {
+    if (entry.dir == PORT_MAPPING_BIND_MOUNT_ROOT()) {
+      bindMountRoot = entry;
+    }
+  }
+
+  // Self bind mount PORT_MAPPING_BIND_MOUNT_ROOT().
+  if (bindMountRoot.isNone()) {
+    // NOTE: Instead of using fs::mount to perform the bind mount, we
+    // use the shell command here because the syscall 'mount' does not
+    // update the mount table (i.e., /etc/mtab), which could cause
+    // issues for the shell command 'mount --make-rslave' inside the
+    // container. It's OK to use the blocking os::shell here because
+    // 'create' will only be invoked during initialization.
+    Try<string> mount = os::shell(
+        "mount --bind %s %s",
+        PORT_MAPPING_BIND_MOUNT_ROOT().c_str(),
+        PORT_MAPPING_BIND_MOUNT_ROOT().c_str());
+
+    if (mount.isError()) {
+      return Error(
+          "Failed to self bind mount '" + PORT_MAPPING_BIND_MOUNT_ROOT() +
+          "': " + mount.error());
+    }
+  }
+
+  // Mark the mount point PORT_MAPPING_BIND_MOUNT_ROOT() as
+  // recursively shared.
+  Try<string> mountShared = os::shell(
+      "mount --make-rshared %s",
+      PORT_MAPPING_BIND_MOUNT_ROOT().c_str());
+
+  if (mountShared.isError()) {
+    return Error(
+        "Failed to mark '" + PORT_MAPPING_BIND_MOUNT_ROOT() +
+        "' as recursively shared: " + mountShared.error());
+  }
+
+  // Create the network namespace handle symlink directory if it does
+  // not exist. It is used to host network namespace handle symlinks
+  // whose basename is a container ID. This allows us to recover
+  // container IDs for orphan containers (i.e., not known by the
+  // slave). This is introduced in 0.23.0.
+  mkdir = os::mkdir(PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT());
+  if (mkdir.isError()) {
+    return Error(
+        "Failed to create the bind mount symlink root directory at " +
+        PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT() + ": " + mkdir.error());
+  }
+
+  return new MesosIsolator(Owned<MesosIsolatorProcess>(
+      new PortMappingIsolatorProcess(
+          flags,
+          eth0.get(),
+          lo.get(),
+          hostMAC.get(),
+          hostIPNetwork.get(),
+          hostEth0MTU.get(),
+          hostDefaultGateway.get(),
+          hostNetworkConfigurations,
+          egressRateLimitPerContainer,
+          nonEphemeralPorts,
+          ephemeralPortsAllocator,
+          freeFlowIds)));
+}
+
+
+// Recovers the isolator's state for the containers in 'states' and
+// for orphans. The steps are:
+//   1) Collect the pids encoded in the veth link names on the host.
+//   2) Remove (best effort) network namespace handles and handle
+//      symlinks whose pid no longer has a veth, and record the
+//      pid -> container ID mapping for the remaining symlinks.
+//   3) Recover an 'Info' for every container in 'states' that has a
+//      veth; containers without one are recorded in 'unmanaged'.
+//   4) Recover the remaining pids as orphans: those whose container
+//      ID is listed in 'orphans' are kept in 'infos', all others are
+//      cleaned up immediately via '_cleanup'.
+// Returns a Failure if listing links/directories fails, if a handle
+// or symlink cannot be parsed, or if recovering or cleaning up any
+// container fails.
+Future<Nothing> PortMappingIsolatorProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  // Extract pids from virtual device names (veth). This tells us
+  // about all the potential live containers on this slave.
+  Try<set<string>> links = net::links();
+  if (links.isError()) {
+    return Failure("Failed to get all the links: " + links.error());
+  }
+
+  hashset<pid_t> pids;
+  foreach (const string& name, links.get()) {
+    Option<pid_t> pid = getPidFromVeth(name);
+    // Not all links follow the naming: mesos{pid}, so we simply
+    // continue, e.g., eth0.
+    if (pid.isNone()) {
+      continue;
+    } else if (pids.contains(pid.get())) {
+      return Failure("Two virtual devices have the same name '" + name + "'");
+    }
+
+    pids.insert(pid.get());
+  }
+
+  // Scan the bind mount root to cleanup all stale network namespace
+  // handles that do not have an active veth associated with.
+  Try<list<string>> entries = os::ls(PORT_MAPPING_BIND_MOUNT_ROOT());
+  if (entries.isError()) {
+    return Failure(
+        "Failed to list bind mount root '" +
+        PORT_MAPPING_BIND_MOUNT_ROOT() +
+        "': " + entries.error());
+  }
+
+  foreach (const string& entry, entries.get()) {
+    const string path = path::join(PORT_MAPPING_BIND_MOUNT_ROOT(), entry);
+
+    // NOTE: We expect all regular files whose names are numbers under
+    // the bind mount root are network namespace handles.
+    Result<pid_t> pid = getPidFromNamespaceHandle(path);
+    if (pid.isError()) {
+      return Failure(
+          "Failed to get pid from network namespace handle '" +
+          path + "': " + pid.error());
+    } else if (pid.isNone()) {
+      // We ignore files that are clearly not network namespace
+      // handles created by us. It's likely that those are created by
+      // users or other tools.
+      LOG(WARNING) << "Unrecognized network namespace handle '" << path << "'";
+      continue;
+    }
+
+    // We cleanup the network namespace handle if the associated
+    // containers have clearly exited (i.e., the veth has gone). The
+    // cleanup here is best effort.
+    if (!pids.contains(pid.get())) {
+      LOG(INFO) << "Removing stale network namespace handle '" << path << "'";
+
+      Try<Nothing> unmount = fs::unmount(path, MNT_DETACH);
+      if (unmount.isError()) {
+        LOG(WARNING) << "Failed to unmount stale network namespace handle '"
+                     << path << "': " << unmount.error();
+      }
+
+      Try<Nothing> rm = os::rm(path);
+      if (rm.isError()) {
+        LOG(WARNING) << "Failed to remove stale network namespace handle '"
+                     << path << "': " << rm.error();
+      }
+    }
+  }
+
+  // Scan the bind mount symlink root for container IDs. This allows us
+  // to recover container IDs for orphan containers (i.e., not known
+  // by the slave). This is introduced in 0.23.0.
+  entries = os::ls(PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT());
+  if (entries.isError()) {
+    return Failure(
+        "Failed to list bind mount symlink root '" +
+        PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT() +
+        "': " + entries.error());
+  }
+
+  // This map stores the mapping between pids and container IDs
+  // recovered from the bind mount root that have valid veth links. We
+  // use a multihashmap here because multiple container IDs can map to
+  // the same pid if the removal of a symlink fails in '_cleanup()'
+  // and the pid is reused by a new container.
+  multihashmap<pid_t, ContainerID> linkers;
+
+  foreach (const string& entry, entries.get()) {
+    const string path =
+      path::join(PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT(), entry);
+
+    // We only create symlinks in this directory and assume
+    // non-symlink files are created by other users or tools,
+    // therefore will be ignored.
+    if (!os::stat::islink(path)) {
+      LOG(WARNING) << "Ignored non-symlink file '" << path
+                   << "' under bind mount symlink root '"
+                   << PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT() << "'";
+      continue;
+    }
+
+    // NOTE: We expect all symlinks under the bind mount symlink root
+    // to be container ID symlinks.
+
+    Try<ContainerID> containerId = getContainerIdFromSymlink(path);
+    if (containerId.isError()) {
+      return Failure(
+          "Failed to get container ID from network namespace handle symlink '" +
+          path + "': " + containerId.error());
+    }
+
+    Result<pid_t> pid = getPidFromSymlink(path);
+    if (pid.isError()) {
+      return Failure(
+          "Failed to get pid from network namespace handle symlink '" + path +
+          "': " + pid.error());
+    }
+
+    // We remove the symlink if it's dangling or the associated
+    // containers have clearly exited (i.e., the veth has gone). The
+    // cleanup here is best effort.
+    if (pid.isNone() || !pids.contains(pid.get())) {
+      LOG(INFO) << "Removing stale network namespace handle symlink '"
+                << path << "'";
+
+      Try<Nothing> rm = os::rm(path);
+      if (rm.isError()) {
+        LOG(WARNING) << "Failed to remove stale network namespace handle "
+                     << "symlink '" << path << "': " << rm.error();
+      }
+    } else {
+      LOG(INFO) << "Discovered network namespace handle symlink "
+                << containerId.get() << " -> " << pid.get();
+
+      linkers.put(pid.get(), containerId.get());
+    }
+  }
+
+  // If multiple container IDs point to the same pid, we remove both
+  // symlinks for safety (as if we cannot derive the container ID for
+  // orphans, which is OK because it'll be treated the same as those
+  // containers that are created by older (pre 0.23.0) versions). Note
+  // that it's possible that multiple container IDs map to the same
+  // pid if the removal of a symlink fails in '_cleanup()' and the pid
+  // is reused by a new container.
+  foreach (pid_t pid, linkers.keys()) {
+    list<ContainerID> containerIds = linkers.get(pid);
+    if (containerIds.size() > 1) {
+      foreach (const ContainerID& containerId, containerIds) {
+        const string linker = getSymlinkPath(containerId);
+
+        LOG(WARNING) << "Removing duplicated network namespace handle symlink '"
+                     << linker << "'";
+
+        Try<Nothing> rm = os::rm(linker);
+        if (rm.isError()) {
+          LOG(WARNING) << "Failed to remove duplicated network namespace "
+                       << "handle symlink '" << linker << "': " << rm.error();
+        }
+      }
+
+      linkers.remove(pid);
+    }
+  }
+
+  // Now, actually recover the isolator from slave's state.
+  foreach (const ContainerState& state, states) {
+    const ContainerID& containerId = state.container_id();
+    pid_t pid = state.pid();
+
+    VLOG(1) << "Recovering network isolator for container "
+            << containerId << " with pid " << pid;
+
+    if (!pids.contains(pid)) {
+      // There are two possible cases here:
+      //
+      // 1) The container was launched by the slave with network
+      //    isolation disabled, so the pid could not be found in the
+      //    device names in the system.
+      //
+      // 2) The container was launched by the slave with network
+      //    isolation enabled, but veth is removed (because the
+      //    corresponding container is destroyed), but the slave
+      //    restarts before it is able to write the sentinel file.
+      //
+      // In both cases, we treat the container as unmanaged. For case
+      // (2), it's safe to do so because the container has already
+      // been destroyed.
+      VLOG(1) << "Skipped recovery for container " << containerId
+              << " with pid " << pid << " as either it was not managed by "
+              << "the network isolator or it has already been destroyed";
+
+      unmanaged.insert(containerId);
+      continue;
+    }
+
+    Try<Info*> recover = _recover(pid);
+    if (recover.isError()) {
+      foreachvalue (Info* info, infos) {
+        delete info;
+      }
+
+      // Do not leave dangling pointers behind in 'infos'.
+      infos.clear();
+
+      return Failure(
+          "Failed to recover container " + stringify(containerId) +
+          " with pid " + stringify(pid) + ": " + recover.error());
+    }
+
+    infos[containerId] = recover.get();
+
+    // Remove the successfully recovered pid.
+    pids.erase(pid);
+  }
+
+  // Recover orphans. Known orphans will be destroyed by containerizer
+  // using the normal cleanup path (refer to MESOS-2367 for details).
+  // Unknown orphans will be cleaned up immediately. The recovery will
+  // fail if there is some unknown orphan that cannot be cleaned up.
+  vector<Info*> unknownOrphans;
+
+  foreach (pid_t pid, pids) {
+    Try<Info*> recover = _recover(pid);
+    if (recover.isError()) {
+      foreachvalue (Info* info, infos) {
+        delete info;
+      }
+
+      // Do not leave dangling pointers behind in 'infos'.
+      infos.clear();
+
+      foreach (Info* info, unknownOrphans) {
+        delete info;
+      }
+
+      return Failure(
+          "Failed to recover orphaned container with pid " +
+          stringify(pid) + ": " + recover.error());
+    }
+
+    if (linkers.get(pid).size() == 1) {
+      const ContainerID containerId = linkers.get(pid).front();
+      CHECK(!infos.contains(containerId));
+
+      if (orphans.contains(containerId)) {
+        infos[containerId] = recover.get();
+        continue;
+      }
+    }
+
+    unknownOrphans.push_back(recover.get());
+  }
+
+  foreach (Info* info, unknownOrphans) {
+    CHECK_SOME(info->pid);
+    pid_t pid = info->pid.get();
+
+    Option<ContainerID> containerId;
+    if (linkers.get(pid).size() == 1) {
+      containerId = linkers.get(pid).front();
+    }
+
+    // NOTE: If 'infos' is empty (means there is no regular container
+    // or known orphan), the '_cleanup' below will remove the ICMP and
+    // ARP packet filters on host eth0. This will cause subsequent
+    // calls to '_cleanup' for unknown orphans to fail. However, this
+    // is OK because when slave restarts and tries to recover again,
+    // it'll try to remove the remaining unknown orphans.
+    // TODO(jieyu): Consider call '_cleanup' for all the unknown
+    // orphans before returning even if error occurs.
+    Try<Nothing> cleanup = _cleanup(info, containerId);
+    if (cleanup.isError()) {
+      foreachvalue (Info* info, infos) {
+        delete info;
+      }
+
+      // Do not leave dangling pointers behind in 'infos'.
+      infos.clear();
+
+      // TODO(jieyu): Also delete 'info' in unknownOrphans. Notice
+      // that some 'info' in unknownOrphans might have already been
+      // deleted in '_cleanup' above.
+
+      return Failure(
+          "Failed to cleanup orphaned container with pid " +
+          stringify(pid) + ": " + cleanup.error());
+    }
+  }
+
+  // TODO(cwang): Consider removing unrecognized flow classifiers from
+  // host eth0 egress.
+
+  LOG(INFO) << "Network isolator recovery complete";
+
+  return Nothing();
+}
+
+
+// Rebuilds the 'Info' for the container whose process is 'pid' by
+// inspecting the IP filters currently installed on its veth and,
+// when 'flags.egress_unique_flow_per_container' is set, the flow
+// classifiers installed on host eth0 egress.
+//
+// On success, returns an 'Info' allocated with 'new'; ownership
+// passes to the caller. Returns an Error if the filters cannot be
+// read or if they are in a state this function does not expect.
+//
+// Side effects on success: the recovered ephemeral port range (if
+// any) is registered with 'ephemeralPortsAllocator', and the
+// recovered flow ID (if any) is erased from 'freeFlowIds'.
+Try<PortMappingIsolatorProcess::Info*>
+PortMappingIsolatorProcess::_recover(pid_t pid)
+{
+ // Get all the IP filters on veth.
+ // NOTE: We only look at veth devices to recover port ranges
+ // assigned to each container. That's the reason why we need to make
+ // sure that we add filters to veth before adding filters to host
+ // eth0 and host lo. Also, we need to make sure we remove filters
+ // from host eth0 and host lo before removing filters from veth.
+ Result<vector<ip::Classifier>> vethIngressClassifiers =
+ ip::classifiers(veth(pid), ingress::HANDLE);
+
+ if (vethIngressClassifiers.isError()) {
+ return Error(
+ "Failed to get all the IP filters on " + veth(pid) +
+ ": " + vethIngressClassifiers.error());
+ } else if (vethIngressClassifiers.isNone()) {
+ return Error(
+ "Failed to get all the IP filters on " + veth(pid) +
+ ": link does not exist");
+ }
+
+ // Maps a port range to the flow ID (the secondary part of the
+ // classid) found for it on host eth0 egress. It stays empty when
+ // per-container egress flows are disabled.
+ hashmap<PortRange, uint16_t> flowIds;
+
+ if (flags.egress_unique_flow_per_container) {
+ // Get all egress IP flow classifiers on eth0.
+ Result<vector<filter::Filter<ip::Classifier>>> eth0EgressFilters =
+ ip::filters(eth0, HOST_TX_FQ_CODEL_HANDLE);
+
+ if (eth0EgressFilters.isError()) {
+ return Error(
+ "Failed to get all the IP flow classifiers on " + eth0 +
+ ": " + eth0EgressFilters.error());
+ } else if (eth0EgressFilters.isNone()) {
+ return Error(
+ "Failed to get all the IP flow classifiers on " + eth0 +
+ ": link does not exist");
+ }
+
+ // Construct a port range to flow ID mapping from host eth0
+ // egress. This map will be used later.
+ foreach (const filter::Filter<ip::Classifier>& filter,
+ eth0EgressFilters.get()) {
+ const Option<PortRange> sourcePorts = filter.classifier.sourcePorts;
+ const Option<Handle> classid = filter.classid;
+
+ if (sourcePorts.isNone()) {
+ return Error("Missing source ports for filters on egress of " + eth0);
+ }
+
+ if (classid.isNone()) {
+ return Error("Missing classid for filters on egress of " + eth0);
+ }
+
+ if (flowIds.contains(sourcePorts.get())) {
+ return Error(
+ "Duplicated port range " + stringify(sourcePorts.get()) +
+ " detected on egress of " + eth0);
+ }
+
+ flowIds[sourcePorts.get()] = classid.get().secondary();
+ }
+ }
+
+ // State accumulated while walking the veth ingress classifiers
+ // below.
+ IntervalSet<uint16_t> nonEphemeralPorts;
+ IntervalSet<uint16_t> ephemeralPorts;
+ Option<uint16_t> flowId;
+
+ foreach (const ip::Classifier& classifier, vethIngressClassifiers.get()) {
+ const Option<PortRange> sourcePorts = classifier.sourcePorts;
+ const Option<PortRange> destinationPorts = classifier.destinationPorts;
+
+ // All the IP filters on veth used by us only have source ports.
+ if (sourcePorts.isNone() || destinationPorts.isSome()) {
+ return Error("Unexpected IP filter detected on " + veth(pid));
+ }
+
+ // Every port range of this container that has a flow on eth0
+ // egress must map to the same flow ID; the first one found is
+ // remembered in 'flowId'.
+ if (flowIds.contains(sourcePorts.get())) {
+ if (flowId.isNone()) {
+ flowId = flowIds.get(sourcePorts.get());
+ } else if (flowId != flowIds.get(sourcePorts.get())) {
+ return Error(
+ "A container is associated with multiple flows "
+ "on egress of " + eth0);
+ }
+ } else if (flowId.isSome()) {
+ // This is the case where some port range of a container is
+ // assigned to a flow while some isn't. This could happen if
+ // slave crashes while those filters are created. However, this
+ // is OK for us because packets by default go to the host flow.
+ LOG(WARNING) << "Container port range " << sourcePorts.get()
+ << " does not have flow id " << flowId.get()
+ << " assigned";
+ }
+
+ // Express the port range as an interval closed on both ends.
+ Interval<uint16_t> ports =
+ (Bound<uint16_t>::closed(sourcePorts.get().begin()),
+ Bound<uint16_t>::closed(sourcePorts.get().end()));
+
+ if (managedNonEphemeralPorts.contains(ports)) {
+ nonEphemeralPorts += ports;
+ } else if (ephemeralPortsAllocator->isManaged(ports)) {
+ // We have duplicate here because we have two IP filters with
+ // the same ephemeral port range (one for eth0 and one for lo).
+ // But we should never have two intersecting port ranges.
+ if (!ephemeralPorts.contains(ports) && ephemeralPorts.intersects(ports)) {
+ return Error("Unexpected intersected ephemeral port ranges");
+ }
+
+ ephemeralPorts += ports;
+ } else {
+ return Error("Unexpected IP filter detected on " + veth(pid));
+ }
+ }
+
+ Info* info = NULL;
+
+ if (ephemeralPorts.empty()) {
+ // NOTE: This is possible because the slave may crash while
+ // calling 'isolate()', leaving a partially isolated container. To
+ // clean up this partially isolated container, we still create an
+ // Info struct here and let the 'cleanup' function clean it up
+ // later.
+ LOG(WARNING) << "No ephemeral ports found for container with pid "
+ << stringify(pid) << ". This could happen if slave crashes "
+ << "while isolating a container";
+
+ info = new Info(nonEphemeralPorts, Interval<uint16_t>(), pid);
+ } else {
+ if (ephemeralPorts.intervalCount() != 1) {
+ return Error("Each container should have only one ephemeral port range");
+ }
+
+ // Tell the allocator that this ephemeral port range is used.
+ ephemeralPortsAllocator->allocate(*ephemeralPorts.begin());
+
+ info = new Info(nonEphemeralPorts, *ephemeralPorts.begin(), pid);
+
+ VLOG(1) << "Recovered network isolator for container with pid " << pid
+ << " non-ephemeral port ranges " << nonEphemeralPorts
+ << " and ephemeral port range " << *ephemeralPorts.begin();
+ }
+
+ // Take the recovered flow ID out of 'freeFlowIds' and record it on
+ // the recovered 'Info'.
+ if (flowId.isSome()) {
+ freeFlowIds.erase(flowId.get());
+ info->flowId = flowId.get();
+ }
+
+ // 'info' has been assigned in both branches above.
+ return CHECK_NOTNULL(info);
+}
+
+
+// Sets up the bookkeeping for a new container before it is launched.
+//
+// Checks that any non-ephemeral ports in the executor's resources
+// are managed by the slave, allocates an ephemeral port range for
+// the container, and records both in a new 'Info' stored in 'infos'.
+// Any 'ephemeral_ports' resource in 'executorInfo' is ignored (a
+// warning is logged).
+//
+// Returns a 'ContainerPrepareInfo' that carries the output of
+// 'scripts()' as a command and requests new network and mount
+// namespaces. Fails if the container is unmanaged, has already been
+// prepared, asks for ports not managed by the slave, or if no
+// ephemeral ports can be allocated. 'directory' and 'user' are not
+// used.
+Future<Option<ContainerPrepareInfo>> PortMappingIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user)
+{
+  if (unmanaged.contains(containerId)) {
+    return Failure("Asked to prepare an unmanaged container");
+  }
+
+  if (infos.contains(containerId)) {
+    return Failure("Container has already been prepared");
+  }
+
+  Resources resources(executorInfo.resources());
+
+  IntervalSet<uint16_t> nonEphemeralPorts;
+
+  if (resources.ports().isSome()) {
+    nonEphemeralPorts = getIntervalSet(resources.ports().get());
+
+    // Sanity check to make sure that the assigned non-ephemeral ports
+    // for the container are part of the non-ephemeral ports specified
+    // by the slave.
+    if (!managedNonEphemeralPorts.contains(nonEphemeralPorts)) {
+      return Failure(
+          "Some non-ephemeral ports specified in " +
+          stringify(nonEphemeralPorts) +
+          " are not managed by the slave");
+    }
+  }
+
+  // TODO(jieyu): For now, we simply ignore the 'ephemeral_ports'
+  // specified in the executor info. However, this behavior needs to
+  // be changed once the master can make default allocations for
+  // ephemeral ports.
+  if (resources.ephemeral_ports().isSome()) {
+    // NOTE: The trailing space after "container" keeps the container
+    // ID from running into the preceding word in the log line.
+    LOG(WARNING) << "Ignoring the specified ephemeral_ports '"
+                 << resources.ephemeral_ports().get()
+                 << "' for container " << containerId
+                 << " of executor " << executorInfo.executor_id();
+  }
+
+  // Allocate the ephemeral ports used by this container.
+  Try<Interval<uint16_t>> ephemeralPorts = ephemeralPortsAllocator->allocate();
+  if (ephemeralPorts.isError()) {
+    return Failure(
+        "Failed to allocate ephemeral ports: " + ephemeralPorts.error());
+  }
+
+  // The pid is not known yet; it is filled in by 'isolate()'.
+  infos[containerId] = new Info(nonEphemeralPorts, ephemeralPorts.get());
+
+  LOG(INFO) << "Using non-ephemeral ports " << nonEphemeralPorts
+            << " and ephemeral ports " << ephemeralPorts.get()
+            << " for container " << containerId << " of executor "
+            << executorInfo.executor_id();
+
+  ContainerPrepareInfo prepareInfo;
+  prepareInfo.add_commands()->set_value(scripts(infos[containerId]));
+
+  // NOTE: the port mapping isolator itself doesn't require mount
+  // namespace. However, if mount namespace is enabled because of
+  // other isolators, we need to set mount sharing accordingly for
+  // PORT_MAPPING_BIND_MOUNT_ROOT to avoid races described in
+  // MESOS-1558. So we turn on mount namespace here for consistency.
+  prepareInfo.set_namespaces(CLONE_NEWNET | CLONE_NEWNS);
+
+  return prepareInfo;
+}
+
+
+Future<Nothing> PortMappingIsolatorProcess::isolate(
+ const ContainerID& containerId,
+ pid_t pid)
+{
+ if (unmanaged.contains(containerId)) {
+ return Failure("Asked to isolate an unmanaged container");
+ }
+
+ if (!infos.contains(containerId)) {
+ return Failure("Unknown container");
+ }
+
+ Info* info = CHECK_NOTNULL(infos[containerId]);
+
+ if (info->pid.isSome()) {
+ return Failure("The container has already been isolated");
+ }
+
+ info->pid = pid;
+
+ if (flags.egress_unique_flow_per_container) {
+ info->flowId = getNextFlowId();
+ }
+
+ // Bind mount the network namespace handle of the process 'pid' to a
+ // directory to hold an extra reference to the network namespace
+ // which will be released in 'cleanup'. By holding the extra
+ // reference, the network namespace will not be destroyed even if
+ // the process 'pid' is gone, which allows us to explicitly control
+ // the network namespace life cycle.
+ const string source = path::join("/proc", stringify(pid), "ns", "net");
+ const string target = getNamespaceHandlePath(pid);
+
+ Try<Nothing> touch = os::touch(target);
+ if (touch.isError()) {
+ return Failure("Failed to create the bind mount point: " + touch.error());
+ }
+
+ Try<Nothing> mount = fs::mount(source, target, None(), MS_BIND, NULL);
+ if (mount.isError()) {
+ return Failure(
+ "Failed to mount the network namespace handle from '" +
+ source + "' to '" + target + "': " + mount.error());
+ }
+
+ LOG(INFO) << "Bind mounted '" << source << "' to '" << target
+ << "' for container " << containerId;
+
+ // Since 0.23.0, we create a symlink to the network namespace handle
+ // using the container ID. This serves two purposes. First, it
+ // allows us to recover the container ID later when slave restarts
+ // even if slave's checkpointed meta data is deleted. Second, it
+ // makes the debugging easier. See MESOS-2528 for details.
+ const string linker = getSymlinkPath(containerId);
+ Try<Nothing> symlink = ::fs::symlink(target, linker);
+ if (symlink.isError()) {
+ return Failure(
+ "Failed to symlink the network namespace handle '" +
+ linker + "' -> '" + target + "': " + symlink.error());
+ }
+
+ LOG(INFO) << "Created network namespace handle symlink '"
+ << linker << "' -> '" << target << "'";
+
+ // Create a virtual ethernet pair for this container.
+ Try<bool> createVethPair = link::create(veth(pid), eth0, pid);
+ if (createVethPair.isError()) {
+ return Failure(
+ "Failed to create virtual ethernet pair: " +
+ createVethPair.error());
+ }
+
+ // Disable IPv6 for veth as IPv6 packets won't be forwarded anyway.
+ const string disableIPv6 =
+ path::join("/proc/sys/net/ipv6/conf", veth(pid), "disable_ipv6");
+
+ if (os::exists(disableIPv6)) {
+ Try<Nothing> write = os::write(disableIPv6, "1");
+ if (write.isError()) {
+ return Failure(
+ "Failed to disable IPv6 for " + veth(pid) +
+ ": " + write.error());
+ }
+ }
+
+ // Sets the MAC address of veth to match the MAC address of the host
+ // public interface (eth0).
+ Try<bool> setVethMAC = link::setMAC(veth(pid), hostMAC);
+ if (setVethMAC.isError()) {
+ return Failure(
+ "Failed to set the MAC address of " + veth(pid) +
+ ": " + setVethMAC.error());
+ }
+
+ // Prepare the ingress queueing disciplines on veth.
+ Try<bool> createQdisc = ingress::create(veth(pid));
+ if (createQdisc.isError()) {
+ return Failure(
+ "Failed to create the ingress qdisc on " + veth(pid) +
+ ": " + createQdisc.error());
+ }
+
+ // Veth device should exist since we just created it.
+ CHECK(createQdisc.get());
+
+ // For each port range, add a set of IP packet filters to properly
+ // redirect IP traffic to/from containers.
+ foreach (const PortRange& range,
+ getPortRanges(info->nonEphemeralPorts + info->ephemeralPorts)) {
+ if (info->flowId.isSome()) {
+ LOG(INFO) << "Adding IP packet filters with ports " << range
+ << " with flow ID " << info->flowId.get()
+ << " for container " << containerId;
+ } else {
+ LOG(INFO) << "Adding IP packet filters with ports " << range
+ << " for container " << containerId;
+ }
+
+ Try<Nothing> add = addHostIPFilters(range, info->flowId, veth(pid));
+ if (add.isError()) {
+ return Failure(
+ "Failed to add IP packet filter with ports " +
+ stringify(range) + " for container with pid " +
+ stringify(pid) + ": " + add.error());
+ }
+ }
+
+ // Relay ICMP packets from veth of the container to host eth0.
+ Try<bool> icmpVethToEth0 = filter::icmp::create(
+ veth(pid),
+ ingress::HANDLE,
+ icmp::Classifier(None()),
+ Priority(ICMP_FILTER_PRIORITY, NORMAL),
+ action::Redirect(eth0));
+
+ if (icmpVethToEth0.isError()) {
+ ++metrics.adding_veth_icmp_filters_errors;
+
+ return Failure(
+ "Failed to create an ICMP packet filter from " + veth(pid) +
+ " to host " + eth0 + ": " + icmpVethToEth0.error());
+ } else if (!icmpVethToEth0.get()) {
+ ++metrics.adding_veth_icmp_filters_already_exist;
+
+ return Failure(
+ "The ICMP packet filter from " + veth(pid) +
+ " to host " + eth0 + " already exists");
+ }
+
+ // Relay ARP packets from veth of the container to host eth0.
+ Try<bool> arpVethToEth0 = filter::basic::create(
+ veth(pid),
+ ingress::HANDLE,
+ ETH_P_ARP,
+ Priority(ARP_FILTER_PRIORITY, NORMAL),
+ action::Redirect(eth0));
+
+ if (arpVethToEth0.isError()) {
+ ++metrics.adding_veth_arp_filters_errors;
+
+ return Failure(
+ "Failed to create an ARP packet filter from " + veth(pid) +
+ " to host " + eth0 + ": " + arpVethToEth0.error());
+ } else if (!arpVethToEth0.get()) {
+ ++metrics.adding_veth_arp_filters_already_exist;
+
+ return Failure(
+ "The ARP packet filter from " + veth(pid) +
+ " to host " + eth0 + " already exists");
+ }
+
+ // Setup filters for ICMP and ARP packets. We mirror ICMP and ARP
+ // packets from host eth0 to veths of all the containers. We also
+ // setup flow classifiers for host eth0 egress.
+ set<string> targets;
+ foreachvalue (Info* info, infos) {
+ if (info->pid.isSome()) {
+ targets.insert(veth(info->pid.get()));
+ }
+ }
+
+ if (targets.size() == 1) {
+ // We just create the first container in which case we should
+ // create filters for ICMP and ARP packets.
+
+ // Create a new ICMP filter on host eth0 ingress for mirroring
+ // packets from host eth0 to veth.
+ Try<bool> icmpEth0ToVeth = filter::icmp::create(
+ eth0,
+ ingress::HANDLE,
+ icmp::Classifier(hostIPNetwork.address()),
+ Priority(ICMP_FILTER_PRIORITY, NORMAL),
+ action::Mirror(targets));
+
+ if (icmpEth0ToVeth.isError()) {
+ ++metrics.adding_eth0_icmp_filters_errors;
+
+ return Failure(
+ "Failed to create an ICMP packet filter from host " + eth0 +
+ " to " + veth(pid) + ": " + icmpEth0ToVeth.error());
+ } else if (!icmpEth0ToVeth.get()) {
+ ++metrics.adding_eth0_icmp_filters_already_exist;
+
+ return Failure(
+ "The ICMP packet filter on host " + eth0 + " already exists");
+ }
+
+ // Create a new ARP filter on host eth0 ingress for mirroring
+ // packets from host eth0 to veth.
+ Try<bool> arpEth0ToVeth = filter::basic::create(
+ eth0,
+ ingress::HANDLE,
+ ETH_P_ARP,
+ Priority(ARP_FILTER_PRIORITY, NORMAL),
+ action::Mirror(targets));
+
+ if (arpEth0ToVeth.isError()) {
+ ++metrics.adding_eth0_arp_filters_errors;
+
+ return Failure(
+ "Failed to create an ARP packet filter from host " + eth0 +
+ " to " + veth(pid) + ": " + arpEth0ToVeth.error());
+ } else if (!arpEth0ToVeth.get()) {
+ ++metrics.adding_eth0_arp_filters_already_exist;
+
+ return Failure(
+ "The ARP packet filter on host " + eth0 + " already exists");
+ }
+
+ if (flags.egress_unique_flow_per_container) {
+ // Create a new ICMP filter on host eth0 egress for classifying
+ // packets into a reserved flow.
+ Try<bool> icmpEth0Egress = filter::icmp::create(
+ eth0,
+ HOST_TX_FQ_CODEL_HANDLE,
+ icmp::Classifier(None()),
+ Priority(ICMP_FILTER_PRIORITY, NORMAL),
+ Handle(HOST_TX_FQ_CODEL_HANDLE, ICMP_FLOWID));
+
+ if (icmpEth0Egress.isError()) {
+ ++metrics.adding_eth0_egress_filters_errors;
+
+ return Failure(
+ "Failed to create the ICMP flow classifier on host " +
+ eth0 + ": " + icmpEth0Egress.error());
+ } else if (!icmpEth0Egress.get()) {
+ ++metrics.adding_eth0_egress_filters_already_exist;
+
+ return Failure(
+ "The ICMP flow classifier on host " + eth0 + " already exists");
+ }
+
+ // Create a new ARP filter on host eth0 egress for classifying
+ // packets into a reserved flow.
+ Try<bool> arpEth0Egress = filter::basic::create(
+ eth0,
+ HOST_TX_FQ_CODEL_HANDLE,
+ ETH_P_ARP,
+ Priority(ARP_FILTER_PRIORITY, NORMAL),
+ Handle(HOST_TX_FQ_CODEL_HANDLE, ARP_FLOWID));
+
+ if (arpEth0Egress.isError()) {
+ ++metrics.adding_eth0_egress_filters_errors;
+
+ return Failure(
+ "Failed to create the ARP flow classifier on host " +
+ eth0 + ": " + arpEth0Egress.error());
+ } else if (!arpEth0Egress.get()) {
+ ++metrics.adding_eth0_egress_filters_already_exist;
+
+ return Failure(
+ "The ARP flow classifier on host " + eth0 + " already exists");
+ }
+
+ // Rest of the host packets go to a reserved flow.
+ Try<bool> defaultEth0Egress = filter::basic::create(
+ eth0,
+ HOST_TX_FQ_CODEL_HANDLE,
+ ETH_P_ALL,
+ Priority(DEFAULT_FILTER_PRIORITY, NORMAL),
+ Handle(HOST_TX_FQ_CODEL_HANDLE, HOST_FLOWID));
+
+ if (defaultEth0Egress.isError()) {
+ ++met
<TRUNCATED>
[07/12] mesos git commit: Relocated MesosContainerizer specific files
to the correct location.
Posted by ji...@apache.org.
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/filesystem/posix.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/filesystem/posix.cpp b/src/slave/containerizer/mesos/isolators/filesystem/posix.cpp
new file mode 100644
index 0000000..cc54e90
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/filesystem/posix.cpp
@@ -0,0 +1,294 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <list>
+#include <string>
+
+#include <stout/fs.hpp>
+#include <stout/os.hpp>
+#include <stout/path.hpp>
+
+#include "slave/paths.hpp"
+
+#include "slave/containerizer/mesos/isolators/filesystem/posix.hpp"
+
+using namespace process;
+
+using std::list;
+using std::string;
+
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Keeps a copy of the slave flags for later use by this isolator.
+PosixFilesystemIsolatorProcess::PosixFilesystemIsolatorProcess(
+    const Flags& _flags)
+  : flags(_flags)
+{
+}
+
+
+// Nothing is released explicitly here; the destructor body is empty.
+PosixFilesystemIsolatorProcess::~PosixFilesystemIsolatorProcess()
+{
+}
+
+
+// Factory: builds the isolator process from 'flags' and hands it to
+// a new 'MesosIsolator' wrapper, which is what gets returned.
+Try<Isolator*> PosixFilesystemIsolatorProcess::create(const Flags& flags)
+{
+  MesosIsolatorProcess* isolatorProcess =
+    new PosixFilesystemIsolatorProcess(flags);
+
+  return new MesosIsolator(
+      process::Owned<MesosIsolatorProcess>(isolatorProcess));
+}
+
+
+// Re-registers every container in 'states', remembering only its
+// sandbox directory. 'orphans' is not consulted.
+Future<Nothing> PosixFilesystemIsolatorProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  foreach (const ContainerState& recovered, states) {
+    Owned<Info> info(new Info(recovered.directory()));
+
+    infos.put(recovered.container_id(), info);
+  }
+
+  return Nothing();
+}
+
+
+// Registers a new container with its sandbox 'directory' and then
+// applies the executor's resources through 'update()'.
+//
+// Fails if the container is already known or if it specifies an
+// image. Once 'update()' succeeds the returned future carries
+// 'None()'. 'user' is not used.
+Future<Option<ContainerPrepareInfo>> PosixFilesystemIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user)
+{
+  const bool alreadyPrepared = infos.contains(containerId);
+  if (alreadyPrepared) {
+    return Failure("Container has already been prepared");
+  }
+
+  // A container that changes its filesystem root is rejected because
+  // the symlinks would become invalid in the new root.
+  if (executorInfo.has_container()) {
+    CHECK_EQ(executorInfo.container().type(), ContainerInfo::MESOS);
+
+    const bool hasImage = executorInfo.container().mesos().has_image();
+    if (hasImage) {
+      return Failure("Container root filesystems not supported");
+    }
+
+    // TODO(jieyu): Also return a failure if there exists images in
+    // the specified volumes.
+  }
+
+  Owned<Info> info(new Info(directory));
+  infos.put(containerId, info);
+
+  Future<Nothing> updated = update(containerId, executorInfo.resources());
+
+  return updated
+    .then([]() -> Future<Option<ContainerPrepareInfo>> { return None(); });
+}
+
+
+// This isolator does nothing at isolation time: success is reported
+// immediately and neither 'containerId' nor 'pid' is looked at.
+Future<Nothing> PosixFilesystemIsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  return Future<Nothing>(Nothing());
+}
+
+
+// This isolator does not report limitations: a default-constructed
+// future is returned and 'containerId' is not looked at.
+Future<ContainerLimitation> PosixFilesystemIsolatorProcess::watch(
+    const ContainerID& containerId)
+{
+  Future<ContainerLimitation> limitation;
+
+  return limitation;
+}
+
+
+// Reconciles the persistent volume symlinks in the container's
+// sandbox directory with 'resources'.
+//
+// Persistent volumes tracked for the container but absent from
+// 'resources' have their symlink removed. Persistent volumes in
+// 'resources' that are not yet tracked are chown'ed (recursively) to
+// the owner of the sandbox directory and symlinked into it. Volumes
+// whose container path contains a slash are skipped with a warning.
+// On success the tracked resources are replaced by 'resources'.
+//
+// Fails if the container is unknown, or if removing a symlink,
+// stat'ing the sandbox, changing ownership, resolving a path or
+// creating a symlink fails, or if an existing symlink resolves to a
+// different location than the volume it should point to.
+Future<Nothing> PosixFilesystemIsolatorProcess::update(
+    const ContainerID& containerId,
+    const Resources& resources)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  const Owned<Info>& info = infos[containerId];
+
+  // TODO(jieyu): Currently, we only allow non-nested relative
+  // container paths for volumes. This is enforced by the master. For
+  // those volumes, we create symlinks in the executor directory.
+  Resources current = info->resources;
+
+  // We first remove unneeded persistent volumes.
+  foreach (const Resource& resource, current.persistentVolumes()) {
+    // This is enforced by the master.
+    CHECK(resource.disk().has_volume());
+
+    // Ignore absolute and nested paths.
+    const string& containerPath = resource.disk().volume().container_path();
+    if (strings::contains(containerPath, "/")) {
+      LOG(WARNING) << "Skipping updating symlink for persistent volume "
+                   << resource << " of container " << containerId
+                   << " because the container path '" << containerPath
+                   << "' contains slash";
+      continue;
+    }
+
+    if (resources.contains(resource)) {
+      continue;
+    }
+
+    string link = path::join(info->directory, containerPath);
+
+    LOG(INFO) << "Removing symlink '" << link << "' for persistent volume "
+              << resource << " of container " << containerId;
+
+    Try<Nothing> rm = os::rm(link);
+    if (rm.isError()) {
+      // Include the underlying reason so the failure is diagnosable.
+      return Failure(
+          "Failed to remove the symlink for the unneeded "
+          "persistent volume at '" + link + "': " + rm.error());
+    }
+  }
+
+  // We then link additional persistent volumes.
+  foreach (const Resource& resource, resources.persistentVolumes()) {
+    // This is enforced by the master.
+    CHECK(resource.disk().has_volume());
+
+    // Ignore absolute and nested paths.
+    const string& containerPath = resource.disk().volume().container_path();
+    if (strings::contains(containerPath, "/")) {
+      LOG(WARNING) << "Skipping updating symlink for persistent volume "
+                   << resource << " of container " << containerId
+                   << " because the container path '" << containerPath
+                   << "' contains slash";
+      continue;
+    }
+
+    if (current.contains(resource)) {
+      continue;
+    }
+
+    string original = paths::getPersistentVolumePath(
+        flags.work_dir,
+        resource.role(),
+        resource.disk().persistence().id());
+
+    // Set the ownership of the persistent volume to match that of the
+    // sandbox directory.
+    //
+    // NOTE: Currently, persistent volumes in Mesos are exclusive,
+    // meaning that if a persistent volume is used by one task or
+    // executor, it cannot be concurrently used by other task or
+    // executor. But if we allow multiple executors to use same
+    // persistent volume at the same time in the future, the ownership
+    // of the persistent volume may conflict here.
+    //
+    // TODO(haosdent): Consider letting the frameworks specify the
+    // user/group of the persistent volumes.
+    struct stat s;
+    if (::stat(info->directory.c_str(), &s) < 0) {
+      return Failure(
+          "Failed to get ownership for '" + info->directory +
+          "': " + strerror(errno));
+    }
+
+    LOG(INFO) << "Changing the ownership of the persistent volume at '"
+              << original << "' with uid " << s.st_uid
+              << " and gid " << s.st_gid;
+
+    Try<Nothing> chown = os::chown(s.st_uid, s.st_gid, original, true);
+    if (chown.isError()) {
+      return Failure(
+          "Failed to change the ownership of the persistent volume at '" +
+          original + "' with uid " + stringify(s.st_uid) +
+          " and gid " + stringify(s.st_gid) + ": " + chown.error());
+    }
+
+    string link = path::join(info->directory, containerPath);
+
+    if (os::exists(link)) {
+      // NOTE: This is possible because 'info->resources' will be
+      // reset when slave restarts and recovers. When the slave calls
+      // 'containerizer->update' after the executor re-registers,
+      // we'll try to relink all the already symlinked volumes.
+      Result<string> realpath = os::realpath(link);
+      if (!realpath.isSome()) {
+        return Failure(
+            "Failed to get the realpath of symlink '" + link + "': " +
+            (realpath.isError() ? realpath.error() : "No such directory"));
+      }
+
+      // A sanity check to make sure the target of the symlink does
+      // not change. In fact, this is not supposed to happen.
+      // NOTE: Here, we compare the realpaths because 'original' might
+      // contain symbolic links.
+      Result<string> _original = os::realpath(original);
+      if (!_original.isSome()) {
+        return Failure(
+            "Failed to get the realpath of volume '" + original + "': " +
+            (_original.isError() ? _original.error() : "No such directory"));
+      }
+
+      if (realpath.get() != _original.get()) {
+        // 'realpath' is where the existing symlink resolves to and
+        // '_original' is where the volume (the new target) resolves
+        // to; report them in that order.
+        return Failure(
+            "The existing symlink '" + link + "' points to '" +
+            realpath.get() + "' and the new target is '" +
+            _original.get() + "'");
+      }
+    } else {
+      LOG(INFO) << "Adding symlink from '" << original << "' to '"
+                << link << "' for persistent volume " << resource
+                << " of container " << containerId;
+
+      Try<Nothing> symlink = ::fs::symlink(original, link);
+      if (symlink.isError()) {
+        // Include the underlying reason so the failure is diagnosable.
+        return Failure(
+            "Failed to symlink persistent volume from '" +
+            original + "' to '" + link + "': " + symlink.error());
+      }
+    }
+  }
+
+  // Store the updated resources.
+  info->resources = resources;
+
+  return Nothing();
+}
+
+
+// No usage is gathered by this isolator: a default-constructed
+// 'ResourceStatistics' is returned and 'containerId' is not looked
+// at.
+Future<ResourceStatistics> PosixFilesystemIsolatorProcess::usage(
+    const ContainerID& containerId)
+{
+  return Future<ResourceStatistics>(ResourceStatistics());
+}
+
+
+// Forgets the container. No explicit cleanup of the symlinks for
+// persistent resources is done here because they will be removed
+// when the work directory is GC'ed.
+Future<Nothing> PosixFilesystemIsolatorProcess::cleanup(
+    const ContainerID& containerId)
+{
+  infos.erase(containerId);
+
+  return Future<Nothing>(Nothing());
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/filesystem/posix.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/filesystem/posix.hpp b/src/slave/containerizer/mesos/isolators/filesystem/posix.hpp
new file mode 100644
index 0000000..61b81dd
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/filesystem/posix.hpp
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __POSIX_FILESYSTEM_ISOLATOR_HPP__
+#define __POSIX_FILESYSTEM_ISOLATOR_HPP__
+
+#include <mesos/resources.hpp>
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Isolator process that tracks, per container, a directory and a
+// set of resources (see 'Info' below). Instances are obtained
+// through 'create()'; the constructor is private.
+class PosixFilesystemIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+ // Factory for this isolator, configured from the slave 'flags'.
+ static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+ virtual ~PosixFilesystemIsolatorProcess();
+
+ // Called with the states of the containers to recover and the set
+ // of orphaned container IDs.
+ virtual process::Future<Nothing> recover(
+ const std::list<mesos::slave::ContainerState>& states,
+ const hashset<ContainerID>& orphans);
+
+ // Called before a container is launched, with the executor
+ // description, a directory and an optional user.
+ virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+ const ContainerID& containerId,
+ const ExecutorInfo& executorInfo,
+ const std::string& directory,
+ const Option<std::string>& user);
+
+ // Called with the pid associated with the container.
+ virtual process::Future<Nothing> isolate(
+ const ContainerID& containerId,
+ pid_t pid);
+
+ // Returns a future for a limitation of the given container.
+ virtual process::Future<mesos::slave::ContainerLimitation> watch(
+ const ContainerID& containerId);
+
+ // Called with the new set of resources for the container.
+ virtual process::Future<Nothing> update(
+ const ContainerID& containerId,
+ const Resources& resources);
+
+ // Returns resource statistics for the container.
+ virtual process::Future<ResourceStatistics> usage(
+ const ContainerID& containerId);
+
+ // Called when the container is being cleaned up.
+ virtual process::Future<Nothing> cleanup(
+ const ContainerID& containerId);
+
+private:
+ // Private: use 'create()' to obtain an instance.
+ PosixFilesystemIsolatorProcess(const Flags& flags);
+
+ // Copy of the flags this isolator was constructed with.
+ const Flags flags;
+
+ // Per-container state.
+ struct Info
+ {
+ explicit Info(const std::string& _directory)
+ : directory(_directory) {}
+
+ // Directory associated with the container; fixed at construction.
+ const std::string directory;
+
+ // Track resources so we can unlink unneeded persistent volumes.
+ Resources resources;
+ };
+
+ // State of the containers known to this isolator, keyed by
+ // container ID.
+ hashmap<ContainerID, process::Owned<Info>> infos;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __POSIX_FILESYSTEM_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/filesystem/shared.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/filesystem/shared.cpp b/src/slave/containerizer/mesos/isolators/filesystem/shared.cpp
new file mode 100644
index 0000000..4702211
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/filesystem/shared.cpp
@@ -0,0 +1,268 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <set>
+
+#include "linux/ns.hpp"
+
+#include "slave/containerizer/mesos/isolators/filesystem/shared.hpp"
+
+using namespace process;
+
+using std::list;
+using std::set;
+using std::string;
+
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Keeps a copy of the slave flags; no other setup is performed.
+SharedFilesystemIsolatorProcess::SharedFilesystemIsolatorProcess(
+    const Flags& _flags)
+  : flags(_flags)
+{
+}
+
+
+// Nothing to tear down explicitly.
+SharedFilesystemIsolatorProcess::~SharedFilesystemIsolatorProcess()
+{
+}
+
+
+// Factory for the shared filesystem isolator.
+//
+// Returns an Error if the current user cannot be determined or is not
+// "root"; otherwise returns a new MesosIsolator wrapping a
+// SharedFilesystemIsolatorProcess built from 'flags'.
+Try<Isolator*> SharedFilesystemIsolatorProcess::create(const Flags& flags)
+{
+  const Result<string> currentUser = os::user();
+
+  if (currentUser.isError()) {
+    return Error("Failed to determine user: " + currentUser.error());
+  }
+
+  if (currentUser.isNone()) {
+    return Error("Failed to determine user: username not found");
+  }
+
+  // Bind mounting into a container's mount namespace needs root.
+  if (currentUser.get() != "root") {
+    return Error("SharedFilesystemIsolator requires root privileges");
+  }
+
+  process::Owned<MesosIsolatorProcess> isolator(
+      new SharedFilesystemIsolatorProcess(flags));
+
+  return new MesosIsolator(isolator);
+}
+
+
+// Recovery hook. Both arguments are ignored and the call always
+// succeeds.
+Future<Nothing> SharedFilesystemIsolatorProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  // This isolator keeps no per-container state, does not monitor
+  // filesystem usage and performs no action on cleanup, so there is
+  // nothing to rebuild here.
+  return Future<Nothing>(Nothing());
+}
+
+
+// Quotes 'path' so it can be interpolated into a shell command line
+// as exactly one word: the value is wrapped in single quotes and each
+// embedded single quote is emitted as '\''.
+static string shellQuote(const string& path)
+{
+  string quoted = "'";
+
+  for (string::size_type i = 0; i < path.size(); i++) {
+    if (path[i] == '\'') {
+      quoted += "'\\''";
+    } else {
+      quoted += path[i];
+    }
+  }
+
+  quoted += "'";
+
+  return quoted;
+}
+
+
+// Validates the volumes of a MESOS container and turns each of them
+// into a bind mount command to be run in the container's own mount
+// namespace.
+//
+// Returns:
+//   - None if the executor has no ContainerInfo (nothing to do).
+//   - A ContainerPrepareInfo requesting CLONE_NEWNS with one
+//     "mount -n --bind" command per volume.
+//   - A Failure if the container type is not MESOS, a container path
+//     does not exist on the host, a volume has no host path, a volume
+//     would mask (or be masked by) the sandbox or another volume, a
+//     relative host path contains "." or ".." components, or a host
+//     path cannot be created / adjusted / found.
+//
+// The 'user' argument is not used.
+Future<Option<ContainerPrepareInfo>> SharedFilesystemIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user)
+{
+  if (executorInfo.has_container() &&
+      executorInfo.container().type() != ContainerInfo::MESOS) {
+    return Failure("Can only prepare filesystem for a MESOS container");
+  }
+
+  LOG(INFO) << "Preparing shared filesystem for container: "
+            << stringify(containerId);
+
+  if (!executorInfo.has_container()) {
+    // We don't consider this an error, there's just nothing to do so
+    // we return None.
+
+    return None();
+  }
+
+  // We don't support mounting to a container path which is a parent
+  // to another container path as this can mask entries. We'll keep
+  // track of all container paths so we can check this.
+  set<string> containerPaths;
+  containerPaths.insert(directory);
+
+  ContainerPrepareInfo prepareInfo;
+  prepareInfo.set_namespaces(CLONE_NEWNS);
+
+  foreach (const Volume& volume, executorInfo.container().volumes()) {
+    // Because the filesystem is shared we require the container path
+    // already exist, otherwise containers can create arbitrary paths
+    // outside their sandbox.
+    if (!os::exists(volume.container_path())) {
+      return Failure("Volume with container path '" +
+                     volume.container_path() +
+                     "' must exist on host for shared filesystem isolator");
+    }
+
+    // Host path must be provided.
+    if (!volume.has_host_path()) {
+      return Failure("Volume with container path '" +
+                     volume.container_path() +
+                     "' must specify host path for shared filesystem isolator");
+    }
+
+    // Check we won't mask another volume.
+    // NOTE: Assuming here that the container path is absolute, see
+    // Volume protobuf.
+    // TODO(idownes): This test is unnecessarily strict and could be
+    // relaxed if mounts could be re-ordered.
+    foreach (const string& containerPath, containerPaths) {
+      if (strings::startsWith(volume.container_path(), containerPath)) {
+        return Failure("Cannot mount volume to '" +
+                       volume.container_path() +
+                       "' because it is under volume '" +
+                       containerPath +
+                       "'");
+      }
+
+      if (strings::startsWith(containerPath, volume.container_path())) {
+        return Failure("Cannot mount volume to '" +
+                       containerPath +
+                       "' because it is under volume '" +
+                       volume.container_path() +
+                       "'");
+      }
+    }
+    containerPaths.insert(volume.container_path());
+
+    // A relative host path will be created in the container's work
+    // directory, otherwise check it already exists.
+    string hostPath;
+    if (!strings::startsWith(volume.host_path(), "/")) {
+      hostPath = path::join(directory, volume.host_path());
+
+      // Do not support any relative components in the resulting path.
+      // There should not be any links in the work directory to
+      // resolve. A separator is appended before searching so that a
+      // trailing "." or ".." component (e.g., a host path of "..") is
+      // rejected as well; otherwise such a path would resolve outside
+      // of the work directory.
+      const string terminated = hostPath + "/";
+      if (strings::contains(terminated, "/./") ||
+          strings::contains(terminated, "/../")) {
+        return Failure("Relative host path '" +
+                       hostPath +
+                       "' cannot contain relative components");
+      }
+
+      Try<Nothing> mkdir = os::mkdir(hostPath, true);
+      if (mkdir.isError()) {
+        return Failure("Failed to create host_path '" +
+                       hostPath +
+                       "' for mount to '" +
+                       volume.container_path() +
+                       "': " +
+                       mkdir.error());
+      }
+
+      // Set the ownership and permissions to match the container path
+      // as these are inherited from host path on bind mount.
+      struct stat stat;
+      if (::stat(volume.container_path().c_str(), &stat) < 0) {
+        return Failure("Failed to get permissions on '" +
+                       volume.container_path() + "'" +
+                       ": " + strerror(errno));
+      }
+
+      Try<Nothing> chmod = os::chmod(hostPath, stat.st_mode);
+      if (chmod.isError()) {
+        return Failure("Failed to chmod hostPath '" +
+                       hostPath +
+                       "': " +
+                       chmod.error());
+      }
+
+      Try<Nothing> chown = os::chown(stat.st_uid, stat.st_gid, hostPath, false);
+      if (chown.isError()) {
+        return Failure("Failed to chown hostPath '" +
+                       hostPath +
+                       "': " +
+                       chown.error());
+      }
+    } else {
+      hostPath = volume.host_path();
+
+      if (!os::exists(hostPath)) {
+        return Failure("Volume with container path '" +
+                       volume.container_path() +
+                       "' must have host path '" +
+                       hostPath +
+                       "' present on host for shared filesystem isolator");
+      }
+    }
+
+    // Both paths originate from the framework-supplied ExecutorInfo,
+    // so quote them: the command is a shell command line and an
+    // unquoted path containing whitespace or shell metacharacters
+    // would be split or interpreted by the shell.
+    prepareInfo.add_commands()->set_value(
+        "mount -n --bind " +
+        shellQuote(hostPath) +
+        " " +
+        shellQuote(volume.container_path()));
+  }
+
+  return prepareInfo;
+}
+
+
+// Isolation hook; both arguments are ignored.
+Future<Nothing> SharedFilesystemIsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  // Nothing to do at this point: the isolation itself happens when
+  // the mount namespace is unshared.
+  return Future<Nothing>(Nothing());
+}
+
+
+// Limitation hook; no limitation is ever reported for now.
+Future<ContainerLimitation> SharedFilesystemIsolatorProcess::watch(
+    const ContainerID& containerId)
+{
+  // A default-constructed future is handed back as-is.
+  Future<ContainerLimitation> limitation;
+
+  return limitation;
+}
+
+
+// Resource update hook; both arguments are ignored.
+Future<Nothing> SharedFilesystemIsolatorProcess::update(
+    const ContainerID& containerId,
+    const Resources& resources)
+{
+  // This isolator enforces no resource limits, so an update has
+  // nothing to apply.
+  return Future<Nothing>(Nothing());
+}
+
+
+// Usage hook; always yields default-constructed statistics.
+Future<ResourceStatistics> SharedFilesystemIsolatorProcess::usage(
+    const ContainerID& containerId)
+{
+  // No usage is gathered by this isolator.
+  ResourceStatistics statistics;
+
+  return statistics;
+}
+
+
+// Cleanup hook; the container id is ignored.
+Future<Nothing> SharedFilesystemIsolatorProcess::cleanup(
+    const ContainerID& containerId)
+{
+  // Cleanup of mounts is done automatically by the kernel when the
+  // mount namespace is destroyed after the last process terminates.
+  return Future<Nothing>(Nothing());
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/filesystem/shared.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/filesystem/shared.hpp b/src/slave/containerizer/mesos/isolators/filesystem/shared.hpp
new file mode 100644
index 0000000..3a2f7db
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/filesystem/shared.hpp
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SHARED_FILESYSTEM_ISOLATOR_HPP__
+#define __SHARED_FILESYSTEM_ISOLATOR_HPP__
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// This isolator is to be used when all containers share the host's
+// filesystem. It supports bind mounting "volumes" from the host
+// into each container's mount namespace. In particular, this can be
+// used to give each container a "private" system directory, such as
+// /tmp and /var/tmp.
+class SharedFilesystemIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+ // Factory. The constructor is private, so instances are created
+ // through this function.
+ static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+ virtual ~SharedFilesystemIsolatorProcess();
+
+ virtual process::Future<Nothing> recover(
+ const std::list<mesos::slave::ContainerState>& states,
+ const hashset<ContainerID>& orphans);
+
+ virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+ const ContainerID& containerId,
+ const ExecutorInfo& executorInfo,
+ const std::string& directory,
+ const Option<std::string>& user);
+
+ virtual process::Future<Nothing> isolate(
+ const ContainerID& containerId,
+ pid_t pid);
+
+ virtual process::Future<mesos::slave::ContainerLimitation> watch(
+ const ContainerID& containerId);
+
+ virtual process::Future<Nothing> update(
+ const ContainerID& containerId,
+ const Resources& resources);
+
+ virtual process::Future<ResourceStatistics> usage(
+ const ContainerID& containerId);
+
+ virtual process::Future<Nothing> cleanup(
+ const ContainerID& containerId);
+
+private:
+ SharedFilesystemIsolatorProcess(const Flags& flags);
+
+ // Copy of the slave flags taken at construction time.
+ const Flags flags;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __SHARED_FILESYSTEM_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/namespaces/pid.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/namespaces/pid.cpp b/src/slave/containerizer/mesos/isolators/namespaces/pid.cpp
new file mode 100644
index 0000000..d41efe6
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/namespaces/pid.cpp
@@ -0,0 +1,266 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <sys/mount.h>
+
+#include <list>
+#include <set>
+#include <string>
+
+#include <stout/os.hpp>
+
+#include <stout/os/exists.hpp>
+#include <stout/os/ls.hpp>
+#include <stout/os/stat.hpp>
+
+#include "linux/fs.hpp"
+#include "linux/ns.hpp"
+
+#include "slave/containerizer/mesos/isolators/namespaces/pid.hpp"
+
+using namespace process;
+
+using std::list;
+using std::set;
+using std::string;
+
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// The root directory where we bind mount all the namespace handles;
+// each container gets one entry named after its container id.
+static const char PID_NS_BIND_MOUNT_ROOT[] = "/var/run/mesos/pidns";
+
+
+// The empty directory that we'll use to mask the namespace handles
+// inside each container: it is bind mounted over the root directory
+// above, which ensures a container cannot determine the namespace of
+// another container.
+static const char PID_NS_BIND_MOUNT_MASK_DIR[] = "/var/empty/mesos";
+
+
+// Builds "/proc/<pid>/ns/pid", the path of the pid namespace file of
+// the given process.
+inline string nsProcFile(pid_t pid)
+{
+  const string process = stringify(pid);
+
+  return path::join("/proc", process, "ns", "pid");
+}
+
+
+// Builds the path, below PID_NS_BIND_MOUNT_ROOT, of the additional
+// reference that is created for a container's pid namespace. The
+// entry is named after the stringified container id.
+inline string nsExtraReference(const ContainerID& containerId)
+{
+  const string entry = stringify(containerId);
+
+  return path::join(PID_NS_BIND_MOUNT_ROOT, entry);
+}
+
+
+// Factory for the pid namespace isolator.
+//
+// Returns an Error unless the effective uid is 0 and the kernel
+// reports support for pid namespaces. The two directories used for
+// the namespace bind mounts (PID_NS_BIND_MOUNT_ROOT and
+// PID_NS_BIND_MOUNT_MASK_DIR) are created here; failing to create
+// either one is reported as an Error. The 'flags' argument is not
+// used.
+Try<Isolator*> NamespacesPidIsolatorProcess::create(const Flags& flags)
+{
+  // Check for root permission.
+  if (geteuid() != 0) {
+    return Error("The pid namespace isolator requires root permissions");
+  }
+
+  // Verify that pid namespaces are available on this kernel.
+  if (ns::namespaces().count("pid") == 0) {
+    return Error("Pid namespaces are not supported by this kernel");
+  }
+
+  // Create the directory where bind mounts of the pid namespace will
+  // be placed.
+  Try<Nothing> mkdir = os::mkdir(PID_NS_BIND_MOUNT_ROOT);
+  if (mkdir.isError()) {
+    return Error(
+        "Failed to create the bind mount root directory at " +
+        string(PID_NS_BIND_MOUNT_ROOT) + ": " + mkdir.error());
+  }
+
+  // Create the empty directory that will be used to mask the bind
+  // mounts inside each container.
+  mkdir = os::mkdir(PID_NS_BIND_MOUNT_MASK_DIR);
+  if (mkdir.isError()) {
+    return Error(
+        "Failed to create the bind mount mask directory at " +
+        string(PID_NS_BIND_MOUNT_MASK_DIR) + ": " + mkdir.error());
+  }
+
+  return new MesosIsolator(Owned<MesosIsolatorProcess>(
+      new NamespacesPidIsolatorProcess()));
+}
+
+
+// Looks up the pid namespace recorded for 'containerId'.
+//
+// Returns None if no extra reference exists for the container;
+// otherwise returns the outcome of reading the inode of that
+// reference (the inode, or the error of the stat call).
+Result<ino_t> NamespacesPidIsolatorProcess::getNamespace(
+    const ContainerID& containerId)
+{
+  const string reference = nsExtraReference(containerId);
+
+  if (!os::exists(reference)) {
+    return None();
+  }
+
+  return os::stat::inode(reference);
+}
+
+
+// Removes the namespace references of containers that are neither
+// recovered ('states') nor known orphans ('orphans').
+//
+// Fails only if the bind mount root directory cannot be listed.
+Future<Nothing> NamespacesPidIsolatorProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  // Ids of the containers that were recovered; their references must
+  // be left alone.
+  hashset<ContainerID> known;
+  foreach (const ContainerState& state, states) {
+    known.insert(state.container_id());
+  }
+
+  // Clean up any unknown orphaned bind mounts and empty files. Known
+  // orphan bind mounts and empty files will be destroyed by the
+  // containerizer using the normal cleanup path. See MESOS-2367 for
+  // details.
+  Try<list<string>> references = os::ls(PID_NS_BIND_MOUNT_ROOT);
+  if (references.isError()) {
+    return Failure("Failed to list existing containers in '" +
+                   string(PID_NS_BIND_MOUNT_ROOT) + "': " +
+                   references.error());
+  }
+
+  foreach (const string& reference, references.get()) {
+    // Every entry is named after the container it belongs to.
+    ContainerID containerId;
+    containerId.set_value(reference);
+
+    if (known.contains(containerId) || orphans.contains(containerId)) {
+      continue;
+    }
+
+    cleanup(containerId);
+  }
+
+  return Nothing();
+}
+
+
+// Requests new pid and mount namespaces for the container and queues
+// the three mount commands that hide other containers' namespace
+// handles and give the container its own /proc. None of the
+// arguments are used.
+Future<Option<ContainerPrepareInfo>> NamespacesPidIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user)
+{
+  ContainerPrepareInfo prepareInfo;
+  prepareInfo.set_namespaces(CLONE_NEWPID | CLONE_NEWNS);
+
+  // Mask the bind mount root directory in each container so
+  // containers cannot see the namespace bind mount of other
+  // containers.
+  const string maskCommand =
+    "mount -n --bind " + string(PID_NS_BIND_MOUNT_MASK_DIR) +
+    " " + string(PID_NS_BIND_MOUNT_ROOT);
+
+  prepareInfo.add_commands()->set_value(maskCommand);
+
+  // Mount /proc for the container's pid namespace to show the
+  // container's pids (and other /proc files), not the parent's. We
+  // first recursively make the mount private because /proc is usually
+  // marked explicitly as shared (see /proc/self/mountinfo) and
+  // changes would propagate to the parent's /proc mount otherwise. We
+  // then mount /proc with the standard options. This technique was
+  // taken from unshare.c in utils-linux for --mount-proc. We use the
+  // -n flag so the mount is not added to the mtab where it will not
+  // be correctly removed when the namespace terminates.
+  const string privateCommand = "mount none /proc --make-private -o rec";
+  const string procCommand =
+    "mount -n -t proc proc /proc -o nosuid,noexec,nodev";
+
+  prepareInfo.add_commands()->set_value(privateCommand);
+  prepareInfo.add_commands()->set_value(procCommand);
+
+  return prepareInfo;
+}
+
+
+// Pins the pid namespace of 'pid' by bind mounting its namespace file
+// onto the container's extra reference.
+//
+// Fails if the mount target cannot be created or the bind mount
+// itself fails.
+Future<Nothing> NamespacesPidIsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  const string handle = nsProcFile(pid);
+  const string reference = nsExtraReference(containerId);
+
+  // Create a bind mount of the pid namespace so we can control the
+  // lifetime of the pid namespace. This lets us identify the
+  // container's pid namespace, even if the leading pid has exited.
+  // The mount target has to exist first, hence the touch.
+  Try<Nothing> created = os::touch(reference);
+  if (created.isError()) {
+    return Failure("Failed to create bind mount target: " + created.error());
+  }
+
+  Try<Nothing> mounted = fs::mount(handle, reference, None(), MS_BIND, NULL);
+  if (mounted.isError()) {
+    return Failure(
+        "Failed to mount pid namespace handle from " +
+        handle + " to " + reference + ": " + mounted.error());
+  }
+
+  return Nothing();
+}
+
+
+// Limitation hook; this isolator never reports a limitation and
+// simply hands back a default-constructed future.
+Future<ContainerLimitation> NamespacesPidIsolatorProcess::watch(
+    const ContainerID& containerId)
+{
+  Future<ContainerLimitation> limitation;
+
+  return limitation;
+}
+
+
+// Resource update hook; both arguments are ignored and the call
+// always succeeds.
+Future<Nothing> NamespacesPidIsolatorProcess::update(
+    const ContainerID& containerId,
+    const Resources& resources)
+{
+  return Future<Nothing>(Nothing());
+}
+
+
+// Usage hook; always yields default-constructed statistics.
+Future<ResourceStatistics> NamespacesPidIsolatorProcess::usage(
+    const ContainerID& containerId)
+{
+  ResourceStatistics statistics;
+
+  return statistics;
+}
+
+
+// An old glibc might not have this symbol.
+#ifndef MNT_DETACH
+#define MNT_DETACH 2
+#endif
+
+
+// Drops the extra reference to the container's pid namespace, if one
+// exists. This is best effort: the results of the unmount and of the
+// removal are not checked and the call always succeeds.
+Future<Nothing> NamespacesPidIsolatorProcess::cleanup(
+    const ContainerID& containerId)
+{
+  const string reference = nsExtraReference(containerId);
+
+  if (!os::exists(reference)) {
+    return Nothing();
+  }
+
+  // We don't expect anyone to have a reference to the mount but do a
+  // lazy umount in case. We do not want to force the umount; it
+  // will not cause an issue if this umount is delayed.
+  fs::unmount(reference, MNT_DETACH);
+
+  // This will fail if the unmount hasn't completed yet but this
+  // only leaks a uniquely named empty file that will be cleaned up
+  // as an orphan on recovery.
+  os::rm(reference);
+
+  return Nothing();
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/namespaces/pid.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/namespaces/pid.hpp b/src/slave/containerizer/mesos/isolators/namespaces/pid.hpp
new file mode 100644
index 0000000..87270d0
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/namespaces/pid.hpp
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NAMESPACES_PID_ISOLATOR_HPP__
+#define __NAMESPACES_PID_ISOLATOR_HPP__
+
+#include <sys/types.h>
+
+#include <string>
+
+#include <stout/result.hpp>
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// This isolator requests new pid and mount namespaces for a container
+// through the ContainerPrepareInfo returned by prepare(). It also
+// keeps track of a container's pid namespace through a bind mount,
+// which is exposed by getNamespace().
+class NamespacesPidIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+ static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+ // Return the pid namespace of the container. Returns None if the
+ // container was not created in a separate pid namespace, i.e.,
+ // processes are in the same namespace as the slave. This is used by
+ // the LinuxLauncher to determine if it can kill the leading process
+ // in the container and let the kernel kill the remaining processes.
+ // A container may not have a pid namespace if it was created
+ // without the namespaces/pid isolator and the slave was
+ // subsequently restarted with namespaces/pid enabled.
+ static Result<ino_t> getNamespace(const ContainerID& container);
+
+ NamespacesPidIsolatorProcess() {}
+
+ virtual ~NamespacesPidIsolatorProcess() {}
+
+ virtual process::Future<Nothing> recover(
+ const std::list<mesos::slave::ContainerState>& states,
+ const hashset<ContainerID>& orphans);
+
+ virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+ const ContainerID& containerId,
+ const ExecutorInfo& executorInfo,
+ const std::string& directory,
+ const Option<std::string>& user);
+
+ virtual process::Future<Nothing> isolate(
+ const ContainerID& containerId,
+ pid_t pid);
+
+ virtual process::Future<mesos::slave::ContainerLimitation> watch(
+ const ContainerID& containerId);
+
+ virtual process::Future<Nothing> update(
+ const ContainerID& containerId,
+ const Resources& resources);
+
+ virtual process::Future<ResourceStatistics> usage(
+ const ContainerID& containerId);
+
+ virtual process::Future<Nothing> cleanup(
+ const ContainerID& containerId);
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __NAMESPACES_PID_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/network/helper.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/network/helper.cpp b/src/slave/containerizer/mesos/isolators/network/helper.cpp
new file mode 100644
index 0000000..d851195
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/network/helper.cpp
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stout/none.hpp>
+#include <stout/subcommand.hpp>
+
+#include "slave/containerizer/mesos/isolators/network/port_mapping.hpp"
+
+using namespace mesos::internal::slave;
+
+
+// Entry point of the network isolator helper: hands the command line
+// to Subcommand::dispatch() together with the two port mapping
+// subcommands and returns whatever status dispatch() reports.
+int main(int argc, char** argv)
+{
+  PortMappingUpdate* update = new PortMappingUpdate();
+  PortMappingStatistics* statistics = new PortMappingStatistics();
+
+  const int status =
+    Subcommand::dispatch(None(), argc, argv, update, statistics);
+
+  return status;
+}
[12/12] mesos git commit: Relocated MesosContainerizer specific files
to the correct location.
Posted by ji...@apache.org.
Relocated MesosContainerizer specific files to the correct location.
Review: https://reviews.apache.org/r/39360
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/9a722d74
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/9a722d74
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/9a722d74
Branch: refs/heads/master
Commit: 9a722d742b2897ea3c6d5574273010c3b1546a58
Parents: 974906b
Author: Gilbert Song <gi...@mesoshere.io>
Authored: Mon Oct 26 10:52:02 2015 -0700
Committer: Jie Yu <yu...@gmail.com>
Committed: Mon Oct 26 10:52:07 2015 -0700
----------------------------------------------------------------------
src/Makefile.am | 118 +-
src/docker/docker.cpp | 4 +-
src/examples/test_isolator_module.cpp | 2 +-
src/slave/containerizer/docker.cpp | 2 +-
.../isolators/cgroups/constants.hpp | 44 -
.../isolators/cgroups/cpushare.cpp | 572 ---
.../isolators/cgroups/cpushare.hpp | 122 -
.../containerizer/isolators/cgroups/mem.cpp | 733 ----
.../containerizer/isolators/cgroups/mem.hpp | 141 -
.../isolators/cgroups/perf_event.cpp | 435 --
.../isolators/cgroups/perf_event.hpp | 127 -
.../isolators/filesystem/linux.cpp | 893 -----
.../isolators/filesystem/linux.hpp | 149 -
.../isolators/filesystem/posix.cpp | 294 --
.../isolators/filesystem/posix.hpp | 89 -
.../isolators/filesystem/shared.cpp | 268 --
.../isolators/filesystem/shared.hpp | 79 -
.../containerizer/isolators/namespaces/pid.cpp | 266 --
.../containerizer/isolators/namespaces/pid.hpp | 91 -
.../containerizer/isolators/network/helper.cpp | 35 -
.../isolators/network/port_mapping.cpp | 3792 ------------------
.../isolators/network/port_mapping.hpp | 403 --
src/slave/containerizer/isolators/posix.hpp | 213 -
.../containerizer/isolators/posix/disk.cpp | 525 ---
.../containerizer/isolators/posix/disk.hpp | 151 -
src/slave/containerizer/linux_launcher.cpp | 2 +-
src/slave/containerizer/mesos/containerizer.cpp | 23 +-
.../mesos/isolators/cgroups/constants.hpp | 44 +
.../mesos/isolators/cgroups/cpushare.cpp | 572 +++
.../mesos/isolators/cgroups/cpushare.hpp | 122 +
.../mesos/isolators/cgroups/mem.cpp | 733 ++++
.../mesos/isolators/cgroups/mem.hpp | 141 +
.../mesos/isolators/cgroups/perf_event.cpp | 435 ++
.../mesos/isolators/cgroups/perf_event.hpp | 127 +
.../mesos/isolators/filesystem/linux.cpp | 893 +++++
.../mesos/isolators/filesystem/linux.hpp | 149 +
.../mesos/isolators/filesystem/posix.cpp | 294 ++
.../mesos/isolators/filesystem/posix.hpp | 89 +
.../mesos/isolators/filesystem/shared.cpp | 268 ++
.../mesos/isolators/filesystem/shared.hpp | 79 +
.../mesos/isolators/namespaces/pid.cpp | 266 ++
.../mesos/isolators/namespaces/pid.hpp | 91 +
.../mesos/isolators/network/helper.cpp | 35 +
.../mesos/isolators/network/port_mapping.cpp | 3792 ++++++++++++++++++
.../mesos/isolators/network/port_mapping.hpp | 403 ++
.../containerizer/mesos/isolators/posix.hpp | 213 +
.../mesos/isolators/posix/disk.cpp | 525 +++
.../mesos/isolators/posix/disk.hpp | 151 +
.../mesos/provisioner/appc/paths.cpp | 85 +
.../mesos/provisioner/appc/paths.hpp | 83 +
.../mesos/provisioner/appc/spec.cpp | 104 +
.../mesos/provisioner/appc/spec.hpp | 54 +
.../mesos/provisioner/appc/store.cpp | 288 ++
.../mesos/provisioner/appc/store.hpp | 63 +
.../containerizer/mesos/provisioner/backend.cpp | 62 +
.../containerizer/mesos/provisioner/backend.hpp | 67 +
.../mesos/provisioner/backends/bind.cpp | 250 ++
.../mesos/provisioner/backends/bind.hpp | 75 +
.../mesos/provisioner/backends/copy.cpp | 203 +
.../mesos/provisioner/backends/copy.hpp | 69 +
.../mesos/provisioner/docker/local_puller.cpp | 355 ++
.../mesos/provisioner/docker/local_puller.hpp | 66 +
.../mesos/provisioner/docker/message.hpp | 116 +
.../mesos/provisioner/docker/message.proto | 100 +
.../provisioner/docker/metadata_manager.cpp | 232 ++
.../provisioner/docker/metadata_manager.hpp | 106 +
.../mesos/provisioner/docker/paths.cpp | 115 +
.../mesos/provisioner/docker/paths.hpp | 98 +
.../mesos/provisioner/docker/puller.cpp | 46 +
.../mesos/provisioner/docker/puller.hpp | 68 +
.../provisioner/docker/registry_client.cpp | 641 +++
.../provisioner/docker/registry_client.hpp | 164 +
.../mesos/provisioner/docker/spec.cpp | 91 +
.../mesos/provisioner/docker/spec.hpp | 50 +
.../mesos/provisioner/docker/store.cpp | 277 ++
.../mesos/provisioner/docker/store.hpp | 68 +
.../mesos/provisioner/docker/token_manager.cpp | 362 ++
.../mesos/provisioner/docker/token_manager.hpp | 179 +
.../containerizer/mesos/provisioner/paths.cpp | 192 +
.../containerizer/mesos/provisioner/paths.hpp | 80 +
.../mesos/provisioner/provisioner.cpp | 444 ++
.../mesos/provisioner/provisioner.hpp | 94 +
.../containerizer/mesos/provisioner/store.cpp | 79 +
.../containerizer/mesos/provisioner/store.hpp | 75 +
.../containerizer/provisioner/appc/paths.cpp | 85 -
.../containerizer/provisioner/appc/paths.hpp | 83 -
.../containerizer/provisioner/appc/spec.cpp | 104 -
.../containerizer/provisioner/appc/spec.hpp | 54 -
.../containerizer/provisioner/appc/store.cpp | 288 --
.../containerizer/provisioner/appc/store.hpp | 63 -
src/slave/containerizer/provisioner/backend.cpp | 62 -
src/slave/containerizer/provisioner/backend.hpp | 67 -
.../containerizer/provisioner/backends/bind.cpp | 250 --
.../containerizer/provisioner/backends/bind.hpp | 75 -
.../containerizer/provisioner/backends/copy.cpp | 203 -
.../containerizer/provisioner/backends/copy.hpp | 69 -
.../provisioner/docker/local_puller.cpp | 355 --
.../provisioner/docker/local_puller.hpp | 66 -
.../provisioner/docker/message.hpp | 116 -
.../provisioner/docker/message.proto | 100 -
.../provisioner/docker/metadata_manager.cpp | 232 --
.../provisioner/docker/metadata_manager.hpp | 106 -
.../containerizer/provisioner/docker/paths.cpp | 115 -
.../containerizer/provisioner/docker/paths.hpp | 98 -
.../containerizer/provisioner/docker/puller.cpp | 46 -
.../containerizer/provisioner/docker/puller.hpp | 68 -
.../provisioner/docker/registry_client.cpp | 641 ---
.../provisioner/docker/registry_client.hpp | 164 -
.../containerizer/provisioner/docker/spec.cpp | 91 -
.../containerizer/provisioner/docker/spec.hpp | 50 -
.../containerizer/provisioner/docker/store.cpp | 277 --
.../containerizer/provisioner/docker/store.hpp | 68 -
.../provisioner/docker/token_manager.cpp | 362 --
.../provisioner/docker/token_manager.hpp | 179 -
src/slave/containerizer/provisioner/paths.cpp | 192 -
src/slave/containerizer/provisioner/paths.hpp | 80 -
.../containerizer/provisioner/provisioner.cpp | 444 --
.../containerizer/provisioner/provisioner.hpp | 94 -
src/slave/containerizer/provisioner/store.cpp | 79 -
src/slave/containerizer/provisioner/store.hpp | 75 -
.../containerizer/filesystem_isolator_tests.cpp | 4 +-
src/tests/containerizer/isolator_tests.cpp | 12 +-
src/tests/containerizer/port_mapping_tests.cpp | 2 +-
src/tests/containerizer/provisioner.hpp | 2 +-
.../containerizer/provisioner_appc_tests.cpp | 8 +-
.../containerizer/provisioner_backend_tests.cpp | 4 +-
.../containerizer/provisioner_docker_tests.cpp | 12 +-
src/tests/disk_quota_tests.cpp | 2 +-
128 files changed, 15021 insertions(+), 15022 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/Makefile.am
----------------------------------------------------------------------
diff --git a/src/Makefile.am b/src/Makefile.am
index 98cbafc..5675b91 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -275,8 +275,8 @@ BUILT_SOURCES += $(STATE_PROTOS)
CLEANFILES += $(STATE_PROTOS)
DOCKER_PROVISIONER_PROTOS = \
- slave/containerizer/provisioner/docker/message.pb.cc \
- slave/containerizer/provisioner/docker/message.pb.h
+ slave/containerizer/mesos/provisioner/docker/message.pb.cc \
+ slave/containerizer/mesos/provisioner/docker/message.pb.h
BUILT_SOURCES += $(DOCKER_PROVISIONER_PROTOS)
CLEANFILES += $(DOCKER_PROVISIONER_PROTOS)
@@ -541,28 +541,28 @@ libmesos_no_3rdparty_la_SOURCES = \
slave/containerizer/external_containerizer.cpp \
slave/containerizer/fetcher.cpp \
slave/containerizer/isolator.cpp \
- slave/containerizer/isolators/filesystem/posix.cpp \
- slave/containerizer/isolators/posix/disk.cpp \
slave/containerizer/launcher.cpp \
slave/containerizer/mesos/containerizer.cpp \
+ slave/containerizer/mesos/isolators/filesystem/posix.cpp \
+ slave/containerizer/mesos/isolators/posix/disk.cpp \
slave/containerizer/mesos/launch.cpp \
- slave/containerizer/provisioner/paths.cpp \
- slave/containerizer/provisioner/provisioner.cpp \
- slave/containerizer/provisioner/store.cpp \
- slave/containerizer/provisioner/appc/paths.cpp \
- slave/containerizer/provisioner/appc/spec.cpp \
- slave/containerizer/provisioner/appc/store.cpp \
- slave/containerizer/provisioner/backend.cpp \
- slave/containerizer/provisioner/backends/copy.cpp \
- slave/containerizer/provisioner/docker/local_puller.cpp \
- slave/containerizer/provisioner/docker/message.proto \
- slave/containerizer/provisioner/docker/metadata_manager.cpp \
- slave/containerizer/provisioner/docker/paths.cpp \
- slave/containerizer/provisioner/docker/puller.cpp \
- slave/containerizer/provisioner/docker/registry_client.cpp \
- slave/containerizer/provisioner/docker/spec.cpp \
- slave/containerizer/provisioner/docker/store.cpp \
- slave/containerizer/provisioner/docker/token_manager.cpp \
+ slave/containerizer/mesos/provisioner/paths.cpp \
+ slave/containerizer/mesos/provisioner/provisioner.cpp \
+ slave/containerizer/mesos/provisioner/store.cpp \
+ slave/containerizer/mesos/provisioner/appc/paths.cpp \
+ slave/containerizer/mesos/provisioner/appc/spec.cpp \
+ slave/containerizer/mesos/provisioner/appc/store.cpp \
+ slave/containerizer/mesos/provisioner/backend.cpp \
+ slave/containerizer/mesos/provisioner/backends/copy.cpp \
+ slave/containerizer/mesos/provisioner/docker/local_puller.cpp \
+ slave/containerizer/mesos/provisioner/docker/message.proto \
+ slave/containerizer/mesos/provisioner/docker/metadata_manager.cpp \
+ slave/containerizer/mesos/provisioner/docker/paths.cpp \
+ slave/containerizer/mesos/provisioner/docker/puller.cpp \
+ slave/containerizer/mesos/provisioner/docker/registry_client.cpp \
+ slave/containerizer/mesos/provisioner/docker/spec.cpp \
+ slave/containerizer/mesos/provisioner/docker/store.cpp \
+ slave/containerizer/mesos/provisioner/docker/token_manager.cpp \
slave/resource_estimators/noop.cpp \
usage/usage.cpp \
v1/attributes.cpp \
@@ -730,14 +730,14 @@ if OS_LINUX
libmesos_no_3rdparty_la_SOURCES += linux/fs.cpp
libmesos_no_3rdparty_la_SOURCES += linux/perf.cpp
libmesos_no_3rdparty_la_SOURCES += linux/systemd.cpp
- libmesos_no_3rdparty_la_SOURCES += slave/containerizer/isolators/cgroups/cpushare.cpp
- libmesos_no_3rdparty_la_SOURCES += slave/containerizer/isolators/cgroups/mem.cpp
- libmesos_no_3rdparty_la_SOURCES += slave/containerizer/isolators/cgroups/perf_event.cpp
- libmesos_no_3rdparty_la_SOURCES += slave/containerizer/isolators/namespaces/pid.cpp
- libmesos_no_3rdparty_la_SOURCES += slave/containerizer/isolators/filesystem/linux.cpp
- libmesos_no_3rdparty_la_SOURCES += slave/containerizer/isolators/filesystem/shared.cpp
libmesos_no_3rdparty_la_SOURCES += slave/containerizer/linux_launcher.cpp
- libmesos_no_3rdparty_la_SOURCES += slave/containerizer/provisioner/backends/bind.cpp
+ libmesos_no_3rdparty_la_SOURCES += slave/containerizer/mesos/isolators/cgroups/cpushare.cpp
+ libmesos_no_3rdparty_la_SOURCES += slave/containerizer/mesos/isolators/cgroups/mem.cpp
+ libmesos_no_3rdparty_la_SOURCES += slave/containerizer/mesos/isolators/cgroups/perf_event.cpp
+ libmesos_no_3rdparty_la_SOURCES += slave/containerizer/mesos/isolators/namespaces/pid.cpp
+ libmesos_no_3rdparty_la_SOURCES += slave/containerizer/mesos/isolators/filesystem/linux.cpp
+ libmesos_no_3rdparty_la_SOURCES += slave/containerizer/mesos/isolators/filesystem/shared.cpp
+ libmesos_no_3rdparty_la_SOURCES += slave/containerizer/mesos/provisioner/backends/bind.cpp
else
EXTRA_DIST += linux/cgroups.cpp
EXTRA_DIST += linux/fs.cpp
@@ -781,8 +781,8 @@ if WITH_NETWORK_ISOLATOR
linux/routing/queueing/statistics.hpp
libmesos_no_3rdparty_la_SOURCES += \
- slave/containerizer/isolators/network/port_mapping.cpp \
- slave/containerizer/isolators/network/port_mapping.hpp
+ slave/containerizer/mesos/isolators/network/port_mapping.cpp \
+ slave/containerizer/mesos/isolators/network/port_mapping.hpp
endif
libmesos_no_3rdparty_la_SOURCES += \
@@ -852,36 +852,36 @@ libmesos_no_3rdparty_la_SOURCES += \
slave/containerizer/isolator.hpp \
slave/containerizer/launcher.hpp \
slave/containerizer/linux_launcher.hpp \
- slave/containerizer/provisioner/paths.hpp \
- slave/containerizer/provisioner/provisioner.hpp \
- slave/containerizer/provisioner/store.hpp \
- slave/containerizer/provisioner/appc/paths.hpp \
- slave/containerizer/provisioner/appc/spec.hpp \
- slave/containerizer/provisioner/appc/store.hpp \
- slave/containerizer/provisioner/backend.hpp \
- slave/containerizer/provisioner/backends/bind.hpp \
- slave/containerizer/provisioner/backends/copy.hpp \
- slave/containerizer/provisioner/docker/local_puller.hpp \
- slave/containerizer/provisioner/docker/message.hpp \
- slave/containerizer/provisioner/docker/metadata_manager.hpp \
- slave/containerizer/provisioner/docker/paths.hpp \
- slave/containerizer/provisioner/docker/puller.hpp \
- slave/containerizer/provisioner/docker/registry_client.hpp \
- slave/containerizer/provisioner/docker/spec.hpp \
- slave/containerizer/provisioner/docker/store.hpp \
- slave/containerizer/provisioner/docker/token_manager.hpp \
- slave/containerizer/isolators/posix.hpp \
- slave/containerizer/isolators/posix/disk.hpp \
- slave/containerizer/isolators/cgroups/constants.hpp \
- slave/containerizer/isolators/cgroups/cpushare.hpp \
- slave/containerizer/isolators/cgroups/mem.hpp \
- slave/containerizer/isolators/cgroups/perf_event.hpp \
- slave/containerizer/isolators/namespaces/pid.hpp \
- slave/containerizer/isolators/filesystem/linux.hpp \
- slave/containerizer/isolators/filesystem/posix.hpp \
- slave/containerizer/isolators/filesystem/shared.hpp \
slave/containerizer/mesos/containerizer.hpp \
+ slave/containerizer/mesos/isolators/posix.hpp \
+ slave/containerizer/mesos/isolators/posix/disk.hpp \
+ slave/containerizer/mesos/isolators/cgroups/constants.hpp \
+ slave/containerizer/mesos/isolators/cgroups/cpushare.hpp \
+ slave/containerizer/mesos/isolators/cgroups/mem.hpp \
+ slave/containerizer/mesos/isolators/cgroups/perf_event.hpp \
+ slave/containerizer/mesos/isolators/namespaces/pid.hpp \
+ slave/containerizer/mesos/isolators/filesystem/linux.hpp \
+ slave/containerizer/mesos/isolators/filesystem/posix.hpp \
+ slave/containerizer/mesos/isolators/filesystem/shared.hpp \
slave/containerizer/mesos/launch.hpp \
+ slave/containerizer/mesos/provisioner/paths.hpp \
+ slave/containerizer/mesos/provisioner/provisioner.hpp \
+ slave/containerizer/mesos/provisioner/store.hpp \
+ slave/containerizer/mesos/provisioner/appc/paths.hpp \
+ slave/containerizer/mesos/provisioner/appc/spec.hpp \
+ slave/containerizer/mesos/provisioner/appc/store.hpp \
+ slave/containerizer/mesos/provisioner/backend.hpp \
+ slave/containerizer/mesos/provisioner/backends/bind.hpp \
+ slave/containerizer/mesos/provisioner/backends/copy.hpp \
+ slave/containerizer/mesos/provisioner/docker/local_puller.hpp \
+ slave/containerizer/mesos/provisioner/docker/message.hpp \
+ slave/containerizer/mesos/provisioner/docker/metadata_manager.hpp \
+ slave/containerizer/mesos/provisioner/docker/paths.hpp \
+ slave/containerizer/mesos/provisioner/docker/puller.hpp \
+ slave/containerizer/mesos/provisioner/docker/registry_client.hpp \
+ slave/containerizer/mesos/provisioner/docker/spec.hpp \
+ slave/containerizer/mesos/provisioner/docker/store.hpp \
+ slave/containerizer/mesos/provisioner/docker/token_manager.hpp \
slave/qos_controllers/noop.hpp \
slave/resource_estimators/noop.hpp \
tests/cluster.hpp \
@@ -1120,7 +1120,7 @@ mesos_containerizer_LDADD = libmesos.la $(LDADD)
if WITH_NETWORK_ISOLATOR
pkglibexec_PROGRAMS += mesos-network-helper
-mesos_network_helper_SOURCES = slave/containerizer/isolators/network/helper.cpp
+mesos_network_helper_SOURCES = slave/containerizer/mesos/isolators/network/helper.cpp
mesos_network_helper_CPPFLAGS = $(MESOS_CPPFLAGS)
mesos_network_helper_LDADD = libmesos.la $(LDADD)
endif
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/docker/docker.cpp
----------------------------------------------------------------------
diff --git a/src/docker/docker.cpp b/src/docker/docker.cpp
index 56d63dc..4ebca66 100755
--- a/src/docker/docker.cpp
+++ b/src/docker/docker.cpp
@@ -38,8 +38,8 @@
#include "linux/cgroups.hpp"
#endif // __linux__
-#include "slave/containerizer/isolators/cgroups/cpushare.hpp"
-#include "slave/containerizer/isolators/cgroups/mem.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/cpushare.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/mem.hpp"
#include "slave/constants.hpp"
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/examples/test_isolator_module.cpp
----------------------------------------------------------------------
diff --git a/src/examples/test_isolator_module.cpp b/src/examples/test_isolator_module.cpp
index 577dfca..8123603 100644
--- a/src/examples/test_isolator_module.cpp
+++ b/src/examples/test_isolator_module.cpp
@@ -25,7 +25,7 @@
#include <stout/try.hpp>
-#include "slave/containerizer/isolators/posix.hpp"
+#include "slave/containerizer/mesos/isolators/posix.hpp"
#include "slave/flags.hpp"
using namespace mesos;
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/docker.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/docker.cpp b/src/slave/containerizer/docker.cpp
index 7022958..276b6e0 100644
--- a/src/slave/containerizer/docker.cpp
+++ b/src/slave/containerizer/docker.cpp
@@ -48,7 +48,7 @@
#include "slave/containerizer/docker.hpp"
#include "slave/containerizer/fetcher.hpp"
-#include "slave/containerizer/isolators/cgroups/constants.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/constants.hpp"
#include "usage/usage.hpp"
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/cgroups/constants.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/cgroups/constants.hpp b/src/slave/containerizer/isolators/cgroups/constants.hpp
deleted file mode 100644
index db9dde0..0000000
--- a/src/slave/containerizer/isolators/cgroups/constants.hpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __CGROUPS_ISOLATOR_CONSTANTS_HPP__
-#define __CGROUPS_ISOLATOR_CONSTANTS_HPP__
-
-#include <stout/bytes.hpp>
-#include <stout/duration.hpp>
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// CPU subsystem constants.
-const uint64_t CPU_SHARES_PER_CPU = 1024;
-const uint64_t CPU_SHARES_PER_CPU_REVOCABLE = 10;
-const uint64_t MIN_CPU_SHARES = 2; // Linux constant.
-const Duration CPU_CFS_PERIOD = Milliseconds(100); // Linux default.
-const Duration MIN_CPU_CFS_QUOTA = Milliseconds(1);
-
-
-// Memory subsystem constants.
-const Bytes MIN_MEMORY = Megabytes(32);
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __CGROUPS_ISOLATOR_CONSTANTS_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/cgroups/cpushare.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/cgroups/cpushare.cpp b/src/slave/containerizer/isolators/cgroups/cpushare.cpp
deleted file mode 100644
index ba748c6..0000000
--- a/src/slave/containerizer/isolators/cgroups/cpushare.cpp
+++ /dev/null
@@ -1,572 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdint.h>
-
-#include <mesos/type_utils.hpp>
-#include <mesos/values.hpp>
-
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/pid.hpp>
-
-#include <stout/bytes.hpp>
-#include <stout/check.hpp>
-#include <stout/error.hpp>
-#include <stout/foreach.hpp>
-#include <stout/hashset.hpp>
-#include <stout/nothing.hpp>
-#include <stout/os.hpp>
-#include <stout/path.hpp>
-#include <stout/stringify.hpp>
-#include <stout/try.hpp>
-
-#include "linux/cgroups.hpp"
-
-#include "slave/containerizer/isolators/cgroups/cpushare.hpp"
-
-using namespace process;
-
-using std::list;
-using std::set;
-using std::string;
-using std::vector;
-
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::ContainerState;
-using mesos::slave::Isolator;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-CgroupsCpushareIsolatorProcess::CgroupsCpushareIsolatorProcess(
- const Flags& _flags,
- const hashmap<string, string>& _hierarchies,
- const vector<string>& _subsystems)
- : flags(_flags),
- hierarchies(_hierarchies),
- subsystems(_subsystems) {}
-
-
-CgroupsCpushareIsolatorProcess::~CgroupsCpushareIsolatorProcess() {}
-
-
-Try<Isolator*> CgroupsCpushareIsolatorProcess::create(const Flags& flags)
-{
- Try<string> hierarchyCpu = cgroups::prepare(
- flags.cgroups_hierarchy,
- "cpu",
- flags.cgroups_root);
-
- if (hierarchyCpu.isError()) {
- return Error(
- "Failed to prepare hierarchy for cpu subsystem: " +
- hierarchyCpu.error());
- }
-
- Try<string> hierarchyCpuacct = cgroups::prepare(
- flags.cgroups_hierarchy,
- "cpuacct",
- flags.cgroups_root);
-
- if (hierarchyCpuacct.isError()) {
- return Error(
- "Failed to prepare hierarchy for cpuacct subsystem: " +
- hierarchyCpuacct.error());
- }
-
- hashmap<string, string> hierarchies;
- vector<string> subsystems;
-
- hierarchies["cpu"] = hierarchyCpu.get();
- hierarchies["cpuacct"] = hierarchyCpuacct.get();
-
- if (hierarchyCpu.get() == hierarchyCpuacct.get()) {
- // Subsystem cpu and cpuacct are co-mounted (e.g., systemd).
- hierarchies["cpu,cpuacct"] = hierarchyCpu.get();
- subsystems.push_back("cpu,cpuacct");
-
- // Ensure that no other subsystem is attached to the hierarchy.
- Try<set<string>> _subsystems = cgroups::subsystems(hierarchyCpu.get());
- if (_subsystems.isError()) {
- return Error(
- "Failed to get the list of attached subsystems for hierarchy " +
- hierarchyCpu.get());
- } else if (_subsystems.get().size() != 2) {
- return Error(
- "Unexpected subsystems found attached to the hierarchy " +
- hierarchyCpu.get());
- }
- } else {
- // Subsystem cpu and cpuacct are mounted separately.
- subsystems.push_back("cpu");
- subsystems.push_back("cpuacct");
-
- // Ensure that no other subsystem is attached to each of the
- // hierarchy.
- Try<set<string>> _subsystems = cgroups::subsystems(hierarchyCpu.get());
- if (_subsystems.isError()) {
- return Error(
- "Failed to get the list of attached subsystems for hierarchy " +
- hierarchyCpu.get());
- } else if (_subsystems.get().size() != 1) {
- return Error(
- "Unexpected subsystems found attached to the hierarchy " +
- hierarchyCpu.get());
- }
-
- _subsystems = cgroups::subsystems(hierarchyCpuacct.get());
- if (_subsystems.isError()) {
- return Error(
- "Failed to get the list of attached subsystems for hierarchy " +
- hierarchyCpuacct.get());
- } else if (_subsystems.get().size() != 1) {
- return Error(
- "Unexpected subsystems found attached to the hierarchy " +
- hierarchyCpuacct.get());
- }
- }
-
- if (flags.cgroups_enable_cfs) {
- Try<bool> exists = cgroups::exists(
- hierarchies["cpu"],
- flags.cgroups_root,
- "cpu.cfs_quota_us");
-
- if (exists.isError() || !exists.get()) {
- return Error(
- "Failed to find 'cpu.cfs_quota_us'. Your kernel "
- "might be too old to use the CFS cgroups feature.");
- }
- }
-
- process::Owned<MesosIsolatorProcess> process(
- new CgroupsCpushareIsolatorProcess(flags, hierarchies, subsystems));
-
- return new MesosIsolator(process);
-}
-
-
-Future<Nothing> CgroupsCpushareIsolatorProcess::recover(
- const list<ContainerState>& states,
- const hashset<ContainerID>& orphans)
-{
- foreach (const ContainerState& state, states) {
- const ContainerID& containerId = state.container_id();
- const string cgroup = path::join(flags.cgroups_root, containerId.value());
-
- Try<bool> exists = cgroups::exists(hierarchies["cpu"], cgroup);
- if (exists.isError()) {
- foreachvalue (Info* info, infos) {
- delete info;
- }
- infos.clear();
- return Failure(
- "Failed to check cgroup for container " + stringify(containerId));
- }
-
- if (!exists.get()) {
- // This may occur if the executor has exited and the isolator
- // has destroyed the cgroup but the slave dies before noticing
- // this. This will be detected when the containerizer tries to
- // monitor the executor's pid.
- LOG(WARNING) << "Couldn't find cgroup for container " << containerId;
- continue;
- }
-
- infos[containerId] = new Info(containerId, cgroup);
- }
-
- // Remove orphan cgroups.
- foreach (const string& subsystem, subsystems) {
- Try<vector<string>> cgroups = cgroups::get(
- hierarchies[subsystem],
- flags.cgroups_root);
-
- if (cgroups.isError()) {
- foreachvalue (Info* info, infos) {
- delete info;
- }
- infos.clear();
- return Failure(cgroups.error());
- }
-
- foreach (const string& cgroup, cgroups.get()) {
- // Ignore the slave cgroup (see the --slave_subsystems flag).
- // TODO(idownes): Remove this when the cgroups layout is
- // updated, see MESOS-1185.
- if (cgroup == path::join(flags.cgroups_root, "slave")) {
- continue;
- }
-
- ContainerID containerId;
- containerId.set_value(Path(cgroup).basename());
-
- if (infos.contains(containerId)) {
- continue;
- }
-
- // Known orphan cgroups will be destroyed by the containerizer
- // using the normal cleanup path. See MESOS-2367 for details.
- if (orphans.contains(containerId)) {
- infos[containerId] = new Info(containerId, cgroup);
- continue;
- }
-
- LOG(INFO) << "Removing unknown orphaned cgroup '"
- << path::join(subsystem, cgroup) << "'";
-
- // We don't wait on the destroy as we don't want to block recovery.
- cgroups::destroy(
- hierarchies[subsystem],
- cgroup,
- cgroups::DESTROY_TIMEOUT);
- }
- }
-
- return Nothing();
-}
-
-
-Future<Option<ContainerPrepareInfo>> CgroupsCpushareIsolatorProcess::prepare(
- const ContainerID& containerId,
- const ExecutorInfo& executorInfo,
- const string& directory,
- const Option<string>& user)
-{
- if (infos.contains(containerId)) {
- return Failure("Container has already been prepared");
- }
-
- // TODO(bmahler): Don't insert into 'infos' unless we create the
- // cgroup successfully. It's safe for now because 'cleanup' gets
- // called if we return a Failure, but cleanup will fail because the
- // cgroup does not exist when cgroups::destroy is called.
- Info* info = new Info(
- containerId, path::join(flags.cgroups_root, containerId.value()));
-
- infos[containerId] = info;
-
- foreach (const string& subsystem, subsystems) {
- Try<bool> exists = cgroups::exists(hierarchies[subsystem], info->cgroup);
- if (exists.isError()) {
- return Failure("Failed to prepare isolator: " + exists.error());
- } else if (exists.get()) {
- return Failure("Failed to prepare isolator: cgroup already exists");
- }
-
- Try<Nothing> create = cgroups::create(hierarchies[subsystem], info->cgroup);
- if (create.isError()) {
- return Failure("Failed to prepare isolator: " + create.error());
- }
-
- // Chown the cgroup so the executor can create nested cgroups. Do
- // not recurse so the control files are still owned by the slave
- // user and thus cannot be changed by the executor.
- if (user.isSome()) {
- Try<Nothing> chown = os::chown(
- user.get(),
- path::join(hierarchies[subsystem], info->cgroup),
- false);
- if (chown.isError()) {
- return Failure("Failed to prepare isolator: " + chown.error());
- }
- }
- }
-
- return update(containerId, executorInfo.resources())
- .then([]() -> Future<Option<ContainerPrepareInfo>> {
- return None();
- });
-}
-
-
-Future<Nothing> CgroupsCpushareIsolatorProcess::isolate(
- const ContainerID& containerId,
- pid_t pid)
-{
- if (!infos.contains(containerId)) {
- return Failure("Unknown container");
- }
-
- Info* info = CHECK_NOTNULL(infos[containerId]);
-
- CHECK_NONE(info->pid);
- info->pid = pid;
-
- foreach (const string& subsystem, subsystems) {
- Try<Nothing> assign = cgroups::assign(
- hierarchies[subsystem],
- info->cgroup,
- pid);
-
- if (assign.isError()) {
- LOG(ERROR) << "Failed to assign container '" << info->containerId
- << " to its own cgroup '"
- << path::join(hierarchies[subsystem], info->cgroup)
- << "' : " << assign.error();
-
- return Failure("Failed to isolate container: " + assign.error());
- }
- }
-
- return Nothing();
-}
-
-
-Future<ContainerLimitation> CgroupsCpushareIsolatorProcess::watch(
- const ContainerID& containerId)
-{
- if (!infos.contains(containerId)) {
- return Failure("Unknown container");
- }
-
- CHECK_NOTNULL(infos[containerId]);
-
- return infos[containerId]->limitation.future();
-}
-
-
-Future<Nothing> CgroupsCpushareIsolatorProcess::update(
- const ContainerID& containerId,
- const Resources& resources)
-{
- if (resources.cpus().isNone()) {
- return Failure("No cpus resource given");
- }
-
- if (!infos.contains(containerId)) {
- return Failure("Unknown container");
- }
-
- const Option<string>& hierarchy = hierarchies.get("cpu");
- if (hierarchy.isNone()) {
- return Failure("No 'cpu' hierarchy");
- }
-
- Info* info = CHECK_NOTNULL(infos[containerId]);
- info->resources = resources;
-
- double cpus = resources.cpus().get();
-
- // Always set cpu.shares.
- uint64_t shares;
-
- if (flags.revocable_cpu_low_priority &&
- resources.revocable().cpus().isSome()) {
- shares = std::max(
- (uint64_t) (CPU_SHARES_PER_CPU_REVOCABLE * cpus),
- MIN_CPU_SHARES);
- } else {
- shares = std::max(
- (uint64_t) (CPU_SHARES_PER_CPU * cpus),
- MIN_CPU_SHARES);
- }
-
- Try<Nothing> write = cgroups::cpu::shares(
- hierarchy.get(),
- info->cgroup,
- shares);
-
- if (write.isError()) {
- return Failure("Failed to update 'cpu.shares': " + write.error());
- }
-
- LOG(INFO) << "Updated 'cpu.shares' to " << shares
- << " (cpus " << cpus << ")"
- << " for container " << containerId;
-
- // Set cfs quota if enabled.
- if (flags.cgroups_enable_cfs) {
- write = cgroups::cpu::cfs_period_us(
- hierarchy.get(),
- info->cgroup,
- CPU_CFS_PERIOD);
-
- if (write.isError()) {
- return Failure("Failed to update 'cpu.cfs_period_us': " + write.error());
- }
-
- Duration quota = std::max(CPU_CFS_PERIOD * cpus, MIN_CPU_CFS_QUOTA);
-
- write = cgroups::cpu::cfs_quota_us(hierarchy.get(), info->cgroup, quota);
- if (write.isError()) {
- return Failure("Failed to update 'cpu.cfs_quota_us': " + write.error());
- }
-
- LOG(INFO) << "Updated 'cpu.cfs_period_us' to " << CPU_CFS_PERIOD
- << " and 'cpu.cfs_quota_us' to " << quota
- << " (cpus " << cpus << ")"
- << " for container " << containerId;
- }
-
- return Nothing();
-}
-
-
-Future<ResourceStatistics> CgroupsCpushareIsolatorProcess::usage(
- const ContainerID& containerId)
-{
- if (!infos.contains(containerId)) {
- return Failure("Unknown container");
- }
-
- Info* info = CHECK_NOTNULL(infos[containerId]);
-
- ResourceStatistics result;
-
- // TODO(chzhcn): Getting the number of processes and threads is
- // available as long as any cgroup subsystem is used so this best
- // not be tied to a specific cgroup isolator. A better place is
- // probably Linux Launcher, which uses the cgroup freezer subsystem.
- // That requires some change for it to adopt the new semantics of
- // reporting subsystem-independent cgroup usage.
- // Note: The complexity of this operation is linear to the number of
- // processes and threads in a container: the kernel has to allocate
- // memory to contain the list of pids or tids; the userspace has to
- // parse the cgroup files to get the size. If this proves to be a
- // performance bottleneck, some kind of rate limiting mechanism
- // needs to be employed.
- if (flags.cgroups_cpu_enable_pids_and_tids_count) {
- Try<std::set<pid_t>> pids =
- cgroups::processes(hierarchies["cpuacct"], info->cgroup);
- if (pids.isError()) {
- return Failure("Failed to get number of processes: " + pids.error());
- }
-
- result.set_processes(pids.get().size());
-
- Try<std::set<pid_t>> tids =
- cgroups::threads(hierarchies["cpuacct"], info->cgroup);
- if (tids.isError()) {
- return Failure("Failed to get number of threads: " + tids.error());
- }
-
- result.set_threads(tids.get().size());
- }
-
- // Get the number of clock ticks, used for cpu accounting.
- static long ticks = sysconf(_SC_CLK_TCK);
-
- PCHECK(ticks > 0) << "Failed to get sysconf(_SC_CLK_TCK)";
-
- // Add the cpuacct.stat information.
- Try<hashmap<string, uint64_t>> stat = cgroups::stat(
- hierarchies["cpuacct"],
- info->cgroup,
- "cpuacct.stat");
-
- if (stat.isError()) {
- return Failure("Failed to read cpuacct.stat: " + stat.error());
- }
-
- // TODO(bmahler): Add namespacing to cgroups to enforce the expected
- // structure, e.g., cgroups::cpuacct::stat.
- Option<uint64_t> user = stat.get().get("user");
- Option<uint64_t> system = stat.get().get("system");
-
- if (user.isSome() && system.isSome()) {
- result.set_cpus_user_time_secs((double) user.get() / (double) ticks);
- result.set_cpus_system_time_secs((double) system.get() / (double) ticks);
- }
-
- // Add the cpu.stat information only if CFS is enabled.
- if (flags.cgroups_enable_cfs) {
- stat = cgroups::stat(hierarchies["cpu"], info->cgroup, "cpu.stat");
- if (stat.isError()) {
- return Failure("Failed to read cpu.stat: " + stat.error());
- }
-
- Option<uint64_t> nr_periods = stat.get().get("nr_periods");
- if (nr_periods.isSome()) {
- result.set_cpus_nr_periods(nr_periods.get());
- }
-
- Option<uint64_t> nr_throttled = stat.get().get("nr_throttled");
- if (nr_throttled.isSome()) {
- result.set_cpus_nr_throttled(nr_throttled.get());
- }
-
- Option<uint64_t> throttled_time = stat.get().get("throttled_time");
- if (throttled_time.isSome()) {
- result.set_cpus_throttled_time_secs(
- Nanoseconds(throttled_time.get()).secs());
- }
- }
-
- return result;
-}
-
-
-Future<Nothing> CgroupsCpushareIsolatorProcess::cleanup(
- const ContainerID& containerId)
-{
- // Multiple calls may occur during test clean up.
- if (!infos.contains(containerId)) {
- VLOG(1) << "Ignoring cleanup request for unknown container: "
- << containerId;
-
- return Nothing();
- }
-
- Info* info = CHECK_NOTNULL(infos[containerId]);
-
- list<Future<Nothing>> futures;
- foreach (const string& subsystem, subsystems) {
- futures.push_back(cgroups::destroy(
- hierarchies[subsystem],
- info->cgroup,
- cgroups::DESTROY_TIMEOUT));
- }
-
- return collect(futures)
- .onAny(defer(PID<CgroupsCpushareIsolatorProcess>(this),
- &CgroupsCpushareIsolatorProcess::_cleanup,
- containerId,
- lambda::_1))
- .then([]() { return Nothing(); });
-}
-
-
-Future<list<Nothing>> CgroupsCpushareIsolatorProcess::_cleanup(
- const ContainerID& containerId,
- const Future<list<Nothing>>& future)
-{
- if (!infos.contains(containerId)) {
- return Failure("Unknown container");
- }
-
- CHECK_NOTNULL(infos[containerId]);
-
- if (!future.isReady()) {
- return Failure(
- "Failed to clean up container " + stringify(containerId) +
- " : " + (future.isFailed() ? future.failure() : "discarded"));
- }
-
- delete infos[containerId];
- infos.erase(containerId);
-
- return future;
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/cgroups/cpushare.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/cgroups/cpushare.hpp b/src/slave/containerizer/isolators/cgroups/cpushare.hpp
deleted file mode 100644
index 54b83a7..0000000
--- a/src/slave/containerizer/isolators/cgroups/cpushare.hpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __CPUSHARE_ISOLATOR_HPP__
-#define __CPUSHARE_ISOLATOR_HPP__
-
-#include <sys/types.h>
-
-#include <string>
-#include <vector>
-
-#include <process/future.hpp>
-
-#include <stout/hashmap.hpp>
-#include <stout/option.hpp>
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-#include "slave/containerizer/isolators/cgroups/constants.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// Use the Linux cpu cgroup controller for cpu isolation which uses the
-// Completely Fair Scheduler (CFS).
-// - cpushare implements proportionally weighted scheduling.
-// - cfs implements hard quota based scheduling.
-class CgroupsCpushareIsolatorProcess : public MesosIsolatorProcess
-{
-public:
- static Try<mesos::slave::Isolator*> create(const Flags& flags);
-
- virtual ~CgroupsCpushareIsolatorProcess();
-
- virtual process::Future<Nothing> recover(
- const std::list<mesos::slave::ContainerState>& states,
- const hashset<ContainerID>& orphans);
-
- virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
- const ContainerID& containerId,
- const ExecutorInfo& executorInfo,
- const std::string& directory,
- const Option<std::string>& user);
-
- virtual process::Future<Nothing> isolate(
- const ContainerID& containerId,
- pid_t pid);
-
- virtual process::Future<mesos::slave::ContainerLimitation> watch(
- const ContainerID& containerId);
-
- virtual process::Future<Nothing> update(
- const ContainerID& containerId,
- const Resources& resources);
-
- virtual process::Future<ResourceStatistics> usage(
- const ContainerID& containerId);
-
- virtual process::Future<Nothing> cleanup(
- const ContainerID& containerId);
-
-private:
- CgroupsCpushareIsolatorProcess(
- const Flags& flags,
- const hashmap<std::string, std::string>& hierarchies,
- const std::vector<std::string>& subsystems);
-
- virtual process::Future<std::list<Nothing>> _cleanup(
- const ContainerID& containerId,
- const process::Future<std::list<Nothing>>& future);
-
- struct Info
- {
- Info(const ContainerID& _containerId, const std::string& _cgroup)
- : containerId(_containerId), cgroup(_cgroup) {}
-
- const ContainerID containerId;
- const std::string cgroup;
- Option<pid_t> pid;
- Option<Resources> resources;
-
- process::Promise<mesos::slave::ContainerLimitation> limitation;
- };
-
- const Flags flags;
-
- // Map from subsystem to hierarchy.
- hashmap<std::string, std::string> hierarchies;
-
- // Subsystems used for this isolator. Typically, there are two
- // elements in the vector: 'cpu' and 'cpuacct'. If cpu and cpuacct
- // systems are co-mounted (e.g., systems using systemd), then there
- // will be only one element in the vector which is 'cpu,cpuacct'.
- std::vector<std::string> subsystems;
-
- // TODO(bmahler): Use Owned<Info>.
- hashmap<ContainerID, Info*> infos;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __CPUSHARE_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/cgroups/mem.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/cgroups/mem.cpp b/src/slave/containerizer/isolators/cgroups/mem.cpp
deleted file mode 100644
index 55fa6f4..0000000
--- a/src/slave/containerizer/isolators/cgroups/mem.cpp
+++ /dev/null
@@ -1,733 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdint.h>
-
-#include <list>
-#include <vector>
-
-#include <mesos/type_utils.hpp>
-#include <mesos/values.hpp>
-
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/pid.hpp>
-
-#include <stout/bytes.hpp>
-#include <stout/check.hpp>
-#include <stout/error.hpp>
-#include <stout/foreach.hpp>
-#include <stout/hashset.hpp>
-#include <stout/lambda.hpp>
-#include <stout/os.hpp>
-#include <stout/path.hpp>
-#include <stout/stringify.hpp>
-#include <stout/try.hpp>
-
-#include "common/protobuf_utils.hpp"
-
-#include "slave/containerizer/isolators/cgroups/constants.hpp"
-#include "slave/containerizer/isolators/cgroups/mem.hpp"
-
-using namespace process;
-
-using cgroups::memory::pressure::Level;
-using cgroups::memory::pressure::Counter;
-
-using std::list;
-using std::ostringstream;
-using std::set;
-using std::string;
-using std::vector;
-
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::ContainerState;
-using mesos::slave::Isolator;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-static const vector<Level> levels()
-{
- return {Level::LOW, Level::MEDIUM, Level::CRITICAL};
-}
-
-
-CgroupsMemIsolatorProcess::CgroupsMemIsolatorProcess(
- const Flags& _flags,
- const string& _hierarchy,
- const bool _limitSwap)
- : flags(_flags),
- hierarchy(_hierarchy),
- limitSwap(_limitSwap) {}
-
-
-CgroupsMemIsolatorProcess::~CgroupsMemIsolatorProcess() {}
-
-
-Try<Isolator*> CgroupsMemIsolatorProcess::create(const Flags& flags)
-{
- Try<string> hierarchy = cgroups::prepare(
- flags.cgroups_hierarchy,
- "memory",
- flags.cgroups_root);
-
- if (hierarchy.isError()) {
- return Error("Failed to create memory cgroup: " + hierarchy.error());
- }
-
- // Ensure that no other subsystem is attached to the hierarchy.
- Try<set<string>> subsystems = cgroups::subsystems(hierarchy.get());
- if (subsystems.isError()) {
- return Error(
- "Failed to get the list of attached subsystems for hierarchy " +
- hierarchy.get());
- } else if (subsystems.get().size() != 1) {
- return Error(
- "Unexpected subsystems found attached to the hierarchy " +
- hierarchy.get());
- }
-
- // Make sure the kernel OOM-killer is enabled.
- // The Mesos OOM handler, as implemented, is not capable of handling
- // the oom condition by itself safely given the limitations Linux
- // imposes on this code path.
- Try<Nothing> enable = cgroups::memory::oom::killer::enable(
- hierarchy.get(), flags.cgroups_root);
-
- if (enable.isError()) {
- return Error(enable.error());
- }
-
- // Test if memory pressure listening is enabled. We test that on the
- // root cgroup. We rely on 'Counter::create' to test if memory
- // pressure listening is enabled or not. The created counters will
- // be destroyed immediately.
- foreach (Level level, levels()) {
- Try<Owned<Counter>> counter = Counter::create(
- hierarchy.get(),
- flags.cgroups_root,
- level);
-
- if (counter.isError()) {
- return Error("Failed to listen on " + stringify(level) +
- " memory events: " + counter.error());
- }
- }
-
- // Determine whether to limit swap or not.
- bool limitSwap = false;
-
- if (flags.cgroups_limit_swap) {
- Result<Bytes> check = cgroups::memory::memsw_limit_in_bytes(
- hierarchy.get(), flags.cgroups_root);
-
- if (check.isError()) {
- return Error(
- "Failed to read 'memory.memsw.limit_in_bytes': " +
- check.error());
- } else if (check.isNone()) {
- return Error("'memory.memsw.limit_in_bytes' is not available");
- }
-
- limitSwap = true;
- }
-
- process::Owned<MesosIsolatorProcess> process(
- new CgroupsMemIsolatorProcess(flags, hierarchy.get(), limitSwap));
-
- return new MesosIsolator(process);
-}
-
-
-Future<Nothing> CgroupsMemIsolatorProcess::recover(
- const list<ContainerState>& states,
- const hashset<ContainerID>& orphans)
-{
- foreach (const ContainerState& state, states) {
- const ContainerID& containerId = state.container_id();
- const string cgroup = path::join(flags.cgroups_root, containerId.value());
-
- Try<bool> exists = cgroups::exists(hierarchy, cgroup);
- if (exists.isError()) {
- foreachvalue (Info* info, infos) {
- delete info;
- }
- infos.clear();
- return Failure("Failed to check cgroup for container '" +
- stringify(containerId) + "'");
- }
-
- if (!exists.get()) {
- VLOG(1) << "Couldn't find cgroup for container " << containerId;
- // This may occur if the executor has exited and the isolator
- // has destroyed the cgroup but the slave dies before noticing
- // this. This will be detected when the containerizer tries to
- // monitor the executor's pid.
- continue;
- }
-
- infos[containerId] = new Info(containerId, cgroup);
-
- oomListen(containerId);
- pressureListen(containerId);
- }
-
- // Remove orphan cgroups.
- Try<vector<string>> cgroups = cgroups::get(hierarchy, flags.cgroups_root);
- if (cgroups.isError()) {
- foreachvalue (Info* info, infos) {
- delete info;
- }
- infos.clear();
- return Failure(cgroups.error());
- }
-
- foreach (const string& cgroup, cgroups.get()) {
- // Ignore the slave cgroup (see the --slave_subsystems flag).
- // TODO(idownes): Remove this when the cgroups layout is updated,
- // see MESOS-1185.
- if (cgroup == path::join(flags.cgroups_root, "slave")) {
- continue;
- }
-
- ContainerID containerId;
- containerId.set_value(Path(cgroup).basename());
-
- if (infos.contains(containerId)) {
- continue;
- }
-
- // Known orphan cgroups will be destroyed by the containerizer
- // using the normal cleanup path. See MESOS-2367 for details.
- if (orphans.contains(containerId)) {
- infos[containerId] = new Info(containerId, cgroup);
- continue;
- }
-
- LOG(INFO) << "Removing unknown orphaned cgroup '" << cgroup << "'";
-
- // We don't wait on the destroy as we don't want to block recovery.
- cgroups::destroy(hierarchy, cgroup, cgroups::DESTROY_TIMEOUT);
- }
-
- return Nothing();
-}
-
-
-Future<Option<ContainerPrepareInfo>> CgroupsMemIsolatorProcess::prepare(
- const ContainerID& containerId,
- const ExecutorInfo& executorInfo,
- const string& directory,
- const Option<string>& user)
-{
- if (infos.contains(containerId)) {
- return Failure("Container has already been prepared");
- }
-
- // TODO(bmahler): Don't insert into 'infos' unless we create the
- // cgroup successfully. It's safe for now because 'cleanup' gets
- // called if we return a Failure, but cleanup will fail because the
- // cgroup does not exist when cgroups::destroy is called.
- Info* info = new Info(
- containerId, path::join(flags.cgroups_root, containerId.value()));
-
- infos[containerId] = info;
-
- // Create a cgroup for this container.
- Try<bool> exists = cgroups::exists(hierarchy, info->cgroup);
-
- if (exists.isError()) {
- return Failure("Failed to prepare isolator: " + exists.error());
- } else if (exists.get()) {
- return Failure("Failed to prepare isolator: cgroup already exists");
- }
-
- Try<Nothing> create = cgroups::create(hierarchy, info->cgroup);
- if (create.isError()) {
- return Failure("Failed to prepare isolator: " + create.error());
- }
-
- // Chown the cgroup so the executor can create nested cgroups. Do
- // not recurse so the control files are still owned by the slave
- // user and thus cannot be changed by the executor.
- if (user.isSome()) {
- Try<Nothing> chown = os::chown(
- user.get(),
- path::join(hierarchy, info->cgroup),
- false);
- if (chown.isError()) {
- return Failure("Failed to prepare isolator: " + chown.error());
- }
- }
-
- oomListen(containerId);
- pressureListen(containerId);
-
- return update(containerId, executorInfo.resources())
- .then([]() -> Future<Option<ContainerPrepareInfo>> {
- return None();
- });
-}
-
-
-Future<Nothing> CgroupsMemIsolatorProcess::isolate(
- const ContainerID& containerId,
- pid_t pid)
-{
- if (!infos.contains(containerId)) {
- return Failure("Unknown container");
- }
-
- Info* info = CHECK_NOTNULL(infos[containerId]);
-
- CHECK_NONE(info->pid);
- info->pid = pid;
-
- Try<Nothing> assign = cgroups::assign(hierarchy, info->cgroup, pid);
- if (assign.isError()) {
- return Failure("Failed to assign container '" +
- stringify(info->containerId) + "' to its own cgroup '" +
- path::join(hierarchy, info->cgroup) +
- "' : " + assign.error());
- }
-
- return Nothing();
-}
-
-
-Future<ContainerLimitation> CgroupsMemIsolatorProcess::watch(
- const ContainerID& containerId)
-{
- if (!infos.contains(containerId)) {
- return Failure("Unknown container");
- }
-
- CHECK_NOTNULL(infos[containerId]);
-
- return infos[containerId]->limitation.future();
-}
-
-
-Future<Nothing> CgroupsMemIsolatorProcess::update(
- const ContainerID& containerId,
- const Resources& resources)
-{
- if (resources.mem().isNone()) {
- return Failure("No memory resource given");
- }
-
- if (!infos.contains(containerId)) {
- return Failure("Unknown container");
- }
-
- Info* info = CHECK_NOTNULL(infos[containerId]);
-
- // New limit.
- Bytes mem = resources.mem().get();
- Bytes limit = std::max(mem, MIN_MEMORY);
-
- // Always set the soft limit.
- Try<Nothing> write =
- cgroups::memory::soft_limit_in_bytes(hierarchy, info->cgroup, limit);
-
- if (write.isError()) {
- return Failure(
- "Failed to set 'memory.soft_limit_in_bytes': " + write.error());
- }
-
- LOG(INFO) << "Updated 'memory.soft_limit_in_bytes' to " << limit
- << " for container " << containerId;
-
- // Read the existing limit.
- Try<Bytes> currentLimit =
- cgroups::memory::limit_in_bytes(hierarchy, info->cgroup);
-
- // NOTE: If limitSwap is (has been) used then both limit_in_bytes
- // and memsw.limit_in_bytes will always be set to the same value.
- if (currentLimit.isError()) {
- return Failure(
- "Failed to read 'memory.limit_in_bytes': " + currentLimit.error());
- }
-
- // Determine whether to set the hard limit. If this is the first
- // time (info->pid.isNone()), or we're raising the existing limit,
- // then we can update the hard limit safely. Otherwise, if we need
- // to decrease 'memory.limit_in_bytes' we may induce an OOM if too
- // much memory is in use. As a result, we only update the soft limit
- // when the memory reservation is being reduced. This is probably
- // okay if the machine has available resources.
- // TODO(benh): Introduce a MemoryWatcherProcess which monitors the
- // discrepancy between usage and soft limit and introduces a "manual
- // oom" if necessary.
- if (info->pid.isNone() || limit > currentLimit.get()) {
- // We always set limit_in_bytes first and optionally set
- // memsw.limit_in_bytes if limitSwap is true.
- Try<Nothing> write = cgroups::memory::limit_in_bytes(
- hierarchy, info->cgroup, limit);
-
- if (write.isError()) {
- return Failure(
- "Failed to set 'memory.limit_in_bytes': " + write.error());
- }
-
- LOG(INFO) << "Updated 'memory.limit_in_bytes' to " << limit
- << " for container " << containerId;
-
- if (limitSwap) {
- Try<bool> write = cgroups::memory::memsw_limit_in_bytes(
- hierarchy, info->cgroup, limit);
-
- if (write.isError()) {
- return Failure(
- "Failed to set 'memory.memsw.limit_in_bytes': " + write.error());
- }
-
- LOG(INFO) << "Updated 'memory.memsw.limit_in_bytes' to " << limit
- << " for container " << containerId;
- }
- }
-
- return Nothing();
-}
-
-
-Future<ResourceStatistics> CgroupsMemIsolatorProcess::usage(
- const ContainerID& containerId)
-{
- if (!infos.contains(containerId)) {
- return Failure("Unknown container");
- }
-
- Info* info = CHECK_NOTNULL(infos[containerId]);
-
- ResourceStatistics result;
-
- // The rss from memory.stat is wrong in two dimensions:
- // 1. It does not include child cgroups.
- // 2. It does not include any file backed pages.
- Try<Bytes> usage = cgroups::memory::usage_in_bytes(hierarchy, info->cgroup);
- if (usage.isError()) {
- return Failure("Failed to parse memory.usage_in_bytes: " + usage.error());
- }
-
- result.set_mem_total_bytes(usage.get().bytes());
-
- if (limitSwap) {
- Try<Bytes> usage =
- cgroups::memory::memsw_usage_in_bytes(hierarchy, info->cgroup);
- if (usage.isError()) {
- return Failure(
- "Failed to parse memory.memsw.usage_in_bytes: " + usage.error());
- }
-
- result.set_mem_total_memsw_bytes(usage.get().bytes());
- }
-
- // TODO(bmahler): Add namespacing to cgroups to enforce the expected
- // structure, e.g, cgroups::memory::stat.
- Try<hashmap<string, uint64_t>> stat =
- cgroups::stat(hierarchy, info->cgroup, "memory.stat");
- if (stat.isError()) {
- return Failure("Failed to read memory.stat: " + stat.error());
- }
-
- Option<uint64_t> total_cache = stat.get().get("total_cache");
- if (total_cache.isSome()) {
- // TODO(chzhcn): mem_file_bytes is deprecated in 0.23.0 and will
- // be removed in 0.24.0.
- result.set_mem_file_bytes(total_cache.get());
-
- result.set_mem_cache_bytes(total_cache.get());
- }
-
- Option<uint64_t> total_rss = stat.get().get("total_rss");
- if (total_rss.isSome()) {
- // TODO(chzhcn): mem_anon_bytes is deprecated in 0.23.0 and will
- // be removed in 0.24.0.
- result.set_mem_anon_bytes(total_rss.get());
-
- result.set_mem_rss_bytes(total_rss.get());
- }
-
- Option<uint64_t> total_mapped_file = stat.get().get("total_mapped_file");
- if (total_mapped_file.isSome()) {
- result.set_mem_mapped_file_bytes(total_mapped_file.get());
- }
-
- Option<uint64_t> total_swap = stat.get().get("total_swap");
- if (total_swap.isSome()) {
- result.set_mem_swap_bytes(total_swap.get());
- }
-
- Option<uint64_t> total_unevictable = stat.get().get("total_unevictable");
- if (total_unevictable.isSome()) {
- result.set_mem_unevictable_bytes(total_unevictable.get());
- }
-
- // Get pressure counter readings.
- list<Level> levels;
- list<Future<uint64_t>> values;
- foreachpair (Level level,
- const Owned<Counter>& counter,
- info->pressureCounters) {
- levels.push_back(level);
- values.push_back(counter->value());
- }
-
- return await(values)
- .then(defer(PID<CgroupsMemIsolatorProcess>(this),
- &CgroupsMemIsolatorProcess::_usage,
- containerId,
- result,
- levels,
- lambda::_1));
-}
-
-
-Future<ResourceStatistics> CgroupsMemIsolatorProcess::_usage(
- const ContainerID& containerId,
- ResourceStatistics result,
- const list<Level>& levels,
- const list<Future<uint64_t>>& values)
-{
- if (!infos.contains(containerId)) {
- return Failure("Unknown container");
- }
-
- list<Level>::const_iterator iterator = levels.begin();
- foreach (const Future<uint64_t>& value, values) {
- if (value.isReady()) {
- switch (*iterator) {
- case Level::LOW:
- result.set_mem_low_pressure_counter(value.get());
- break;
- case Level::MEDIUM:
- result.set_mem_medium_pressure_counter(value.get());
- break;
- case Level::CRITICAL:
- result.set_mem_critical_pressure_counter(value.get());
- break;
- }
- } else {
- LOG(ERROR) << "Failed to listen on " << stringify(*iterator)
- << " pressure events for container " << containerId << ": "
- << (value.isFailed() ? value.failure() : "discarded");
- }
-
- ++iterator;
- }
-
- return result;
-}
-
-
-Future<Nothing> CgroupsMemIsolatorProcess::cleanup(
- const ContainerID& containerId)
-{
- // Multiple calls may occur during test clean up.
- if (!infos.contains(containerId)) {
- VLOG(1) << "Ignoring cleanup request for unknown container: "
- << containerId;
- return Nothing();
- }
-
- Info* info = CHECK_NOTNULL(infos[containerId]);
-
- if (info->oomNotifier.isPending()) {
- info->oomNotifier.discard();
- }
-
- return cgroups::destroy(hierarchy, info->cgroup, cgroups::DESTROY_TIMEOUT)
- .onAny(defer(PID<CgroupsMemIsolatorProcess>(this),
- &CgroupsMemIsolatorProcess::_cleanup,
- containerId,
- lambda::_1));
-}
-
-
-Future<Nothing> CgroupsMemIsolatorProcess::_cleanup(
- const ContainerID& containerId,
- const Future<Nothing>& future)
-{
- if (!infos.contains(containerId)) {
- return Failure("Unknown container");
- }
-
- CHECK_NOTNULL(infos[containerId]);
-
- if (!future.isReady()) {
- return Failure("Failed to clean up container " + stringify(containerId) +
- " : " + (future.isFailed() ? future.failure()
- : "discarded"));
- }
-
- delete infos[containerId];
- infos.erase(containerId);
-
- return Nothing();
-}
-
-
-void CgroupsMemIsolatorProcess::oomListen(
- const ContainerID& containerId)
-{
- CHECK(infos.contains(containerId));
- Info* info = CHECK_NOTNULL(infos[containerId]);
-
- info->oomNotifier = cgroups::memory::oom::listen(hierarchy, info->cgroup);
-
- // If the listening fails immediately, something very wrong
- // happened. Therefore, we report a fatal error here.
- if (info->oomNotifier.isFailed()) {
- LOG(FATAL) << "Failed to listen for OOM events for container "
- << containerId << ": "
- << info->oomNotifier.failure();
- }
-
- LOG(INFO) << "Started listening for OOM events for container "
- << containerId;
-
- info->oomNotifier.onReady(defer(
- PID<CgroupsMemIsolatorProcess>(this),
- &CgroupsMemIsolatorProcess::oomWaited,
- containerId,
- lambda::_1));
-}
-
-
-void CgroupsMemIsolatorProcess::oomWaited(
- const ContainerID& containerId,
- const Future<Nothing>& future)
-{
- if (future.isDiscarded()) {
- LOG(INFO) << "Discarded OOM notifier for container "
- << containerId;
- } else if (future.isFailed()) {
- LOG(ERROR) << "Listening on OOM events failed for container "
- << containerId << ": " << future.failure();
- } else {
- // Out-of-memory event happened, call the handler.
- LOG(INFO) << "OOM notifier is triggered for container " << containerId;
- oom(containerId);
- }
-}
-
-
-void CgroupsMemIsolatorProcess::oom(const ContainerID& containerId)
-{
- if (!infos.contains(containerId)) {
- // It is likely that process exited is executed before this
- // function (e.g. The kill and OOM events happen at the same
- // time, and the process exit event arrives first.) Therefore, we
- // should not report a fatal error here.
- LOG(INFO) << "OOM detected for an already terminated executor";
- return;
- }
-
- Info* info = CHECK_NOTNULL(infos[containerId]);
-
- LOG(INFO) << "OOM detected for container " << containerId;
-
- // Construct a "message" string to describe why the isolator
- // destroyed the executor's cgroup (in order to assist in
- // debugging).
- ostringstream message;
- message << "Memory limit exceeded: ";
-
- // Output the requested memory limit.
- // NOTE: If limitSwap is (has been) used then both limit_in_bytes
- // and memsw.limit_in_bytes will always be set to the same value.
- Try<Bytes> limit = cgroups::memory::limit_in_bytes(hierarchy, info->cgroup);
-
- if (limit.isError()) {
- LOG(ERROR) << "Failed to read 'memory.limit_in_bytes': "
- << limit.error();
- } else {
- message << "Requested: " << limit.get() << " ";
- }
-
- // Output the maximum memory usage.
- Try<Bytes> usage = cgroups::memory::max_usage_in_bytes(
- hierarchy, info->cgroup);
-
- if (usage.isError()) {
- LOG(ERROR) << "Failed to read 'memory.max_usage_in_bytes': "
- << usage.error();
- } else {
- message << "Maximum Used: " << usage.get() << "\n";
- }
-
- // Output 'memory.stat' of the cgroup to help with debugging.
- // NOTE: With Kernel OOM-killer enabled these stats may not reflect
- // memory state at time of OOM.
- Try<string> read = cgroups::read(hierarchy, info->cgroup, "memory.stat");
- if (read.isError()) {
- LOG(ERROR) << "Failed to read 'memory.stat': " << read.error();
- } else {
- message << "\nMEMORY STATISTICS: \n" << read.get() << "\n";
- }
-
- LOG(INFO) << strings::trim(message.str()); // Trim the extra '\n' at the end.
-
- // TODO(jieyu): This is not accurate if the memory resource is from
- // a non-star role or spans roles (e.g., "*" and "role"). Ideally,
- // we should save the resources passed in and report it here.
- Resources mem = Resources::parse(
- "mem",
- stringify(usage.isSome() ? usage.get().megabytes() : 0),
- "*").get();
-
- info->limitation.set(
- protobuf::slave::createContainerLimitation(
- mem,
- message.str(),
- TaskStatus::REASON_CONTAINER_LIMITATION_MEMORY));
-}
-
-
-void CgroupsMemIsolatorProcess::pressureListen(
- const ContainerID& containerId)
-{
- CHECK(infos.contains(containerId));
- Info* info = CHECK_NOTNULL(infos[containerId]);
-
- foreach (Level level, levels()) {
- Try<Owned<Counter>> counter = Counter::create(
- hierarchy,
- info->cgroup,
- level);
-
- if (counter.isError()) {
- LOG(ERROR) << "Failed to listen on " << level << " memory pressure "
- << "events for container " << containerId << ": "
- << counter.error();
- } else {
- info->pressureCounters[level] = counter.get();
-
- LOG(INFO) << "Started listening on " << level << " memory pressure "
- << "events for container " << containerId;
- }
- }
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/cgroups/mem.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/cgroups/mem.hpp b/src/slave/containerizer/isolators/cgroups/mem.hpp
deleted file mode 100644
index 47f73c3..0000000
--- a/src/slave/containerizer/isolators/cgroups/mem.hpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __MEM_ISOLATOR_HPP__
-#define __MEM_ISOLATOR_HPP__
-
-#include <sys/types.h>
-
-#include <process/future.hpp>
-#include <process/owned.hpp>
-
-#include <stout/hashmap.hpp>
-#include <stout/nothing.hpp>
-#include <stout/option.hpp>
-
-#include "linux/cgroups.hpp"
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-class CgroupsMemIsolatorProcess : public MesosIsolatorProcess
-{
-public:
- static Try<mesos::slave::Isolator*> create(const Flags& flags);
-
- virtual ~CgroupsMemIsolatorProcess();
-
- virtual process::Future<Nothing> recover(
- const std::list<mesos::slave::ContainerState>& states,
- const hashset<ContainerID>& orphans);
-
- virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
- const ContainerID& containerId,
- const ExecutorInfo& executorInfo,
- const std::string& directory,
- const Option<std::string>& user);
-
- virtual process::Future<Nothing> isolate(
- const ContainerID& containerId,
- pid_t pid);
-
- virtual process::Future<mesos::slave::ContainerLimitation> watch(
- const ContainerID& containerId);
-
- virtual process::Future<Nothing> update(
- const ContainerID& containerId,
- const Resources& resources);
-
- virtual process::Future<ResourceStatistics> usage(
- const ContainerID& containerId);
-
- virtual process::Future<Nothing> cleanup(
- const ContainerID& containerId);
-
-private:
- CgroupsMemIsolatorProcess(
- const Flags& flags,
- const std::string& hierarchy,
- bool limitSwap);
-
- process::Future<ResourceStatistics> _usage(
- const ContainerID& containerId,
- ResourceStatistics result,
- const std::list<cgroups::memory::pressure::Level>& levels,
- const std::list<process::Future<uint64_t>>& values);
-
- process::Future<Nothing> _cleanup(
- const ContainerID& containerId,
- const process::Future<Nothing>& future);
-
- struct Info
- {
- Info(const ContainerID& _containerId, const std::string& _cgroup)
- : containerId(_containerId), cgroup(_cgroup) {}
-
- const ContainerID containerId;
- const std::string cgroup;
- Option<pid_t> pid;
-
- process::Promise<mesos::slave::ContainerLimitation> limitation;
-
- // Used to cancel the OOM listening.
- process::Future<Nothing> oomNotifier;
-
- hashmap<cgroups::memory::pressure::Level,
- process::Owned<cgroups::memory::pressure::Counter>>
- pressureCounters;
- };
-
- // Start listening on OOM events. This function will create an
- // eventfd and start polling on it.
- void oomListen(const ContainerID& containerId);
-
- // This function is invoked when the polling on eventfd has a
- // result.
- void oomWaited(
- const ContainerID& containerId,
- const process::Future<Nothing>& future);
-
- // This function is invoked when the OOM event happens.
- void oom(const ContainerID& containerId);
-
- // Start listening on memory pressure events.
- void pressureListen(const ContainerID& containerId);
-
- const Flags flags;
-
- // The path to the cgroups subsystem hierarchy root.
- const std::string hierarchy;
-
- const bool limitSwap;
-
- // TODO(bmahler): Use Owned<Info>.
- hashmap<ContainerID, Info*> infos;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __MEM_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/cgroups/perf_event.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/cgroups/perf_event.cpp b/src/slave/containerizer/isolators/cgroups/perf_event.cpp
deleted file mode 100644
index 03035df..0000000
--- a/src/slave/containerizer/isolators/cgroups/perf_event.cpp
+++ /dev/null
@@ -1,435 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdint.h>
-
-#include <vector>
-
-#include <google/protobuf/descriptor.h>
-#include <google/protobuf/message.h>
-
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/delay.hpp>
-#include <process/io.hpp>
-#include <process/pid.hpp>
-#include <process/reap.hpp>
-#include <process/subprocess.hpp>
-
-#include <stout/bytes.hpp>
-#include <stout/check.hpp>
-#include <stout/error.hpp>
-#include <stout/foreach.hpp>
-#include <stout/hashset.hpp>
-#include <stout/lambda.hpp>
-#include <stout/os.hpp>
-#include <stout/path.hpp>
-#include <stout/stringify.hpp>
-#include <stout/try.hpp>
-
-#include "linux/cgroups.hpp"
-#include "linux/perf.hpp"
-
-#include "slave/containerizer/isolators/cgroups/perf_event.hpp"
-
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::ContainerState;
-using mesos::slave::Isolator;
-
-using std::list;
-using std::set;
-using std::string;
-using std::vector;
-
-using process::Clock;
-using process::Failure;
-using process::Future;
-using process::PID;
-using process::Time;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-Try<Isolator*> CgroupsPerfEventIsolatorProcess::create(const Flags& flags)
-{
- LOG(INFO) << "Creating PerfEvent isolator";
-
- if (!perf::supported()) {
- return Error("Perf is not supported");
- }
-
- if (flags.perf_duration > flags.perf_interval) {
- return Error("Sampling perf for duration (" +
- stringify(flags.perf_duration) +
- ") > interval (" +
- stringify(flags.perf_interval) +
- ") is not supported.");
- }
-
- if (!flags.perf_events.isSome()) {
- return Error("No perf events specified.");
- }
-
- set<string> events;
- foreach (const string& event,
- strings::tokenize(flags.perf_events.get(), ",")) {
- events.insert(event);
- }
-
- if (!perf::valid(events)) {
- return Error("Failed to create PerfEvent isolator, invalid events: " +
- stringify(events));
- }
-
- Try<string> hierarchy = cgroups::prepare(
- flags.cgroups_hierarchy,
- "perf_event",
- flags.cgroups_root);
-
- if (hierarchy.isError()) {
- return Error("Failed to create perf_event cgroup: " + hierarchy.error());
- }
-
- LOG(INFO) << "PerfEvent isolator will profile for " << flags.perf_duration
- << " every " << flags.perf_interval
- << " for events: " << stringify(events);
-
- process::Owned<MesosIsolatorProcess> process(
- new CgroupsPerfEventIsolatorProcess(flags, hierarchy.get(), events));
-
- return new MesosIsolator(process);
-}
-
-
-CgroupsPerfEventIsolatorProcess::~CgroupsPerfEventIsolatorProcess() {}
-
-
-void CgroupsPerfEventIsolatorProcess::initialize()
-{
- // Start sampling.
- sample();
-}
-
-
-Future<Nothing> CgroupsPerfEventIsolatorProcess::recover(
- const list<ContainerState>& states,
- const hashset<ContainerID>& orphans)
-{
- foreach (const ContainerState& state, states) {
- const ContainerID& containerId = state.container_id();
- const string cgroup = path::join(flags.cgroups_root, containerId.value());
-
- Try<bool> exists = cgroups::exists(hierarchy, cgroup);
- if (exists.isError()) {
- foreachvalue (Info* info, infos) {
- delete info;
- }
-
- infos.clear();
- return Failure("Failed to check cgroup " + cgroup +
- " for container '" + stringify(containerId) + "'");
- }
-
- if (!exists.get()) {
- // This may occur if the executor is exiting and the isolator has
- // destroyed the cgroup but the slave dies before noticing this. This
- // will be detected when the containerizer tries to monitor the
- // executor's pid.
- // NOTE: This could also occur if this isolator is now enabled for a
- // container that was started without this isolator. For this
- // particular isolator it is acceptable to continue running this
- // container without a perf_event cgroup because we don't ever
- // query it and the destroy will succeed immediately.
- VLOG(1) << "Couldn't find perf event cgroup for container " << containerId
- << ", perf statistics will not be available";
- continue;
- }
-
- infos[containerId] = new Info(containerId, cgroup);
- }
-
- // Remove orphan cgroups.
- Try<vector<string>> cgroups = cgroups::get(hierarchy, flags.cgroups_root);
- if (cgroups.isError()) {
- foreachvalue (Info* info, infos) {
- delete info;
- }
- infos.clear();
- return Failure(cgroups.error());
- }
-
- foreach (const string& cgroup, cgroups.get()) {
- // Ignore the slave cgroup (see the --slave_subsystems flag).
- // TODO(idownes): Remove this when the cgroups layout is updated,
- // see MESOS-1185.
- if (cgroup == path::join(flags.cgroups_root, "slave")) {
- continue;
- }
-
- ContainerID containerId;
- containerId.set_value(Path(cgroup).basename());
-
- if (infos.contains(containerId)) {
- continue;
- }
-
- // Known orphan cgroups will be destroyed by the containerizer
- // using the normal cleanup path. See details in MESOS-2367.
- if (orphans.contains(containerId)) {
- infos[containerId] = new Info(containerId, cgroup);
- continue;
- }
-
- LOG(INFO) << "Removing unknown orphaned cgroup '" << cgroup << "'";
-
- // We don't wait on the destroy as we don't want to block recovery.
- cgroups::destroy(hierarchy, cgroup, cgroups::DESTROY_TIMEOUT);
- }
-
- return Nothing();
-}
-
-
-Future<Option<ContainerPrepareInfo>> CgroupsPerfEventIsolatorProcess::prepare(
- const ContainerID& containerId,
- const ExecutorInfo& executorInfo,
- const string& directory,
- const Option<string>& user)
-{
- if (infos.contains(containerId)) {
- return Failure("Container has already been prepared");
- }
-
- LOG(INFO) << "Preparing perf event cgroup for " << containerId;
-
- Info* info = new Info(
- containerId,
- path::join(flags.cgroups_root, containerId.value()));
-
- infos[containerId] = CHECK_NOTNULL(info);
-
- // Create a cgroup for this container.
- Try<bool> exists = cgroups::exists(hierarchy, info->cgroup);
-
- if (exists.isError()) {
- return Failure("Failed to prepare isolator: " + exists.error());
- }
-
- if (exists.get()) {
- return Failure("Failed to prepare isolator: cgroup already exists");
- }
-
- if (!exists.get()) {
- Try<Nothing> create = cgroups::create(hierarchy, info->cgroup);
- if (create.isError()) {
- return Failure("Failed to prepare isolator: " + create.error());
- }
- }
-
- // Chown the cgroup so the executor can create nested cgroups. Do
- // not recurse so the control files are still owned by the slave
- // user and thus cannot be changed by the executor.
- if (user.isSome()) {
- Try<Nothing> chown = os::chown(
- user.get(),
- path::join(hierarchy, info->cgroup),
- false);
- if (chown.isError()) {
- return Failure("Failed to prepare isolator: " + chown.error());
- }
- }
-
- return None();
-}
-
-
-Future<Nothing> CgroupsPerfEventIsolatorProcess::isolate(
- const ContainerID& containerId,
- pid_t pid)
-{
- if (!infos.contains(containerId)) {
- return Failure("Unknown container");
- }
-
- Info* info = CHECK_NOTNULL(infos[containerId]);
-
- Try<Nothing> assign = cgroups::assign(hierarchy, info->cgroup, pid);
- if (assign.isError()) {
- return Failure("Failed to assign container '" +
- stringify(info->containerId) + "' to its own cgroup '" +
- path::join(hierarchy, info->cgroup) +
- "' : " + assign.error());
- }
-
- return Nothing();
-}
-
-
-Future<ContainerLimitation> CgroupsPerfEventIsolatorProcess::watch(
- const ContainerID& containerId)
-{
- // No resources are limited.
- return Future<ContainerLimitation>();
-}
-
-
-Future<Nothing> CgroupsPerfEventIsolatorProcess::update(
- const ContainerID& containerId,
- const Resources& resources)
-{
- // Nothing to update.
- return Nothing();
-}
-
-
-Future<ResourceStatistics> CgroupsPerfEventIsolatorProcess::usage(
- const ContainerID& containerId)
-{
- if (!infos.contains(containerId)) {
- // Return an empty ResourceStatistics, i.e., without
- // PerfStatistics, if we don't know about this container.
- return ResourceStatistics();
- }
-
- CHECK_NOTNULL(infos[containerId]);
-
- ResourceStatistics statistics;
- statistics.mutable_perf()->CopyFrom(infos[containerId]->statistics);
-
- return statistics;
-}
-
-
-Future<Nothing> CgroupsPerfEventIsolatorProcess::cleanup(
- const ContainerID& containerId)
-{
- // Tolerate clean up attempts for unknown containers which may arise from
- // repeated clean up attempts (during test cleanup).
- if (!infos.contains(containerId)) {
- VLOG(1) << "Ignoring cleanup request for unknown container: "
- << containerId;
- return Nothing();
- }
-
- Info* info = CHECK_NOTNULL(infos[containerId]);
-
- info->destroying = true;
-
- return cgroups::destroy(hierarchy, info->cgroup)
- .then(defer(PID<CgroupsPerfEventIsolatorProcess>(this),
- &CgroupsPerfEventIsolatorProcess::_cleanup,
- containerId));
-}
-
-
-Future<Nothing> CgroupsPerfEventIsolatorProcess::_cleanup(
- const ContainerID& containerId)
-{
- if (!infos.contains(containerId)) {
- return Nothing();
- }
-
- delete infos[containerId];
- infos.erase(containerId);
-
- return Nothing();
-}
-
-
-Future<hashmap<string, PerfStatistics>> discardSample(
- Future<hashmap<string, PerfStatistics>> future,
- const Duration& duration,
- const Duration& timeout)
-{
- LOG(ERROR) << "Perf sample of " << stringify(duration)
- << " failed to complete within " << stringify(timeout)
- << "; sampling will be halted";
-
- future.discard();
-
- return future;
-}
-
-
-void CgroupsPerfEventIsolatorProcess::sample()
-{
- // Collect a perf sample for all cgroups that are not being
- // destroyed. Since destroyal is asynchronous, 'perf stat' may
- // fail if the cgroup is destroyed before running perf.
- set<string> cgroups;
-
- foreachvalue (Info* info, infos) {
- CHECK_NOTNULL(info);
-
- if (!info->destroying) {
- cgroups.insert(info->cgroup);
- }
- }
-
- // The discard timeout includes an allowance of twice the
- // reaper interval to ensure we see the perf process exit.
- Duration timeout = flags.perf_duration + process::MAX_REAP_INTERVAL() * 2;
-
- perf::sample(events, cgroups, flags.perf_duration)
- .after(timeout,
- lambda::bind(&discardSample,
- lambda::_1,
- flags.perf_duration,
- timeout))
- .onAny(defer(PID<CgroupsPerfEventIsolatorProcess>(this),
- &CgroupsPerfEventIsolatorProcess::_sample,
- Clock::now() + flags.perf_interval,
- lambda::_1));
-}
-
-
-void CgroupsPerfEventIsolatorProcess::_sample(
- const Time& next,
- const Future<hashmap<string, PerfStatistics>>& statistics)
-{
- if (!statistics.isReady()) {
- // In case the failure is transient or this is due to a timeout,
- // we continue sampling. Note that since sampling is done on an
- // interval, it should be ok if this is a non-transient failure.
- LOG(ERROR) << "Failed to get perf sample: "
- << (statistics.isFailed()
- ? statistics.failure()
- : "discarded due to timeout");
- } else {
- // Store the latest statistics, note that cgroups added in the
- // interim will be picked up by the next sample.
- foreachvalue (Info* info, infos) {
- CHECK_NOTNULL(info);
-
- if (statistics->contains(info->cgroup)) {
- info->statistics = statistics->get(info->cgroup).get();
- }
- }
- }
-
- // Schedule sample for the next time.
- delay(next - Clock::now(),
- PID<CgroupsPerfEventIsolatorProcess>(this),
- &CgroupsPerfEventIsolatorProcess::sample);
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
[05/12] mesos git commit: Relocated MesosContainerizer specific files
to the correct location.
Posted by ji...@apache.org.
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/network/port_mapping.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/network/port_mapping.hpp b/src/slave/containerizer/mesos/isolators/network/port_mapping.hpp
new file mode 100644
index 0000000..ae53c1b
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/network/port_mapping.hpp
@@ -0,0 +1,403 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PORT_MAPPING_ISOLATOR_HPP__
+#define __PORT_MAPPING_ISOLATOR_HPP__
+
+#include <stdint.h>
+
+#include <sys/types.h>
+
+#include <set>
+#include <string>
+#include <vector>
+
+#include <process/owned.hpp>
+#include <process/subprocess.hpp>
+
+#include <process/metrics/metrics.hpp>
+#include <process/metrics/counter.hpp>
+
+#include <stout/bytes.hpp>
+#include <stout/hashmap.hpp>
+#include <stout/hashset.hpp>
+#include <stout/ip.hpp>
+#include <stout/interval.hpp>
+#include <stout/mac.hpp>
+#include <stout/none.hpp>
+#include <stout/option.hpp>
+#include <stout/subcommand.hpp>
+
+#include "linux/routing/filter/ip.hpp"
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// The prefix this isolator uses for the virtual ethernet devices.
+// NOTE: This constant is exposed for testing.
+inline std::string PORT_MAPPING_VETH_PREFIX() { return "mesos"; }
+
+
+// The root directory where we bind mount all the namespace handles.
+// We choose the directory '/var/run/netns' so that we can use
+// iproute2 suite (e.g., ip netns show/exec) to inspect or enter the
+// network namespace. This is very useful for debugging purposes.
+// NOTE: This constant is exposed for testing.
+inline std::string PORT_MAPPING_BIND_MOUNT_ROOT() { return "/var/run/netns"; }
+
+// The root directory where we keep all the namespace handle
+// symlinks. This is introduced in 0.23.0.
+// NOTE: This constant is exposed for testing.
+inline std::string PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT()
+{
+ return "/var/run/mesos/netns";
+}
+
+
+// These names are used to identify the traffic control statistics
+// output for each of the Linux Traffic Control Qdiscs we report.
+constexpr char NET_ISOLATOR_BW_LIMIT[] = "bw_limit";
+constexpr char NET_ISOLATOR_BLOAT_REDUCTION[] = "bloat_reduction";
+
+
+// Responsible for allocating ephemeral ports for the port mapping
+// network isolator. This class is exposed mainly for unit testing.
+class EphemeralPortsAllocator
+{
+public:
+ EphemeralPortsAllocator(
+ const IntervalSet<uint16_t>& total,
+ size_t _portsPerContainer)
+ : free(total),
+ portsPerContainer_(_portsPerContainer) {};
+
+ // Returns the number of ephemeral ports for each container.
+ size_t portsPerContainer() const { return portsPerContainer_; }
+
+ // Allocate an ephemeral port range for a container. The allocator
+ // will automatically find one port range with the given container
+ // size. Returns error if the allocation cannot be fulfilled (e.g.,
+ // exhausting available ephemeral ports).
+ Try<Interval<uint16_t>> allocate();
+
+ // Mark the specified ephemeral port range as allocated.
+ void allocate(const Interval<uint16_t>& ports);
+
+ // Deallocate the specified ephemeral port range.
+ void deallocate(const Interval<uint16_t>& ports);
+
+ // Return true if the specified ephemeral port range is managed by
+ // the allocator, regardless of whether it has been allocated or not.
+ bool isManaged(const Interval<uint16_t>& ports)
+ {
+ return (free + used).contains(ports);
+ }
+
+private:
+ // Given an integer x, return the smallest integer t such that t >=
+ // x and t % m == 0.
+ static uint32_t nextMultipleOf(uint32_t x, uint32_t m);
+
+ IntervalSet<uint16_t> free;
+ IntervalSet<uint16_t> used;
+
+ // The number of ephemeral ports for each container.
+ size_t portsPerContainer_;
+};
+
+
+// For the specified ports, generate a set of port ranges each of
+// which can be used by a single IP filter. In other words, each port
+// range needs to satisfy the following two conditions: 1) the size of
+// the range is 2^n (n=0,1,2...); 2) the begin of the range is size
+// aligned (i.e., begin % size == 0). This function is exposed mainly
+// for unit testing.
+std::vector<routing::filter::ip::PortRange> getPortRanges(
+ const IntervalSet<uint16_t>& ports);
+
+
+// Provides network isolation using port mapping. Each container is
+// assigned a fixed set of ports (including ephemeral ports). The
+// isolator will set up filters on the host such that network traffic
+// to the host will be properly redirected to the corresponding
+// container depending on the destination ports. The network traffic
+// from containers will also be properly relayed to the host. This
+// isolator is useful when the operator wants to reuse the host IP for
+// all containers running on the host (e.g., there are insufficient
+// IPs).
+class PortMappingIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+ static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+ virtual ~PortMappingIsolatorProcess() {}
+
+ virtual process::Future<Nothing> recover(
+ const std::list<mesos::slave::ContainerState>& states,
+ const hashset<ContainerID>& orphans);
+
+ virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+ const ContainerID& containerId,
+ const ExecutorInfo& executorInfo,
+ const std::string& directory,
+ const Option<std::string>& user);
+
+ virtual process::Future<Nothing> isolate(
+ const ContainerID& containerId,
+ pid_t pid);
+
+ virtual process::Future<mesos::slave::ContainerLimitation> watch(
+ const ContainerID& containerId);
+
+ virtual process::Future<Nothing> update(
+ const ContainerID& containerId,
+ const Resources& resources);
+
+ virtual process::Future<ResourceStatistics> usage(
+ const ContainerID& containerId);
+
+ virtual process::Future<Nothing> cleanup(
+ const ContainerID& containerId);
+
+private:
+ struct Info
+ {
+ Info(const IntervalSet<uint16_t>& _nonEphemeralPorts,
+ const Interval<uint16_t>& _ephemeralPorts,
+ const Option<pid_t>& _pid = None())
+ : nonEphemeralPorts(_nonEphemeralPorts),
+ ephemeralPorts(_ephemeralPorts),
+ pid(_pid) {}
+
+ // Non-ephemeral ports used by the container. It's possible that a
+ // container does not use any non-ephemeral ports. In that case,
+ // 'nonEphemeralPorts' will be empty. This variable could change
+ // upon 'update'.
+ IntervalSet<uint16_t> nonEphemeralPorts;
+
+ // Each container has one and only one range of ephemeral ports.
+ // It cannot have more than one range of ephemeral ports because
+ // we need to setup the ip_local_port_range (which only accepts a
+ // single interval) inside the container to restrict the ephemeral
+ // ports used by the container.
+ const Interval<uint16_t> ephemeralPorts;
+
+ Option<pid_t> pid;
+ Option<uint16_t> flowId;
+ };
+
+ // Define the metrics used by the port mapping network isolator.
+ struct Metrics
+ {
+ Metrics();
+ ~Metrics();
+
+ process::metrics::Counter adding_eth0_ip_filters_errors;
+ process::metrics::Counter adding_eth0_ip_filters_already_exist;
+ process::metrics::Counter adding_eth0_egress_filters_errors;
+ process::metrics::Counter adding_eth0_egress_filters_already_exist;
+ process::metrics::Counter adding_lo_ip_filters_errors;
+ process::metrics::Counter adding_lo_ip_filters_already_exist;
+ process::metrics::Counter adding_veth_ip_filters_errors;
+ process::metrics::Counter adding_veth_ip_filters_already_exist;
+ process::metrics::Counter adding_veth_icmp_filters_errors;
+ process::metrics::Counter adding_veth_icmp_filters_already_exist;
+ process::metrics::Counter adding_veth_arp_filters_errors;
+ process::metrics::Counter adding_veth_arp_filters_already_exist;
+ process::metrics::Counter adding_eth0_icmp_filters_errors;
+ process::metrics::Counter adding_eth0_icmp_filters_already_exist;
+ process::metrics::Counter adding_eth0_arp_filters_errors;
+ process::metrics::Counter adding_eth0_arp_filters_already_exist;
+ process::metrics::Counter removing_eth0_ip_filters_errors;
+ process::metrics::Counter removing_eth0_ip_filters_do_not_exist;
+ process::metrics::Counter removing_eth0_egress_filters_errors;
+ process::metrics::Counter removing_eth0_egress_filters_do_not_exist;
+ process::metrics::Counter removing_lo_ip_filters_errors;
+ process::metrics::Counter removing_lo_ip_filters_do_not_exist;
+ process::metrics::Counter removing_veth_ip_filters_errors;
+ process::metrics::Counter removing_veth_ip_filters_do_not_exist;
+ process::metrics::Counter removing_eth0_icmp_filters_errors;
+ process::metrics::Counter removing_eth0_icmp_filters_do_not_exist;
+ process::metrics::Counter removing_eth0_arp_filters_errors;
+ process::metrics::Counter removing_eth0_arp_filters_do_not_exist;
+ process::metrics::Counter updating_eth0_icmp_filters_errors;
+ process::metrics::Counter updating_eth0_icmp_filters_already_exist;
+ process::metrics::Counter updating_eth0_icmp_filters_do_not_exist;
+ process::metrics::Counter updating_eth0_arp_filters_errors;
+ process::metrics::Counter updating_eth0_arp_filters_already_exist;
+ process::metrics::Counter updating_eth0_arp_filters_do_not_exist;
+ process::metrics::Counter updating_container_ip_filters_errors;
+ } metrics;
+
+ PortMappingIsolatorProcess(
+ const Flags& _flags,
+ const std::string& _eth0,
+ const std::string& _lo,
+ const net::MAC& _hostMAC,
+ const net::IPNetwork& _hostIPNetwork,
+ const size_t _hostEth0MTU,
+ const net::IP& _hostDefaultGateway,
+ const hashmap<std::string, std::string>& _hostNetworkConfigurations,
+ const Option<Bytes>& _egressRateLimitPerContainer,
+ const IntervalSet<uint16_t>& _managedNonEphemeralPorts,
+ const process::Owned<EphemeralPortsAllocator>& _ephemeralPortsAllocator,
+ const std::set<uint16_t>& _flowIDs)
+ : flags(_flags),
+ eth0(_eth0),
+ lo(_lo),
+ hostMAC(_hostMAC),
+ hostIPNetwork(_hostIPNetwork),
+ hostEth0MTU(_hostEth0MTU),
+ hostDefaultGateway(_hostDefaultGateway),
+ hostNetworkConfigurations(_hostNetworkConfigurations),
+ egressRateLimitPerContainer(_egressRateLimitPerContainer),
+ managedNonEphemeralPorts(_managedNonEphemeralPorts),
+ ephemeralPortsAllocator(_ephemeralPortsAllocator),
+ freeFlowIds(_flowIDs) {}
+
+ // Continuations.
+ Try<Nothing> _cleanup(Info* info, const Option<ContainerID>& containerId);
+ Try<Info*> _recover(pid_t pid);
+
+ void _update(
+ const ContainerID& containerId,
+ const process::Future<Option<int>>& status);
+
+ process::Future<ResourceStatistics> _usage(
+ const ResourceStatistics& result,
+ const process::Subprocess& s);
+
+ process::Future<ResourceStatistics> __usage(
+ ResourceStatistics result,
+ const process::Future<std::string>& out);
+
+ // Helper functions.
+ Try<Nothing> addHostIPFilters(
+ const routing::filter::ip::PortRange& range,
+ const Option<uint16_t>& flowId,
+ const std::string& veth);
+
+ Try<Nothing> removeHostIPFilters(
+ const routing::filter::ip::PortRange& range,
+ const std::string& veth,
+ bool removeFiltersOnVeth = true);
+
+ // Return the scripts that will be executed in the child context.
+ std::string scripts(Info* info);
+
+ uint16_t getNextFlowId();
+
+ const Flags flags;
+
+ const std::string eth0;
+ const std::string lo;
+ const net::MAC hostMAC;
+ const net::IPNetwork hostIPNetwork;
+ const size_t hostEth0MTU;
+ const net::IP hostDefaultGateway;
+
+ // Describe the host network configurations. It is a map between
+ // configure proc files (e.g., /proc/sys/net/core/somaxconn) and
+ // values of the configure proc files.
+ const hashmap<std::string, std::string> hostNetworkConfigurations;
+
+ // The optional throughput limit to containers' egress traffic.
+ const Option<Bytes> egressRateLimitPerContainer;
+
+ // All the non-ephemeral ports managed by the slave, as passed in
+ // via flags.resources.
+ const IntervalSet<uint16_t> managedNonEphemeralPorts;
+
+ process::Owned<EphemeralPortsAllocator> ephemeralPortsAllocator;
+
+ // Store a set of unused flow IDs on this slave.
+ std::set<uint16_t> freeFlowIds;
+
+ hashmap<ContainerID, Info*> infos;
+
+ // Recovered containers from a previous run that weren't managed by
+ // the network isolator.
+ hashset<ContainerID> unmanaged;
+};
+
+
+// Defines the subcommand for 'update' that needs to be executed by a
+// subprocess to update the filters inside a container.
+class PortMappingUpdate : public Subcommand
+{
+public:
+ static const char* NAME;
+
+ struct Flags : public flags::FlagsBase
+ {
+ Flags();
+
+ Option<std::string> eth0_name;
+ Option<std::string> lo_name;
+ Option<pid_t> pid;
+ Option<JSON::Object> ports_to_add;
+ Option<JSON::Object> ports_to_remove;
+ };
+
+ PortMappingUpdate() : Subcommand(NAME) {}
+
+ Flags flags;
+
+protected:
+ virtual int execute();
+ virtual flags::FlagsBase* getFlags() { return &flags; }
+};
+
+
+// Defines the subcommand for 'statistics' that needs to be executed
+// by a subprocess to retrieve network statistics from inside a
+// container.
+class PortMappingStatistics : public Subcommand
+{
+public:
+ static const char* NAME;
+
+ struct Flags : public flags::FlagsBase
+ {
+ Flags();
+
+ Option<std::string> eth0_name;
+ Option<pid_t> pid;
+ bool enable_socket_statistics_summary;
+ bool enable_socket_statistics_details;
+ };
+
+ PortMappingStatistics() : Subcommand(NAME) {}
+
+ Flags flags;
+
+protected:
+ virtual int execute();
+ virtual flags::FlagsBase* getFlags() { return &flags; }
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PORT_MAPPING_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/posix.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/posix.hpp b/src/slave/containerizer/mesos/isolators/posix.hpp
new file mode 100644
index 0000000..00df902
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/posix.hpp
@@ -0,0 +1,213 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __POSIX_ISOLATOR_HPP__
+#define __POSIX_ISOLATOR_HPP__
+
+#include <process/future.hpp>
+
+#include <stout/hashmap.hpp>
+#include <stout/os.hpp>
+
+#include <stout/os/pstree.hpp>
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+#include "usage/usage.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// A basic MesosIsolatorProcess that keeps track of the pid but
+// doesn't do any resource isolation. Subclasses must implement
+// usage() for their appropriate resource(s).
+class PosixIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+ virtual process::Future<Nothing> recover(
+ const std::list<mesos::slave::ContainerState>& state,
+ const hashset<ContainerID>& orphans)
+ {
+ foreach (const mesos::slave::ContainerState& run, state) {
+ // This should (almost) never occur: see comment in
+ // PosixLauncher::recover().
+ if (pids.contains(run.container_id())) {
+ return process::Failure("Container already recovered");
+ }
+
+ pids.put(run.container_id(), run.pid());
+
+ process::Owned<process::Promise<mesos::slave::ContainerLimitation>>
+ promise(new process::Promise<mesos::slave::ContainerLimitation>());
+ promises.put(run.container_id(), promise);
+ }
+
+ return Nothing();
+ }
+
+ virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+ const ContainerID& containerId,
+ const ExecutorInfo& executorInfo,
+ const std::string& directory,
+ const Option<std::string>& user)
+ {
+ if (promises.contains(containerId)) {
+ return process::Failure("Container " + stringify(containerId) +
+ " has already been prepared");
+ }
+
+ process::Owned<process::Promise<mesos::slave::ContainerLimitation>> promise(
+ new process::Promise<mesos::slave::ContainerLimitation>());
+ promises.put(containerId, promise);
+
+ return None();
+ }
+
+ virtual process::Future<Nothing> isolate(
+ const ContainerID& containerId,
+ pid_t pid)
+ {
+ if (!promises.contains(containerId)) {
+ return process::Failure("Unknown container: " + stringify(containerId));
+ }
+
+ pids.put(containerId, pid);
+
+ return Nothing();
+ }
+
+ virtual process::Future<mesos::slave::ContainerLimitation> watch(
+ const ContainerID& containerId)
+ {
+ if (!promises.contains(containerId)) {
+ return process::Failure("Unknown container: " + stringify(containerId));
+ }
+
+ return promises[containerId]->future();
+ }
+
+ virtual process::Future<Nothing> update(
+ const ContainerID& containerId,
+ const Resources& resources)
+ {
+ if (!promises.contains(containerId)) {
+ return process::Failure("Unknown container: " + stringify(containerId));
+ }
+
+ // No resources are actually isolated so nothing to do.
+ return Nothing();
+ }
+
+ virtual process::Future<Nothing> cleanup(const ContainerID& containerId)
+ {
+ if (!promises.contains(containerId)) {
+ return process::Failure("Unknown container: " + stringify(containerId));
+ }
+
+ // TODO(idownes): We should discard the container's promise here to signal
+ // to anyone that holds the future from watch().
+ promises.erase(containerId);
+
+ pids.erase(containerId);
+
+ return Nothing();
+ }
+
+protected:
+ hashmap<ContainerID, pid_t> pids;
+ hashmap<ContainerID,
+ process::Owned<process::Promise<mesos::slave::ContainerLimitation>>>
+ promises;
+};
+
+
+class PosixCpuIsolatorProcess : public PosixIsolatorProcess
+{
+public:
+ static Try<mesos::slave::Isolator*> create(const Flags& flags)
+ {
+ process::Owned<MesosIsolatorProcess> process(
+ new PosixCpuIsolatorProcess());
+
+ return new MesosIsolator(process);
+ }
+
+ virtual process::Future<ResourceStatistics> usage(
+ const ContainerID& containerId)
+ {
+ if (!pids.contains(containerId)) {
+ LOG(WARNING) << "No resource usage for unknown container '"
+ << containerId << "'";
+ return ResourceStatistics();
+ }
+
+ // Use 'mesos-usage' but only request 'cpus_' values.
+ Try<ResourceStatistics> usage =
+ mesos::internal::usage(pids.get(containerId).get(), false, true);
+ if (usage.isError()) {
+ return process::Failure(usage.error());
+ }
+ return usage.get();
+ }
+
+private:
+ PosixCpuIsolatorProcess() {}
+};
+
+
+class PosixMemIsolatorProcess : public PosixIsolatorProcess
+{
+public:
+ static Try<mesos::slave::Isolator*> create(const Flags& flags)
+ {
+ process::Owned<MesosIsolatorProcess> process(
+ new PosixMemIsolatorProcess());
+
+ return new MesosIsolator(process);
+ }
+
+ virtual process::Future<ResourceStatistics> usage(
+ const ContainerID& containerId)
+ {
+ if (!pids.contains(containerId)) {
+ LOG(WARNING) << "No resource usage for unknown container '"
+ << containerId << "'";
+ return ResourceStatistics();
+ }
+
+ // Use 'mesos-usage' but only request 'mem_' values.
+ Try<ResourceStatistics> usage =
+ mesos::internal::usage(pids.get(containerId).get(), true, false);
+ if (usage.isError()) {
+ return process::Failure(usage.error());
+ }
+ return usage.get();
+ }
+
+private:
+ PosixMemIsolatorProcess() {}
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __POSIX_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/posix/disk.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/posix/disk.cpp b/src/slave/containerizer/mesos/isolators/posix/disk.cpp
new file mode 100644
index 0000000..ec25156
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/posix/disk.cpp
@@ -0,0 +1,525 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <signal.h>
+
+#ifdef __linux__
+#include <sys/prctl.h>
+#endif
+#include <sys/types.h>
+
+#include <deque>
+#include <tuple>
+
+#include <glog/logging.h>
+
+#include <process/check.hpp>
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/delay.hpp>
+#include <process/io.hpp>
+#include <process/subprocess.hpp>
+
+#include <stout/check.hpp>
+#include <stout/foreach.hpp>
+#include <stout/lambda.hpp>
+#include <stout/numify.hpp>
+#include <stout/strings.hpp>
+
+#include <stout/os/exists.hpp>
+#include <stout/os/killtree.hpp>
+
+#include "common/protobuf_utils.hpp"
+
+#include "slave/containerizer/mesos/isolators/posix/disk.hpp"
+
+using namespace process;
+
+using std::deque;
+using std::list;
+using std::string;
+using std::vector;
+
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Factory: builds the disk isolator process for 'flags' and wraps it
+// in a MesosIsolator.
+Try<Isolator*> PosixDiskIsolatorProcess::create(const Flags& flags)
+{
+  // TODO(jieyu): Check the availability of command 'du'.
+
+  process::Owned<MesosIsolatorProcess> isolatorProcess(
+      new PosixDiskIsolatorProcess(flags));
+
+  return new MesosIsolator(isolatorProcess);
+}
+
+
+// Discards the outstanding usage future when a path stops being
+// tracked, so the pending check can be cancelled.
+PosixDiskIsolatorProcess::Info::PathInfo::~PathInfo()
+{
+  usage.discard();
+}
+
+
+// The collector is throttled by the 'container_disk_watch_interval'
+// flag.
+PosixDiskIsolatorProcess::PosixDiskIsolatorProcess(const Flags& _flags)
+  : flags(_flags),
+    collector(_flags.container_disk_watch_interval)
+{
+}
+
+
+PosixDiskIsolatorProcess::~PosixDiskIsolatorProcess()
+{
+}
+
+
+// Re-creates the bookkeeping entry of every recovered container.
+// 'orphans' is not used by this isolator.
+Future<Nothing> PosixDiskIsolatorProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  foreach (const ContainerState& state, states) {
+    const string& directory = state.directory();
+
+    // The executor is checkpointed only after its working directory
+    // has been created, so the directory must exist at this point.
+    CHECK(os::exists(directory))
+      << "Executor work directory " << directory << " doesn't exist";
+
+    Owned<Info> info(new Info(directory));
+    infos.put(state.container_id(), info);
+  }
+
+  return Nothing();
+}
+
+
+// Starts tracking 'containerId' with 'directory' as its working
+// directory. Fails if the container is already known. This isolator
+// never returns a ContainerPrepareInfo.
+Future<Option<ContainerPrepareInfo>> PosixDiskIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user)
+{
+  const bool alreadyPrepared = infos.contains(containerId);
+  if (alreadyPrepared) {
+    return Failure("Container has already been prepared");
+  }
+
+  Owned<Info> info(new Info(directory));
+  infos.put(containerId, info);
+
+  return None();
+}
+
+
+// Nothing needs to be done for 'pid'; this only verifies that the
+// container has been prepared or recovered.
+Future<Nothing> PosixDiskIsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  if (infos.contains(containerId)) {
+    return Nothing();
+  }
+
+  return Failure("Unknown container");
+}
+
+
+// Returns the future that is satisfied when the container exceeds
+// its disk quota.
+Future<ContainerLimitation> PosixDiskIsolatorProcess::watch(
+    const ContainerID& containerId)
+{
+  if (infos.contains(containerId)) {
+    const Owned<Info>& info = infos[containerId];
+    return info->limitation.future();
+  }
+
+  return Failure("Unknown container");
+}
+
+
+// Recomputes the per-path disk quotas of 'containerId' from
+// 'resources', starts usage collection for newly tracked paths and
+// stops tracking paths that no longer carry a quota.
+Future<Nothing> PosixDiskIsolatorProcess::update(
+    const ContainerID& containerId,
+    const Resources& resources)
+{
+  if (!infos.contains(containerId)) {
+    LOG(WARNING) << "Ignoring update for unknown container " << containerId;
+    return Nothing();
+  }
+
+  LOG(INFO) << "Updating the disk resources for container "
+            << containerId << " to " << resources;
+
+  const Owned<Info>& info = infos[containerId];
+
+  // Accumulates the requested quota for each path at which we collect
+  // disk usage and enforce the quota.
+  hashmap<string, Resources> quotas;
+
+  foreach (const Resource& resource, resources) {
+    if (resource.name() != "disk") {
+      continue;
+    }
+
+    // NOTE: We do not allow the case where has_disk() is true but
+    // with nothing set inside DiskInfo. The master will enforce it.
+    if (resource.has_disk()) {
+      // TODO(jieyu): Support persistent volumes as well.
+      LOG(ERROR) << "Enforcing disk quota unsupported for " << resource;
+      continue;
+    }
+
+    // Regular disk is charged to the executor working directory.
+    quotas[info->directory] += resource;
+  }
+
+  // Update the quota for each path. For each new path, we also
+  // initiate the disk usage collection.
+  foreachpair (const string& path, const Resources& quota, quotas) {
+    const bool tracked = info->paths.contains(path);
+
+    Info::PathInfo& pathInfo = info->paths[path];
+
+    if (!tracked) {
+      pathInfo.usage = collector.usage(path)
+        .onAny(defer(
+            PID<PosixDiskIsolatorProcess>(this),
+            &PosixDiskIsolatorProcess::_collect,
+            containerId,
+            path,
+            lambda::_1));
+    }
+
+    pathInfo.quota = quota;
+  }
+
+  // Remove the paths that we are no longer interested in. We iterate
+  // over a copy of the keys because entries are erased on the way.
+  foreach (const string& path, info->paths.keys()) {
+    if (quotas.contains(path)) {
+      continue;
+    }
+
+    info->paths.erase(path);
+  }
+
+  return Nothing();
+}
+
+
+// Continuation invoked when a disk usage check for 'path' finishes.
+// Records the measurement, reports a limitation if the quota is
+// exceeded (when enforcement is enabled), and then requests the next
+// check for the same path.
+void PosixDiskIsolatorProcess::_collect(
+    const ContainerID& containerId,
+    const string& path,
+    const Future<Bytes>& future)
+{
+  if (future.isFailed()) {
+    LOG(ERROR) << "Checking disk usage at '" << path << "' for container "
+               << containerId << " has failed: " << future.failure();
+  } else if (future.isDiscarded()) {
+    LOG(INFO) << "Checking disk usage at '" << path << "' for container "
+              << containerId << " has been cancelled";
+  }
+
+  // The container might have just been destroyed.
+  if (!infos.contains(containerId)) {
+    return;
+  }
+
+  const Owned<Info>& info = infos[containerId];
+
+  // The path might have just been removed from this container's
+  // resources.
+  if (!info->paths.contains(path)) {
+    return;
+  }
+
+  Info::PathInfo& pathInfo = info->paths[path];
+
+  if (future.isReady()) {
+    const Bytes used = future.get();
+
+    // Remember the most recent measurement for 'usage()'.
+    pathInfo.lastUsage = used;
+
+    if (flags.enforce_container_disk_quota) {
+      Option<Bytes> quota = pathInfo.quota.disk();
+      CHECK_SOME(quota);
+
+      // Report the limitation if the usage exceeds the quota.
+      if (used > quota.get()) {
+        info->limitation.set(
+            protobuf::slave::createContainerLimitation(
+                Resources(pathInfo.quota),
+                "Disk usage (" + stringify(used) +
+                ") exceeds quota (" + stringify(quota.get()) + ")",
+                TaskStatus::REASON_CONTAINER_LIMITATION_DISK));
+      }
+    }
+  }
+
+  // Keep collecting the disk usage for 'path' by initiating another
+  // round. The check will be throttled by DiskUsageCollector.
+  pathInfo.usage = collector.usage(path)
+    .onAny(defer(
+        PID<PosixDiskIsolatorProcess>(this),
+        &PosixDiskIsolatorProcess::_collect,
+        containerId,
+        path,
+        lambda::_1));
+}
+
+
+// Reports the disk limit and the last measured disk usage of the
+// container's working directory. Both fields are left unset when the
+// working directory is not being tracked.
+Future<ResourceStatistics> PosixDiskIsolatorProcess::usage(
+    const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  const Owned<Info>& info = infos[containerId];
+
+  ResourceStatistics statistics;
+
+  if (!info->paths.contains(info->directory)) {
+    return statistics;
+  }
+
+  const Info::PathInfo& pathInfo = info->paths[info->directory];
+
+  Option<Bytes> quota = pathInfo.quota.disk();
+  CHECK_SOME(quota);
+
+  statistics.set_disk_limit_bytes(quota.get().bytes());
+
+  // NOTE: There may be a large delay (# of containers * interval)
+  // until an initial cached value is returned here!
+  if (pathInfo.lastUsage.isSome()) {
+    statistics.set_disk_used_bytes(pathInfo.lastUsage.get().bytes());
+  }
+
+  return statistics;
+}
+
+
+// Forgets everything tracked for 'containerId'. Cleaning up an
+// unknown container is logged and otherwise ignored.
+Future<Nothing> PosixDiskIsolatorProcess::cleanup(
+    const ContainerID& containerId)
+{
+  if (infos.contains(containerId)) {
+    infos.erase(containerId);
+  } else {
+    LOG(WARNING) << "Ignoring cleanup for unknown container " << containerId;
+  }
+
+  return Nothing();
+}
+
+
+// Process that measures disk usage by running 'du'. Requested paths
+// are kept in a FIFO queue; at most one 'du' runs at a time and
+// 'interval' elapses between two consecutive invocations.
+class DiskUsageCollectorProcess : public Process<DiskUsageCollectorProcess>
+{
+public:
+  // 'explicit' so that a Duration is never silently converted into a
+  // collector process.
+  explicit DiskUsageCollectorProcess(const Duration& _interval)
+    : interval(_interval) {}
+
+  virtual ~DiskUsageCollectorProcess() {}
+
+  // Returns a future for the disk usage rooted at 'path'. If a check
+  // for 'path' is already queued, the future of that check is
+  // returned instead of queueing a second one.
+  Future<Bytes> usage(const string& path)
+  {
+    foreach (const Owned<Entry>& entry, entries) {
+      if (entry->path == path) {
+        return entry->promise.future();
+      }
+    }
+
+    entries.push_back(Owned<Entry>(new Entry(path)));
+
+    // Install onDiscard callback.
+    Future<Bytes> future = entries.back()->promise.future();
+    future.onDiscard(defer(self(), &Self::discard, path));
+
+    return future;
+  }
+
+protected:
+  // Starts the scheduling loop.
+  void initialize()
+  {
+    schedule();
+  }
+
+  // Kills any 'du' that is still running and fails every pending
+  // check.
+  void finalize()
+  {
+    foreach (const Owned<Entry>& entry, entries) {
+      if (entry->du.isSome() && entry->du.get().status().isPending()) {
+        os::killtree(entry->du.get().pid(), SIGKILL);
+      }
+
+      entry->promise.fail("DiskUsageCollector is destroyed");
+    }
+  }
+
+private:
+  // Describes a single pending check.
+  struct Entry
+  {
+    explicit Entry(const string& _path) : path(_path) {}
+
+    string path;
+
+    // Set once the 'du' subprocess for this check has been launched.
+    Option<Subprocess> du;
+
+    Promise<Bytes> promise;
+  };
+
+  // This function is invoked right before each 'du' is exec'ed. Note
+  // that this function needs to be async signal safe.
+  static int setupChild()
+  {
+#ifdef __linux__
+    // Kill the child process if the parent exits.
+    // NOTE: This function should never return non-zero because we
+    // are passing in a valid signal.
+    return ::prctl(PR_SET_PDEATHSIG, SIGKILL);
+#else
+    return 0;
+#endif
+  }
+
+  // Cancels the queued check for 'path', if any.
+  void discard(const string& path)
+  {
+    for (auto it = entries.begin(); it != entries.end(); ++it) {
+      // We only cancel those checks whose 'du' hasn't been launched.
+      if ((*it)->path == path && (*it)->du.isNone()) {
+        (*it)->promise.discard();
+        entries.erase(it);
+        break;
+      }
+    }
+  }
+
+  // Schedule a 'du' to be invoked. The current implementation does
+  // not allow multiple 'du's running concurrently. The minimal
+  // interval between two subsequent 'du's is controlled by 'interval'
+  // for throttling purpose.
+  void schedule()
+  {
+    if (entries.empty()) {
+      delay(interval, self(), &Self::schedule);
+      return;
+    }
+
+    const Owned<Entry>& entry = entries.front();
+
+    // Invoke 'du' and report number of 1K-byte blocks. We fix the
+    // block size here so that we can get consistent results on all
+    // platforms (e.g., OS X uses 512 byte blocks).
+    //
+    // NOTE: The 'du' processes are run in the slave's cgroup and it
+    // will be that cgroup that is charged for (a) memory to cache the
+    // fs data structures, (b) disk I/O to read those structures, and
+    // (c) the cpu time to traverse.
+    Try<Subprocess> s = subprocess(
+        "du",
+        vector<string>({"du", "-k", "-s", entry->path}),
+        Subprocess::PATH("/dev/null"),
+        Subprocess::PIPE(),
+        Subprocess::PIPE(),
+        None(),
+        None(),
+        setupChild);
+
+    if (s.isError()) {
+      entry->promise.fail("Failed to exec 'du': " + s.error());
+
+      entries.pop_front();
+      delay(interval, self(), &Self::schedule);
+      return;
+    }
+
+    entry->du = s.get();
+
+    // Wait for the exit status and for both output streams before
+    // interpreting the result.
+    await(s.get().status(),
+          io::read(s.get().out().get()),
+          io::read(s.get().err().get()))
+      .onAny(defer(self(), &Self::_schedule, lambda::_1));
+  }
+
+  // Continuation of 'schedule()': completes the promise of the check
+  // at the front of the queue from the result of its 'du' and then
+  // arranges the next round. The tuple holds the exit status, stdout
+  // and stderr of 'du', in that order.
+  void _schedule(const Future<std::tuple<
+      Future<Option<int>>,
+      Future<string>,
+      Future<string>>>& future)
+  {
+    CHECK_READY(future);
+    CHECK(!entries.empty());
+
+    const Owned<Entry>& entry = entries.front();
+    CHECK_SOME(entry->du);
+
+    Future<Option<int>> status = std::get<0>(future.get());
+
+    if (!status.isReady()) {
+      entry->promise.fail(
+          "Failed to perform 'du': " +
+          (status.isFailed() ? status.failure() : "discarded"));
+    } else if (status.get().isNone()) {
+      entry->promise.fail("Failed to reap the status of 'du'");
+    } else if (status.get().get() != 0) {
+      Future<string> error = std::get<2>(future.get());
+      if (!error.isReady()) {
+        entry->promise.fail(
+            "Failed to perform 'du'. Reading stderr failed: " +
+            (error.isFailed() ? error.failure() : "discarded"));
+      } else {
+        entry->promise.fail("Failed to perform 'du': " + error.get());
+      }
+    } else {
+      Future<string> output = std::get<1>(future.get());
+      if (!output.isReady()) {
+        entry->promise.fail(
+            "Failed to read stdout from 'du': " +
+            (output.isFailed() ? output.failure() : "discarded"));
+      } else {
+        // Parsing the output from 'du'. The following is a sample
+        // output. Tab is used as the delimiter between the number of
+        // blocks and the checked path.
+        // $ du /var/lib/mesos/.../runs/container_id
+        // 1024 /var/lib/mesos/.../runs/container_id
+        vector<string> tokens = strings::tokenize(output.get(), " \t");
+        if (tokens.empty()) {
+          entry->promise.fail("Unexpected output from 'du': " + output.get());
+        } else {
+          Try<size_t> value = numify<size_t>(tokens[0]);
+          if (value.isError()) {
+            entry->promise.fail("Unexpected output from 'du': " + output.get());
+          } else {
+            // Notify the callers.
+            entry->promise.set(Kilobytes(value.get()));
+          }
+        }
+      }
+    }
+
+    entries.pop_front();
+    delay(interval, self(), &Self::schedule);
+  }
+
+  const Duration interval;
+
+  // A queue of pending checks.
+  deque<Owned<Entry>> entries;
+};
+
+
+// Creates the backing process and starts it.
+DiskUsageCollector::DiskUsageCollector(const Duration& interval)
+  : process(new DiskUsageCollectorProcess(interval))
+{
+  spawn(process);
+}
+
+
+// Stops the backing process and waits for it to finish before
+// releasing it.
+DiskUsageCollector::~DiskUsageCollector()
+{
+  terminate(process);
+  wait(process);
+
+  delete process;
+}
+
+
+// Forwards the request to the backing process.
+Future<Bytes> DiskUsageCollector::usage(const string& path)
+{
+  Future<Bytes> result =
+    dispatch(process, &DiskUsageCollectorProcess::usage, path);
+
+  return result;
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/posix/disk.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/posix/disk.hpp b/src/slave/containerizer/mesos/isolators/posix/disk.hpp
new file mode 100644
index 0000000..85df5d2
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/posix/disk.hpp
@@ -0,0 +1,151 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __POSIX_DISK_ISOLATOR_HPP__
+#define __POSIX_DISK_ISOLATOR_HPP__
+
+#include <string>
+
+#include <process/owned.hpp>
+
+#include <stout/bytes.hpp>
+#include <stout/duration.hpp>
+#include <stout/hashmap.hpp>
+
+#include "slave/flags.hpp"
+#include "slave/state.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Forward declarations.
+class DiskUsageCollectorProcess;
+
+
+// Responsible for collecting disk usage for paths, while ensuring
+// that an interval elapses between each collection.
+//
+// The collector holds a raw pointer to its backing process and
+// deletes it on destruction, so it is neither copyable nor
+// assignable: a copy would delete the same process twice.
+class DiskUsageCollector
+{
+public:
+  // 'interval' is the minimal time between two collections.
+  explicit DiskUsageCollector(const Duration& interval);
+  ~DiskUsageCollector();
+
+  // Returns the disk usage rooted at 'path'. The user can discard the
+  // returned future to cancel the check.
+  process::Future<Bytes> usage(const std::string& path);
+
+private:
+  // Not copyable, not assignable.
+  DiskUsageCollector(const DiskUsageCollector&) = delete;
+  DiskUsageCollector& operator=(const DiskUsageCollector&) = delete;
+
+  DiskUsageCollectorProcess* process;
+};
+
+
+// This isolator monitors the disk usage for containers, and reports
+// ContainerLimitation when a container exceeds its disk quota. This
+// leverages the DiskUsageCollector to ensure that we don't induce too
+// much CPU usage and disk caching effects from running 'du' too
+// often.
+//
+// NOTE: Currently all containers are processed in the same queue,
+// which means that when a container starts, it could take many disk
+// collection intervals until any data is available in the resource
+// usage statistics!
+//
+// TODO(jieyu): Consider handling each container independently, or
+// triggering an initial collection when the container starts, to
+// ensure that we have usage statistics without a large delay.
+class PosixDiskIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+  // Factory for the isolator; instances cannot be constructed
+  // directly.
+  static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+  virtual ~PosixDiskIsolatorProcess();
+
+  virtual process::Future<Nothing> recover(
+      const std::list<mesos::slave::ContainerState>& states,
+      const hashset<ContainerID>& orphans);
+
+  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+      const ContainerID& containerId,
+      const ExecutorInfo& executorInfo,
+      const std::string& directory,
+      const Option<std::string>& user);
+
+  virtual process::Future<Nothing> isolate(
+      const ContainerID& containerId,
+      pid_t pid);
+
+  virtual process::Future<mesos::slave::ContainerLimitation> watch(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> update(
+      const ContainerID& containerId,
+      const Resources& resources);
+
+  virtual process::Future<ResourceStatistics> usage(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> cleanup(
+      const ContainerID& containerId);
+
+private:
+  // 'explicit' so that Flags are never implicitly converted into an
+  // isolator process.
+  explicit PosixDiskIsolatorProcess(const Flags& flags);
+
+  // Callback for a finished disk usage check of 'path' on behalf of
+  // 'containerId'; 'future' carries the result of the check.
+  void _collect(
+      const ContainerID& containerId,
+      const std::string& path,
+      const process::Future<Bytes>& future);
+
+  const Flags flags;
+  DiskUsageCollector collector;
+
+  // Per-container bookkeeping.
+  struct Info
+  {
+    explicit Info(const std::string& _directory) : directory(_directory) {}
+
+    // We save executor working directory here so that we know where
+    // to collect disk usage for disk resources without DiskInfo.
+    const std::string directory;
+
+    // Set when the container is found to exceed a disk quota.
+    process::Promise<mesos::slave::ContainerLimitation> limitation;
+
+    // The keys of the hashmap contain the executor working directory
+    // above, and optionally paths of volumes used by the container.
+    // For each path, we maintain its quota and its last usage.
+    struct PathInfo
+    {
+      ~PathInfo();
+
+      Resources quota;
+
+      // The pending disk usage check for this path.
+      process::Future<Bytes> usage;
+
+      // None until the first check for this path has completed.
+      Option<Bytes> lastUsage;
+    };
+
+    hashmap<std::string, PathInfo> paths;
+  };
+
+  hashmap<ContainerID, process::Owned<Info>> infos;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __POSIX_DISK_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/appc/paths.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/appc/paths.cpp b/src/slave/containerizer/mesos/provisioner/appc/paths.cpp
new file mode 100644
index 0000000..c898cd3
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/appc/paths.cpp
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <list>
+
+#include <glog/logging.h>
+
+#include <stout/path.hpp>
+
+#include "slave/containerizer/mesos/provisioner/appc/paths.hpp"
+
+using std::list;
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace appc {
+namespace paths {
+
+// Directory under 'storeDir' that holds staged downloads.
+string getStagingDir(const string& storeDir)
+{
+  return path::join(storeDir, "staging");
+}
+
+
+// Directory under 'storeDir' that holds the validated images.
+string getImagesDir(const string& storeDir)
+{
+  return path::join(storeDir, "images");
+}
+
+
+// Directory of the image 'imageId' inside the store.
+string getImagePath(const string& storeDir, const string& imageId)
+{
+  const string imagesDir = getImagesDir(storeDir);
+  return path::join(imagesDir, imageId);
+}
+
+
+// Root filesystem of the image located at 'imagePath'.
+string getImageRootfsPath(const string& imagePath)
+{
+  return path::join(imagePath, "rootfs");
+}
+
+
+// Root filesystem of the image 'imageId' inside the store.
+string getImageRootfsPath(
+    const string& storeDir,
+    const string& imageId)
+{
+  return getImageRootfsPath(getImagePath(storeDir, imageId));
+}
+
+
+// Manifest of the image located at 'imagePath'.
+string getImageManifestPath(const string& imagePath)
+{
+  return path::join(imagePath, "manifest");
+}
+
+
+// Manifest of the image 'imageId' inside the store.
+string getImageManifestPath(
+    const string& storeDir,
+    const string& imageId)
+{
+  return getImageManifestPath(getImagePath(storeDir, imageId));
+}
+
+} // namespace paths {
+} // namespace appc {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/appc/paths.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/appc/paths.hpp b/src/slave/containerizer/mesos/provisioner/appc/paths.hpp
new file mode 100644
index 0000000..7c36d67
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/appc/paths.hpp
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_APPC_PATHS_HPP__
+#define __PROVISIONER_APPC_PATHS_HPP__
+
+#include <string>
+
+#include <mesos/mesos.hpp>
+
+#include <stout/hashmap.hpp>
+#include <stout/try.hpp>
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace appc {
+namespace paths {
+
+// The appc store file system layout is as follows:
+//
+// <store_dir> ('--appc_store_dir' flag)
+// |--staging (contains temp directories for staging downloads)
+// |
+// |--images (stores validated images)
+// |--<image_id> (in the form of "sha512-<128_character_hash_sum>")
+// |--manifest
+// |--rootfs
+// |--... (according to the ACI spec)
+//
+// TODO(xujyan): The staging directory is unused for now (it's
+// externally managed) but implemented to illustrate the need for a
+// separate 'images' directory. Complete the layout diagram when the
+// staging directory is utilized by the provisioner.
+
+std::string getStagingDir(const std::string& storeDir);
+
+
+std::string getImagesDir(const std::string& storeDir);
+
+
+std::string getImagePath(
+ const std::string& storeDir,
+ const std::string& imageId);
+
+
+std::string getImageRootfsPath(
+ const std::string& storeDir,
+ const std::string& imageId);
+
+
+std::string getImageRootfsPath(const std::string& imagePath);
+
+
+std::string getImageManifestPath(
+ const std::string& storeDir,
+ const std::string& imageId);
+
+
+std::string getImageManifestPath(const std::string& imagePath);
+
+} // namespace paths {
+} // namespace appc {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_APPC_PATHS_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/appc/spec.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/appc/spec.cpp b/src/slave/containerizer/mesos/provisioner/appc/spec.cpp
new file mode 100644
index 0000000..5361c24
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/appc/spec.cpp
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stout/os/stat.hpp>
+#include <stout/protobuf.hpp>
+#include <stout/strings.hpp>
+
+#include "slave/containerizer/mesos/provisioner/appc/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/appc/spec.hpp"
+
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace appc {
+namespace spec {
+
+// Checks that 'manifest' describes an image manifest. Returns None
+// on success and an Error describing the problem otherwise.
+Option<Error> validateManifest(const AppcImageManifest& manifest)
+{
+  // TODO(idownes): Validate that required fields are present when
+  // this cannot be expressed in the protobuf specification, e.g.,
+  // repeated fields with >= 1.
+  // TODO(xujyan): More thorough type validation:
+  // https://github.com/appc/spec/blob/master/spec/types.md
+  const string& kind = manifest.ackind();
+
+  if (kind == "ImageManifest") {
+    return None();
+  }
+
+  return Error("Incorrect acKind field: " + kind);
+}
+
+
+// Checks that 'imageId' has the form "sha512-<hash>" where <hash>
+// consists of exactly 128 hexadecimal digits. Returns None on
+// success and an Error describing the problem otherwise.
+Option<Error> validateImageID(const string& imageId)
+{
+  if (!strings::startsWith(imageId, "sha512-")) {
+    return Error("Image ID needs to start with sha512-");
+  }
+
+  string hash = strings::remove(imageId, "sha512-", strings::PREFIX);
+  if (hash.length() != 128) {
+    return Error("Invalid hash length for: " + hash);
+  }
+
+  // The image ID is used as a path component inside the store, so
+  // checking the length alone is not enough: characters such as '/'
+  // or '.' must not slip through. A hex encoded digest only contains
+  // hexadecimal digits.
+  if (hash.find_first_not_of("0123456789abcdefABCDEF") != string::npos) {
+    return Error("Invalid non-hexadecimal character in hash: " + hash);
+  }
+
+  return None();
+}
+
+
+// Checks that the image at 'imagePath' contains a 'rootfs' directory
+// and a 'manifest' file. Returns None on success and an Error
+// describing the first missing piece otherwise.
+Option<Error> validateLayout(const string& imagePath)
+{
+  const string rootfs = paths::getImageRootfsPath(imagePath);
+  if (!os::stat::isdir(rootfs)) {
+    return Error("No rootfs directory found in image layout");
+  }
+
+  const string manifest = paths::getImageManifestPath(imagePath);
+  if (!os::stat::isfile(manifest)) {
+    return Error("No manifest found in image layout");
+  }
+
+  return None();
+}
+
+
+// Parses 'value' as JSON, converts it into an AppcImageManifest and
+// validates the result. The returned Error names the stage that
+// failed.
+Try<AppcImageManifest> parse(const string& value)
+{
+  Try<JSON::Object> object = JSON::parse<JSON::Object>(value);
+  if (object.isError()) {
+    return Error("JSON parse failed: " + object.error());
+  }
+
+  Try<AppcImageManifest> parsed =
+    protobuf::parse<AppcImageManifest>(object.get());
+
+  if (parsed.isError()) {
+    return Error("Protobuf parse failed: " + parsed.error());
+  }
+
+  Option<Error> invalid = validateManifest(parsed.get());
+  if (invalid.isSome()) {
+    return Error("Schema validation failed: " + invalid.get().message);
+  }
+
+  return parsed.get();
+}
+
+} // namespace spec {
+} // namespace appc {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/appc/spec.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/appc/spec.hpp b/src/slave/containerizer/mesos/provisioner/appc/spec.hpp
new file mode 100644
index 0000000..2bc8c6f
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/appc/spec.hpp
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_APPC_SPEC_HPP__
+#define __PROVISIONER_APPC_SPEC_HPP__
+
+#include <string>
+
+#include <stout/error.hpp>
+#include <stout/option.hpp>
+
+#include <mesos/mesos.hpp>
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace appc {
+namespace spec {
+
+// Validate if the specified image manifest conforms to the Appc spec.
+Option<Error> validateManifest(const AppcImageManifest& manifest);
+
+// Validate if the specified image ID conforms to the Appc spec.
+Option<Error> validateImageID(const std::string& imageId);
+
+// Validate if the specified image has the disk layout that conforms
+// to the Appc spec.
+Option<Error> validateLayout(const std::string& imagePath);
+
+// Parse the AppcImageManifest in the specified JSON string.
+Try<AppcImageManifest> parse(const std::string& value);
+
+} // namespace spec {
+} // namespace appc {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_APPC_SPEC_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/appc/store.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/appc/store.cpp b/src/slave/containerizer/mesos/provisioner/appc/store.cpp
new file mode 100644
index 0000000..a2f36f1
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/appc/store.cpp
@@ -0,0 +1,288 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <list>
+
+#include <glog/logging.h>
+
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+
+#include <stout/check.hpp>
+#include <stout/hashmap.hpp>
+#include <stout/os.hpp>
+#include <stout/path.hpp>
+
+#include "slave/containerizer/mesos/provisioner/appc/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/appc/spec.hpp"
+#include "slave/containerizer/mesos/provisioner/appc/store.hpp"
+
+using namespace process;
+
+using std::list;
+using std::string;
+using std::vector;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace appc {
+
+// A locally cached Appc image that has already passed validation.
+struct CachedImage
+{
+ static Try<CachedImage> create(const string& imagePath); // Validating factory.
+
+ CachedImage(
+ const AppcImageManifest& _manifest,
+ const string& _id,
+ const string& _path)
+ : manifest(_manifest), id(_id), path(_path) {}
+
+ string rootfs() const
+ {
+ return path::join(path, "rootfs"); // The 'rootfs' entry under 'path'.
+ }
+
+ const AppcImageManifest manifest; // Manifest parsed from the image directory.
+
+ // Image ID of the format "sha512-value" where "value" is the hex
+ // encoded string of the sha512 digest of the uncompressed tar file
+ // of the image.
+ const string id;
+
+ // Absolute path to the extracted image.
+ const string path;
+};
+
+
+Try<CachedImage> CachedImage::create(const string& imagePath)
+{
+ Option<Error> error = spec::validateLayout(imagePath); // Check layout first.
+ if (error.isSome()) {
+ return Error("Invalid image layout: " + error.get().message);
+ }
+
+ string imageId = Path(imagePath).basename(); // Last path component is the ID.
+
+ error = spec::validateImageID(imageId);
+ if (error.isSome()) {
+ return Error("Invalid image ID: " + error.get().message);
+ }
+
+ Try<string> read = os::read(paths::getImageManifestPath(imagePath)); // Raw text.
+ if (read.isError()) {
+ return Error("Failed to read manifest: " + read.error());
+ }
+
+ Try<AppcImageManifest> manifest = spec::parse(read.get()); // Decode the text.
+ if (manifest.isError()) {
+ return Error("Failed to parse manifest: " + manifest.error());
+ }
+
+ return CachedImage(manifest.get(), imageId, imagePath);
+}
+
+
+// Returns true if 'candidate' satisfies 'requirements' as described by:
+// https://github.com/appc/spec/blob/master/spec/aci.md#dependency-matching
+static bool matches(
+    const Image::Appc& requirements,
+    const CachedImage& candidate)
+{
+  // The name must match.
+  if (candidate.manifest.name() != requirements.name()) {
+    return false;
+  }
+
+  // If an id is specified the candidate must match.
+  if (requirements.has_id() && (candidate.id != requirements.id())) {
+    return false;
+  }
+
+  // Extract labels for easier comparison, this also weeds out duplicates.
+  // TODO(xujyan): Detect duplicate labels in image manifest validation
+  // and Image::Appc validation.
+  hashmap<string, string> requiredLabels;
+  foreach (const Label& label, requirements.labels().labels()) {
+    requiredLabels[label.key()] = label.value();
+  }
+
+  hashmap<string, string> candidateLabels;
+  foreach (const AppcImageManifest::Label& label,
+           candidate.manifest.labels()) {
+    candidateLabels[label.name()] = label.value();
+  }
+
+  // Any label specified must be present and match in the candidate.
+  foreachpair (const string& name, const string& value, requiredLabels) {
+    if (!candidateLabels.contains(name) ||
+        candidateLabels.get(name).get() != value) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+
+class StoreProcess : public Process<StoreProcess>
+{
+public:
+  explicit StoreProcess(const string& rootDir); // No implicit conversions.
+
+  ~StoreProcess() {}
+
+  Future<Nothing> recover(); // Loads valid images found under 'rootDir'.
+
+  Future<vector<string>> get(const Image& image); // Looks up a cached image.
+
+private:
+  // Absolute path to the root directory of the store as defined by
+  // --appc_store_dir.
+  const string rootDir;
+
+  // Mappings: name -> id -> image.
+  hashmap<string, hashmap<string, CachedImage>> images;
+};
+
+
+Try<Owned<slave::Store>> Store::create(const Flags& flags)
+{
+ Try<Nothing> mkdir = os::mkdir(paths::getImagesDir(flags.appc_store_dir)); // Ensure it exists.
+ if (mkdir.isError()) {
+ return Error("Failed to create the images directory: " + mkdir.error());
+ }
+
+ // Canonicalize the store root so that every image path derived
+ // from it is canonical as well.
+ Result<string> rootDir = os::realpath(flags.appc_store_dir);
+ if (!rootDir.isSome()) {
+ // The above mkdir call recursively creates the store directory
+ // if necessary so it cannot be None here.
+ CHECK_ERROR(rootDir);
+
+ return Error(
+ "Failed to get the realpath of the store root directory: " +
+ rootDir.error());
+ }
+
+ return Owned<slave::Store>(new Store(
+ Owned<StoreProcess>(new StoreProcess(rootDir.get())))); // Store wraps the process.
+}
+
+
+Store::Store(Owned<StoreProcess> _process)
+ : process(_process)
+{
+ spawn(CHECK_NOTNULL(process.get())); // Null-check the process, then spawn it.
+}
+
+
+Store::~Store()
+{
+ terminate(process.get()); // Ask the StoreProcess to stop...
+ wait(process.get()); // ...and wait on it before members are destroyed.
+}
+
+
+Future<Nothing> Store::recover()
+{
+ return dispatch(process.get(), &StoreProcess::recover); // Forward to the process.
+}
+
+
+Future<vector<string>> Store::get(const Image& image)
+{
+ return dispatch(process.get(), &StoreProcess::get, image); // Forward to the process.
+}
+
+
+StoreProcess::StoreProcess(const string& _rootDir) : rootDir(_rootDir) {} // Only records the root.
+
+
+Future<Nothing> StoreProcess::recover()
+{
+ // Rebuild the in-memory index from the entries of the images directory.
+ Try<list<string>> imageIds = os::ls(paths::getImagesDir(rootDir));
+ if (imageIds.isError()) {
+ return Failure(
+ "Failed to list images under '" +
+ paths::getImagesDir(rootDir) + "': " +
+ imageIds.error());
+ }
+
+ foreach (const string& imageId, imageIds.get()) {
+ string path = paths::getImagePath(rootDir, imageId);
+ if (!os::stat::isdir(path)) {
+ LOG(WARNING) << "Unexpected entry in storage: " << imageId;
+ continue; // Not a directory: skip it.
+ }
+
+ Try<CachedImage> image = CachedImage::create(path);
+ if (image.isError()) {
+ LOG(WARNING) << "Unexpected entry in storage: " << image.error();
+ continue; // Failed validation or parsing: skip it.
+ }
+
+ LOG(INFO) << "Restored image '" << image.get().manifest.name() << "'";
+
+ images[image.get().manifest.name()].put(image.get().id, image.get()); // name -> id.
+ }
+
+ return Nothing();
+}
+
+
+Future<vector<string>> StoreProcess::get(const Image& image)
+{
+  if (image.type() != Image::APPC) {
+    return Failure("Not an Appc image: " + stringify(image.type()));
+  }
+
+  const Image::Appc& appc = image.appc();
+
+  if (!images.contains(appc.name())) {
+    return Failure("No Appc image named '" + appc.name() + "' can be found");
+  }
+
+  // Scan the cached images registered under this name. No intermediate
+  // candidate list is kept because the scan stops at the first match.
+  foreach (const CachedImage& candidate, images[appc.name()].values()) {
+    // The first match is returned.
+    // TODO(xujyan): Some tie-breaking rules are necessary.
+    if (matches(appc, candidate)) {
+      LOG(INFO) << "Found match for Appc image '" << appc.name()
+                << "' in the store";
+
+      // The Appc store currently doesn't support dependencies and this
+      // is enforced by manifest validation: if the image's manifest
+      // contains dependencies it would fail the validation and
+      // wouldn't be stored in the store.
+      return vector<string>({candidate.rootfs()});
+    }
+  }
+
+  return Failure("No Appc image named '" + appc.name() +
+                 "' can match the requirements");
+}
+
+} // namespace appc {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/appc/store.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/appc/store.hpp b/src/slave/containerizer/mesos/provisioner/appc/store.hpp
new file mode 100644
index 0000000..18dbf63
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/appc/store.hpp
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_APPC_STORE_HPP__
+#define __PROVISIONER_APPC_STORE_HPP__
+
+#include "slave/containerizer/mesos/provisioner/store.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace appc {
+
+// Forward declaration.
+class StoreProcess;
+
+
+class Store : public slave::Store
+{
+public:
+  static Try<process::Owned<slave::Store>> create(const Flags& flags);
+
+  ~Store();
+
+  virtual process::Future<Nothing> recover();
+
+  // TODO(xujyan): Fetching remotely is not implemented for now and
+  // until then the future fails directly if the image is not in the
+  // local cache.
+  // TODO(xujyan): The store currently doesn't support images that
+  // have dependencies and we should add it later.
+  virtual process::Future<std::vector<std::string>> get(const Image& image);
+
+private:
+  explicit Store(process::Owned<StoreProcess> process); // Use create() instead.
+
+  Store(const Store&) = delete; // Not copyable.
+  Store& operator=(const Store&) = delete; // Not assignable.
+
+  process::Owned<StoreProcess> process; // Backing process for recover()/get().
+};
+
+} // namespace appc {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_APPC_STORE_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/backend.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/backend.cpp b/src/slave/containerizer/mesos/provisioner/backend.cpp
new file mode 100644
index 0000000..35ced4b
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/backend.cpp
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <glog/logging.h>
+
+#include <stout/os.hpp>
+
+#include "slave/containerizer/mesos/provisioner/backend.hpp"
+
+#include "slave/containerizer/mesos/provisioner/backends/bind.hpp"
+#include "slave/containerizer/mesos/provisioner/backends/copy.hpp"
+
+using namespace process;
+
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+hashmap<string, Owned<Backend>> Backend::create(const Flags& flags)
+{
+ hashmap<string, Try<Owned<Backend>>(*)(const Flags&)> creators; // name -> factory.
+
+#ifdef __linux__
+ creators.put("bind", &BindBackend::create); // Only registered on Linux.
+#endif // __linux__
+ creators.put("copy", &CopyBackend::create);
+
+ hashmap<string, Owned<Backend>> backends;
+
+ foreachkey (const string& name, creators) {
+ Try<Owned<Backend>> backend = creators[name](flags);
+ if (backend.isError()) {
+ LOG(WARNING) << "Failed to create '" << name << "' backend: "
+ << backend.error();
+ continue; // A failed backend is omitted from the result.
+ }
+ backends.put(name, backend.get());
+ }
+
+ return backends;
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/backend.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/backend.hpp b/src/slave/containerizer/mesos/provisioner/backend.hpp
new file mode 100644
index 0000000..1c80b79
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/backend.hpp
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_BACKEND_HPP__
+#define __PROVISIONER_BACKEND_HPP__
+
+#include <string>
+#include <vector>
+
+#include <process/future.hpp>
+#include <process/owned.hpp>
+
+#include <stout/hashmap.hpp>
+#include <stout/try.hpp>
+
+#include "slave/flags.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Abstract interface for provisioning a container's root filesystem.
+class Backend
+{
+public:
+ virtual ~Backend() {}
+
+ // Return a map of all supported backends keyed by their names. Note
+ // that backends that failed to be created due to incorrect flags are
+ // simply not added to the result.
+ static hashmap<std::string, process::Owned<Backend>> create(
+ const Flags& flags);
+
+ // Provision a root filesystem for a container into the specified 'rootfs'
+ // directory by applying the specified list of root filesystem layers in
+ // the list order, i.e., files in a layer can overwrite/shadow those from
+ // another layer earlier in the list.
+ virtual process::Future<Nothing> provision(
+ const std::vector<std::string>& layers,
+ const std::string& rootfs) = 0;
+
+ // Destroy the root filesystem provisioned at the specified 'rootfs'
+ // directory. Return false if there is no provisioned root filesystem
+ // to destroy for the given directory.
+ virtual process::Future<bool> destroy(const std::string& rootfs) = 0;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_BACKEND_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/backends/bind.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/backends/bind.cpp b/src/slave/containerizer/mesos/provisioner/backends/bind.cpp
new file mode 100644
index 0000000..22ca3c1
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/backends/bind.cpp
@@ -0,0 +1,250 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <process/dispatch.hpp>
+#include <process/process.hpp>
+
+#include <process/metrics/counter.hpp>
+#include <process/metrics/metrics.hpp>
+
+#include <stout/foreach.hpp>
+#include <stout/os.hpp>
+
+#include "linux/fs.hpp"
+
+#include "slave/containerizer/mesos/provisioner/backends/bind.hpp"
+
+using namespace process;
+
+using std::string;
+using std::vector;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+class BindBackendProcess : public Process<BindBackendProcess>
+{
+public:
+ Future<Nothing> provision(const vector<string>& layers, const string& rootfs);
+
+ Future<bool> destroy(const string& rootfs);
+
+ struct Metrics
+ {
+ Metrics(); // Adds the counter below to process::metrics.
+ ~Metrics(); // Removes it again.
+
+ process::metrics::Counter remove_rootfs_errors; // Bumped when rmdir hits EBUSY.
+ } metrics;
+};
+
+
+Try<Owned<Backend>> BindBackend::create(const Flags&)
+{
+ Result<string> user = os::user(); // Name of the user we are running as.
+ if (!user.isSome()) {
+ return Error("Failed to determine user: " +
+ (user.isError() ? user.error() : "username not found"));
+ }
+
+ if (user.get() != "root") { // This backend is refused to non-root users.
+ return Error("BindBackend requires root privileges");
+ }
+
+ return Owned<Backend>(new BindBackend(
+ Owned<BindBackendProcess>(new BindBackendProcess())));
+}
+
+
+BindBackend::~BindBackend()
+{
+ terminate(process.get()); // Ask the backend process to stop...
+ wait(process.get()); // ...and wait on it before members are destroyed.
+}
+
+
+BindBackend::BindBackend(Owned<BindBackendProcess> _process)
+ : process(_process)
+{
+ spawn(CHECK_NOTNULL(process.get())); // Null-check the process, then spawn it.
+}
+
+
+Future<Nothing> BindBackend::provision(
+ const vector<string>& layers,
+ const string& rootfs)
+{
+ return dispatch( // Forward to BindBackendProcess::provision.
+ process.get(), &BindBackendProcess::provision, layers, rootfs);
+}
+
+
+Future<bool> BindBackend::destroy(const string& rootfs)
+{
+ return dispatch(process.get(), &BindBackendProcess::destroy, rootfs); // Forward to the process.
+}
+
+
+Future<Nothing> BindBackendProcess::provision(
+    const vector<string>& layers,
+    const string& rootfs)
+{
+  if (layers.size() > 1) {
+    return Failure("Multiple layers are not supported by the bind backend");
+  }
+
+  if (layers.empty()) {
+    return Failure("No filesystem layer provided");
+  }
+
+  Try<Nothing> mkdir = os::mkdir(rootfs);
+  if (mkdir.isError()) {
+    return Failure("Failed to create container rootfs at " + rootfs + ": " +
+                   mkdir.error());
+  }
+
+  // TODO(xujyan): Use MS_REC? Does any provisioner use mounts within
+  // its image store in a single layer?
+  Try<Nothing> mount = fs::mount(
+      layers.front(),
+      rootfs,
+      None(),
+      MS_BIND,
+      NULL);
+
+  if (mount.isError()) {
+    return Failure(
+        "Failed to bind mount rootfs '" + layers.front() +
+        "' to '" + rootfs + "': " + mount.error());
+  }
+
+  // And remount it read-only.
+  mount = fs::mount(
+      None(), // Ignored.
+      rootfs,
+      None(),
+      MS_BIND | MS_RDONLY | MS_REMOUNT,
+      NULL);
+
+  if (mount.isError()) {
+    return Failure(
+        "Failed to remount rootfs '" + rootfs + "' read-only: " +
+        mount.error());
+  }
+
+  // Mark the mount as shared+slave: first MS_SLAVE, then MS_SHARED.
+  mount = fs::mount(
+      None(),
+      rootfs,
+      None(),
+      MS_SLAVE,
+      NULL);
+
+  if (mount.isError()) {
+    return Failure(
+        "Failed to mark mount '" + rootfs +
+        "' as a slave mount: " + mount.error());
+  }
+
+  mount = fs::mount(
+      None(),
+      rootfs,
+      None(),
+      MS_SHARED,
+      NULL);
+
+  if (mount.isError()) {
+    return Failure(
+        "Failed to mark mount '" + rootfs +
+        "' as a shared mount: " + mount.error());
+  }
+
+  return Nothing();
+}
+
+
+Future<bool> BindBackendProcess::destroy(const string& rootfs)
+{
+  Try<fs::MountInfoTable> mountTable = fs::MountInfoTable::read();
+
+  if (mountTable.isError()) {
+    return Failure("Failed to read mount table: " + mountTable.error());
+  }
+
+  foreach (const fs::MountInfoTable::Entry& entry, mountTable.get().entries) {
+    // TODO(xujyan): If MS_REC was used in 'provision()' we would need
+    // to check `strings::startsWith(entry.target, rootfs)` here to
+    // unmount all nested mounts.
+    if (entry.target == rootfs) {
+      // NOTE: This would fail if the rootfs is still in use.
+      Try<Nothing> unmount = fs::unmount(entry.target);
+      if (unmount.isError()) {
+        return Failure(
+            "Failed to destroy bind-mounted rootfs '" + rootfs + "': " +
+            unmount.error());
+      }
+
+      // TODO(jieyu): If 'rmdir' here returns EBUSY, we still return
+      // success. This is currently possible because the parent
+      // mount of 'rootfs' might not be a shared mount. Thus,
+      // containers in different mount namespaces might hold extra
+      // references to this mount. It is OK to ignore the EBUSY error
+      // because the provisioner will later try to delete all the
+      // rootfses for the terminated containers.
+      if (::rmdir(rootfs.c_str()) != 0) {
+        const int error = errno; // Snapshot before later calls clobber it.
+        const string message = "Failed to remove rootfs mount point '" +
+          rootfs + "': " + strerror(error);
+
+        if (error == EBUSY) {
+          LOG(ERROR) << message;
+          ++metrics.remove_rootfs_errors;
+        } else {
+          return Failure(message);
+        }
+      }
+
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
+BindBackendProcess::Metrics::Metrics()
+ : remove_rootfs_errors(
+ "containerizer/mesos/provisioner/bind/remove_rootfs_errors")
+{
+ process::metrics::add(remove_rootfs_errors); // Paired with remove() in ~Metrics.
+}
+
+
+BindBackendProcess::Metrics::~Metrics()
+{
+ process::metrics::remove(remove_rootfs_errors); // Undo the add() from the constructor.
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/backends/bind.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/backends/bind.hpp b/src/slave/containerizer/mesos/provisioner/backends/bind.hpp
new file mode 100644
index 0000000..7747810
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/backends/bind.hpp
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_BACKENDS_BIND_HPP__
+#define __PROVISIONER_BACKENDS_BIND_HPP__
+
+#include "slave/containerizer/mesos/provisioner/backend.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Forward declaration.
+class BindBackendProcess;
+
+
+// This is a specialized backend that may be useful for deployments
+// using large (multi-GB) single-layer images *and* where more recent
+// kernel features such as overlayfs are not available (overlayfs-based
+// backend tracked by MESOS-2971). For small images (10's to 100's of MB)
+// the copy backend may be sufficient. NOTE:
+// 1) BindBackend supports only a single layer. Multi-layer images will
+//    fail to provision and the container will fail to launch!
+// 2) The filesystem is read-only because all containers using this
+//    image share the source. Select writable areas can be achieved by
+//    mounting read-write volumes to places like /tmp, /var/tmp,
+//    /home, etc. using the ContainerInfo. These can be relative to
+//    the executor work directory.
+//    N.B. Since the filesystem is read-only, '--sandbox_directory' must
+//    already exist within the filesystem because the filesystem isolator
+//    is unable to create it!
+// 3) It's fast because the bind mount requires (nearly) zero IO.
+class BindBackend : public Backend
+{
+public:
+  virtual ~BindBackend();
+
+  // BindBackend doesn't use any flag.
+  static Try<process::Owned<Backend>> create(const Flags&);
+
+  virtual process::Future<Nothing> provision(
+      const std::vector<std::string>& layers,
+      const std::string& rootfs);
+
+  virtual process::Future<bool> destroy(const std::string& rootfs);
+
+private:
+  explicit BindBackend(process::Owned<BindBackendProcess> process);
+
+  BindBackend(const BindBackend&) = delete; // Not copyable.
+  BindBackend& operator=(const BindBackend&) = delete; // Not assignable.
+
+  process::Owned<BindBackendProcess> process;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_BACKENDS_BIND_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/backends/copy.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/backends/copy.cpp b/src/slave/containerizer/mesos/provisioner/backends/copy.cpp
new file mode 100644
index 0000000..59c5a27
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/backends/copy.cpp
@@ -0,0 +1,203 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <list>
+
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+#include <process/io.hpp>
+#include <process/process.hpp>
+#include <process/subprocess.hpp>
+
+
+#include <stout/foreach.hpp>
+#include <stout/os.hpp>
+
+#include "common/status_utils.hpp"
+
+#include "slave/containerizer/mesos/provisioner/backends/copy.hpp"
+
+
+using namespace process;
+
+using std::string;
+using std::list;
+using std::vector;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+class CopyBackendProcess : public Process<CopyBackendProcess>
+{
+public:
+ Future<Nothing> provision(const vector<string>& layers, const string& rootfs);
+
+ Future<bool> destroy(const string& rootfs);
+
+private:
+ Future<Nothing> _provision(string layer, const string& rootfs); // Copies one layer.
+};
+
+
+Try<Owned<Backend>> CopyBackend::create(const Flags&)
+{
+ return Owned<Backend>(new CopyBackend( // Flags are ignored; no checks are made.
+ Owned<CopyBackendProcess>(new CopyBackendProcess())));
+}
+
+
+CopyBackend::~CopyBackend()
+{
+ terminate(process.get()); // Ask the backend process to stop...
+ wait(process.get()); // ...and wait on it before members are destroyed.
+}
+
+
+CopyBackend::CopyBackend(Owned<CopyBackendProcess> _process)
+ : process(_process)
+{
+ spawn(CHECK_NOTNULL(process.get())); // Null-check the process, then spawn it.
+}
+
+
+Future<Nothing> CopyBackend::provision(
+ const vector<string>& layers,
+ const string& rootfs)
+{
+ return dispatch( // Forward to CopyBackendProcess::provision.
+ process.get(), &CopyBackendProcess::provision, layers, rootfs);
+}
+
+
+Future<bool> CopyBackend::destroy(const string& rootfs)
+{
+ return dispatch(process.get(), &CopyBackendProcess::destroy, rootfs); // Forward to the process.
+}
+
+
+Future<Nothing> CopyBackendProcess::provision(
+    const vector<string>& layers,
+    const string& rootfs)
+{
+  if (layers.empty()) {
+    return Failure("No filesystem layers provided");
+  }
+
+  if (os::exists(rootfs)) { // Refuse to copy over an existing path.
+    return Failure("Rootfs is already provisioned");
+  }
+
+  Try<Nothing> mkdir = os::mkdir(rootfs);
+  if (mkdir.isError()) {
+    return Failure("Failed to create rootfs directory: " + mkdir.error());
+  }
+
+  list<Future<Nothing>> futures{Nothing()}; // Seed for the first chained copy.
+
+  foreach (const string& layer, layers) { // Each copy chains off the previous.
+    futures.push_back(
+        futures.back().then(
+            defer(self(), &Self::_provision, layer, rootfs)));
+  }
+
+  return collect(futures)
+    .then([]() -> Future<Nothing> { return Nothing(); });
+}
+
+
+Future<Nothing> CopyBackendProcess::_provision(
+ string layer,
+ const string& rootfs)
+{
+ VLOG(1) << "Copying layer path '" << layer << "' to rootfs '" << rootfs
+ << "'";
+
+#ifdef __APPLE__
+ if (!strings::endsWith(layer, "/")) {
+ layer += "/";
+ }
+
+ // OSX cp doesn't support the -T flag, but supports a trailing slash on
+ // the source so we only copy the content but not the folder.
+ vector<string> args{"cp", "-a", layer, rootfs};
+#else
+ vector<string> args{"cp", "-aT", layer, rootfs};
+#endif // __APPLE__
+
+ Try<Subprocess> s = subprocess(
+ "cp",
+ args,
+ Subprocess::PATH("/dev/null"),
+ Subprocess::PATH("/dev/null"),
+ Subprocess::PIPE()); // Piped so cp.err() can be read on failure below.
+
+ if (s.isError()) {
+ return Failure("Failed to create 'cp' subprocess: " + s.error());
+ }
+
+ Subprocess cp = s.get();
+
+ return cp.status()
+ .then([cp](const Option<int>& status) -> Future<Nothing> {
+ if (status.isNone()) {
+ return Failure("Failed to reap subprocess to copy image");
+ } else if (status.get() != 0) {
+ return io::read(cp.err().get()) // NOTE(review): only read after exit; confirm cp cannot block on a full pipe.
+ .then([](const string& err) -> Future<Nothing> {
+ return Failure("Failed to copy layer: " + err);
+ });
+ }
+
+ return Nothing();
+ });
+}
+
+
+Future<bool> CopyBackendProcess::destroy(const string& rootfs)
+{
+ vector<string> argv{"rm", "-rf", rootfs}; // Remove the rootfs tree with rm.
+
+ Try<Subprocess> s = subprocess(
+ "rm",
+ argv,
+ Subprocess::PATH("/dev/null"),
+ Subprocess::FD(STDOUT_FILENO),
+ Subprocess::FD(STDERR_FILENO));
+
+ if (s.isError()) {
+ return Failure("Failed to create 'rm' subprocess: " + s.error());
+ }
+
+ return s.get().status()
+ .then([](const Option<int>& status) -> Future<bool> {
+ if (status.isNone()) {
+ return Failure("Failed to reap subprocess to destroy rootfs");
+ } else if (status.get() != 0) {
+ return Failure("Failed to destroy rootfs, exit status: " +
+ WSTRINGIFY(status.get()));
+ }
+
+ return true; // rm exited with status 0.
+ });
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/backends/copy.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/backends/copy.hpp b/src/slave/containerizer/mesos/provisioner/backends/copy.hpp
new file mode 100644
index 0000000..0d3e7d4
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/backends/copy.hpp
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_BACKENDS_COPY_HPP__
+#define __PROVISIONER_BACKENDS_COPY_HPP__
+
+#include "slave/containerizer/mesos/provisioner/backend.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Forward declaration.
+class CopyBackendProcess;
+
+
+// The backend implementation that copies the layers to the target.
+// NOTE: Using this backend currently has a few implications:
+// 1) The disk space used by the provisioned rootfs is not counted
+// towards either the usage by the executor/task or the store
+// cache, which can interfere with the slave host's disk space
+// allocation.
+// 2) The task can write without restriction into the provisioned
+// rootfs, which is not accounted for (in terms of disk usage) either.
+class CopyBackend : public Backend
+{
+public:
+ virtual ~CopyBackend();
+
+ // CopyBackend doesn't use any flags.
+ static Try<process::Owned<Backend>> create(const Flags&);
+
+ // Provisions a rootfs given the layers' paths and target rootfs
+ // path.
+ virtual process::Future<Nothing> provision(
+ const std::vector<std::string>& layers,
+ const std::string& rootfs);
+
+ virtual process::Future<bool> destroy(const std::string& rootfs);
+
+private:
+ explicit CopyBackend(process::Owned<CopyBackendProcess> process);
+
+ CopyBackend(const CopyBackend&); // Not copyable.
+ CopyBackend& operator=(const CopyBackend&); // Not assignable.
+
+ process::Owned<CopyBackendProcess> process;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_BACKENDS_COPY_HPP__
[09/12] mesos git commit: Relocated MesosContainerizer specific files
to the correct location.
Posted by ji...@apache.org.
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/network/port_mapping.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/network/port_mapping.hpp b/src/slave/containerizer/isolators/network/port_mapping.hpp
deleted file mode 100644
index ae53c1b..0000000
--- a/src/slave/containerizer/isolators/network/port_mapping.hpp
+++ /dev/null
@@ -1,403 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PORT_MAPPING_ISOLATOR_HPP__
-#define __PORT_MAPPING_ISOLATOR_HPP__
-
-#include <stdint.h>
-
-#include <sys/types.h>
-
-#include <set>
-#include <string>
-#include <vector>
-
-#include <process/owned.hpp>
-#include <process/subprocess.hpp>
-
-#include <process/metrics/metrics.hpp>
-#include <process/metrics/counter.hpp>
-
-#include <stout/bytes.hpp>
-#include <stout/hashmap.hpp>
-#include <stout/hashset.hpp>
-#include <stout/ip.hpp>
-#include <stout/interval.hpp>
-#include <stout/mac.hpp>
-#include <stout/none.hpp>
-#include <stout/option.hpp>
-#include <stout/subcommand.hpp>
-
-#include "linux/routing/filter/ip.hpp"
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// The prefix this isolator uses for the virtual ethernet devices.
-// NOTE: This constant is exposed for testing.
-inline std::string PORT_MAPPING_VETH_PREFIX() { return "mesos"; }
-
-
-// The root directory where we bind mount all the namespace handles.
-// We choose the directory '/var/run/netns' so that we can use the
-// iproute2 suite (e.g., ip netns show/exec) to inspect or enter the
-// network namespace. This is very useful for debugging purposes.
-// NOTE: This constant is exposed for testing.
-inline std::string PORT_MAPPING_BIND_MOUNT_ROOT() { return "/var/run/netns"; }
-
-// The root directory where we keep all the namespace handle
-// symlinks. This was introduced in 0.23.0.
-// NOTE: This constant is exposed for testing.
-inline std::string PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT()
-{
- return "/var/run/mesos/netns";
-}
-
-
-// These names are used to identify the traffic control statistics
-// output for each of the Linux Traffic Control Qdiscs we report.
-constexpr char NET_ISOLATOR_BW_LIMIT[] = "bw_limit";
-constexpr char NET_ISOLATOR_BLOAT_REDUCTION[] = "bloat_reduction";
-
-
-// Responsible for allocating ephemeral ports for the port mapping
-// network isolator. This class is exposed mainly for unit testing.
-class EphemeralPortsAllocator
-{
-public:
- EphemeralPortsAllocator(
- const IntervalSet<uint16_t>& total,
- size_t _portsPerContainer)
- : free(total),
- portsPerContainer_(_portsPerContainer) {};
-
- // Returns the number of ephemeral ports for each container.
- size_t portsPerContainer() const { return portsPerContainer_; }
-
- // Allocate an ephemeral port range for a container. The allocator
- // will automatically find one port range with the given container
- // size. Returns an error if the allocation cannot be fulfilled
- // (e.g., when the available ephemeral ports are exhausted).
- Try<Interval<uint16_t>> allocate();
-
- // Mark the specified ephemeral port range as allocated.
- void allocate(const Interval<uint16_t>& ports);
-
- // Deallocate the specified ephemeral port range.
- void deallocate(const Interval<uint16_t>& ports);
-
- // Return true if the specified ephemeral port range is managed by
- // the allocator, regardless of whether it has been allocated or not.
- bool isManaged(const Interval<uint16_t>& ports)
- {
- return (free + used).contains(ports);
- }
-
-private:
- // Given an integer x, return the smallest integer t such that t >=
- // x and t % m == 0.
- static uint32_t nextMultipleOf(uint32_t x, uint32_t m);
-
- IntervalSet<uint16_t> free;
- IntervalSet<uint16_t> used;
-
- // The number of ephemeral ports for each container.
- size_t portsPerContainer_;
-};
-
-
-// For the specified ports, generate a set of port ranges each of
-// which can be used by a single IP filter. In other words, each port
-// range needs to satisfy the following two conditions: 1) the size of
-// the range is 2^n (n=0,1,2...); 2) the begin of the range is size
-// aligned (i.e., begin % size == 0). This function is exposed mainly
-// for unit testing.
-std::vector<routing::filter::ip::PortRange> getPortRanges(
- const IntervalSet<uint16_t>& ports);
-
-
-// Provides network isolation using port mapping. Each container is
-// assigned a fixed set of ports (including ephemeral ports). The
-// isolator will set up filters on the host such that network traffic
-// to the host will be properly redirected to the corresponding
-// container depending on the destination ports. The network traffic
-// from containers will also be properly relayed to the host. This
-// isolator is useful when the operator wants to reuse the host IP for
-// all containers running on the host (e.g., there are insufficient
-// IPs).
-class PortMappingIsolatorProcess : public MesosIsolatorProcess
-{
-public:
- static Try<mesos::slave::Isolator*> create(const Flags& flags);
-
- virtual ~PortMappingIsolatorProcess() {}
-
- virtual process::Future<Nothing> recover(
- const std::list<mesos::slave::ContainerState>& states,
- const hashset<ContainerID>& orphans);
-
- virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
- const ContainerID& containerId,
- const ExecutorInfo& executorInfo,
- const std::string& directory,
- const Option<std::string>& user);
-
- virtual process::Future<Nothing> isolate(
- const ContainerID& containerId,
- pid_t pid);
-
- virtual process::Future<mesos::slave::ContainerLimitation> watch(
- const ContainerID& containerId);
-
- virtual process::Future<Nothing> update(
- const ContainerID& containerId,
- const Resources& resources);
-
- virtual process::Future<ResourceStatistics> usage(
- const ContainerID& containerId);
-
- virtual process::Future<Nothing> cleanup(
- const ContainerID& containerId);
-
-private:
- struct Info
- {
- Info(const IntervalSet<uint16_t>& _nonEphemeralPorts,
- const Interval<uint16_t>& _ephemeralPorts,
- const Option<pid_t>& _pid = None())
- : nonEphemeralPorts(_nonEphemeralPorts),
- ephemeralPorts(_ephemeralPorts),
- pid(_pid) {}
-
- // Non-ephemeral ports used by the container. It's possible that a
- // container does not use any non-ephemeral ports. In that case,
- // 'nonEphemeralPorts' will be empty. This variable could change
- // upon 'update'.
- IntervalSet<uint16_t> nonEphemeralPorts;
-
- // Each container has one and only one range of ephemeral ports.
- // It cannot have more than one range of ephemeral ports because
- // we need to set up the ip_local_port_range (which only accepts a
- // single interval) inside the container to restrict the ephemeral
- // ports used by the container.
- const Interval<uint16_t> ephemeralPorts;
-
- Option<pid_t> pid;
- Option<uint16_t> flowId;
- };
-
- // Define the metrics used by the port mapping network isolator.
- struct Metrics
- {
- Metrics();
- ~Metrics();
-
- process::metrics::Counter adding_eth0_ip_filters_errors;
- process::metrics::Counter adding_eth0_ip_filters_already_exist;
- process::metrics::Counter adding_eth0_egress_filters_errors;
- process::metrics::Counter adding_eth0_egress_filters_already_exist;
- process::metrics::Counter adding_lo_ip_filters_errors;
- process::metrics::Counter adding_lo_ip_filters_already_exist;
- process::metrics::Counter adding_veth_ip_filters_errors;
- process::metrics::Counter adding_veth_ip_filters_already_exist;
- process::metrics::Counter adding_veth_icmp_filters_errors;
- process::metrics::Counter adding_veth_icmp_filters_already_exist;
- process::metrics::Counter adding_veth_arp_filters_errors;
- process::metrics::Counter adding_veth_arp_filters_already_exist;
- process::metrics::Counter adding_eth0_icmp_filters_errors;
- process::metrics::Counter adding_eth0_icmp_filters_already_exist;
- process::metrics::Counter adding_eth0_arp_filters_errors;
- process::metrics::Counter adding_eth0_arp_filters_already_exist;
- process::metrics::Counter removing_eth0_ip_filters_errors;
- process::metrics::Counter removing_eth0_ip_filters_do_not_exist;
- process::metrics::Counter removing_eth0_egress_filters_errors;
- process::metrics::Counter removing_eth0_egress_filters_do_not_exist;
- process::metrics::Counter removing_lo_ip_filters_errors;
- process::metrics::Counter removing_lo_ip_filters_do_not_exist;
- process::metrics::Counter removing_veth_ip_filters_errors;
- process::metrics::Counter removing_veth_ip_filters_do_not_exist;
- process::metrics::Counter removing_eth0_icmp_filters_errors;
- process::metrics::Counter removing_eth0_icmp_filters_do_not_exist;
- process::metrics::Counter removing_eth0_arp_filters_errors;
- process::metrics::Counter removing_eth0_arp_filters_do_not_exist;
- process::metrics::Counter updating_eth0_icmp_filters_errors;
- process::metrics::Counter updating_eth0_icmp_filters_already_exist;
- process::metrics::Counter updating_eth0_icmp_filters_do_not_exist;
- process::metrics::Counter updating_eth0_arp_filters_errors;
- process::metrics::Counter updating_eth0_arp_filters_already_exist;
- process::metrics::Counter updating_eth0_arp_filters_do_not_exist;
- process::metrics::Counter updating_container_ip_filters_errors;
- } metrics;
-
- PortMappingIsolatorProcess(
- const Flags& _flags,
- const std::string& _eth0,
- const std::string& _lo,
- const net::MAC& _hostMAC,
- const net::IPNetwork& _hostIPNetwork,
- const size_t _hostEth0MTU,
- const net::IP& _hostDefaultGateway,
- const hashmap<std::string, std::string>& _hostNetworkConfigurations,
- const Option<Bytes>& _egressRateLimitPerContainer,
- const IntervalSet<uint16_t>& _managedNonEphemeralPorts,
- const process::Owned<EphemeralPortsAllocator>& _ephemeralPortsAllocator,
- const std::set<uint16_t>& _flowIDs)
- : flags(_flags),
- eth0(_eth0),
- lo(_lo),
- hostMAC(_hostMAC),
- hostIPNetwork(_hostIPNetwork),
- hostEth0MTU(_hostEth0MTU),
- hostDefaultGateway(_hostDefaultGateway),
- hostNetworkConfigurations(_hostNetworkConfigurations),
- egressRateLimitPerContainer(_egressRateLimitPerContainer),
- managedNonEphemeralPorts(_managedNonEphemeralPorts),
- ephemeralPortsAllocator(_ephemeralPortsAllocator),
- freeFlowIds(_flowIDs) {}
-
- // Continuations.
- Try<Nothing> _cleanup(Info* info, const Option<ContainerID>& containerId);
- Try<Info*> _recover(pid_t pid);
-
- void _update(
- const ContainerID& containerId,
- const process::Future<Option<int>>& status);
-
- process::Future<ResourceStatistics> _usage(
- const ResourceStatistics& result,
- const process::Subprocess& s);
-
- process::Future<ResourceStatistics> __usage(
- ResourceStatistics result,
- const process::Future<std::string>& out);
-
- // Helper functions.
- Try<Nothing> addHostIPFilters(
- const routing::filter::ip::PortRange& range,
- const Option<uint16_t>& flowId,
- const std::string& veth);
-
- Try<Nothing> removeHostIPFilters(
- const routing::filter::ip::PortRange& range,
- const std::string& veth,
- bool removeFiltersOnVeth = true);
-
- // Return the scripts that will be executed in the child context.
- std::string scripts(Info* info);
-
- uint16_t getNextFlowId();
-
- const Flags flags;
-
- const std::string eth0;
- const std::string lo;
- const net::MAC hostMAC;
- const net::IPNetwork hostIPNetwork;
- const size_t hostEth0MTU;
- const net::IP hostDefaultGateway;
-
- // Describe the host network configurations. It is a map from
- // configuration proc files (e.g., /proc/sys/net/core/somaxconn) to
- // the values of those proc files.
- const hashmap<std::string, std::string> hostNetworkConfigurations;
-
- // The optional throughput limit to containers' egress traffic.
- const Option<Bytes> egressRateLimitPerContainer;
-
- // All the non-ephemeral ports managed by the slave, as passed in
- // via flags.resources.
- const IntervalSet<uint16_t> managedNonEphemeralPorts;
-
- process::Owned<EphemeralPortsAllocator> ephemeralPortsAllocator;
-
- // Store a set of unused flow IDs on this slave.
- std::set<uint16_t> freeFlowIds;
-
- hashmap<ContainerID, Info*> infos;
-
- // Recovered containers from a previous run that weren't managed by
- // the network isolator.
- hashset<ContainerID> unmanaged;
-};
-
-
-// Defines the 'update' subcommand, which is executed by a subprocess
-// to update the filters inside a container.
-class PortMappingUpdate : public Subcommand
-{
-public:
- static const char* NAME;
-
- struct Flags : public flags::FlagsBase
- {
- Flags();
-
- Option<std::string> eth0_name;
- Option<std::string> lo_name;
- Option<pid_t> pid;
- Option<JSON::Object> ports_to_add;
- Option<JSON::Object> ports_to_remove;
- };
-
- PortMappingUpdate() : Subcommand(NAME) {}
-
- Flags flags;
-
-protected:
- virtual int execute();
- virtual flags::FlagsBase* getFlags() { return &flags; }
-};
-
-
-// Defines the 'statistics' subcommand, which is executed by a
-// subprocess to retrieve network statistics from inside a
-// container.
-class PortMappingStatistics : public Subcommand
-{
-public:
- static const char* NAME;
-
- struct Flags : public flags::FlagsBase
- {
- Flags();
-
- Option<std::string> eth0_name;
- Option<pid_t> pid;
- bool enable_socket_statistics_summary;
- bool enable_socket_statistics_details;
- };
-
- PortMappingStatistics() : Subcommand(NAME) {}
-
- Flags flags;
-
-protected:
- virtual int execute();
- virtual flags::FlagsBase* getFlags() { return &flags; }
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PORT_MAPPING_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/posix.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/posix.hpp b/src/slave/containerizer/isolators/posix.hpp
deleted file mode 100644
index 00df902..0000000
--- a/src/slave/containerizer/isolators/posix.hpp
+++ /dev/null
@@ -1,213 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __POSIX_ISOLATOR_HPP__
-#define __POSIX_ISOLATOR_HPP__
-
-#include <process/future.hpp>
-
-#include <stout/hashmap.hpp>
-#include <stout/os.hpp>
-
-#include <stout/os/pstree.hpp>
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-#include "usage/usage.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// Base isolator process that only records each container's pid and a
-// limitation promise; it does not isolate any resource. Subclasses
-// must implement usage() for their appropriate resource(s).
-class PosixIsolatorProcess : public MesosIsolatorProcess
-{
-public:
-  // Registers the pid and a fresh limitation promise for every
-  // recovered container. Fails if a container is seen twice.
-  virtual process::Future<Nothing> recover(
-      const std::list<mesos::slave::ContainerState>& state,
-      const hashset<ContainerID>& orphans)
-  {
-    foreach (const mesos::slave::ContainerState& run, state) {
-      const ContainerID& containerId = run.container_id();
-
-      // This should (almost) never occur: see comment in
-      // PosixLauncher::recover().
-      if (pids.contains(containerId)) {
-        return process::Failure("Container already recovered");
-      }
-
-      pids.put(containerId, run.pid());
-
-      process::Owned<process::Promise<mesos::slave::ContainerLimitation>>
-        limitation(new process::Promise<mesos::slave::ContainerLimitation>());
-      promises.put(containerId, limitation);
-    }
-
-    return Nothing();
-  }
-
-  // Creates the limitation promise for a new container. Fails if the
-  // container already has one. No prepare info is returned.
-  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& user)
-  {
-    const bool alreadyPrepared = promises.contains(containerId);
-
-    if (alreadyPrepared) {
-      return process::Failure("Container " + stringify(containerId) +
-                              " has already been prepared");
-    }
-
-    process::Owned<process::Promise<mesos::slave::ContainerLimitation>>
-      limitation(new process::Promise<mesos::slave::ContainerLimitation>());
-    promises.put(containerId, limitation);
-
-    return None();
-  }
-
-  // Remembers the pid of a previously prepared container.
-  virtual process::Future<Nothing> isolate(
-      const ContainerID& containerId,
-      pid_t pid)
-  {
-    if (promises.contains(containerId)) {
-      pids.put(containerId, pid);
-      return Nothing();
-    }
-
-    return process::Failure("Unknown container: " + stringify(containerId));
-  }
-
-  // Returns the future of the container's limitation promise.
-  virtual process::Future<mesos::slave::ContainerLimitation> watch(
-      const ContainerID& containerId)
-  {
-    if (promises.contains(containerId)) {
-      return promises[containerId]->future();
-    }
-
-    return process::Failure("Unknown container: " + stringify(containerId));
-  }
-
-  // Only validates that the container is known.
-  virtual process::Future<Nothing> update(
-      const ContainerID& containerId,
-      const Resources& resources)
-  {
-    if (promises.contains(containerId)) {
-      // No resources are actually isolated so nothing to do.
-      return Nothing();
-    }
-
-    return process::Failure("Unknown container: " + stringify(containerId));
-  }
-
-  // Forgets the container's promise and pid.
-  virtual process::Future<Nothing> cleanup(const ContainerID& containerId)
-  {
-    const bool known = promises.contains(containerId);
-
-    if (!known) {
-      return process::Failure("Unknown container: " + stringify(containerId));
-    }
-
-    // TODO(idownes): We should discard the container's promise here to signal
-    // to anyone that holds the future from watch().
-    promises.erase(containerId);
-
-    pids.erase(containerId);
-
-    return Nothing();
-  }
-
-protected:
-  // Pid of each container, set by recover() and isolate().
-  hashmap<ContainerID, pid_t> pids;
-
-  // Limitation promise of each container, set by recover() and
-  // prepare().
-  hashmap<ContainerID,
-          process::Owned<process::Promise<mesos::slave::ContainerLimitation>>>
-    promises;
-};
-
-
-// PosixIsolatorProcess that reports the 'cpus_' statistics of a
-// container's pid.
-class PosixCpuIsolatorProcess : public PosixIsolatorProcess
-{
-public:
-  // Wraps a new PosixCpuIsolatorProcess in a MesosIsolator. 'flags'
-  // is not used.
-  static Try<mesos::slave::Isolator*> create(const Flags& flags)
-  {
-    return new MesosIsolator(
-        process::Owned<MesosIsolatorProcess>(new PosixCpuIsolatorProcess()));
-  }
-
-  // Returns empty statistics (with a warning) for a container whose
-  // pid is not known; otherwise the statistics collected for its pid.
-  virtual process::Future<ResourceStatistics> usage(
-      const ContainerID& containerId)
-  {
-    const Option<pid_t> pid = pids.get(containerId);
-
-    if (pid.isNone()) {
-      LOG(WARNING) << "No resource usage for unknown container '"
-                   << containerId << "'";
-      return ResourceStatistics();
-    }
-
-    // Use 'mesos-usage' but only request 'cpus_' values.
-    Try<ResourceStatistics> statistics =
-      mesos::internal::usage(pid.get(), false, true);
-
-    if (statistics.isError()) {
-      return process::Failure(statistics.error());
-    }
-
-    return statistics.get();
-  }
-
-private:
-  PosixCpuIsolatorProcess() {}
-};
-
-
-// PosixIsolatorProcess that reports the 'mem_' statistics of a
-// container's pid.
-class PosixMemIsolatorProcess : public PosixIsolatorProcess
-{
-public:
-  // Wraps a new PosixMemIsolatorProcess in a MesosIsolator. 'flags'
-  // is not used.
-  static Try<mesos::slave::Isolator*> create(const Flags& flags)
-  {
-    return new MesosIsolator(
-        process::Owned<MesosIsolatorProcess>(new PosixMemIsolatorProcess()));
-  }
-
-  // Returns empty statistics (with a warning) for a container whose
-  // pid is not known; otherwise the statistics collected for its pid.
-  virtual process::Future<ResourceStatistics> usage(
-      const ContainerID& containerId)
-  {
-    const Option<pid_t> pid = pids.get(containerId);
-
-    if (pid.isNone()) {
-      LOG(WARNING) << "No resource usage for unknown container '"
-                   << containerId << "'";
-      return ResourceStatistics();
-    }
-
-    // Use 'mesos-usage' but only request 'mem_' values.
-    Try<ResourceStatistics> statistics =
-      mesos::internal::usage(pid.get(), true, false);
-
-    if (statistics.isError()) {
-      return process::Failure(statistics.error());
-    }
-
-    return statistics.get();
-  }
-
-private:
-  PosixMemIsolatorProcess() {}
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __POSIX_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/posix/disk.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/posix/disk.cpp b/src/slave/containerizer/isolators/posix/disk.cpp
deleted file mode 100644
index 73e62a2..0000000
--- a/src/slave/containerizer/isolators/posix/disk.cpp
+++ /dev/null
@@ -1,525 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <signal.h>
-
-#ifdef __linux__
-#include <sys/prctl.h>
-#endif
-#include <sys/types.h>
-
-#include <deque>
-#include <tuple>
-
-#include <glog/logging.h>
-
-#include <process/check.hpp>
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/delay.hpp>
-#include <process/io.hpp>
-#include <process/subprocess.hpp>
-
-#include <stout/check.hpp>
-#include <stout/foreach.hpp>
-#include <stout/lambda.hpp>
-#include <stout/numify.hpp>
-#include <stout/strings.hpp>
-
-#include <stout/os/exists.hpp>
-#include <stout/os/killtree.hpp>
-
-#include "common/protobuf_utils.hpp"
-
-#include "slave/containerizer/isolators/posix/disk.hpp"
-
-using namespace process;
-
-using std::deque;
-using std::list;
-using std::string;
-using std::vector;
-
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::ContainerState;
-using mesos::slave::Isolator;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// Creates a PosixDiskIsolatorProcess configured with 'flags' and
-// returns it wrapped in a MesosIsolator.
-Try<Isolator*> PosixDiskIsolatorProcess::create(const Flags& flags)
-{
-  // TODO(jieyu): Check the availability of command 'du'.
-
-  process::Owned<MesosIsolatorProcess> isolatorProcess(
-      new PosixDiskIsolatorProcess(flags));
-
-  return new MesosIsolator(isolatorProcess);
-}
-
-
-// Discards the usage future held for this path when the entry goes
-// away.
-PosixDiskIsolatorProcess::Info::PathInfo::~PathInfo()
-{
-  this->usage.discard();
-}
-
-
-// Keeps a copy of the flags and sets up the disk usage collector with
-// the configured container disk watch interval.
-PosixDiskIsolatorProcess::PosixDiskIsolatorProcess(const Flags& _flags)
-  : flags(_flags),
-    collector(flags.container_disk_watch_interval)
-{
-}
-
-
-PosixDiskIsolatorProcess::~PosixDiskIsolatorProcess()
-{
-}
-
-
-// Recreates an Info entry, keyed by container id, for the work
-// directory of every recovered container.
-Future<Nothing> PosixDiskIsolatorProcess::recover(
-    const list<ContainerState>& states,
-    const hashset<ContainerID>& orphans)
-{
-  foreach (const ContainerState& state, states) {
-    const string& directory = state.directory();
-
-    // Since we checkpoint the executor after we create its working
-    // directory, the working directory should definitely exist.
-    CHECK(os::exists(directory))
-      << "Executor work directory " << directory << " doesn't exist";
-
-    Owned<Info> info(new Info(directory));
-    infos.put(state.container_id(), info);
-  }
-
-  return Nothing();
-}
-
-
-// Registers a new container with its work directory. Fails if the
-// container is already registered. No prepare info is returned.
-Future<Option<ContainerPrepareInfo>> PosixDiskIsolatorProcess::prepare(
-    const ContainerID& containerId,
-    const ExecutorInfo& executorInfo,
-    const string& directory,
-    const Option<string>& user)
-{
-  const bool alreadyPrepared = infos.contains(containerId);
-
-  if (alreadyPrepared) {
-    return Failure("Container has already been prepared");
-  }
-
-  Owned<Info> info(new Info(directory));
-  infos.put(containerId, info);
-
-  return None();
-}
-
-
-// Only validates that the container is known; 'pid' is not used.
-Future<Nothing> PosixDiskIsolatorProcess::isolate(
-    const ContainerID& containerId,
-    pid_t pid)
-{
-  if (infos.contains(containerId)) {
-    return Nothing();
-  }
-
-  return Failure("Unknown container");
-}
-
-
-// Returns the future of the container's limitation promise, or a
-// failure if the container is unknown.
-Future<ContainerLimitation> PosixDiskIsolatorProcess::watch(
-    const ContainerID& containerId)
-{
-  if (infos.contains(containerId)) {
-    return infos[containerId]->limitation.future();
-  }
-
-  return Failure("Unknown container");
-}
-
-
-Future<Nothing> PosixDiskIsolatorProcess::update(
- const ContainerID& containerId,
- const Resources& resources)
-{
- if (!infos.contains(containerId)) {
- LOG(WARNING) << "Ignoring update for unknown container " << containerId;
- return Nothing();
- }
-
- LOG(INFO) << "Updating the disk resources for container "
- << containerId << " to " << resources;
-
- const Owned<Info>& info = infos[containerId];
-
- // This stores the updated quotas, keyed by path.
- hashmap<string, Resources> quotas;
-
- foreach (const Resource& resource, resources) {
- if (resource.name() != "disk") {
- continue;
- }
-
- // The path at which we will collect disk usage and enforce quota.
- string path;
-
- // NOTE: We do not allow the case where has_disk() is true but
- // with nothing set inside DiskInfo. The master will enforce it.
- if (!resource.has_disk()) {
- // Regular disk used for executor working directory.
- path = info->directory;
- } else {
- // TODO(jieyu): Support persistent volumes as well.
- LOG(ERROR) << "Enforcing disk quota unsupported for " << resource;
- continue;
- }
-
- quotas[path] += resource;
- }
-
- // Update the quota for paths. For each new path, we also initiate
- // the disk usage collection.
- foreachpair (const string& path, const Resources& quota, quotas) {
- if (!info->paths.contains(path)) {
- info->paths[path].usage = collector.usage(path)
- .onAny(defer(
- PID<PosixDiskIsolatorProcess>(this),
- &PosixDiskIsolatorProcess::_collect,
- containerId,
- path,
- lambda::_1));
- }
-
- info->paths[path].quota = quota;
- }
-
- // Remove paths that we are no longer interested in.
- foreach (const string& path, info->paths.keys()) {
- if (!quotas.contains(path)) {
- info->paths.erase(path);
- }
- }
-
- return Nothing();
-}
-
-
-// Continuation run with the result of one disk usage check of 'path'
-// for 'containerId'. It records the usage, reports a limitation when
-// quota enforcement is enabled and the usage exceeds the quota, and
-// then requests the next check from the collector.
-void PosixDiskIsolatorProcess::_collect(
-    const ContainerID& containerId,
-    const string& path,
-    const Future<Bytes>& future)
-{
-  if (future.isDiscarded()) {
-    LOG(INFO) << "Checking disk usage at '" << path << "' for container "
-              << containerId << " has been cancelled";
-  } else if (future.isFailed()) {
-    LOG(ERROR) << "Checking disk usage at '" << path << "' for container "
-               << containerId << " has failed: " << future.failure();
-  }
-
-  // The container might have just been destroyed.
-  if (!infos.contains(containerId)) {
-    return;
-  }
-
-  const Owned<Info>& info = infos[containerId];
-
-  // The path might have just been removed from this container's
-  // resources.
-  if (!info->paths.contains(path)) {
-    return;
-  }
-
-  // Check if the disk usage exceeds the quota. If yes, report the
-  // limitation. We keep collecting the disk usage for 'path' by
-  // initiating another round of disk usage check. The check will be
-  // throttled by DiskUsageCollector.
-  if (future.isReady()) {
-    auto& pathInfo = info->paths[path];
-    const Bytes used = future.get();
-
-    // Save the last disk usage.
-    pathInfo.lastUsage = used;
-
-    if (flags.enforce_container_disk_quota) {
-      Option<Bytes> quota = pathInfo.quota.disk();
-      CHECK_SOME(quota);
-
-      if (used > quota.get()) {
-        info->limitation.set(
-            protobuf::slave::createContainerLimitation(
-                Resources(pathInfo.quota),
-                "Disk usage (" + stringify(used) +
-                ") exceeds quota (" + stringify(quota.get()) + ")",
-                TaskStatus::REASON_CONTAINER_LIMITATION_DISK));
-      }
-    }
-  }
-
-  info->paths[path].usage = collector.usage(path)
-    .onAny(defer(
-        PID<PosixDiskIsolatorProcess>(this),
-        &PosixDiskIsolatorProcess::_collect,
-        containerId,
-        path,
-        lambda::_1));
-}
-
-
-// Reports the disk quota and the last collected disk usage of the
-// container's work directory. The statistics stay empty when that
-// directory is not among the tracked paths.
-Future<ResourceStatistics> PosixDiskIsolatorProcess::usage(
-    const ContainerID& containerId)
-{
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  ResourceStatistics result;
-
-  const Owned<Info>& info = infos[containerId];
-
-  if (!info->paths.contains(info->directory)) {
-    return result;
-  }
-
-  auto& sandbox = info->paths[info->directory];
-
-  Option<Bytes> quota = sandbox.quota.disk();
-  CHECK_SOME(quota);
-
-  result.set_disk_limit_bytes(quota.get().bytes());
-
-  // NOTE: There may be a large delay (# of containers * interval)
-  // until an initial cached value is returned here!
-  if (sandbox.lastUsage.isSome()) {
-    result.set_disk_used_bytes(sandbox.lastUsage.get().bytes());
-  }
-
-  return result;
-}
-
-
-// Drops the bookkeeping of the container. Cleanup of an unknown
-// container is logged and otherwise ignored.
-Future<Nothing> PosixDiskIsolatorProcess::cleanup(
-    const ContainerID& containerId)
-{
-  if (infos.contains(containerId)) {
-    infos.erase(containerId);
-  } else {
-    LOG(WARNING) << "Ignoring cleanup for unknown container " << containerId;
-  }
-
-  return Nothing();
-}
-
-
-class DiskUsageCollectorProcess : public Process<DiskUsageCollectorProcess>
-{
-public:
- DiskUsageCollectorProcess(const Duration& _interval) : interval(_interval) {}
- virtual ~DiskUsageCollectorProcess() {}
-
- Future<Bytes> usage(const string& path)
- {
- foreach (const Owned<Entry>& entry, entries) {
- if (entry->path == path) {
- return entry->promise.future();
- }
- }
-
- entries.push_back(Owned<Entry>(new Entry(path)));
-
- // Install onDiscard callback.
- Future<Bytes> future = entries.back()->promise.future();
- future.onDiscard(defer(self(), &Self::discard, path));
-
- return future;
- }
-
-protected:
- void initialize()
- {
- schedule();
- }
-
- void finalize()
- {
- foreach (const Owned<Entry>& entry, entries) {
- if (entry->du.isSome() && entry->du.get().status().isPending()) {
- os::killtree(entry->du.get().pid(), SIGKILL);
- }
-
- entry->promise.fail("DiskUsageCollector is destroyed");
- }
- }
-
-private:
- // Describe a single pending check.
- struct Entry
- {
- explicit Entry(const string& _path) : path(_path) {}
-
- string path;
- Option<Subprocess> du;
- Promise<Bytes> promise;
- };
-
- // This function is invoked right before each 'du' is exec'ed. Note
- // that this function needs to be async signal safe.
- static int setupChild()
- {
-#ifdef __linux__
- // Kill the child process if the parent exits.
- // NOTE: This function should never returns non-zero because we
- // are passing in a valid signal.
- return ::prctl(PR_SET_PDEATHSIG, SIGKILL);
-#else
- return 0;
-#endif
- }
-
- void discard(const string& path)
- {
- for (auto it = entries.begin(); it != entries.end(); ++it) {
- // We only cancel those checks whose 'du' haven't been launched.
- if ((*it)->path == path && (*it)->du.isNone()) {
- (*it)->promise.discard();
- entries.erase(it);
- break;
- }
- }
- }
-
- // Schedule a 'du' to be invoked. The current implementation does
- // not allow multiple 'du's running concurrently. The minimal
- // interval between two subsequent 'du's is controlled by 'interval'
- // for throttling purpose.
- void schedule()
- {
- if (entries.empty()) {
- delay(interval, self(), &Self::schedule);
- return;
- }
-
- const Owned<Entry>& entry = entries.front();
-
- // Invoke 'du' and report number of 1K-byte blocks. We fix the
- // block size here so that we can get consistent results on all
- // platforms (e.g., OS X uses 512 byte blocks).
- //
- // NOTE: The 'du' processes are run in the slave's cgroup and it
- // will be that cgroup that is charged for (a) memory to cache the
- // fs data structures, (b) disk I/O to read those structures, and
- // (c) the cpu time to traverse.
- Try<Subprocess> s = subprocess(
- "du",
- vector<string>({"du", "-k", "-s", entry->path}),
- Subprocess::PATH("/dev/null"),
- Subprocess::PIPE(),
- Subprocess::PIPE(),
- None(),
- None(),
- setupChild);
-
- if (s.isError()) {
- entry->promise.fail("Failed to exec 'du': " + s.error());
-
- entries.pop_front();
- delay(interval, self(), &Self::schedule);
- return;
- }
-
- entry->du = s.get();
-
- await(s.get().status(),
- io::read(s.get().out().get()),
- io::read(s.get().err().get()))
- .onAny(defer(self(), &Self::_schedule, lambda::_1));
- }
-
- void _schedule(const Future<std::tuple<
- Future<Option<int>>,
- Future<string>,
- Future<string>>>& future)
- {
- CHECK_READY(future);
- CHECK(!entries.empty());
-
- const Owned<Entry>& entry = entries.front();
- CHECK_SOME(entry->du);
-
- Future<Option<int>> status = std::get<0>(future.get());
-
- if (!status.isReady()) {
- entry->promise.fail(
- "Failed to perform 'du': " +
- (status.isFailed() ? status.failure() : "discarded"));
- } else if (status.get().isNone()) {
- entry->promise.fail("Failed to reap the status of 'du'");
- } else if (status.get().get() != 0) {
- Future<string> error = std::get<2>(future.get());
- if (!error.isReady()) {
- entry->promise.fail(
- "Failed to perform 'du'. Reading stderr failed: " +
- (error.isFailed() ? error.failure() : "discarded"));
- } else {
- entry->promise.fail("Failed to perform 'du': " + error.get());
- }
- } else {
- Future<string> output = std::get<1>(future.get());
- if (!output.isReady()) {
- entry->promise.fail(
- "Failed to read stdout from 'du': " +
- (output.isFailed() ? output.failure() : "discarded"));
- } else {
- // Parsing the output from 'du'. The following is a sample
- // output. Tab is used as the delimiter between the number of
- // blocks and the checked path.
- // $ du /var/lib/mesos/.../runs/container_id
- // 1024 /var/lib/mesos/.../runs/container_id
- vector<string> tokens = strings::tokenize(output.get(), " \t");
- if (tokens.empty()) {
- entry->promise.fail("Unexpected output from 'du': " + output.get());
- } else {
- Try<size_t> value = numify<size_t>(tokens[0]);
- if (value.isError()) {
- entry->promise.fail("Unexpected output from 'du': " + output.get());
- } else {
- // Notify the callers.
- entry->promise.set(Kilobytes(value.get()));
- }
- }
- }
- }
-
- entries.pop_front();
- delay(interval, self(), &Self::schedule);
- }
-
- const Duration interval;
-
- // A queue of pending checks.
- deque<Owned<Entry>> entries;
-};
-
-
-DiskUsageCollector::DiskUsageCollector(const Duration& interval)
-{
- process = new DiskUsageCollectorProcess(interval);
- spawn(process);
-}
-
-
-DiskUsageCollector::~DiskUsageCollector()
-{
- terminate(process);
- wait(process);
- delete process;
-}
-
-
-Future<Bytes> DiskUsageCollector::usage(const string& path)
-{
- return dispatch(process, &DiskUsageCollectorProcess::usage, path);
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/posix/disk.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/posix/disk.hpp b/src/slave/containerizer/isolators/posix/disk.hpp
deleted file mode 100644
index 85df5d2..0000000
--- a/src/slave/containerizer/isolators/posix/disk.hpp
+++ /dev/null
@@ -1,151 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __POSIX_DISK_ISOLATOR_HPP__
-#define __POSIX_DISK_ISOLATOR_HPP__
-
-#include <string>
-
-#include <process/owned.hpp>
-
-#include <stout/bytes.hpp>
-#include <stout/duration.hpp>
-#include <stout/hashmap.hpp>
-
-#include "slave/flags.hpp"
-#include "slave/state.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// Forward declarations.
-class DiskUsageCollectorProcess;
-
-
-// Responsible for collecting disk usage for paths, while ensuring
-// that an interval elapses between each collection.
-class DiskUsageCollector
-{
-public:
- DiskUsageCollector(const Duration& interval);
- ~DiskUsageCollector();
-
- // Returns the disk usage rooted at 'path'. The user can discard the
- // returned future to cancel the check.
- process::Future<Bytes> usage(const std::string& path);
-
-private:
- DiskUsageCollectorProcess* process;
-};
-
-
-// This isolator monitors the disk usage for containers, and reports
-// ContainerLimitation when a container exceeds its disk quota. This
-// leverages the DiskUsageCollector to ensure that we don't induce too
-// much CPU usage and disk caching effects from running 'du' too
-// often.
-//
-// NOTE: Currently all containers are processed in the same queue,
-// which means that when a container starts, it could take many disk
-// collection intervals until any data is available in the resource
-// usage statistics!
-//
-// TODO(jieyu): Consider handling each container independently, or
-// triggering an initial collection when the container starts, to
-// ensure that we have usage statistics without a large delay.
-class PosixDiskIsolatorProcess : public MesosIsolatorProcess
-{
-public:
- static Try<mesos::slave::Isolator*> create(const Flags& flags);
-
- virtual ~PosixDiskIsolatorProcess();
-
- virtual process::Future<Nothing> recover(
- const std::list<mesos::slave::ContainerState>& states,
- const hashset<ContainerID>& orphans);
-
- virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
- const ContainerID& containerId,
- const ExecutorInfo& executorInfo,
- const std::string& directory,
- const Option<std::string>& user);
-
- virtual process::Future<Nothing> isolate(
- const ContainerID& containerId,
- pid_t pid);
-
- virtual process::Future<mesos::slave::ContainerLimitation> watch(
- const ContainerID& containerId);
-
- virtual process::Future<Nothing> update(
- const ContainerID& containerId,
- const Resources& resources);
-
- virtual process::Future<ResourceStatistics> usage(
- const ContainerID& containerId);
-
- virtual process::Future<Nothing> cleanup(
- const ContainerID& containerId);
-
-private:
- PosixDiskIsolatorProcess(const Flags& flags);
-
- void _collect(
- const ContainerID& containerId,
- const std::string& path,
- const process::Future<Bytes>& future);
-
- const Flags flags;
- DiskUsageCollector collector;
-
- struct Info
- {
- explicit Info(const std::string& _directory) : directory(_directory) {}
-
- // We save executor working directory here so that we know where
- // to collect disk usage for disk resources without DiskInfo.
- const std::string directory;
-
- process::Promise<mesos::slave::ContainerLimitation> limitation;
-
- // The keys of the hashmaps contain the executor working directory
- // above, and optionally paths of volumes used by the container.
- // For each path, we maintain its quota and its last usage.
- struct PathInfo
- {
- ~PathInfo();
-
- Resources quota;
- process::Future<Bytes> usage;
- Option<Bytes> lastUsage;
- };
-
- hashmap<std::string, PathInfo> paths;
- };
-
- hashmap<ContainerID, process::Owned<Info>> infos;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __POSIX_DISK_ISOLATOR_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/linux_launcher.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/linux_launcher.cpp b/src/slave/containerizer/linux_launcher.cpp
index c03b89e..654f0e1 100644
--- a/src/slave/containerizer/linux_launcher.cpp
+++ b/src/slave/containerizer/linux_launcher.cpp
@@ -40,7 +40,7 @@
#include "slave/containerizer/linux_launcher.hpp"
-#include "slave/containerizer/isolators/namespaces/pid.hpp"
+#include "slave/containerizer/mesos/isolators/namespaces/pid.hpp"
using namespace process;
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/containerizer.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/containerizer.cpp b/src/slave/containerizer/mesos/containerizer.cpp
index d1fc5a4..0664d00 100644
--- a/src/slave/containerizer/mesos/containerizer.cpp
+++ b/src/slave/containerizer/mesos/containerizer.cpp
@@ -47,36 +47,35 @@
#include "slave/containerizer/linux_launcher.hpp"
#endif
-#include "slave/containerizer/isolators/posix.hpp"
+#include "slave/containerizer/mesos/isolators/posix.hpp"
-#include "slave/containerizer/isolators/posix/disk.hpp"
+#include "slave/containerizer/mesos/isolators/posix/disk.hpp"
#ifdef __linux__
-#include "slave/containerizer/isolators/cgroups/cpushare.hpp"
-#include "slave/containerizer/isolators/cgroups/mem.hpp"
-#include "slave/containerizer/isolators/cgroups/perf_event.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/cpushare.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/mem.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/perf_event.hpp"
#endif
#ifdef __linux__
-#include "slave/containerizer/isolators/filesystem/linux.hpp"
+#include "slave/containerizer/mesos/isolators/filesystem/linux.hpp"
#endif
-#include "slave/containerizer/isolators/filesystem/posix.hpp"
+#include "slave/containerizer/mesos/isolators/filesystem/posix.hpp"
#ifdef __linux__
-#include "slave/containerizer/isolators/filesystem/shared.hpp"
+#include "slave/containerizer/mesos/isolators/filesystem/shared.hpp"
#endif
#ifdef __linux__
-#include "slave/containerizer/isolators/namespaces/pid.hpp"
+#include "slave/containerizer/mesos/isolators/namespaces/pid.hpp"
#endif
#ifdef WITH_NETWORK_ISOLATOR
-#include "slave/containerizer/isolators/network/port_mapping.hpp"
+#include "slave/containerizer/mesos/isolators/network/port_mapping.hpp"
#endif
#include "slave/containerizer/mesos/containerizer.hpp"
#include "slave/containerizer/mesos/launch.hpp"
-
-#include "slave/containerizer/provisioner/provisioner.hpp"
+#include "slave/containerizer/mesos/provisioner/provisioner.hpp"
using std::list;
using std::map;
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/cgroups/constants.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/constants.hpp b/src/slave/containerizer/mesos/isolators/cgroups/constants.hpp
new file mode 100644
index 0000000..db9dde0
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/cgroups/constants.hpp
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CGROUPS_ISOLATOR_CONSTANTS_HPP__
+#define __CGROUPS_ISOLATOR_CONSTANTS_HPP__
+
+#include <stout/bytes.hpp>
+#include <stout/duration.hpp>
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// CPU subsystem constants.
+const uint64_t CPU_SHARES_PER_CPU = 1024;
+const uint64_t CPU_SHARES_PER_CPU_REVOCABLE = 10;
+const uint64_t MIN_CPU_SHARES = 2; // Linux constant.
+const Duration CPU_CFS_PERIOD = Milliseconds(100); // Linux default.
+const Duration MIN_CPU_CFS_QUOTA = Milliseconds(1);
+
+
+// Memory subsystem constants.
+const Bytes MIN_MEMORY = Megabytes(32);
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __CGROUPS_ISOLATOR_CONSTANTS_HPP__
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/cgroups/cpushare.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/cpushare.cpp b/src/slave/containerizer/mesos/isolators/cgroups/cpushare.cpp
new file mode 100644
index 0000000..84a64e5
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/cgroups/cpushare.cpp
@@ -0,0 +1,572 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+#include <mesos/type_utils.hpp>
+#include <mesos/values.hpp>
+
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/pid.hpp>
+
+#include <stout/bytes.hpp>
+#include <stout/check.hpp>
+#include <stout/error.hpp>
+#include <stout/foreach.hpp>
+#include <stout/hashset.hpp>
+#include <stout/nothing.hpp>
+#include <stout/os.hpp>
+#include <stout/path.hpp>
+#include <stout/stringify.hpp>
+#include <stout/try.hpp>
+
+#include "linux/cgroups.hpp"
+
+#include "slave/containerizer/mesos/isolators/cgroups/cpushare.hpp"
+
+using namespace process;
+
+using std::list;
+using std::set;
+using std::string;
+using std::vector;
+
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+CgroupsCpushareIsolatorProcess::CgroupsCpushareIsolatorProcess(
+ const Flags& _flags,
+ const hashmap<string, string>& _hierarchies,
+ const vector<string>& _subsystems)
+ : flags(_flags),
+ hierarchies(_hierarchies),
+ subsystems(_subsystems) {}
+
+
+CgroupsCpushareIsolatorProcess::~CgroupsCpushareIsolatorProcess() {}
+
+
+Try<Isolator*> CgroupsCpushareIsolatorProcess::create(const Flags& flags)
+{
+ Try<string> hierarchyCpu = cgroups::prepare(
+ flags.cgroups_hierarchy,
+ "cpu",
+ flags.cgroups_root);
+
+ if (hierarchyCpu.isError()) {
+ return Error(
+ "Failed to prepare hierarchy for cpu subsystem: " +
+ hierarchyCpu.error());
+ }
+
+ Try<string> hierarchyCpuacct = cgroups::prepare(
+ flags.cgroups_hierarchy,
+ "cpuacct",
+ flags.cgroups_root);
+
+ if (hierarchyCpuacct.isError()) {
+ return Error(
+ "Failed to prepare hierarchy for cpuacct subsystem: " +
+ hierarchyCpuacct.error());
+ }
+
+ hashmap<string, string> hierarchies;
+ vector<string> subsystems;
+
+ hierarchies["cpu"] = hierarchyCpu.get();
+ hierarchies["cpuacct"] = hierarchyCpuacct.get();
+
+ if (hierarchyCpu.get() == hierarchyCpuacct.get()) {
+ // Subsystem cpu and cpuacct are co-mounted (e.g., systemd).
+ hierarchies["cpu,cpuacct"] = hierarchyCpu.get();
+ subsystems.push_back("cpu,cpuacct");
+
+ // Ensure that no other subsystem is attached to the hierarchy.
+ Try<set<string>> _subsystems = cgroups::subsystems(hierarchyCpu.get());
+ if (_subsystems.isError()) {
+ return Error(
+ "Failed to get the list of attached subsystems for hierarchy " +
+ hierarchyCpu.get());
+ } else if (_subsystems.get().size() != 2) {
+ return Error(
+ "Unexpected subsystems found attached to the hierarchy " +
+ hierarchyCpu.get());
+ }
+ } else {
+ // Subsystem cpu and cpuacct are mounted separately.
+ subsystems.push_back("cpu");
+ subsystems.push_back("cpuacct");
+
+ // Ensure that no other subsystem is attached to each of the
+ // hierarchy.
+ Try<set<string>> _subsystems = cgroups::subsystems(hierarchyCpu.get());
+ if (_subsystems.isError()) {
+ return Error(
+ "Failed to get the list of attached subsystems for hierarchy " +
+ hierarchyCpu.get());
+ } else if (_subsystems.get().size() != 1) {
+ return Error(
+ "Unexpected subsystems found attached to the hierarchy " +
+ hierarchyCpu.get());
+ }
+
+ _subsystems = cgroups::subsystems(hierarchyCpuacct.get());
+ if (_subsystems.isError()) {
+ return Error(
+ "Failed to get the list of attached subsystems for hierarchy " +
+ hierarchyCpuacct.get());
+ } else if (_subsystems.get().size() != 1) {
+ return Error(
+ "Unexpected subsystems found attached to the hierarchy " +
+ hierarchyCpuacct.get());
+ }
+ }
+
+ if (flags.cgroups_enable_cfs) {
+ Try<bool> exists = cgroups::exists(
+ hierarchies["cpu"],
+ flags.cgroups_root,
+ "cpu.cfs_quota_us");
+
+ if (exists.isError() || !exists.get()) {
+ return Error(
+ "Failed to find 'cpu.cfs_quota_us'. Your kernel "
+ "might be too old to use the CFS cgroups feature.");
+ }
+ }
+
+ process::Owned<MesosIsolatorProcess> process(
+ new CgroupsCpushareIsolatorProcess(flags, hierarchies, subsystems));
+
+ return new MesosIsolator(process);
+}
+
+
+Future<Nothing> CgroupsCpushareIsolatorProcess::recover(
+ const list<ContainerState>& states,
+ const hashset<ContainerID>& orphans)
+{
+ foreach (const ContainerState& state, states) {
+ const ContainerID& containerId = state.container_id();
+ const string cgroup = path::join(flags.cgroups_root, containerId.value());
+
+ Try<bool> exists = cgroups::exists(hierarchies["cpu"], cgroup);
+ if (exists.isError()) {
+ foreachvalue (Info* info, infos) {
+ delete info;
+ }
+ infos.clear();
+ return Failure(
+ "Failed to check cgroup for container " + stringify(containerId));
+ }
+
+ if (!exists.get()) {
+ // This may occur if the executor has exited and the isolator
+ // has destroyed the cgroup but the slave dies before noticing
+ // this. This will be detected when the containerizer tries to
+ // monitor the executor's pid.
+ LOG(WARNING) << "Couldn't find cgroup for container " << containerId;
+ continue;
+ }
+
+ infos[containerId] = new Info(containerId, cgroup);
+ }
+
+ // Remove orphan cgroups.
+ foreach (const string& subsystem, subsystems) {
+ Try<vector<string>> cgroups = cgroups::get(
+ hierarchies[subsystem],
+ flags.cgroups_root);
+
+ if (cgroups.isError()) {
+ foreachvalue (Info* info, infos) {
+ delete info;
+ }
+ infos.clear();
+ return Failure(cgroups.error());
+ }
+
+ foreach (const string& cgroup, cgroups.get()) {
+ // Ignore the slave cgroup (see the --slave_subsystems flag).
+ // TODO(idownes): Remove this when the cgroups layout is
+ // updated, see MESOS-1185.
+ if (cgroup == path::join(flags.cgroups_root, "slave")) {
+ continue;
+ }
+
+ ContainerID containerId;
+ containerId.set_value(Path(cgroup).basename());
+
+ if (infos.contains(containerId)) {
+ continue;
+ }
+
+ // Known orphan cgroups will be destroyed by the containerizer
+ // using the normal cleanup path. See MESOS-2367 for details.
+ if (orphans.contains(containerId)) {
+ infos[containerId] = new Info(containerId, cgroup);
+ continue;
+ }
+
+ LOG(INFO) << "Removing unknown orphaned cgroup '"
+ << path::join(subsystem, cgroup) << "'";
+
+ // We don't wait on the destroy as we don't want to block recovery.
+ cgroups::destroy(
+ hierarchies[subsystem],
+ cgroup,
+ cgroups::DESTROY_TIMEOUT);
+ }
+ }
+
+ return Nothing();
+}
+
+
+Future<Option<ContainerPrepareInfo>> CgroupsCpushareIsolatorProcess::prepare(
+ const ContainerID& containerId,
+ const ExecutorInfo& executorInfo,
+ const string& directory,
+ const Option<string>& user)
+{
+ if (infos.contains(containerId)) {
+ return Failure("Container has already been prepared");
+ }
+
+ // TODO(bmahler): Don't insert into 'infos' unless we create the
+ // cgroup successfully. It's safe for now because 'cleanup' gets
+ // called if we return a Failure, but cleanup will fail because the
+ // cgroup does not exist when cgroups::destroy is called.
+ Info* info = new Info(
+ containerId, path::join(flags.cgroups_root, containerId.value()));
+
+ infos[containerId] = info;
+
+ foreach (const string& subsystem, subsystems) {
+ Try<bool> exists = cgroups::exists(hierarchies[subsystem], info->cgroup);
+ if (exists.isError()) {
+ return Failure("Failed to prepare isolator: " + exists.error());
+ } else if (exists.get()) {
+ return Failure("Failed to prepare isolator: cgroup already exists");
+ }
+
+ Try<Nothing> create = cgroups::create(hierarchies[subsystem], info->cgroup);
+ if (create.isError()) {
+ return Failure("Failed to prepare isolator: " + create.error());
+ }
+
+ // Chown the cgroup so the executor can create nested cgroups. Do
+ // not recurse so the control files are still owned by the slave
+ // user and thus cannot be changed by the executor.
+ if (user.isSome()) {
+ Try<Nothing> chown = os::chown(
+ user.get(),
+ path::join(hierarchies[subsystem], info->cgroup),
+ false);
+ if (chown.isError()) {
+ return Failure("Failed to prepare isolator: " + chown.error());
+ }
+ }
+ }
+
+ return update(containerId, executorInfo.resources())
+ .then([]() -> Future<Option<ContainerPrepareInfo>> {
+ return None();
+ });
+}
+
+
+Future<Nothing> CgroupsCpushareIsolatorProcess::isolate(
+ const ContainerID& containerId,
+ pid_t pid)
+{
+ if (!infos.contains(containerId)) {
+ return Failure("Unknown container");
+ }
+
+ Info* info = CHECK_NOTNULL(infos[containerId]);
+
+ CHECK_NONE(info->pid);
+ info->pid = pid;
+
+ foreach (const string& subsystem, subsystems) {
+ Try<Nothing> assign = cgroups::assign(
+ hierarchies[subsystem],
+ info->cgroup,
+ pid);
+
+ if (assign.isError()) {
+ LOG(ERROR) << "Failed to assign container '" << info->containerId
+ << " to its own cgroup '"
+ << path::join(hierarchies[subsystem], info->cgroup)
+ << "' : " << assign.error();
+
+ return Failure("Failed to isolate container: " + assign.error());
+ }
+ }
+
+ return Nothing();
+}
+
+
+Future<ContainerLimitation> CgroupsCpushareIsolatorProcess::watch(
+ const ContainerID& containerId)
+{
+ if (!infos.contains(containerId)) {
+ return Failure("Unknown container");
+ }
+
+ CHECK_NOTNULL(infos[containerId]);
+
+ return infos[containerId]->limitation.future();
+}
+
+
+Future<Nothing> CgroupsCpushareIsolatorProcess::update(
+ const ContainerID& containerId,
+ const Resources& resources)
+{
+ if (resources.cpus().isNone()) {
+ return Failure("No cpus resource given");
+ }
+
+ if (!infos.contains(containerId)) {
+ return Failure("Unknown container");
+ }
+
+ const Option<string>& hierarchy = hierarchies.get("cpu");
+ if (hierarchy.isNone()) {
+ return Failure("No 'cpu' hierarchy");
+ }
+
+ Info* info = CHECK_NOTNULL(infos[containerId]);
+ info->resources = resources;
+
+ double cpus = resources.cpus().get();
+
+ // Always set cpu.shares.
+ uint64_t shares;
+
+ if (flags.revocable_cpu_low_priority &&
+ resources.revocable().cpus().isSome()) {
+ shares = std::max(
+ (uint64_t) (CPU_SHARES_PER_CPU_REVOCABLE * cpus),
+ MIN_CPU_SHARES);
+ } else {
+ shares = std::max(
+ (uint64_t) (CPU_SHARES_PER_CPU * cpus),
+ MIN_CPU_SHARES);
+ }
+
+ Try<Nothing> write = cgroups::cpu::shares(
+ hierarchy.get(),
+ info->cgroup,
+ shares);
+
+ if (write.isError()) {
+ return Failure("Failed to update 'cpu.shares': " + write.error());
+ }
+
+ LOG(INFO) << "Updated 'cpu.shares' to " << shares
+ << " (cpus " << cpus << ")"
+ << " for container " << containerId;
+
+ // Set cfs quota if enabled.
+ if (flags.cgroups_enable_cfs) {
+ write = cgroups::cpu::cfs_period_us(
+ hierarchy.get(),
+ info->cgroup,
+ CPU_CFS_PERIOD);
+
+ if (write.isError()) {
+ return Failure("Failed to update 'cpu.cfs_period_us': " + write.error());
+ }
+
+ Duration quota = std::max(CPU_CFS_PERIOD * cpus, MIN_CPU_CFS_QUOTA);
+
+ write = cgroups::cpu::cfs_quota_us(hierarchy.get(), info->cgroup, quota);
+ if (write.isError()) {
+ return Failure("Failed to update 'cpu.cfs_quota_us': " + write.error());
+ }
+
+ LOG(INFO) << "Updated 'cpu.cfs_period_us' to " << CPU_CFS_PERIOD
+ << " and 'cpu.cfs_quota_us' to " << quota
+ << " (cpus " << cpus << ")"
+ << " for container " << containerId;
+ }
+
+ return Nothing();
+}
+
+
+Future<ResourceStatistics> CgroupsCpushareIsolatorProcess::usage(
+ const ContainerID& containerId)
+{
+ if (!infos.contains(containerId)) {
+ return Failure("Unknown container");
+ }
+
+ Info* info = CHECK_NOTNULL(infos[containerId]);
+
+ ResourceStatistics result;
+
+ // TODO(chzhcn): Getting the number of processes and threads is
+ // available as long as any cgroup subsystem is used so this best
+ // not be tied to a specific cgroup isolator. A better place is
+ // probably Linux Launcher, which uses the cgroup freezer subsystem.
+ // That requires some change for it to adopt the new semantics of
+ // reporting subsystem-independent cgroup usage.
+ // Note: The complexity of this operation is linear to the number of
+ // processes and threads in a container: the kernel has to allocate
+ // memory to contain the list of pids or tids; the userspace has to
+ // parse the cgroup files to get the size. If this proves to be a
+ // performance bottleneck, some kind of rate limiting mechanism
+ // needs to be employed.
+ if (flags.cgroups_cpu_enable_pids_and_tids_count) {
+ Try<std::set<pid_t>> pids =
+ cgroups::processes(hierarchies["cpuacct"], info->cgroup);
+ if (pids.isError()) {
+ return Failure("Failed to get number of processes: " + pids.error());
+ }
+
+ result.set_processes(pids.get().size());
+
+ Try<std::set<pid_t>> tids =
+ cgroups::threads(hierarchies["cpuacct"], info->cgroup);
+ if (tids.isError()) {
+ return Failure("Failed to get number of threads: " + tids.error());
+ }
+
+ result.set_threads(tids.get().size());
+ }
+
+ // Get the number of clock ticks, used for cpu accounting.
+ static long ticks = sysconf(_SC_CLK_TCK);
+
+ PCHECK(ticks > 0) << "Failed to get sysconf(_SC_CLK_TCK)";
+
+ // Add the cpuacct.stat information.
+ Try<hashmap<string, uint64_t>> stat = cgroups::stat(
+ hierarchies["cpuacct"],
+ info->cgroup,
+ "cpuacct.stat");
+
+ if (stat.isError()) {
+ return Failure("Failed to read cpuacct.stat: " + stat.error());
+ }
+
+ // TODO(bmahler): Add namespacing to cgroups to enforce the expected
+ // structure, e.g., cgroups::cpuacct::stat.
+ Option<uint64_t> user = stat.get().get("user");
+ Option<uint64_t> system = stat.get().get("system");
+
+ if (user.isSome() && system.isSome()) {
+ result.set_cpus_user_time_secs((double) user.get() / (double) ticks);
+ result.set_cpus_system_time_secs((double) system.get() / (double) ticks);
+ }
+
+ // Add the cpu.stat information only if CFS is enabled.
+ if (flags.cgroups_enable_cfs) {
+ stat = cgroups::stat(hierarchies["cpu"], info->cgroup, "cpu.stat");
+ if (stat.isError()) {
+ return Failure("Failed to read cpu.stat: " + stat.error());
+ }
+
+ Option<uint64_t> nr_periods = stat.get().get("nr_periods");
+ if (nr_periods.isSome()) {
+ result.set_cpus_nr_periods(nr_periods.get());
+ }
+
+ Option<uint64_t> nr_throttled = stat.get().get("nr_throttled");
+ if (nr_throttled.isSome()) {
+ result.set_cpus_nr_throttled(nr_throttled.get());
+ }
+
+ Option<uint64_t> throttled_time = stat.get().get("throttled_time");
+ if (throttled_time.isSome()) {
+ result.set_cpus_throttled_time_secs(
+ Nanoseconds(throttled_time.get()).secs());
+ }
+ }
+
+ return result;
+}
+
+
+Future<Nothing> CgroupsCpushareIsolatorProcess::cleanup(
+ const ContainerID& containerId)
+{
+ // Multiple calls may occur during test clean up.
+ if (!infos.contains(containerId)) {
+ VLOG(1) << "Ignoring cleanup request for unknown container: "
+ << containerId;
+
+ return Nothing();
+ }
+
+ Info* info = CHECK_NOTNULL(infos[containerId]);
+
+ list<Future<Nothing>> futures;
+ foreach (const string& subsystem, subsystems) {
+ futures.push_back(cgroups::destroy(
+ hierarchies[subsystem],
+ info->cgroup,
+ cgroups::DESTROY_TIMEOUT));
+ }
+
+ return collect(futures)
+ .onAny(defer(PID<CgroupsCpushareIsolatorProcess>(this),
+ &CgroupsCpushareIsolatorProcess::_cleanup,
+ containerId,
+ lambda::_1))
+ .then([]() { return Nothing(); });
+}
+
+
+Future<list<Nothing>> CgroupsCpushareIsolatorProcess::_cleanup(
+ const ContainerID& containerId,
+ const Future<list<Nothing>>& future)
+{
+ if (!infos.contains(containerId)) {
+ return Failure("Unknown container");
+ }
+
+ CHECK_NOTNULL(infos[containerId]);
+
+ if (!future.isReady()) {
+ return Failure(
+ "Failed to clean up container " + stringify(containerId) +
+ " : " + (future.isFailed() ? future.failure() : "discarded"));
+ }
+
+ delete infos[containerId];
+ infos.erase(containerId);
+
+ return future;
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/cgroups/cpushare.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/cpushare.hpp b/src/slave/containerizer/mesos/isolators/cgroups/cpushare.hpp
new file mode 100644
index 0000000..4ce37bf
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/cgroups/cpushare.hpp
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CPUSHARE_ISOLATOR_HPP__
+#define __CPUSHARE_ISOLATOR_HPP__
+
+#include <sys/types.h>
+
+#include <string>
+#include <vector>
+
+#include <process/future.hpp>
+
+#include <stout/hashmap.hpp>
+#include <stout/option.hpp>
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+#include "slave/containerizer/mesos/isolators/cgroups/constants.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Isolator that uses the Linux cpu cgroup controller for cpu isolation,
+// which relies on the Completely Fair Scheduler (CFS):
+// - cpushare implements proportionally weighted scheduling.
+// - cfs implements hard quota based scheduling.
+class CgroupsCpushareIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+ static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+ virtual ~CgroupsCpushareIsolatorProcess();
+
+ virtual process::Future<Nothing> recover(
+ const std::list<mesos::slave::ContainerState>& states,
+ const hashset<ContainerID>& orphans);
+
+ virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+ const ContainerID& containerId,
+ const ExecutorInfo& executorInfo,
+ const std::string& directory,
+ const Option<std::string>& user);
+
+ virtual process::Future<Nothing> isolate(
+ const ContainerID& containerId,
+ pid_t pid);
+
+ virtual process::Future<mesos::slave::ContainerLimitation> watch(
+ const ContainerID& containerId);
+
+ virtual process::Future<Nothing> update(
+ const ContainerID& containerId,
+ const Resources& resources);
+
+ virtual process::Future<ResourceStatistics> usage(
+ const ContainerID& containerId);
+ // Destroys the container's cgroup in the hierarchy of each subsystem.
+ virtual process::Future<Nothing> cleanup(
+ const ContainerID& containerId);
+
+private:
+ CgroupsCpushareIsolatorProcess(
+ const Flags& flags,
+ const hashmap<std::string, std::string>& hierarchies,
+ const std::vector<std::string>& subsystems);
+ // Completes cleanup(): frees the Info once the cgroup destroys succeed.
+ virtual process::Future<std::list<Nothing>> _cleanup(
+ const ContainerID& containerId,
+ const process::Future<std::list<Nothing>>& future);
+ // State kept for one container; 'infos' below maps ContainerID to it.
+ struct Info
+ {
+ Info(const ContainerID& _containerId, const std::string& _cgroup)
+ : containerId(_containerId), cgroup(_cgroup) {}
+
+ const ContainerID containerId;
+ const std::string cgroup;
+ Option<pid_t> pid;
+ Option<Resources> resources;
+ // NOTE(review): presumably backs the future handed out by watch(); confirm.
+ process::Promise<mesos::slave::ContainerLimitation> limitation;
+ };
+
+ const Flags flags;
+
+ // Map from subsystem to hierarchy.
+ hashmap<std::string, std::string> hierarchies;
+
+ // Subsystems used for this isolator. Typically, there are two
+ // elements in the vector: 'cpu' and 'cpuacct'. If cpu and cpuacct
+ // systems are co-mounted (e.g., systems using systemd), then there
+ // will be only one element in the vector which is 'cpu,cpuacct'.
+ std::vector<std::string> subsystems;
+
+ // TODO(bmahler): Use Owned<Info>.
+ hashmap<ContainerID, Info*> infos;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __CPUSHARE_ISOLATOR_HPP__