You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@mesos.apache.org by ji...@apache.org on 2015/10/26 19:40:25 UTC

[01/12] mesos git commit: Relocated MesosContainerizer specific files to the correct location.

Repository: mesos
Updated Branches:
  refs/heads/master 974906bf0 -> 9a722d742


http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/store.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/store.cpp b/src/slave/containerizer/provisioner/docker/store.cpp
deleted file mode 100644
index 5034013..0000000
--- a/src/slave/containerizer/provisioner/docker/store.cpp
+++ /dev/null
@@ -1,277 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "slave/containerizer/provisioner/docker/store.hpp"
-
-#include <list>
-#include <vector>
-
-#include <glog/logging.h>
-
-#include <stout/json.hpp>
-#include <stout/os.hpp>
-#include <stout/result.hpp>
-
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/dispatch.hpp>
-#include <process/subprocess.hpp>
-
-#include "common/status_utils.hpp"
-
-#include "slave/containerizer/provisioner/docker/metadata_manager.hpp"
-#include "slave/containerizer/provisioner/docker/paths.hpp"
-#include "slave/containerizer/provisioner/docker/puller.hpp"
-
-#include "slave/flags.hpp"
-
-using namespace process;
-
-using std::list;
-using std::pair;
-using std::string;
-using std::vector;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-// Libprocess actor that implements the Docker image store. Its public
-// methods are invoked through `dispatch` by the `Store` wrapper.
-class StoreProcess : public Process<StoreProcess>
-{
-public:
-  StoreProcess(
-      const Flags& _flags,
-      const Owned<MetadataManager>& _metadataManager,
-      const Owned<Puller>& _puller)
-    : flags(_flags), metadataManager(_metadataManager), puller(_puller) {}
-
-  ~StoreProcess() {}
-
-  // Delegates recovery to the metadata manager.
-  Future<Nothing> recover();
-
-  // Resolves a Docker image to the rootfs paths of its layers, pulling
-  // the image first if the metadata manager does not know it. Fails for
-  // images whose type is not DOCKER.
-  Future<vector<string>> get(const mesos::Image& image);
-
-private:
-  // Continuation of `get()`: returns `image` if it is set, otherwise
-  // pulls `name` into a temporary staging directory, moves its layers
-  // into the store and records the image.
-  Future<Image> _get(
-      const Image::Name& name,
-      const Option<Image>& image);
-
-  // Maps every layer id of `image` to that layer's rootfs path in the
-  // store.
-  Future<vector<string>> __get(const Image& image);
-
-  // Moves every (layer id, staged path) pair into the store and yields
-  // the layer ids. `staging` is not used by the implementation.
-  Future<vector<string>> moveLayers(
-      const std::string& staging,
-      const std::list<pair<string, string>>& layerPaths);
-
-  // Records the image name and its layer ids via the metadata manager.
-  Future<Image> storeImage(
-      const Image::Name& name,
-      const std::vector<std::string>& layerIds);
-
-  // Moves one staged layer (id, staged path) to its rootfs path in the
-  // store, creating the layer directory if needed.
-  Future<Nothing> moveLayer(const pair<string, string>& layerPath);
-
-  const Flags flags;
-  Owned<MetadataManager> metadataManager;
-  Owned<Puller> puller;
-};
-
-
-// Builds a Docker store: creates the puller, makes sure the store and
-// staging directories exist, creates the metadata manager and wraps
-// everything in a spawned StoreProcess.
-Try<Owned<slave::Store>> Store::create(const Flags& flags)
-{
-  // Create the puller before touching the filesystem.
-  Try<Owned<Puller>> puller = Puller::create(flags);
-  if (puller.isError()) {
-    return Error("Failed to create Docker puller: " + puller.error());
-  }
-
-  // Make sure the store root exists.
-  const string& storeDir = flags.docker_store_dir;
-  if (!os::exists(storeDir)) {
-    Try<Nothing> created = os::mkdir(storeDir);
-    if (created.isError()) {
-      return Error(
-          "Failed to create Docker store directory: " + created.error());
-    }
-  }
-
-  // Make sure the staging area of the store exists.
-  const string stagingDir = paths::getStagingDir(storeDir);
-  if (!os::exists(stagingDir)) {
-    Try<Nothing> created = os::mkdir(stagingDir);
-    if (created.isError()) {
-      return Error(
-          "Failed to create Docker store staging directory: " +
-          created.error());
-    }
-  }
-
-  Try<Owned<MetadataManager>> metadataManager = MetadataManager::create(flags);
-  if (metadataManager.isError()) {
-    return Error(metadataManager.error());
-  }
-
-  Owned<StoreProcess> process(
-      new StoreProcess(flags, metadataManager.get(), puller.get()));
-
-  return Owned<slave::Store>(new Store(process));
-}
-
-
-// Keeps the given process and spawns it. CHECK_NOTNULL guards against a
-// null process pointer.
-Store::Store(const Owned<StoreProcess>& _process) : process(_process)
-{
-  process::spawn(CHECK_NOTNULL(process.get()));
-}
-
-
-// Terminates the underlying process and waits for it before the store
-// goes away.
-Store::~Store()
-{
-  process::terminate(process.get());
-  process::wait(process.get());
-}
-
-
-// Forwards to StoreProcess::recover via dispatch.
-Future<Nothing> Store::recover()
-{
-  return dispatch(process.get(), &StoreProcess::recover);
-}
-
-
-// Forwards to StoreProcess::get via dispatch.
-Future<vector<string>> Store::get(const mesos::Image& image)
-{
-  return dispatch(process.get(), &StoreProcess::get, image);
-}
-
-
-// Resolves `image` to the rootfs paths of its layers.
-//
-// Only DOCKER images are accepted; any other type yields a Failure. The
-// image name is parsed, looked up in the metadata manager, and the
-// result is passed through `_get` (pull if missing) and `__get` (layer
-// ids to rootfs paths).
-Future<vector<string>> StoreProcess::get(const mesos::Image& image)
-{
-  if (image.type() != mesos::Image::DOCKER) {
-    return Failure("Docker provisioner store only supports Docker images");
-  }
-
-  Image::Name imageName = parseImageName(image.docker().name());
-
-  // Both continuations are deferred to this process.
-  return metadataManager->get(imageName)
-    .then(defer(self(), &Self::_get, imageName, lambda::_1))
-    .then(defer(self(), &Self::__get, lambda::_1));
-}
-
-
-// Continuation of `get()`.
-//
-// If the metadata manager already returned an image it is passed
-// through unchanged. Otherwise the image is pulled into a freshly
-// created temporary staging directory, its layers are moved into the
-// store and the image is recorded. The staging directory is removed
-// when that chain finishes; a failed removal is only logged.
-Future<Image> StoreProcess::_get(
-    const Image::Name& name,
-    const Option<Image>& image)
-{
-  if (image.isSome()) {
-    return image.get();
-  }
-
-  Try<string> staging =
-    os::mkdtemp(paths::getStagingTempDir(flags.docker_store_dir));
-
-  if (staging.isError()) {
-    // Include the underlying error, as the other failure paths in this
-    // file do, so that the cause can be diagnosed.
-    return Failure(
-        "Failed to create a staging directory: " + staging.error());
-  }
-
-  return puller->pull(name, staging.get())
-    .then(defer(self(), &Self::moveLayers, staging.get(), lambda::_1))
-    .then(defer(self(), &Self::storeImage, name, lambda::_1))
-    .onAny([staging]() {
-      Try<Nothing> rmdir = os::rmdir(staging.get());
-      if (rmdir.isError()) {
-        LOG(WARNING) << "Failed to remove staging directory: " << rmdir.error();
-      }
-    });
-}
-
-
-// Translates every layer id of `image` into the path of that layer's
-// rootfs inside the store, keeping the order of the ids.
-Future<vector<string>> StoreProcess::__get(const Image& image)
-{
-  vector<string> rootfsPaths;
-
-  for (const string& layerId : image.layer_ids()) {
-    const string rootfs =
-      paths::getImageLayerRootfsPath(flags.docker_store_dir, layerId);
-
-    rootfsPaths.push_back(rootfs);
-  }
-
-  return rootfsPaths;
-}
-
-
-// Recovery of the store is delegated to the metadata manager.
-Future<Nothing> StoreProcess::recover()
-{
-  return metadataManager->recover();
-}
-
-
-// Moves every staged layer into the store and, once all moves have
-// completed, yields the layer ids in the order they were given.
-//
-// Each element of `layerPaths` is a (layer id, staged path) pair.
-// NOTE: `staging` is not used by this function.
-Future<vector<string>> StoreProcess::moveLayers(
-    const string& staging,
-    const list<pair<string, string>>& layerPaths)
-{
-  list<Future<Nothing>> moves;
-  for (const pair<string, string>& layerPath : layerPaths) {
-    moves.push_back(moveLayer(layerPath));
-  }
-
-  return collect(moves)
-    .then([layerPaths]() {
-      // Only the ids (first element of each pair) are reported back.
-      vector<string> ids;
-      for (const pair<string, string>& layerPath : layerPaths) {
-        ids.push_back(layerPath.first);
-      }
-
-      return ids;
-    });
-}
-
-
-// Records the image `name` together with its layer ids by handing both
-// to the metadata manager.
-Future<Image> StoreProcess::storeImage(
-    const Image::Name& name,
-    const vector<string>& layerIds)
-{
-  return metadataManager->put(name, layerIds);
-}
-
-
-// Moves a single staged layer into the store.
-//
-// `layerPath` is a (layer id, staged path) pair. The staged path must
-// exist; the layer's directory in the store is created when missing and
-// the staged path is then renamed to the layer's rootfs path.
-Future<Nothing> StoreProcess::moveLayer(const pair<string, string>& layerPath)
-{
-  const string& layerId = layerPath.first;
-  const string& source = layerPath.second;
-
-  if (!os::exists(source)) {
-    return Failure("Unable to find layer '" + layerId + "' in '" +
-                   source + "'");
-  }
-
-  const string layerDir =
-    paths::getImageLayerPath(flags.docker_store_dir, layerId);
-
-  if (!os::exists(layerDir)) {
-    Try<Nothing> created = os::mkdir(layerDir);
-    if (created.isError()) {
-      return Failure("Failed to create layer path in store for id '" +
-                     layerId + "': " + created.error());
-    }
-  }
-
-  const string target =
-    paths::getImageLayerRootfsPath(flags.docker_store_dir, layerId);
-
-  Try<Nothing> moved = os::rename(source, target);
-  if (moved.isError()) {
-    return Failure("Failed to move layer '" + layerId +
-                   "' to store directory: " + moved.error());
-  }
-
-  return Nothing();
-}
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/store.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/store.hpp b/src/slave/containerizer/provisioner/docker/store.hpp
deleted file mode 100644
index b5e6c87..0000000
--- a/src/slave/containerizer/provisioner/docker/store.hpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_DOCKER_STORE_HPP__
-#define __PROVISIONER_DOCKER_STORE_HPP__
-
-#include <string>
-
-#include <stout/try.hpp>
-
-#include <process/future.hpp>
-
-#include "slave/containerizer/provisioner/store.hpp"
-
-#include "slave/flags.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-// Forward Declarations.
-class Puller;
-class StoreProcess;
-
-
-// Store fetches the Docker images and stores them on disk.
-//
-// Instances are obtained through `create()`; the constructor is private
-// and the class is neither copyable nor assignable. The work itself is
-// carried out by a StoreProcess owned by this object.
-class Store : public slave::Store
-{
-public:
-  // Creates a Docker store configured from `flags`, or an Error.
-  static Try<process::Owned<slave::Store>> create(const Flags& flags);
-
-  ~Store();
-
-  process::Future<Nothing> recover();
-
-  // Returns, for the given image, a list of strings describing its
-  // layers. NOTE(review): presumably the rootfs paths of the layers —
-  // confirm against the implementation.
-  process::Future<std::vector<std::string>> get(const mesos::Image& image);
-
-private:
-  explicit Store(const process::Owned<StoreProcess>& _process);
-
-  Store& operator=(const Store&) = delete; // Not assignable.
-  Store(const Store&) = delete; // Not copyable.
-
-  process::Owned<StoreProcess> process;
-};
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_DOCKER_STORE_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/token_manager.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/token_manager.cpp b/src/slave/containerizer/provisioner/docker/token_manager.cpp
deleted file mode 100644
index 18b29c3..0000000
--- a/src/slave/containerizer/provisioner/docker/token_manager.cpp
+++ /dev/null
@@ -1,362 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <process/defer.hpp>
-#include <process/dispatch.hpp>
-
-#include "slave/containerizer/provisioner/docker/token_manager.hpp"
-
-using std::hash;
-using std::string;
-using std::vector;
-
-using process::Clock;
-using process::Failure;
-using process::Future;
-using process::Owned;
-using process::Process;
-using process::Time;
-
-using process::http::Request;
-using process::http::Response;
-using process::http::URL;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-namespace registry {
-
-// Libprocess actor behind `TokenManager`: requests tokens from the
-// realm URL it was created with and caches them per (service, scope).
-class TokenManagerProcess : public Process<TokenManagerProcess>
-{
-public:
-  static Try<Owned<TokenManagerProcess>> create(const URL& realm);
-
-  // Returns the cached token for (service, scope) when it is valid,
-  // otherwise requests one from the realm. `account`, when set, is only
-  // added to the request query; it is not part of the cache key.
-  Future<Token> getToken(
-      const string& service,
-      const string& scope,
-      const Option<string>& account);
-
-private:
-  // Path set on the realm URL when requesting a token.
-  static const string TOKEN_PATH_PREFIX;
-  // How long to wait for the token response before failing.
-  static const Duration RESPONSE_TIMEOUT;
-
-  TokenManagerProcess(const URL& realm)
-    : realm_(realm) {}
-
-  // Parses the response body as JSON and builds a Token from its
-  // "token" field.
-  Try<Token> getTokenFromResponse(const Response& response) const;
-
-  /**
-   * Key for the token cache.
-   */
-  struct TokenCacheKey
-  {
-    string service;
-    string scope;
-  };
-
-  // Hash over both key fields; the scope hash is shifted by one bit
-  // before being xor-ed with the service hash.
-  struct TokenCacheKeyHash
-  {
-    size_t operator()(const TokenCacheKey& key) const
-    {
-      hash<string> hashFn;
-
-      return (hashFn(key.service) ^
-          (hashFn(key.scope) << 1));
-    }
-  };
-
-  // Two keys are equal when both service and scope match.
-  struct TokenCacheKeyEqual
-  {
-    bool operator()(
-        const TokenCacheKey& left,
-        const TokenCacheKey& right) const
-    {
-      return ((left.service == right.service) &&
-          (left.scope == right.scope));
-    }
-  };
-
-  typedef hashmap<
-    const TokenCacheKey,
-    Token,
-    TokenCacheKeyHash,
-    TokenCacheKeyEqual> TokenCacheType;
-
-  const URL realm_;
-  TokenCacheType tokenCache_;
-
-  TokenManagerProcess(const TokenManagerProcess&) = delete;
-  TokenManagerProcess& operator=(const TokenManagerProcess&) = delete;
-};
-
-// Time allowed for the token HTTP request before it is failed.
-const Duration TokenManagerProcess::RESPONSE_TIMEOUT = Seconds(10);
-// Path on the realm URL used for token requests.
-const string TokenManagerProcess::TOKEN_PATH_PREFIX = "/v2/token/";
-
-
-// Plain member-wise constructor; all parsing and validation happens in
-// `Token::create`.
-Token::Token(
-    const string& _raw,
-    const JSON::Object& _header,
-    const JSON::Object& _claims,
-    const Option<Time>& _expiration,
-    const Option<Time>& _notBefore)
-  : raw(_raw),
-    header(_header),
-    claims(_claims),
-    expiration(_expiration),
-    notBefore(_notBefore) {}
-
-
-// TODO(josephw): Parse this string with some protobufs.
-//
-// Parses a raw token made of three '.'-separated segments. The first
-// two segments (header and claims) are base64 decoded and parsed as
-// JSON objects; the optional "exp" and "nbf" claims are converted to
-// Time values. Returns an Error if the string is malformed, a segment
-// cannot be decoded, or the token has already expired.
-Try<Token> Token::create(const string& raw)
-{
-  // Decodes one base64 encoded segment into a JSON object. Trailing '='
-  // padding is restored first, for segments that arrive without it.
-  auto decode = [](
-      const string& segment) -> Try<JSON::Object> {
-    const auto remainder = segment.length() % 4;
-
-    // A base64 string is never one character past a multiple of four.
-    if (remainder == 1) {
-      return Error("Invalid base64 segment length");
-    }
-
-    string paddedSegment(segment);
-
-    // Pad up to the next multiple of four: a remainder of 2 needs "=="
-    // and a remainder of 3 needs a single "=".
-    if (remainder != 0) {
-      paddedSegment.append(4 - remainder, '=');
-    }
-
-    Try<string> decoded = base64::decode(paddedSegment);
-    if (decoded.isError()) {
-      return Error(decoded.error());
-    }
-
-    return JSON::parse<JSON::Object>(decoded.get());
-  };
-
-  const vector<string> tokens = strings::tokenize(raw, ".");
-
-  if (tokens.size() != 3) {
-    return Error("Invalid raw token string");
-  }
-
-  Try<JSON::Object> header = decode(tokens[0]);
-  if (header.isError()) {
-    return Error("Failed to decode 'header' segment: " + header.error());
-  }
-
-  Try<JSON::Object> claims = decode(tokens[1]);
-  if (claims.isError()) {
-    return Error("Failed to decode 'claims' segment: " + claims.error());
-  }
-
-  Result<Time> expirationTime = getTimeValue(claims.get(), "exp");
-  if (expirationTime.isError()) {
-    return Error("Failed to decode expiration time: " + expirationTime.error());
-  }
-
-  Option<Time> expiration;
-  if (expirationTime.isSome()) {
-    expiration = expirationTime.get();
-  }
-
-  Result<Time> notBeforeTime = getTimeValue(claims.get(), "nbf");
-  if (notBeforeTime.isError()) {
-    return Error("Failed to decode not-before time: " + notBeforeTime.error());
-  }
-
-  Option<Time> notBefore;
-  if (notBeforeTime.isSome()) {
-    notBefore = notBeforeTime.get();
-  }
-
-  Token token(raw, header.get(), claims.get(), expiration, notBefore);
-
-  // An already expired token is rejected outright.
-  if (token.isExpired()) {
-    return Error("Token has expired");
-  }
-
-  // TODO(jojy): Add signature validation.
-  return token;
-}
-
-
-// Looks up `key` in `object` as a JSON number and converts it to a
-// Time.
-//
-// Returns None when the lookup yields nothing, the Time when a number
-// is found, and an Error when the lookup itself fails or the number
-// cannot be converted.
-Result<Time> Token::getTimeValue(const JSON::Object& object, const string& key)
-{
-  Result<JSON::Number> jsonValue = object.find<JSON::Number>(key);
-
-  // A failed lookup must not be mistaken for "not provided"; otherwise
-  // a malformed claim would be silently ignored by the caller.
-  if (jsonValue.isError()) {
-    return Error("Failed to find '" + key + "': " + jsonValue.error());
-  }
-
-  Option<Time> timeValue;
-
-  // If the value is provided, we will process it for future validations.
-  if (jsonValue.isSome()) {
-    Try<Time> time = Time::create(jsonValue.get().as<double>());
-    if (time.isError()) {
-      return Error("Failed to decode time: " + time.error());
-    }
-
-    timeValue = time.get();
-  }
-
-  return timeValue;
-}
-
-
-// True once the current time has reached the expiration time. A token
-// without an expiration time is never reported as expired.
-bool Token::isExpired() const
-{
-  if (expiration.isNone()) {
-    return false;
-  }
-
-  return Clock::now() >= expiration.get();
-}
-
-
-// A token is valid when it has not expired and, if it carries a
-// not-before time, that time has been reached.
-bool Token::isValid() const
-{
-  // TODO(jojy): Add signature validation.
-  if (isExpired()) {
-    return false;
-  }
-
-  if (notBefore.isNone()) {
-    return true;
-  }
-
-  return Clock::now() >= notBefore.get();
-}
-
-
-// Creates the backing TokenManagerProcess for `realm` and wraps it in a
-// TokenManager; an Error from the process factory is passed through.
-Try<Owned<TokenManager>> TokenManager::create(
-    const URL& realm)
-{
-  Try<Owned<TokenManagerProcess>> process = TokenManagerProcess::create(realm);
-  if (process.isError()) {
-    return Error(process.error());
-  }
-
-  return Owned<TokenManager>(new TokenManager(process.get()));
-}
-
-
-// Keeps the given process and spawns it. CHECK_NOTNULL guards against a
-// null process pointer.
-TokenManager::TokenManager(Owned<TokenManagerProcess>& process)
-  : process_(process)
-{
-  spawn(CHECK_NOTNULL(process_.get()));
-}
-
-
-// Terminates the backing process and waits for it.
-TokenManager::~TokenManager()
-{
-  terminate(process_.get());
-  process::wait(process_.get());
-}
-
-
-// Forwards the request to TokenManagerProcess::getToken via dispatch.
-Future<Token> TokenManager::getToken(
-    const string& service,
-    const string& scope,
-    const Option<string>& account)
-{
-  return dispatch(
-      process_.get(),
-      &TokenManagerProcess::getToken,
-      service,
-      scope,
-      account);
-}
-
-
-// Factory for the process; as written it always succeeds.
-Try<Owned<TokenManagerProcess>> TokenManagerProcess::create(const URL& realm)
-{
-  return Owned<TokenManagerProcess>(new TokenManagerProcess(realm));
-}
-
-
-// Extracts a Token from an HTTP response.
-//
-// The response body is parsed as a JSON object and its "token" string
-// field is handed to `Token::create`. Returns an Error if the body is
-// not valid JSON, if the "token" field cannot be read or is missing, or
-// if the token itself is rejected.
-Try<Token> TokenManagerProcess::getTokenFromResponse(
-    const Response& response) const
-{
-  Try<JSON::Object> tokenJSON = JSON::parse<JSON::Object>(response.body);
-  if (tokenJSON.isError()) {
-    return Error(tokenJSON.error());
-  }
-
-  Result<JSON::String> tokenString =
-    tokenJSON.get().find<JSON::String>("token");
-
-  if (tokenString.isError()) {
-    return Error(tokenString.error());
-  }
-
-  // `find` can also come back empty; calling get() on that would be
-  // invalid, so report it as an error instead.
-  if (tokenString.isNone()) {
-    return Error("Failed to find 'token' in the response");
-  }
-
-  Try<Token> result = Token::create(tokenString.get().value);
-  if (result.isError()) {
-    return Error(result.error());
-  }
-
-  return result.get();
-}
-
-
-// Returns a token for (service, scope).
-//
-// A cached token is returned directly when it is valid. Otherwise a GET
-// request is sent to the realm URL with TOKEN_PATH_PREFIX as path and
-// "service", "scope" and (when set) "account" as query parameters. The
-// request fails after RESPONSE_TIMEOUT. A successfully parsed token is
-// cached before being returned.
-Future<Token> TokenManagerProcess::getToken(
-    const string& service,
-    const string& scope,
-    const Option<string>& account)
-{
-  const TokenCacheKey tokenKey = {service, scope};
-
-  if (tokenCache_.contains(tokenKey)) {
-    Token token = tokenCache_.at(tokenKey);
-
-    if (token.isValid()) {
-      return token;
-    }
-
-    LOG(WARNING) << "Cached token was invalid. Will fetch once again";
-
-    // Drop the stale entry. The `insert` below does not overwrite an
-    // existing key, so leaving it in place would keep the invalid token
-    // in the cache and force a fetch on every subsequent call.
-    tokenCache_.erase(tokenKey);
-  }
-
-  URL tokenUrl = realm_;
-  tokenUrl.path = TOKEN_PATH_PREFIX;
-
-  tokenUrl.query = {
-    {"service", service},
-    {"scope", scope},
-  };
-
-  if (account.isSome()) {
-    tokenUrl.query.insert({"account", account.get()});
-  }
-
-  return process::http::get(tokenUrl, None())
-    .after(RESPONSE_TIMEOUT, [] (Future<Response> resp) -> Future<Response> {
-      // Give up on the pending request and fail the chain.
-      resp.discard();
-      return Failure("Timeout waiting for response to token request");
-    })
-    .then(defer(self(), [this, tokenKey](
-        const Future<Response>& response) -> Future<Token> {
-      Try<Token> token = getTokenFromResponse(response.get());
-      if (token.isError()) {
-        return Failure(
-            "Failed to parse JSON Web Token object from response: " +
-            token.error());
-      }
-
-      tokenCache_.insert({tokenKey, token.get()});
-
-      return token.get();
-    }));
-}
-
-// TODO(jojy): Add implementation for basic authentication based getToken API.
-
-} // namespace registry {
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/token_manager.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/token_manager.hpp b/src/slave/containerizer/provisioner/docker/token_manager.hpp
deleted file mode 100644
index 2f4abff..0000000
--- a/src/slave/containerizer/provisioner/docker/token_manager.hpp
+++ /dev/null
@@ -1,179 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_DOCKER_TOKEN_MANAGER_HPP__
-#define __PROVISIONER_DOCKER_TOKEN_MANAGER_HPP__
-
-#include <functional>
-#include <string>
-
-#include <stout/base64.hpp>
-#include <stout/duration.hpp>
-#include <stout/hashmap.hpp>
-#include <stout/strings.hpp>
-
-#include <process/future.hpp>
-#include <process/http.hpp>
-#include <process/process.hpp>
-#include <process/time.hpp>
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-namespace registry {
-
-
-/**
- * Encapsulates JSON Web Token.
- *
- * Reference: https://tools.ietf.org/html/rfc7519.
- */
-struct Token
-{
-  /**
-   * Factory method for Token object.
-   *
-   * Parses the raw token string and validates for token's expiration.
-   *
-   * @returns Token if parsing and validation succeeds.
-   *          Error if parsing or validation fails.
-   */
-  static Try<Token> create(const std::string& rawString);
-
-  /**
-   * Compares token's expiration time(expressed in seconds) with current time.
-   *
-   * @returns True if the current time is greater than or equal to the
-   *          token's expiration time.
-   *          False if the expiration time is still in the future or the
-   *          token carries no expiration time.
-   */
-  bool isExpired() const;
-
-  /**
-   * Validates the token if its "exp" "nbf" values are in range.
-   *
-   * @returns True if current time is within token's "exp" and "nbf" values.
-   *          False if current time is not within token's "exp" and "nbf"
-   *          values.
-   */
-  bool isValid() const;
-
-  const std::string raw;
-  const JSON::Object header;
-  const JSON::Object claims;
-  // TODO(jojy): Add signature information.
-
-private:
-  // Member-wise constructor; use `create` to obtain a Token.
-  Token(
-      const std::string& raw,
-      const JSON::Object& headerJson,
-      const JSON::Object& claimsJson,
-      const Option<process::Time>& expireTime,
-      const Option<process::Time>& notBeforeTime);
-
-  // Reads the numeric value stored under `key` as a Time, if present.
-  static Result<process::Time> getTimeValue(
-      const JSON::Object& object,
-      const std::string& key);
-
-  const Option<process::Time> expiration;
-  const Option<process::Time> notBefore;
-};
-
-
-// Forward declaration.
-class TokenManagerProcess;
-
-
-/**
- *  Acquires and manages docker registry tokens. It keeps the tokens in its
- *  cache to serve any future request for the same token.
- *  The cache grows unbounded.
- *  TODO(jojy): The cache can be optimized to prune based on the expiry time of
- *  the token and server's issue time.
- */
-class TokenManager
-{
-public:
-  /**
-   * Factory method for creating TokenManager object.
-   *
-   * TokenManager and registry authorization realm has a 1:1 relationship.
-   *
-   * @param realm URL of the authorization server from where token will be
-   *     requested by this TokenManager.
-   * @returns Owned<TokenManager> if success.
-   *          Error on failure.
-   */
-  static Try<process::Owned<TokenManager>> create(
-      const process::http::URL& realm);
-
-  /**
-   * Returns JSON Web Token from cache or from remote server using "Basic
-   * authorization".
-   *
-   * NOTE(review): token_manager.cpp only carries a TODO for the basic
-   * authentication variant and no definition of this overload is visible
-   * there; confirm it is implemented before calling it.
-   *
-   * @param service Name of the service that hosts the resource for which
-   *     token is being requested.
-   * @param scope unique scope returned by the 401 Unauthorized response
-   *     from the registry.
-   * @param account Name of the account which the client is acting as.
-   * @param user base64 encoded userid for basic authorization.
-   * @param password base64 encoded password for basic authorization.
-   * @returns Token struct that encapsulates JSON Web Token.
-   */
-  process::Future<Token> getToken(
-      const std::string& service,
-      const std::string& scope,
-      const Option<std::string>& account,
-      const std::string& user,
-      const Option<std::string>& password);
-
-  /**
-   * Returns JSON Web Token from cache or from remote server using "TLS/Cert"
-   * based authorization.
-   *
-   * @param service Name of the service that hosts the resource for which
-   *     token is being requested.
-   * @param scope unique scope returned by the 401 Unauthorized response
-   *     from the registry.
-   * @param account Name of the account which the client is acting as.
-   * @returns Token struct that encapsulates JSON Web Token.
-   */
-  process::Future<Token> getToken(
-      const std::string& service,
-      const std::string& scope,
-      const Option<std::string>& account);
-
-  ~TokenManager();
-
-private:
-  // Use `create` to obtain an instance.
-  TokenManager(process::Owned<TokenManagerProcess>& process);
-
-  TokenManager(const TokenManager&) = delete;
-  TokenManager& operator=(const TokenManager&) = delete;
-
-  process::Owned<TokenManagerProcess> process_;
-};
-
-} // namespace registry {
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_DOCKER_TOKEN_MANAGER_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/paths.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/paths.cpp b/src/slave/containerizer/provisioner/paths.cpp
deleted file mode 100644
index 8d0f7d3..0000000
--- a/src/slave/containerizer/provisioner/paths.cpp
+++ /dev/null
@@ -1,192 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <list>
-
-#include <glog/logging.h>
-
-#include <mesos/type_utils.hpp>
-
-#include <stout/os.hpp>
-#include <stout/path.hpp>
-
-#include <stout/os/stat.hpp>
-
-#include "slave/paths.hpp"
-
-#include "slave/containerizer/provisioner/paths.hpp"
-
-using std::list;
-using std::string;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace provisioner {
-namespace paths {
-
-// Path of the "containers" entry under the provisioner directory.
-static string getContainersDir(const string& provisionerDir)
-{
-  return path::join(provisionerDir, "containers");
-}
-
-
-// Path of the "backends" entry under a container directory.
-static string getBackendsDir(const string& containerDir)
-{
-  return path::join(containerDir, "backends");
-}
-
-
-// Path of the entry named `backend` under a backends directory.
-static string getBackendDir(const string& backendsDir, const string& backend)
-{
-  return path::join(backendsDir, backend);
-}
-
-
-// Path of the "rootfses" entry under a backend directory.
-static string getRootfsesDir(const string& backendDir)
-{
-  return path::join(backendDir, "rootfses");
-}
-
-
-// Path of the entry named `rootfsId` under a rootfses directory.
-static string getRootfsDir(const string& rootfsesDir, const string& rootfsId)
-{
-  return path::join(rootfsesDir, rootfsId);
-}
-
-
-// Directory of one container: the entry named after the container id's
-// value under the provisioner's containers directory.
-string getContainerDir(
-    const string& provisionerDir,
-    const ContainerID& containerId)
-{
-  return path::join(getContainersDir(provisionerDir), containerId.value());
-}
-
-
-// Path of a container's rootfs, built level by level:
-// container dir -> backends -> <backend> -> rootfses -> <rootfsId>.
-string getContainerRootfsDir(
-    const string& provisionerDir,
-    const ContainerID& containerId,
-    const string& backend,
-    const string& rootfsId)
-{
-  const string containerDir = getContainerDir(provisionerDir, containerId);
-
-  const string backendDir =
-    getBackendDir(getBackendsDir(containerDir), backend);
-
-  return getRootfsDir(getRootfsesDir(backendDir), rootfsId);
-}
-
-
-// Lists the ids of all containers found under the provisioner
-// directory. Entries of the containers directory that are not
-// directories are logged and skipped; a missing containers directory
-// yields an empty set.
-Try<hashset<ContainerID>> listContainers(
-    const string& provisionerDir)
-{
-  hashset<ContainerID> containers;
-
-  const string containersDir = getContainersDir(provisionerDir);
-
-  // No container has been created yet.
-  if (!os::exists(containersDir)) {
-    return containers;
-  }
-
-  Try<list<string>> entries = os::ls(containersDir);
-  if (entries.isError()) {
-    return Error("Unable to list the containers directory: " +
-                 entries.error());
-  }
-
-  for (const string& entry : entries.get()) {
-    const string containerPath = path::join(containersDir, entry);
-
-    // The entry name is used as the container id.
-    if (os::stat::isdir(containerPath)) {
-      ContainerID containerId;
-      containerId.set_value(entry);
-      containers.insert(containerId);
-    } else {
-      LOG(WARNING) << "Ignoring unexpected container entry at: "
-                   << containerPath;
-    }
-  }
-
-  return containers;
-}
-
-
-// Lists the rootfses of a container, grouped by backend.
-//
-// Walks the container's backends directory and, for every backend
-// directory, its rootfses directory. Entries that are not directories
-// are logged and skipped, as are backends without any rootfs. Returns
-// a map from backend name to the set of rootfs ids found under it, or
-// an Error if a directory cannot be listed.
-Try<hashmap<string, hashset<string>>> listContainerRootfses(
-    const string& provisionerDir,
-    const ContainerID& containerId)
-{
-  hashmap<string, hashset<string>> results;
-
-  string backendsDir = getBackendsDir(
-      getContainerDir(
-          provisionerDir,
-          containerId));
-
-  Try<list<string>> backends = os::ls(backendsDir);
-  if (backends.isError()) {
-    return Error("Unable to list the container directory: " + backends.error());
-  }
-
-  foreach (const string& backend, backends.get()) {
-    string backendDir = getBackendDir(backendsDir, backend);
-    if (!os::stat::isdir(backendDir)) {
-      LOG(WARNING) << "Ignoring unexpected backend entry at: " << backendDir;
-      continue;
-    }
-
-    // Computed once per backend and reused for every rootfs below.
-    const string rootfsesDir = getRootfsesDir(backendDir);
-
-    Try<list<string>> rootfses = os::ls(rootfsesDir);
-    if (rootfses.isError()) {
-      return Error("Unable to list the backend directory: " + rootfses.error());
-    }
-
-    hashset<string> backendResults;
-
-    foreach (const string& rootfsId, rootfses.get()) {
-      string rootfs = getRootfsDir(rootfsesDir, rootfsId);
-
-      if (!os::stat::isdir(rootfs)) {
-        // Report the offending entry itself, not its backend directory.
-        LOG(WARNING) << "Ignoring unexpected rootfs entry at: " << rootfs;
-        continue;
-      }
-
-      backendResults.insert(rootfsId);
-    }
-
-    if (backendResults.empty()) {
-      LOG(WARNING) << "Ignoring a backend directory with no rootfs in it: "
-                   << backendDir;
-      continue;
-    }
-
-    // The rootfs directory has passed validation.
-    results.put(backend, backendResults);
-  }
-
-  return results;
-}
-
-} // namespace paths {
-} // namespace provisioner {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/paths.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/paths.hpp b/src/slave/containerizer/provisioner/paths.hpp
deleted file mode 100644
index 7ebd36d..0000000
--- a/src/slave/containerizer/provisioner/paths.hpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_PATHS_HPP__
-#define __PROVISIONER_PATHS_HPP__
-
-#include <string>
-
-#include <mesos/mesos.hpp>
-
-#include <stout/hashmap.hpp>
-#include <stout/hashset.hpp>
-#include <stout/try.hpp>
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace provisioner {
-namespace paths {
-
-// The provisioner rootfs directory is as follows:
-// <work_dir> ('--work_dir' flag)
-// |-- provisioner
-//     |-- containers
-//         |-- <container_id>
-//             |-- backends
-//                 |-- <backend> (copy, bind, etc.)
-//                     |-- rootfses
-//                         |-- <rootfs_id> (the rootfs)
-//
-// There can be multiple backends due to the change of backend flags.
-// Under each backend a rootfs is identified by the 'rootfs_id' which
-// is a UUID.
-
-std::string getContainerDir(
-    const std::string& provisionerDir,
-    const ContainerID& containerId);
-
-
-std::string getContainerRootfsDir(
-    const std::string& provisionerDir,
-    const ContainerID& containerId,
-    const std::string& backend,
-    const std::string& rootfsId);
-
-
-// Recursively "ls" the container directory and return a map of
-// backend -> {rootfsId, ...}
-Try<hashmap<std::string, hashset<std::string>>>
-listContainerRootfses(
-    const std::string& provisionerDir,
-    const ContainerID& containerId);
-
-
-// Return a set of container IDs.
-Try<hashset<ContainerID>> listContainers(
-    const std::string& provisionerDir);
-
-} // namespace paths {
-} // namespace provisioner {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_PATHS_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/provisioner.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/provisioner.cpp b/src/slave/containerizer/provisioner/provisioner.cpp
deleted file mode 100644
index 30a85a5..0000000
--- a/src/slave/containerizer/provisioner/provisioner.cpp
+++ /dev/null
@@ -1,444 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <mesos/type_utils.hpp>
-
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/dispatch.hpp>
-#include <process/process.hpp>
-
-#include <process/metrics/counter.hpp>
-#include <process/metrics/metrics.hpp>
-
-#include <stout/foreach.hpp>
-#include <stout/hashmap.hpp>
-#include <stout/hashset.hpp>
-#include <stout/os.hpp>
-#include <stout/stringify.hpp>
-#include <stout/uuid.hpp>
-
-#include "slave/paths.hpp"
-
-#include "slave/containerizer/provisioner/backend.hpp"
-#include "slave/containerizer/provisioner/paths.hpp"
-#include "slave/containerizer/provisioner/provisioner.hpp"
-#include "slave/containerizer/provisioner/store.hpp"
-
-using namespace process;
-
-using std::list;
-using std::string;
-using std::vector;
-
-using mesos::slave::ContainerState;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-class ProvisionerProcess : public Process<ProvisionerProcess>
-{
-public:
-  ProvisionerProcess(
-      const Flags& flags,
-      const string& rootDir,
-      const hashmap<Image::Type, Owned<Store>>& stores,
-      const hashmap<string, Owned<Backend>>& backends);
-
-  Future<Nothing> recover(
-      const list<ContainerState>& states,
-      const hashset<ContainerID>& orphans);
-
-  Future<string> provision(
-      const ContainerID& containerId,
-      const Image& image);
-
-  Future<bool> destroy(const ContainerID& containerId);
-
-private:
-  Future<string> _provision(
-      const ContainerID& containerId,
-      const vector<string>& layers);
-
-  Future<bool> _destroy(const ContainerID& containerId);
-
-  const Flags flags;
-
-  // Absolute path to the provisioner root directory. It can be
-  // derived from '--work_dir' but we keep a separate copy here
-  // because we converted it into an absolute path so managed rootfs
-  // paths match the ones in 'mountinfo' (important if mount-based
-  // backends are used).
-  const string rootDir;
-
-  const hashmap<Image::Type, Owned<Store>> stores;
-  const hashmap<string, Owned<Backend>> backends;
-
-  struct Info
-  {
-    // Mappings: backend -> {rootfsId, ...}
-    hashmap<string, hashset<string>> rootfses;
-  };
-
-  hashmap<ContainerID, Owned<Info>> infos;
-
-  struct Metrics
-  {
-    Metrics();
-    ~Metrics();
-
-    process::metrics::Counter remove_container_errors;
-  } metrics;
-};
-
-
-Try<Owned<Provisioner>> Provisioner::create(
-    const Flags& flags,
-    Fetcher* fetcher)
-{
-  string _rootDir = slave::paths::getProvisionerDir(flags.work_dir);
-
-  Try<Nothing> mkdir = os::mkdir(_rootDir);
-  if (mkdir.isError()) {
-    return Error(
-        "Failed to create provisioner root directory '" +
-        _rootDir + "': " + mkdir.error());
-  }
-
-  Result<string> rootDir = os::realpath(_rootDir);
-  if (rootDir.isError()) {
-    return Error(
-        "Failed to resolve the realpath of provisioner root directory '" +
-        _rootDir + "': " + rootDir.error());
-  }
-
-  CHECK_SOME(rootDir); // Can't be None since we just created it.
-
-  Try<hashmap<Image::Type, Owned<Store>>> stores = Store::create(flags);
-  if (stores.isError()) {
-    return Error("Failed to create image stores: " + stores.error());
-  }
-
-  hashmap<string, Owned<Backend>> backends = Backend::create(flags);
-  if (backends.empty()) {
-    return Error("No usable provisioner backend created");
-  }
-
-  if (!backends.contains(flags.image_provisioner_backend)) {
-    return Error(
-        "The specified provisioner backend '" +
-        flags.image_provisioner_backend + "' is unsupported");
-  }
-
-  return Owned<Provisioner>(new Provisioner(
-      Owned<ProvisionerProcess>(new ProvisionerProcess(
-          flags,
-          rootDir.get(),
-          stores.get(),
-          backends))));
-}
-
-
-Provisioner::Provisioner(Owned<ProvisionerProcess> _process)
-  : process(_process)
-{
-  spawn(CHECK_NOTNULL(process.get()));
-}
-
-
-Provisioner::~Provisioner()
-{
-  if (process.get() != NULL) {
-    terminate(process.get());
-    wait(process.get());
-  }
-}
-
-
-Future<Nothing> Provisioner::recover(
-    const list<ContainerState>& states,
-    const hashset<ContainerID>& orphans)
-{
-  return dispatch(
-      CHECK_NOTNULL(process.get()),
-      &ProvisionerProcess::recover,
-      states,
-      orphans);
-}
-
-
-Future<string> Provisioner::provision(
-    const ContainerID& containerId,
-    const Image& image)
-{
-  return dispatch(
-      CHECK_NOTNULL(process.get()),
-      &ProvisionerProcess::provision,
-      containerId,
-      image);
-}
-
-
-Future<bool> Provisioner::destroy(const ContainerID& containerId)
-{
-  return dispatch(
-      CHECK_NOTNULL(process.get()),
-      &ProvisionerProcess::destroy,
-      containerId);
-}
-
-
-ProvisionerProcess::ProvisionerProcess(
-    const Flags& _flags,
-    const string& _rootDir,
-    const hashmap<Image::Type, Owned<Store>>& _stores,
-    const hashmap<string, Owned<Backend>>& _backends)
-  : flags(_flags),
-    rootDir(_rootDir),
-    stores(_stores),
-    backends(_backends) {}
-
-
-Future<Nothing> ProvisionerProcess::recover(
-    const list<ContainerState>& states,
-    const hashset<ContainerID>& orphans)
-{
-  // Register living containers, including the ones that do not
-  // provision images.
-  hashset<ContainerID> alive;
-  foreach (const ContainerState& state, states) {
-    alive.insert(state.container_id());
-  }
-
-  // List provisioned containers; recover living ones; destroy unknown
-  // orphans. Note that known orphan containers are recovered as well
-  // and they will be destroyed by the containerizer using the normal
-  // cleanup path. See MESOS-2367 for details.
-  Try<hashset<ContainerID>> containers =
-    provisioner::paths::listContainers(rootDir);
-
-  if (containers.isError()) {
-    return Failure(
-        "Failed to list the containers managed by the provisioner: " +
-        containers.error());
-  }
-
-  // Scan the list of containers, register all of them with 'infos'
-  // but mark unknown orphans for immediate cleanup.
-  hashset<ContainerID> unknownOrphans;
-
-  foreach (const ContainerID& containerId, containers.get()) {
-    Owned<Info> info = Owned<Info>(new Info());
-
-    Try<hashmap<string, hashset<string>>> rootfses =
-      provisioner::paths::listContainerRootfses(rootDir, containerId);
-
-    if (rootfses.isError()) {
-      return Failure(
-          "Unable to list rootfses belonged to container " +
-          stringify(containerId) + ": " + rootfses.error());
-    }
-
-    foreachkey (const string& backend, rootfses.get()) {
-      if (!backends.contains(backend)) {
-        return Failure(
-            "Found rootfses managed by an unrecognized backend: " + backend);
-      }
-
-      info->rootfses.put(backend, rootfses.get()[backend]);
-    }
-
-    infos.put(containerId, info);
-
-    if (alive.contains(containerId) || orphans.contains(containerId)) {
-      LOG(INFO) << "Recovered container " << containerId;
-      continue;
-    } else {
-      // For immediate cleanup below.
-      unknownOrphans.insert(containerId);
-    }
-  }
-
-  // Cleanup unknown orphan containers' rootfses.
-  list<Future<bool>> cleanups;
-  foreach (const ContainerID& containerId, unknownOrphans) {
-    LOG(INFO) << "Cleaning up unknown orphan container " << containerId;
-    cleanups.push_back(destroy(containerId));
-  }
-
-  Future<Nothing> cleanup = collect(cleanups)
-    .then([]() -> Future<Nothing> { return Nothing(); });
-
-  // Recover stores.
-  list<Future<Nothing>> recovers;
-  foreachvalue (const Owned<Store>& store, stores) {
-    recovers.push_back(store->recover());
-  }
-
-  Future<Nothing> recover = collect(recovers)
-    .then([]() -> Future<Nothing> { return Nothing(); });
-
-  // A successful provisioner recovery depends on:
-  // 1) Recovery of living containers and known orphans (done above).
-  // 2) Successful cleanup of unknown orphans.
-  // 3) Successful store recovery.
-  //
-  // TODO(jieyu): Do not recover 'store' before unknown orphans are
-  // cleaned up. In the future, we may want to clean up unused rootfses
-  // in 'store', which might fail if there still exist unknown orphans
-  // holding references to them.
-  return collect(cleanup, recover)
-    .then([=]() -> Future<Nothing> {
-      LOG(INFO) << "Provisioner recovery complete";
-      return Nothing();
-    });
-}
-
-
-Future<string> ProvisionerProcess::provision(
-    const ContainerID& containerId,
-    const Image& image)
-{
-  if (!stores.contains(image.type())) {
-    return Failure(
-        "Unsupported container image type: " +
-        stringify(image.type()));
-  }
-
-  // Get and then provision image layers from the store.
-  return stores.get(image.type()).get()->get(image)
-    .then(defer(self(), &Self::_provision, containerId, lambda::_1));
-}
-
-
-Future<string> ProvisionerProcess::_provision(
-    const ContainerID& containerId,
-    const vector<string>& layers)
-{
-  // TODO(jieyu): Choose a backend smartly. For instance, if there is
-  // only one layer returned from the store, prefer to use the bind
-  // backend because it's the simplest.
-  const string& backend = flags.image_provisioner_backend;
-  CHECK(backends.contains(backend));
-
-  string rootfsId = UUID::random().toString();
-
-  string rootfs = provisioner::paths::getContainerRootfsDir(
-      rootDir,
-      containerId,
-      backend,
-      rootfsId);
-
-  LOG(INFO) << "Provisioning image rootfs '" << rootfs
-            << "' for container " << containerId;
-
-  // NOTE: It's likely that the container ID already exists in 'infos'
-  // because one container might provision multiple images.
-  if (!infos.contains(containerId)) {
-    infos.put(containerId, Owned<Info>(new Info()));
-  }
-
-  infos[containerId]->rootfses[backend].insert(rootfsId);
-
-  return backends.get(backend).get()->provision(layers, rootfs)
-    .then([rootfs]() -> Future<string> { return rootfs; });
-}
-
-
-Future<bool> ProvisionerProcess::destroy(const ContainerID& containerId)
-{
-  if (!infos.contains(containerId)) {
-    LOG(INFO) << "Ignoring destroy request for unknown container "
-              << containerId;
-
-    return false;
-  }
-
-  // Unregister the container first. If destroy() fails, we can rely
-  // on recover() to retry it later.
-  Owned<Info> info = infos[containerId];
-  infos.erase(containerId);
-
-  list<Future<bool>> futures;
-  foreachkey (const string& backend, info->rootfses) {
-    if (!backends.contains(backend)) {
-      return Failure("Unknown backend '" + backend + "'");
-    }
-
-    foreach (const string& rootfsId, info->rootfses[backend]) {
-      string rootfs = provisioner::paths::getContainerRootfsDir(
-          rootDir,
-          containerId,
-          backend,
-          rootfsId);
-
-      LOG(INFO) << "Destroying container rootfs at '" << rootfs
-                << "' for container " << containerId;
-
-      futures.push_back(backends.get(backend).get()->destroy(rootfs));
-    }
-  }
-
-  // TODO(xujyan): Revisit the usefulness of this return value.
-  return collect(futures)
-    .then(defer(self(), &ProvisionerProcess::_destroy, containerId));
-}
-
-
-Future<bool> ProvisionerProcess::_destroy(const ContainerID& containerId)
-{
-  // This should be fairly cheap as the directory should only
-  // contain a few empty sub-directories at this point.
-  //
-  // TODO(jieyu): Currently, it's possible that some directories
-  // cannot be removed due to EBUSY. EBUSY is caused by the race
-  // between cleaning up this container and new containers copying
-  // the host mount table. It's OK to ignore them. The cleanup
-  // will be retried during slave recovery.
-  string containerDir =
-    provisioner::paths::getContainerDir(rootDir, containerId);
-
-  Try<Nothing> rmdir = os::rmdir(containerDir);
-  if (rmdir.isError()) {
-    LOG(ERROR) << "Failed to remove the provisioned container directory "
-               << "at '" << containerDir << "': " << rmdir.error();
-
-    ++metrics.remove_container_errors;
-  }
-
-  return true;
-}
-
-
-ProvisionerProcess::Metrics::Metrics()
-  : remove_container_errors(
-      "containerizer/mesos/provisioner/remove_container_errors")
-{
-  process::metrics::add(remove_container_errors);
-}
-
-
-ProvisionerProcess::Metrics::~Metrics()
-{
-  process::metrics::remove(remove_container_errors);
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/provisioner.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/provisioner.hpp b/src/slave/containerizer/provisioner/provisioner.hpp
deleted file mode 100644
index 912fc5a..0000000
--- a/src/slave/containerizer/provisioner/provisioner.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_HPP__
-#define __PROVISIONER_HPP__
-
-#include <list>
-
-#include <mesos/resources.hpp>
-
-#include <mesos/slave/isolator.hpp> // For ContainerState.
-
-#include <stout/nothing.hpp>
-#include <stout/try.hpp>
-
-#include <process/future.hpp>
-#include <process/owned.hpp>
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/fetcher.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// Forward declaration.
-class ProvisionerProcess;
-
-
-class Provisioner
-{
-public:
-  // Create the provisioner based on the specified flags.
-  static Try<process::Owned<Provisioner>> create(
-      const Flags& flags,
-      Fetcher* fetcher);
-
-  // NOTE: Made 'virtual' for mocking and testing.
-  virtual ~Provisioner();
-
-  // Recover root filesystems for containers from the run states and
-  // the orphan containers (known to the launcher but not known to the
-  // slave) detected by the launcher. This function is also
-  // responsible for cleaning up any intermediate artifacts (e.g.
-  // directories) to not leak anything.
-  virtual process::Future<Nothing> recover(
-      const std::list<mesos::slave::ContainerState>& states,
-      const hashset<ContainerID>& orphans);
-
-  // Provision a root filesystem for the container using the specified
-  // image and return the absolute path to the root filesystem.
-  virtual process::Future<std::string> provision(
-      const ContainerID& containerId,
-      const Image& image);
-
-  // Destroy a previously provisioned root filesystem. Assumes that
-  // all references (e.g., mounts, open files) to the provisioned
-  // filesystem have been removed. Return false if there is no
-  // provisioned root filesystem for the given container.
-  virtual process::Future<bool> destroy(const ContainerID& containerId);
-
-protected:
-  Provisioner() {} // For creating mock object.
-
-private:
-  explicit Provisioner(process::Owned<ProvisionerProcess> process);
-
-  Provisioner(const Provisioner&) = delete; // Not copyable.
-  Provisioner& operator=(const Provisioner&) = delete; // Not assignable.
-
-  process::Owned<ProvisionerProcess> process;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/store.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/store.cpp b/src/slave/containerizer/provisioner/store.cpp
deleted file mode 100644
index a8bc302..0000000
--- a/src/slave/containerizer/provisioner/store.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <string>
-
-#include <mesos/type_utils.hpp>
-
-#include <stout/error.hpp>
-#include <stout/foreach.hpp>
-#include <stout/strings.hpp>
-
-#include "slave/containerizer/provisioner/store.hpp"
-
-#include "slave/containerizer/provisioner/appc/store.hpp"
-
-#include "slave/containerizer/provisioner/docker/store.hpp"
-
-using namespace process;
-
-using std::string;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-Try<hashmap<Image::Type, Owned<Store>>> Store::create(const Flags& flags)
-{
-  if (flags.image_providers.isNone()) {
-    return hashmap<Image::Type, Owned<Store>>();
-  }
-
-  hashmap<Image::Type, Try<Owned<Store>>(*)(const Flags&)> creators;
-  creators.put(Image::APPC, &appc::Store::create);
-  creators.put(Image::DOCKER, &docker::Store::create);
-
-  hashmap<Image::Type, Owned<Store>> stores;
-
-  foreach (const string& type,
-           strings::tokenize(flags.image_providers.get(), ",")) {
-    Image::Type imageType;
-    if (!Image::Type_Parse(strings::upper(type), &imageType)) {
-      return Error("Unknown image type '" + type + "'");
-    }
-
-    if (!creators.contains(imageType)) {
-      return Error("Unsupported image type '" + type + "'");
-    }
-
-    Try<Owned<Store>> store = creators[imageType](flags);
-    if (store.isError()) {
-      return Error(
-          "Failed to create store for image type '" +
-          type + "': " + store.error());
-    }
-
-    stores.put(imageType, store.get());
-  }
-
-  return stores;
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/store.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/store.hpp b/src/slave/containerizer/provisioner/store.hpp
deleted file mode 100644
index cf3e7d7..0000000
--- a/src/slave/containerizer/provisioner/store.hpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_STORE_HPP__
-#define __PROVISIONER_STORE_HPP__
-
-#include <string>
-#include <vector>
-
-#include <mesos/mesos.hpp>
-
-#include <process/future.hpp>
-#include <process/owned.hpp>
-
-#include <stout/try.hpp>
-
-#include "slave/flags.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// An image store abstraction that "stores" images. It serves as a
-// read-through cache (cache misses are fetched remotely and
-// transparently) for images.
-class Store
-{
-public:
-  static Try<hashmap<Image::Type, process::Owned<Store>>> create(
-      const Flags& flags);
-
-  virtual ~Store() {}
-
-  virtual process::Future<Nothing> recover() = 0;
-
-  // Get the specified image (and all its recursive dependencies) as a
-  // list of rootfs layers in the topological order (dependencies go
-  // before dependents in the list). The images required to build this
-  // list are either retrieved from the local cache or fetched
-  // remotely.
-  //
-  // NOTE: The returned list should not have duplicates. e.g., in the
-  // following scenario the result should be [C, B, D, A] (B before D
-  // in this example is decided by the order in which A specifies its
-  // dependencies).
-  //
-  // A --> B --> C
-  // |           ^
-  // |---> D ----|
-  //
-  // The returned future fails if the requested image or any of its
-  // dependencies cannot be found or failed to be fetched.
-  virtual process::Future<std::vector<std::string>> get(const Image& image) = 0;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_STORE_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/tests/containerizer/filesystem_isolator_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/filesystem_isolator_tests.cpp b/src/tests/containerizer/filesystem_isolator_tests.cpp
index 1e332e0..39008f6 100644
--- a/src/tests/containerizer/filesystem_isolator_tests.cpp
+++ b/src/tests/containerizer/filesystem_isolator_tests.cpp
@@ -42,12 +42,12 @@
 #ifdef __linux__
 #include "slave/containerizer/linux_launcher.hpp"
 
-#include "slave/containerizer/isolators/filesystem/linux.hpp"
+#include "slave/containerizer/mesos/isolators/filesystem/linux.hpp"
 #endif
 
 #include "slave/containerizer/mesos/containerizer.hpp"
 
-#include "slave/containerizer/provisioner/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/paths.hpp"
 
 #include "tests/flags.hpp"
 #include "tests/mesos.hpp"

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/tests/containerizer/isolator_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/isolator_tests.cpp b/src/tests/containerizer/isolator_tests.cpp
index 237f3f2..4e1e90b 100644
--- a/src/tests/containerizer/isolator_tests.cpp
+++ b/src/tests/containerizer/isolator_tests.cpp
@@ -50,13 +50,13 @@
 #include "slave/slave.hpp"
 
 #ifdef __linux__
-#include "slave/containerizer/isolators/cgroups/constants.hpp"
-#include "slave/containerizer/isolators/cgroups/cpushare.hpp"
-#include "slave/containerizer/isolators/cgroups/mem.hpp"
-#include "slave/containerizer/isolators/cgroups/perf_event.hpp"
-#include "slave/containerizer/isolators/filesystem/shared.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/constants.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/cpushare.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/mem.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/perf_event.hpp"
+#include "slave/containerizer/mesos/isolators/filesystem/shared.hpp"
 #endif // __linux__
-#include "slave/containerizer/isolators/posix.hpp"
+#include "slave/containerizer/mesos/isolators/posix.hpp"
 
 #include "slave/containerizer/launcher.hpp"
 #ifdef __linux__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/tests/containerizer/port_mapping_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/port_mapping_tests.cpp b/src/tests/containerizer/port_mapping_tests.cpp
index feca204..ae2c0e6 100644
--- a/src/tests/containerizer/port_mapping_tests.cpp
+++ b/src/tests/containerizer/port_mapping_tests.cpp
@@ -56,7 +56,7 @@
 #include "slave/flags.hpp"
 #include "slave/slave.hpp"
 
-#include "slave/containerizer/isolators/network/port_mapping.hpp"
+#include "slave/containerizer/mesos/isolators/network/port_mapping.hpp"
 
 #include "slave/containerizer/fetcher.hpp"
 #include "slave/containerizer/launcher.hpp"

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/tests/containerizer/provisioner.hpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/provisioner.hpp b/src/tests/containerizer/provisioner.hpp
index 54aab5f..507e141 100644
--- a/src/tests/containerizer/provisioner.hpp
+++ b/src/tests/containerizer/provisioner.hpp
@@ -26,7 +26,7 @@
 #include <stout/hashmap.hpp>
 #include <stout/stringify.hpp>
 
-#include "slave/containerizer/provisioner/provisioner.hpp"
+#include "slave/containerizer/mesos/provisioner/provisioner.hpp"
 
 #include "tests/containerizer/rootfs.hpp"
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/tests/containerizer/provisioner_appc_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/provisioner_appc_tests.cpp b/src/tests/containerizer/provisioner_appc_tests.cpp
index 3da759a..213e9c0 100644
--- a/src/tests/containerizer/provisioner_appc_tests.cpp
+++ b/src/tests/containerizer/provisioner_appc_tests.cpp
@@ -31,11 +31,11 @@
 
 #include "slave/paths.hpp"
 
-#include "slave/containerizer/provisioner/paths.hpp"
-#include "slave/containerizer/provisioner/provisioner.hpp"
+#include "slave/containerizer/mesos/provisioner/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/provisioner.hpp"
 
-#include "slave/containerizer/provisioner/appc/spec.hpp"
-#include "slave/containerizer/provisioner/appc/store.hpp"
+#include "slave/containerizer/mesos/provisioner/appc/spec.hpp"
+#include "slave/containerizer/mesos/provisioner/appc/store.hpp"
 
 #include "tests/utils.hpp"
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/tests/containerizer/provisioner_backend_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/provisioner_backend_tests.cpp b/src/tests/containerizer/provisioner_backend_tests.cpp
index b72edc6..3347d58 100644
--- a/src/tests/containerizer/provisioner_backend_tests.cpp
+++ b/src/tests/containerizer/provisioner_backend_tests.cpp
@@ -29,8 +29,8 @@
 #include "linux/fs.hpp"
 #endif // __linux__
 
-#include "slave/containerizer/provisioner/backends/bind.hpp"
-#include "slave/containerizer/provisioner/backends/copy.hpp"
+#include "slave/containerizer/mesos/provisioner/backends/bind.hpp"
+#include "slave/containerizer/mesos/provisioner/backends/copy.hpp"
 
 #include "tests/flags.hpp"
 #include "tests/utils.hpp"

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/tests/containerizer/provisioner_docker_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/containerizer/provisioner_docker_tests.cpp b/src/tests/containerizer/provisioner_docker_tests.cpp
index 01d3025..cf8aac3 100644
--- a/src/tests/containerizer/provisioner_docker_tests.cpp
+++ b/src/tests/containerizer/provisioner_docker_tests.cpp
@@ -37,12 +37,12 @@
 
 #include <process/ssl/gtest.hpp>
 
-#include "slave/containerizer/provisioner/docker/metadata_manager.hpp"
-#include "slave/containerizer/provisioner/docker/paths.hpp"
-#include "slave/containerizer/provisioner/docker/registry_client.hpp"
-#include "slave/containerizer/provisioner/docker/spec.hpp"
-#include "slave/containerizer/provisioner/docker/store.hpp"
-#include "slave/containerizer/provisioner/docker/token_manager.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/metadata_manager.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/registry_client.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/spec.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/store.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/token_manager.hpp"
 
 #include "tests/mesos.hpp"
 #include "tests/utils.hpp"

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/tests/disk_quota_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/disk_quota_tests.cpp b/src/tests/disk_quota_tests.cpp
index 1ed1342..47a0640 100644
--- a/src/tests/disk_quota_tests.cpp
+++ b/src/tests/disk_quota_tests.cpp
@@ -40,7 +40,7 @@
 
 #include "slave/containerizer/fetcher.hpp"
 
-#include "slave/containerizer/isolators/posix/disk.hpp"
+#include "slave/containerizer/mesos/isolators/posix/disk.hpp"
 
 #include "slave/containerizer/mesos/containerizer.hpp"

[03/12] mesos git commit: Relocated MesosContainerizer specific files to the correct location.

Posted by ji...@apache.org.

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/token_manager.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/token_manager.cpp b/src/slave/containerizer/mesos/provisioner/docker/token_manager.cpp
new file mode 100644
index 0000000..ad5b63f
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/token_manager.cpp
@@ -0,0 +1,362 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+
+#include "slave/containerizer/mesos/provisioner/docker/token_manager.hpp"
+
+using std::hash;
+using std::string;
+using std::vector;
+
+using process::Clock;
+using process::Failure;
+using process::Future;
+using process::Owned;
+using process::Process;
+using process::Time;
+
+using process::http::Request;
+using process::http::Response;
+using process::http::URL;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+namespace registry {
+
+class TokenManagerProcess : public Process<TokenManagerProcess>
+{
+public:
+  static Try<Owned<TokenManagerProcess>> create(const URL& realm);
+
+  // Returns a cached or freshly fetched token for (service, scope).
+  Future<Token> getToken(
+      const string& service,
+      const string& scope,
+      const Option<string>& account);
+
+private:
+  static const string TOKEN_PATH_PREFIX;
+  static const Duration RESPONSE_TIMEOUT;
+
+  TokenManagerProcess(const URL& realm)
+    : realm_(realm) {}
+
+  TokenManagerProcess(const TokenManagerProcess&) = delete;
+  TokenManagerProcess& operator=(const TokenManagerProcess&) = delete;
+
+  // Extracts the "token" field of the response body as a Token.
+  Try<Token> getTokenFromResponse(const Response& response) const;
+
+  // Key for the token cache.
+  struct TokenCacheKey
+  {
+    string service;
+    string scope;
+  };
+
+  struct TokenCacheKeyHash
+  {
+    size_t operator()(const TokenCacheKey& key) const
+    {
+      const size_t serviceHash = hash<string>()(key.service);
+      const size_t scopeHash = hash<string>()(key.scope);
+
+      return serviceHash ^ (scopeHash << 1);
+    }
+  };
+
+  struct TokenCacheKeyEqual
+  {
+    bool operator()(
+        const TokenCacheKey& left,
+        const TokenCacheKey& right) const
+    {
+      if (left.service != right.service) {
+        return false;
+      }
+
+      return left.scope == right.scope;
+    }
+  };
+
+  using TokenCacheType = hashmap<
+    const TokenCacheKey, Token, TokenCacheKeyHash, TokenCacheKeyEqual>;
+
+  const URL realm_;
+  TokenCacheType tokenCache_;
+};
+
+// Time allowed for the token request to be answered before the
+// future returned by 'getToken' is failed.
+const Duration TokenManagerProcess::RESPONSE_TIMEOUT = Seconds(10);
+// Path set on the realm URL when requesting a token.
+const string TokenManagerProcess::TOKEN_PATH_PREFIX = "/v2/token/";
+
+
+// Memberwise constructor: copies each argument into the member of
+// the same name (without the leading underscore).
+Token::Token(
+    const string& _raw,
+    const JSON::Object& _header,
+    const JSON::Object& _claims,
+    const Option<Time>& _expiration,
+    const Option<Time>& _notBefore)
+  : raw(_raw),
+    header(_header),
+    claims(_claims),
+    expiration(_expiration),
+    notBefore(_notBefore) {}
+
+
+// Parses a raw JSON Web Token string into a 'Token'.
+//
+// The string must split into exactly three '.'-separated tokens. The
+// first two (header and claims) are base64 decoded and parsed as JSON
+// objects; the optional "exp" and "nbf" claims are converted to 'Time'
+// values. Returns an Error if the segment count is wrong, if a segment
+// fails to decode or parse, or if the token has already expired. The
+// third (signature) segment is not examined.
+//
+// TODO(josephw): Parse this string with some protobufs.
+Try<Token> Token::create(const string& raw)
+{
+  auto decode = [](
+      const string& segment) -> Try<JSON::Object> {
+    // Segments may arrive without trailing '=' padding. Restore the
+    // number of characters needed to reach a multiple of four, i.e.
+    // '(4 - length % 4) % 4'. Appending 'length % 4' characters
+    // over-pads when the remainder is 3.
+    const size_t padding = (4 - (segment.length() % 4)) % 4;
+    string paddedSegment(segment);
+    paddedSegment.append(padding, '=');
+
+    Try<string> decoded = base64::decode(paddedSegment);
+    if (decoded.isError()) {
+      return Error(decoded.error());
+    }
+
+    return JSON::parse<JSON::Object>(decoded.get());
+  };
+
+  const vector<string> tokens = strings::tokenize(raw, ".");
+
+  if (tokens.size() != 3) {
+    return Error("Invalid raw token string");
+  }
+
+  Try<JSON::Object> header = decode(tokens[0]);
+  if (header.isError()) {
+    return Error("Failed to decode 'header' segment: " + header.error());
+  }
+
+  Try<JSON::Object> claims = decode(tokens[1]);
+  if (claims.isError()) {
+    return Error("Failed to decode 'claims' segment: " + claims.error());
+  }
+
+  Result<Time> expirationTime = getTimeValue(claims.get(), "exp");
+  if (expirationTime.isError()) {
+    return Error("Failed to decode expiration time: " + expirationTime.error());
+  }
+
+  Option<Time> expiration;
+  if (expirationTime.isSome()) {
+    expiration = expirationTime.get();
+  }
+
+  Result<Time> notBeforeTime = getTimeValue(claims.get(), "nbf");
+  if (notBeforeTime.isError()) {
+    return Error("Failed to decode not-before time: " + notBeforeTime.error());
+  }
+
+  Option<Time> notBefore;
+  if (notBeforeTime.isSome()) {
+    notBefore = notBeforeTime.get();
+  }
+
+  Token token(raw, header.get(), claims.get(), expiration, notBefore);
+
+  // Reject tokens that are already past their "exp" time.
+  if (token.isExpired()) {
+    return Error("Token has expired");
+  }
+
+  // TODO(jojy): Add signature validation.
+  return token;
+}
+
+
+// Converts the numeric claim 'key' of 'object' into a 'Time'. Any
+// lookup that does not produce a number is reported as None; an Error
+// is returned only when the number cannot be turned into a 'Time'.
+Result<Time> Token::getTimeValue(const JSON::Object& object, const string& key)
+{
+  Result<JSON::Number> number = object.find<JSON::Number>(key);
+  if (!number.isSome()) {
+    return None();
+  }
+
+  Try<Time> parsed = Time::create(number.get().as<double>());
+  if (parsed.isError()) {
+    return Error("Failed to decode time: " + parsed.error());
+  }
+
+  return parsed.get();
+}
+
+
+// A token without an expiration time never expires; otherwise it is
+// expired once the current time reaches the expiration time.
+bool Token::isExpired() const
+{
+  return expiration.isSome() && (Clock::now() >= expiration.get());
+}
+
+
+// A token is valid when it has not expired and, if it carries a
+// not-before time, that time has been reached.
+bool Token::isValid() const
+{
+  // TODO(jojy): Add signature validation.
+  if (isExpired()) {
+    return false;
+  }
+
+  return notBefore.isNone() || (Clock::now() >= notBefore.get());
+}
+
+
+// Builds the backing process for 'realm' and wraps it in a
+// TokenManager; an error creating the process is passed through.
+Try<Owned<TokenManager>> TokenManager::create(
+    const URL& realm)
+{
+  Try<Owned<TokenManagerProcess>> managerProcess =
+    TokenManagerProcess::create(realm);
+
+  if (managerProcess.isError()) {
+    return Error(managerProcess.error());
+  }
+
+  return Owned<TokenManager>(new TokenManager(managerProcess.get()));
+}
+
+
+// Stores 'process' and spawns it. CHECK_NOTNULL guards against a
+// null process pointer.
+TokenManager::TokenManager(Owned<TokenManagerProcess>& process)
+  : process_(process)
+{
+  spawn(CHECK_NOTNULL(process_.get()));
+}
+
+
+// Terminates the backing process and waits for it before the
+// manager is destroyed.
+TokenManager::~TokenManager()
+{
+  terminate(process_.get());
+  process::wait(process_.get());
+}
+
+
+// Forwards the request to 'TokenManagerProcess::getToken' on the
+// backing process via 'dispatch' and returns the resulting future.
+Future<Token> TokenManager::getToken(
+    const string& service,
+    const string& scope,
+    const Option<string>& account)
+{
+  return dispatch(
+      process_.get(),
+      &TokenManagerProcess::getToken,
+      service,
+      scope,
+      account);
+}
+
+
+// Allocates a new process bound to 'realm'. No failure path exists
+// in this implementation; the Try return type only matches the
+// factory signature.
+Try<Owned<TokenManagerProcess>> TokenManagerProcess::create(const URL& realm)
+{
+  return Owned<TokenManagerProcess>(new TokenManagerProcess(realm));
+}
+
+
+// Extracts the JSON Web Token carried in the "token" field of the
+// response body.
+//
+// Returns an Error if the body does not parse as a JSON object, if the
+// "token" string field is missing or its lookup fails, or if the token
+// string is rejected by 'Token::create'.
+Try<Token> TokenManagerProcess::getTokenFromResponse(
+    const Response& response) const
+{
+  Try<JSON::Object> tokenJSON = JSON::parse<JSON::Object>(response.body);
+  if (tokenJSON.isError()) {
+    return Error(tokenJSON.error());
+  }
+
+  Result<JSON::String> tokenString =
+    tokenJSON.get().find<JSON::String>("token");
+
+  if (tokenString.isError()) {
+    return Error(tokenString.error());
+  }
+
+  // The lookup result may also be None (no such field). Calling
+  // 'get()' on it would be invalid, so report the malformed response.
+  if (tokenString.isNone()) {
+    return Error("Missing 'token' field in the token response");
+  }
+
+  Try<Token> result = Token::create(tokenString.get().value);
+  if (result.isError()) {
+    return Error(result.error());
+  }
+
+  return result.get();
+}
+
+
+// Returns a token for the given (service, scope) pair.
+//
+// A cached token is returned when one exists and 'isValid()' holds for
+// it. Otherwise a GET request is sent to 'TOKEN_PATH_PREFIX' on the
+// realm with 'service' and 'scope' (and 'account', when provided) as
+// query parameters. The future fails if no response arrives within
+// 'RESPONSE_TIMEOUT' or if the response cannot be parsed into a token.
+// On success the new token replaces any cached entry for the pair.
+Future<Token> TokenManagerProcess::getToken(
+    const string& service,
+    const string& scope,
+    const Option<string>& account)
+{
+  const TokenCacheKey tokenKey = {service, scope};
+
+  if (tokenCache_.contains(tokenKey)) {
+    Token token = tokenCache_.at(tokenKey);
+
+    if (token.isValid()) {
+      return token;
+    } else {
+      LOG(WARNING) << "Cached token was invalid. Will fetch once again";
+    }
+  }
+
+  URL tokenUrl = realm_;
+  tokenUrl.path = TOKEN_PATH_PREFIX;
+
+  tokenUrl.query = {
+    {"service", service},
+    {"scope", scope},
+  };
+
+  if (account.isSome()) {
+    tokenUrl.query.insert({"account", account.get()});
+  }
+
+  return process::http::get(tokenUrl, None())
+    .after(RESPONSE_TIMEOUT, [] (Future<Response> resp) -> Future<Response> {
+      resp.discard();
+      return Failure("Timeout waiting for response to token request");
+    })
+    .then(defer(self(), [this, tokenKey](
+        const Future<Response>& response) -> Future<Token> {
+      Try<Token> token = getTokenFromResponse(response.get());
+      if (token.isError()) {
+        return Failure(
+            "Failed to parse JSON Web Token object from response: " +
+            token.error());
+      }
+
+      // 'insert' leaves an existing entry untouched, so drop any stale
+      // token for this key first. Otherwise an invalid cached token
+      // would never be replaced and every call would refetch.
+      tokenCache_.erase(tokenKey);
+      tokenCache_.insert({tokenKey, token.get()});
+
+      return token.get();
+    }));
+}
+
+// TODO(jojy): Add implementation for basic authentication based getToken API.
+
+} // namespace registry {
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/token_manager.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/token_manager.hpp b/src/slave/containerizer/mesos/provisioner/docker/token_manager.hpp
new file mode 100644
index 0000000..2f4abff
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/token_manager.hpp
@@ -0,0 +1,179 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_DOCKER_TOKEN_MANAGER_HPP__
+#define __PROVISIONER_DOCKER_TOKEN_MANAGER_HPP__
+
+#include <functional>
+#include <string>
+
+#include <stout/base64.hpp>
+#include <stout/duration.hpp>
+#include <stout/hashmap.hpp>
+#include <stout/strings.hpp>
+
+#include <process/future.hpp>
+#include <process/http.hpp>
+#include <process/process.hpp>
+#include <process/time.hpp>
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+namespace registry {
+
+
+/**
+ * Encapsulates JSON Web Token.
+ *
+ * Reference: https://tools.ietf.org/html/rfc7519.
+ */
+struct Token
+{
+  /**
+   * Factory method for Token object.
+   *
+   * Parses the raw token string and validates for token's expiration.
+   *
+   * @returns Token if parsing and validation succeeds.
+   *          Error if parsing or validation fails.
+   */
+  static Try<Token> create(const std::string& rawString);
+
+  /**
+   * Compares token's expiration time(expressed in seconds) with current time.
+   *
+   * @returns True if an expiration time is set and the current time has
+   *          reached or passed it.
+   *          False if no expiration time is set or it is still in the future.
+   */
+  bool isExpired() const;
+
+  /**
+   * Checks that the current time is within the token's "exp" and "nbf"
+   * values.
+   *
+   * @returns True if the token has not expired and its "nbf" time, when
+   *          present, has been reached.
+   *          False if the token has expired or its "nbf" time is still in
+   *          the future.
+   */
+  bool isValid() const;
+
+  const std::string raw;
+  const JSON::Object header;
+  const JSON::Object claims;
+  // TODO(jojy): Add signature information.
+
+private:
+  Token(
+      const std::string& raw,
+      const JSON::Object& headerJson,
+      const JSON::Object& claimsJson,
+      const Option<process::Time>& expireTime,
+      const Option<process::Time>& notBeforeTime);
+
+  static Result<process::Time> getTimeValue(
+      const JSON::Object& object,
+      const std::string& key);
+
+  const Option<process::Time> expiration;
+  const Option<process::Time> notBefore;
+};
+
+
+// Forward declaration.
+class TokenManagerProcess;
+
+
+/**
+ *  Acquires and manages docker registry tokens. It keeps the tokens in its
+ *  cache to serve any future request for the same token.
+ *  The cache grows unbounded.
+ *  TODO(jojy): The cache can be optimized to prune based on the expiry time of
+ *  the token and server's issue time.
+ */
+class TokenManager
+{
+public:
+  /**
+   * Factory method for creating TokenManager object.
+   *
+   * TokenManager and registry authorization realm have a 1:1 relationship.
+   *
+   * @param realm URL of the authorization server from where token will be
+   *     requested by this TokenManager.
+   * @returns Owned<TokenManager> if success.
+   *          Error on failure.
+   */
+  static Try<process::Owned<TokenManager>> create(
+      const process::http::URL& realm);
+
+  /**
+   * Returns JSON Web Token from cache or from remote server using "Basic
+   * authorization".
+   *
+   * NOTE(review): only the declaration of this overload is visible here;
+   * confirm that a definition exists before relying on it.
+   *
+   * @param service Name of the service that hosts the resource for which
+   *     token is being requested.
+   * @param scope unique scope returned by the 401 Unauthorized response
+   *     from the registry.
+   * @param account Name of the account which the client is acting as.
+   * @param user base64 encoded userid for basic authorization.
+   * @param password base64 encoded password for basic authorization.
+   * @returns Token struct that encapsulates JSON Web Token.
+   */
+  process::Future<Token> getToken(
+      const std::string& service,
+      const std::string& scope,
+      const Option<std::string>& account,
+      const std::string& user,
+      const Option<std::string>& password);
+
+  /**
+   * Returns JSON Web Token from cache or from remote server using "TLS/Cert"
+   * based authorization.
+   *
+   * @param service Name of the service that hosts the resource for which
+   *     token is being requested.
+   * @param scope unique scope returned by the 401 Unauthorized response
+   *     from the registry.
+   * @param account Name of the account which the client is acting as.
+   * @returns Token struct that encapsulates JSON Web Token.
+   */
+  process::Future<Token> getToken(
+      const std::string& service,
+      const std::string& scope,
+      const Option<std::string>& account);
+
+  ~TokenManager();
+
+private:
+  TokenManager(process::Owned<TokenManagerProcess>& process);
+
+  TokenManager(const TokenManager&) = delete;
+  TokenManager& operator=(const TokenManager&) = delete;
+
+  process::Owned<TokenManagerProcess> process_;
+};
+
+} // namespace registry {
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_DOCKER_TOKEN_MANAGER_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/paths.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/paths.cpp b/src/slave/containerizer/mesos/provisioner/paths.cpp
new file mode 100644
index 0000000..b019eb9
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/paths.cpp
@@ -0,0 +1,192 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <list>
+
+#include <glog/logging.h>
+
+#include <mesos/type_utils.hpp>
+
+#include <stout/os.hpp>
+#include <stout/path.hpp>
+
+#include <stout/os/stat.hpp>
+
+#include "slave/paths.hpp"
+
+#include "slave/containerizer/mesos/provisioner/paths.hpp"
+
+using std::list;
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace provisioner {
+namespace paths {
+
+// Returns '<provisionerDir>/containers'.
+static string getContainersDir(const string& provisionerDir)
+{
+  return path::join(provisionerDir, "containers");
+}
+
+
+// Returns '<containerDir>/backends'.
+static string getBackendsDir(const string& containerDir)
+{
+  return path::join(containerDir, "backends");
+}
+
+
+// Returns '<backendsDir>/<backend>'.
+static string getBackendDir(const string& backendsDir, const string& backend)
+{
+  return path::join(backendsDir, backend);
+}
+
+
+// Returns '<backendDir>/rootfses'.
+static string getRootfsesDir(const string& backendDir)
+{
+  return path::join(backendDir, "rootfses");
+}
+
+
+// Returns '<rootfsesDir>/<rootfsId>'.
+static string getRootfsDir(const string& rootfsesDir, const string& rootfsId)
+{
+  return path::join(rootfsesDir, rootfsId);
+}
+
+
+// Returns '<provisionerDir>/containers/<containerId>'.
+string getContainerDir(
+    const string& provisionerDir,
+    const ContainerID& containerId)
+{
+  const string containersDir = getContainersDir(provisionerDir);
+
+  return path::join(containersDir, containerId.value());
+}
+
+
+// Returns the rootfs directory for the given container, backend and
+// rootfs ID, i.e. the container directory followed by
+// 'backends/<backend>/rootfses/<rootfsId>'.
+string getContainerRootfsDir(
+    const string& provisionerDir,
+    const ContainerID& containerId,
+    const string& backend,
+    const string& rootfsId)
+{
+  // Descend one directory level at a time.
+  const string containerDir = getContainerDir(provisionerDir, containerId);
+  const string backendsDir = getBackendsDir(containerDir);
+  const string backendDir = getBackendDir(backendsDir, backend);
+  const string rootfsesDir = getRootfsesDir(backendDir);
+
+  return getRootfsDir(rootfsesDir, rootfsId);
+}
+
+
+// Collects the IDs of all containers that have a directory under
+// '<provisionerDir>/containers'. A missing containers directory
+// yields an empty set; non-directory entries are skipped with a
+// warning.
+Try<hashset<ContainerID>> listContainers(
+    const string& provisionerDir)
+{
+  hashset<ContainerID> containerIds;
+
+  const string containersDir = getContainersDir(provisionerDir);
+  if (!os::exists(containersDir)) {
+    // No container has been created yet.
+    return containerIds;
+  }
+
+  Try<list<string>> entries = os::ls(containersDir);
+  if (entries.isError()) {
+    return Error("Unable to list the containers directory: " +
+                 entries.error());
+  }
+
+  foreach (const string& entry, entries.get()) {
+    const string containerPath = path::join(containersDir, entry);
+
+    if (os::stat::isdir(containerPath)) {
+      ContainerID containerId;
+      containerId.set_value(entry);
+      containerIds.insert(containerId);
+    } else {
+      LOG(WARNING) << "Ignoring unexpected container entry at: "
+                   << containerPath;
+    }
+  }
+
+  return containerIds;
+}
+
+
+// Lists the rootfses provisioned for 'containerId', grouped by
+// backend, by walking '<container dir>/backends/<backend>/rootfses'.
+//
+// Non-directory entries are skipped with a warning, as are backend
+// directories that contain no rootfs directory. Returns an Error if
+// either the backends directory or a backend's 'rootfses' directory
+// cannot be listed.
+Try<hashmap<string, hashset<string>>> listContainerRootfses(
+    const string& provisionerDir,
+    const ContainerID& containerId)
+{
+  hashmap<string, hashset<string>> results;
+
+  string backendsDir = getBackendsDir(
+      getContainerDir(
+          provisionerDir,
+          containerId));
+
+  Try<list<string>> backends = os::ls(backendsDir);
+  if (backends.isError()) {
+    return Error("Unable to list the container directory: " + backends.error());
+  }
+
+  foreach (const string& backend, backends.get()) {
+    string backendDir = getBackendDir(backendsDir, backend);
+    if (!os::stat::isdir(backendDir)) {
+      LOG(WARNING) << "Ignoring unexpected backend entry at: " << backendDir;
+      continue;
+    }
+
+    // Computed once and reused for both the listing and each entry.
+    const string rootfsesDir = getRootfsesDir(backendDir);
+
+    Try<list<string>> rootfses = os::ls(rootfsesDir);
+    if (rootfses.isError()) {
+      return Error("Unable to list the backend directory: " + rootfses.error());
+    }
+
+    hashset<string> backendResults;
+
+    foreach (const string& rootfsId, rootfses.get()) {
+      string rootfs = getRootfsDir(rootfsesDir, rootfsId);
+
+      if (!os::stat::isdir(rootfs)) {
+        // Report the offending entry itself, not its backend directory.
+        LOG(WARNING) << "Ignoring unexpected rootfs entry at: " << rootfs;
+        continue;
+      }
+
+      backendResults.insert(rootfsId);
+    }
+
+    if (backendResults.empty()) {
+      LOG(WARNING) << "Ignoring a backend directory with no rootfs in it: "
+                   << backendDir;
+      continue;
+    }
+
+    // The rootfs directory has passed validation.
+    results.put(backend, backendResults);
+  }
+
+  return results;
+}
+
+} // namespace paths {
+} // namespace provisioner {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/paths.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/paths.hpp b/src/slave/containerizer/mesos/provisioner/paths.hpp
new file mode 100644
index 0000000..7ebd36d
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/paths.hpp
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_PATHS_HPP__
+#define __PROVISIONER_PATHS_HPP__
+
+#include <string>
+
+#include <mesos/mesos.hpp>
+
+#include <stout/hashmap.hpp>
+#include <stout/hashset.hpp>
+#include <stout/try.hpp>
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace provisioner {
+namespace paths {
+
+// The provisioner rootfs directory is as follows:
+// <work_dir> ('--work_dir' flag)
+// |-- provisioner
+//     |-- containers
+//         |-- <container_id>
+//             |-- backends
+//                 |-- <backend> (copy, bind, etc.)
+//                     |-- rootfses
+//                         |-- <rootfs_id> (the rootfs)
+//
+// There can be multiple backends due to the change of backend flags.
+// Under each backend a rootfs is identified by the 'rootfs_id' which
+// is a UUID.
+
+std::string getContainerDir(
+    const std::string& provisionerDir,
+    const ContainerID& containerId);
+
+
+std::string getContainerRootfsDir(
+    const std::string& provisionerDir,
+    const ContainerID& containerId,
+    const std::string& backend,
+    const std::string& rootfsId);
+
+
+// Recursively "ls" the container directory and return a map of
+// backend -> {rootfsId, ...}
+Try<hashmap<std::string, hashset<std::string>>>
+listContainerRootfses(
+    const std::string& provisionerDir,
+    const ContainerID& containerId);
+
+
+// Return a set of container IDs.
+Try<hashset<ContainerID>> listContainers(
+    const std::string& provisionerDir);
+
+} // namespace paths {
+} // namespace provisioner {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_PATHS_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/provisioner.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/provisioner.cpp b/src/slave/containerizer/mesos/provisioner/provisioner.cpp
new file mode 100644
index 0000000..e99c1c9
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/provisioner.cpp
@@ -0,0 +1,444 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <mesos/type_utils.hpp>
+
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+#include <process/process.hpp>
+
+#include <process/metrics/counter.hpp>
+#include <process/metrics/metrics.hpp>
+
+#include <stout/foreach.hpp>
+#include <stout/hashmap.hpp>
+#include <stout/hashset.hpp>
+#include <stout/os.hpp>
+#include <stout/stringify.hpp>
+#include <stout/uuid.hpp>
+
+#include "slave/paths.hpp"
+
+#include "slave/containerizer/mesos/provisioner/backend.hpp"
+#include "slave/containerizer/mesos/provisioner/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/provisioner.hpp"
+#include "slave/containerizer/mesos/provisioner/store.hpp"
+
+using namespace process;
+
+using std::list;
+using std::string;
+using std::vector;
+
+using mesos::slave::ContainerState;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// The process behind 'Provisioner': the public 'recover', 'provision'
+// and 'destroy' methods of 'Provisioner' dispatch to the methods of
+// the same name declared here.
+class ProvisionerProcess : public Process<ProvisionerProcess>
+{
+public:
+  ProvisionerProcess(
+      const Flags& flags,
+      const string& rootDir,
+      const hashmap<Image::Type, Owned<Store>>& stores,
+      const hashmap<string, Owned<Backend>>& backends);
+
+  Future<Nothing> recover(
+      const list<ContainerState>& states,
+      const hashset<ContainerID>& orphans);
+
+  Future<string> provision(
+      const ContainerID& containerId,
+      const Image& image);
+
+  Future<bool> destroy(const ContainerID& containerId);
+
+private:
+  Future<string> _provision(
+      const ContainerID& containerId,
+      const vector<string>& layers);
+
+  Future<bool> _destroy(const ContainerID& containerId);
+
+  const Flags flags;
+
+  // Absolute path to the provisioner root directory. It can be
+  // derived from '--work_dir' but we keep a separate copy here
+  // because we converted it into an absolute path so managed rootfs
+  // paths match the ones in 'mountinfo' (important if mount-based
+  // backends are used).
+  const string rootDir;
+
+  // Image stores keyed by image type, and backends keyed by name.
+  const hashmap<Image::Type, Owned<Store>> stores;
+  const hashmap<string, Owned<Backend>> backends;
+
+  struct Info
+  {
+    // Mappings: backend -> {rootfsId, ...}
+    hashmap<string, hashset<string>> rootfses;
+  };
+
+  // Per-container bookkeeping of provisioned rootfses.
+  hashmap<ContainerID, Owned<Info>> infos;
+
+  struct Metrics
+  {
+    Metrics();
+    ~Metrics();
+
+    process::metrics::Counter remove_container_errors;
+  } metrics;
+};
+
+
+// Creates the provisioner root directory under the work directory,
+// resolves it to its real path, creates the image stores and
+// backends, and wraps everything in a Provisioner. Fails if any of
+// these steps fails or if the backend selected by
+// 'flags.image_provisioner_backend' was not created.
+Try<Owned<Provisioner>> Provisioner::create(
+    const Flags& flags,
+    Fetcher* fetcher)
+{
+  const string provisionerDir =
+    slave::paths::getProvisionerDir(flags.work_dir);
+
+  Try<Nothing> created = os::mkdir(provisionerDir);
+  if (created.isError()) {
+    return Error(
+        "Failed to create provisioner root directory '" +
+        provisionerDir + "': " + created.error());
+  }
+
+  Result<string> rootDir = os::realpath(provisionerDir);
+  if (rootDir.isError()) {
+    return Error(
+        "Failed to resolve the realpath of provisioner root directory '" +
+        provisionerDir + "': " + rootDir.error());
+  }
+
+  CHECK_SOME(rootDir); // Can't be None since we just created it.
+
+  Try<hashmap<Image::Type, Owned<Store>>> stores = Store::create(flags);
+  if (stores.isError()) {
+    return Error("Failed to create image stores: " + stores.error());
+  }
+
+  hashmap<string, Owned<Backend>> backends = Backend::create(flags);
+  if (backends.empty()) {
+    return Error("No usable provisioner backend created");
+  }
+
+  const string& selectedBackend = flags.image_provisioner_backend;
+  if (!backends.contains(selectedBackend)) {
+    return Error(
+        "The specified provisioner backend '" +
+        selectedBackend + "' is unsupported");
+  }
+
+  Owned<ProvisionerProcess> provisionerProcess(new ProvisionerProcess(
+      flags,
+      rootDir.get(),
+      stores.get(),
+      backends));
+
+  return Owned<Provisioner>(new Provisioner(provisionerProcess));
+}
+
+
+// Stores '_process' and spawns it. CHECK_NOTNULL guards against a
+// null process pointer.
+Provisioner::Provisioner(Owned<ProvisionerProcess> _process)
+  : process(_process)
+{
+  spawn(CHECK_NOTNULL(process.get()));
+}
+
+
+// Terminates the backing process, if there is one, and waits for it.
+Provisioner::~Provisioner()
+{
+  if (process.get() == NULL) {
+    return;
+  }
+
+  terminate(process.get());
+  wait(process.get());
+}
+
+
+// Forwards to 'ProvisionerProcess::recover' on the backing process
+// via 'dispatch'.
+Future<Nothing> Provisioner::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  return dispatch(
+      CHECK_NOTNULL(process.get()),
+      &ProvisionerProcess::recover,
+      states,
+      orphans);
+}
+
+
+// Forwards to 'ProvisionerProcess::provision' on the backing process
+// via 'dispatch'.
+Future<string> Provisioner::provision(
+    const ContainerID& containerId,
+    const Image& image)
+{
+  return dispatch(
+      CHECK_NOTNULL(process.get()),
+      &ProvisionerProcess::provision,
+      containerId,
+      image);
+}
+
+
+// Forwards to 'ProvisionerProcess::destroy' on the backing process
+// via 'dispatch'.
+Future<bool> Provisioner::destroy(const ContainerID& containerId)
+{
+  return dispatch(
+      CHECK_NOTNULL(process.get()),
+      &ProvisionerProcess::destroy,
+      containerId);
+}
+
+
+// Memberwise constructor: copies each argument into the member of
+// the same name (without the leading underscore).
+ProvisionerProcess::ProvisionerProcess(
+    const Flags& _flags,
+    const string& _rootDir,
+    const hashmap<Image::Type, Owned<Store>>& _stores,
+    const hashmap<string, Owned<Backend>>& _backends)
+  : flags(_flags),
+    rootDir(_rootDir),
+    stores(_stores),
+    backends(_backends) {}
+
+
+Future<Nothing> ProvisionerProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  // Register living containers, including the ones that do not
+  // provision images.
+  hashset<ContainerID> alive;
+  foreach (const ContainerState& state, states) {
+    alive.insert(state.container_id());
+  }
+
+  // List provisioned containers; recover living ones; destroy unknown
+  // orphans. Note that known orphan containers are recovered as well
+  // and they will be destroyed by the containerizer using the normal
+  // cleanup path. See MESOS-2367 for details.
+  Try<hashset<ContainerID>> containers =
+    provisioner::paths::listContainers(rootDir);
+
+  if (containers.isError()) {
+    return Failure(
+        "Failed to list the containers managed by the provisioner: " +
+        containers.error());
+  }
+
+  // Scan the list of containers, register all of them with 'infos'
+  // but mark unknown orphans for immediate cleanup.
+  hashset<ContainerID> unknownOrphans;
+
+  foreach (const ContainerID& containerId, containers.get()) {
+    Owned<Info> info = Owned<Info>(new Info());
+
+    Try<hashmap<string, hashset<string>>> rootfses =
+      provisioner::paths::listContainerRootfses(rootDir, containerId);
+
+    if (rootfses.isError()) {
+      return Failure(
+          "Unable to list rootfses belonged to container " +
+          stringify(containerId) + ": " + rootfses.error());
+    }
+
+    foreachkey (const string& backend, rootfses.get()) {
+      if (!backends.contains(backend)) {
+        return Failure(
+            "Found rootfses managed by an unrecognized backend: " + backend);
+      }
+
+      info->rootfses.put(backend, rootfses.get()[backend]);
+    }
+
+    infos.put(containerId, info);
+
+    if (alive.contains(containerId) || orphans.contains(containerId)) {
+      LOG(INFO) << "Recovered container " << containerId;
+      continue;
+    } else {
+      // For immediate cleanup below.
+      unknownOrphans.insert(containerId);
+    }
+  }
+
+  // Cleanup unknown orphan containers' rootfses.
+  list<Future<bool>> cleanups;
+  foreach (const ContainerID& containerId, unknownOrphans) {
+    LOG(INFO) << "Cleaning up unknown orphan container " << containerId;
+    cleanups.push_back(destroy(containerId));
+  }
+
+  Future<Nothing> cleanup = collect(cleanups)
+    .then([]() -> Future<Nothing> { return Nothing(); });
+
+  // Recover stores.
+  list<Future<Nothing>> recovers;
+  foreachvalue (const Owned<Store>& store, stores) {
+    recovers.push_back(store->recover());
+  }
+
+  Future<Nothing> recover = collect(recovers)
+    .then([]() -> Future<Nothing> { return Nothing(); });
+
+  // A successful provisioner recovery depends on:
+  // 1) Recovery of living containers and known orphans (done above).
+  // 2) Successful cleanup of unknown orphans.
+  // 3) Successful store recovery.
+  //
+  // TODO(jieyu): Do not recover 'store' before unknown orphans are
+  // cleaned up. In the future, we may want to cleanup unused rootfses
+  // in 'store', which might fail if there still exist unknown orphans
+  // holding references to them.
+  return collect(cleanup, recover)
+    .then([=]() -> Future<Nothing> {
+      LOG(INFO) << "Provisioner recovery complete";
+      return Nothing();
+    });
+}
+
+
+Future<string> ProvisionerProcess::provision(
+    const ContainerID& containerId,
+    const Image& image)
+{
+  if (!stores.contains(image.type())) {
+    return Failure(
+        "Unsupported container image type: " +
+        stringify(image.type()));
+  }
+
+  // Get and then provision image layers from the store.
+  return stores.get(image.type()).get()->get(image)
+    .then(defer(self(), &Self::_provision, containerId, lambda::_1));
+}
+
+
+Future<string> ProvisionerProcess::_provision(
+    const ContainerID& containerId,
+    const vector<string>& layers)
+{
+  // TODO(jieyu): Choose a backend smartly. For instance, if there is
+  // only one layer returned from the store, prefer to use the bind
+  // backend because it's the simplest.
+  const string& backend = flags.image_provisioner_backend;
+  CHECK(backends.contains(backend));
+
+  string rootfsId = UUID::random().toString();
+
+  string rootfs = provisioner::paths::getContainerRootfsDir(
+      rootDir,
+      containerId,
+      backend,
+      rootfsId);
+
+  LOG(INFO) << "Provisioning image rootfs '" << rootfs
+            << "' for container " << containerId;
+
+  // NOTE: It's likely that the container ID already exists in 'infos'
+  // because one container might provision multiple images.
+  if (!infos.contains(containerId)) {
+    infos.put(containerId, Owned<Info>(new Info()));
+  }
+
+  infos[containerId]->rootfses[backend].insert(rootfsId);
+
+  return backends.get(backend).get()->provision(layers, rootfs)
+    .then([rootfs]() -> Future<string> { return rootfs; });
+}
+
+
+Future<bool> ProvisionerProcess::destroy(const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    LOG(INFO) << "Ignoring destroy request for unknown container "
+              << containerId;
+
+    return false;
+  }
+
+  // Unregister the container first. If destroy() fails, we can rely
+  // on recover() to retry it later.
+  Owned<Info> info = infos[containerId];
+  infos.erase(containerId);
+
+  list<Future<bool>> futures;
+  foreachkey (const string& backend, info->rootfses) {
+    if (!backends.contains(backend)) {
+      return Failure("Unknown backend '" + backend + "'");
+    }
+
+    foreach (const string& rootfsId, info->rootfses[backend]) {
+      string rootfs = provisioner::paths::getContainerRootfsDir(
+          rootDir,
+          containerId,
+          backend,
+          rootfsId);
+
+      LOG(INFO) << "Destroying container rootfs at '" << rootfs
+                << "' for container " << containerId;
+
+      futures.push_back(backends.get(backend).get()->destroy(rootfs));
+    }
+  }
+
+  // TODO(xujyan): Revisit the usefulness of this return value.
+  return collect(futures)
+    .then(defer(self(), &ProvisionerProcess::_destroy, containerId));
+}
+
+
+Future<bool> ProvisionerProcess::_destroy(const ContainerID& containerId)
+{
+  // This should be fairly cheap as the directory should only
+  // contain a few empty sub-directories at this point.
+  //
+  // TODO(jieyu): Currently, it's possible that some directories
+  // cannot be removed due to EBUSY. EBUSY is caused by the race
+  // between cleaning up this container and new containers copying
+  // the host mount table. It's OK to ignore them. The cleanup
+  // will be retried during slave recovery.
+  string containerDir =
+    provisioner::paths::getContainerDir(rootDir, containerId);
+
+  Try<Nothing> rmdir = os::rmdir(containerDir);
+  if (rmdir.isError()) {
+    LOG(ERROR) << "Failed to remove the provisioned container directory "
+               << "at '" << containerDir << "': " << rmdir.error();
+
+    ++metrics.remove_container_errors;
+  }
+
+  return true;
+}
+
+
+ProvisionerProcess::Metrics::Metrics()
+  : remove_container_errors(
+      "containerizer/mesos/provisioner/remove_container_errors")
+{
+  process::metrics::add(remove_container_errors);
+}
+
+
+ProvisionerProcess::Metrics::~Metrics()
+{
+  process::metrics::remove(remove_container_errors);
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/provisioner.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/provisioner.hpp b/src/slave/containerizer/mesos/provisioner/provisioner.hpp
new file mode 100644
index 0000000..912fc5a
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/provisioner.hpp
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_HPP__
+#define __PROVISIONER_HPP__
+
+#include <list>
+
+#include <mesos/resources.hpp>
+
+#include <mesos/slave/isolator.hpp> // For ContainerState.
+
+#include <stout/nothing.hpp>
+#include <stout/try.hpp>
+
+#include <process/future.hpp>
+#include <process/owned.hpp>
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/fetcher.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Forward declaration.
+class ProvisionerProcess;
+
+
+class Provisioner
+{
+public:
+  // Create the provisioner based on the specified flags.
+  static Try<process::Owned<Provisioner>> create(
+      const Flags& flags,
+      Fetcher* fetcher);
+
+  // NOTE: Made 'virtual' for mocking and testing.
+  virtual ~Provisioner();
+
+  // Recover root filesystems for containers from the run states and
+  // the orphan containers (known to the launcher but not known to the
+  // slave) detected by the launcher. This function is also
+  // responsible for cleaning up any intermediate artifacts (e.g.
+  // directories) to not leak anything.
+  virtual process::Future<Nothing> recover(
+      const std::list<mesos::slave::ContainerState>& states,
+      const hashset<ContainerID>& orphans);
+
+  // Provision a root filesystem for the container using the specified
+  // image and return the absolute path to the root filesystem.
+  virtual process::Future<std::string> provision(
+      const ContainerID& containerId,
+      const Image& image);
+
+  // Destroy a previously provisioned root filesystem. Assumes that
+  // all references (e.g., mounts, open files) to the provisioned
+  // filesystem have been removed. Return false if there is no
+  // provisioned root filesystem for the given container.
+  virtual process::Future<bool> destroy(const ContainerID& containerId);
+
+protected:
+  Provisioner() {} // For creating mock object.
+
+private:
+  explicit Provisioner(process::Owned<ProvisionerProcess> process);
+
+  Provisioner(const Provisioner&) = delete; // Not copyable.
+  Provisioner& operator=(const Provisioner&) = delete; // Not assignable.
+
+  process::Owned<ProvisionerProcess> process;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/store.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/store.cpp b/src/slave/containerizer/mesos/provisioner/store.cpp
new file mode 100644
index 0000000..4b8abe6
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/store.cpp
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <string>
+
+#include <mesos/type_utils.hpp>
+
+#include <stout/error.hpp>
+#include <stout/foreach.hpp>
+#include <stout/strings.hpp>
+
+#include "slave/containerizer/mesos/provisioner/store.hpp"
+
+#include "slave/containerizer/mesos/provisioner/appc/store.hpp"
+
+#include "slave/containerizer/mesos/provisioner/docker/store.hpp"
+
+using namespace process;
+
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+Try<hashmap<Image::Type, Owned<Store>>> Store::create(const Flags& flags)
+{
+  if (flags.image_providers.isNone()) {
+    return hashmap<Image::Type, Owned<Store>>();
+  }
+
+  hashmap<Image::Type, Try<Owned<Store>>(*)(const Flags&)> creators;
+  creators.put(Image::APPC, &appc::Store::create);
+  creators.put(Image::DOCKER, &docker::Store::create);
+
+  hashmap<Image::Type, Owned<Store>> stores;
+
+  foreach (const string& type,
+           strings::tokenize(flags.image_providers.get(), ",")) {
+    Image::Type imageType;
+    if (!Image::Type_Parse(strings::upper(type), &imageType)) {
+      return Error("Unknown image type '" + type + "'");
+    }
+
+    if (!creators.contains(imageType)) {
+      return Error("Unsupported image type '" + type + "'");
+    }
+
+    Try<Owned<Store>> store = creators[imageType](flags);
+    if (store.isError()) {
+      return Error(
+          "Failed to create store for image type '" +
+          type + "': " + store.error());
+    }
+
+    stores.put(imageType, store.get());
+  }
+
+  return stores;
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/store.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/store.hpp b/src/slave/containerizer/mesos/provisioner/store.hpp
new file mode 100644
index 0000000..cf3e7d7
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/store.hpp
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_STORE_HPP__
+#define __PROVISIONER_STORE_HPP__
+
+#include <string>
+#include <vector>
+
+#include <mesos/mesos.hpp>
+
+#include <process/future.hpp>
+#include <process/owned.hpp>
+
+#include <stout/try.hpp>
+
+#include "slave/flags.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// An image store abstraction that "stores" images. It serves as a
+// read-through cache (cache misses are fetched remotely and
+// transparently) for images.
+class Store
+{
+public:
+  static Try<hashmap<Image::Type, process::Owned<Store>>> create(
+      const Flags& flags);
+
+  virtual ~Store() {}
+
+  virtual process::Future<Nothing> recover() = 0;
+
+  // Get the specified image (and all its recursive dependencies) as a
+  // list of rootfs layers in the topological order (dependencies go
+  // before dependents in the list). The images required to build this
+  // list are either retrieved from the local cache or fetched
+  // remotely.
+  //
+  // NOTE: The returned list should not have duplicates. e.g., in the
+  // following scenario the result should be [C, B, D, A] (B before D
+  // in this example is decided by the order in which A specifies its
+  // dependencies).
+  //
+  // A --> B --> C
+  // |           ^
+  // |---> D ----|
+  //
+  // The returned future fails if the requested image or any of its
+  // dependencies cannot be found or cannot be fetched.
+  virtual process::Future<std::vector<std::string>> get(const Image& image) = 0;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_STORE_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/appc/paths.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/appc/paths.cpp b/src/slave/containerizer/provisioner/appc/paths.cpp
deleted file mode 100644
index 8817c0f..0000000
--- a/src/slave/containerizer/provisioner/appc/paths.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <list>
-
-#include <glog/logging.h>
-
-#include <stout/path.hpp>
-
-#include "slave/containerizer/provisioner/appc/paths.hpp"
-
-using std::list;
-using std::string;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace appc {
-namespace paths {
-
-string getStagingDir(const string& storeDir)
-{
-  return path::join(storeDir, "staging");
-}
-
-
-string getImagesDir(const string& storeDir)
-{
-  return path::join(storeDir, "images");
-}
-
-
-string getImagePath(const string& storeDir, const string& imageId)
-{
-  return path::join(getImagesDir(storeDir), imageId);
-}
-
-
-string getImageRootfsPath(
-    const string& storeDir,
-    const string& imageId)
-{
-  return path::join(getImagePath(storeDir, imageId), "rootfs");
-}
-
-
-string getImageRootfsPath(const string& imagePath)
-{
-  return path::join(imagePath, "rootfs");
-}
-
-
-string getImageManifestPath(
-    const string& storeDir,
-    const string& imageId)
-{
-  return path::join(getImagePath(storeDir, imageId), "manifest");
-}
-
-
-string getImageManifestPath(const string& imagePath)
-{
-  return path::join(imagePath, "manifest");
-}
-
-} // namespace paths {
-} // namespace appc {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/appc/paths.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/appc/paths.hpp b/src/slave/containerizer/provisioner/appc/paths.hpp
deleted file mode 100644
index 7c36d67..0000000
--- a/src/slave/containerizer/provisioner/appc/paths.hpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_APPC_PATHS_HPP__
-#define __PROVISIONER_APPC_PATHS_HPP__
-
-#include <string>
-
-#include <mesos/mesos.hpp>
-
-#include <stout/hashmap.hpp>
-#include <stout/try.hpp>
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace appc {
-namespace paths {
-
-// The appc store file system layout is as follows:
-//
-// <store_dir> ('--appc_store_dir' flag)
-// |--staging (contains temp directories for staging downloads)
-// |
-// |--images (stores validated images)
-//    |--<image_id> (in the form of "sha512-<128_character_hash_sum>")
-//       |--manifest
-//       |--rootfs
-//          |--... (according to the ACI spec)
-//
-// TODO(xujyan): The staging directory is unused for now (it's
-// externally managed) but implemented to illustrate the need for a
-// separate 'images' directory. Complete the layout diagram when the
-// staging directory is utilized by the provisioner.
-
-std::string getStagingDir(const std::string& storeDir);
-
-
-std::string getImagesDir(const std::string& storeDir);
-
-
-std::string getImagePath(
-    const std::string& storeDir,
-    const std::string& imageId);
-
-
-std::string getImageRootfsPath(
-    const std::string& storeDir,
-    const std::string& imageId);
-
-
-std::string getImageRootfsPath(const std::string& imagePath);
-
-
-std::string getImageManifestPath(
-    const std::string& storeDir,
-    const std::string& imageId);
-
-
-std::string getImageManifestPath(const std::string& imagePath);
-
-} // namespace paths {
-} // namespace appc {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_APPC_PATHS_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/appc/spec.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/appc/spec.cpp b/src/slave/containerizer/provisioner/appc/spec.cpp
deleted file mode 100644
index bbe523d..0000000
--- a/src/slave/containerizer/provisioner/appc/spec.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stout/os/stat.hpp>
-#include <stout/protobuf.hpp>
-#include <stout/strings.hpp>
-
-#include "slave/containerizer/provisioner/appc/paths.hpp"
-#include "slave/containerizer/provisioner/appc/spec.hpp"
-
-using std::string;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace appc {
-namespace spec {
-
-Option<Error> validateManifest(const AppcImageManifest& manifest)
-{
-  // TODO(idownes): Validate that required fields are present when
-  // this cannot be expressed in the protobuf specification, e.g.,
-  // repeated fields with >= 1.
-  // TODO(xujyan): More thorough type validation:
-  // https://github.com/appc/spec/blob/master/spec/types.md
-  if (manifest.ackind() != "ImageManifest") {
-    return Error("Incorrect acKind field: " + manifest.ackind());
-  }
-
-  return None();
-}
-
-
-Option<Error> validateImageID(const string& imageId)
-{
-  if (!strings::startsWith(imageId, "sha512-")) {
-    return Error("Image ID needs to start with sha512-");
-  }
-
-  string hash = strings::remove(imageId, "sha512-", strings::PREFIX);
-  if (hash.length() != 128) {
-    return Error("Invalid hash length for: " + hash);
-  }
-
-  return None();
-}
-
-
-Option<Error> validateLayout(const string& imagePath)
-{
-  if (!os::stat::isdir(paths::getImageRootfsPath(imagePath))) {
-    return Error("No rootfs directory found in image layout");
-  }
-
-  if (!os::stat::isfile(paths::getImageManifestPath(imagePath))) {
-    return Error("No manifest found in image layout");
-  }
-
-  return None();
-}
-
-
-Try<AppcImageManifest> parse(const string& value)
-{
-  Try<JSON::Object> json = JSON::parse<JSON::Object>(value);
-  if (json.isError()) {
-    return Error("JSON parse failed: " + json.error());
-  }
-
-  Try<AppcImageManifest> manifest =
-    protobuf::parse<AppcImageManifest>(json.get());
-
-  if (manifest.isError()) {
-    return Error("Protobuf parse failed: " + manifest.error());
-  }
-
-  Option<Error> error = validateManifest(manifest.get());
-  if (error.isSome()) {
-    return Error("Schema validation failed: " + error.get().message);
-  }
-
-  return manifest.get();
-}
-
-} // namespace spec {
-} // namespace appc {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/appc/spec.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/appc/spec.hpp b/src/slave/containerizer/provisioner/appc/spec.hpp
deleted file mode 100644
index 2bc8c6f..0000000
--- a/src/slave/containerizer/provisioner/appc/spec.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_APPC_SPEC_HPP__
-#define __PROVISIONER_APPC_SPEC_HPP__
-
-#include <string>
-
-#include <stout/error.hpp>
-#include <stout/option.hpp>
-
-#include <mesos/mesos.hpp>
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace appc {
-namespace spec {
-
-// Validate if the specified image manifest conforms to the Appc spec.
-Option<Error> validateManifest(const AppcImageManifest& manifest);
-
-// Validate if the specified image ID conforms to the Appc spec.
-Option<Error> validateImageID(const std::string& imageId);
-
-// Validate if the specified image has the disk layout that conforms
-// to the Appc spec.
-Option<Error> validateLayout(const std::string& imagePath);
-
-// Parse the AppcImageManifest in the specified JSON string.
-Try<AppcImageManifest> parse(const std::string& value);
-
-} // namespace spec {
-} // namespace appc {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_APPC_SPEC_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/appc/store.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/appc/store.cpp b/src/slave/containerizer/provisioner/appc/store.cpp
deleted file mode 100644
index a5ef4ea..0000000
--- a/src/slave/containerizer/provisioner/appc/store.cpp
+++ /dev/null
@@ -1,288 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <list>
-
-#include <glog/logging.h>
-
-#include <process/defer.hpp>
-#include <process/dispatch.hpp>
-
-#include <stout/check.hpp>
-#include <stout/hashmap.hpp>
-#include <stout/os.hpp>
-#include <stout/path.hpp>
-
-#include "slave/containerizer/provisioner/appc/paths.hpp"
-#include "slave/containerizer/provisioner/appc/spec.hpp"
-#include "slave/containerizer/provisioner/appc/store.hpp"
-
-using namespace process;
-
-using std::list;
-using std::string;
-using std::vector;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace appc {
-
-// Defines a locally cached image (which has passed validation).
-struct CachedImage
-{
-  static Try<CachedImage> create(const string& imagePath);
-
-  CachedImage(
-      const AppcImageManifest& _manifest,
-      const string& _id,
-      const string& _path)
-    : manifest(_manifest), id(_id), path(_path) {}
-
-  string rootfs() const
-  {
-    return path::join(path, "rootfs");
-  }
-
-  const AppcImageManifest manifest;
-
-  // Image ID of the format "sha512-value" where "value" is the hex
-  // encoded string of the sha512 digest of the uncompressed tar file
-  // of the image.
-  const string id;
-
-  // Absolute path to the extracted image.
-  const string path;
-};
-
-
-Try<CachedImage> CachedImage::create(const string& imagePath)
-{
-  Option<Error> error = spec::validateLayout(imagePath);
-  if (error.isSome()) {
-    return Error("Invalid image layout: " + error.get().message);
-  }
-
-  string imageId = Path(imagePath).basename();
-
-  error = spec::validateImageID(imageId);
-  if (error.isSome()) {
-    return Error("Invalid image ID: " + error.get().message);
-  }
-
-  Try<string> read = os::read(paths::getImageManifestPath(imagePath));
-  if (read.isError()) {
-    return Error("Failed to read manifest: " + read.error());
-  }
-
-  Try<AppcImageManifest> manifest = spec::parse(read.get());
-  if (manifest.isError()) {
-    return Error("Failed to parse manifest: " + manifest.error());
-  }
-
-  return CachedImage(manifest.get(), imageId, imagePath);
-}
-
-
-// Helper that implements this:
-// https://github.com/appc/spec/blob/master/spec/aci.md#dependency-matching
-static bool matches(Image::Appc requirements, const CachedImage& candidate)
-{
-  // The name must match.
-  if (candidate.manifest.name() != requirements.name()) {
-    return false;
-  }
-
-  // If an id is specified the candidate must match.
-  if (requirements.has_id() && (candidate.id != requirements.id())) {
-    return false;
-  }
-
-  // Extract labels for easier comparison, this also weeds out duplicates.
-  // TODO(xujyan): Detect duplicate labels in image manifest validation
-  // and Image::Appc validation.
-  hashmap<string, string> requiredLabels;
-  foreach (const Label& label, requirements.labels().labels()) {
-    requiredLabels[label.key()] = label.value();
-  }
-
-  hashmap<string, string> candidateLabels;
-  foreach (const AppcImageManifest::Label& label,
-           candidate.manifest.labels()) {
-    candidateLabels[label.name()] = label.value();
-  }
-
-  // Any label specified must be present and match in the candidate.
-  foreachpair (const string& name,
-               const string& value,
-               requiredLabels) {
-    if (!candidateLabels.contains(name) ||
-        candidateLabels.get(name).get() != value) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
-
-class StoreProcess : public Process<StoreProcess>
-{
-public:
-  StoreProcess(const string& rootDir);
-
-  ~StoreProcess() {}
-
-  Future<Nothing> recover();
-
-  Future<vector<string>> get(const Image& image);
-
-private:
-  // Absolute path to the root directory of the store as defined by
-  // --appc_store_dir.
-  const string rootDir;
-
-  // Mappings: name -> id -> image.
-  hashmap<string, hashmap<string, CachedImage>> images;
-};
-
-
-Try<Owned<slave::Store>> Store::create(const Flags& flags)
-{
-  Try<Nothing> mkdir = os::mkdir(paths::getImagesDir(flags.appc_store_dir));
-  if (mkdir.isError()) {
-    return Error("Failed to create the images directory: " + mkdir.error());
-  }
-
-  // Make sure the root path is canonical so all image paths derived
-  // from it are canonical too.
-  Result<string> rootDir = os::realpath(flags.appc_store_dir);
-  if (!rootDir.isSome()) {
-    // The above mkdir call recursively creates the store directory
-    // if necessary so it cannot be None here.
-    CHECK_ERROR(rootDir);
-
-    return Error(
-        "Failed to get the realpath of the store root directory: " +
-        rootDir.error());
-  }
-
-  return Owned<slave::Store>(new Store(
-      Owned<StoreProcess>(new StoreProcess(rootDir.get()))));
-}
-
-
-Store::Store(Owned<StoreProcess> _process)
-  : process(_process)
-{
-  spawn(CHECK_NOTNULL(process.get()));
-}
-
-
-Store::~Store()
-{
-  terminate(process.get());
-  wait(process.get());
-}
-
-
-Future<Nothing> Store::recover()
-{
-  return dispatch(process.get(), &StoreProcess::recover);
-}
-
-
-Future<vector<string>> Store::get(const Image& image)
-{
-  return dispatch(process.get(), &StoreProcess::get, image);
-}
-
-
-StoreProcess::StoreProcess(const string& _rootDir) : rootDir(_rootDir) {}
-
-
-Future<Nothing> StoreProcess::recover()
-{
-  // Recover everything in the store.
-  Try<list<string>> imageIds = os::ls(paths::getImagesDir(rootDir));
-  if (imageIds.isError()) {
-    return Failure(
-        "Failed to list images under '" +
-        paths::getImagesDir(rootDir) + "': " +
-        imageIds.error());
-  }
-
-  foreach (const string& imageId, imageIds.get()) {
-    string path = paths::getImagePath(rootDir, imageId);
-    if (!os::stat::isdir(path)) {
-      LOG(WARNING) << "Unexpected entry in storage: " << imageId;
-      continue;
-    }
-
-    Try<CachedImage> image = CachedImage::create(path);
-    if (image.isError()) {
-      LOG(WARNING) << "Unexpected entry in storage: " << image.error();
-      continue;
-    }
-
-    LOG(INFO) << "Restored image '" << image.get().manifest.name() << "'";
-
-    images[image.get().manifest.name()].put(image.get().id, image.get());
-  }
-
-  return Nothing();
-}
-
-
-Future<vector<string>> StoreProcess::get(const Image& image)
-{
-  if (image.type() != Image::APPC) {
-    return Failure("Not an Appc image: " + stringify(image.type()));
-  }
-
-  const Image::Appc& appc = image.appc();
-
-  if (!images.contains(appc.name())) {
-    return Failure("No Appc image named '" + appc.name() + "' can be found");
-  }
-
-  // Get local candidates.
-  vector<CachedImage> candidates;
-  foreach (const CachedImage& candidate, images[appc.name()].values()) {
-    // The first match is returned.
-    // TODO(xujyan): Some tie-breaking rules are necessary.
-    if (matches(appc, candidate)) {
-      LOG(INFO) << "Found match for Appc image '" << appc.name()
-                << "' in the store";
-
-      // The Appc store current doesn't support dependencies and this
-      // is enforced by manifest validation: if the image's manifest
-      // contains dependencies it would fail the validation and
-      // wouldn't be stored in the store.
-      return vector<string>({candidate.rootfs()});
-    }
-  }
-
-  return Failure("No Appc image named '" + appc.name() +
-                 "' can match the requirements");
-}
-
-} // namespace appc {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/appc/store.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/appc/store.hpp b/src/slave/containerizer/provisioner/appc/store.hpp
deleted file mode 100644
index e845519..0000000
--- a/src/slave/containerizer/provisioner/appc/store.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_APPC_STORE_HPP__
-#define __PROVISIONER_APPC_STORE_HPP__
-
-#include "slave/containerizer/provisioner/store.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace appc {
-
-// Forward declaration.
-class StoreProcess;
-
-
-class Store : public slave::Store
-{
-public:
-  static Try<process::Owned<slave::Store>> create(const Flags& flags);
-
-  ~Store();
-
-  virtual process::Future<Nothing> recover();
-
-  // TODO(xujyan): Fetching remotely is not implemented for now and
-  // until then the future fails directly if the image is not in the
-  // local cache.
-  // TODO(xujyan): The store currently doesn't support images that
-  // have dependencies and we should add it later.
-  virtual process::Future<std::vector<std::string>> get(const Image& image);
-
-private:
-  Store(process::Owned<StoreProcess> process);
-
-  Store(const Store&) = delete; // Not copyable.
-  Store& operator=(const Store&) = delete; // Not assignable.
-
-  process::Owned<StoreProcess> process;
-};
-
-} // namespace appc {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_APPC_STORE_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/backend.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/backend.cpp b/src/slave/containerizer/provisioner/backend.cpp
deleted file mode 100644
index b5d9670..0000000
--- a/src/slave/containerizer/provisioner/backend.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <glog/logging.h>
-
-#include <stout/os.hpp>
-
-#include "slave/containerizer/provisioner/backend.hpp"
-
-#include "slave/containerizer/provisioner/backends/bind.hpp"
-#include "slave/containerizer/provisioner/backends/copy.hpp"
-
-using namespace process;
-
-using std::string;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-hashmap<string, Owned<Backend>> Backend::create(const Flags& flags)
-{
-  hashmap<string, Try<Owned<Backend>>(*)(const Flags&)> creators;
-
-#ifdef __linux__
-  creators.put("bind", &BindBackend::create);
-#endif // __linux__
-  creators.put("copy", &CopyBackend::create);
-
-  hashmap<string, Owned<Backend>> backends;
-
-  foreachkey (const string& name, creators) {
-    Try<Owned<Backend>> backend = creators[name](flags);
-    if (backend.isError()) {
-      LOG(WARNING) << "Failed to create '" << name << "' backend: "
-                   << backend.error();
-      continue;
-    }
-    backends.put(name, backend.get());
-  }
-
-  return backends;
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/backend.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/backend.hpp b/src/slave/containerizer/provisioner/backend.hpp
deleted file mode 100644
index 1c80b79..0000000
--- a/src/slave/containerizer/provisioner/backend.hpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_BACKEND_HPP__
-#define __PROVISIONER_BACKEND_HPP__
-
-#include <string>
-#include <vector>
-
-#include <process/future.hpp>
-#include <process/owned.hpp>
-
-#include <stout/hashmap.hpp>
-#include <stout/try.hpp>
-
-#include "slave/flags.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// Provision a root filesystem for a container.
-class Backend
-{
-public:
-  virtual ~Backend() {}
-
-  // Return a map of all supported backends keyed by their names. Note
-  // that Backends that failed to be created due to incorrect flags are
-  // simply not added to the result.
-  static hashmap<std::string, process::Owned<Backend>> create(
-      const Flags& flags);
-
-  // Provision a root filesystem for a container into the specified 'rootfs'
-  // directory by applying the specified list of root filesystem layers in
-  // the list order, i.e., files in a layer can overwrite/shadow those from
-  // another layer earlier in the list.
-  virtual process::Future<Nothing> provision(
-      const std::vector<std::string>& layers,
-      const std::string& rootfs) = 0;
-
-  // Destroy the root filesystem provisioned at the specified 'rootfs'
-  // directory. Return false if there is no provisioned root filesystem
-  // to destroy for the given directory.
-  virtual process::Future<bool> destroy(const std::string& rootfs) = 0;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_BACKEND_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/backends/bind.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/backends/bind.cpp b/src/slave/containerizer/provisioner/backends/bind.cpp
deleted file mode 100644
index 1fe1746..0000000
--- a/src/slave/containerizer/provisioner/backends/bind.cpp
+++ /dev/null
@@ -1,250 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <errno.h>
-#include <stdio.h>
-#include <unistd.h>
-
-#include <process/dispatch.hpp>
-#include <process/process.hpp>
-
-#include <process/metrics/counter.hpp>
-#include <process/metrics/metrics.hpp>
-
-#include <stout/foreach.hpp>
-#include <stout/os.hpp>
-
-#include "linux/fs.hpp"
-
-#include "slave/containerizer/provisioner/backends/bind.hpp"
-
-using namespace process;
-
-using std::string;
-using std::vector;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-class BindBackendProcess : public Process<BindBackendProcess>
-{
-public:
-  Future<Nothing> provision(const vector<string>& layers, const string& rootfs);
-
-  Future<bool> destroy(const string& rootfs);
-
-  struct Metrics
-  {
-    Metrics();
-    ~Metrics();
-
-    process::metrics::Counter remove_rootfs_errors;
-  } metrics;
-};
-
-
-Try<Owned<Backend>> BindBackend::create(const Flags&)
-{
-  Result<string> user = os::user();
-  if (!user.isSome()) {
-    return Error("Failed to determine user: " +
-                 (user.isError() ? user.error() : "username not found"));
-  }
-
-  if (user.get() != "root") {
-    return Error("BindBackend requires root privileges");
-  }
-
-  return Owned<Backend>(new BindBackend(
-      Owned<BindBackendProcess>(new BindBackendProcess())));
-}
-
-
-BindBackend::~BindBackend()
-{
-  terminate(process.get());
-  wait(process.get());
-}
-
-
-BindBackend::BindBackend(Owned<BindBackendProcess> _process)
-  : process(_process)
-{
-  spawn(CHECK_NOTNULL(process.get()));
-}
-
-
-Future<Nothing> BindBackend::provision(
-    const vector<string>& layers,
-    const string& rootfs)
-{
-  return dispatch(
-      process.get(), &BindBackendProcess::provision, layers, rootfs);
-}
-
-
-Future<bool> BindBackend::destroy(const string& rootfs)
-{
-  return dispatch(process.get(), &BindBackendProcess::destroy, rootfs);
-}
-
-
-Future<Nothing> BindBackendProcess::provision(
-    const vector<string>& layers,
-    const string& rootfs)
-{
-  if (layers.size() > 1) {
-    return Failure(
-        "Multiple layers are not supported by the bind backend");
-  }
-
-  if (layers.size() == 0) {
-    return Failure("No filesystem layer provided");
-  }
-
-  Try<Nothing> mkdir = os::mkdir(rootfs);
-  if (mkdir.isError()) {
-    return Failure("Failed to create container rootfs at " + rootfs);
-  }
-
-  // TODO(xujyan): Use MS_REC? Does any provisioner use mounts within
-  // its image store in a single layer?
-  Try<Nothing> mount = fs::mount(
-      layers.front(),
-      rootfs,
-      None(),
-      MS_BIND,
-      NULL);
-
-  if (mount.isError()) {
-    return Failure(
-        "Failed to bind mount rootfs '" + layers.front() +
-        "' to '" + rootfs + "': " + mount.error());
-  }
-
-  // And remount it read-only.
-  mount = fs::mount(
-      None(), // Ignored.
-      rootfs,
-      None(),
-      MS_BIND | MS_RDONLY | MS_REMOUNT,
-      NULL);
-
-  if (mount.isError()) {
-    return Failure(
-        "Failed to remount rootfs '" + rootfs + "' read-only: " +
-        mount.error());
-  }
-
-  // Mark the mount as shared+slave.
-  mount = fs::mount(
-      None(),
-      rootfs,
-      None(),
-      MS_SLAVE,
-      NULL);
-
-  if (mount.isError()) {
-    return Failure(
-        "Failed to mark mount '" + rootfs +
-        "' as a slave mount: " + mount.error());
-  }
-
-  mount = fs::mount(
-      None(),
-      rootfs,
-      None(),
-      MS_SHARED,
-      NULL);
-
-  if (mount.isError()) {
-    return Failure(
-        "Failed to mark mount '" + rootfs +
-        "' as a shared mount: " + mount.error());
-  }
-
-  return Nothing();
-}
-
-
-Future<bool> BindBackendProcess::destroy(const string& rootfs)
-{
-  Try<fs::MountInfoTable> mountTable = fs::MountInfoTable::read();
-
-  if (mountTable.isError()) {
-    return Failure("Failed to read mount table: " + mountTable.error());
-  }
-
-  foreach (const fs::MountInfoTable::Entry& entry, mountTable.get().entries) {
-    // TODO(xujyan): If MS_REC was used in 'provision()' we would need
-    // to check `strings::startsWith(entry.target, rootfs)` here to
-    // unmount all nested mounts.
-    if (entry.target == rootfs) {
-      // NOTE: This would fail if the rootfs is still in use.
-      Try<Nothing> unmount = fs::unmount(entry.target);
-      if (unmount.isError()) {
-        return Failure(
-            "Failed to destroy bind-mounted rootfs '" + rootfs + "': " +
-            unmount.error());
-      }
-
-      // TODO(jieyu): If 'rmdir' here returns EBUSY, we still return
-      // a success. This is currently possible because the parent
-      // mount of 'rootfs' might not be a shared mount. Thus,
-      // containers in different mount namespaces might hold extra
-      // references to this mount. It is OK to ignore the EBUSY error
-      // because the provisioner will later try to delete all the
-      // rootfses for the terminated containers.
-      if (::rmdir(rootfs.c_str()) != 0) {
-        string message =
-          "Failed to remove rootfs mount point '" + rootfs +
-          "': " + strerror(errno);
-
-        if (errno == EBUSY) {
-          LOG(ERROR) << message;
-          ++metrics.remove_rootfs_errors;
-        } else {
-          return Failure(message);
-        }
-      }
-
-      return true;
-    }
-  }
-
-  return false;
-}
-
-
-BindBackendProcess::Metrics::Metrics()
-  : remove_rootfs_errors(
-      "containerizer/mesos/provisioner/bind/remove_rootfs_errors")
-{
-  process::metrics::add(remove_rootfs_errors);
-}
-
-
-BindBackendProcess::Metrics::~Metrics()
-{
-  process::metrics::remove(remove_rootfs_errors);
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/backends/bind.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/backends/bind.hpp b/src/slave/containerizer/provisioner/backends/bind.hpp
deleted file mode 100644
index 1685938..0000000
--- a/src/slave/containerizer/provisioner/backends/bind.hpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_BACKENDS_BIND_HPP__
-#define __PROVISIONER_BACKENDS_BIND_HPP__
-
-#include "slave/containerizer/provisioner/backend.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// Forward declaration.
-class BindBackendProcess;
-
-
-// This is a specialized backend that may be useful for deployments
-// using large (multi-GB) single-layer images *and* where more recent
-// kernel features such as overlayfs are not available (overlayfs-based
-// backend tracked by MESOS-2971). For small images (10's to 100's of MB)
-// the copy backend may be sufficient. NOTE:
-// 1) BindBackend supports only a single layer. Multi-layer images will
-//    fail to provision and the container will fail to launch!
-// 2) The filesystem is read-only because all containers using this
-//    image share the source. Select writable areas can be achieved by
-//    mounting read-write volumes to places like /tmp, /var/tmp,
-//    /home, etc. using the ContainerInfo. These can be relative to
-//    the executor work directory.
-//    N.B. Since the filesystem is read-only, '--sandbox_directory' must
-//    already exist within the filesystem because the filesystem isolator
-//    is unable to create it!
-// 3) It's fast because the bind mount requires (nearly) zero IO.
-class BindBackend : public Backend
-{
-public:
-  virtual ~BindBackend();
-
-  // BindBackend doesn't use any flag.
-  static Try<process::Owned<Backend>> create(const Flags&);
-
-  virtual process::Future<Nothing> provision(
-      const std::vector<std::string>& layers,
-      const std::string& rootfs);
-
-  virtual process::Future<bool> destroy(const std::string& rootfs);
-
-private:
-  explicit BindBackend(process::Owned<BindBackendProcess> process);
-
-  BindBackend(const BindBackend&); // Not copyable.
-  BindBackend& operator=(const BindBackend&); // Not assignable.
-
-  process::Owned<BindBackendProcess> process;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_BACKENDS_BIND_HPP__

[11/12] mesos git commit: Relocated MesosContainerizer specific files to the correct location.

Posted by ji...@apache.org.

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/cgroups/perf_event.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/cgroups/perf_event.hpp b/src/slave/containerizer/isolators/cgroups/perf_event.hpp
deleted file mode 100644
index 1f722ef..0000000
--- a/src/slave/containerizer/isolators/cgroups/perf_event.hpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PERF_EVENT_ISOLATOR_HPP__
-#define __PERF_EVENT_ISOLATOR_HPP__
-
-#include <set>
-
-#include <process/time.hpp>
-
-#include <stout/hashmap.hpp>
-#include <stout/nothing.hpp>
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-class CgroupsPerfEventIsolatorProcess : public MesosIsolatorProcess
-{
-public:
-  static Try<mesos::slave::Isolator*> create(const Flags& flags);
-
-  virtual ~CgroupsPerfEventIsolatorProcess();
-
-  virtual process::Future<Nothing> recover(
-      const std::list<mesos::slave::ContainerState>& states,
-      const hashset<ContainerID>& orphans);
-
-  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& user);
-
-  virtual process::Future<Nothing> isolate(
-      const ContainerID& containerId,
-      pid_t pid);
-
-  virtual process::Future<mesos::slave::ContainerLimitation> watch(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> update(
-      const ContainerID& containerId,
-      const Resources& resources);
-
-  virtual process::Future<ResourceStatistics> usage(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> cleanup(
-      const ContainerID& containerId);
-
-protected:
-  virtual void initialize();
-
-private:
-  CgroupsPerfEventIsolatorProcess(
-      const Flags& _flags,
-      const std::string& _hierarchy,
-      const std::set<std::string>& _events)
-    : flags(_flags),
-      hierarchy(_hierarchy),
-      events(_events) {}
-
-  void sample();
-
-  void _sample(
-      const process::Time& next,
-      const process::Future<hashmap<std::string, PerfStatistics>>& statistics);
-
-  virtual process::Future<Nothing> _cleanup(const ContainerID& containerId);
-
-  struct Info
-  {
-    Info(const ContainerID& _containerId, const std::string& _cgroup)
-      : containerId(_containerId), cgroup(_cgroup), destroying(false)
-    {
-      // Ensure the initial statistics include the required fields.
-      // Note the duration is set to zero to indicate no sampling has
-      // taken place. This empty sample will be returned from usage()
-      // until the first true sample is obtained.
-      statistics.set_timestamp(process::Clock::now().secs());
-      statistics.set_duration(Seconds(0).secs());
-    }
-
-    const ContainerID containerId;
-    const std::string cgroup;
-    PerfStatistics statistics;
-    // Mark a container when we start destruction so we stop sampling it.
-    bool destroying;
-  };
-
-  const Flags flags;
-
-  // The path to the cgroups subsystem hierarchy root.
-  const std::string hierarchy;
-
-  // Set of events to sample.
-  std::set<std::string> events;
-
-  // TODO(jieyu): Use Owned<Info>.
-  hashmap<ContainerID, Info*> infos;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PERF_EVENT_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/filesystem/linux.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/filesystem/linux.cpp b/src/slave/containerizer/isolators/filesystem/linux.cpp
deleted file mode 100644
index 8823b78..0000000
--- a/src/slave/containerizer/isolators/filesystem/linux.cpp
+++ /dev/null
@@ -1,893 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <list>
-#include <sstream>
-#include <string>
-
-#include <glog/logging.h>
-
-#include <process/collect.hpp>
-
-#include <process/metrics/metrics.hpp>
-
-#include <stout/error.hpp>
-#include <stout/foreach.hpp>
-#include <stout/os.hpp>
-#include <stout/path.hpp>
-#include <stout/stringify.hpp>
-#include <stout/strings.hpp>
-
-#include <stout/os/shell.hpp>
-
-#include "linux/fs.hpp"
-#include "linux/ns.hpp"
-
-#include "slave/paths.hpp"
-
-#include "slave/containerizer/isolators/filesystem/linux.hpp"
-
-using namespace process;
-
-using std::list;
-using std::ostringstream;
-using std::string;
-
-using mesos::slave::ContainerState;
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::Isolator;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-Try<Isolator*> LinuxFilesystemIsolatorProcess::create(
-    const Flags& flags,
-    const Owned<Provisioner>& provisioner)
-{
-  Result<string> user = os::user();
-  if (!user.isSome()) {
-    return Error("Failed to determine user: " +
-                 (user.isError() ? user.error() : "username not found"));
-  }
-
-  if (user.get() != "root") {
-    return Error("LinuxFilesystemIsolator requires root privileges");
-  }
-
-  // Make slave's work_dir a shared mount so that when forking a child
-  // process (with a new mount namespace), the child process does not
-  // hold extra references to container's work directory mounts and
-  // provisioner mounts (e.g., when using the bind backend) because
-  // cleanup operations within work_dir can be propagated to all
-  // container namespaces. See MESOS-3483 for more details.
-  LOG(INFO) << "Making '" << flags.work_dir << "' a shared mount";
-
-  Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
-  if (table.isError()) {
-    return Error("Failed to get mount table: " + table.error());
-  }
-
-  Option<fs::MountInfoTable::Entry> workDirMount;
-  foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
-    // TODO(jieyu): Make sure 'flags.work_dir' is a canonical path.
-    if (entry.target == flags.work_dir) {
-      workDirMount = entry;
-      break;
-    }
-  }
-
-  // Do a self bind mount if needed. If the mount already exists, make
-  // sure it is a shared mount of its own peer group.
-  if (workDirMount.isNone()) {
-    // NOTE: Instead of using fs::mount to perform the bind mount, we
-    // use the shell command here because the syscall 'mount' does not
-    // update the mount table (i.e., /etc/mtab). In other words, the
-    // mount will not be visible if the operator types command
-    // 'mount'. Since this mount will still be present after all
-    // containers and the slave are stopped, it's better to make it
-    // visible. It's OK to use the blocking os::shell here because
-    // 'create' will only be invoked during initialization.
-    Try<string> mount = os::shell(
-        "mount --bind %s %s && "
-        "mount --make-slave %s && "
-        "mount --make-shared %s",
-        flags.work_dir.c_str(),
-        flags.work_dir.c_str(),
-        flags.work_dir.c_str(),
-        flags.work_dir.c_str());
-
-    if (mount.isError()) {
-      return Error(
-          "Failed to self bind mount '" + flags.work_dir +
-          "' and make it a shared mount: " + mount.error());
-    }
-  } else {
-    if (workDirMount.get().shared().isNone()) {
-      // This is the case where the work directory mount is not a
-      // shared mount yet (possibly due to slave crash while preparing
-      // the work directory mount). It's safe to re-do the following.
-      Try<string> mount = os::shell(
-          "mount --make-slave %s && "
-          "mount --make-shared %s",
-          flags.work_dir.c_str(),
-          flags.work_dir.c_str());
-
-      if (mount.isError()) {
-        return Error(
-            "Failed to self bind mount '" + flags.work_dir +
-            "' and make it a shared mount: " + mount.error());
-      }
-    } else {
-      // We need to make sure that the shared mount is in its own peer
-      // group. To check that, we need to get the parent mount.
-      foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
-        if (entry.id == workDirMount.get().parent) {
-          // If the work directory mount and its parent mount are in
-          // the same peer group, we need to re-do the following
-          // commands so that they are in different peer groups.
-          if (entry.shared() == workDirMount.get().shared()) {
-            Try<string> mount = os::shell(
-                "mount --make-slave %s && "
-                "mount --make-shared %s",
-                flags.work_dir.c_str(),
-                flags.work_dir.c_str());
-
-            if (mount.isError()) {
-              return Error(
-                  "Failed to self bind mount '" + flags.work_dir +
-                  "' and make it a shared mount: " + mount.error());
-            }
-          }
-
-          break;
-        }
-      }
-    }
-  }
-
-  Owned<MesosIsolatorProcess> process(
-      new LinuxFilesystemIsolatorProcess(flags, provisioner));
-
-  return new MesosIsolator(process);
-}
-
-
-LinuxFilesystemIsolatorProcess::LinuxFilesystemIsolatorProcess(
-    const Flags& _flags,
-    const Owned<Provisioner>& _provisioner)
-  : flags(_flags),
-    provisioner(_provisioner),
-    metrics(PID<LinuxFilesystemIsolatorProcess>(this)) {}
-
-
-LinuxFilesystemIsolatorProcess::~LinuxFilesystemIsolatorProcess() {}
-
-
-Future<Nothing> LinuxFilesystemIsolatorProcess::recover(
-    const list<ContainerState>& states,
-    const hashset<ContainerID>& orphans)
-{
-  // Read the mount table in the host mount namespace to recover paths
-  // to containers' work directories if their root filesystems are
-  // changed. Method 'cleanup()' relies on this information to clean
-  // up mounts in the host mount namespace for each container.
-  Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
-  if (table.isError()) {
-    return Failure("Failed to get mount table: " + table.error());
-  }
-
-  foreach (const ContainerState& state, states) {
-    Owned<Info> info(new Info(state.directory()));
-
-    foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
-      if (entry.root == info->directory) {
-        info->sandbox = entry.target;
-        break;
-      }
-    }
-
-    infos.put(state.container_id(), info);
-  }
-
-  // Recover both known and unknown orphans by scanning the mount
-  // table and finding those mounts whose roots are under slave's
-  // sandbox root directory. Those mounts are container's work
-  // directory mounts. Mounts from unknown orphans will be cleaned up
-  // immediately. Mounts from known orphans will be cleaned up when
-  // those known orphan containers are being destroyed by the slave.
-  hashset<ContainerID> unknownOrphans;
-
-  string sandboxRootDir = paths::getSandboxRootDir(flags.work_dir);
-
-  foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
-    if (!strings::startsWith(entry.root, sandboxRootDir)) {
-      continue;
-    }
-
-    // TODO(jieyu): Here, we retrieve the container ID by taking the
-    // basename of 'entry.root'. This assumes that the slave's sandbox
-    // root directory is organized according to the comments in the
-    // beginning of slave/paths.hpp.
-    ContainerID containerId;
-    containerId.set_value(Path(entry.root).basename());
-
-    if (infos.contains(containerId)) {
-      continue;
-    }
-
-    Owned<Info> info(new Info(entry.root));
-
-    if (entry.root != entry.target) {
-      info->sandbox = entry.target;
-    }
-
-    infos.put(containerId, info);
-
-    // Remember all the unknown orphan containers.
-    if (!orphans.contains(containerId)) {
-      unknownOrphans.insert(containerId);
-    }
-  }
-
-  // Cleanup mounts from unknown orphans.
-  list<Future<Nothing>> futures;
-  foreach (const ContainerID& containerId, unknownOrphans) {
-    futures.push_back(cleanup(containerId));
-  }
-
-  return collect(futures)
-    .then(defer(PID<LinuxFilesystemIsolatorProcess>(this),
-                &LinuxFilesystemIsolatorProcess::_recover,
-                states,
-                orphans));
-}
-
-
-Future<Nothing> LinuxFilesystemIsolatorProcess::_recover(
-    const list<ContainerState>& states,
-    const hashset<ContainerID>& orphans)
-{
-  return provisioner->recover(states, orphans)
-    .then([]() -> Future<Nothing> { return Nothing(); });
-}
-
-
-Future<Option<ContainerPrepareInfo>> LinuxFilesystemIsolatorProcess::prepare(
-    const ContainerID& containerId,
-    const ExecutorInfo& executorInfo,
-    const string& directory,
-    const Option<string>& user)
-{
-  if (infos.contains(containerId)) {
-    return Failure("Container has already been prepared");
-  }
-
-  infos.put(containerId, Owned<Info>(new Info(directory)));
-
-  if (!executorInfo.has_container()) {
-    return __prepare(containerId, executorInfo, directory, user, None());
-  }
-
-  // Provision the root filesystem if the container specifies an image.
-  CHECK_EQ(executorInfo.container().type(), ContainerInfo::MESOS);
-
-  if (!executorInfo.container().mesos().has_image()) {
-    return _prepare(containerId, executorInfo, directory, user, None());
-  }
-
-  const Image& image = executorInfo.container().mesos().image();
-
-  return provisioner->provision(containerId, image)
-    .then(defer(PID<LinuxFilesystemIsolatorProcess>(this),
-                &LinuxFilesystemIsolatorProcess::_prepare,
-                containerId,
-                executorInfo,
-                directory,
-                user,
-                lambda::_1));
-}
-
-
-Future<Option<ContainerPrepareInfo>> LinuxFilesystemIsolatorProcess::_prepare(
-    const ContainerID& containerId,
-    const ExecutorInfo& executorInfo,
-    const string& directory,
-    const Option<string>& user,
-    const Option<string>& rootfs)
-{
-  CHECK(executorInfo.has_container());
-  CHECK_EQ(executorInfo.container().type(), ContainerInfo::MESOS);
-
-  // We will provision the images specified in ContainerInfo::volumes
-  // as well. We work on a copy of 'executorInfo' and, for each volume
-  // that specifies an image as the source, set 'host_path' in that
-  // copy to the path of the provisioned root filesystem.
-  Owned<ExecutorInfo> _executorInfo(new ExecutorInfo(executorInfo));
-  list<Future<Nothing>> futures;
-
-  for (int i = 0; i < _executorInfo->container().volumes_size(); i++) {
-    Volume* volume = _executorInfo->mutable_container()->mutable_volumes(i);
-
-    if (!volume->has_image()) {
-      continue;
-    }
-
-    const Image& image = volume->image();
-
-    futures.push_back(
-        provisioner->provision(containerId, image)
-          .then([volume](const string& path) -> Future<Nothing> {
-            volume->set_host_path(path);
-            return Nothing();
-          }));
-  }
-
-  return collect(futures)
-    .then([=]() -> Future<Option<ContainerPrepareInfo>> {
-      return __prepare(containerId, *_executorInfo, directory, user, rootfs);
-    });
-}
-
-
-Future<Option<ContainerPrepareInfo>> LinuxFilesystemIsolatorProcess::__prepare(
-    const ContainerID& containerId,
-    const ExecutorInfo& executorInfo,
-    const string& directory,
-    const Option<string>& user,
-    const Option<string>& rootfs)
-{
-  CHECK(infos.contains(containerId));
-
-  const Owned<Info>& info = infos[containerId];
-
-  ContainerPrepareInfo prepareInfo;
-  prepareInfo.set_namespaces(CLONE_NEWNS);
-
-  if (rootfs.isSome()) {
-    // If the container changes its root filesystem, we need to mount
-    // the container's work directory into its root filesystem
-    // (creating it if needed) so that the executor and the task can
-    // access the work directory.
-    //
-    // NOTE: The mount of the work directory must be a shared mount in
-    // the host filesystem so that any mounts underneath it will
-    // propagate into the container's mount namespace. This is how we
-    // can update persistent volumes for the container.
-
-    // This is the mount point of the work directory in the root filesystem.
-    const string sandbox = path::join(rootfs.get(), flags.sandbox_directory);
-
-    // Save the path 'sandbox'; 'update()' and 'cleanup()' read it later.
-    info->sandbox = sandbox;
-
-    if (!os::exists(sandbox)) {
-      Try<Nothing> mkdir = os::mkdir(sandbox);
-      if (mkdir.isError()) {
-        return Failure(
-            "Failed to create sandbox mount point at '" +
-            sandbox + "': " + mkdir.error());
-      }
-    }
-
-    LOG(INFO) << "Bind mounting work directory from '" << directory
-              << "' to '" << sandbox << "' for container " << containerId;
-
-    Try<Nothing> mount = fs::mount(
-        directory,
-        sandbox,
-        None(),
-        MS_BIND,
-        NULL);
-
-    if (mount.isError()) {
-      return Failure(
-          "Failed to mount work directory '" + directory +
-          "' to '" + sandbox + "': " + mount.error());
-    }
-
-    mount = fs::mount(
-        None(),
-        sandbox,
-        None(),
-        MS_SLAVE,
-        NULL);
-
-    if (mount.isError()) {
-      return Failure(
-          "Failed to mark sandbox '" + sandbox +
-          "' as a slave mount: " + mount.error());
-    }
-
-    mount = fs::mount(
-        None(),
-        sandbox,
-        None(),
-        MS_SHARED,
-        NULL);
-
-    if (mount.isError()) {
-      return Failure(
-          "Failed to mark sandbox '" + sandbox +
-          "' as a shared mount: " + mount.error());
-    }
-
-    prepareInfo.set_rootfs(rootfs.get());
-  }
-
-  // Prepare the commands that will be run in the container's mount
-  // namespace right after forking the executor process. We use these
-  // commands to mount those volumes specified in the container info
-  // so that they don't pollute the host mount namespace.
-  Try<string> _script = script(containerId, executorInfo, directory, rootfs);
-  if (_script.isError()) {
-    return Failure("Failed to generate isolation script: " + _script.error());
-  }
-
-  CommandInfo* command = prepareInfo.add_commands();
-  command->set_value(_script.get());
-
-  return update(containerId, executorInfo.resources())
-    .then([prepareInfo]() -> Future<Option<ContainerPrepareInfo>> {
-      return prepareInfo;
-    });
-}
-
-
-Try<string> LinuxFilesystemIsolatorProcess::script(
-    const ContainerID& containerId,
-    const ExecutorInfo& executorInfo,
-    const string& directory,
-    const Option<string>& rootfs)
-{
-  ostringstream out;
-  out << "#!/bin/sh\n";
-  out << "set -x -e\n";
-
-  // Make sure mounts in the container mount namespace do not
-  // propagate back to the host mount namespace.
-  out << "mount --make-rslave /\n";
-
-  // Try to unmount work directory mounts and persistent volume mounts
-  // for other containers to release the extra references to them.
-  // NOTE:
-  // 1) This doesn't completely eliminate the race condition between
-  //    this container copying mount table and other containers being
-  //    cleaned up. This is instead a best-effort attempt.
-  // 2) This script assumes that all the mounts the container needs
-  //    under the slave work directory have its container ID in the
-  //    path either for the mount source (e.g. sandbox self-bind mount)
-  //    or the mount target (e.g. mounting sandbox into new rootfs).
-  //
-  // TODO(xujyan): This command may fail if --work_dir is not specified
-  // with a real path as real paths are used in the mount table. It
-  // doesn't work when the paths contain reserved characters such as
-  // spaces either because such characters in mount info are encoded
-  // in the escaped form (i.e. '\0xx').
-  out << "grep -E '" << flags.work_dir << "/.+' /proc/self/mountinfo | "
-      << "grep -v '" << containerId.value() << "' | "
-      << "cut -d' ' -f5 | " // '-f5' is the mount target. See MountInfoTable.
-      << "xargs --no-run-if-empty umount -l || "
-      << "true \n"; // We mask errors in this command.
-
-  if (!executorInfo.has_container()) {
-    return out.str();
-  }
-
-  foreach (const Volume& volume, executorInfo.container().volumes()) {
-    if (!volume.has_host_path()) {
-      return Error("A volume misses 'host_path'");
-    }
-
-    // If both 'host_path' and 'container_path' are relative paths,
-    // return an error because the user can just directly access the
-    // volume in the work directory.
-    if (!strings::startsWith(volume.host_path(), "/") &&
-        !strings::startsWith(volume.container_path(), "/")) {
-      return Error(
-          "Both 'host_path' and 'container_path' of a volume are relative");
-    }
-
-    // Determine the source of the mount.
-    string source;
-
-    if (strings::startsWith(volume.host_path(), "/")) {
-      source = volume.host_path();
-
-      // An absolute path must already exist.
-      if (!os::exists(source)) {
-        return Error("Absolute host path does not exist");
-      }
-    } else {
-      // Path is interpreted as relative to the work directory.
-      source = path::join(directory, volume.host_path());
-
-      // TODO(jieyu): We need to check that source resolves under the
-      // work directory because a user can potentially use a host
-      // path like '../../abc'.
-
-      if (!os::exists(source)) {
-        Try<Nothing> mkdir = os::mkdir(source);
-        if (mkdir.isError()) {
-          return Error(
-              "Failed to create the source of the mount at '" +
-              source + "': " + mkdir.error());
-        }
-
-        // TODO(idownes): Consider setting ownership and mode.
-      }
-    }
-
-    // Determine the target of the mount.
-    string target;
-
-    if (strings::startsWith(volume.container_path(), "/")) {
-      if (rootfs.isSome()) {
-        target = path::join(rootfs.get(), volume.container_path());
-      } else {
-        target = volume.container_path();
-      }
-
-      // An absolute path must already exist. This is because we want
-      // to avoid creating mount points outside the work directory in
-      // the host filesystem or in the container filesystem root.
-      if (!os::exists(target)) {
-        return Error("Absolute container path does not exist");
-      }
-
-      // TODO(jieyu): We need to check that target resolves under
-      // 'rootfs' because a user can potentially use a container path
-      // like '/../../abc'.
-    } else {
-      if (rootfs.isSome()) {
-        target = path::join(rootfs.get(),
-                            flags.sandbox_directory,
-                            volume.container_path());
-      } else {
-        target = path::join(directory, volume.container_path());
-      }
-
-      // TODO(jieyu): We need to check that target resolves under the
-      // sandbox because a user can potentially use a container path
-      // like '../../abc'.
-
-      if (!os::exists(target)) {
-        Try<Nothing> mkdir = os::mkdir(target);
-        if (mkdir.isError()) {
-          return Error(
-              "Failed to create the target of the mount at '" +
-              target + "': " + mkdir.error());
-        }
-      }
-    }
-
-    // TODO(jieyu): Consider the mode in the volume.
-    out << "mount -n --rbind '" << source << "' '" << target << "'\n";
-  }
-
-  return out.str();
-}
-
-
-Future<Nothing> LinuxFilesystemIsolatorProcess::isolate(
-    const ContainerID& containerId,
-    pid_t pid)
-{
-  // No-op (arguments unused); isolation happens on mount namespace unshare.
-  return Nothing();
-}
-
-
-Future<ContainerLimitation> LinuxFilesystemIsolatorProcess::watch(
-    const ContainerID& containerId)
-{
-  // No-op: returns a default-constructed future for any container.
-  return Future<ContainerLimitation>();
-}
-
-
-Future<Nothing> LinuxFilesystemIsolatorProcess::update(
-    const ContainerID& containerId,
-    const Resources& resources)
-{
-  // Mount persistent volumes. We do this in the host namespace and
-  // rely on mount propagation for them to be visible inside the
-  // container.
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  const Owned<Info>& info = infos[containerId];
-
-  Resources current = info->resources;
-
-  // We first remove unneeded persistent volumes.
-  foreach (const Resource& resource, current.persistentVolumes()) {
-    // This is enforced by the master.
-    CHECK(resource.disk().has_volume());
-
-    // Ignore absolute and nested paths.
-    const string& containerPath = resource.disk().volume().container_path();
-    if (strings::contains(containerPath, "/")) {
-      LOG(WARNING) << "Skipping updating mount for persistent volume "
-                   << resource << " of container " << containerId
-                   << " because the container path '" << containerPath
-                   << "' contains slash";
-      continue;
-    }
-
-    if (resources.contains(resource)) {
-      continue;
-    }
-
-    // Determine the target of the mount.
-    string target;
-
-    if (info->sandbox.isSome()) {
-      target = path::join(info->sandbox.get(), containerPath);
-    } else {
-      target = path::join(info->directory, containerPath);
-    }
-
-    LOG(INFO) << "Removing mount '" << target << "' for persistent volume "
-              << resource << " of container " << containerId;
-
-    // The unmount will fail if the task/executor is still using files
-    // or directories under 'target'.
-    Try<Nothing> unmount = fs::unmount(target);
-    if (unmount.isError()) {
-      return Failure(
-          "Failed to unmount unneeded persistent volume at '" +
-          target + "': " + unmount.error());
-    }
-
-    // NOTE: This is a non-recursive rmdir.
-    Try<Nothing> rmdir = os::rmdir(target, false);
-    if (rmdir.isError()) {
-      return Failure(
-          "Failed to remove persistent volume mount point at '" +
-          target + "': " + rmdir.error());
-    }
-  }
-
-  // We then mount new persistent volumes.
-  foreach (const Resource& resource, resources.persistentVolumes()) {
-    // This is enforced by the master.
-    CHECK(resource.disk().has_volume());
-
-    // Ignore absolute and nested paths.
-    const string& containerPath = resource.disk().volume().container_path();
-    if (strings::contains(containerPath, "/")) {
-      LOG(WARNING) << "Skipping updating mount for persistent volume "
-                   << resource << " of container " << containerId
-                   << " because the container path '" << containerPath
-                   << "' contains slash";
-      continue;
-    }
-
-    if (current.contains(resource)) {
-      continue;
-    }
-
-    // Determine the source of the mount.
-    string source = paths::getPersistentVolumePath(
-        flags.work_dir,
-        resource.role(),
-        resource.disk().persistence().id());
-
-    // Set the ownership of the persistent volume to match that of the
-    // sandbox directory.
-    //
-    // NOTE: Currently, persistent volumes in Mesos are exclusive,
-    // meaning that if a persistent volume is used by one task or
-    // executor, it cannot be concurrently used by another task or
-    // executor. But if we allow multiple executors to use the same
-    // persistent volume at the same time in the future, the ownership
-    // of the persistent volume may conflict here.
-    //
-    // TODO(haosdent): Consider letting the frameworks specify the
-    // user/group of the persistent volumes.
-    struct stat s;
-    if (::stat(info->directory.c_str(), &s) < 0) {
-      return Failure(
-          "Failed to get ownership for '" + info->directory +
-          "': " + strerror(errno));
-    }
-
-    LOG(INFO) << "Changing the ownership of the persistent volume at '"
-              << source << "' with uid " << s.st_uid
-              << " and gid " << s.st_gid;
-
-    Try<Nothing> chown = os::chown(s.st_uid, s.st_gid, source, true);
-    if (chown.isError()) {
-      return Failure(
-          "Failed to change the ownership of the persistent volume at '" +
-          source + "' with uid " + stringify(s.st_uid) +
-          " and gid " + stringify(s.st_gid) + ": " + chown.error());
-    }
-
-    // Determine the target of the mount.
-    string target;
-
-    if (info->sandbox.isSome()) {
-      target = path::join(info->sandbox.get(), containerPath);
-    } else {
-      target = path::join(info->directory, containerPath);
-    }
-
-    if (os::exists(target)) {
-      // NOTE: This is possible because 'info->resources' will be
-      // reset when slave restarts and recovers. When the slave calls
-      // 'containerizer->update' after the executor re-registers,
-      // we'll try to re-mount all the already mounted volumes.
-
-      // TODO(jieyu): Check the source of the mount matches the entry
-      // with the same target in the mount table if one can be found.
-      // If not, mount the persistent volume as we did below. This is
-      // possible because the slave could crash after it unmounts the
-      // volume but before it is able to delete the mount point.
-    } else {
-      Try<Nothing> mkdir = os::mkdir(target);
-      if (mkdir.isError()) {
-        return Failure(
-            "Failed to create persistent volume mount point at '" +
-            target + "': " + mkdir.error());
-      }
-
-      LOG(INFO) << "Mounting '" << source << "' to '" << target
-                << "' for persistent volume " << resource
-                << " of container " << containerId;
-
-      Try<Nothing> mount = fs::mount(source, target, None(), MS_BIND, NULL);
-      if (mount.isError()) {
-        return Failure(
-            "Failed to mount persistent volume from '" +
-            source + "' to '" + target + "': " + mount.error());
-      }
-    }
-  }
-
-  // Store the new resources.
-  info->resources = resources;
-
-  return Nothing();
-}
-
-
-Future<ResourceStatistics> LinuxFilesystemIsolatorProcess::usage(
-    const ContainerID& containerId)
-{
-  // No-op: no usage is gathered; returns default-constructed statistics.
-  return ResourceStatistics();
-}
-
-
-Future<Nothing> LinuxFilesystemIsolatorProcess::cleanup(
-    const ContainerID& containerId)
-{
-  if (!infos.contains(containerId)) {
-    VLOG(1) << "Ignoring cleanup request for unknown container: "
-            << containerId;
-
-    return Nothing();
-  }
-
-  const Owned<Info>& info = infos[containerId];
-
-  // NOTE: We don't need to cleanup mounts in the container's mount
-  // namespace because it's done automatically by the kernel when the
-  // mount namespace is destroyed after the last process terminates.
-
-  // The path to the container's work directory which is the parent of
-  // all the persistent volume mounts.
-  string sandbox;
-
-  if (info->sandbox.isSome()) {
-    sandbox = info->sandbox.get();
-  } else {
-    sandbox = info->directory;
-  }
-
-  infos.erase(containerId);
-
-  // Cleanup the mounts for this container in the host mount
-  // namespace, including container's work directory and all the
-  // persistent volume mounts.
-  Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
-  if (table.isError()) {
-    return Failure("Failed to get mount table: " + table.error());
-  }
-
-  bool sandboxMountExists = false;
-
-  foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
-    // NOTE: All persistent volumes are mounted at targets under the
-    // container's work directory. We unmount all the persistent
-    // volumes before unmounting the sandbox/work directory mount.
-    if (entry.target == sandbox) {
-      sandboxMountExists = true;
-    } else if (strings::startsWith(entry.target, sandbox)) {
-      LOG(INFO) << "Unmounting volume '" << entry.target
-                << "' for container " << containerId;
-
-      Try<Nothing> unmount = fs::unmount(entry.target);
-      if (unmount.isError()) {
-        return Failure(
-            "Failed to unmount volume '" + entry.target +
-            "': " + unmount.error());
-      }
-    }
-  }
-
-  if (!sandboxMountExists) {
-    // This could happen if the container was not launched by this
-    // isolator (e.g., slaves prior to 0.25.0), or the container did
-    // not specify a root filesystem.
-    LOG(INFO) << "Ignoring unmounting sandbox/work directory"
-              << " for container " << containerId;
-  } else {
-    LOG(INFO) << "Unmounting sandbox/work directory '" << sandbox
-              << "' for container " << containerId;
-
-    Try<Nothing> unmount = fs::unmount(sandbox);
-    if (unmount.isError()) {
-      return Failure(
-          "Failed to unmount sandbox/work directory '" + sandbox +
-          "': " + unmount.error());
-    }
-  }
-
-  // Destroy the provisioned root filesystems.
-  return provisioner->destroy(containerId)
-    .then([]() -> Future<Nothing> { return Nothing(); });
-}
-
-
-LinuxFilesystemIsolatorProcess::Metrics::Metrics(
-    const PID<LinuxFilesystemIsolatorProcess>& isolator)
-  : containers_new_rootfs(
-      "containerizer/mesos/filesystem/containers_new_rootfs",
-      defer(isolator, &LinuxFilesystemIsolatorProcess::_containers_new_rootfs))
-{
-  process::metrics::add(containers_new_rootfs); // Removed in '~Metrics()'.
-}
-
-
-LinuxFilesystemIsolatorProcess::Metrics::~Metrics()
-{
-  process::metrics::remove(containers_new_rootfs); // Added in 'Metrics()'.
-}
-
-
-double LinuxFilesystemIsolatorProcess::_containers_new_rootfs()
-{
-  double count = 0.0;
-
-  foreachvalue (const Owned<Info>& info, infos) {
-    if (info->sandbox.isSome()) { // Container changed its root filesystem.
-      ++count;
-    }
-  }
-
-  return count;
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/filesystem/linux.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/filesystem/linux.hpp b/src/slave/containerizer/isolators/filesystem/linux.hpp
deleted file mode 100644
index 93e85f2..0000000
--- a/src/slave/containerizer/isolators/filesystem/linux.hpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __LINUX_FILESYSTEM_ISOLATOR_HPP__
-#define __LINUX_FILESYSTEM_ISOLATOR_HPP__
-
-#include <mesos/mesos.hpp>
-#include <mesos/resources.hpp>
-
-#include <process/owned.hpp>
-#include <process/pid.hpp>
-
-#include <process/metrics/gauge.hpp>
-
-#include <stout/hashmap.hpp>
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-#include "slave/containerizer/provisioner/provisioner.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// The filesystem isolator on Linux that is responsible for preparing
-// the root filesystems and volumes (e.g., persistent volumes) for
-// containers. It relies on Linux mount namespaces to prevent mounts of
-// a container from being propagated to the host mount table.
-class LinuxFilesystemIsolatorProcess : public MesosIsolatorProcess
-{
-public:
-  static Try<mesos::slave::Isolator*> create(
-      const Flags& flags,
-      const process::Owned<Provisioner>& provisioner);
-
-  virtual ~LinuxFilesystemIsolatorProcess();
-
-  virtual process::Future<Nothing> recover(
-      const std::list<mesos::slave::ContainerState>& states,
-      const hashset<ContainerID>& orphans);
-
-  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& user);
-
-  virtual process::Future<Nothing> isolate(
-      const ContainerID& containerId,
-      pid_t pid);
-
-  virtual process::Future<mesos::slave::ContainerLimitation> watch(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> update(
-      const ContainerID& containerId,
-      const Resources& resources);
-
-  virtual process::Future<ResourceStatistics> usage(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> cleanup(
-      const ContainerID& containerId);
-
-private:
-  LinuxFilesystemIsolatorProcess(
-      const Flags& flags,
-      const process::Owned<Provisioner>& provisioner);
-
-  process::Future<Nothing> _recover(
-      const std::list<mesos::slave::ContainerState>& states,
-      const hashset<ContainerID>& orphans);
-
-  process::Future<Option<mesos::slave::ContainerPrepareInfo>> _prepare(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& user,
-      const Option<std::string>& rootfs);
-
-  process::Future<Option<mesos::slave::ContainerPrepareInfo>> __prepare(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& user,
-      const Option<std::string>& rootfs);
-
-  Try<std::string> script(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& rootfs);
-
-  const Flags flags;
-  const process::Owned<Provisioner> provisioner;
-
-  struct Info
-  {
-    Info(const std::string& _directory) : directory(_directory) {}
-
-    const std::string directory; // The container's work directory.
-
-    // The absolute path to the container's work directory mount point
-    // in the new root filesystem if the container changes its root
-    // filesystem (i.e., '<rootfs>/<flags.sandbox_directory>'). If the
-    // container does not specify a root filesystem, this field will
-    // not be set.
-    Option<std::string> sandbox;
-
-    // Track resources so we can unmount unneeded persistent volumes.
-    Resources resources;
-  };
-
-  hashmap<ContainerID, process::Owned<Info>> infos;
-
-  struct Metrics
-  {
-    explicit Metrics(
-        const process::PID<LinuxFilesystemIsolatorProcess>& isolator);
-    ~Metrics();
-
-    process::metrics::Gauge containers_new_rootfs;
-  } metrics;
-
-  double _containers_new_rootfs(); // Callback for the gauge above.
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __LINUX_FILESYSTEM_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/filesystem/posix.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/filesystem/posix.cpp b/src/slave/containerizer/isolators/filesystem/posix.cpp
deleted file mode 100644
index eec510c..0000000
--- a/src/slave/containerizer/isolators/filesystem/posix.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <list>
-#include <string>
-
-#include <stout/fs.hpp>
-#include <stout/os.hpp>
-#include <stout/path.hpp>
-
-#include "slave/paths.hpp"
-
-#include "slave/containerizer/isolators/filesystem/posix.hpp"
-
-using namespace process;
-
-using std::list;
-using std::string;
-
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::ContainerState;
-using mesos::slave::Isolator;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-PosixFilesystemIsolatorProcess::PosixFilesystemIsolatorProcess(
-    const Flags& _flags)
-  : flags(_flags) {} // Only initializes 'flags'; no other setup.
-
-
-PosixFilesystemIsolatorProcess::~PosixFilesystemIsolatorProcess() {}
-
-
-Try<Isolator*> PosixFilesystemIsolatorProcess::create(const Flags& flags)
-{
-  process::Owned<MesosIsolatorProcess> process(
-      new PosixFilesystemIsolatorProcess(flags));
-
-  return new MesosIsolator(process); // Wraps the process as an Isolator.
-}
-
-
-Future<Nothing> PosixFilesystemIsolatorProcess::recover(
-    const list<ContainerState>& states,
-    const hashset<ContainerID>& orphans)
-{
-  foreach (const ContainerState& state, states) {
-    infos.put(state.container_id(), Owned<Info>(new Info(state.directory())));
-  }
-
-  return Nothing(); // 'orphans' is not used by this isolator.
-}
-
-
-Future<Option<ContainerPrepareInfo>> PosixFilesystemIsolatorProcess::prepare(
-    const ContainerID& containerId,
-    const ExecutorInfo& executorInfo,
-    const string& directory,
-    const Option<string>& user)
-{
-  if (infos.contains(containerId)) {
-    return Failure("Container has already been prepared");
-  }
-
-  // Return failure if the container changes the filesystem root
-  // because the symlinks will become invalid in the new root.
-  if (executorInfo.has_container()) {
-    CHECK_EQ(executorInfo.container().type(), ContainerInfo::MESOS);
-
-    if (executorInfo.container().mesos().has_image()) {
-      return Failure("Container root filesystems not supported");
-    }
-
-    // TODO(jieyu): Also return a failure if there exist images in
-    // the specified volumes.
-  }
-
-  infos.put(containerId, Owned<Info>(new Info(directory)));
-
-  return update(containerId, executorInfo.resources())
-      .then([]() -> Future<Option<ContainerPrepareInfo>> { return None(); });
-}
-
-
-Future<Nothing> PosixFilesystemIsolatorProcess::isolate(
-    const ContainerID& containerId,
-    pid_t pid)
-{
-  // No-op: 'containerId' and 'pid' are not used.
-  return Nothing();
-}
-
-
-Future<ContainerLimitation> PosixFilesystemIsolatorProcess::watch(
-    const ContainerID& containerId)
-{
-  // No-op: returns a default-constructed future for any container.
-  return Future<ContainerLimitation>();
-}
-
-
-Future<Nothing> PosixFilesystemIsolatorProcess::update(
-    const ContainerID& containerId,
-    const Resources& resources)
-{
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  const Owned<Info>& info = infos[containerId];
-
-  // TODO(jieyu): Currently, we only allow non-nested relative
-  // container paths for volumes. This is enforced by the master. For
-  // those volumes, we create symlinks in the executor directory.
-  Resources current = info->resources;
-
-  // We first remove unneeded persistent volumes.
-  foreach (const Resource& resource, current.persistentVolumes()) {
-    // This is enforced by the master.
-    CHECK(resource.disk().has_volume());
-
-    // Ignore absolute and nested paths.
-    const string& containerPath = resource.disk().volume().container_path();
-    if (strings::contains(containerPath, "/")) {
-      LOG(WARNING) << "Skipping updating symlink for persistent volume "
-                   << resource << " of container " << containerId
-                   << " because the container path '" << containerPath
-                   << "' contains slash";
-      continue;
-    }
-
-    if (resources.contains(resource)) {
-      continue;
-    }
-
-    string link = path::join(info->directory, containerPath);
-
-    LOG(INFO) << "Removing symlink '" << link << "' for persistent volume "
-              << resource << " of container " << containerId;
-
-    Try<Nothing> rm = os::rm(link);
-    if (rm.isError()) {
-      return Failure(
-          "Failed to remove the symlink for the unneeded "
-          "persistent volume at '" + link + "'");
-    }
-  }
-
-  // We then link additional persistent volumes.
-  foreach (const Resource& resource, resources.persistentVolumes()) {
-    // This is enforced by the master.
-    CHECK(resource.disk().has_volume());
-
-    // Ignore absolute and nested paths.
-    const string& containerPath = resource.disk().volume().container_path();
-    if (strings::contains(containerPath, "/")) {
-      LOG(WARNING) << "Skipping updating symlink for persistent volume "
-                   << resource << " of container " << containerId
-                   << " because the container path '" << containerPath
-                   << "' contains slash";
-      continue;
-    }
-
-    if (current.contains(resource)) {
-      continue;
-    }
-
-    string original = paths::getPersistentVolumePath(
-        flags.work_dir,
-        resource.role(),
-        resource.disk().persistence().id());
-
-    // Set the ownership of the persistent volume to match that of the
-    // sandbox directory.
-    //
-    // NOTE: Currently, persistent volumes in Mesos are exclusive,
-    // meaning that if a persistent volume is used by one task or
-    // executor, it cannot be concurrently used by another task or
-    // executor. But if we allow multiple executors to use the same
-    // persistent volume at the same time in the future, the ownership
-    // of the persistent volume may conflict here.
-    //
-    // TODO(haosdent): Consider letting the frameworks specify the
-    // user/group of the persistent volumes.
-    struct stat s;
-    if (::stat(info->directory.c_str(), &s) < 0) {
-      return Failure(
-          "Failed to get ownership for '" + info->directory +
-          "': " + strerror(errno));
-    }
-
-    LOG(INFO) << "Changing the ownership of the persistent volume at '"
-              << original << "' with uid " << s.st_uid
-              << " and gid " << s.st_gid;
-
-    Try<Nothing> chown = os::chown(s.st_uid, s.st_gid, original, true);
-    if (chown.isError()) {
-      return Failure(
-          "Failed to change the ownership of the persistent volume at '" +
-          original + "' with uid " + stringify(s.st_uid) +
-          " and gid " + stringify(s.st_gid) + ": " + chown.error());
-    }
-
-    string link = path::join(info->directory, containerPath);
-
-    if (os::exists(link)) {
-      // NOTE: This is possible because 'info->resources' will be
-      // reset when slave restarts and recovers. When the slave calls
-      // 'containerizer->update' after the executor re-registers,
-      // we'll try to relink all the already symlinked volumes.
-      Result<string> realpath = os::realpath(link);
-      if (!realpath.isSome()) {
-        return Failure(
-            "Failed to get the realpath of symlink '" + link + "': " +
-            (realpath.isError() ? realpath.error() : "No such directory"));
-      }
-
-      // A sanity check to make sure the target of the symlink does
-      // not change. In fact, this is not supposed to happen.
-      // NOTE: Here, we compare the realpaths because 'original' might
-      // contain symbolic links.
-      Result<string> _original = os::realpath(original);
-      if (!_original.isSome()) {
-        return Failure(
-            "Failed to get the realpath of volume '" + original + "': " +
-            (_original.isError() ? _original.error() : "No such directory"));
-      }
-
-      if (realpath.get() != _original.get()) {
-        return Failure(
-            "The existing symlink '" + link + "' points to '" +
-            _original.get() + "' and the new target is '" +
-            realpath.get() + "'");
-      }
-    } else {
-      LOG(INFO) << "Adding symlink from '" << original << "' to '"
-                << link << "' for persistent volume " << resource
-                << " of container " << containerId;
-
-      Try<Nothing> symlink = ::fs::symlink(original, link);
-      if (symlink.isError()) {
-        return Failure(
-            "Failed to symlink persistent volume from '" +
-            original + "' to '" + link + "'");
-      }
-    }
-  }
-
-  // Store the updated resources.
-  info->resources = resources;
-
-  return Nothing();
-}
-
-
-Future<ResourceStatistics> PosixFilesystemIsolatorProcess::usage(
-    const ContainerID& containerId)
-{
-  // No-op, no usage gathered.
-  return ResourceStatistics();
-}
-
-
-Future<Nothing> PosixFilesystemIsolatorProcess::cleanup(
-    const ContainerID& containerId)
-{
-  // Symlinks for persistent resources will be removed when the work
-  // directory is GC'ed, therefore no need to do explicit cleanup.
-  infos.erase(containerId);
-
-  return Nothing();
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/filesystem/posix.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/filesystem/posix.hpp b/src/slave/containerizer/isolators/filesystem/posix.hpp
deleted file mode 100644
index 61b81dd..0000000
--- a/src/slave/containerizer/isolators/filesystem/posix.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __POSIX_FILESYSTEM_ISOLATOR_HPP__
-#define __POSIX_FILESYSTEM_ISOLATOR_HPP__
-
-#include <mesos/resources.hpp>
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-class PosixFilesystemIsolatorProcess : public MesosIsolatorProcess
-{
-public:
-  static Try<mesos::slave::Isolator*> create(const Flags& flags);
-
-  virtual ~PosixFilesystemIsolatorProcess();
-
-  virtual process::Future<Nothing> recover(
-      const std::list<mesos::slave::ContainerState>& states,
-      const hashset<ContainerID>& orphans);
-
-  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& user);
-
-  virtual process::Future<Nothing> isolate(
-      const ContainerID& containerId,
-      pid_t pid);
-
-  virtual process::Future<mesos::slave::ContainerLimitation> watch(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> update(
-      const ContainerID& containerId,
-      const Resources& resources);
-
-  virtual process::Future<ResourceStatistics> usage(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> cleanup(
-      const ContainerID& containerId);
-
-private:
-  PosixFilesystemIsolatorProcess(const Flags& flags);
-
-  const Flags flags;
-
-  struct Info
-  {
-    explicit Info(const std::string& _directory)
-      : directory(_directory) {}
-
-    const std::string directory;
-
-    // Track resources so we can unlink unneeded persistent volumes.
-    Resources resources;
-  };
-
-  hashmap<ContainerID, process::Owned<Info>> infos;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __POSIX_FILESYSTEM_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/filesystem/shared.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/filesystem/shared.cpp b/src/slave/containerizer/isolators/filesystem/shared.cpp
deleted file mode 100644
index 73804ca..0000000
--- a/src/slave/containerizer/isolators/filesystem/shared.cpp
+++ /dev/null
@@ -1,268 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <set>
-
-#include "linux/ns.hpp"
-
-#include "slave/containerizer/isolators/filesystem/shared.hpp"
-
-using namespace process;
-
-using std::list;
-using std::set;
-using std::string;
-
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::ContainerState;
-using mesos::slave::Isolator;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-SharedFilesystemIsolatorProcess::SharedFilesystemIsolatorProcess(
-    const Flags& _flags)
-  : flags(_flags) {}
-
-
-SharedFilesystemIsolatorProcess::~SharedFilesystemIsolatorProcess() {}
-
-
-Try<Isolator*> SharedFilesystemIsolatorProcess::create(const Flags& flags)
-{
-  Result<string> user = os::user();
-  if (!user.isSome()) {
-    return Error("Failed to determine user: " +
-                 (user.isError() ? user.error() : "username not found"));
-  }
-
-  if (user.get() != "root") {
-    return Error("SharedFilesystemIsolator requires root privileges");
-  }
-
-  process::Owned<MesosIsolatorProcess> process(
-      new SharedFilesystemIsolatorProcess(flags));
-
-  return new MesosIsolator(process);
-}
-
-
-Future<Nothing> SharedFilesystemIsolatorProcess::recover(
-    const list<ContainerState>& states,
-    const hashset<ContainerID>& orphans)
-{
-  // There is nothing to recover because we do not keep any state and
-  // do not monitor filesystem usage or perform any action on cleanup.
-  return Nothing();
-}
-
-
-Future<Option<ContainerPrepareInfo>> SharedFilesystemIsolatorProcess::prepare(
-    const ContainerID& containerId,
-    const ExecutorInfo& executorInfo,
-    const string& directory,
-    const Option<string>& user)
-{
-  if (executorInfo.has_container() &&
-      executorInfo.container().type() != ContainerInfo::MESOS) {
-    return Failure("Can only prepare filesystem for a MESOS container");
-  }
-
-  LOG(INFO) << "Preparing shared filesystem for container: "
-            << stringify(containerId);
-
-  if (!executorInfo.has_container()) {
-    // We don't consider this an error, there's just nothing to do so
-    // we return None.
-
-    return None();
-  }
-
-  // We don't support mounting to a container path which is a parent
-  // to another container path as this can mask entries. We'll keep
-  // track of all container paths so we can check this.
-  set<string> containerPaths;
-  containerPaths.insert(directory);
-
-  ContainerPrepareInfo prepareInfo;
-  prepareInfo.set_namespaces(CLONE_NEWNS);
-
-  foreach (const Volume& volume, executorInfo.container().volumes()) {
-    // Because the filesystem is shared we require the container path
-    // already exist, otherwise containers can create arbitrary paths
-    // outside their sandbox.
-    if (!os::exists(volume.container_path())) {
-      return Failure("Volume with container path '" +
-                     volume.container_path() +
-                     "' must exist on host for shared filesystem isolator");
-    }
-
-    // Host path must be provided.
-    if (!volume.has_host_path()) {
-      return Failure("Volume with container path '" +
-                     volume.container_path() +
-                     "' must specify host path for shared filesystem isolator");
-    }
-
-    // Check we won't mask another volume.
-    // NOTE: Assuming here that the container path is absolute, see
-    // Volume protobuf.
-    // TODO(idownes): This test is unnecessarily strict and could be
-    // relaxed if mounts could be re-ordered.
-    foreach (const string& containerPath, containerPaths) {
-      if (strings::startsWith(volume.container_path(), containerPath)) {
-        return Failure("Cannot mount volume to '" +
-                        volume.container_path() +
-                        "' because it is under volume '" +
-                        containerPath +
-                        "'");
-      }
-
-      if (strings::startsWith(containerPath, volume.container_path())) {
-        return Failure("Cannot mount volume to '" +
-                        containerPath +
-                        "' because it is under volume '" +
-                        volume.container_path() +
-                        "'");
-      }
-    }
-    containerPaths.insert(volume.container_path());
-
-    // A relative host path will be created in the container's work
-    // directory, otherwise check it already exists.
-    string hostPath;
-    if (!strings::startsWith(volume.host_path(), "/")) {
-      hostPath = path::join(directory, volume.host_path());
-
-      // Do not support any relative components in the resulting path.
-      // There should not be any links in the work directory to
-      // resolve.
-      if (strings::contains(hostPath, "/./") ||
-          strings::contains(hostPath, "/../")) {
-        return Failure("Relative host path '" +
-                       hostPath +
-                       "' cannot contain relative components");
-      }
-
-      Try<Nothing> mkdir = os::mkdir(hostPath, true);
-      if (mkdir.isError()) {
-        return Failure("Failed to create host_path '" +
-                        hostPath +
-                        "' for mount to '" +
-                        volume.container_path() +
-                        "': " +
-                        mkdir.error());
-      }
-
-      // Set the ownership and permissions to match the container path
-      // as these are inherited from host path on bind mount.
-      struct stat stat;
-      if (::stat(volume.container_path().c_str(), &stat) < 0) {
-        return Failure("Failed to get permissions on '" +
-                        volume.container_path() + "'" +
-                        ": " + strerror(errno));
-      }
-
-      Try<Nothing> chmod = os::chmod(hostPath, stat.st_mode);
-      if (chmod.isError()) {
-        return Failure("Failed to chmod hostPath '" +
-                       hostPath +
-                       "': " +
-                       chmod.error());
-      }
-
-      Try<Nothing> chown = os::chown(stat.st_uid, stat.st_gid, hostPath, false);
-      if (chown.isError()) {
-        return Failure("Failed to chown hostPath '" +
-                       hostPath +
-                       "': " +
-                       chown.error());
-      }
-    } else {
-      hostPath = volume.host_path();
-
-      if (!os::exists(hostPath)) {
-        return Failure("Volume with container path '" +
-                       volume.container_path() +
-                       "' must have host path '" +
-                       hostPath +
-                       "' present on host for shared filesystem isolator");
-      }
-    }
-
-    prepareInfo.add_commands()->set_value(
-        "mount -n --bind " + hostPath + " " + volume.container_path());
-  }
-
-  return prepareInfo;
-}
-
-
-Future<Nothing> SharedFilesystemIsolatorProcess::isolate(
-    const ContainerID& containerId,
-    pid_t pid)
-{
-  // No-op, isolation happens when unsharing the mount namespace.
-
-  return Nothing();
-}
-
-
-Future<ContainerLimitation> SharedFilesystemIsolatorProcess::watch(
-    const ContainerID& containerId)
-{
-  // No-op, for now.
-
-  return Future<ContainerLimitation>();
-}
-
-
-Future<Nothing> SharedFilesystemIsolatorProcess::update(
-    const ContainerID& containerId,
-    const Resources& resources)
-{
-  // No-op, nothing enforced.
-
-  return Nothing();
-}
-
-
-Future<ResourceStatistics> SharedFilesystemIsolatorProcess::usage(
-    const ContainerID& containerId)
-{
-  // No-op, no usage gathered.
-
-  return ResourceStatistics();
-}
-
-
-Future<Nothing> SharedFilesystemIsolatorProcess::cleanup(
-    const ContainerID& containerId)
-{
-  // Cleanup of mounts is done automatically by the kernel when
-  // the mount namespace is destroyed after the last process
-  // terminates.
-
-  return Nothing();
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/filesystem/shared.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/filesystem/shared.hpp b/src/slave/containerizer/isolators/filesystem/shared.hpp
deleted file mode 100644
index 3a2f7db..0000000
--- a/src/slave/containerizer/isolators/filesystem/shared.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __SHARED_FILESYSTEM_ISOLATOR_HPP__
-#define __SHARED_FILESYSTEM_ISOLATOR_HPP__
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// This isolator is to be used when all containers share the host's
-// filesystem.  It supports mounting "volumes" from the host
-// into each container's mount namespace. In particular, this can be
-// used to give each container a "private" system directory, such as
-// /tmp and /var/tmp.
-class SharedFilesystemIsolatorProcess : public MesosIsolatorProcess
-{
-public:
-  static Try<mesos::slave::Isolator*> create(const Flags& flags);
-
-  virtual ~SharedFilesystemIsolatorProcess();
-
-  virtual process::Future<Nothing> recover(
-      const std::list<mesos::slave::ContainerState>& states,
-      const hashset<ContainerID>& orphans);
-
-  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& user);
-
-  virtual process::Future<Nothing> isolate(
-      const ContainerID& containerId,
-      pid_t pid);
-
-  virtual process::Future<mesos::slave::ContainerLimitation> watch(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> update(
-      const ContainerID& containerId,
-      const Resources& resources);
-
-  virtual process::Future<ResourceStatistics> usage(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> cleanup(
-      const ContainerID& containerId);
-
-private:
-  SharedFilesystemIsolatorProcess(const Flags& flags);
-
-  const Flags flags;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __SHARED_FILESYSTEM_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/namespaces/pid.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/namespaces/pid.cpp b/src/slave/containerizer/isolators/namespaces/pid.cpp
deleted file mode 100644
index a9823e0..0000000
--- a/src/slave/containerizer/isolators/namespaces/pid.cpp
+++ /dev/null
@@ -1,266 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <sys/mount.h>
-
-#include <list>
-#include <set>
-#include <string>
-
-#include <stout/os.hpp>
-
-#include <stout/os/exists.hpp>
-#include <stout/os/ls.hpp>
-#include <stout/os/stat.hpp>
-
-#include "linux/fs.hpp"
-#include "linux/ns.hpp"
-
-#include "slave/containerizer/isolators/namespaces/pid.hpp"
-
-using namespace process;
-
-using std::list;
-using std::set;
-using std::string;
-
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::ContainerState;
-using mesos::slave::Isolator;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// The root directory where we bind mount all the namespace handles.
-static const char PID_NS_BIND_MOUNT_ROOT[] = "/var/run/mesos/pidns";
-
-
-// The empty directory that we'll use to mask the namespace handles
-// inside each container. This mount ensures they cannot determine the
-// namespace of another container.
-static const char PID_NS_BIND_MOUNT_MASK_DIR[] = "/var/empty/mesos";
-
-
-// Helper to construct the path to a pid's namespace file.
-inline string nsProcFile(pid_t pid)
-{
-  return path::join("/proc", stringify(pid), "ns", "pid");
-}
-
-
-// Helper to construct the path to the additional reference created
-// for a container's pid namespace.
-inline string nsExtraReference(const ContainerID& containerId)
-{
-  return path::join(PID_NS_BIND_MOUNT_ROOT, stringify(containerId));
-}
-
-
-Try<Isolator*> NamespacesPidIsolatorProcess::create(const Flags& flags)
-{
-  // Check for root permission.
-  if (geteuid() != 0) {
-    return Error("The pid namespace isolator requires root permissions");
-  }
-
-  // Verify that pid namespaces are available on this kernel.
-  if (ns::namespaces().count("pid") == 0) {
-    return Error("Pid namespaces are not supported by this kernel");
-  }
-
-  // Create the directory where bind mounts of the pid namespace will
-  // be placed.
-  Try<Nothing> mkdir = os::mkdir(PID_NS_BIND_MOUNT_ROOT);
-  if (mkdir.isError()) {
-    return Error(
-        "Failed to create the bind mount root directory at " +
-        string(PID_NS_BIND_MOUNT_ROOT) + ": " + mkdir.error());
-  }
-
-  // Create the empty directory that will be used to mask the bind
-  // mounts inside each container.
-  mkdir = os::mkdir(PID_NS_BIND_MOUNT_MASK_DIR);
-  if (mkdir.isError()) {
-    return Error(
-        "Failed to create the bind mount mask direcrory at " +
-        string(PID_NS_BIND_MOUNT_MASK_DIR) + ": " + mkdir.error());
-  }
-
-  return new MesosIsolator(Owned<MesosIsolatorProcess>(
-      new NamespacesPidIsolatorProcess()));
-}
-
-
-Result<ino_t> NamespacesPidIsolatorProcess::getNamespace(
-    const ContainerID& containerId)
-{
-  const string target = nsExtraReference(containerId);
-
-  if (os::exists(target)) {
-    return os::stat::inode(target);
-  }
-
-  return None();
-}
-
-
-Future<Nothing> NamespacesPidIsolatorProcess::recover(
-    const list<ContainerState>& states,
-    const hashset<ContainerID>& orphans)
-{
-  hashset<ContainerID> recovered;
-  foreach (const ContainerState& state, states) {
-    recovered.insert(state.container_id());
-  }
-
-  // Clean up any unknown orphaned bind mounts and empty files. Known
-  // orphan bind mounts and empty files will be destroyed by the
-  // containerizer using the normal cleanup path. See MESOS-2367 for
-  // details.
-  Try<list<string>> entries = os::ls(PID_NS_BIND_MOUNT_ROOT);
-  if (entries.isError()) {
-    return Failure("Failed to list existing containers in '" +
-                   string(PID_NS_BIND_MOUNT_ROOT) + "': " + entries.error());
-  }
-
-  foreach (const string& entry, entries.get()) {
-    ContainerID containerId;
-    containerId.set_value(entry);
-
-    if (!recovered.contains(containerId) && !orphans.contains(containerId)) {
-      cleanup(containerId);
-    }
-  }
-
-  return Nothing();
-}
-
-
-Future<Option<ContainerPrepareInfo>> NamespacesPidIsolatorProcess::prepare(
-    const ContainerID& containerId,
-    const ExecutorInfo& executorInfo,
-    const string& directory,
-    const Option<string>& user)
-{
-  ContainerPrepareInfo prepareInfo;
-  prepareInfo.set_namespaces(CLONE_NEWPID | CLONE_NEWNS);
-
-  // Mask the bind mount root directory in each container so
-  // containers cannot see the namespace bind mount of other
-  // containers.
-  prepareInfo.add_commands()->set_value(
-      "mount -n --bind " + string(PID_NS_BIND_MOUNT_MASK_DIR) +
-      " " + string(PID_NS_BIND_MOUNT_ROOT));
-
-  // Mount /proc for the container's pid namespace to show the
-  // container's pids (and other /proc files), not the parent's. We
-  // first recursively make the mount private because /proc is usually
-  // marked explicitly as shared (see /proc/self/mountinfo) and
-  // changes would propagate to the parent's /proc mount otherwise. We
-  // then mount /proc with the standard options. This technique was
-  // taken from unshare.c in util-linux for --mount-proc. We use the
-  // -n flag so the mount is not added to the mtab where it will not
-  // be correctly removed when the namespace terminates.
-  prepareInfo.add_commands()->set_value(
-      "mount none /proc --make-private -o rec");
-  prepareInfo.add_commands()->set_value(
-      "mount -n -t proc proc /proc -o nosuid,noexec,nodev");
-
-  return prepareInfo;
-}
-
-
-Future<Nothing> NamespacesPidIsolatorProcess::isolate(
-    const ContainerID& containerId,
-    pid_t pid)
-{
-  const string source = nsProcFile(pid);
-  const string target = nsExtraReference(containerId);
-
-  // Create a bind mount of the pid namespace so we can control the
-  // lifetime of the pid namespace. This lets us identify the
-  // container's pid namespace, even if the leading pid has exited.
-  Try<Nothing> touch = os::touch(target);
-  if (touch.isError()) {
-    return Failure("Failed to create bind mount target: " + touch.error());
-  }
-
-  Try<Nothing> mount = fs::mount(source, target, None(), MS_BIND, NULL);
-  if (mount.isError()) {
-    return Failure(
-        "Failed to mount pid namespace handle from " +
-        source + " to " + target + ": " + mount.error());
-  }
-
-  return Nothing();
-}
-
-
-Future<ContainerLimitation> NamespacesPidIsolatorProcess::watch(
-    const ContainerID& containerId)
-{
-  return Future<ContainerLimitation>();
-}
-
-
-Future<Nothing> NamespacesPidIsolatorProcess::update(
-    const ContainerID& containerId,
-    const Resources& resources)
-{
-  return Nothing();
-}
-
-
-Future<ResourceStatistics> NamespacesPidIsolatorProcess::usage(
-    const ContainerID& containerId)
-{
-  return ResourceStatistics();
-}
-
-
-// An old glibc might not have this symbol.
-#ifndef MNT_DETACH
-#define MNT_DETACH 2
-#endif
-
-
-Future<Nothing> NamespacesPidIsolatorProcess::cleanup(
-    const ContainerID& containerId)
-{
-  const string target = nsExtraReference(containerId);
-
-  if (os::exists(target)) {
-    // We don't expect anyone to have a reference to target but do a
-    // lazy umount in case. We do not want to force the umount; it
-    // will not cause an issue if this umount is delayed.
-    Try<Nothing> unmount = fs::unmount(target, MNT_DETACH);
-
-    // This will fail if the unmount hasn't completed yet but this
-    // only leaks a uniquely named empty file that will be cleaned up as
-    // an orphan on recovery.
-    os::rm(target);
-  }
-
-  return Nothing();
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/namespaces/pid.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/namespaces/pid.hpp b/src/slave/containerizer/isolators/namespaces/pid.hpp
deleted file mode 100644
index 87270d0..0000000
--- a/src/slave/containerizer/isolators/namespaces/pid.hpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __NAMESPACES_PID_ISOLATOR_HPP__
-#define __NAMESPACES_PID_ISOLATOR_HPP__
-
-#include <sys/types.h>
-
-#include <string>
-
-#include <stout/result.hpp>
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// This isolator itself does not specify the necessary clone() flags
-// (see the LinuxLauncher for that) but it is used to keep track of a
-// container's pid namespace through a bind mount and exposed by
-// getNamespace().
-class NamespacesPidIsolatorProcess : public MesosIsolatorProcess
-{
-public:
-  static Try<mesos::slave::Isolator*> create(const Flags& flags);
-
-  // Return the pid namespace of the container. Returns None if the
-  // container was not created in a separate pid namespace, i.e.,
-  // processes are in the same namespace as the slave. This is used by
-  // the LinuxLauncher to determine if it can kill the leading process
-  // in the container and let the kernel kill the remaining processes.
-  // A container may not have a pid namespace if it was created
-  // without the namespaces/pid isolator and the slave was
-  // subsequently restarted with namespaces/pid enabled.
-  static Result<ino_t> getNamespace(const ContainerID& container);
-
-  NamespacesPidIsolatorProcess() {}
-
-  virtual ~NamespacesPidIsolatorProcess() {}
-
-  virtual process::Future<Nothing> recover(
-      const std::list<mesos::slave::ContainerState>& states,
-      const hashset<ContainerID>& orphans);
-
-  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& user);
-
-  virtual process::Future<Nothing> isolate(
-      const ContainerID& containerId,
-      pid_t pid);
-
-  virtual process::Future<mesos::slave::ContainerLimitation> watch(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> update(
-      const ContainerID& containerId,
-      const Resources& resources);
-
-  virtual process::Future<ResourceStatistics> usage(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> cleanup(
-      const ContainerID& containerId);
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __NAMESPACES_PID_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/network/helper.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/network/helper.cpp b/src/slave/containerizer/isolators/network/helper.cpp
deleted file mode 100644
index e5fb99e..0000000
--- a/src/slave/containerizer/isolators/network/helper.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stout/none.hpp>
-#include <stout/subcommand.hpp>
-
-#include "slave/containerizer/isolators/network/port_mapping.hpp"
-
-using namespace mesos::internal::slave;
-
-
-int main(int argc, char** argv)
-{
-  return Subcommand::dispatch(
-      None(),
-      argc,
-      argv,
-      new PortMappingUpdate(),
-      new PortMappingStatistics());
-}

[08/12] mesos git commit: Relocated MesosContainerizer specific files to the correct location.

Posted by ji...@apache.org.

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/cgroups/mem.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/mem.cpp b/src/slave/containerizer/mesos/isolators/cgroups/mem.cpp
new file mode 100644
index 0000000..fa45439
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/cgroups/mem.cpp
@@ -0,0 +1,733 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+#include <list>
+#include <vector>
+
+#include <mesos/type_utils.hpp>
+#include <mesos/values.hpp>
+
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/pid.hpp>
+
+#include <stout/bytes.hpp>
+#include <stout/check.hpp>
+#include <stout/error.hpp>
+#include <stout/foreach.hpp>
+#include <stout/hashset.hpp>
+#include <stout/lambda.hpp>
+#include <stout/os.hpp>
+#include <stout/path.hpp>
+#include <stout/stringify.hpp>
+#include <stout/try.hpp>
+
+#include "common/protobuf_utils.hpp"
+
+#include "slave/containerizer/mesos/isolators/cgroups/constants.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/mem.hpp"
+
+using namespace process;
+
+using cgroups::memory::pressure::Level;
+using cgroups::memory::pressure::Counter;
+
+using std::list;
+using std::ostringstream;
+using std::set;
+using std::string;
+using std::vector;
+
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+static const vector<Level> levels()
+{
+  return {Level::LOW, Level::MEDIUM, Level::CRITICAL};
+}
+
+
+CgroupsMemIsolatorProcess::CgroupsMemIsolatorProcess(
+    const Flags& _flags,
+    const string& _hierarchy,
+    const bool _limitSwap)
+  : flags(_flags),
+    hierarchy(_hierarchy),
+    limitSwap(_limitSwap) {}
+
+
+CgroupsMemIsolatorProcess::~CgroupsMemIsolatorProcess() {}
+
+
+// Validates the memory cgroup hierarchy and flags, then builds the isolator.
+Try<Isolator*> CgroupsMemIsolatorProcess::create(const Flags& flags)
+{
+  Try<string> hierarchy = cgroups::prepare(
+      flags.cgroups_hierarchy,
+      "memory",
+      flags.cgroups_root);
+
+  if (hierarchy.isError()) {
+    return Error("Failed to create memory cgroup: " + hierarchy.error());
+  }
+
+  // Ensure that no other subsystem is attached to the hierarchy.
+  Try<set<string>> subsystems = cgroups::subsystems(hierarchy.get());
+  if (subsystems.isError()) {
+    return Error(
+        "Failed to get the list of attached subsystems for hierarchy " +
+        hierarchy.get() + ": " + subsystems.error());
+  } else if (subsystems.get().size() != 1) {
+    return Error(
+        "Unexpected subsystems found attached to the hierarchy " +
+        hierarchy.get());
+  }
+
+  // Make sure the kernel OOM-killer is enabled.
+  // The Mesos OOM handler, as implemented, is not capable of handling
+  // the oom condition by itself safely given the limitations Linux
+  // imposes on this code path.
+  Try<Nothing> enable = cgroups::memory::oom::killer::enable(
+      hierarchy.get(), flags.cgroups_root);
+
+  if (enable.isError()) {
+    return Error(enable.error());
+  }
+
+  // Test if memory pressure listening is enabled. We test that on the
+  // root cgroup. We rely on 'Counter::create' to test if memory
+  // pressure listening is enabled or not. The created counters will
+  // be destroyed immediately.
+  foreach (Level level, levels()) {
+    Try<Owned<Counter>> counter = Counter::create(
+        hierarchy.get(),
+        flags.cgroups_root,
+        level);
+
+    if (counter.isError()) {
+      return Error("Failed to listen on " + stringify(level) +
+                   " memory events: " + counter.error());
+    }
+  }
+
+  // Determine whether to limit swap or not.
+  bool limitSwap = false;
+
+  if (flags.cgroups_limit_swap) {
+    Result<Bytes> check = cgroups::memory::memsw_limit_in_bytes(
+        hierarchy.get(), flags.cgroups_root);
+
+    if (check.isError()) {
+      return Error(
+          "Failed to read 'memory.memsw.limit_in_bytes': " + check.error());
+    } else if (check.isNone()) {
+      return Error("'memory.memsw.limit_in_bytes' is not available");
+    }
+
+    limitSwap = true;
+  }
+
+  process::Owned<MesosIsolatorProcess> process(
+      new CgroupsMemIsolatorProcess(flags, hierarchy.get(), limitSwap));
+
+  return new MesosIsolator(process);
+}
+
+
+// Rebuilds 'infos' for recovered containers and destroys unknown cgroups.
+Future<Nothing> CgroupsMemIsolatorProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  foreach (const ContainerState& state, states) {
+    const ContainerID& containerId = state.container_id();
+    const string cgroup = path::join(flags.cgroups_root, containerId.value());
+
+    Try<bool> exists = cgroups::exists(hierarchy, cgroup);
+    if (exists.isError()) {
+      foreachvalue (Info* info, infos) {
+        delete info;
+      }
+      infos.clear();
+      return Failure("Failed to check cgroup for container '" +
+                     stringify(containerId) + "': " + exists.error());
+    }
+
+    if (!exists.get()) {
+      VLOG(1) << "Couldn't find cgroup for container " << containerId;
+      // This may occur if the executor has exited and the isolator
+      // has destroyed the cgroup but the slave dies before noticing
+      // this. This will be detected when the containerizer tries to
+      // monitor the executor's pid.
+      continue;
+    }
+
+    infos[containerId] = new Info(containerId, cgroup);
+
+    oomListen(containerId);
+    pressureListen(containerId);
+  }
+
+  // Remove orphan cgroups.
+  Try<vector<string>> cgroups = cgroups::get(hierarchy, flags.cgroups_root);
+  if (cgroups.isError()) {
+    foreachvalue (Info* info, infos) {
+      delete info;
+    }
+    infos.clear();
+    return Failure(cgroups.error());
+  }
+
+  foreach (const string& cgroup, cgroups.get()) {
+    // Ignore the slave cgroup (see the --slave_subsystems flag).
+    // TODO(idownes): Remove this when the cgroups layout is updated,
+    // see MESOS-1185.
+    if (cgroup == path::join(flags.cgroups_root, "slave")) {
+      continue;
+    }
+
+    ContainerID containerId;
+    containerId.set_value(Path(cgroup).basename());
+
+    if (infos.contains(containerId)) {
+      continue;
+    }
+
+    // Known orphan cgroups will be destroyed by the containerizer
+    // using the normal cleanup path. See MESOS-2367 for details.
+    if (orphans.contains(containerId)) {
+      infos[containerId] = new Info(containerId, cgroup);
+      continue;
+    }
+
+    LOG(INFO) << "Removing unknown orphaned cgroup '" << cgroup << "'";
+    // We don't wait on the destroy as we don't want to block recovery.
+    cgroups::destroy(hierarchy, cgroup, cgroups::DESTROY_TIMEOUT);
+  }
+
+  return Nothing();
+}
+
+
+Future<Option<ContainerPrepareInfo>> CgroupsMemIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user)
+{
+  if (infos.contains(containerId)) {
+    return Failure("Container has already been prepared");
+  }
+
+  // TODO(bmahler): Don't insert into 'infos' unless we create the
+  // cgroup successfully. It's safe for now because 'cleanup' gets
+  // called if we return a Failure, but cleanup will fail because the
+  // cgroup does not exist when cgroups::destroy is called.
+  Info* info = new Info(
+      containerId, path::join(flags.cgroups_root, containerId.value()));
+
+  infos[containerId] = info;
+
+  // Create a cgroup for this container; it must not already exist.
+  Try<bool> exists = cgroups::exists(hierarchy, info->cgroup);
+
+  if (exists.isError()) {
+    return Failure("Failed to prepare isolator: " + exists.error());
+  } else if (exists.get()) {
+    return Failure("Failed to prepare isolator: cgroup already exists");
+  }
+
+  Try<Nothing> create = cgroups::create(hierarchy, info->cgroup);
+  if (create.isError()) {
+    return Failure("Failed to prepare isolator: " + create.error());
+  }
+
+  // Chown the cgroup so the executor can create nested cgroups. Do
+  // not recurse so the control files are still owned by the slave
+  // user and thus cannot be changed by the executor.
+  if (user.isSome()) {
+    Try<Nothing> chown = os::chown(
+        user.get(),
+        path::join(hierarchy, info->cgroup),
+        false);
+    if (chown.isError()) {
+      return Failure("Failed to prepare isolator: " + chown.error());
+    }
+  }
+  // Start listening for OOM and memory pressure events on the cgroup.
+  oomListen(containerId);
+  pressureListen(containerId);
+  // Apply the initial memory limits; this isolator returns no prepare info.
+  return update(containerId, executorInfo.resources())
+    .then([]() -> Future<Option<ContainerPrepareInfo>> {
+      return None();
+    });
+}
+
+
+Future<Nothing> CgroupsMemIsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+  // Remember the pid; it must not have been recorded for this container yet.
+  CHECK_NONE(info->pid);
+  info->pid = pid;
+
+  // Move the process into the container's memory cgroup.
+  Try<Nothing> assigned = cgroups::assign(hierarchy, info->cgroup, pid);
+  if (!assigned.isError()) {
+    return Nothing();
+  }
+  return Failure("Failed to assign container '" +
+                 stringify(info->containerId) + "' to its own cgroup '" +
+                 path::join(hierarchy, info->cgroup) +
+                 "' : " + assigned.error());
+}
+
+
+Future<ContainerLimitation> CgroupsMemIsolatorProcess::watch(
+    const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  // The returned future is set by oom() once an OOM event is handled.
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+  return info->limitation.future();
+}
+
+
+Future<Nothing> CgroupsMemIsolatorProcess::update(
+    const ContainerID& containerId,
+    const Resources& resources)
+{
+  if (resources.mem().isNone()) {
+    return Failure("No memory resource given");
+  }
+
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  // New limit, never below MIN_MEMORY.
+  Bytes mem = resources.mem().get();
+  Bytes limit = std::max(mem, MIN_MEMORY);
+
+  // Always set the soft limit.
+  Try<Nothing> write =
+    cgroups::memory::soft_limit_in_bytes(hierarchy, info->cgroup, limit);
+
+  if (write.isError()) {
+    return Failure(
+        "Failed to set 'memory.soft_limit_in_bytes': " + write.error());
+  }
+
+  LOG(INFO) << "Updated 'memory.soft_limit_in_bytes' to " << limit
+            << " for container " << containerId;
+
+  // Read the existing limit.
+  Try<Bytes> currentLimit =
+    cgroups::memory::limit_in_bytes(hierarchy, info->cgroup);
+
+  // NOTE: If limitSwap is (has been) used then both limit_in_bytes
+  // and memsw.limit_in_bytes will always be set to the same value.
+  if (currentLimit.isError()) {
+    return Failure(
+        "Failed to read 'memory.limit_in_bytes': " + currentLimit.error());
+  }
+
+  // Determine whether to set the hard limit. If this is the first
+  // time (info->pid.isNone()), or we're raising the existing limit,
+  // then we can update the hard limit safely. Otherwise, if we need
+  // to decrease 'memory.limit_in_bytes' we may induce an OOM if too
+  // much memory is in use. As a result, we only update the soft limit
+  // when the memory reservation is being reduced. This is probably
+  // okay if the machine has available resources.
+  // TODO(benh): Introduce a MemoryWatcherProcess which monitors the
+  // discrepancy between usage and soft limit and introduces a "manual
+  // oom" if necessary.
+  if (info->pid.isNone() || limit > currentLimit.get()) {
+    // We always set limit_in_bytes first and optionally set
+    // memsw.limit_in_bytes if limitSwap is true.
+    Try<Nothing> write = cgroups::memory::limit_in_bytes(
+        hierarchy, info->cgroup, limit);
+
+    if (write.isError()) {
+      return Failure(
+          "Failed to set 'memory.limit_in_bytes': " + write.error());
+    }
+
+    LOG(INFO) << "Updated 'memory.limit_in_bytes' to " << limit
+              << " for container " << containerId;
+
+    if (limitSwap) {
+      Try<bool> write = cgroups::memory::memsw_limit_in_bytes(
+          hierarchy, info->cgroup, limit);
+
+      if (write.isError()) {
+        return Failure(
+            "Failed to set 'memory.memsw.limit_in_bytes': " + write.error());
+      }
+
+      LOG(INFO) << "Updated 'memory.memsw.limit_in_bytes' to " << limit
+                << " for container " << containerId;
+    }
+  }
+
+  return Nothing();
+}
+
+
+Future<ResourceStatistics> CgroupsMemIsolatorProcess::usage(
+    const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  ResourceStatistics result;
+
+  // The rss from memory.stat is wrong in two dimensions:
+  //   1. It does not include child cgroups.
+  //   2. It does not include any file backed pages.
+  Try<Bytes> usage = cgroups::memory::usage_in_bytes(hierarchy, info->cgroup);
+  if (usage.isError()) {
+    return Failure("Failed to parse memory.usage_in_bytes: " + usage.error());
+  }
+
+  result.set_mem_total_bytes(usage.get().bytes());
+  // Memory+swap usage is only reported when swap limiting is enabled.
+  if (limitSwap) {
+    Try<Bytes> usage =
+      cgroups::memory::memsw_usage_in_bytes(hierarchy, info->cgroup);
+    if (usage.isError()) {
+      return Failure(
+        "Failed to parse memory.memsw.usage_in_bytes: " + usage.error());
+    }
+
+    result.set_mem_total_memsw_bytes(usage.get().bytes());
+  }
+
+  // TODO(bmahler): Add namespacing to cgroups to enforce the expected
+  // structure, e.g., cgroups::memory::stat.
+  Try<hashmap<string, uint64_t>> stat =
+    cgroups::stat(hierarchy, info->cgroup, "memory.stat");
+  if (stat.isError()) {
+    return Failure("Failed to read memory.stat: " + stat.error());
+  }
+
+  Option<uint64_t> total_cache = stat.get().get("total_cache");
+  if (total_cache.isSome()) {
+    // TODO(chzhcn): mem_file_bytes is deprecated in 0.23.0 and will
+    // be removed in 0.24.0.
+    result.set_mem_file_bytes(total_cache.get());
+
+    result.set_mem_cache_bytes(total_cache.get());
+  }
+
+  Option<uint64_t> total_rss = stat.get().get("total_rss");
+  if (total_rss.isSome()) {
+    // TODO(chzhcn): mem_anon_bytes is deprecated in 0.23.0 and will
+    // be removed in 0.24.0.
+    result.set_mem_anon_bytes(total_rss.get());
+
+    result.set_mem_rss_bytes(total_rss.get());
+  }
+
+  Option<uint64_t> total_mapped_file = stat.get().get("total_mapped_file");
+  if (total_mapped_file.isSome()) {
+    result.set_mem_mapped_file_bytes(total_mapped_file.get());
+  }
+
+  Option<uint64_t> total_swap = stat.get().get("total_swap");
+  if (total_swap.isSome()) {
+    result.set_mem_swap_bytes(total_swap.get());
+  }
+
+  Option<uint64_t> total_unevictable = stat.get().get("total_unevictable");
+  if (total_unevictable.isSome()) {
+    result.set_mem_unevictable_bytes(total_unevictable.get());
+  }
+
+  // Get pressure counter readings.
+  list<Level> levels;
+  list<Future<uint64_t>> values;
+  foreachpair (Level level,
+               const Owned<Counter>& counter,
+               info->pressureCounters) {
+    levels.push_back(level);
+    values.push_back(counter->value());
+  }
+  // Wait for all counter readings, then merge them into 'result' in _usage().
+  return await(values)
+    .then(defer(PID<CgroupsMemIsolatorProcess>(this),
+                &CgroupsMemIsolatorProcess::_usage,
+                containerId,
+                result,
+                levels,
+                lambda::_1));
+}
+
+
+Future<ResourceStatistics> CgroupsMemIsolatorProcess::_usage(
+    const ContainerID& containerId,
+    ResourceStatistics result,
+    const list<Level>& levels,
+    const list<Future<uint64_t>>& values)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  // 'levels' and 'values' are parallel lists; walk them in lock-step.
+  list<Level>::const_iterator level = levels.begin();
+  foreach (const Future<uint64_t>& reading, values) {
+    if (!reading.isReady()) {
+      LOG(ERROR) << "Failed to listen on " << stringify(*level)
+                 << " pressure events for container " << containerId << ": "
+                 << (reading.isFailed() ? reading.failure() : "discarded");
+    } else {
+      switch (*level) {
+        case Level::LOW:
+          result.set_mem_low_pressure_counter(reading.get());
+          break;
+        case Level::MEDIUM:
+          result.set_mem_medium_pressure_counter(reading.get());
+          break;
+        case Level::CRITICAL:
+          result.set_mem_critical_pressure_counter(reading.get());
+          break;
+      }
+    }
+    ++level;
+  }
+
+  return result;
+}
+
+
+Future<Nothing> CgroupsMemIsolatorProcess::cleanup(
+    const ContainerID& containerId)
+{
+  // Multiple calls may occur during test clean up.
+  if (!infos.contains(containerId)) {
+    VLOG(1) << "Ignoring cleanup request for unknown container: "
+            << containerId;
+    return Nothing();
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+  // Cancel a still-pending OOM listener before destroying the cgroup.
+  if (info->oomNotifier.isPending()) {
+    info->oomNotifier.discard();
+  }
+  // Destroy the cgroup; _cleanup() is invoked with the outcome.
+  return cgroups::destroy(hierarchy, info->cgroup, cgroups::DESTROY_TIMEOUT)
+    .onAny(defer(PID<CgroupsMemIsolatorProcess>(this),
+                 &CgroupsMemIsolatorProcess::_cleanup,
+                 containerId,
+                 lambda::_1));
+}
+
+
+Future<Nothing> CgroupsMemIsolatorProcess::_cleanup(
+    const ContainerID& containerId,
+    const Future<Nothing>& future)
+{
+  // Invoked by cleanup() with the outcome of the cgroup destroy.
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+  if (!future.isReady()) {
+    const string reason = future.isFailed() ? future.failure() : "discarded";
+    return Failure(
+        "Failed to clean up container " + stringify(containerId) + " : " +
+        reason);
+  }
+  // The destroy succeeded; release the bookkeeping for this container.
+  delete info;
+  infos.erase(containerId);
+  return Nothing();
+}
+
+
+// Begins listening for OOM events on the container's cgroup.
+void CgroupsMemIsolatorProcess::oomListen(const ContainerID& containerId)
+{
+  CHECK(infos.contains(containerId));
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+  info->oomNotifier = cgroups::memory::oom::listen(hierarchy, info->cgroup);
+
+  // If the listening fails immediately, something very wrong
+  // happened. Therefore, we report a fatal error here.
+  if (info->oomNotifier.isFailed()) {
+    LOG(FATAL) << "Failed to listen for OOM events for container "
+               << containerId << ": " << info->oomNotifier.failure();
+  }
+
+  LOG(INFO) << "Started listening for OOM events for container "
+            << containerId;
+
+  // Use onAny (not onReady) so that oomWaited() also observes a
+  // discarded or failed notifier; it has dedicated branches for both.
+  info->oomNotifier.onAny(defer(
+      PID<CgroupsMemIsolatorProcess>(this),
+      &CgroupsMemIsolatorProcess::oomWaited,
+      containerId,
+      lambda::_1));
+}
+
+
+void CgroupsMemIsolatorProcess::oomWaited(
+    const ContainerID& containerId,
+    const Future<Nothing>& future)
+{
+  // Anything other than a discarded or failed future is treated as an OOM.
+  if (future.isDiscarded()) {
+    LOG(INFO) << "Discarded OOM notifier for container " << containerId;
+    return;
+  } else if (future.isFailed()) {
+    LOG(ERROR) << "Listening on OOM events failed for container "
+               << containerId << ": " << future.failure();
+    return;
+  }
+  LOG(INFO) << "OOM notifier is triggered for container " << containerId;
+  oom(containerId);
+}
+
+
+void CgroupsMemIsolatorProcess::oom(const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    // The container may already have been cleaned up before this
+    // function runs (e.g., the kill and OOM events happen at the same
+    // time, and the process exit event arrives first). Therefore, we
+    // should not report a fatal error here.
+    LOG(INFO) << "OOM detected for an already terminated executor";
+    return;
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  LOG(INFO) << "OOM detected for container " << containerId;
+
+  // Construct a "message" string to describe why the isolator
+  // destroyed the executor's cgroup (in order to assist in
+  // debugging).
+  ostringstream message;
+  message << "Memory limit exceeded: ";
+
+  // Output the requested memory limit.
+  // NOTE: If limitSwap is (has been) used then both limit_in_bytes
+  // and memsw.limit_in_bytes will always be set to the same value.
+  Try<Bytes> limit = cgroups::memory::limit_in_bytes(hierarchy, info->cgroup);
+
+  if (limit.isError()) {
+    LOG(ERROR) << "Failed to read 'memory.limit_in_bytes': "
+               << limit.error();
+  } else {
+    message << "Requested: " << limit.get() << " ";
+  }
+
+  // Output the maximum memory usage.
+  Try<Bytes> usage = cgroups::memory::max_usage_in_bytes(
+      hierarchy, info->cgroup);
+
+  if (usage.isError()) {
+    LOG(ERROR) << "Failed to read 'memory.max_usage_in_bytes': "
+               << usage.error();
+  } else {
+    message << "Maximum Used: " << usage.get() << "\n";
+  }
+
+  // Output 'memory.stat' of the cgroup to help with debugging.
+  // NOTE: With Kernel OOM-killer enabled these stats may not reflect
+  // memory state at time of OOM.
+  Try<string> read = cgroups::read(hierarchy, info->cgroup, "memory.stat");
+  if (read.isError()) {
+    LOG(ERROR) << "Failed to read 'memory.stat': " << read.error();
+  } else {
+    message << "\nMEMORY STATISTICS: \n" << read.get() << "\n";
+  }
+
+  LOG(INFO) << strings::trim(message.str()); // Trim the extra '\n' at the end.
+
+  // TODO(jieyu): This is not accurate if the memory resource is from
+  // a non-star role or spans roles (e.g., "*" and "role"). Ideally,
+  // we should save the resources passed in and report it here.
+  Resources mem = Resources::parse(
+      "mem",
+      stringify(usage.isSome() ? usage.get().megabytes() : 0),
+      "*").get();
+  // Satisfy the promise whose future is handed out by watch().
+  info->limitation.set(
+      protobuf::slave::createContainerLimitation(
+          mem,
+          message.str(),
+          TaskStatus::REASON_CONTAINER_LIMITATION_MEMORY));
+}
+
+
+void CgroupsMemIsolatorProcess::pressureListen(
+    const ContainerID& containerId)
+{
+  CHECK(infos.contains(containerId));
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  // Try every pressure level; a failure for one level is only logged
+  // and does not stop the loop from trying the remaining levels.
+  foreach (Level level, levels()) {
+    Try<Owned<Counter>> created =
+      Counter::create(hierarchy, info->cgroup, level);
+
+    if (created.isError()) {
+      LOG(ERROR) << "Failed to listen on " << level << " memory pressure "
+                 << "events for container " << containerId << ": "
+                 << created.error();
+      continue;
+    }
+
+    info->pressureCounters[level] = created.get();
+    LOG(INFO) << "Started listening on " << level << " memory pressure "
+              << "events for container " << containerId;
+  }
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/cgroups/mem.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/mem.hpp b/src/slave/containerizer/mesos/isolators/cgroups/mem.hpp
new file mode 100644
index 0000000..47f73c3
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/cgroups/mem.hpp
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __MEM_ISOLATOR_HPP__
+#define __MEM_ISOLATOR_HPP__
+
+#include <sys/types.h>
+
+#include <process/future.hpp>
+#include <process/owned.hpp>
+
+#include <stout/hashmap.hpp>
+#include <stout/nothing.hpp>
+#include <stout/option.hpp>
+
+#include "linux/cgroups.hpp"
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+class CgroupsMemIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+  static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+  virtual ~CgroupsMemIsolatorProcess();
+  // Restores state for the given containers and removes unknown cgroups.
+  virtual process::Future<Nothing> recover(
+      const std::list<mesos::slave::ContainerState>& states,
+      const hashset<ContainerID>& orphans);
+  // Creates the container's cgroup and applies its initial memory limits.
+  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+      const ContainerID& containerId,
+      const ExecutorInfo& executorInfo,
+      const std::string& directory,
+      const Option<std::string>& user);
+  // Assigns 'pid' to the container's cgroup.
+  virtual process::Future<Nothing> isolate(
+      const ContainerID& containerId,
+      pid_t pid);
+  // Returns a future that is set once an OOM is handled for the container.
+  virtual process::Future<mesos::slave::ContainerLimitation> watch(
+      const ContainerID& containerId);
+  // Adjusts the cgroup's memory limits to match 'resources'.
+  virtual process::Future<Nothing> update(
+      const ContainerID& containerId,
+      const Resources& resources);
+  // Reports memory statistics and pressure counter readings.
+  virtual process::Future<ResourceStatistics> usage(
+      const ContainerID& containerId);
+  // Destroys the container's cgroup and drops its state.
+  virtual process::Future<Nothing> cleanup(
+      const ContainerID& containerId);
+
+private:
+  CgroupsMemIsolatorProcess(
+      const Flags& flags,
+      const std::string& hierarchy,
+      bool limitSwap);
+  // Continuation of usage(): merges pressure counter readings into 'result'.
+  process::Future<ResourceStatistics> _usage(
+      const ContainerID& containerId,
+      ResourceStatistics result,
+      const std::list<cgroups::memory::pressure::Level>& levels,
+      const std::list<process::Future<uint64_t>>& values);
+  // Invoked by cleanup() with the outcome of the cgroup destroy.
+  process::Future<Nothing> _cleanup(
+      const ContainerID& containerId,
+      const process::Future<Nothing>& future);
+  // Per-container bookkeeping, keyed by ContainerID in 'infos'.
+  struct Info
+  {
+    Info(const ContainerID& _containerId, const std::string& _cgroup)
+      : containerId(_containerId), cgroup(_cgroup) {}
+
+    const ContainerID containerId;
+    const std::string cgroup;
+    Option<pid_t> pid;
+    // Set by oom(); its future is handed out by watch().
+    process::Promise<mesos::slave::ContainerLimitation> limitation;
+
+    // Used to cancel the OOM listening.
+    process::Future<Nothing> oomNotifier;
+
+    hashmap<cgroups::memory::pressure::Level,
+            process::Owned<cgroups::memory::pressure::Counter>>
+      pressureCounters;
+  };
+
+  // Start listening on OOM events. This function will create an
+  // eventfd and start polling on it.
+  void oomListen(const ContainerID& containerId);
+
+  // This function is invoked when the polling on eventfd has a
+  // result.
+  void oomWaited(
+      const ContainerID& containerId,
+      const process::Future<Nothing>& future);
+
+  // This function is invoked when the OOM event happens.
+  void oom(const ContainerID& containerId);
+
+  // Start listening on memory pressure events.
+  void pressureListen(const ContainerID& containerId);
+
+  const Flags flags;
+
+  // The path to the cgroups subsystem hierarchy root.
+  const std::string hierarchy;
+
+  const bool limitSwap;
+
+  // TODO(bmahler): Use Owned<Info>.
+  hashmap<ContainerID, Info*> infos;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __MEM_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/cgroups/perf_event.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/perf_event.cpp b/src/slave/containerizer/mesos/isolators/cgroups/perf_event.cpp
new file mode 100644
index 0000000..323f6ff
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/cgroups/perf_event.cpp
@@ -0,0 +1,435 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+#include <vector>
+
+#include <google/protobuf/descriptor.h>
+#include <google/protobuf/message.h>
+
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/delay.hpp>
+#include <process/io.hpp>
+#include <process/pid.hpp>
+#include <process/reap.hpp>
+#include <process/subprocess.hpp>
+
+#include <stout/bytes.hpp>
+#include <stout/check.hpp>
+#include <stout/error.hpp>
+#include <stout/foreach.hpp>
+#include <stout/hashset.hpp>
+#include <stout/lambda.hpp>
+#include <stout/os.hpp>
+#include <stout/path.hpp>
+#include <stout/stringify.hpp>
+#include <stout/try.hpp>
+
+#include "linux/cgroups.hpp"
+#include "linux/perf.hpp"
+
+#include "slave/containerizer/mesos/isolators/cgroups/perf_event.hpp"
+
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+using std::list;
+using std::set;
+using std::string;
+using std::vector;
+
+using process::Clock;
+using process::Failure;
+using process::Future;
+using process::PID;
+using process::Time;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+Try<Isolator*> CgroupsPerfEventIsolatorProcess::create(const Flags& flags)
+{
+  LOG(INFO) << "Creating PerfEvent isolator";
+
+  if (!perf::supported()) {
+    return Error("Perf is not supported");
+  }
+
+  if (flags.perf_duration > flags.perf_interval) {
+    return Error("Sampling perf for duration (" +
+                 stringify(flags.perf_duration) +
+                 ") > interval (" +
+                 stringify(flags.perf_interval) +
+                 ") is not supported.");
+  }
+
+  if (!flags.perf_events.isSome()) {
+    return Error("No perf events specified.");
+  }
+
+  set<string> events;
+  foreach (const string& event,
+           strings::tokenize(flags.perf_events.get(), ",")) {
+    events.insert(event);
+  }
+
+  if (!perf::valid(events)) {
+    return Error("Failed to create PerfEvent isolator, invalid events: " +
+                 stringify(events));
+  }
+
+  Try<string> hierarchy = cgroups::prepare(
+      flags.cgroups_hierarchy,
+      "perf_event",
+      flags.cgroups_root);
+
+  if (hierarchy.isError()) {
+    return Error("Failed to create perf_event cgroup: " + hierarchy.error());
+  }
+
+  LOG(INFO) << "PerfEvent isolator will profile for " << flags.perf_duration
+            << " every " << flags.perf_interval
+            << " for events: " << stringify(events);
+
+  process::Owned<MesosIsolatorProcess> process(
+      new CgroupsPerfEventIsolatorProcess(flags, hierarchy.get(), events));
+
+  return new MesosIsolator(process);
+}
+
+
+CgroupsPerfEventIsolatorProcess::~CgroupsPerfEventIsolatorProcess() {}
+
+
+void CgroupsPerfEventIsolatorProcess::initialize()
+{
+  // Start sampling.
+  sample();
+}
+
+
+Future<Nothing> CgroupsPerfEventIsolatorProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  foreach (const ContainerState& state, states) {
+    const ContainerID& containerId = state.container_id();
+    const string cgroup = path::join(flags.cgroups_root, containerId.value());
+
+    Try<bool> exists = cgroups::exists(hierarchy, cgroup);
+    if (exists.isError()) {
+      foreachvalue (Info* info, infos) {
+        delete info;
+      }
+
+      infos.clear();
+      return Failure("Failed to check cgroup " + cgroup +
+                     " for container '" + stringify(containerId) + "'");
+    }
+
+    if (!exists.get()) {
+      // This may occur if the executor is exiting and the isolator has
+      // destroyed the cgroup but the slave dies before noticing this. This
+      // will be detected when the containerizer tries to monitor the
+      // executor's pid.
+      // NOTE: This could also occur if this isolator is now enabled for a
+      // container that was started without this isolator. For this
+      // particular isolator it is acceptable to continue running this
+      // container without a perf_event cgroup because we don't ever
+      // query it and the destroy will succeed immediately.
+      VLOG(1) << "Couldn't find perf event cgroup for container " << containerId
+              << ", perf statistics will not be available";
+      continue;
+    }
+
+    infos[containerId] = new Info(containerId, cgroup);
+  }
+
+  // Remove orphan cgroups.
+  Try<vector<string>> cgroups = cgroups::get(hierarchy, flags.cgroups_root);
+  if (cgroups.isError()) {
+    foreachvalue (Info* info, infos) {
+      delete info;
+    }
+    infos.clear();
+    return Failure(cgroups.error());
+  }
+
+  foreach (const string& cgroup, cgroups.get()) {
+    // Ignore the slave cgroup (see the --slave_subsystems flag).
+    // TODO(idownes): Remove this when the cgroups layout is updated,
+    // see MESOS-1185.
+    if (cgroup == path::join(flags.cgroups_root, "slave")) {
+      continue;
+    }
+
+    ContainerID containerId;
+    containerId.set_value(Path(cgroup).basename());
+
+    if (infos.contains(containerId)) {
+      continue;
+    }
+
+    // Known orphan cgroups will be destroyed by the containerizer
+    // using the normal cleanup path. See details in MESOS-2367.
+    if (orphans.contains(containerId)) {
+      infos[containerId] = new Info(containerId, cgroup);
+      continue;
+    }
+
+    LOG(INFO) << "Removing unknown orphaned cgroup '" << cgroup << "'";
+
+    // We don't wait on the destroy as we don't want to block recovery.
+    cgroups::destroy(hierarchy, cgroup, cgroups::DESTROY_TIMEOUT);
+  }
+
+  return Nothing();
+}
+
+
+Future<Option<ContainerPrepareInfo>> CgroupsPerfEventIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user)
+{
+  if (infos.contains(containerId)) {
+    return Failure("Container has already been prepared");
+  }
+
+  LOG(INFO) << "Preparing perf event cgroup for " << containerId;
+
+  Info* info = new Info(
+      containerId,
+      path::join(flags.cgroups_root, containerId.value()));
+
+  infos[containerId] = CHECK_NOTNULL(info);
+
+  // Create a cgroup for this container.
+  Try<bool> exists = cgroups::exists(hierarchy, info->cgroup);
+
+  if (exists.isError()) {
+    return Failure("Failed to prepare isolator: " + exists.error());
+  }
+
+  if (exists.get()) {
+    return Failure("Failed to prepare isolator: cgroup already exists");
+  }
+
+  if (!exists.get()) {
+    Try<Nothing> create = cgroups::create(hierarchy, info->cgroup);
+    if (create.isError()) {
+      return Failure("Failed to prepare isolator: " + create.error());
+    }
+  }
+
+  // Chown the cgroup so the executor can create nested cgroups. Do
+  // not recurse so the control files are still owned by the slave
+  // user and thus cannot be changed by the executor.
+  if (user.isSome()) {
+    Try<Nothing> chown = os::chown(
+        user.get(),
+        path::join(hierarchy, info->cgroup),
+        false);
+    if (chown.isError()) {
+      return Failure("Failed to prepare isolator: " + chown.error());
+    }
+  }
+
+  return None();
+}
+
+
+Future<Nothing> CgroupsPerfEventIsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  Try<Nothing> assign = cgroups::assign(hierarchy, info->cgroup, pid);
+  if (assign.isError()) {
+    return Failure("Failed to assign container '" +
+                   stringify(info->containerId) + "' to its own cgroup '" +
+                   path::join(hierarchy, info->cgroup) +
+                   "' : " + assign.error());
+  }
+
+  return Nothing();
+}
+
+
+Future<ContainerLimitation> CgroupsPerfEventIsolatorProcess::watch(
+    const ContainerID& containerId)
+{
+  // No resources are limited.
+  return Future<ContainerLimitation>();
+}
+
+
+Future<Nothing> CgroupsPerfEventIsolatorProcess::update(
+    const ContainerID& containerId,
+    const Resources& resources)
+{
+  // Nothing to update.
+  return Nothing();
+}
+
+
+Future<ResourceStatistics> CgroupsPerfEventIsolatorProcess::usage(
+    const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    // Return an empty ResourceStatistics, i.e., without
+    // PerfStatistics, if we don't know about this container.
+    return ResourceStatistics();
+  }
+
+  CHECK_NOTNULL(infos[containerId]);
+
+  ResourceStatistics statistics;
+  statistics.mutable_perf()->CopyFrom(infos[containerId]->statistics);
+
+  return statistics;
+}
+
+
+Future<Nothing> CgroupsPerfEventIsolatorProcess::cleanup(
+    const ContainerID& containerId)
+{
+  // Tolerate clean up attempts for unknown containers which may arise from
+  // repeated clean up attempts (during test cleanup).
+  if (!infos.contains(containerId)) {
+    VLOG(1) << "Ignoring cleanup request for unknown container: "
+            << containerId;
+    return Nothing();
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  info->destroying = true;
+
+  return cgroups::destroy(hierarchy, info->cgroup)
+    .then(defer(PID<CgroupsPerfEventIsolatorProcess>(this),
+                &CgroupsPerfEventIsolatorProcess::_cleanup,
+                containerId));
+}
+
+
+Future<Nothing> CgroupsPerfEventIsolatorProcess::_cleanup(
+    const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    return Nothing();
+  }
+
+  delete infos[containerId];
+  infos.erase(containerId);
+
+  return Nothing();
+}
+
+
+Future<hashmap<string, PerfStatistics>> discardSample(
+    Future<hashmap<string, PerfStatistics>> future,
+    const Duration& duration,
+    const Duration& timeout)
+{
+  LOG(ERROR) << "Perf sample of " << stringify(duration)
+             << " failed to complete within " << stringify(timeout)
+             << "; sampling will be halted";
+
+  future.discard();
+
+  return future;
+}
+
+
+void CgroupsPerfEventIsolatorProcess::sample()
+{
+  // Collect a perf sample for all cgroups that are not being
+  // destroyed. Since destruction is asynchronous, 'perf stat' may
+  // fail if the cgroup is destroyed before running perf.
+  set<string> cgroups;
+
+  foreachvalue (Info* info, infos) {
+    CHECK_NOTNULL(info);
+
+    if (!info->destroying) {
+      cgroups.insert(info->cgroup);
+    }
+  }
+
+  // The discard timeout includes an allowance of twice the
+  // reaper interval to ensure we see the perf process exit.
+  Duration timeout = flags.perf_duration + process::MAX_REAP_INTERVAL() * 2;
+
+  perf::sample(events, cgroups, flags.perf_duration)
+    .after(timeout,
+           lambda::bind(&discardSample,
+                        lambda::_1,
+                        flags.perf_duration,
+                        timeout))
+    .onAny(defer(PID<CgroupsPerfEventIsolatorProcess>(this),
+                 &CgroupsPerfEventIsolatorProcess::_sample,
+                 Clock::now() + flags.perf_interval,
+                 lambda::_1));
+}
+
+
+void CgroupsPerfEventIsolatorProcess::_sample(
+    const Time& next,
+    const Future<hashmap<string, PerfStatistics>>& statistics)
+{
+  if (!statistics.isReady()) {
+    // In case the failure is transient or this is due to a timeout,
+    // we continue sampling. Note that since sampling is done on an
+    // interval, it should be ok if this is a non-transient failure.
+    LOG(ERROR) << "Failed to get perf sample: "
+               << (statistics.isFailed()
+                   ? statistics.failure()
+                   : "discarded due to timeout");
+  } else {
+    // Store the latest statistics, note that cgroups added in the
+    // interim will be picked up by the next sample.
+    foreachvalue (Info* info, infos) {
+      CHECK_NOTNULL(info);
+
+      if (statistics->contains(info->cgroup)) {
+        info->statistics = statistics->get(info->cgroup).get();
+      }
+    }
+  }
+
+  // Schedule sample for the next time.
+  delay(next - Clock::now(),
+        PID<CgroupsPerfEventIsolatorProcess>(this),
+        &CgroupsPerfEventIsolatorProcess::sample);
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/cgroups/perf_event.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/perf_event.hpp b/src/slave/containerizer/mesos/isolators/cgroups/perf_event.hpp
new file mode 100644
index 0000000..1f722ef
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/cgroups/perf_event.hpp
@@ -0,0 +1,127 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PERF_EVENT_ISOLATOR_HPP__
+#define __PERF_EVENT_ISOLATOR_HPP__
+
+#include <set>
+
+#include <process/time.hpp>
+
+#include <stout/hashmap.hpp>
+#include <stout/nothing.hpp>
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+class CgroupsPerfEventIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+  static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+  virtual ~CgroupsPerfEventIsolatorProcess();
+
+  virtual process::Future<Nothing> recover(
+      const std::list<mesos::slave::ContainerState>& states,
+      const hashset<ContainerID>& orphans);
+
+  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+      const ContainerID& containerId,
+      const ExecutorInfo& executorInfo,
+      const std::string& directory,
+      const Option<std::string>& user);
+
+  virtual process::Future<Nothing> isolate(
+      const ContainerID& containerId,
+      pid_t pid);
+
+  virtual process::Future<mesos::slave::ContainerLimitation> watch(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> update(
+      const ContainerID& containerId,
+      const Resources& resources);
+
+  virtual process::Future<ResourceStatistics> usage(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> cleanup(
+      const ContainerID& containerId);
+
+protected:
+  virtual void initialize();
+
+private:
+  CgroupsPerfEventIsolatorProcess(
+      const Flags& _flags,
+      const std::string& _hierarchy,
+      const std::set<std::string>& _events)
+    : flags(_flags),
+      hierarchy(_hierarchy),
+      events(_events) {}
+
+  void sample();
+
+  void _sample(
+      const process::Time& next,
+      const process::Future<hashmap<std::string, PerfStatistics>>& statistics);
+
+  virtual process::Future<Nothing> _cleanup(const ContainerID& containerId);
+
+  struct Info
+  {
+    Info(const ContainerID& _containerId, const std::string& _cgroup)
+      : containerId(_containerId), cgroup(_cgroup), destroying(false)
+    {
+      // Ensure the initial statistics include the required fields.
+      // Note the duration is set to zero to indicate no sampling has
+      // taken place. This empty sample will be returned from usage()
+      // until the first true sample is obtained.
+      statistics.set_timestamp(process::Clock::now().secs());
+      statistics.set_duration(Seconds(0).secs());
+    }
+
+    const ContainerID containerId;
+    const std::string cgroup;
+    PerfStatistics statistics;
+    // Mark a container when we start destruction so we stop sampling it.
+    bool destroying;
+  };
+
+  const Flags flags;
+
+  // The path to the cgroups subsystem hierarchy root.
+  const std::string hierarchy;
+
+  // Set of events to sample.
+  std::set<std::string> events;
+
+  // TODO(jieyu): Use Owned<Info>.
+  hashmap<ContainerID, Info*> infos;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PERF_EVENT_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/filesystem/linux.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/filesystem/linux.cpp b/src/slave/containerizer/mesos/isolators/filesystem/linux.cpp
new file mode 100644
index 0000000..a126cd6
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/filesystem/linux.cpp
@@ -0,0 +1,893 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <list>
+#include <sstream>
+#include <string>
+
+#include <glog/logging.h>
+
+#include <process/collect.hpp>
+
+#include <process/metrics/metrics.hpp>
+
+#include <stout/error.hpp>
+#include <stout/foreach.hpp>
+#include <stout/os.hpp>
+#include <stout/path.hpp>
+#include <stout/stringify.hpp>
+#include <stout/strings.hpp>
+
+#include <stout/os/shell.hpp>
+
+#include "linux/fs.hpp"
+#include "linux/ns.hpp"
+
+#include "slave/paths.hpp"
+
+#include "slave/containerizer/mesos/isolators/filesystem/linux.hpp"
+
+using namespace process;
+
+using std::list;
+using std::ostringstream;
+using std::string;
+
+using mesos::slave::ContainerState;
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::Isolator;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+Try<Isolator*> LinuxFilesystemIsolatorProcess::create(
+    const Flags& flags,
+    const Owned<Provisioner>& provisioner)
+{
+  Result<string> user = os::user();
+  if (!user.isSome()) {
+    return Error("Failed to determine user: " +
+                 (user.isError() ? user.error() : "username not found"));
+  }
+
+  if (user.get() != "root") {
+    return Error("LinuxFilesystemIsolator requires root privileges");
+  }
+
+  // Make slave's work_dir a shared mount so that when forking a child
+  // process (with a new mount namespace), the child process does not
+  // hold extra references to container's work directory mounts and
+  // provisioner mounts (e.g., when using the bind backend) because
+  // cleanup operations within work_dir can be propagated to all
+  // container namespaces. See MESOS-3483 for more details.
+  LOG(INFO) << "Making '" << flags.work_dir << "' a shared mount";
+
+  Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
+  if (table.isError()) {
+    return Error("Failed to get mount table: " + table.error());
+  }
+
+  Option<fs::MountInfoTable::Entry> workDirMount;
+  foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
+    // TODO(jieyu): Make sure 'flags.work_dir' is a canonical path.
+    if (entry.target == flags.work_dir) {
+      workDirMount = entry;
+      break;
+    }
+  }
+
+  // Do a self bind mount if needed. If the mount already exists, make
+  // sure it is a shared mount of its own peer group.
+  if (workDirMount.isNone()) {
+    // NOTE: Instead of using fs::mount to perform the bind mount, we
+    // use the shell command here because the syscall 'mount' does not
+    // update the mount table (i.e., /etc/mtab). In other words, the
+    // mount will not be visible if the operator types command
+    // 'mount'. Since this mount will still be present after all
+    // containers and the slave are stopped, it's better to make it
+    // visible. It's OK to use the blocking os::shell here because
+    // 'create' will only be invoked during initialization.
+    Try<string> mount = os::shell(
+        "mount --bind %s %s && "
+        "mount --make-slave %s && "
+        "mount --make-shared %s",
+        flags.work_dir.c_str(),
+        flags.work_dir.c_str(),
+        flags.work_dir.c_str(),
+        flags.work_dir.c_str());
+
+    if (mount.isError()) {
+      return Error(
+          "Failed to self bind mount '" + flags.work_dir +
+          "' and make it a shared mount: " + mount.error());
+    }
+  } else {
+    if (workDirMount.get().shared().isNone()) {
+      // This is the case where the work directory mount is not a
+      // shared mount yet (possibly due to slave crash while preparing
+      // the work directory mount). It's safe to re-do the following.
+      Try<string> mount = os::shell(
+          "mount --make-slave %s && "
+          "mount --make-shared %s",
+          flags.work_dir.c_str(),
+          flags.work_dir.c_str());
+
+      if (mount.isError()) {
+        return Error(
+            "Failed to self bind mount '" + flags.work_dir +
+            "' and make it a shared mount: " + mount.error());
+      }
+    } else {
+      // We need to make sure that the shared mount is in its own peer
+      // group. To check that, we need to get the parent mount.
+      foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
+        if (entry.id == workDirMount.get().parent) {
+          // If the work directory mount and its parent mount are in
+          // the same peer group, we need to re-do the following
+          // commands so that they are in different peer groups.
+          if (entry.shared() == workDirMount.get().shared()) {
+            Try<string> mount = os::shell(
+                "mount --make-slave %s && "
+                "mount --make-shared %s",
+                flags.work_dir.c_str(),
+                flags.work_dir.c_str());
+
+            if (mount.isError()) {
+              return Error(
+                  "Failed to self bind mount '" + flags.work_dir +
+                  "' and make it a shared mount: " + mount.error());
+            }
+          }
+
+          break;
+        }
+      }
+    }
+  }
+
+  Owned<MesosIsolatorProcess> process(
+      new LinuxFilesystemIsolatorProcess(flags, provisioner));
+
+  return new MesosIsolator(process);
+}
+
+
+LinuxFilesystemIsolatorProcess::LinuxFilesystemIsolatorProcess(
+    const Flags& _flags,
+    const Owned<Provisioner>& _provisioner)
+  : flags(_flags),
+    provisioner(_provisioner),
+    metrics(PID<LinuxFilesystemIsolatorProcess>(this)) {}
+
+
+LinuxFilesystemIsolatorProcess::~LinuxFilesystemIsolatorProcess() {}
+
+
+Future<Nothing> LinuxFilesystemIsolatorProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  // Read the mount table in the host mount namespace to recover paths
+  // to containers' work directories if their root filesystems are
+  // changed. Method 'cleanup()' relies on this information to clean
+  // up mounts in the host mount namespace for each container.
+  Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
+  if (table.isError()) {
+    return Failure("Failed to get mount table: " + table.error());
+  }
+
+  foreach (const ContainerState& state, states) {
+    Owned<Info> info(new Info(state.directory()));
+
+    foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
+      if (entry.root == info->directory) {
+        info->sandbox = entry.target;
+        break;
+      }
+    }
+
+    infos.put(state.container_id(), info);
+  }
+
+  // Recover both known and unknown orphans by scanning the mount
+  // table and finding those mounts whose roots are under slave's
+  // sandbox root directory. Those mounts are container's work
+  // directory mounts. Mounts from unknown orphans will be cleaned up
+  // immediately. Mounts from known orphans will be cleaned up when
+  // those known orphan containers are being destroyed by the slave.
+  hashset<ContainerID> unknownOrphans;
+
+  string sandboxRootDir = paths::getSandboxRootDir(flags.work_dir);
+
+  foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
+    if (!strings::startsWith(entry.root, sandboxRootDir)) {
+      continue;
+    }
+
+    // TODO(jieyu): Here, we retrieve the container ID by taking the
+    // basename of 'entry.root'. This assumes that the slave's sandbox
+    // root directory is organized according to the comments in the
+    // beginning of slave/paths.hpp.
+    ContainerID containerId;
+    containerId.set_value(Path(entry.root).basename());
+
+    if (infos.contains(containerId)) {
+      continue;
+    }
+
+    Owned<Info> info(new Info(entry.root));
+
+    if (entry.root != entry.target) {
+      info->sandbox = entry.target;
+    }
+
+    infos.put(containerId, info);
+
+    // Remember all the unknown orphan containers.
+    if (!orphans.contains(containerId)) {
+      unknownOrphans.insert(containerId);
+    }
+  }
+
+  // Cleanup mounts from unknown orphans.
+  list<Future<Nothing>> futures;
+  foreach (const ContainerID& containerId, unknownOrphans) {
+    futures.push_back(cleanup(containerId));
+  }
+
+  return collect(futures)
+    .then(defer(PID<LinuxFilesystemIsolatorProcess>(this),
+                &LinuxFilesystemIsolatorProcess::_recover,
+                states,
+                orphans));
+}
+
+
+Future<Nothing> LinuxFilesystemIsolatorProcess::_recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  return provisioner->recover(states, orphans)
+    .then([]() -> Future<Nothing> { return Nothing(); });
+}
+
+
+Future<Option<ContainerPrepareInfo>> LinuxFilesystemIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user)
+{
+  if (infos.contains(containerId)) {
+    return Failure("Container has already been prepared");
+  }
+
+  infos.put(containerId, Owned<Info>(new Info(directory)));
+
+  if (!executorInfo.has_container()) {
+    return __prepare(containerId, executorInfo, directory, user, None());
+  }
+
+  // Provision the root filesystem if needed.
+  CHECK_EQ(executorInfo.container().type(), ContainerInfo::MESOS);
+
+  if (!executorInfo.container().mesos().has_image()) {
+    return _prepare(containerId, executorInfo, directory, user, None());
+  }
+
+  const Image& image = executorInfo.container().mesos().image();
+
+  return provisioner->provision(containerId, image)
+    .then(defer(PID<LinuxFilesystemIsolatorProcess>(this),
+                &LinuxFilesystemIsolatorProcess::_prepare,
+                containerId,
+                executorInfo,
+                directory,
+                user,
+                lambda::_1));
+}
+
+
+Future<Option<ContainerPrepareInfo>> LinuxFilesystemIsolatorProcess::_prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user,
+    const Option<string>& rootfs)
+{
+  CHECK(executorInfo.has_container());
+  CHECK_EQ(executorInfo.container().type(), ContainerInfo::MESOS);
+
+  // We will provision the images specified in ContainerInfo::volumes
+  // as well. We will mutate ContainerInfo::volumes to include the
+  // paths to the provisioned root filesystems (by setting the
+  // 'host_path') if the volume specifies an image as the source.
+  Owned<ExecutorInfo> _executorInfo(new ExecutorInfo(executorInfo));
+  list<Future<Nothing>> futures;
+
+  for (int i = 0; i < _executorInfo->container().volumes_size(); i++) {
+    Volume* volume = _executorInfo->mutable_container()->mutable_volumes(i);
+
+    if (!volume->has_image()) {
+      continue;
+    }
+
+    const Image& image = volume->image();
+
+    futures.push_back(
+        provisioner->provision(containerId, image)
+          .then([volume](const string& path) -> Future<Nothing> {
+            volume->set_host_path(path);
+            return Nothing();
+          }));
+  }
+
+  return collect(futures)
+    .then([=]() -> Future<Option<ContainerPrepareInfo>> {
+      return __prepare(containerId, *_executorInfo, directory, user, rootfs);
+    });
+}
+
+
+Future<Option<ContainerPrepareInfo>> LinuxFilesystemIsolatorProcess::__prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user,
+    const Option<string>& rootfs)
+{
+  CHECK(infos.contains(containerId));
+
+  const Owned<Info>& info = infos[containerId];
+
+  ContainerPrepareInfo prepareInfo;
+  prepareInfo.set_namespaces(CLONE_NEWNS);
+
+  if (rootfs.isSome()) {
+    // If the container changes its root filesystem, we need to mount
+    // the container's work directory into its root filesystem
+    // (creating it if needed) so that the executor and the task can
+    // access the work directory.
+    //
+    // NOTE: The mount of the work directory must be a shared mount in
+    // the host filesystem so that any mounts underneath it will
+    // propagate into the container's mount namespace. This is how we
+    // can update persistent volumes for the container.
+
+    // This is the mount point of the work directory in the root filesystem.
+    const string sandbox = path::join(rootfs.get(), flags.sandbox_directory);
+
+    // Save the path 'sandbox' which will be used in 'cleanup()'.
+    info->sandbox = sandbox;
+
+    if (!os::exists(sandbox)) {
+      Try<Nothing> mkdir = os::mkdir(sandbox);
+      if (mkdir.isError()) {
+        return Failure(
+            "Failed to create sandbox mount point at '" +
+            sandbox + "': " + mkdir.error());
+      }
+    }
+
+    LOG(INFO) << "Bind mounting work directory from '" << directory
+              << "' to '" << sandbox << "' for container " << containerId;
+
+    Try<Nothing> mount = fs::mount(
+        directory,
+        sandbox,
+        None(),
+        MS_BIND,
+        NULL);
+
+    if (mount.isError()) {
+      return Failure(
+          "Failed to mount work directory '" + directory +
+          "' to '" + sandbox + "': " + mount.error());
+    }
+
+    mount = fs::mount(
+        None(),
+        sandbox,
+        None(),
+        MS_SLAVE,
+        NULL);
+
+    if (mount.isError()) {
+      return Failure(
+          "Failed to mark sandbox '" + sandbox +
+          "' as a slave mount: " + mount.error());
+    }
+
+    mount = fs::mount(
+        None(),
+        sandbox,
+        None(),
+        MS_SHARED,
+        NULL);
+
+    if (mount.isError()) {
+      return Failure(
+          "Failed to mark sandbox '" + sandbox +
+          "' as a shared mount: " + mount.error());
+    }
+
+    prepareInfo.set_rootfs(rootfs.get());
+  }
+
+  // Prepare the commands that will be run in the container's mount
+  // namespace right after forking the executor process. We use these
+  // commands to mount those volumes specified in the container info
+  // so that they don't pollute the host mount namespace.
+  Try<string> _script = script(containerId, executorInfo, directory, rootfs);
+  if (_script.isError()) {
+    return Failure("Failed to generate isolation script: " + _script.error());
+  }
+
+  CommandInfo* command = prepareInfo.add_commands();
+  command->set_value(_script.get());
+
+  return update(containerId, executorInfo.resources())
+    .then([prepareInfo]() -> Future<Option<ContainerPrepareInfo>> {
+      return prepareInfo;
+    });
+}
+
+
+Try<string> LinuxFilesystemIsolatorProcess::script(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& rootfs)
+{
+  ostringstream out;
+  out << "#!/bin/sh\n";
+  out << "set -x -e\n";
+
+  // Make sure mounts in the container mount namespace do not
+  // propagate back to the host mount namespace.
+  out << "mount --make-rslave /\n";
+
+  // Try to unmount work directory mounts and persistent volume mounts
+  // for other containers to release the extra references to them.
+  // NOTE:
+  // 1) This doesn't completely eliminate the race condition between
+  //    this container copying the mount table and other containers
+  //    being cleaned up. This is instead a best-effort attempt.
+  // 2) This script assumes that all the mounts the container needs
+  //    under the slave work directory have its container ID in the
+  //    path either for the mount source (e.g. sandbox self-bind mount)
+  //    or the mount target (e.g. mounting sandbox into new rootfs).
+  //
+  // TODO(xujyan): This command may fail if --work_dir is not specified
+  // with a real path as real paths are used in the mount table. It
+  // doesn't work when the paths contain reserved characters such as
+  // spaces either because such characters in mount info are encoded
+  // in the escaped form (i.e. '\0xx').
+  out << "grep -E '" << flags.work_dir << "/.+' /proc/self/mountinfo | "
+      << "grep -v '" << containerId.value() << "' | "
+      << "cut -d' ' -f5 | " // '-f5' is the mount target. See MountInfoTable.
+      << "xargs --no-run-if-empty umount -l || "
+      << "true \n"; // We mask errors in this command.
+
+  if (!executorInfo.has_container()) {
+    return out.str();
+  }
+
+  foreach (const Volume& volume, executorInfo.container().volumes()) {
+    if (!volume.has_host_path()) {
+      return Error("A volume misses 'host_path'");
+    }
+
+    // If both 'host_path' and 'container_path' are relative paths,
+    // return an error because the user can just directly access the
+    // volume in the work directory.
+    if (!strings::startsWith(volume.host_path(), "/") &&
+        !strings::startsWith(volume.container_path(), "/")) {
+      return Error(
+          "Both 'host_path' and 'container_path' of a volume are relative");
+    }
+
+    // Determine the source of the mount.
+    string source;
+
+    if (strings::startsWith(volume.host_path(), "/")) {
+      source = volume.host_path();
+
+      // An absolute path must already exist.
+      if (!os::exists(source)) {
+        return Error("Absolute host path does not exist");
+      }
+    } else {
+      // Path is interpreted as relative to the work directory.
+      source = path::join(directory, volume.host_path());
+
+      // TODO(jieyu): We need to check that source resolves under the
+      // work directory because a user can potentially use a host
+      // path like '../../abc'.
+
+      if (!os::exists(source)) {
+        Try<Nothing> mkdir = os::mkdir(source);
+        if (mkdir.isError()) {
+          return Error(
+              "Failed to create the source of the mount at '" +
+              source + "': " + mkdir.error());
+        }
+
+        // TODO(idownes): Consider setting ownership and mode.
+      }
+    }
+
+    // Determine the target of the mount.
+    string target;
+
+    if (strings::startsWith(volume.container_path(), "/")) {
+      if (rootfs.isSome()) {
+        target = path::join(rootfs.get(), volume.container_path());
+      } else {
+        target = volume.container_path();
+      }
+
+      // An absolute path must already exist. This is because we want
+      // to avoid creating mount points outside the work directory in
+      // the host filesystem or in the container filesystem root.
+      if (!os::exists(target)) {
+        return Error("Absolute container path does not exist");
+      }
+
+      // TODO(jieyu): We need to check that target resolves under
+      // 'rootfs' because a user can potentially use a container path
+      // like '/../../abc'.
+    } else {
+      if (rootfs.isSome()) {
+        target = path::join(rootfs.get(),
+                            flags.sandbox_directory,
+                            volume.container_path());
+      } else {
+        target = path::join(directory, volume.container_path());
+      }
+
+      // TODO(jieyu): We need to check that target resolves under the
+      // sandbox because a user can potentially use a container path
+      // like '../../abc'.
+
+      if (!os::exists(target)) {
+        Try<Nothing> mkdir = os::mkdir(target);
+        if (mkdir.isError()) {
+          return Error(
+              "Failed to create the target of the mount at '" +
+              target + "': " + mkdir.error());
+        }
+      }
+    }
+
+    // TODO(jieyu): Consider the mode in the volume.
+    out << "mount -n --rbind '" << source << "' '" << target << "'\n";
+  }
+
+  return out.str();
+}
+
+
+Future<Nothing> LinuxFilesystemIsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  // No-op: isolation takes effect once the mount namespace is unshared.
+  return Nothing();
+}
+
+
+Future<ContainerLimitation> LinuxFilesystemIsolatorProcess::watch(
+    const ContainerID& containerId)
+{
+  // No-op. A default-constructed future is returned as-is.
+  return Future<ContainerLimitation>();
+}
+
+
+Future<Nothing> LinuxFilesystemIsolatorProcess::update(
+    const ContainerID& containerId,
+    const Resources& resources)
+{
+  // Mount persistent volumes. We do this in the host namespace and
+  // rely on mount propagation for them to be visible inside the
+  // container.
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  const Owned<Info>& info = infos[containerId];
+
+  Resources current = info->resources;
+
+  // We first remove unneeded persistent volumes.
+  foreach (const Resource& resource, current.persistentVolumes()) {
+    // This is enforced by the master.
+    CHECK(resource.disk().has_volume());
+
+    // Ignore absolute and nested paths (i.e., those containing '/').
+    const string& containerPath = resource.disk().volume().container_path();
+    if (strings::contains(containerPath, "/")) {
+      LOG(WARNING) << "Skipping updating mount for persistent volume "
+                   << resource << " of container " << containerId
+                   << " because the container path '" << containerPath
+                   << "' contains slash";
+      continue;
+    }
+
+    if (resources.contains(resource)) {
+      continue;
+    }
+
+    // Determine the target of the mount.
+    string target;
+
+    if (info->sandbox.isSome()) {
+      target = path::join(info->sandbox.get(), containerPath);
+    } else {
+      target = path::join(info->directory, containerPath);
+    }
+
+    LOG(INFO) << "Removing mount '" << target << "' for persistent volume "
+              << resource << " of container " << containerId;
+
+    // The unmount will fail if the task/executor is still using files
+    // or directories under 'target'.
+    Try<Nothing> unmount = fs::unmount(target);
+    if (unmount.isError()) {
+      return Failure(
+          "Failed to unmount unneeded persistent volume at '" +
+          target + "': " + unmount.error());
+    }
+
+    // NOTE: This is a non-recursive rmdir.
+    Try<Nothing> rmdir = os::rmdir(target, false);
+    if (rmdir.isError()) {
+      return Failure(
+          "Failed to remove persistent volume mount point at '" +
+          target + "': " + rmdir.error());
+    }
+  }
+
+  // We then mount new persistent volumes.
+  foreach (const Resource& resource, resources.persistentVolumes()) {
+    // This is enforced by the master.
+    CHECK(resource.disk().has_volume());
+
+    // Ignore absolute and nested paths (i.e., those containing '/').
+    const string& containerPath = resource.disk().volume().container_path();
+    if (strings::contains(containerPath, "/")) {
+      LOG(WARNING) << "Skipping updating mount for persistent volume "
+                   << resource << " of container " << containerId
+                   << " because the container path '" << containerPath
+                   << "' contains slash";
+      continue;
+    }
+
+    if (current.contains(resource)) {
+      continue;
+    }
+
+    // Determine the source of the mount.
+    string source = paths::getPersistentVolumePath(
+        flags.work_dir,
+        resource.role(),
+        resource.disk().persistence().id());
+
+    // Set the ownership of the persistent volume to match that of the
+    // sandbox directory.
+    //
+    // NOTE: Currently, persistent volumes in Mesos are exclusive,
+    // meaning that if a persistent volume is used by one task or
+    // executor, it cannot be concurrently used by another task or
+    // executor. But if we allow multiple executors to use the same
+    // persistent volume at the same time in the future, the ownership
+    // of the persistent volume may conflict here.
+    //
+    // TODO(haosdent): Consider letting the frameworks specify the
+    // user/group of the persistent volumes.
+    struct stat s;
+    if (::stat(info->directory.c_str(), &s) < 0) {
+      return Failure(
+          "Failed to get ownership for '" + info->directory +
+          "': " + strerror(errno));
+    }
+
+    LOG(INFO) << "Changing the ownership of the persistent volume at '"
+              << source << "' with uid " << s.st_uid
+              << " and gid " << s.st_gid;
+
+    Try<Nothing> chown = os::chown(s.st_uid, s.st_gid, source, true);
+    if (chown.isError()) {
+      return Failure(
+          "Failed to change the ownership of the persistent volume at '" +
+          source + "' with uid " + stringify(s.st_uid) +
+          " and gid " + stringify(s.st_gid) + ": " + chown.error());
+    }
+
+    // Determine the target of the mount.
+    string target;
+
+    if (info->sandbox.isSome()) {
+      target = path::join(info->sandbox.get(), containerPath);
+    } else {
+      target = path::join(info->directory, containerPath);
+    }
+
+    if (os::exists(target)) {
+      // NOTE: This is possible because 'info->resources' will be
+      // reset when slave restarts and recovers. When the slave calls
+      // 'containerizer->update' after the executor re-registers,
+      // we'll try to re-mount all the already mounted volumes.
+
+      // TODO(jieyu): Check the source of the mount matches the entry
+      // with the same target in the mount table if one can be found.
+      // If not, mount the persistent volume as we do below. This is
+      // possible because the slave could crash after it unmounts the
+      // volume but before it is able to delete the mount point.
+    } else {
+      Try<Nothing> mkdir = os::mkdir(target);
+      if (mkdir.isError()) {
+        return Failure(
+            "Failed to create persistent volume mount point at '" +
+            target + "': " + mkdir.error());
+      }
+
+      LOG(INFO) << "Mounting '" << source << "' to '" << target
+                << "' for persistent volume " << resource
+                << " of container " << containerId;
+
+      Try<Nothing> mount = fs::mount(source, target, None(), MS_BIND, NULL);
+      if (mount.isError()) {
+        return Failure(
+            "Failed to mount persistent volume from '" +
+            source + "' to '" + target + "': " + mount.error());
+      }
+    }
+  }
+
+  // Store the new resources.
+  info->resources = resources;
+
+  return Nothing();
+}
+
+
+Future<ResourceStatistics> LinuxFilesystemIsolatorProcess::usage(
+    const ContainerID& containerId)
+{
+  // No-op: no usage is gathered, so default statistics are returned.
+  return ResourceStatistics();
+}
+
+
+Future<Nothing> LinuxFilesystemIsolatorProcess::cleanup(
+    const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    VLOG(1) << "Ignoring cleanup request for unknown container: "
+            << containerId;
+
+    return Nothing();
+  }
+
+  const Owned<Info>& info = infos[containerId];
+
+  // NOTE: We don't need to clean up mounts in the container's mount
+  // namespace because it's done automatically by the kernel when the
+  // mount namespace is destroyed after the last process terminates.
+
+  // The path to the container's work directory which is the parent of
+  // all the persistent volume mounts.
+  string sandbox;
+
+  if (info->sandbox.isSome()) {
+    sandbox = info->sandbox.get();
+  } else {
+    sandbox = info->directory;
+  }
+
+  infos.erase(containerId);
+
+  // Clean up the mounts for this container in the host mount
+  // namespace, including container's work directory and all the
+  // persistent volume mounts.
+  Try<fs::MountInfoTable> table = fs::MountInfoTable::read();
+  if (table.isError()) {
+    return Failure("Failed to get mount table: " + table.error());
+  }
+
+  bool sandboxMountExists = false;
+
+  foreach (const fs::MountInfoTable::Entry& entry, table.get().entries) {
+    // NOTE: All persistent volumes are mounted at targets under the
+    // container's work directory. We unmount all the persistent
+    // volumes before unmounting the sandbox/work directory mount.
+    if (entry.target == sandbox) {
+      sandboxMountExists = true;
+    } else if (strings::startsWith(entry.target, sandbox)) {
+      LOG(INFO) << "Unmounting volume '" << entry.target
+                << "' for container " << containerId;
+
+      Try<Nothing> unmount = fs::unmount(entry.target);
+      if (unmount.isError()) {
+        return Failure(
+            "Failed to unmount volume '" + entry.target +
+            "': " + unmount.error());
+      }
+    }
+  }
+
+  if (!sandboxMountExists) {
+    // This could happen if the container was not launched by this
+    // isolator (e.g., slaves prior to 0.25.0), or the container did
+    // not specify a root filesystem.
+    LOG(INFO) << "Ignoring unmounting sandbox/work directory"
+              << " for container " << containerId;
+  } else {
+    LOG(INFO) << "Unmounting sandbox/work directory '" << sandbox
+              << "' for container " << containerId;
+
+    Try<Nothing> unmount = fs::unmount(sandbox);
+    if (unmount.isError()) {
+      return Failure(
+          "Failed to unmount sandbox/work directory '" + sandbox +
+          "': " + unmount.error());
+    }
+  }
+
+  // Destroy the provisioned root filesystems.
+  return provisioner->destroy(containerId)
+    .then([]() -> Future<Nothing> { return Nothing(); });
+}
+
+
+LinuxFilesystemIsolatorProcess::Metrics::Metrics(
+    const PID<LinuxFilesystemIsolatorProcess>& isolator)
+  : containers_new_rootfs(
+      "containerizer/mesos/filesystem/containers_new_rootfs",
+      defer(isolator, &LinuxFilesystemIsolatorProcess::_containers_new_rootfs))
+{
+  process::metrics::add(containers_new_rootfs); // Undone in '~Metrics'.
+}
+
+
+LinuxFilesystemIsolatorProcess::Metrics::~Metrics()
+{
+  process::metrics::remove(containers_new_rootfs); // Added in 'Metrics()'.
+}
+
+
+double LinuxFilesystemIsolatorProcess::_containers_new_rootfs()
+{
+  double count = 0.0; // Number of containers with 'sandbox' set.
+
+  foreachvalue (const Owned<Info>& info, infos) {
+    if (info->sandbox.isSome()) { // Not set without a root filesystem.
+      ++count;
+    }
+  }
+
+  return count;
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/filesystem/linux.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/filesystem/linux.hpp b/src/slave/containerizer/mesos/isolators/filesystem/linux.hpp
new file mode 100644
index 0000000..cd99966
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/filesystem/linux.hpp
@@ -0,0 +1,149 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LINUX_FILESYSTEM_ISOLATOR_HPP__
+#define __LINUX_FILESYSTEM_ISOLATOR_HPP__
+
+#include <mesos/mesos.hpp>
+#include <mesos/resources.hpp>
+
+#include <process/owned.hpp>
+#include <process/pid.hpp>
+
+#include <process/metrics/gauge.hpp>
+
+#include <stout/hashmap.hpp>
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+#include "slave/containerizer/mesos/provisioner/provisioner.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// The filesystem isolator on Linux that is responsible for preparing
+// the root filesystems and volumes (e.g., persistent volumes) for
+// containers. It relies on Linux mount namespaces to prevent mounts
+// of a container from being propagated to the host mount table.
+class LinuxFilesystemIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+  static Try<mesos::slave::Isolator*> create(
+      const Flags& flags,
+      const process::Owned<Provisioner>& provisioner);
+
+  virtual ~LinuxFilesystemIsolatorProcess();
+
+  virtual process::Future<Nothing> recover(
+      const std::list<mesos::slave::ContainerState>& states,
+      const hashset<ContainerID>& orphans);
+
+  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+      const ContainerID& containerId,
+      const ExecutorInfo& executorInfo,
+      const std::string& directory,
+      const Option<std::string>& user);
+
+  virtual process::Future<Nothing> isolate(
+      const ContainerID& containerId,
+      pid_t pid);
+
+  virtual process::Future<mesos::slave::ContainerLimitation> watch(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> update(
+      const ContainerID& containerId,
+      const Resources& resources);
+
+  virtual process::Future<ResourceStatistics> usage(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> cleanup(
+      const ContainerID& containerId);
+
+private:
+  LinuxFilesystemIsolatorProcess(
+      const Flags& flags,
+      const process::Owned<Provisioner>& provisioner);
+
+  process::Future<Nothing> _recover(
+      const std::list<mesos::slave::ContainerState>& states,
+      const hashset<ContainerID>& orphans);
+
+  process::Future<Option<mesos::slave::ContainerPrepareInfo>> _prepare(
+      const ContainerID& containerId,
+      const ExecutorInfo& executorInfo,
+      const std::string& directory,
+      const Option<std::string>& user,
+      const Option<std::string>& rootfs);
+
+  process::Future<Option<mesos::slave::ContainerPrepareInfo>> __prepare(
+      const ContainerID& containerId,
+      const ExecutorInfo& executorInfo,
+      const std::string& directory,
+      const Option<std::string>& user,
+      const Option<std::string>& rootfs);
+
+  Try<std::string> script(
+      const ContainerID& containerId,
+      const ExecutorInfo& executorInfo,
+      const std::string& directory,
+      const Option<std::string>& rootfs);
+
+  const Flags flags;
+  const process::Owned<Provisioner> provisioner;
+
+  struct Info
+  {
+    Info(const std::string& _directory) : directory(_directory) {}
+
+    const std::string directory; // The container's work directory.
+
+    // The absolute path to the container's work directory mount point
+    // in the new root filesystem if the container changes its root
+    // filesystem (i.e., '<rootfs>/<flags.sandbox_directory>'). If the
+    // container does not specify a root filesystem, this field will
+    // not be set.
+    Option<std::string> sandbox;
+
+    // Track resources so we can unmount unneeded persistent volumes.
+    Resources resources;
+  };
+
+  hashmap<ContainerID, process::Owned<Info>> infos;
+
+  struct Metrics
+  {
+    explicit Metrics(
+        const process::PID<LinuxFilesystemIsolatorProcess>& isolator);
+    ~Metrics();
+
+    process::metrics::Gauge containers_new_rootfs;
+  } metrics;
+
+  double _containers_new_rootfs(); // Callback for the gauge above.
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __LINUX_FILESYSTEM_ISOLATOR_HPP__

[02/12] mesos git commit: Relocated MesosContainerizer specific files to the correct location.

Posted by ji...@apache.org.

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/backends/copy.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/backends/copy.cpp b/src/slave/containerizer/provisioner/backends/copy.cpp
deleted file mode 100644
index 92fb098..0000000
--- a/src/slave/containerizer/provisioner/backends/copy.cpp
+++ /dev/null
@@ -1,203 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <list>
-
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/dispatch.hpp>
-#include <process/io.hpp>
-#include <process/process.hpp>
-#include <process/subprocess.hpp>
-
-
-#include <stout/foreach.hpp>
-#include <stout/os.hpp>
-
-#include "common/status_utils.hpp"
-
-#include "slave/containerizer/provisioner/backends/copy.hpp"
-
-
-using namespace process;
-
-using std::string;
-using std::list;
-using std::vector;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-class CopyBackendProcess : public Process<CopyBackendProcess>
-{
-public:
-  Future<Nothing> provision(const vector<string>& layers, const string& rootfs);
-
-  Future<bool> destroy(const string& rootfs);
-
-private:
-  Future<Nothing> _provision(string layer, const string& rootfs);
-};
-
-
-Try<Owned<Backend>> CopyBackend::create(const Flags&)
-{
-  return Owned<Backend>(new CopyBackend(
-      Owned<CopyBackendProcess>(new CopyBackendProcess())));
-}
-
-
-CopyBackend::~CopyBackend()
-{
-  terminate(process.get());
-  wait(process.get());
-}
-
-
-CopyBackend::CopyBackend(Owned<CopyBackendProcess> _process)
-  : process(_process)
-{
-  spawn(CHECK_NOTNULL(process.get()));
-}
-
-
-Future<Nothing> CopyBackend::provision(
-    const vector<string>& layers,
-    const string& rootfs)
-{
-  return dispatch(
-      process.get(), &CopyBackendProcess::provision, layers, rootfs);
-}
-
-
-Future<bool> CopyBackend::destroy(const string& rootfs)
-{
-  return dispatch(process.get(), &CopyBackendProcess::destroy, rootfs);
-}
-
-
-Future<Nothing> CopyBackendProcess::provision(
-    const vector<string>& layers,
-    const string& rootfs)
-{
-  if (layers.size() == 0) {
-    return Failure("No filesystem layers provided");
-  }
-
-  if (os::exists(rootfs)) {
-    return Failure("Rootfs is already provisioned");
-  }
-
-  Try<Nothing> mkdir = os::mkdir(rootfs);
-  if (mkdir.isError()) {
-    return Failure("Failed to create rootfs directory: " + mkdir.error());
-  }
-
-  list<Future<Nothing>> futures{Nothing()};
-
-  foreach (const string layer, layers) {
-    futures.push_back(
-        futures.back().then(
-            defer(self(), &Self::_provision, layer, rootfs)));
-  }
-
-  return collect(futures)
-    .then([]() -> Future<Nothing> { return Nothing(); });
-}
-
-
-Future<Nothing> CopyBackendProcess::_provision(
-  string layer,
-  const string& rootfs)
-{
-  VLOG(1) << "Copying layer path '" << layer << "' to rootfs '" << rootfs
-          << "'";
-
-#ifdef __APPLE__
-  if (!strings::endsWith(layer, "/")) {
-    layer += "/";
-  }
-
-  // OSX cp doesn't support -T flag, but supports source trailing
-  // slash so we only copy the content but not the folder.
-  vector<string> args{"cp", "-a", layer, rootfs};
-#else
-  vector<string> args{"cp", "-aT", layer, rootfs};
-#endif // __APPLE__
-
-  Try<Subprocess> s = subprocess(
-      "cp",
-      args,
-      Subprocess::PATH("/dev/null"),
-      Subprocess::PATH("/dev/null"),
-      Subprocess::PIPE());
-
-  if (s.isError()) {
-    return Failure("Failed to create 'cp' subprocess: " + s.error());
-  }
-
-  Subprocess cp = s.get();
-
-  return cp.status()
-    .then([cp](const Option<int>& status) -> Future<Nothing> {
-      if (status.isNone()) {
-        return Failure("Failed to reap subprocess to copy image");
-      } else if (status.get() != 0) {
-        return io::read(cp.err().get())
-          .then([](const string& err) -> Future<Nothing> {
-            return Failure("Failed to copy layer: " + err);
-          });
-      }
-
-      return Nothing();
-    });
-}
-
-
-Future<bool> CopyBackendProcess::destroy(const string& rootfs)
-{
-  vector<string> argv{"rm", "-rf", rootfs};
-
-  Try<Subprocess> s = subprocess(
-      "rm",
-      argv,
-      Subprocess::PATH("/dev/null"),
-      Subprocess::FD(STDOUT_FILENO),
-      Subprocess::FD(STDERR_FILENO));
-
-  if (s.isError()) {
-    return Failure("Failed to create 'rm' subprocess: " + s.error());
-  }
-
-  return s.get().status()
-    .then([](const Option<int>& status) -> Future<bool> {
-      if (status.isNone()) {
-        return Failure("Failed to reap subprocess to destroy rootfs");
-      } else if (status.get() != 0) {
-        return Failure("Failed to destroy rootfs, exit status: " +
-                       WSTRINGIFY(status.get()));
-      }
-
-      return true;
-    });
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/backends/copy.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/backends/copy.hpp b/src/slave/containerizer/provisioner/backends/copy.hpp
deleted file mode 100644
index 7a5aaa4..0000000
--- a/src/slave/containerizer/provisioner/backends/copy.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_BACKENDS_COPY_HPP__
-#define __PROVISIONER_BACKENDS_COPY_HPP__
-
-#include "slave/containerizer/provisioner/backend.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// Forward declaration.
-class CopyBackendProcess;
-
-
-// The backend implementation that copies the layers to the target.
-// NOTE: Using this backend currently has a few implications:
-// 1) The disk space used by the provisioned rootfs is not counted
-//    towards either the usage by the executor/task or the store
-//    cache, which can interfere with the slave hosts's disk space
-//    allocation.
-// 2) The task can write unrestrictedly into the provisioned rootfs
-//    which is not accounted for (in terms of disk usage) either.
-class CopyBackend : public Backend
-{
-public:
-  virtual ~CopyBackend();
-
-  // CopyBackend doesn't use any flag.
-  static Try<process::Owned<Backend>> create(const Flags&);
-
-  // Provisions a rootfs given the layers' paths and target rootfs
-  // path.
-  virtual process::Future<Nothing> provision(
-      const std::vector<std::string>& layers,
-      const std::string& rootfs);
-
-  virtual process::Future<bool> destroy(const std::string& rootfs);
-
-private:
-  explicit CopyBackend(process::Owned<CopyBackendProcess> process);
-
-  CopyBackend(const CopyBackend&); // Not copyable.
-  CopyBackend& operator=(const CopyBackend&); // Not assignable.
-
-  process::Owned<CopyBackendProcess> process;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_BACKENDS_COPY_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/local_puller.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/local_puller.cpp b/src/slave/containerizer/provisioner/docker/local_puller.cpp
deleted file mode 100644
index 74d0e1e..0000000
--- a/src/slave/containerizer/provisioner/docker/local_puller.cpp
+++ /dev/null
@@ -1,355 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <list>
-#include <vector>
-
-#include <glog/logging.h>
-
-#include <stout/json.hpp>
-#include <stout/os.hpp>
-#include <stout/result.hpp>
-
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/dispatch.hpp>
-#include <process/subprocess.hpp>
-
-#include "common/status_utils.hpp"
-
-#include "slave/containerizer/provisioner/docker/local_puller.hpp"
-#include "slave/containerizer/provisioner/docker/paths.hpp"
-#include "slave/containerizer/provisioner/docker/store.hpp"
-
-using namespace process;
-
-using std::list;
-using std::pair;
-using std::string;
-using std::vector;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-class LocalPullerProcess : public process::Process<LocalPullerProcess>
-{
-public:
-  LocalPullerProcess(const Flags& _flags) : flags(_flags) {}
-
-  ~LocalPullerProcess() {}
-
-  process::Future<list<pair<string, string>>> pull(
-      const Image::Name& name,
-      const string& directory);
-
-private:
-  process::Future<Nothing> untarImage(
-      const std::string& tarPath,
-      const std::string& directory);
-
-  process::Future<list<pair<string, string>>> putImage(
-      const Image::Name& name,
-      const std::string& directory);
-
-  process::Future<list<pair<string, string>>> putLayers(
-      const std::string& directory,
-      const std::vector<std::string>& layerIds);
-
-  process::Future<pair<string, string>> putLayer(
-      const std::string& directory,
-      const std::string& layerId);
-
-  const Flags flags;
-};
-
-
-LocalPuller::LocalPuller(const Flags& flags)
-{
-  process = Owned<LocalPullerProcess>(new LocalPullerProcess(flags));
-  process::spawn(process.get());
-}
-
-
-LocalPuller::~LocalPuller()
-{
-  process::terminate(process.get());
-  process::wait(process.get());
-}
-
-
-Future<list<pair<string, string>>> LocalPuller::pull(
-    const Image::Name& name,
-    const string& directory)
-{
-  return dispatch(process.get(), &LocalPullerProcess::pull, name, directory);
-}
-
-
-Future<list<pair<string, string>>> LocalPullerProcess::pull(
-    const Image::Name& name,
-    const string& directory)
-{
-  const string tarPath = paths::getImageArchiveTarPath(
-      flags.docker_local_archives_dir,
-      stringify(name));
-
-  if (!os::exists(tarPath)) {
-    return Failure("Failed to find archive for image '" + stringify(name) +
-                   "' at '" + tarPath + "'");
-  }
-
-  return untarImage(tarPath, directory)
-    .then(defer(self(), &Self::putImage, name, directory));
-}
-
-
-Future<Nothing> LocalPullerProcess::untarImage(
-    const string& tarPath,
-    const string& directory)
-{
-  VLOG(1) << "Untarring image from '" << tarPath
-          << "' to '" << directory << "'";
-
-  // Untar store_discovery_local_dir/name.tar into directory/.
-  // TODO(tnachen): Terminate tar process when slave exits.
-  const vector<string> argv = {
-    "tar",
-    "-C",
-    directory,
-    "-x",
-    "-f",
-    tarPath
-  };
-
-  Try<Subprocess> s = subprocess(
-      "tar",
-      argv,
-      Subprocess::PATH("/dev/null"),
-      Subprocess::PATH("/dev/null"),
-      Subprocess::PATH("/dev/null"));
-
-  if (s.isError()) {
-    return Failure("Failed to create tar subprocess: " + s.error());
-  }
-
-  return s.get().status()
-    .then([tarPath](const Option<int>& status) -> Future<Nothing> {
-      if (status.isNone()) {
-        return Failure("Failed to reap status for tar subprocess in " +
-                        tarPath);
-      }
-      if (!WIFEXITED(status.get()) || WEXITSTATUS(status.get()) != 0) {
-          return Failure("Untar image failed with exit code: " +
-                          WSTRINGIFY(status.get()));
-      }
-
-      return Nothing();
-    });
-}
-
-
-static Result<string> getParentId(
-    const string& directory,
-    const string& layerId)
-{
-  Try<string> manifest =
-    os::read(paths::getImageArchiveLayerManifestPath(directory, layerId));
-
-  if (manifest.isError()) {
-    return Error("Failed to read manifest: " + manifest.error());
-  }
-
-  Try<JSON::Object> json = JSON::parse<JSON::Object>(manifest.get());
-  if (json.isError()) {
-    return Error("Failed to parse manifest: " + json.error());
-  }
-
-  Result<JSON::String> parentId = json.get().find<JSON::String>("parent");
-  if (parentId.isNone() || (parentId.isSome() && parentId.get() == "")) {
-    return None();
-  } else if (parentId.isError()) {
-    return Error("Failed to read parent of layer: " + parentId.error());
-  }
-
-  return parentId.get().value;
-}
-
-
-Future<list<pair<string, string>>> LocalPullerProcess::putImage(
-    const Image::Name& name,
-    const string& directory)
-{
-  Try<string> value =
-    os::read(paths::getImageArchiveRepositoriesPath(directory));
-
-  if (value.isError()) {
-    return Failure("Failed to read repository JSON: " + value.error());
-  }
-
-  Try<JSON::Object> json = JSON::parse<JSON::Object>(value.get());
-  if (json.isError()) {
-    return Failure("Failed to parse JSON: " + json.error());
-  }
-
-  Result<JSON::Object> repositoryValue =
-    json.get().find<JSON::Object>(name.repository());
-
-  if (repositoryValue.isError()) {
-    return Failure("Failed to find repository: " + repositoryValue.error());
-  } else if (repositoryValue.isNone()) {
-    return Failure("Repository '" + name.repository() + "' is not found");
-  }
-
-  const JSON::Object repositoryJson = repositoryValue.get();
-
-  // We don't use JSON find here because a tag might contain a '.'.
-  std::map<string, JSON::Value>::const_iterator entry =
-    repositoryJson.values.find(name.tag());
-
-  if (entry == repositoryJson.values.end()) {
-    return Failure("Tag '" + name.tag() + "' is not found");
-  } else if (!entry->second.is<JSON::String>()) {
-    return Failure("Tag JSON value expected to be JSON::String");
-  }
-
-  const string layerId = entry->second.as<JSON::String>().value;
-
-  Try<string> manifest =
-    os::read(paths::getImageArchiveLayerManifestPath(directory, layerId));
-
-  if (manifest.isError()) {
-    return Failure("Failed to read manifest: " + manifest.error());
-  }
-
-  Try<JSON::Object> manifestJson = JSON::parse<JSON::Object>(manifest.get());
-  if (manifestJson.isError()) {
-    return Failure("Failed to parse manifest: " + manifestJson.error());
-  }
-
-  vector<string> layerIds;
-  layerIds.push_back(layerId);
-  Result<string> parentId = getParentId(directory, layerId);
-  while (parentId.isSome()) {
-    layerIds.insert(layerIds.begin(), parentId.get());
-    parentId = getParentId(directory, parentId.get());
-  }
-
-  if (parentId.isError()) {
-    return Failure("Failed to find parent layer id of layer '" + layerId +
-                   "': " + parentId.error());
-  }
-
-  return putLayers(directory, layerIds);
-}
-
-
-Future<list<pair<string, string>>> LocalPullerProcess::putLayers(
-    const string& directory,
-    const vector<string>& layerIds)
-{
-  list<Future<pair<string, string>>> futures;
-  foreach (const string& layerId, layerIds) {
-    futures.push_back(putLayer(directory, layerId));
-  }
-
-  return collect(futures);
-}
-
-
-Future<pair<string, string>> LocalPullerProcess::putLayer(
-    const string& directory,
-    const string& layerId)
-{
-  // We untar the layer from source into a directory, then move the
-  // layer into store. We do this instead of untarring directly to
-  // store to make sure we don't end up with partially untarred layer
-  // rootfs.
-
-  const string localRootfsPath =
-    paths::getImageArchiveLayerRootfsPath(directory, layerId);
-
-  // Image layer has been untarred but is not present in the store directory.
-  if (os::exists(localRootfsPath)) {
-    LOG(WARNING) << "Image layer '" << layerId << "' rootfs present at but not "
-                 << "in store directory '" << localRootfsPath << "'. Removing "
-                 << "staged rootfs and untarring layer again.";
-
-    Try<Nothing> rmdir = os::rmdir(localRootfsPath);
-    if (rmdir.isError()) {
-      return Failure("Failed to remove incomplete staged rootfs for layer '" +
-                     layerId + "': " + rmdir.error());
-    }
-  }
-
-  Try<Nothing> mkdir = os::mkdir(localRootfsPath);
-  if (mkdir.isError()) {
-    return Failure("Failed to create rootfs path '" + localRootfsPath +
-                   "': " + mkdir.error());
-  }
-
-  // Untar directory/id/layer.tar into directory/id/rootfs.
-  // The tar file will be removed when the staging directory is
-  // removed.
-  const vector<string> argv = {
-    "tar",
-    "-C",
-    localRootfsPath,
-    "-x",
-    "-f",
-    paths::getImageArchiveLayerTarPath(directory, layerId)
-  };
-
-  Try<Subprocess> s = subprocess(
-      "tar",
-      argv,
-      Subprocess::PATH("/dev/null"),
-      Subprocess::PATH("/dev/null"),
-      Subprocess::PATH("/dev/null"));
-
-  if (s.isError()) {
-    return Failure("Failed to create tar subprocess: " + s.error());
-  }
-
-  return s.get().status()
-    .then([directory, layerId](
-        const Option<int>& status) -> Future<pair<string, string>> {
-      if (status.isNone()) {
-        return Failure("Failed to reap subprocess to untar image");
-      } else if (!WIFEXITED(status.get()) || WEXITSTATUS(status.get()) != 0) {
-        return Failure("Untar failed with exit code: " +
-                        WSTRINGIFY(status.get()));
-      }
-
-      const string rootfsPath =
-        paths::getImageArchiveLayerRootfsPath(directory, layerId);
-
-      if (!os::exists(rootfsPath)) {
-        return Failure("Failed to find the rootfs path after extracting layer"
-                       " '" + layerId + "'");
-      }
-
-      return pair<string, string>(layerId, rootfsPath);
-    });
-}
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/local_puller.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/local_puller.hpp b/src/slave/containerizer/provisioner/docker/local_puller.hpp
deleted file mode 100644
index 4574e8a..0000000
--- a/src/slave/containerizer/provisioner/docker/local_puller.hpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_DOCKER_LOCAL_PULLER_HPP__
-#define __PROVISIONER_DOCKER_LOCAL_PULLER_HPP__
-
-#include "slave/containerizer/provisioner/store.hpp"
-
-#include "slave/containerizer/provisioner/docker/message.hpp"
-#include "slave/containerizer/provisioner/docker/puller.hpp"
-
-#include "slave/flags.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-// Forward declaration.
-class LocalPullerProcess;
-
-
-/**
- * LocalPuller assumes Docker images are stored in a local directory
- * (configured with flags.docker_local_archives_dir), with all the
- * images saved as tars with file names in the form of <repo>:<tag>.tar.
- */
-class LocalPuller : public Puller
-{
-public:
-  explicit LocalPuller(const Flags& flags);
-
-  ~LocalPuller();
-
-  process::Future<std::list<std::pair<std::string, std::string>>> pull(
-      const Image::Name& name,
-      const std::string& directory);
-
-private:
-  LocalPuller& operator=(const LocalPuller&) = delete; // Not assignable.
-  LocalPuller(const LocalPuller&) = delete; // Not copyable.
-
-  process::Owned<LocalPullerProcess> process;
-};
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_DOCKER_LOCAL_PULLER_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/message.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/message.hpp b/src/slave/containerizer/provisioner/docker/message.hpp
deleted file mode 100644
index 466e6f8..0000000
--- a/src/slave/containerizer/provisioner/docker/message.hpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __MESSAGES_DOCKER_PROVISIONER_HPP__
-#define __MESSAGES_DOCKER_PROVISIONER_HPP__
-
-#include <stout/strings.hpp>
-
-// ONLY USEFUL AFTER RUNNING PROTOC.
-#include "slave/containerizer/provisioner/docker/message.pb.h"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-// Docker expects the image to be specified on the command line as:
-//   [REGISTRY_HOST[:REGISTRY_PORT]/]REPOSITORY[:TAG|@TYPE:DIGEST]
-//
-// This format is inherently ambiguous when dealing with repository
-// names that include forward slashes. To disambiguate, the docker
-// code looks for '.', or ':', or 'localhost' to decide if the
-// first component is a registry or a respository name. For more
-// detail, drill into the implementation of docker pull.
-//
-// TODO(bmahler): We currently store the digest as a tag, does
-// that makes sense?
-//
-// TODO(bmahler): Validate based on docker's validation logic
-// and return a Try here.
-inline Image::Name parseImageName(std::string s)
-{
-  Image::Name name;
-
-  // Extract the digest.
-  if (strings::contains(s, "@")) {
-    std::vector<std::string> split = strings::split(s, "@");
-
-    s = split[0];
-    name.set_tag(split[1]);
-  }
-
-  // Remove the tag. We need to watch out for a
-  // host:port registry, which also contains ':'.
-  if (strings::contains(s, ":")) {
-    std::vector<std::string> split = strings::split(s, ":");
-
-    // The tag must be the last component. If a slash is
-    // present there is a registry port and no tag.
-    if (!strings::contains(split.back(), "/")) {
-      name.set_tag(split.back());
-      split.pop_back();
-
-      s = strings::join(":", split);
-    }
-  }
-
-  // Default to the 'latest' tag when omitted.
-  if (name.tag().empty()) {
-    name.set_tag("latest");
-  }
-
-  // Extract the registry and repository. The first component can
-  // either be the registry, or the first part of the repository!
-  // We resolve this ambiguity using the same hacks used in the
-  // docker code ('.', ':', 'localhost' indicate a registry).
-  std::vector<std::string> split = strings::split(s, "/", 2);
-
-  if (split.size() == 1) {
-    name.set_repository(s);
-  } else if (strings::contains(split[0], ".") ||
-             strings::contains(split[0], ":") ||
-             split[0] == "localhost") {
-    name.set_registry(split[0]);
-    name.set_repository(split[1]);
-  } else {
-    name.set_repository(s);
-  }
-
-  return name;
-}
-
-
-inline std::ostream& operator<<(
-    std::ostream& stream,
-    const Image::Name& name)
-{
-  if (name.has_registry()) {
-    return stream << name.registry() << "/" << name.repository() << ":"
-                  << name.tag();
-  }
-
-  return stream << name.repository() << ":" << name.tag();
-}
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __MESSAGES_DOCKER_PROVISIONER_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/message.proto
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/message.proto b/src/slave/containerizer/provisioner/docker/message.proto
deleted file mode 100644
index c33e0c5..0000000
--- a/src/slave/containerizer/provisioner/docker/message.proto
+++ /dev/null
@@ -1,100 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import "mesos/mesos.proto";
-
-package mesos.internal.slave.docker;
-
-/**
- * A Docker Image name and the layer ids of the layers that comprise the image.
- * The layerIds are ordered, with the root layer id (no parent layer id) first
- * and the leaf layer id last.
- */
-message Image {
-  message Name {
-    optional string registry = 1;
-    required string repository = 2;
-
-    // TODO(bmahler): This may hold a tag or a digest, split these?
-    required string tag = 3;
-  }
-
-  required Name name = 1;
-
-  // The order of the layers represents the dependency between layers.
-  repeated string layer_ids = 2;
-}
-
-
-message Images {
-  repeated Image images = 1;
-}
-
-
-/**
-* Protobuf for the Docker image manifest JSON schema:
-* https://github.com/docker/distribution/blob/master/docs/spec/manifest-v2-1.md
-*/
-message DockerImageManifest {
-  required string name = 1;
-  required string tag = 2;
-  required string architecture = 3;
-
-  message FsLayers {
-    required string blobSum = 1;
-  }
-
-  repeated FsLayers fsLayers = 4;
-
-  message History {
-    message V1Compatibility {
-      required string id = 1;
-      required string parent = 2;
-    }
-
-    required V1Compatibility v1Compatibility = 1;
-  }
-
-  repeated History history = 5;
-  required uint32 schemaVersion = 6;
-
-  message Signatures {
-
-    //JOSE (A JSON Web Signature).
-    message Header {
-
-      //JSON Web Key.
-      message Jwk {
-        required string crv = 1;
-        required string kid = 2;
-        required string kty = 3;
-        required string x = 4;
-        required string y = 5;
-      }
-
-      optional Jwk jwk = 1;
-      required string alg = 2;
-    }
-
-    required Header header = 1;
-    required string signature = 2;
-    required string protected = 3;
-  }
-
-  repeated Signatures signatures = 7;
-}

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/metadata_manager.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/metadata_manager.cpp b/src/slave/containerizer/provisioner/docker/metadata_manager.cpp
deleted file mode 100644
index 2b2de52..0000000
--- a/src/slave/containerizer/provisioner/docker/metadata_manager.cpp
+++ /dev/null
@@ -1,232 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "slave/containerizer/provisioner/docker/metadata_manager.hpp"
-
-#include <vector>
-
-#include <glog/logging.h>
-
-#include <stout/foreach.hpp>
-#include <stout/hashset.hpp>
-#include <stout/os.hpp>
-#include <stout/protobuf.hpp>
-
-#include <process/defer.hpp>
-#include <process/dispatch.hpp>
-#include <process/owned.hpp>
-
-#include "common/status_utils.hpp"
-
-#include "slave/containerizer/provisioner/docker/paths.hpp"
-#include "slave/containerizer/provisioner/docker/message.hpp"
-
-#include "slave/state.hpp"
-
-using namespace process;
-
-using std::list;
-using std::string;
-using std::vector;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-class MetadataManagerProcess : public process::Process<MetadataManagerProcess>
-{
-public:
-  MetadataManagerProcess(const Flags& _flags) : flags(_flags) {}
-
-  ~MetadataManagerProcess() {}
-
-  Future<Nothing> recover();
-
-  Future<Image> put(
-      const Image::Name& name,
-      const std::vector<std::string>& layerIds);
-
-  Future<Option<Image>> get(const Image::Name& name);
-
-  // TODO(chenlily): Implement removal of unreferenced images.
-
-private:
-  // Write out metadata manager state to persistent store.
-  Try<Nothing> persist();
-
-  const Flags flags;
-
-  // This is a lookup table for images that are stored in memory. It is keyed
-  // by the name of the Image.
-  // For example, "ubuntu:14.04" -> ubuntu14:04 Image.
-  hashmap<std::string, Image> storedImages;
-};
-
-
-Try<Owned<MetadataManager>> MetadataManager::create(const Flags& flags)
-{
-  Owned<MetadataManagerProcess> process(new MetadataManagerProcess(flags));
-
-  return Owned<MetadataManager>(new MetadataManager(process));
-}
-
-
-MetadataManager::MetadataManager(Owned<MetadataManagerProcess> process)
-  : process(process)
-{
-  process::spawn(CHECK_NOTNULL(process.get()));
-}
-
-
-MetadataManager::~MetadataManager()
-{
-  process::terminate(process.get());
-  process::wait(process.get());
-}
-
-
-Future<Nothing> MetadataManager::recover()
-{
-  return process::dispatch(process.get(), &MetadataManagerProcess::recover);
-}
-
-
-Future<Image> MetadataManager::put(
-    const Image::Name& name,
-    const vector<string>& layerIds)
-{
-  return dispatch(
-      process.get(),
-      &MetadataManagerProcess::put,
-      name,
-      layerIds);
-}
-
-
-Future<Option<Image>> MetadataManager::get(const Image::Name& name)
-{
-  return dispatch(process.get(), &MetadataManagerProcess::get, name);
-}
-
-
-Future<Image> MetadataManagerProcess::put(
-    const Image::Name& name,
-    const vector<string>& layerIds)
-{
-  const string imageName = stringify(name);
-
-  Image dockerImage;
-  dockerImage.mutable_name()->CopyFrom(name);
-  foreach (const string& layerId, layerIds) {
-    dockerImage.add_layer_ids(layerId);
-  }
-
-  storedImages[imageName] = dockerImage;
-
-  Try<Nothing> status = persist();
-  if (status.isError()) {
-    return Failure("Failed to save state of Docker images: " + status.error());
-  }
-
-  return dockerImage;
-}
-
-
-Future<Option<Image>> MetadataManagerProcess::get(
-    const Image::Name& name)
-{
-  const string imageName = stringify(name);
-
-  if (!storedImages.contains(imageName)) {
-    return None();
-  }
-
-  return storedImages[imageName];
-}
-
-
-Try<Nothing> MetadataManagerProcess::persist()
-{
-  Images images;
-
-  foreachvalue (const Image& image, storedImages) {
-    images.add_images()->CopyFrom(image);
-  }
-
-  Try<Nothing> status = state::checkpoint(
-      paths::getStoredImagesPath(flags.docker_store_dir), images);
-  if (status.isError()) {
-    return Error("Failed to perform checkpoint: " + status.error());
-  }
-
-  return Nothing();
-}
-
-
-Future<Nothing> MetadataManagerProcess::recover()
-{
-  string storedImagesPath = paths::getStoredImagesPath(flags.docker_store_dir);
-
-  if (!os::exists(storedImagesPath)) {
-    LOG(INFO) << "No images to load from disk. Docker provisioner image "
-              << "storage path '" << storedImagesPath << "' does not exist";
-    return Nothing();
-  }
-
-  Result<Images> images = ::protobuf::read<Images>(storedImagesPath);
-  if (images.isError()) {
-    return Failure("Failed to read protobuf for Docker provisioner image: " +
-                   images.error());
-  }
-
-  foreach (const Image image, images.get().images()) {
-    vector<string> missingLayerIds;
-    foreach (const string layerId, image.layer_ids()) {
-      const string rootfsPath =
-        paths::getImageLayerRootfsPath(flags.docker_store_dir, layerId);
-
-      if (!os::exists(rootfsPath)) {
-        missingLayerIds.push_back(layerId);
-      }
-    }
-
-    if (!missingLayerIds.empty()) {
-      LOG(WARNING) << "Skipped loading image  '" << stringify(image.name())
-                   << "' due to missing layers: " << stringify(missingLayerIds);
-      continue;
-    }
-
-    const string imageName = stringify(image.name());
-    if (storedImages.contains(imageName)) {
-      LOG(WARNING) << "Found duplicate image in recovery for image name '"
-                   << imageName << "'";
-    } else {
-      storedImages[imageName] = image;
-    }
-  }
-
-  LOG(INFO) << "Loaded " << storedImages.size() << " Docker images";
-
-  return Nothing();
-}
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/metadata_manager.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/metadata_manager.hpp b/src/slave/containerizer/provisioner/docker/metadata_manager.hpp
deleted file mode 100644
index 885080d..0000000
--- a/src/slave/containerizer/provisioner/docker/metadata_manager.hpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_DOCKER_METADATA_MANAGER_HPP__
-#define __PROVISIONER_DOCKER_METADATA_MANAGER_HPP__
-
-#include <list>
-#include <string>
-
-#include <stout/hashmap.hpp>
-#include <stout/json.hpp>
-#include <stout/option.hpp>
-#include <stout/protobuf.hpp>
-#include <stout/try.hpp>
-
-#include <process/future.hpp>
-#include <process/owned.hpp>
-#include <process/process.hpp>
-
-#include "slave/containerizer/provisioner/provisioner.hpp"
-
-#include "slave/containerizer/provisioner/docker/message.hpp"
-
-#include "slave/flags.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-// Forward Declaration.
-class MetadataManagerProcess;
-
-/**
- * The MetadataManager tracks the Docker images cached by the
- * provisioner that are stored on disk. It keeps track of the layers
- * that Docker images are composed of and recovers Image objects
- * upon initialization by checking for dependent layers stored on disk.
- * Currently, image layers are stored indefinitely, with no garbage
- * collection of unreferenced image layers.
- */
-class MetadataManager
-{
-public:
-  static Try<process::Owned<MetadataManager>> create(const Flags& flags);
-
-  ~MetadataManager();
-
-  /**
-   * Recover all stored Image and its layer references.
-   */
-  process::Future<Nothing> recover();
-
-  /**
-   * Create an Image, put it in metadata manager and persist the reference
-   * store state to disk.
-   *
-   * @param name     the name of the Docker image to place in the reference
-   *                 store.
-   * @param layerIds the list of layer ids that comprise the Docker image in
-   *                 order where the root layer's id (no parent layer) is first
-   *                 and the leaf layer's id is last.
-   */
-  process::Future<Image> put(
-      const Image::Name& name,
-      const std::vector<std::string>& layerIds);
-
-  /**
-   * Retrieve Image based on image name if it is among the Images
-   * stored in memory.
-   *
-   * @param name  the name of the Docker image to retrieve
-   */
-  process::Future<Option<Image>> get(const Image::Name& name);
-
-private:
-  explicit MetadataManager(process::Owned<MetadataManagerProcess> process);
-
-  MetadataManager(const MetadataManager&); // Not copyable.
-  MetadataManager& operator=(const MetadataManager&); // Not assignable.
-
-  process::Owned<MetadataManagerProcess> process;
-};
-
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_DOCKER_METADATA_MANAGER_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/paths.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/paths.cpp b/src/slave/containerizer/provisioner/docker/paths.cpp
deleted file mode 100644
index 5733fb7..0000000
--- a/src/slave/containerizer/provisioner/docker/paths.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "slave/containerizer/provisioner/docker/paths.hpp"
-
-#include <stout/path.hpp>
-
-using std::string;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-namespace paths {
-
-// Each helper below joins its arguments with fixed path components
-// and returns the resulting path string.
-
-// '<storeDir>/staging'.
-string getStagingDir(const string& storeDir)
-{
-  string stagingDir = path::join(storeDir, "staging");
-  return stagingDir;
-}
-
-
-// '<storeDir>/staging/XXXXXX'.
-// NOTE(review): the trailing 'XXXXXX' looks like a template for a
-// temporary directory name -- confirm against the caller.
-string getStagingTempDir(const string& storeDir)
-{
-  const string stagingDir = getStagingDir(storeDir);
-  return path::join(stagingDir, "XXXXXX");
-}
-
-
-// '<discoveryDir>/<name>.tar'.
-string getImageArchiveTarPath(
-    const string& discoveryDir,
-    const string& name)
-{
-  const string tarName = name + ".tar";
-  return path::join(discoveryDir, tarName);
-}
-
-
-// '<archivePath>/repositories'.
-string getImageArchiveRepositoriesPath(const string& archivePath)
-{
-  string repositories = path::join(archivePath, "repositories");
-  return repositories;
-}
-
-
-// '<archivePath>/<layerId>'.
-string getImageArchiveLayerPath(
-    const string& archivePath,
-    const string& layerId)
-{
-  string layerDir = path::join(archivePath, layerId);
-  return layerDir;
-}
-
-
-// '<archivePath>/<layerId>/json'.
-string getImageArchiveLayerManifestPath(
-    const string& archivePath,
-    const string& layerId)
-{
-  const string layerDir = getImageArchiveLayerPath(archivePath, layerId);
-  return path::join(layerDir, "json");
-}
-
-
-// '<archivePath>/<layerId>/layer.tar'.
-string getImageArchiveLayerTarPath(
-    const string& archivePath,
-    const string& layerId)
-{
-  const string layerDir = getImageArchiveLayerPath(archivePath, layerId);
-  return path::join(layerDir, "layer.tar");
-}
-
-
-// '<archivePath>/<layerId>/rootfs'.
-string getImageArchiveLayerRootfsPath(
-    const string& archivePath,
-    const string& layerId)
-{
-  const string layerDir = getImageArchiveLayerPath(archivePath, layerId);
-  return path::join(layerDir, "rootfs");
-}
-
-
-// '<storeDir>/layers/<layerId>'.
-string getImageLayerPath(
-    const string& storeDir,
-    const string& layerId)
-{
-  string layerDir = path::join(storeDir, "layers", layerId);
-  return layerDir;
-}
-
-
-// '<storeDir>/layers/<layerId>/rootfs'.
-string getImageLayerRootfsPath(
-    const string& storeDir,
-    const string& layerId)
-{
-  const string layerDir = getImageLayerPath(storeDir, layerId);
-  return path::join(layerDir, "rootfs");
-}
-
-
-// '<storeDir>/storedImages'.
-string getStoredImagesPath(const string& storeDir)
-{
-  string storedImages = path::join(storeDir, "storedImages");
-  return storedImages;
-}
-
-} // namespace paths {
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/paths.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/paths.hpp b/src/slave/containerizer/provisioner/docker/paths.hpp
deleted file mode 100644
index 18beb2e..0000000
--- a/src/slave/containerizer/provisioner/docker/paths.hpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_DOCKER_PATHS_HPP__
-#define __PROVISIONER_DOCKER_PATHS_HPP__
-
-#include <list>
-#include <string>
-
-#include <mesos/mesos.hpp>
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-namespace paths {
-
-/**
- * Path helpers for the Docker image store. Each function declared
- * below returns a path string built from its arguments.
- *
- * The Docker store file system layout is as follows:
- * Image store dir ('--docker_store_dir' slave flag)
- *    |--staging
- *       |-- <temp_dir_archive>
- *           |-- <layer_id>
- *               |-- rootfs
- *    |--layers
- *       |--<layer_id>
- *           |--rootfs
- *    |--storedImages (file holding information on cached images)
- *
- * Paths produced by the helpers (as implemented in paths.cpp):
- *   getStagingDir(storeDir)
- *       <storeDir>/staging
- *   getStagingTempDir(storeDir)
- *       <storeDir>/staging/XXXXXX
- *   getImageArchiveTarPath(discoveryDir, name)
- *       <discoveryDir>/<name>.tar
- *   getImageArchiveRepositoriesPath(archivePath)
- *       <archivePath>/repositories
- *   getImageArchiveLayerPath(archivePath, layerId)
- *       <archivePath>/<layerId>
- *   getImageArchiveLayerManifestPath(archivePath, layerId)
- *       <archivePath>/<layerId>/json
- *   getImageArchiveLayerTarPath(archivePath, layerId)
- *       <archivePath>/<layerId>/layer.tar
- *   getImageArchiveLayerRootfsPath(archivePath, layerId)
- *       <archivePath>/<layerId>/rootfs
- *   getImageLayerPath(storeDir, layerId)
- *       <storeDir>/layers/<layerId>
- *   getImageLayerRootfsPath(storeDir, layerId)
- *       <storeDir>/layers/<layerId>/rootfs
- *   getStoredImagesPath(storeDir)
- *       <storeDir>/storedImages
- *
- * NOTE(review): the 'XXXXXX' suffix returned by getStagingTempDir()
- * is presumably a template for creating <temp_dir_archive> -- confirm
- * against the caller.
- */
-
-std::string getStagingDir(const std::string& storeDir);
-
-
-std::string getStagingTempDir(const std::string& storeDir);
-
-
-std::string getImageArchiveTarPath(
-    const std::string& discoveryDir,
-    const std::string& name);
-
-
-std::string getImageArchiveRepositoriesPath(const std::string& archivePath);
-
-
-std::string getImageArchiveLayerPath(
-    const std::string& archivePath,
-    const std::string& layerId);
-
-
-std::string getImageArchiveLayerManifestPath(
-    const std::string& archivePath,
-    const std::string& layerId);
-
-
-std::string getImageArchiveLayerTarPath(
-  const std::string& archivePath,
-  const std::string& layerId);
-
-
-std::string getImageArchiveLayerRootfsPath(
-  const std::string& archivePath,
-  const std::string& layerId);
-
-
-std::string getImageLayerPath(
-    const std::string& storeDir,
-    const std::string& layerId);
-
-
-std::string getImageLayerRootfsPath(
-    const std::string& storeDir,
-    const std::string& layerId);
-
-
-std::string getStoredImagesPath(const std::string& storeDir);
-
-} // namespace paths {
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_DOCKER_PATHS_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/puller.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/puller.cpp b/src/slave/containerizer/provisioner/docker/puller.cpp
deleted file mode 100644
index cb05324..0000000
--- a/src/slave/containerizer/provisioner/docker/puller.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "slave/containerizer/provisioner/docker/puller.hpp"
-
-#include "slave/containerizer/provisioner/docker/local_puller.hpp"
-
-using std::string;
-
-using process::Owned;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-// Instantiates the puller implementation selected by the
-// 'docker_puller' flag. Only "local" is recognized; any other value
-// yields an Error.
-Try<Owned<Puller>> Puller::create(const Flags& flags)
-{
-  const string kind = flags.docker_puller;
-
-  if (kind != "local") {
-    return Error("Unknown or unsupported docker puller: " + kind);
-  }
-
-  return Owned<Puller>(new LocalPuller(flags));
-}
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/puller.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/puller.hpp b/src/slave/containerizer/provisioner/docker/puller.hpp
deleted file mode 100644
index 105b4e7..0000000
--- a/src/slave/containerizer/provisioner/docker/puller.hpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_DOCKER_PULLER_HPP__
-#define __PROVISIONER_DOCKER_PULLER_HPP__
-
-#include <list>
-#include <utility>
-
-#include <stout/try.hpp>
-
-#include <process/future.hpp>
-#include <process/owned.hpp>
-
-#include "slave/containerizer/provisioner/docker/message.hpp"
-
-#include "slave/flags.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-
-/**
- * Abstract interface for components that pull the layers of a Docker
- * image into a local directory.
- */
-class Puller
-{
-public:
-  /**
-   * Factory method returning a concrete Puller chosen from the given
-   * flags; the Try return type carries an Error on failure.
-   */
-  static Try<process::Owned<Puller>> create(const Flags& flags);
-
-  virtual ~Puller() {}
-
-  /**
-   * Pull the layers of a Docker image into the specified directory,
-   * and return the list of layer ids in that image in the right
-   * dependency order, together with the directory where the puller
-   * put each layer's changeset.
-   *
-   * @param name The name of the image.
-   * @param directory The target directory to store the layers.
-   * @return list of layers, each mapped to its local directory, ordered
-   *         by dependency. NOTE(review): presumably each pair is
-   *         (layer id, directory) -- confirm against an implementation.
-   */
-  virtual process::Future<std::list<std::pair<std::string, std::string>>> pull(
-      const docker::Image::Name& name,
-      const std::string& directory) = 0;
-};
-
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-
-#endif // __PROVISIONER_DOCKER_PULLER_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/registry_client.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/registry_client.cpp b/src/slave/containerizer/provisioner/docker/registry_client.cpp
deleted file mode 100644
index 24aa95c..0000000
--- a/src/slave/containerizer/provisioner/docker/registry_client.cpp
+++ /dev/null
@@ -1,641 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <vector>
-
-#include <process/defer.hpp>
-#include <process/dispatch.hpp>
-#include <process/http.hpp>
-#include <process/io.hpp>
-
-#include <stout/os.hpp>
-
-#include "slave/containerizer/provisioner/docker/registry_client.hpp"
-#include "slave/containerizer/provisioner/docker/token_manager.hpp"
-
-using std::string;
-using std::vector;
-
-using process::Failure;
-using process::Future;
-using process::Owned;
-using process::Process;
-
-using process::http::Request;
-using process::http::Response;
-using process::http::URL;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-namespace registry {
-
-// Shorthands for the result types nested in RegistryClient.
-using FileSystemLayerInfo = RegistryClient::FileSystemLayerInfo;
-
-using ManifestResponse = RegistryClient::ManifestResponse;
-
-// Timeout used by RegistryClient::getManifest() and
-// RegistryClient::getBlob() when the caller does not pass one.
-const Duration RegistryClient::DEFAULT_MANIFEST_TIMEOUT_SECS = Seconds(10);
-
-// Value used for 'maxSize' by RegistryClient::getBlob() when the
-// caller does not pass one.
-const size_t RegistryClient::DEFAULT_MANIFEST_MAXSIZE_BYTES = 4096;
-
-// Port assumed for a redirect location that does not name a port.
-static const uint16_t DEFAULT_SSL_PORT = 443;
-
-// Process that performs the registry requests on behalf of
-// RegistryClient, which dispatches getManifest() and getBlob() to it.
-class RegistryClientProcess : public Process<RegistryClientProcess>
-{
-public:
-  // Creates the process together with a TokenManager for 'authServer'.
-  // Returns an Error if the token manager cannot be created.
-  static Try<Owned<RegistryClientProcess>> create(
-      const URL& registry,
-      const URL& authServer,
-      const Option<RegistryClient::Credentials>& creds);
-
-  // Fetches and parses the manifest found at
-  // 'v2/<path>/manifests/<tag>' on the registry server; 'tag' defaults
-  // to "latest".
-  Future<RegistryClient::ManifestResponse> getManifest(
-      const string& path,
-      const Option<string>& tag,
-      const Duration& timeout);
-
-  // Fetches 'v2/<path>/blobs/<digest>' from the registry server and
-  // writes the response body to 'filePath'. The future holds the
-  // length of the response body.
-  Future<size_t> getBlob(
-      const string& path,
-      const Option<string>& digest,
-      const Path& filePath,
-      const Duration& timeout,
-      size_t maxSize);
-
-private:
-  RegistryClientProcess(
-    const URL& registryServer,
-    const Owned<TokenManager>& tokenManager,
-    const Option<RegistryClient::Credentials>& creds);
-
-  // Issues a GET for 'url', handling "401 Unauthorized" (by acquiring
-  // a token and retrying) and "307 Temporary Redirect" (by following
-  // the 'Location' header) when 'resend' is set. The last parameter
-  // carries the status of the previous response, if any, and is used
-  // to stop retrying when the same status is received twice in a row.
-  Future<Response> doHttpGet(
-      const URL& url,
-      const Option<process::http::Headers>& headers,
-      const Duration& timeout,
-      bool resend,
-      const Option<string>& lastResponse) const;
-
-  // Extracts the comma separated key="value" attributes from the
-  // 'WWW-Authenticate: Bearer ...' header of 'httpResponse'.
-  Try<process::http::Headers> getAuthenticationAttributes(
-      const Response& httpResponse) const;
-
-  const URL registryServer_;
-  Owned<TokenManager> tokenManager_;
-  const Option<RegistryClient::Credentials> credentials_;
-
-  // Not copyable, not assignable.
-  RegistryClientProcess(const RegistryClientProcess&) = delete;
-  RegistryClientProcess& operator = (const RegistryClientProcess&) = delete;
-};
-
-
-// Factory: builds the backing RegistryClientProcess first and then
-// wraps it in a RegistryClient. A failure to build the process is
-// returned as an Error.
-//
-// NOTE(review): 'authServer' and 'registryServer' are handed on in the
-// opposite order from how this definition names its parameters, both
-// to RegistryClientProcess::create() and to the constructor. Whether
-// this is a bug or compensates for the parameter order used by the
-// header declaration cannot be told from this file -- confirm before
-// changing the order.
-Try<Owned<RegistryClient>> RegistryClient::create(
-    const URL& registryServer,
-    const URL& authServer,
-    const Option<Credentials>& creds)
-{
-  Try<Owned<RegistryClientProcess>> clientProcess =
-    RegistryClientProcess::create(authServer, registryServer, creds);
-
-  if (!clientProcess.isError()) {
-    RegistryClient* client = new RegistryClient(
-        authServer, registryServer, creds, clientProcess.get());
-
-    return Owned<RegistryClient>(client);
-  }
-
-  return Error(clientProcess.error());
-}
-
-
-// Stores the server URLs and credentials and spawns the given process.
-// CHECK_NOTNULL guards against 'process' not holding an object.
-RegistryClient::RegistryClient(
-    const URL& registryServer,
-    const URL& authServer,
-    const Option<Credentials>& creds,
-    const Owned<RegistryClientProcess>& process)
-  : registryServer_(registryServer),
-    authServer_(authServer),
-    credentials_(creds),
-    process_(process)
-{
-  spawn(CHECK_NOTNULL(process_.get()));
-}
-
-
-// Terminates the process spawned by the constructor and waits for it.
-RegistryClient::~RegistryClient()
-{
-  RegistryClientProcess* backing = process_.get();
-
-  terminate(backing);
-  process::wait(backing);
-}
-
-
-// Dispatches the manifest request to the backing process, substituting
-// DEFAULT_MANIFEST_TIMEOUT_SECS when no timeout is given.
-Future<ManifestResponse> RegistryClient::getManifest(
-    const string& _path,
-    const Option<string>& _tag,
-    const Option<Duration>& _timeout)
-{
-  const Duration effectiveTimeout =
-    _timeout.isSome() ? _timeout.get() : DEFAULT_MANIFEST_TIMEOUT_SECS;
-
-  return dispatch(
-      process_.get(),
-      &RegistryClientProcess::getManifest,
-      _path,
-      _tag,
-      effectiveTimeout);
-}
-
-
-// Dispatches the blob request to the backing process, substituting
-// DEFAULT_MANIFEST_TIMEOUT_SECS and DEFAULT_MANIFEST_MAXSIZE_BYTES for
-// a missing timeout and maximum size respectively.
-Future<size_t> RegistryClient::getBlob(
-    const string& _path,
-    const Option<string>& _digest,
-    const Path& _filePath,
-    const Option<Duration>& _timeout,
-    const Option<size_t>& _maxSize)
-{
-  const Duration effectiveTimeout =
-    _timeout.isSome() ? _timeout.get() : DEFAULT_MANIFEST_TIMEOUT_SECS;
-
-  const size_t effectiveMaxSize =
-    _maxSize.isSome() ? _maxSize.get() : DEFAULT_MANIFEST_MAXSIZE_BYTES;
-
-  return dispatch(
-      process_.get(),
-      &RegistryClientProcess::getBlob,
-      _path,
-      _digest,
-      _filePath,
-      effectiveTimeout,
-      effectiveMaxSize);
-}
-
-
-// Creates a TokenManager for 'authServer' and, if that succeeds, a
-// RegistryClientProcess that talks to 'registryServer'.
-Try<Owned<RegistryClientProcess>> RegistryClientProcess::create(
-    const URL& registryServer,
-    const URL& authServer,
-    const Option<RegistryClient::Credentials>& creds)
-{
-  Try<Owned<TokenManager>> tokenManager = TokenManager::create(authServer);
-
-  if (!tokenManager.isError()) {
-    RegistryClientProcess* clientProcess =
-      new RegistryClientProcess(registryServer, tokenManager.get(), creds);
-
-    return Owned<RegistryClientProcess>(clientProcess);
-  }
-
-  return Error("Failed to create token manager: " + tokenManager.error());
-}
-
-
-// Stores the registry server URL, the token manager and the
-// credentials; no other work is done here.
-RegistryClientProcess::RegistryClientProcess(
-    const URL& registryServer,
-    const Owned<TokenManager>& tokenMgr,
-    const Option<RegistryClient::Credentials>& creds)
-  : registryServer_(registryServer),
-    tokenManager_(tokenMgr),
-    credentials_(creds) {}
-
-
-// Parses the 'WWW-Authenticate' header of 'httpResponse'. The header
-// value must consist of exactly two space separated tokens, the first
-// being "Bearer" and the second a comma separated list of key="value"
-// parameters. Returns the parameters as a key -> value map, or an
-// Error if the header is missing or does not have that shape.
-Try<process::http::Headers> RegistryClientProcess::getAuthenticationAttributes(
-    const Response& httpResponse) const
-{
-  auto header = httpResponse.headers.find("WWW-Authenticate");
-  if (header == httpResponse.headers.end()) {
-    return Error("Failed to find WWW-Authenticate header value");
-  }
-
-  const string& authString = header->second;
-
-  const vector<string> schemeAndParams = strings::tokenize(authString, " ");
-
-  const bool isBearer =
-    (schemeAndParams.size() == 2) && (schemeAndParams[0] == "Bearer");
-
-  if (!isBearer) {
-    // TODO(jojy): Look at various possibilities of auth response. We currently
-    // assume that the string will have realm information.
-    return Error("Invalid authentication header value: " + authString);
-  }
-
-  const vector<string> authParams =
-    strings::tokenize(schemeAndParams[1], ",");
-
-  process::http::Headers authAttributes;
-
-  foreach (const string& param, authParams) {
-    // Splitting on both '=' and '"' leaves exactly the key and the
-    // unquoted value for a well-formed key="value" parameter.
-    const vector<string> keyValue = strings::tokenize(param, "=\"");
-
-    if (keyValue.size() != 2) {
-      return Error(
-          "Failed to get authentication attribute from response parameter " +
-          param);
-    }
-
-    authAttributes.insert({keyValue[0], keyValue[1]});
-  }
-
-  return authAttributes;
-}
-
-
-// Issues an HTTP GET for 'url' and resolves the response as follows:
-//   - "200 OK": the response is returned.
-//   - "400 Bad Request": fails with the messages found in the 'errors'
-//     array of the JSON body.
-//   - "401 Unauthorized" (only if 'resend'): acquires a token for the
-//     'service' and 'scope' named by the WWW-Authenticate header and
-//     repeats the request with an 'Authorization: Bearer' header.
-//   - "307 Temporary Redirect" (only if 'resend'): repeats the request
-//     once against the 'Location' header, with 'resend' cleared.
-//   - anything else: fails.
-//
-// @param url      the URL to request.
-// @param headers  optional request headers.
-// @param timeout  limit applied to the request and to the token fetch.
-// @param resend   whether 401/307 responses may trigger a new request.
-// @param lastResponseStatus  status of the previous response in a
-//     retry chain; receiving the same non-200/400 status again fails
-//     instead of retrying forever.
-Future<Response> RegistryClientProcess::doHttpGet(
-    const URL& url,
-    const Option<process::http::Headers>& headers,
-    const Duration& timeout,
-    bool resend,
-    const Option<string>& lastResponseStatus) const
-{
-  return process::http::get(url, headers)
-    .after(timeout, [](
-        const Future<Response>& httpResponseFuture) -> Future<Response> {
-      return Failure("Response timeout");
-    })
-    .then(defer(self(), [=](
-        const Response& httpResponse) -> Future<Response> {
-      VLOG(1) << "Response status: " + httpResponse.status;
-
-      // Set the future if we get a OK response.
-      if (httpResponse.status == "200 OK") {
-        return httpResponse;
-      } else if (httpResponse.status == "400 Bad Request") {
-        Try<JSON::Object> errorResponse =
-          JSON::parse<JSON::Object>(httpResponse.body);
-
-        if (errorResponse.isError()) {
-          return Failure("Failed to parse bad request response JSON: " +
-                         errorResponse.error());
-        }
-
-        std::ostringstream out;
-        bool first = true;
-        Result<JSON::Array> errorObjects =
-          errorResponse.get().find<JSON::Array>("errors");
-
-        if (errorObjects.isError()) {
-          return Failure("Failed to find 'errors' in bad request response: " +
-                         errorObjects.error());
-        } else if (errorObjects.isNone()) {
-          return Failure("Errors not found in bad request response");
-        }
-
-        foreach (const JSON::Value& error, errorObjects.get().values) {
-          // Entries that are not JSON objects cannot carry a message;
-          // skip them rather than calling as<>() on the wrong type.
-          if (!error.is<JSON::Object>()) {
-            continue;
-          }
-
-          Result<JSON::String> message =
-            error.as<JSON::Object>().find<JSON::String>("message");
-
-          if (message.isError()) {
-            return Failure("Failed to parse bad request error message: " +
-                           message.error());
-          } else if (message.isNone()) {
-            continue;
-          }
-
-          if (first) {
-            out << message.get().value;
-            first = false;
-          } else {
-            out << ", " << message.get().value;
-          }
-        }
-
-        return Failure("Received Bad request, errors: [" + out.str() + "]");
-      }
-
-      // Prevent infinite recursion.
-      if (lastResponseStatus.isSome() &&
-          (lastResponseStatus.get() == httpResponse.status)) {
-        return Failure("Invalid response: " + httpResponse.status);
-      }
-
-      // If resend is not set, we dont try again and stop here.
-      if (!resend) {
-        return Failure("Bad response: " + httpResponse.status);
-      }
-
-      // Handle 401 Unauthorized.
-      if (httpResponse.status == "401 Unauthorized") {
-        Try<process::http::Headers> authAttributes =
-          getAuthenticationAttributes(httpResponse);
-
-        if (authAttributes.isError()) {
-          return Failure(
-              "Failed to get authentication attributes: " +
-              authAttributes.error());
-        }
-
-        // Both attributes are required below; report their absence as
-        // a failure instead of letting at() throw.
-        if (!authAttributes.get().contains("service") ||
-            !authAttributes.get().contains("scope")) {
-          return Failure(
-              "Failed to get authentication attributes: "
-              "'service' or 'scope' not found");
-        }
-
-        // TODO(jojy): Currently only handling TLS/cert authentication.
-        Future<Token> tokenResponse = tokenManager_->getToken(
-          authAttributes.get().at("service"),
-          authAttributes.get().at("scope"),
-          None());
-
-        return tokenResponse
-          .after(timeout, [=](
-              Future<Token> tokenResponse) -> Future<Token> {
-            tokenResponse.discard();
-            return Failure("Token response timeout");
-          })
-          .then(defer(self(), [=](
-              const Future<Token>& tokenResponse) {
-            // Send request with acquired token.
-            process::http::Headers authHeaders = {
-              {"Authorization", "Bearer " + tokenResponse.get().raw}
-            };
-
-            return doHttpGet(
-                url,
-                authHeaders,
-                timeout,
-                true,
-                httpResponse.status);
-        }));
-      } else if (httpResponse.status == "307 Temporary Redirect") {
-        // Handle redirect.
-
-        // TODO(jojy): Add redirect functionality in http::get.
-
-        auto toURL = [](
-            const string& urlString) -> Try<URL> {
-          // TODO(jojy): Need to add functionality to URL class that parses a
-          // string to its URL components. For now, assuming:
-          //  - scheme is https
-          //  - path always ends with /
-
-          static const string schemePrefix = "https://";
-
-          // The scheme must lead the string: the remainder is taken by
-          // offset, so a match elsewhere would yield a bogus host.
-          if (!strings::startsWith(urlString, schemePrefix)) {
-            return Error(
-                "Failed to find expected token '" + schemePrefix +
-                "' in redirect url");
-          }
-
-          const string schemeSuffix = urlString.substr(schemePrefix.length());
-
-          // Everything up to the first '/' is "host[:port]"; the rest
-          // (including that '/') is the path.
-          const size_t slash = schemeSuffix.find('/');
-          const string address = schemeSuffix.substr(0, slash);
-
-          if (address.empty()) {
-            return Error("Failed to find host in redirect url");
-          }
-
-          const string path = schemeSuffix.substr(address.length());
-
-          const vector<string> addrComponents =
-            strings::tokenize(address, ":");
-
-          uint16_t port = DEFAULT_SSL_PORT;
-          string domain = address;
-
-          // Parse the port.
-          if (addrComponents.size() == 2) {
-            domain = addrComponents[0];
-
-            Try<uint16_t> tryPort = numify<uint16_t>(addrComponents[1]);
-            if (tryPort.isError()) {
-              return Error(
-                  "Failed to parse location: " + urlString + " for port.");
-            }
-
-            port = tryPort.get();
-          }
-
-          return URL("https", domain, port, path);
-        };
-
-        if (httpResponse.headers.find("Location") ==
-            httpResponse.headers.end()) {
-          return Failure(
-              "Invalid redirect response: 'Location' not found in headers.");
-        }
-
-        const string& location = httpResponse.headers.at("Location");
-        Try<URL> tryUrl = toURL(location);
-        if (tryUrl.isError()) {
-          return Failure(
-              "Failed to parse '" + location + "': " + tryUrl.error());
-        }
-
-        // 'resend' is cleared so that a redirect is followed only once.
-        return doHttpGet(
-            tryUrl.get(),
-            headers,
-            timeout,
-            false,
-            httpResponse.status);
-      } else {
-        return Failure("Invalid response: " + httpResponse.status);
-      }
-    }));
-}
-
-
-// Requests 'v2/<path>/manifests/<tag>' from the registry server and
-// converts the response into a ManifestResponse.
-//
-// @param path     repository path; must not contain spaces.
-// @param tag      repository tag; defaults to "latest" and must not
-//                 contain spaces.
-// @param timeout  limit passed on to doHttpGet().
-// @return the manifest name, the value of the 'Docker-Content-Digest'
-//         header, and one (blobSum, layer id) entry per element of
-//         'fsLayers'. The layer id is read from the 'v1Compatibility'
-//         JSON of the 'history' element at the same index. Fails if
-//         the request fails or the response lacks any of these parts.
-Future<ManifestResponse> RegistryClientProcess::getManifest(
-    const string& path,
-    const Option<string>& tag,
-    const Duration& timeout)
-{
-  if (strings::contains(path, " ")) {
-    return Failure("Invalid repository path: " + path);
-  }
-
-  string repoTag = tag.getOrElse("latest");
-  if (strings::contains(repoTag, " ")) {
-    return Failure("Invalid repository tag: " + repoTag);
-  }
-
-  URL manifestURL(registryServer_);
-  manifestURL.path =
-    "v2/" + path + "/manifests/" + repoTag;
-
-  // NOTE: The Result values below are tested with '!isSome()' rather
-  // than 'isNone()' so that an error state (not just a missing value)
-  // is also reported instead of reaching get().
-  auto getManifestResponse = [](
-      const Response& httpResponse) -> Try<ManifestResponse> {
-    if (!httpResponse.headers.contains("Docker-Content-Digest")) {
-      return Error("Docker-Content-Digest header missing in response");
-    }
-
-    Try<JSON::Object> responseJSON =
-      JSON::parse<JSON::Object>(httpResponse.body);
-
-    if (responseJSON.isError()) {
-      return Error(responseJSON.error());
-    }
-
-    Result<JSON::String> name = responseJSON.get().find<JSON::String>("name");
-    if (!name.isSome()) {
-      return Error("Failed to find \"name\" in manifest response");
-    }
-
-    Result<JSON::Array> fsLayers =
-      responseJSON.get().find<JSON::Array>("fsLayers");
-
-    if (!fsLayers.isSome()) {
-      return Error("Failed to find \"fsLayers\" in manifest response");
-    }
-
-    Result<JSON::Array> historyArray =
-      responseJSON.get().find<JSON::Array>("history");
-
-    if (!historyArray.isSome()) {
-      return Error("Failed to find \"history\" in manifest response");
-    }
-
-    // The two arrays are walked in lockstep below, so they must have
-    // the same length.
-    if (historyArray.get().values.size() != fsLayers.get().values.size()) {
-      return Error(
-          "\"history\" and \"fsLayers\" array count mismatch "
-          "in manifest response");
-    }
-
-    vector<FileSystemLayerInfo> fsLayerInfoList;
-    size_t index = 0;
-
-    foreach (const JSON::Value& layer, fsLayers.get().values) {
-      if (!layer.is<JSON::Object>()) {
-        return Error(
-            "Failed to parse layer as a JSON object for index: " +
-            stringify(index));
-      }
-
-      const JSON::Object& layerInfoJSON = layer.as<JSON::Object>();
-
-      // Get blobsum for layer.
-      const Result<JSON::String> blobSumInfo =
-        layerInfoJSON.find<JSON::String>("blobSum");
-
-      if (!blobSumInfo.isSome()) {
-        return Error("Failed to find \"blobSum\" in manifest response");
-      }
-
-      // Get history for layer.
-      if (!historyArray.get().values[index].is<JSON::Object>()) {
-        return Error(
-            "Failed to parse history as a JSON object for index: " +
-            stringify(index));
-      }
-      const JSON::Object& historyObj =
-        historyArray.get().values[index].as<JSON::Object>();
-
-      // Get layer id. It is stored inside a JSON document that is
-      // itself embedded as a string under "v1Compatibility".
-      const Result<JSON::String> v1CompatibilityJSON =
-        historyObj.find<JSON::String>("v1Compatibility");
-
-      if (!v1CompatibilityJSON.isSome()) {
-        return Error(
-            "Failed to obtain layer v1 compability json in manifest for layer: "
-            + stringify(index));
-      }
-
-      Try<JSON::Object> v1CompatibilityObj =
-        JSON::parse<JSON::Object>(v1CompatibilityJSON.get().value);
-
-      if (!v1CompatibilityObj.isSome()) {
-        return Error(
-            "Failed to parse v1 compability json in manifest for layer: "
-            + stringify(index));
-      }
-
-      const Result<JSON::String> id =
-        v1CompatibilityObj.get().find<JSON::String>("id");
-
-      if (!id.isSome()) {
-        return Error(
-            "Failed to find \"id\" in manifest for layer: " + stringify(index));
-      }
-
-      fsLayerInfoList.emplace_back(
-          FileSystemLayerInfo{
-            blobSumInfo.get().value,
-            id.get().value,
-          });
-
-      index++;
-    }
-
-    return ManifestResponse {
-      name.get().value,
-      httpResponse.headers.at("Docker-Content-Digest"),
-      fsLayerInfoList,
-    };
-  };
-
-  return doHttpGet(manifestURL, None(), timeout, true, None())
-    .then([getManifestResponse] (
-        const Response& response) -> Future<ManifestResponse> {
-      Try<ManifestResponse> manifestResponse = getManifestResponse(response);
-
-      if (manifestResponse.isError()) {
-        return Failure(
-            "Failed to parse manifest response: " + manifestResponse.error());
-      }
-
-      return manifestResponse.get();
-    });
-}
-
-
-// Requests 'v2/<path>/blobs/<digest>' from the registry server and
-// writes the response body to 'filePath', creating the parent
-// directory of 'filePath' if needed.
-//
-// @param path      repository path; must not contain spaces.
-// @param digest    digest of the blob; an empty string is used in the
-//                  URL when it is not set.
-// @param filePath  file the blob is written to (truncated if it
-//                  exists).
-// @param timeout   limit passed on to doHttpGet().
-// @param maxSize   currently unused, see the TODO below.
-// @return the length of the response body. Fails if the path is
-//         invalid, the directory or file cannot be created, or the
-//         request fails.
-Future<size_t> RegistryClientProcess::getBlob(
-    const string& path,
-    const Option<string>& digest,
-    const Path& filePath,
-    const Duration& timeout,
-    size_t maxSize)
-{
-  // Validate the arguments before touching the file system so that a
-  // request that is rejected anyway does not leave a directory behind.
-  if (strings::contains(path, " ")) {
-    return Failure("Invalid repository path: " + path);
-  }
-
-  // TODO(jojy): Return more state, for example - if the directory is new.
-  // TODO(jojy): This currently leaves a residue in failure cases. Would be
-  // ideal if we can completely rollback.
-  Try<Nothing> created = os::mkdir(filePath.dirname(), true);
-  if (created.isError()) {
-    return Failure(
-        "Failed to create directory to download blob: " +
-        created.error());
-  }
-
-  URL blobURL(registryServer_);
-  blobURL.path =
-    "v2/" + path + "/blobs/" + digest.getOrElse("");
-
-  auto saveBlob = [filePath](
-      const Response& httpResponse) -> Future<size_t> {
-    // TODO(jojy): Add verification step.
-    // TODO(jojy): Add check for max size.
-    size_t size = httpResponse.body.length();
-    Try<int> fd = os::open(
-        filePath.value,
-        O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC,
-        S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
-
-    if (fd.isError()) {
-      return Failure("Failed to open file '" + filePath.value + "': " +
-                     fd.error());
-    }
-
-    // The descriptor is closed once the write has finished, whether it
-    // succeeded or not.
-    return process::io::write(fd.get(), httpResponse.body)
-      .then([size](const Future<Nothing>&) { return size; })
-      .onAny([fd]() { os::close(fd.get()); } );
-  };
-
-  return doHttpGet(blobURL, None(), timeout, true, None())
-    .then([saveBlob](const Response& response) { return saveBlob(response); });
-}
-
-} // namespace registry {
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/registry_client.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/registry_client.hpp b/src/slave/containerizer/provisioner/docker/registry_client.hpp
deleted file mode 100644
index 1d3377e..0000000
--- a/src/slave/containerizer/provisioner/docker/registry_client.hpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_DOCKER_REGISTRY_CLIENT_HPP__
-#define __PROVISIONER_DOCKER_REGISTRY_CLIENT_HPP__
-
-#include <string>
-#include <vector>
-
-#include <stout/duration.hpp>
-#include <stout/hashmap.hpp>
-#include <stout/json.hpp>
-#include <stout/path.hpp>
-
-#include <process/future.hpp>
-#include <process/http.hpp>
-#include <process/process.hpp>
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-namespace registry {
-
-// Forward declarations.
-class RegistryClientProcess;
-
-
-class RegistryClient
-{
-public:
-  /**
-   * Encapsulates information about a file system layer.
-   */
-  struct FileSystemLayerInfo {
-    // TODO(jojy): This string includes the checksum type also now. Need to
-    // separate this into checksum method and checksum.
-    const std::string checksumInfo;
-    const std::string layerId;
-  };
-
-  /**
-   * Encapsulates response of "GET Manifest" request.
-   *
-   * Reference: https://docs.docker.com/registry/spec/api
-   */
-  struct ManifestResponse {
-    const std::string name;
-    const std::string digest;
-    const std::vector<FileSystemLayerInfo> fsLayerInfoList;
-  };
-
-  /**
-   * Encapsulates auth credentials for the client sessions.
-   * TODO(jojy): Secure heap to protect the credentials.
-   */
-  struct Credentials {
-    /**
-     * UserId for basic authentication.
-     */
-    const Option<std::string> userId;
-    /**
-     * Password for basic authentication.
-     */
-    const Option<std::string> password;
-    /**
-     * Account for fetching data from registry.
-     */
-    const Option<std::string> account;
-  };
-
-  /**
-   * Factory method for creating RegistryClient objects.
-   *
-   * @param registryServer URL of docker registry server.
-   * @param authServer URL of authorization server.
-   * @param credentials credentials for client session (optional).
-   * @return RegistryClient on Success.
-   *         Error on failure.
-   */
-  static Try<process::Owned<RegistryClient>> create(
-      const process::http::URL& registryServer,
-      const process::http::URL& authServer,
-      const Option<Credentials>& credentials);
-
-  /**
-   * Fetches manifest for a repository from the client's remote registry server.
-   *
-   * @param path path of the repository on the registry.
-   * @param tag unique tag that identifies the repository. Will default to
-   *    latest.
-   * @param timeout Maximum time ater which the request will timeout and return
-   *    a failure. Will default to RESPONSE_TIMEOUT.
-   * @return JSON object on success.
-   *         Failure on process failure.
-   */
-  process::Future<ManifestResponse> getManifest(
-      const std::string& path,
-      const Option<std::string>& tag,
-      const Option<Duration>& timeout);
-
-  /**
-   * Fetches blob for a repository from the client's remote registry server.
-   *
-   * @param path path of the repository on the registry.
-   * @param digest digest of the blob (from manifest).
-   * @param filePath file path to store the fetched blob.
-   * @param timeout Maximum time ater which the request will timeout and return
-   *    a failure. Will default to RESPONSE_TIMEOUT.
-   * @param maxSize Maximum size of the response thats acceptable. Will default
-   *    to MAX_RESPONSE_SIZE.
-   * @return size of downloaded blob on success.
-   *         Failure in case of any errors.
-   */
-  process::Future<size_t> getBlob(
-      const std::string& path,
-      const Option<std::string>& digest,
-      const Path& filePath,
-      const Option<Duration>& timeout,
-      const Option<size_t>& maxSize);
-
-  ~RegistryClient();
-
-private:
-  RegistryClient(
-    const process::http::URL& registryServer,
-    const process::http::URL& authServer,
-    const Option<Credentials>& credentials,
-    const process::Owned<RegistryClientProcess>& process);
-
-  static const Duration DEFAULT_MANIFEST_TIMEOUT_SECS;
-  static const size_t DEFAULT_MANIFEST_MAXSIZE_BYTES;
-
-  const process::http::URL registryServer_;
-  const process::http::URL authServer_;
-  const Option<Credentials> credentials_;
-  process::Owned<RegistryClientProcess> process_;
-
-  RegistryClient(const RegistryClient&) = delete;
-  RegistryClient& operator=(const RegistryClient&) = delete;
-};
-
-} // namespace registry {
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_DOCKER_REGISTRY_CLIENT_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/spec.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/spec.cpp b/src/slave/containerizer/provisioner/docker/spec.cpp
deleted file mode 100644
index 5d6893d..0000000
--- a/src/slave/containerizer/provisioner/docker/spec.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stout/foreach.hpp>
-#include <stout/json.hpp>
-#include <stout/protobuf.hpp>
-#include <stout/strings.hpp>
-
-#include "slave/containerizer/provisioner/docker/spec.hpp"
-
-using std::string;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-namespace spec {
-
-// Validate if the specified image manifest conforms to the Docker spec.
-Option<Error> validateManifest(const DockerImageManifest& manifest)
-{
-  // Validate required fields are present,
-  // e.g., repeated fields that has to be >= 1.
-  if (manifest.fslayers_size() <= 0) {
-    return Error("FsLayers field must have at least one blobSum");
-  }
-
-  if (manifest.history_size() <= 0) {
-    return Error("History field must have at least one v1Compatibility");
-  }
-
-  if (manifest.signatures_size() <= 0) {
-    return Error("Signatures field must have at least one signature");
-  }
-
-  // Verify that blobSum and v1Compatibility numbers are equal.
-  if (manifest.fslayers_size() != manifest.history_size()) {
-    return Error("Size of blobSum and v1Compatibility must be equal");
-  }
-
-  // FsLayers field validation.
-  foreach (const docker::DockerImageManifest::FsLayers& fslayer,
-           manifest.fslayers()) {
-    const string& blobSum = fslayer.blobsum();
-    if (!strings::contains(blobSum, ":")) {
-      return Error("Incorrect blobSum format");
-    }
-  }
-
-  return None();
-}
-
-
-Try<docker::DockerImageManifest> parse(const JSON::Object& json)
-{
-  Try<docker::DockerImageManifest> manifest =
-    protobuf::parse<docker::DockerImageManifest>(json);
-
-  if (manifest.isError()) {
-    return Error("Protobuf parse failed: " + manifest.error());
-  }
-
-  Option<Error> error = validateManifest(manifest.get());
-  if (error.isSome()) {
-    return Error("Docker Image Manifest Validation failed: " +
-                 error.get().message);
-  }
-
-  return manifest.get();
-}
-
-} // namespace spec {
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/provisioner/docker/spec.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/provisioner/docker/spec.hpp b/src/slave/containerizer/provisioner/docker/spec.hpp
deleted file mode 100644
index 199db54..0000000
--- a/src/slave/containerizer/provisioner/docker/spec.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PROVISIONER_DOCKER_SPEC_HPP__
-#define __PROVISIONER_DOCKER_SPEC_HPP__
-
-#include <stout/error.hpp>
-#include <stout/json.hpp>
-#include <stout/option.hpp>
-
-#include <mesos/mesos.hpp>
-
-#include "slave/containerizer/provisioner/docker/message.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-namespace docker {
-namespace spec {
-
-// Validate if the specified image manifest conforms to the Docker spec.
-Option<Error> validateManifest(const docker::DockerImageManifest& manifest);
-
-// TODO(Gilbert): add validations here, e.g., Manifest, Blob, Layout, ImageID.
-
-// Parse the DockerImageManifest from the specified JSON object.
-Try<docker::DockerImageManifest> parse(const JSON::Object& json);
-
-} // namespace spec {
-} // namespace docker {
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PROVISIONER_DOCKER_SPEC_HPP__

[04/12] mesos git commit: Relocated MesosContainerizer specific files to the correct location.

Posted by ji...@apache.org.

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/local_puller.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/local_puller.cpp b/src/slave/containerizer/mesos/provisioner/docker/local_puller.cpp
new file mode 100644
index 0000000..f314f20
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/local_puller.cpp
@@ -0,0 +1,355 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <list>
+#include <vector>
+
+#include <glog/logging.h>
+
+#include <stout/json.hpp>
+#include <stout/os.hpp>
+#include <stout/result.hpp>
+
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+#include <process/subprocess.hpp>
+
+#include "common/status_utils.hpp"
+
+#include "slave/containerizer/mesos/provisioner/docker/local_puller.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/store.hpp"
+
+using namespace process;
+
+using std::list;
+using std::pair;
+using std::string;
+using std::vector;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+class LocalPullerProcess : public process::Process<LocalPullerProcess>
+{
+public:
+  LocalPullerProcess(const Flags& _flags) : flags(_flags) {}
+
+  ~LocalPullerProcess() {}
+
+  process::Future<list<pair<string, string>>> pull(
+      const Image::Name& name,
+      const string& directory);
+
+private:
+  process::Future<Nothing> untarImage(
+      const std::string& tarPath,
+      const std::string& directory);
+
+  process::Future<list<pair<string, string>>> putImage(
+      const Image::Name& name,
+      const std::string& directory);
+
+  process::Future<list<pair<string, string>>> putLayers(
+      const std::string& directory,
+      const std::vector<std::string>& layerIds);
+
+  process::Future<pair<string, string>> putLayer(
+      const std::string& directory,
+      const std::string& layerId);
+
+  const Flags flags;
+};
+
+
+LocalPuller::LocalPuller(const Flags& flags)
+{
+  process = Owned<LocalPullerProcess>(new LocalPullerProcess(flags));
+  process::spawn(process.get());
+}
+
+
+LocalPuller::~LocalPuller()
+{
+  process::terminate(process.get());
+  process::wait(process.get());
+}
+
+
+Future<list<pair<string, string>>> LocalPuller::pull(
+    const Image::Name& name,
+    const string& directory)
+{
+  return dispatch(process.get(), &LocalPullerProcess::pull, name, directory);
+}
+
+
+Future<list<pair<string, string>>> LocalPullerProcess::pull(
+    const Image::Name& name,
+    const string& directory)
+{
+  const string tarPath = paths::getImageArchiveTarPath(
+      flags.docker_local_archives_dir,
+      stringify(name));
+
+  if (!os::exists(tarPath)) {
+    return Failure("Failed to find archive for image '" + stringify(name) +
+                   "' at '" + tarPath + "'");
+  }
+
+  return untarImage(tarPath, directory)
+    .then(defer(self(), &Self::putImage, name, directory));
+}
+
+
+Future<Nothing> LocalPullerProcess::untarImage(
+    const string& tarPath,
+    const string& directory)
+{
+  VLOG(1) << "Untarring image from '" << tarPath
+          << "' to '" << directory << "'";
+
+  // Untar the image archive at tarPath into directory/.
+  // TODO(tnachen): Terminate tar process when slave exits.
+  const vector<string> argv = {
+    "tar",
+    "-C",
+    directory,
+    "-x",
+    "-f",
+    tarPath
+  };
+
+  Try<Subprocess> s = subprocess(
+      "tar",
+      argv,
+      Subprocess::PATH("/dev/null"),
+      Subprocess::PATH("/dev/null"),
+      Subprocess::PATH("/dev/null"));
+
+  if (s.isError()) {
+    return Failure("Failed to create tar subprocess: " + s.error());
+  }
+
+  return s.get().status()
+    .then([tarPath](const Option<int>& status) -> Future<Nothing> {
+      if (status.isNone()) {
+        return Failure("Failed to reap status for tar subprocess in " +
+                        tarPath);
+      }
+      if (!WIFEXITED(status.get()) || WEXITSTATUS(status.get()) != 0) {
+          return Failure("Untar image failed with exit code: " +
+                          WSTRINGIFY(status.get()));
+      }
+
+      return Nothing();
+    });
+}
+
+
+static Result<string> getParentId(
+    const string& directory,
+    const string& layerId)
+{
+  Try<string> manifest =
+    os::read(paths::getImageArchiveLayerManifestPath(directory, layerId));
+
+  if (manifest.isError()) {
+    return Error("Failed to read manifest: " + manifest.error());
+  }
+
+  Try<JSON::Object> json = JSON::parse<JSON::Object>(manifest.get());
+  if (json.isError()) {
+    return Error("Failed to parse manifest: " + json.error());
+  }
+
+  Result<JSON::String> parentId = json.get().find<JSON::String>("parent");
+  if (parentId.isNone() || (parentId.isSome() && parentId.get() == "")) {
+    return None();
+  } else if (parentId.isError()) {
+    return Error("Failed to read parent of layer: " + parentId.error());
+  }
+
+  return parentId.get().value;
+}
+
+
+Future<list<pair<string, string>>> LocalPullerProcess::putImage(
+    const Image::Name& name,
+    const string& directory)
+{
+  Try<string> value =
+    os::read(paths::getImageArchiveRepositoriesPath(directory));
+
+  if (value.isError()) {
+    return Failure("Failed to read repository JSON: " + value.error());
+  }
+
+  Try<JSON::Object> json = JSON::parse<JSON::Object>(value.get());
+  if (json.isError()) {
+    return Failure("Failed to parse JSON: " + json.error());
+  }
+
+  Result<JSON::Object> repositoryValue =
+    json.get().find<JSON::Object>(name.repository());
+
+  if (repositoryValue.isError()) {
+    return Failure("Failed to find repository: " + repositoryValue.error());
+  } else if (repositoryValue.isNone()) {
+    return Failure("Repository '" + name.repository() + "' is not found");
+  }
+
+  const JSON::Object repositoryJson = repositoryValue.get();
+
+  // We don't use JSON find here because a tag might contain a '.'.
+  std::map<string, JSON::Value>::const_iterator entry =
+    repositoryJson.values.find(name.tag());
+
+  if (entry == repositoryJson.values.end()) {
+    return Failure("Tag '" + name.tag() + "' is not found");
+  } else if (!entry->second.is<JSON::String>()) {
+    return Failure("Tag JSON value expected to be JSON::String");
+  }
+
+  const string layerId = entry->second.as<JSON::String>().value;
+
+  Try<string> manifest =
+    os::read(paths::getImageArchiveLayerManifestPath(directory, layerId));
+
+  if (manifest.isError()) {
+    return Failure("Failed to read manifest: " + manifest.error());
+  }
+
+  Try<JSON::Object> manifestJson = JSON::parse<JSON::Object>(manifest.get());
+  if (manifestJson.isError()) {
+    return Failure("Failed to parse manifest: " + manifestJson.error());
+  }
+
+  vector<string> layerIds;
+  layerIds.push_back(layerId);
+  Result<string> parentId = getParentId(directory, layerId);
+  while (parentId.isSome()) {
+    layerIds.insert(layerIds.begin(), parentId.get());
+    parentId = getParentId(directory, parentId.get());
+  }
+
+  if (parentId.isError()) {
+    return Failure("Failed to find parent layer id of layer '" + layerId +
+                   "': " + parentId.error());
+  }
+
+  return putLayers(directory, layerIds);
+}
+
+
+Future<list<pair<string, string>>> LocalPullerProcess::putLayers(
+    const string& directory,
+    const vector<string>& layerIds)
+{
+  list<Future<pair<string, string>>> futures;
+  foreach (const string& layerId, layerIds) {
+    futures.push_back(putLayer(directory, layerId));
+  }
+
+  return collect(futures);
+}
+
+
+Future<pair<string, string>> LocalPullerProcess::putLayer(
+    const string& directory,
+    const string& layerId)
+{
+  // We untar the layer from source into a directory, then move the
+  // layer into store. We do this instead of untarring directly to
+  // store to make sure we don't end up with partially untarred layer
+  // rootfs.
+
+  const string localRootfsPath =
+    paths::getImageArchiveLayerRootfsPath(directory, layerId);
+
+  // Image layer has been untarred but is not present in the store directory.
+  if (os::exists(localRootfsPath)) {
+    LOG(WARNING) << "Image layer '" << layerId << "' rootfs present at but not "
+                 << "in store directory '" << localRootfsPath << "'. Removing "
+                 << "staged rootfs and untarring layer again.";
+
+    Try<Nothing> rmdir = os::rmdir(localRootfsPath);
+    if (rmdir.isError()) {
+      return Failure("Failed to remove incomplete staged rootfs for layer '" +
+                     layerId + "': " + rmdir.error());
+    }
+  }
+
+  Try<Nothing> mkdir = os::mkdir(localRootfsPath);
+  if (mkdir.isError()) {
+    return Failure("Failed to create rootfs path '" + localRootfsPath +
+                   "': " + mkdir.error());
+  }
+
+  // Untar directory/id/layer.tar into directory/id/rootfs.
+  // The tar file will be removed when the staging directory is
+  // removed.
+  const vector<string> argv = {
+    "tar",
+    "-C",
+    localRootfsPath,
+    "-x",
+    "-f",
+    paths::getImageArchiveLayerTarPath(directory, layerId)
+  };
+
+  Try<Subprocess> s = subprocess(
+      "tar",
+      argv,
+      Subprocess::PATH("/dev/null"),
+      Subprocess::PATH("/dev/null"),
+      Subprocess::PATH("/dev/null"));
+
+  if (s.isError()) {
+    return Failure("Failed to create tar subprocess: " + s.error());
+  }
+
+  return s.get().status()
+    .then([directory, layerId](
+        const Option<int>& status) -> Future<pair<string, string>> {
+      if (status.isNone()) {
+        return Failure("Failed to reap subprocess to untar image");
+      } else if (!WIFEXITED(status.get()) || WEXITSTATUS(status.get()) != 0) {
+        return Failure("Untar failed with exit code: " +
+                        WSTRINGIFY(status.get()));
+      }
+
+      const string rootfsPath =
+        paths::getImageArchiveLayerRootfsPath(directory, layerId);
+
+      if (!os::exists(rootfsPath)) {
+        return Failure("Failed to find the rootfs path after extracting layer"
+                       " '" + layerId + "'");
+      }
+
+      return pair<string, string>(layerId, rootfsPath);
+    });
+}
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/local_puller.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/local_puller.hpp b/src/slave/containerizer/mesos/provisioner/docker/local_puller.hpp
new file mode 100644
index 0000000..87d8002
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/local_puller.hpp
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_DOCKER_LOCAL_PULLER_HPP__
+#define __PROVISIONER_DOCKER_LOCAL_PULLER_HPP__
+
+#include "slave/containerizer/mesos/provisioner/store.hpp"
+
+#include "slave/containerizer/mesos/provisioner/docker/message.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/puller.hpp"
+
+#include "slave/flags.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+// Forward declaration.
+class LocalPullerProcess;
+
+
+/**
+ * LocalPuller assumes Docker images are stored in a local directory
+ * (configured with flags.docker_local_archives_dir), with all the
+ * images saved as tars with file names in the form of <repo>:<tag>.tar.
+ */
+class LocalPuller : public Puller
+{
+public:
+  explicit LocalPuller(const Flags& flags);
+
+  ~LocalPuller();
+
+  process::Future<std::list<std::pair<std::string, std::string>>> pull(
+      const Image::Name& name,
+      const std::string& directory);
+
+private:
+  LocalPuller& operator=(const LocalPuller&) = delete; // Not assignable.
+  LocalPuller(const LocalPuller&) = delete; // Not copyable.
+
+  process::Owned<LocalPullerProcess> process;
+};
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_DOCKER_LOCAL_PULLER_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/message.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/message.hpp b/src/slave/containerizer/mesos/provisioner/docker/message.hpp
new file mode 100644
index 0000000..bb5248c
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/message.hpp
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __MESSAGES_DOCKER_PROVISIONER_HPP__
+#define __MESSAGES_DOCKER_PROVISIONER_HPP__
+
+#include <stout/strings.hpp>
+
+// ONLY USEFUL AFTER RUNNING PROTOC.
+#include "slave/containerizer/mesos/provisioner/docker/message.pb.h"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+// Docker expects the image to be specified on the command line as:
+//   [REGISTRY_HOST[:REGISTRY_PORT]/]REPOSITORY[:TAG|@TYPE:DIGEST]
+//
+// This format is inherently ambiguous when dealing with repository
+// names that include forward slashes. To disambiguate, the docker
+// code looks for '.', or ':', or 'localhost' to decide if the
+// first component is a registry or a repository name. For more
+// detail, drill into the implementation of docker pull.
+//
+// TODO(bmahler): We currently store the digest as a tag, does
+// that make sense?
+//
+// TODO(bmahler): Validate based on docker's validation logic
+// and return a Try here.
+inline Image::Name parseImageName(std::string s)
+{
+  Image::Name name;
+
+  // Extract the digest.
+  if (strings::contains(s, "@")) {
+    std::vector<std::string> split = strings::split(s, "@");
+
+    s = split[0];
+    name.set_tag(split[1]);
+  }
+
+  // Remove the tag. We need to watch out for a
+  // host:port registry, which also contains ':'.
+  if (strings::contains(s, ":")) {
+    std::vector<std::string> split = strings::split(s, ":");
+
+    // The tag must be the last component. If a slash is
+    // present there is a registry port and no tag.
+    if (!strings::contains(split.back(), "/")) {
+      name.set_tag(split.back());
+      split.pop_back();
+
+      s = strings::join(":", split);
+    }
+  }
+
+  // Default to the 'latest' tag when omitted.
+  if (name.tag().empty()) {
+    name.set_tag("latest");
+  }
+
+  // Extract the registry and repository. The first component can
+  // either be the registry, or the first part of the repository!
+  // We resolve this ambiguity using the same hacks used in the
+  // docker code ('.', ':', 'localhost' indicate a registry).
+  std::vector<std::string> split = strings::split(s, "/", 2);
+
+  if (split.size() == 1) {
+    name.set_repository(s);
+  } else if (strings::contains(split[0], ".") ||
+             strings::contains(split[0], ":") ||
+             split[0] == "localhost") {
+    name.set_registry(split[0]);
+    name.set_repository(split[1]);
+  } else {
+    name.set_repository(s);
+  }
+
+  return name;
+}
+
+
+inline std::ostream& operator<<(
+    std::ostream& stream,
+    const Image::Name& name)
+{
+  if (name.has_registry()) {
+    return stream << name.registry() << "/" << name.repository() << ":"
+                  << name.tag();
+  }
+
+  return stream << name.repository() << ":" << name.tag();
+}
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __MESSAGES_DOCKER_PROVISIONER_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/message.proto
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/message.proto b/src/slave/containerizer/mesos/provisioner/docker/message.proto
new file mode 100644
index 0000000..c33e0c5
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/message.proto
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import "mesos/mesos.proto";
+
+package mesos.internal.slave.docker;
+
+/**
+ * A Docker Image name and the layer ids of the layers that comprise the image.
+ * The layerIds are ordered, with the root layer id (no parent layer id) first
+ * and the leaf layer id last.
+ */
+message Image {
+  message Name {
+    optional string registry = 1;
+    required string repository = 2;
+
+    // TODO(bmahler): This may hold a tag or a digest, split these?
+    required string tag = 3;
+  }
+
+  required Name name = 1;
+
+  // The order of the layers represents the dependency between layers.
+  repeated string layer_ids = 2;
+}
+
+
+message Images {
+  repeated Image images = 1;
+}
+
+
+/**
+* Protobuf for the Docker image manifest JSON schema:
+* https://github.com/docker/distribution/blob/master/docs/spec/manifest-v2-1.md
+*/
+message DockerImageManifest {
+  required string name = 1;
+  required string tag = 2;
+  required string architecture = 3;
+
+  message FsLayers {
+    required string blobSum = 1;
+  }
+
+  repeated FsLayers fsLayers = 4;
+
+  message History {
+    message V1Compatibility {
+      required string id = 1;
+      required string parent = 2;
+    }
+
+    required V1Compatibility v1Compatibility = 1;
+  }
+
+  repeated History history = 5;
+  required uint32 schemaVersion = 6;
+
+  message Signatures {
+
+    //JOSE (A JSON Web Signature).
+    message Header {
+
+      //JSON Web Key.
+      message Jwk {
+        required string crv = 1;
+        required string kid = 2;
+        required string kty = 3;
+        required string x = 4;
+        required string y = 5;
+      }
+
+      optional Jwk jwk = 1;
+      required string alg = 2;
+    }
+
+    required Header header = 1;
+    required string signature = 2;
+    required string protected = 3;
+  }
+
+  repeated Signatures signatures = 7;
+}

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/metadata_manager.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/metadata_manager.cpp b/src/slave/containerizer/mesos/provisioner/docker/metadata_manager.cpp
new file mode 100644
index 0000000..af6f5b8
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/metadata_manager.cpp
@@ -0,0 +1,232 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "slave/containerizer/mesos/provisioner/docker/metadata_manager.hpp"
+
+#include <vector>
+
+#include <glog/logging.h>
+
+#include <stout/foreach.hpp>
+#include <stout/hashset.hpp>
+#include <stout/os.hpp>
+#include <stout/protobuf.hpp>
+
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+#include <process/owned.hpp>
+
+#include "common/status_utils.hpp"
+
+#include "slave/containerizer/mesos/provisioner/docker/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/message.hpp"
+
+#include "slave/state.hpp"
+
+using namespace process;
+
+using std::list;
+using std::string;
+using std::vector;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+// Libprocess actor that backs MetadataManager. It owns the in-memory table
+// of known images and is responsible for checkpointing that table to, and
+// recovering it from, the Docker store directory given in the flags.
+class MetadataManagerProcess : public process::Process<MetadataManagerProcess>
+{
+public:
+  MetadataManagerProcess(const Flags& _flags) : flags(_flags) {}
+
+  ~MetadataManagerProcess() {}
+
+  // Reloads the checkpointed images from disk into 'storedImages'.
+  Future<Nothing> recover();
+
+  // Builds an Image from 'name' and 'layerIds', stores it in 'storedImages'
+  // under the stringified name and persists the whole table.
+  Future<Image> put(
+      const Image::Name& name,
+      const std::vector<std::string>& layerIds);
+
+  // Looks up an image by its stringified name; None if it is not stored.
+  Future<Option<Image>> get(const Image::Name& name);
+
+  // TODO(chenlily): Implement removal of unreferenced images.
+
+private:
+  // Write out metadata manager state to persistent store.
+  Try<Nothing> persist();
+
+  const Flags flags;
+
+  // This is a lookup table for images that are stored in memory. It is keyed
+  // by the name of the Image.
+  // For example, "ubuntu:14.04" -> ubuntu14:04 Image.
+  hashmap<std::string, Image> storedImages;
+};
+
+
+// Factory: allocates the backing process and hands it to a new
+// MetadataManager, which takes ownership of it.
+Try<Owned<MetadataManager>> MetadataManager::create(const Flags& flags)
+{
+  return Owned<MetadataManager>(new MetadataManager(
+      Owned<MetadataManagerProcess>(new MetadataManagerProcess(flags))));
+}
+
+
+// Keeps the owned backing process and spawns it. CHECK-fails if the owned
+// pointer is null.
+MetadataManager::MetadataManager(Owned<MetadataManagerProcess> process)
+  : process(process)
+{
+  process::spawn(CHECK_NOTNULL(process.get()));
+}
+
+
+// Terminates the backing process and waits for it before the owned pointer
+// is released.
+MetadataManager::~MetadataManager()
+{
+  process::terminate(process.get());
+  process::wait(process.get());
+}
+
+
+// Forwards to MetadataManagerProcess::recover via dispatch.
+Future<Nothing> MetadataManager::recover()
+{
+  return process::dispatch(process.get(), &MetadataManagerProcess::recover);
+}
+
+
+// Forwards to MetadataManagerProcess::put via dispatch.
+Future<Image> MetadataManager::put(
+    const Image::Name& name,
+    const vector<string>& layerIds)
+{
+  return dispatch(
+      process.get(),
+      &MetadataManagerProcess::put,
+      name,
+      layerIds);
+}
+
+
+// Forwards to MetadataManagerProcess::get via dispatch.
+Future<Option<Image>> MetadataManager::get(const Image::Name& name)
+{
+  return dispatch(process.get(), &MetadataManagerProcess::get, name);
+}
+
+
+// Builds an Image from 'name' and the ordered 'layerIds', records it in the
+// in-memory table (replacing any entry with the same stringified name) and
+// persists the table. Fails if persisting fails.
+Future<Image> MetadataManagerProcess::put(
+    const Image::Name& name,
+    const vector<string>& layerIds)
+{
+  Image image;
+  image.mutable_name()->CopyFrom(name);
+  foreach (const string& id, layerIds) {
+    image.add_layer_ids(id);
+  }
+
+  storedImages[stringify(name)] = image;
+
+  Try<Nothing> persisted = persist();
+  if (persisted.isError()) {
+    return Failure(
+        "Failed to save state of Docker images: " + persisted.error());
+  }
+
+  return image;
+}
+
+
+// Returns the stored Image whose stringified name matches 'name', or None
+// when no such image is in the in-memory table.
+Future<Option<Image>> MetadataManagerProcess::get(
+    const Image::Name& name)
+{
+  const string key = stringify(name);
+
+  if (storedImages.contains(key)) {
+    return storedImages[key];
+  }
+
+  return None();
+}
+
+
+// Copies every stored image into an Images message and checkpoints it at
+// the stored-images path under the Docker store directory.
+Try<Nothing> MetadataManagerProcess::persist()
+{
+  Images snapshot;
+  foreachvalue (const Image& image, storedImages) {
+    snapshot.add_images()->CopyFrom(image);
+  }
+
+  const string path = paths::getStoredImagesPath(flags.docker_store_dir);
+
+  Try<Nothing> checkpointed = state::checkpoint(path, snapshot);
+  if (checkpointed.isError()) {
+    return Error("Failed to perform checkpoint: " + checkpointed.error());
+  }
+
+  return Nothing();
+}
+
+
+// Reloads the checkpointed images into 'storedImages'.
+//
+// Succeeds with nothing loaded when the stored-images file is absent or
+// yields no message. Fails when the file cannot be read as an Images
+// protobuf. An image is skipped (with a warning) when the rootfs directory
+// of any of its layers is missing, or when an image with the same name has
+// already been loaded.
+Future<Nothing> MetadataManagerProcess::recover()
+{
+  const string storedImagesPath =
+    paths::getStoredImagesPath(flags.docker_store_dir);
+
+  if (!os::exists(storedImagesPath)) {
+    LOG(INFO) << "No images to load from disk. Docker provisioner image "
+              << "storage path '" << storedImagesPath << "' does not exist";
+    return Nothing();
+  }
+
+  Result<Images> images = ::protobuf::read<Images>(storedImagesPath);
+  if (images.isError()) {
+    return Failure("Failed to read protobuf for Docker provisioner image: " +
+                   images.error());
+  }
+
+  // A Result can also be None (no message was read, e.g. the file is
+  // empty). Calling get() in that state is invalid, so treat it as
+  // "nothing to recover" instead of dereferencing it.
+  if (images.isNone()) {
+    LOG(WARNING) << "No images to load from disk. Docker provisioner image "
+                 << "storage path '" << storedImagesPath << "' is empty";
+    return Nothing();
+  }
+
+  // Iterate by reference: each Image carries its full list of layer ids.
+  foreach (const Image& image, images.get().images()) {
+    vector<string> missingLayerIds;
+    foreach (const string& layerId, image.layer_ids()) {
+      const string rootfsPath =
+        paths::getImageLayerRootfsPath(flags.docker_store_dir, layerId);
+
+      if (!os::exists(rootfsPath)) {
+        missingLayerIds.push_back(layerId);
+      }
+    }
+
+    if (!missingLayerIds.empty()) {
+      LOG(WARNING) << "Skipped loading image  '" << stringify(image.name())
+                   << "' due to missing layers: " << stringify(missingLayerIds);
+      continue;
+    }
+
+    const string imageName = stringify(image.name());
+    if (storedImages.contains(imageName)) {
+      // Keep the first occurrence; later duplicates are ignored.
+      LOG(WARNING) << "Found duplicate image in recovery for image name '"
+                   << imageName << "'";
+    } else {
+      storedImages[imageName] = image;
+    }
+  }
+
+  LOG(INFO) << "Loaded " << storedImages.size() << " Docker images";
+
+  return Nothing();
+}
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/metadata_manager.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/metadata_manager.hpp b/src/slave/containerizer/mesos/provisioner/docker/metadata_manager.hpp
new file mode 100644
index 0000000..dbae8d8
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/metadata_manager.hpp
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_DOCKER_METADATA_MANAGER_HPP__
+#define __PROVISIONER_DOCKER_METADATA_MANAGER_HPP__
+
+#include <list>
+#include <string>
+
+#include <stout/hashmap.hpp>
+#include <stout/json.hpp>
+#include <stout/option.hpp>
+#include <stout/protobuf.hpp>
+#include <stout/try.hpp>
+
+#include <process/future.hpp>
+#include <process/owned.hpp>
+#include <process/process.hpp>
+
+#include "slave/containerizer/mesos/provisioner/provisioner.hpp"
+
+#include "slave/containerizer/mesos/provisioner/docker/message.hpp"
+
+#include "slave/flags.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+// Forward Declaration.
+class MetadataManagerProcess;
+
+/**
+ * The MetadataManager tracks the Docker images cached by the
+ * provisioner that are stored on disk. It keeps track of the layers
+ * that Docker images are composed of and recovers Image objects
+ * upon initialization by checking for dependent layers stored on disk.
+ * Currently, image layers are stored indefinitely, with no garbage
+ * collection of unreferenced image layers.
+ */
+class MetadataManager
+{
+public:
+  static Try<process::Owned<MetadataManager>> create(const Flags& flags);
+
+  ~MetadataManager();
+
+  /**
+   * Recover all stored Images and their layer references.
+   */
+  process::Future<Nothing> recover();
+
+  /**
+   * Create an Image, put it in the metadata manager and persist the
+   * reference store state to disk.
+   *
+   * @param name     the name of the Docker image to place in the reference
+   *                 store.
+   * @param layerIds the list of layer ids that comprise the Docker image in
+   *                 order where the root layer's id (no parent layer) is first
+   *                 and the leaf layer's id is last.
+   * @return the Image that was stored.
+   */
+  process::Future<Image> put(
+      const Image::Name& name,
+      const std::vector<std::string>& layerIds);
+
+  /**
+   * Retrieve an Image based on its name if it is among the Images
+   * stored in memory.
+   *
+   * @param name  the name of the Docker image to retrieve
+   * @return the Image, or None if no image with that name is stored.
+   */
+  process::Future<Option<Image>> get(const Image::Name& name);
+
+private:
+  explicit MetadataManager(process::Owned<MetadataManagerProcess> process);
+
+  MetadataManager(const MetadataManager&); // Not copyable.
+  MetadataManager& operator=(const MetadataManager&); // Not assignable.
+
+  // Backing process; all public calls are forwarded to it.
+  process::Owned<MetadataManagerProcess> process;
+};
+
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_DOCKER_METADATA_MANAGER_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/paths.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/paths.cpp b/src/slave/containerizer/mesos/provisioner/docker/paths.cpp
new file mode 100644
index 0000000..e3392ea
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/paths.cpp
@@ -0,0 +1,115 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "slave/containerizer/mesos/provisioner/docker/paths.hpp"
+
+#include <stout/path.hpp>
+
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+namespace paths {
+
+// Returns '<storeDir>/staging'.
+string getStagingDir(const string& storeDir)
+{
+  return path::join(storeDir, "staging");
+}
+
+
+// Returns '<storeDir>/staging/XXXXXX'.
+// NOTE(review): the literal 'XXXXXX' looks like a mkdtemp-style template
+// meant to be filled in by the caller -- confirm at the call sites.
+string getStagingTempDir(const string& storeDir)
+{
+  return path::join(getStagingDir(storeDir), "XXXXXX");
+}
+
+
+// Returns '<discoveryDir>/<name>.tar'.
+string getImageArchiveTarPath(
+    const string& discoveryDir,
+    const string& name)
+{
+  return path::join(discoveryDir, name + ".tar");
+}
+
+
+// Returns '<archivePath>/repositories'.
+string getImageArchiveRepositoriesPath(const string& archivePath)
+{
+  return path::join(archivePath, "repositories");
+}
+
+
+// Returns '<archivePath>/<layerId>'.
+string getImageArchiveLayerPath(
+    const string& archivePath,
+    const string& layerId)
+{
+  return path::join(archivePath, layerId);
+}
+
+
+// Returns '<archivePath>/<layerId>/json'.
+string getImageArchiveLayerManifestPath(
+    const string& archivePath,
+    const string& layerId)
+{
+  return path::join(getImageArchiveLayerPath(archivePath, layerId), "json");
+}
+
+
+// Returns '<archivePath>/<layerId>/layer.tar'.
+string getImageArchiveLayerTarPath(
+    const string& archivePath,
+    const string& layerId)
+{
+  const string layerDir = getImageArchiveLayerPath(archivePath, layerId);
+
+  return path::join(layerDir, "layer.tar");
+}
+
+
+// Returns '<archivePath>/<layerId>/rootfs'.
+string getImageArchiveLayerRootfsPath(
+    const string& archivePath,
+    const string& layerId)
+{
+  return path::join(getImageArchiveLayerPath(archivePath, layerId), "rootfs");
+}
+
+
+// Returns '<storeDir>/layers/<layerId>'.
+string getImageLayerPath(
+    const string& storeDir,
+    const string& layerId)
+{
+  return path::join(storeDir, "layers", layerId);
+}
+
+
+// Returns '<storeDir>/layers/<layerId>/rootfs'.
+string getImageLayerRootfsPath(
+    const string& storeDir,
+    const string& layerId)
+{
+  return path::join(getImageLayerPath(storeDir, layerId), "rootfs");
+}
+
+
+// Returns '<storeDir>/storedImages'.
+string getStoredImagesPath(const string& storeDir)
+{
+  return path::join(storeDir, "storedImages");
+}
+
+} // namespace paths {
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/paths.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/paths.hpp b/src/slave/containerizer/mesos/provisioner/docker/paths.hpp
new file mode 100644
index 0000000..18beb2e
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/paths.hpp
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_DOCKER_PATHS_HPP__
+#define __PROVISIONER_DOCKER_PATHS_HPP__
+
+#include <list>
+#include <string>
+
+#include <mesos/mesos.hpp>
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+namespace paths {
+
+/**
+ * The Docker store file system layout is as follows:
+ * Image store dir ('--docker_store_dir' slave flag)
+ *    |--staging
+ *       |-- <temp_dir_archive>
+ *           |-- <layer_id>
+ *               |-- rootfs
+ *    |--layers
+ *       |--<layer_id>
+ *           |--rootfs
+ *    |--storedImages (file holding the metadata of cached images)
+ */
+
+// Returns '<storeDir>/staging'.
+std::string getStagingDir(const std::string& storeDir);
+
+
+// Returns '<storeDir>/staging/XXXXXX'.
+// NOTE(review): the 'XXXXXX' suffix looks like a mkdtemp-style template --
+// confirm how callers consume it.
+std::string getStagingTempDir(const std::string& storeDir);
+
+
+// Returns '<discoveryDir>/<name>.tar'.
+std::string getImageArchiveTarPath(
+    const std::string& discoveryDir,
+    const std::string& name);
+
+
+// Returns '<archivePath>/repositories'.
+std::string getImageArchiveRepositoriesPath(const std::string& archivePath);
+
+
+// Returns '<archivePath>/<layerId>'.
+std::string getImageArchiveLayerPath(
+    const std::string& archivePath,
+    const std::string& layerId);
+
+
+// Returns '<archivePath>/<layerId>/json'.
+std::string getImageArchiveLayerManifestPath(
+    const std::string& archivePath,
+    const std::string& layerId);
+
+
+// Returns '<archivePath>/<layerId>/layer.tar'.
+std::string getImageArchiveLayerTarPath(
+  const std::string& archivePath,
+  const std::string& layerId);
+
+
+// Returns '<archivePath>/<layerId>/rootfs'.
+std::string getImageArchiveLayerRootfsPath(
+  const std::string& archivePath,
+  const std::string& layerId);
+
+
+// Returns '<storeDir>/layers/<layerId>'.
+std::string getImageLayerPath(
+    const std::string& storeDir,
+    const std::string& layerId);
+
+
+// Returns '<storeDir>/layers/<layerId>/rootfs'.
+std::string getImageLayerRootfsPath(
+    const std::string& storeDir,
+    const std::string& layerId);
+
+
+// Returns '<storeDir>/storedImages'.
+std::string getStoredImagesPath(const std::string& storeDir);
+
+} // namespace paths {
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_DOCKER_PATHS_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/puller.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/puller.cpp b/src/slave/containerizer/mesos/provisioner/docker/puller.cpp
new file mode 100644
index 0000000..f61f9e5
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/puller.cpp
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "slave/containerizer/mesos/provisioner/docker/puller.hpp"
+
+#include "slave/containerizer/mesos/provisioner/docker/local_puller.hpp"
+
+using std::string;
+
+using process::Owned;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+// Factory: selects the puller implementation named by the 'docker_puller'
+// flag. Only "local" is recognised; any other value yields an Error.
+Try<Owned<Puller>> Puller::create(const Flags& flags)
+{
+  const string puller = flags.docker_puller;
+
+  if (puller != "local") {
+    return Error("Unknown or unsupported docker puller: " + puller);
+  }
+
+  return Owned<Puller>(new LocalPuller(flags));
+}
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/puller.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/puller.hpp b/src/slave/containerizer/mesos/provisioner/docker/puller.hpp
new file mode 100644
index 0000000..8010b8a
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/puller.hpp
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_DOCKER_PULLER_HPP__
+#define __PROVISIONER_DOCKER_PULLER_HPP__
+
+#include <list>
+#include <utility>
+
+#include <stout/try.hpp>
+
+#include <process/future.hpp>
+#include <process/owned.hpp>
+
+#include "slave/containerizer/mesos/provisioner/docker/message.hpp"
+
+#include "slave/flags.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+// Abstract interface for fetching the layers of a Docker image. Concrete
+// pullers are obtained through create().
+class Puller
+{
+public:
+  static Try<process::Owned<Puller>> create(const Flags& flags);
+
+  virtual ~Puller() {}
+
+  /**
+   * Pull the layers of a Docker image into the specified directory,
+   * and return the list of layer ids in that image in the right
+   * dependency order, together with the directory where the puller
+   * put each layer's changeset.
+   *
+   * @param name The name of the image.
+   * @param directory The target directory to store the layers.
+   * @return list of layers, each mapped to its local directory, ordered
+   *         by dependency.
+   */
+  virtual process::Future<std::list<std::pair<std::string, std::string>>> pull(
+      const docker::Image::Name& name,
+      const std::string& directory) = 0;
+};
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+
+#endif // __PROVISIONER_DOCKER_PULLER_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/registry_client.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/registry_client.cpp b/src/slave/containerizer/mesos/provisioner/docker/registry_client.cpp
new file mode 100644
index 0000000..5a01f1b
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/registry_client.cpp
@@ -0,0 +1,641 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vector>
+
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+#include <process/http.hpp>
+#include <process/io.hpp>
+
+#include <stout/os.hpp>
+
+#include "slave/containerizer/mesos/provisioner/docker/registry_client.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/token_manager.hpp"
+
+using std::string;
+using std::vector;
+
+using process::Failure;
+using process::Future;
+using process::Owned;
+using process::Process;
+
+using process::http::Request;
+using process::http::Response;
+using process::http::URL;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+namespace registry {
+
+// Shorthands for the nested RegistryClient types used in this file.
+using FileSystemLayerInfo = RegistryClient::FileSystemLayerInfo;
+
+using ManifestResponse = RegistryClient::ManifestResponse;
+
+// Timeout applied by getManifest() and getBlob() when the caller passes none.
+const Duration RegistryClient::DEFAULT_MANIFEST_TIMEOUT_SECS = Seconds(10);
+
+// Maximum size applied by getBlob() when the caller passes none.
+const size_t RegistryClient::DEFAULT_MANIFEST_MAXSIZE_BYTES = 4096;
+
+// Port assumed for a redirect URL that does not carry an explicit port.
+static const uint16_t DEFAULT_SSL_PORT = 443;
+
+// Libprocess actor that backs RegistryClient: the public client dispatches
+// getManifest() and getBlob() to this process.
+class RegistryClientProcess : public Process<RegistryClientProcess>
+{
+public:
+  // Creates the process together with a TokenManager for 'authServer'.
+  // Returns an Error if the token manager cannot be created.
+  static Try<Owned<RegistryClientProcess>> create(
+      const URL& registry,
+      const URL& authServer,
+      const Option<RegistryClient::Credentials>& creds);
+
+  Future<RegistryClient::ManifestResponse> getManifest(
+      const string& path,
+      const Option<string>& tag,
+      const Duration& timeout);
+
+  Future<size_t> getBlob(
+      const string& path,
+      const Option<string>& digest,
+      const Path& filePath,
+      const Duration& timeout,
+      size_t maxSize);
+
+private:
+  RegistryClientProcess(
+    const URL& registryServer,
+    const Owned<TokenManager>& tokenManager,
+    const Option<RegistryClient::Credentials>& creds);
+
+  // Issues an HTTP GET with the given timeout. The last parameter carries
+  // the status of the previous response (named 'lastResponseStatus' in the
+  // definition) and is used there to stop repeated identical responses.
+  Future<Response> doHttpGet(
+      const URL& url,
+      const Option<process::http::Headers>& headers,
+      const Duration& timeout,
+      bool resend,
+      const Option<string>& lastResponse) const;
+
+  // Parses the 'WWW-Authenticate' header of 'httpResponse' into key/value
+  // attributes; Error if the header is missing or malformed.
+  Try<process::http::Headers> getAuthenticationAttributes(
+      const Response& httpResponse) const;
+
+  const URL registryServer_;
+  Owned<TokenManager> tokenManager_;
+  const Option<RegistryClient::Credentials> credentials_;
+
+  // Not copyable, not assignable.
+  RegistryClientProcess(const RegistryClientProcess&) = delete;
+  RegistryClientProcess& operator = (const RegistryClientProcess&) = delete;
+};
+
+
+// Builds a RegistryClient together with the RegistryClientProcess that backs
+// it. Returns an Error if the backing process cannot be created.
+//
+// NOTE(review): both calls below pass (authServer, registryServer), whereas
+// RegistryClientProcess::create and the RegistryClient constructor in this
+// file name their first two parameters (registry server, auth server).
+// Whether this is a bug or compensates for the parameter order declared in
+// the header cannot be determined from this file alone -- confirm against
+// registry_client.hpp before reordering anything.
+Try<Owned<RegistryClient>> RegistryClient::create(
+    const URL& registryServer,
+    const URL& authServer,
+    const Option<Credentials>& creds)
+{
+  Try<Owned<RegistryClientProcess>> process =
+    RegistryClientProcess::create(authServer, registryServer, creds);
+
+  if (process.isError()) {
+    return Error(process.error());
+  }
+
+  return Owned<RegistryClient>(
+      new RegistryClient(authServer, registryServer, creds, process.get()));
+}
+
+
+// Stores the server URLs, credentials and backing process, then spawns the
+// process. CHECK-fails if the process pointer is null.
+RegistryClient::RegistryClient(
+    const URL& registryServer,
+    const URL& authServer,
+    const Option<Credentials>& creds,
+    const Owned<RegistryClientProcess>& process)
+  : registryServer_(registryServer),
+    authServer_(authServer),
+    credentials_(creds),
+    process_(process)
+{
+  spawn(CHECK_NOTNULL(process_.get()));
+}
+
+
+// Terminates the backing process and waits for it before the owned pointer
+// is released.
+RegistryClient::~RegistryClient()
+{
+  terminate(process_.get());
+  process::wait(process_.get());
+}
+
+
+// Dispatches a manifest request to the backing process, substituting
+// DEFAULT_MANIFEST_TIMEOUT_SECS when no timeout is given.
+Future<ManifestResponse> RegistryClient::getManifest(
+    const string& _path,
+    const Option<string>& _tag,
+    const Option<Duration>& _timeout)
+{
+  Duration timeout = DEFAULT_MANIFEST_TIMEOUT_SECS;
+  if (_timeout.isSome()) {
+    timeout = _timeout.get();
+  }
+
+  return dispatch(
+      process_.get(),
+      &RegistryClientProcess::getManifest,
+      _path,
+      _tag,
+      timeout);
+}
+
+
+// Dispatches a blob request to the backing process. When omitted, the
+// timeout and maximum size fall back to DEFAULT_MANIFEST_TIMEOUT_SECS and
+// DEFAULT_MANIFEST_MAXSIZE_BYTES respectively.
+Future<size_t> RegistryClient::getBlob(
+    const string& _path,
+    const Option<string>& _digest,
+    const Path& _filePath,
+    const Option<Duration>& _timeout,
+    const Option<size_t>& _maxSize)
+{
+  Duration timeout = DEFAULT_MANIFEST_TIMEOUT_SECS;
+  if (_timeout.isSome()) {
+    timeout = _timeout.get();
+  }
+
+  size_t maxSize = DEFAULT_MANIFEST_MAXSIZE_BYTES;
+  if (_maxSize.isSome()) {
+    maxSize = _maxSize.get();
+  }
+
+  return dispatch(
+      process_.get(),
+      &RegistryClientProcess::getBlob,
+      _path,
+      _digest,
+      _filePath,
+      timeout,
+      maxSize);
+}
+
+
+// Creates a TokenManager for 'authServer' and, on success, a
+// RegistryClientProcess that uses it. Returns an Error if the token manager
+// cannot be created.
+Try<Owned<RegistryClientProcess>> RegistryClientProcess::create(
+    const URL& registryServer,
+    const URL& authServer,
+    const Option<RegistryClient::Credentials>& creds)
+{
+  Try<Owned<TokenManager>> tokenManager = TokenManager::create(authServer);
+
+  if (tokenManager.isSome()) {
+    return Owned<RegistryClientProcess>(
+        new RegistryClientProcess(registryServer, tokenManager.get(), creds));
+  }
+
+  return Error("Failed to create token manager: " + tokenManager.error());
+}
+
+
+// Stores the registry URL, the token manager and the optional credentials;
+// performs no other work.
+RegistryClientProcess::RegistryClientProcess(
+    const URL& registryServer,
+    const Owned<TokenManager>& tokenMgr,
+    const Option<RegistryClient::Credentials>& creds)
+  : registryServer_(registryServer),
+    tokenManager_(tokenMgr),
+    credentials_(creds) {}
+
+
+// Extracts the key/value attributes carried by the 'WWW-Authenticate'
+// header of 'httpResponse'.
+//
+// The header value must consist of exactly two space-separated tokens, the
+// first being "Bearer"; the second is split on ',' into parameters, each of
+// which must tokenize (on '=' and '"') into exactly a key and a value.
+// Returns an Error if the header is absent or any of these checks fails.
+Try<process::http::Headers> RegistryClientProcess::getAuthenticationAttributes(
+    const Response& httpResponse) const
+{
+  auto header = httpResponse.headers.find("WWW-Authenticate");
+  if (header == httpResponse.headers.end()) {
+    return Error("Failed to find WWW-Authenticate header value");
+  }
+
+  const string& authString = header->second;
+
+  const vector<string> authStringTokens = strings::tokenize(authString, " ");
+  if ((authStringTokens.size() != 2) || (authStringTokens[0] != "Bearer")) {
+    // TODO(jojy): Look at various possibilities of auth response. We currently
+    // assume that the string will have realm information.
+    return Error("Invalid authentication header value: " + authString);
+  }
+
+  const vector<string> authParams = strings::tokenize(authStringTokens[1], ",");
+
+  process::http::Headers authAttributes;
+
+  foreach (const string& param, authParams) {
+    // Each parameter has the form key="value".
+    const vector<string> keyValue = strings::tokenize(param, "=\"");
+
+    if (keyValue.size() != 2) {
+      return Error(
+          "Failed to get authentication attribute from response parameter " +
+          param);
+    }
+
+    authAttributes.insert({keyValue[0], keyValue[1]});
+  }
+
+  return authAttributes;
+}
+
+
+Future<Response> RegistryClientProcess::doHttpGet(
+    const URL& url,
+    const Option<process::http::Headers>& headers,
+    const Duration& timeout,
+    bool resend,
+    const Option<string>& lastResponseStatus) const
+{
+  return process::http::get(url, headers)
+    .after(timeout, [](
+        const Future<Response>& httpResponseFuture) -> Future<Response> {
+      return Failure("Response timeout");
+    })
+    .then(defer(self(), [=](
+        const Response& httpResponse) -> Future<Response> {
+      VLOG(1) << "Response status: " + httpResponse.status;
+
+      // Set the future if we get a OK response.
+      if (httpResponse.status == "200 OK") {
+        return httpResponse;
+      } else if (httpResponse.status == "400 Bad Request") {
+        Try<JSON::Object> errorResponse =
+          JSON::parse<JSON::Object>(httpResponse.body);
+
+        if (errorResponse.isError()) {
+          return Failure("Failed to parse bad request response JSON: " +
+                         errorResponse.error());
+        }
+
+        std::ostringstream out;
+        bool first = true;
+        Result<JSON::Array> errorObjects =
+          errorResponse.get().find<JSON::Array>("errors");
+
+        if (errorObjects.isError()) {
+          return Failure("Failed to find 'errors' in bad request response: " +
+                         errorObjects.error());
+        } else if (errorObjects.isNone()) {
+          return Failure("Errors not found in bad request response");
+        }
+
+        foreach (const JSON::Value& error, errorObjects.get().values) {
+          Result<JSON::String> message =
+            error.as<JSON::Object>().find<JSON::String>("message");
+
+          if (message.isError()) {
+            return Failure("Failed to parse bad request error message: " +
+                           message.error());
+          } else if (message.isNone()) {
+            continue;
+          }
+
+          if (first) {
+            out << message.get().value;
+            first = false;
+          } else {
+            out << ", " << message.get().value;
+          }
+        }
+
+        return Failure("Received Bad request, errors: [" + out.str() + "]");
+      }
+
+      // Prevent infinite recursion.
+      if (lastResponseStatus.isSome() &&
+          (lastResponseStatus.get() == httpResponse.status)) {
+        return Failure("Invalid response: " + httpResponse.status);
+      }
+
+      // If resend is not set, we dont try again and stop here.
+      if (!resend) {
+        return Failure("Bad response: " + httpResponse.status);
+      }
+
+      // Handle 401 Unauthorized.
+      if (httpResponse.status == "401 Unauthorized") {
+        Try<process::http::Headers> authAttributes =
+          getAuthenticationAttributes(httpResponse);
+
+        if (authAttributes.isError()) {
+          return Failure(
+              "Failed to get authentication attributes: " +
+              authAttributes.error());
+        }
+
+        // TODO(jojy): Currently only handling TLS/cert authentication.
+        Future<Token> tokenResponse = tokenManager_->getToken(
+          authAttributes.get().at("service"),
+          authAttributes.get().at("scope"),
+          None());
+
+        return tokenResponse
+          .after(timeout, [=](
+              Future<Token> tokenResponse) -> Future<Token> {
+            tokenResponse.discard();
+            return Failure("Token response timeout");
+          })
+          .then(defer(self(), [=](
+              const Future<Token>& tokenResponse) {
+            // Send request with acquired token.
+            process::http::Headers authHeaders = {
+              {"Authorization", "Bearer " + tokenResponse.get().raw}
+            };
+
+            return doHttpGet(
+                url,
+                authHeaders,
+                timeout,
+                true,
+                httpResponse.status);
+        }));
+      } else if (httpResponse.status == "307 Temporary Redirect") {
+        // Handle redirect.
+
+        // TODO(jojy): Add redirect functionality in http::get.
+
+        auto toURL = [](
+            const string& urlString) -> Try<URL> {
+          // TODO(jojy): Need to add functionality to URL class that parses a
+          // string to its URL components. For now, assuming:
+          //  - scheme is https
+          //  - path always ends with /
+
+          static const string schemePrefix = "https://";
+
+          if (!strings::contains(urlString, schemePrefix)) {
+            return Error(
+                "Failed to find expected token '" + schemePrefix +
+                "' in redirect url");
+          }
+
+          const string schemeSuffix = urlString.substr(schemePrefix.length());
+
+          const vector<string> components =
+            strings::tokenize(schemeSuffix, "/");
+
+          const string path = schemeSuffix.substr(components[0].length());
+
+          const vector<string> addrComponents =
+            strings::tokenize(components[0], ":");
+
+          uint16_t port = DEFAULT_SSL_PORT;
+          string domain = components[0];
+
+          // Parse the port.
+          if (addrComponents.size() == 2) {
+            domain = addrComponents[0];
+
+            Try<uint16_t> tryPort = numify<uint16_t>(addrComponents[1]);
+            if (tryPort.isError()) {
+              return Error(
+                  "Failed to parse location: " + urlString + " for port.");
+            }
+
+            port = tryPort.get();
+          }
+
+          return URL("https", domain, port, path);
+        };
+
+        if (httpResponse.headers.find("Location") ==
+            httpResponse.headers.end()) {
+          return Failure(
+              "Invalid redirect response: 'Location' not found in headers.");
+        }
+
+        const string& location = httpResponse.headers.at("Location");
+        Try<URL> tryUrl = toURL(location);
+        if (tryUrl.isError()) {
+          return Failure(
+              "Failed to parse '" + location + "': " + tryUrl.error());
+        }
+
+        return doHttpGet(
+            tryUrl.get(),
+            headers,
+            timeout,
+            false,
+            httpResponse.status);
+      } else {
+        return Failure("Invalid response: " + httpResponse.status);
+      }
+    }));
+}
+
+
+// Fetches and parses the image manifest for repository `path` at `tag`
+// (defaulting to "latest") from the registry server.
+//
+// The returned future fails if `path` or the tag contains a space, or if
+// the response lacks the "Docker-Content-Digest" header or any of the
+// manifest fields read below ("name", "fsLayers", "history", and for each
+// layer "blobSum" plus a "v1Compatibility" document carrying an "id").
+Future<ManifestResponse> RegistryClientProcess::getManifest(
+    const string& path,
+    const Option<string>& tag,
+    const Duration& timeout)
+{
+  if (strings::contains(path, " ")) {
+    return Failure("Invalid repository path: " + path);
+  }
+
+  string repoTag = tag.getOrElse("latest");
+  if (strings::contains(repoTag, " ")) {
+    return Failure("Invalid repository tag: " + repoTag);
+  }
+
+  URL manifestURL(registryServer_);
+  manifestURL.path =
+    "v2/" + path + "/manifests/" + repoTag;
+
+  auto getManifestResponse = [](
+      const Response& httpResponse) -> Try<ManifestResponse> {
+    if (!httpResponse.headers.contains("Docker-Content-Digest")) {
+      return Error("Docker-Content-Digest header missing in response");
+    }
+
+    Try<JSON::Object> responseJSON =
+      JSON::parse<JSON::Object>(httpResponse.body);
+
+    if (responseJSON.isError()) {
+      return Error(responseJSON.error());
+    }
+
+    // NOTE: A 'Result' may hold an error as well as none or some, so the
+    // lookups below test '!isSome()' (not just 'isNone()') before 'get()'
+    // is called on them.
+    Result<JSON::String> name = responseJSON.get().find<JSON::String>("name");
+    if (!name.isSome()) {
+      return Error("Failed to find \"name\" in manifest response");
+    }
+
+    Result<JSON::Array> fsLayers =
+      responseJSON.get().find<JSON::Array>("fsLayers");
+
+    if (!fsLayers.isSome()) {
+      return Error("Failed to find \"fsLayers\" in manifest response");
+    }
+
+    Result<JSON::Array> historyArray =
+      responseJSON.get().find<JSON::Array>("history");
+
+    if (!historyArray.isSome()) {
+      return Error("Failed to find \"history\" in manifest response");
+    }
+
+    // The loop below indexes 'history' with the 'fsLayers' position, so the
+    // two arrays must have the same length.
+    if (historyArray.get().values.size() != fsLayers.get().values.size()) {
+      return Error(
+          "\"history\" and \"fsLayers\" array count mismatch "
+          "in manifest response");
+    }
+
+    vector<FileSystemLayerInfo> fsLayerInfoList;
+    size_t index = 0;
+
+    foreach (const JSON::Value& layer, fsLayers.get().values) {
+      if (!layer.is<JSON::Object>()) {
+        return Error(
+            "Failed to parse layer as a JSON object for index: " +
+            stringify(index));
+      }
+
+      const JSON::Object& layerInfoJSON = layer.as<JSON::Object>();
+
+      // Get blobsum for layer.
+      const Result<JSON::String> blobSumInfo =
+        layerInfoJSON.find<JSON::String>("blobSum");
+
+      if (!blobSumInfo.isSome()) {
+        return Error("Failed to find \"blobSum\" in manifest response");
+      }
+
+      // Get history for layer.
+      if (!historyArray.get().values[index].is<JSON::Object>()) {
+        return Error(
+            "Failed to parse history as a JSON object for index: " +
+            stringify(index));
+      }
+      const JSON::Object& historyObj =
+        historyArray.get().values[index].as<JSON::Object>();
+
+      // Get layer id. It lives inside the "v1Compatibility" string, which
+      // is itself a JSON document and has to be parsed separately.
+      const Result<JSON::String> v1CompatibilityJSON =
+        historyObj.find<JSON::String>("v1Compatibility");
+
+      if (!v1CompatibilityJSON.isSome()) {
+        return Error(
+            "Failed to obtain layer v1 compatibility json in manifest "
+            "for layer: " + stringify(index));
+      }
+
+      Try<JSON::Object> v1CompatibilityObj =
+        JSON::parse<JSON::Object>(v1CompatibilityJSON.get().value);
+
+      if (!v1CompatibilityObj.isSome()) {
+        return Error(
+            "Failed to parse v1 compatibility json in manifest for layer: "
+            + stringify(index));
+      }
+
+      const Result<JSON::String> id =
+        v1CompatibilityObj.get().find<JSON::String>("id");
+
+      if (!id.isSome()) {
+        return Error(
+            "Failed to find \"id\" in manifest for layer: " + stringify(index));
+      }
+
+      fsLayerInfoList.emplace_back(
+          FileSystemLayerInfo{
+            blobSumInfo.get().value,
+            id.get().value,
+          });
+
+      index++;
+    }
+
+    return ManifestResponse {
+      name.get().value,
+      httpResponse.headers.at("Docker-Content-Digest"),
+      fsLayerInfoList,
+    };
+  };
+
+  return doHttpGet(manifestURL, None(), timeout, true, None())
+    .then([getManifestResponse] (
+        const Response& response) -> Future<ManifestResponse> {
+      Try<ManifestResponse> manifestResponse = getManifestResponse(response);
+
+      if (manifestResponse.isError()) {
+        return Failure(
+            "Failed to parse manifest response: " + manifestResponse.error());
+      }
+
+      return manifestResponse.get();
+    });
+}
+
+
+// Downloads the blob `digest` of repository `path` from the registry
+// server and writes the response body to `filePath`, creating the parent
+// directory of `filePath` first. On success the future holds the length
+// of the response body in bytes.
+//
+// NOTE: `maxSize` is accepted but not enforced yet (see the TODO in
+// 'saveBlob').
+Future<size_t> RegistryClientProcess::getBlob(
+    const string& path,
+    const Option<string>& digest,
+    const Path& filePath,
+    const Duration& timeout,
+    size_t maxSize)
+{
+  // Validate the request before touching the filesystem so that an invalid
+  // repository path does not leave a newly created directory behind.
+  if (strings::contains(path, " ")) {
+    return Failure("Invalid repository path: " + path);
+  }
+
+  // TODO(jojy): Return more state, for example - if the directory is new.
+  // TODO(jojy): This currently leaves a residue in failure cases. Would be
+  // ideal if we can completely rollback.
+  Try<Nothing> mkdir = os::mkdir(filePath.dirname(), true);
+  if (mkdir.isError()) {
+    return Failure(
+        "Failed to create directory to download blob: " + mkdir.error());
+  }
+
+  URL blobURL(registryServer_);
+  blobURL.path =
+    "v2/" + path + "/blobs/" + digest.getOrElse("");
+
+  // Writes the response body to 'filePath' (truncating any existing file)
+  // and yields the body length.
+  auto saveBlob = [filePath](
+      const Response& httpResponse) -> Future<size_t> {
+    // TODO(jojy): Add verification step.
+    // TODO(jojy): Add check for max size.
+    size_t size = httpResponse.body.length();
+    Try<int> fd = os::open(
+        filePath.value,
+        O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC,
+        S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+
+    if (fd.isError()) {
+      return Failure("Failed to open file '" + filePath.value + "': " +
+                     fd.error());
+    }
+
+    // The descriptor is closed once the write finishes, whatever the
+    // outcome.
+    return process::io::write(fd.get(), httpResponse.body)
+      .then([size](const Future<Nothing>&) { return size; })
+      .onAny([fd]() { os::close(fd.get()); } );
+  };
+
+  return doHttpGet(blobURL, None(), timeout, true, None())
+    .then([saveBlob](const Response& response) { return saveBlob(response); });
+}
+
+} // namespace registry {
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/registry_client.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/registry_client.hpp b/src/slave/containerizer/mesos/provisioner/docker/registry_client.hpp
new file mode 100644
index 0000000..1d3377e
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/registry_client.hpp
@@ -0,0 +1,164 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_DOCKER_REGISTRY_CLIENT_HPP__
+#define __PROVISIONER_DOCKER_REGISTRY_CLIENT_HPP__
+
+#include <string>
+#include <vector>
+
+#include <stout/duration.hpp>
+#include <stout/hashmap.hpp>
+#include <stout/json.hpp>
+#include <stout/path.hpp>
+
+#include <process/future.hpp>
+#include <process/http.hpp>
+#include <process/process.hpp>
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+namespace registry {
+
+// Forward declarations.
+class RegistryClientProcess;
+
+
+class RegistryClient
+{
+public:
+  /**
+   * Encapsulates information about a file system layer.
+   */
+  struct FileSystemLayerInfo {
+    // TODO(jojy): This string includes the checksum type also now. Need to
+    // separate this into checksum method and checksum.
+    const std::string checksumInfo;
+    const std::string layerId;
+  };
+
+  /**
+   * Encapsulates response of "GET Manifest" request.
+   *
+   * Reference: https://docs.docker.com/registry/spec/api
+   */
+  struct ManifestResponse {
+    const std::string name;
+    const std::string digest;
+    const std::vector<FileSystemLayerInfo> fsLayerInfoList;
+  };
+
+  /**
+   * Encapsulates auth credentials for the client sessions.
+   * TODO(jojy): Secure heap to protect the credentials.
+   */
+  struct Credentials {
+    /**
+     * UserId for basic authentication.
+     */
+    const Option<std::string> userId;
+    /**
+     * Password for basic authentication.
+     */
+    const Option<std::string> password;
+    /**
+     * Account for fetching data from registry.
+     */
+    const Option<std::string> account;
+  };
+
+  /**
+   * Factory method for creating RegistryClient objects.
+   *
+   * @param registryServer URL of docker registry server.
+   * @param authServer URL of authorization server.
+   * @param credentials credentials for client session (optional).
+   * @return RegistryClient on Success.
+   *         Error on failure.
+   */
+  static Try<process::Owned<RegistryClient>> create(
+      const process::http::URL& registryServer,
+      const process::http::URL& authServer,
+      const Option<Credentials>& credentials);
+
+  /**
+   * Fetches manifest for a repository from the client's remote registry server.
+   *
+   * @param path path of the repository on the registry.
+   * @param tag unique tag that identifies the repository. Will default to
+   *    latest.
+   * @param timeout Maximum time after which the request will timeout and return
+   *    a failure. Will default to RESPONSE_TIMEOUT.
+   * @return ManifestResponse on success.
+   *         Failure on process failure.
+   */
+  process::Future<ManifestResponse> getManifest(
+      const std::string& path,
+      const Option<std::string>& tag,
+      const Option<Duration>& timeout);
+
+  /**
+   * Fetches blob for a repository from the client's remote registry server.
+   *
+   * @param path path of the repository on the registry.
+   * @param digest digest of the blob (from manifest).
+   * @param filePath file path to store the fetched blob.
+   * @param timeout Maximum time after which the request will timeout and return
+   *    a failure. Will default to RESPONSE_TIMEOUT.
+   * @param maxSize Maximum size of the response that is acceptable. Will default
+   *    to MAX_RESPONSE_SIZE.
+   * @return size of downloaded blob on success.
+   *         Failure in case of any errors.
+   */
+  process::Future<size_t> getBlob(
+      const std::string& path,
+      const Option<std::string>& digest,
+      const Path& filePath,
+      const Option<Duration>& timeout,
+      const Option<size_t>& maxSize);
+
+  ~RegistryClient();
+
+private:
+  RegistryClient(
+    const process::http::URL& registryServer,
+    const process::http::URL& authServer,
+    const Option<Credentials>& credentials,
+    const process::Owned<RegistryClientProcess>& process);
+
+  static const Duration DEFAULT_MANIFEST_TIMEOUT_SECS;
+  static const size_t DEFAULT_MANIFEST_MAXSIZE_BYTES;
+
+  const process::http::URL registryServer_;
+  const process::http::URL authServer_;
+  const Option<Credentials> credentials_;
+  process::Owned<RegistryClientProcess> process_;
+
+  RegistryClient(const RegistryClient&) = delete;
+  RegistryClient& operator=(const RegistryClient&) = delete;
+};
+
+} // namespace registry {
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_DOCKER_REGISTRY_CLIENT_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/spec.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/spec.cpp b/src/slave/containerizer/mesos/provisioner/docker/spec.cpp
new file mode 100644
index 0000000..2703b5d
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/spec.cpp
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stout/foreach.hpp>
+#include <stout/json.hpp>
+#include <stout/protobuf.hpp>
+#include <stout/strings.hpp>
+
+#include "slave/containerizer/mesos/provisioner/docker/spec.hpp"
+
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+namespace spec {
+
+// Checks an image manifest for conformance with the Docker spec. Returns
+// None() when every check passes, otherwise an Error naming the first
+// check that failed.
+Option<Error> validateManifest(const DockerImageManifest& manifest)
+{
+  const int layerCount = manifest.fslayers_size();
+  const int historyCount = manifest.history_size();
+
+  // Each of the repeated fields below must hold at least one entry.
+  if (layerCount <= 0) {
+    return Error("FsLayers field must have at least one blobSum");
+  }
+
+  if (historyCount <= 0) {
+    return Error("History field must have at least one v1Compatibility");
+  }
+
+  if (manifest.signatures_size() <= 0) {
+    return Error("Signatures field must have at least one signature");
+  }
+
+  // There must be exactly as many blobSum entries as v1Compatibility ones.
+  if (layerCount != historyCount) {
+    return Error("Size of blobSum and v1Compatibility must be equal");
+  }
+
+  // Every blobSum is required to contain a ':'.
+  for (const docker::DockerImageManifest::FsLayers& layer :
+       manifest.fslayers()) {
+    if (layer.blobsum().find(':') == string::npos) {
+      return Error("Incorrect blobSum format");
+    }
+  }
+
+  return None();
+}
+
+
+// Converts `json` into a DockerImageManifest and validates the result.
+// Returns an Error if either the protobuf conversion or the validation
+// fails.
+Try<docker::DockerImageManifest> parse(const JSON::Object& json)
+{
+  Try<docker::DockerImageManifest> parsed =
+    protobuf::parse<docker::DockerImageManifest>(json);
+
+  if (parsed.isError()) {
+    return Error("Protobuf parse failed: " + parsed.error());
+  }
+
+  const Option<Error> invalid = validateManifest(parsed.get());
+  if (invalid.isNone()) {
+    return parsed.get();
+  }
+
+  return Error("Docker Image Manifest Validation failed: " +
+               invalid.get().message);
+}
+
+} // namespace spec {
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/spec.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/spec.hpp b/src/slave/containerizer/mesos/provisioner/docker/spec.hpp
new file mode 100644
index 0000000..96e8d6d
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/spec.hpp
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_DOCKER_SPEC_HPP__
+#define __PROVISIONER_DOCKER_SPEC_HPP__
+
+#include <stout/error.hpp>
+#include <stout/json.hpp>
+#include <stout/option.hpp>
+
+#include <mesos/mesos.hpp>
+
+#include "slave/containerizer/mesos/provisioner/docker/message.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+namespace spec {
+
+// Returns None() if the image manifest conforms to the Docker spec.
+Option<Error> validateManifest(const docker::DockerImageManifest& manifest);
+
+// TODO(Gilbert): add validations here, e.g., Manifest, Blob, Layout, ImageID.
+
+// Parses and validates a DockerImageManifest from the given JSON object.
+Try<docker::DockerImageManifest> parse(const JSON::Object& json);
+
+} // namespace spec {
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_DOCKER_SPEC_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/store.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/store.cpp b/src/slave/containerizer/mesos/provisioner/docker/store.cpp
new file mode 100644
index 0000000..bb02d65
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/store.cpp
@@ -0,0 +1,277 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "slave/containerizer/mesos/provisioner/docker/store.hpp"
+
+#include <list>
+#include <vector>
+
+#include <glog/logging.h>
+
+#include <stout/json.hpp>
+#include <stout/os.hpp>
+#include <stout/result.hpp>
+
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+#include <process/subprocess.hpp>
+
+#include "common/status_utils.hpp"
+
+#include "slave/containerizer/mesos/provisioner/docker/metadata_manager.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/docker/puller.hpp"
+
+#include "slave/flags.hpp"
+
+using namespace process;
+
+using std::list;
+using std::pair;
+using std::string;
+using std::vector;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+// Process that implements the Docker store. It keeps a copy of the flags
+// together with the metadata manager and the puller it was constructed
+// with.
+class StoreProcess : public Process<StoreProcess>
+{
+public:
+  StoreProcess(
+      const Flags& _flags,
+      const Owned<MetadataManager>& _metadataManager,
+      const Owned<Puller>& _puller)
+    : flags(_flags),
+      metadataManager(_metadataManager),
+      puller(_puller) {}
+
+  ~StoreProcess() {}
+
+  Future<Nothing> recover();
+
+  Future<vector<string>> get(const mesos::Image& image);
+
+private:
+  // Continuations of 'get'.
+  Future<Image> _get(
+      const Image::Name& name,
+      const Option<Image>& image);
+
+  Future<vector<string>> __get(const Image& image);
+
+  // Helpers used while storing a pulled image.
+  Future<vector<string>> moveLayers(
+      const string& staging,
+      const list<pair<string, string>>& layerPaths);
+
+  Future<Image> storeImage(
+      const Image::Name& name,
+      const vector<string>& layerIds);
+
+  Future<Nothing> moveLayer(const pair<string, string>& layerPath);
+
+  const Flags flags;
+  Owned<MetadataManager> metadataManager;
+  Owned<Puller> puller;
+};
+
+
+// Creates a Docker store: builds the puller, creates the store directory
+// and its staging directory if they do not exist, builds the metadata
+// manager and hands everything to a new StoreProcess.
+Try<Owned<slave::Store>> Store::create(const Flags& flags)
+{
+  Try<Owned<Puller>> puller = Puller::create(flags);
+  if (puller.isError()) {
+    return Error("Failed to create Docker puller: " + puller.error());
+  }
+
+  const string& storeDir = flags.docker_store_dir;
+  if (!os::exists(storeDir)) {
+    Try<Nothing> created = os::mkdir(storeDir);
+    if (created.isError()) {
+      return Error(
+          "Failed to create Docker store directory: " + created.error());
+    }
+  }
+
+  const string stagingDir = paths::getStagingDir(storeDir);
+  if (!os::exists(stagingDir)) {
+    Try<Nothing> created = os::mkdir(stagingDir);
+    if (created.isError()) {
+      return Error("Failed to create Docker store staging directory: " +
+                   created.error());
+    }
+  }
+
+  Try<Owned<MetadataManager>> metadataManager = MetadataManager::create(flags);
+  if (metadataManager.isError()) {
+    return Error(metadataManager.error());
+  }
+
+  Owned<StoreProcess> storeProcess(
+      new StoreProcess(flags, metadataManager.get(), puller.get()));
+
+  return Owned<slave::Store>(new Store(storeProcess));
+}
+
+
+// Keeps the given process and spawns it; the pointer must not be null.
+Store::Store(const Owned<StoreProcess>& _process)
+  : process(_process)
+{
+  StoreProcess* backend = CHECK_NOTNULL(process.get());
+  process::spawn(backend);
+}
+
+
+// Terminates the backing process and waits for it before returning.
+Store::~Store()
+{
+  StoreProcess* backend = process.get();
+
+  process::terminate(backend);
+  process::wait(backend);
+}
+
+
+// Dispatches to StoreProcess::recover.
+Future<Nothing> Store::recover()
+{
+  StoreProcess* backend = process.get();
+  return dispatch(backend, &StoreProcess::recover);
+}
+
+
+// Dispatches to StoreProcess::get.
+Future<vector<string>> Store::get(const mesos::Image& image)
+{
+  StoreProcess* backend = process.get();
+  return dispatch(backend, &StoreProcess::get, image);
+}
+
+
+// Resolves a Docker image to the rootfs directories of its layers. Fails
+// for any image type other than DOCKER.
+Future<vector<string>> StoreProcess::get(const mesos::Image& image)
+{
+  const bool isDocker = (image.type() == mesos::Image::DOCKER);
+  if (!isDocker) {
+    return Failure("Docker provisioner store only supports Docker images");
+  }
+
+  Image::Name name = parseImageName(image.docker().name());
+
+  // Ask the metadata manager for the image and let '_get' turn its answer
+  // into an Image.
+  Future<Image> stored = metadataManager->get(name)
+    .then(defer(self(), &Self::_get, name, lambda::_1));
+
+  // Map the image's layers to their rootfs paths.
+  return stored.then(defer(self(), &Self::__get, lambda::_1));
+}
+
+
+// Resolves `name` to an Image. If `image` is already set it is returned as
+// is. Otherwise the image is pulled into a newly created staging
+// directory, its layers are moved via 'moveLayers' and the result is
+// recorded via 'storeImage'. The staging directory is removed once that
+// chain completes, whatever the outcome.
+Future<Image> StoreProcess::_get(
+    const Image::Name& name,
+    const Option<Image>& image)
+{
+  if (image.isSome()) {
+    return image.get();
+  }
+
+  Try<string> staging =
+    os::mkdtemp(paths::getStagingTempDir(flags.docker_store_dir));
+
+  if (staging.isError()) {
+    // Include the underlying reason so the failure can be diagnosed.
+    return Failure(
+        "Failed to create a staging directory: " + staging.error());
+  }
+
+  return puller->pull(name, staging.get())
+    .then(defer(self(), &Self::moveLayers, staging.get(), lambda::_1))
+    .then(defer(self(), &Self::storeImage, name, lambda::_1))
+    .onAny([staging]() {
+      // Best-effort cleanup: a failure here is only logged.
+      Try<Nothing> rmdir = os::rmdir(staging.get());
+      if (rmdir.isError()) {
+        LOG(WARNING) << "Failed to remove staging directory: " << rmdir.error();
+      }
+    });
+}
+
+
+// Maps each layer id of `image` to that layer's rootfs path under the
+// Docker store directory, preserving the order of the layer ids.
+Future<vector<string>> StoreProcess::__get(const Image& image)
+{
+  vector<string> rootfsPaths;
+
+  for (const string& layerId : image.layer_ids()) {
+    rootfsPaths.push_back(
+        paths::getImageLayerRootfsPath(flags.docker_store_dir, layerId));
+  }
+
+  return rootfsPaths;
+}
+
+
+// Recovery is delegated entirely to the metadata manager.
+Future<Nothing> StoreProcess::recover()
+{
+  Future<Nothing> recovered = metadataManager->recover();
+  return recovered;
+}
+
+
+// Moves every layer in `layerPaths` (pairs of layer id and layer path)
+// with 'moveLayer' and, once all moves have completed, yields the layer
+// ids in the order given. `staging` is not used here.
+Future<vector<string>> StoreProcess::moveLayers(
+    const string& staging,
+    const list<pair<string, string>>& layerPaths)
+{
+  list<Future<Nothing>> moves;
+  for (const pair<string, string>& entry : layerPaths) {
+    moves.push_back(moveLayer(entry));
+  }
+
+  return collect(moves)
+    .then([layerPaths]() {
+        vector<string> ids;
+        for (const pair<string, string>& entry : layerPaths) {
+          ids.push_back(entry.first);
+        }
+
+        return ids;
+    });
+}
+
+
+// Records the image `name` with its layer ids in the metadata manager.
+Future<Image> StoreProcess::storeImage(
+    const Image::Name& name,
+    const vector<string>& layerIds)
+{
+  Future<Image> stored = metadataManager->put(name, layerIds);
+  return stored;
+}
+
+
+// Moves a single layer into the store. `layerPath.first` is the layer id
+// and `layerPath.second` the path the layer currently lives at. Fails if
+// the source path is missing, if the layer's directory in the store cannot
+// be created, or if the rename fails.
+Future<Nothing> StoreProcess::moveLayer(const pair<string, string>& layerPath)
+{
+  const string& layerId = layerPath.first;
+  const string& source = layerPath.second;
+
+  if (!os::exists(source)) {
+    return Failure("Unable to find layer '" + layerId + "' in '" +
+                   source + "'");
+  }
+
+  // Make sure the layer's directory exists in the store.
+  const string layerDir =
+    paths::getImageLayerPath(flags.docker_store_dir, layerId);
+
+  if (!os::exists(layerDir)) {
+    Try<Nothing> created = os::mkdir(layerDir);
+    if (created.isError()) {
+      return Failure("Failed to create layer path in store for id '" +
+                     layerId + "': " + created.error());
+    }
+  }
+
+  // Rename the layer to its rootfs path in the store.
+  Try<Nothing> moved = os::rename(
+      source,
+      paths::getImageLayerRootfsPath(flags.docker_store_dir, layerId));
+
+  if (moved.isError()) {
+    return Failure("Failed to move layer '" + layerId +
+                   "' to store directory: " + moved.error());
+  }
+
+  return Nothing();
+}
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/docker/store.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/docker/store.hpp b/src/slave/containerizer/mesos/provisioner/docker/store.hpp
new file mode 100644
index 0000000..95e46b9
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/docker/store.hpp
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_DOCKER_STORE_HPP__
+#define __PROVISIONER_DOCKER_STORE_HPP__
+
+#include <string>
+
+#include <stout/try.hpp>
+
+#include <process/future.hpp>
+
+#include "slave/containerizer/mesos/provisioner/store.hpp"
+
+#include "slave/flags.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace docker {
+
+// Forward Declarations.
+class Puller;
+class StoreProcess;
+
+
+// Docker image store: fetches Docker images and stores them on disk.
+// Instances are obtained through 'create'.
+class Store : public slave::Store
+{
+public:
+  static Try<process::Owned<slave::Store>> create(const Flags& flags);
+
+  ~Store();
+
+  process::Future<Nothing> recover();
+
+  process::Future<std::vector<std::string>> get(const mesos::Image& image);
+
+private:
+  explicit Store(const process::Owned<StoreProcess>& _process);
+
+  // Neither copyable nor assignable.
+  Store(const Store&) = delete;
+  Store& operator=(const Store&) = delete;
+
+  process::Owned<StoreProcess> process;
+};
+
+} // namespace docker {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_DOCKER_STORE_HPP__

[10/12] mesos git commit: Relocated MesosContainerizer specific files to the correct location.

Posted by ji...@apache.org.

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/network/port_mapping.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/network/port_mapping.cpp b/src/slave/containerizer/isolators/network/port_mapping.cpp
deleted file mode 100644
index e6bb75e..0000000
--- a/src/slave/containerizer/isolators/network/port_mapping.cpp
+++ /dev/null
@@ -1,3792 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <limits.h>
-#include <string.h>
-#include <unistd.h>
-
-#include <iostream>
-#include <vector>
-
-#include <glog/logging.h>
-
-#include <mesos/mesos.hpp>
-
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/io.hpp>
-#include <process/pid.hpp>
-#include <process/subprocess.hpp>
-
-#include <stout/error.hpp>
-#include <stout/foreach.hpp>
-#include <stout/fs.hpp>
-#include <stout/hashset.hpp>
-#include <stout/json.hpp>
-#include <stout/lambda.hpp>
-#include <stout/mac.hpp>
-#include <stout/multihashmap.hpp>
-#include <stout/numify.hpp>
-#include <stout/os.hpp>
-#include <stout/option.hpp>
-#include <stout/path.hpp>
-#include <stout/protobuf.hpp>
-#include <stout/result.hpp>
-#include <stout/stringify.hpp>
-#include <stout/strings.hpp>
-#include <stout/utils.hpp>
-
-#include <stout/os/exists.hpp>
-#include <stout/os/stat.hpp>
-
-#include "common/status_utils.hpp"
-
-#include "linux/fs.hpp"
-#include "linux/ns.hpp"
-
-#include "linux/routing/route.hpp"
-#include "linux/routing/utils.hpp"
-
-#include "linux/routing/diagnosis/diagnosis.hpp"
-
-#include "linux/routing/filter/basic.hpp"
-#include "linux/routing/filter/icmp.hpp"
-#include "linux/routing/filter/ip.hpp"
-
-#include "linux/routing/handle.hpp"
-
-#include "linux/routing/link/link.hpp"
-
-#include "linux/routing/queueing/fq_codel.hpp"
-#include "linux/routing/queueing/htb.hpp"
-#include "linux/routing/queueing/ingress.hpp"
-#include "linux/routing/queueing/statistics.hpp"
-
-#include "mesos/resources.hpp"
-
-#include "slave/constants.hpp"
-
-#include "slave/containerizer/isolators/network/port_mapping.hpp"
-
-using namespace mesos::internal;
-
-using namespace process;
-
-using namespace routing;
-using namespace routing::filter;
-using namespace routing::queueing;
-using namespace routing::queueing::statistics;
-
-using std::cerr;
-using std::cout;
-using std::dec;
-using std::endl;
-using std::hex;
-using std::list;
-using std::ostringstream;
-using std::set;
-using std::sort;
-using std::string;
-using std::vector;
-
-using filter::ip::PortRange;
-
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::ContainerState;
-using mesos::slave::Isolator;
-
-// An old glibc might not have this symbol.
-#ifndef MNT_DETACH
-#define MNT_DETACH 2
-#endif
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// The minimum number of ephemeral ports a container should have.
-static const uint16_t MIN_EPHEMERAL_PORTS_SIZE = 16;
-
-// Linux traffic control is a combination of queueing disciplines,
-// filters and classes organized as a tree for the ingress (rx) and
-// egress (tx) flows for each interface. Each container provides two
-// networking interfaces, a virtual eth0 and a loopback interface. The
-// flow of packets from the external network to container is shown
-// below:
-//
-//   +----------------------+----------------------+
-//   |                   Container                 |
-//   |----------------------|----------------------|
-//   |       eth0           |          lo          |
-//   +----------------------+----------------------+
-//          ^   |         ^           |
-//      [3] |   | [4]     |           |
-//          |   |     [7] +-----------+ [10]
-//          |   |
-//          |   |     [8] +-----------+ [9]
-//      [2] |   | [5]     |           |
-//          |   v         v           v
-//   +----------------------+----------------------+
-//   |      veth0           |          lo          |
-//   +----------------------|----------------------+
-//   |                     Host                    |
-//   |----------------------|----------------------|
-//   |                    eth0                     |
-//   +----------------------+----------------------|
-//                    ^           |
-//                [1] |           | [6]
-//                    |           v
-//
-// Traffic flowing from outside the network into a container enters
-// the system via the host ingress interface [1] and is routed based
-// on destination port to the outbound interface for the matching
-// container [2], which forwards the packet to the container's inbound
-// virtual interface. Outbound traffic destined for the external
-// network flows along the reverse path [4,5,6]. Loopback traffic is
-// directed to the corresponding Ethernet interface, either [7,10] or
-// [8,9] where the same destination port routing can be applied as to
-// external traffic. We use traffic control filters at several of the
-// interfaces to create these packet paths.
-//
-// Linux provides only a very simple topology for ingress interfaces.
-// A root is provided on a fixed handle (handle::INGRESS_ROOT) under
-// which a single qdisc can be installed, with handle ingress::HANDLE.
-// Traffic control filters can then be attached to the ingress qdisc.
-// We install one or more ingress filters on the host eth0 [1] to
-// direct traffic to the correct container, and on the container
-// virtual eth0 [5] to direct traffic to other containers or out of
-// the box. Since we know the ip port assignments for each container,
-// we can direct traffic directly to the appropriate container.
-// However, for ICMP and ARP traffic where no equivalent to a port
-// exists, we send a copy of the packet to every container and rely on
-// the network stack to drop unexpected packets.
-//
-// We install a Hierarchical Token Bucket (HTB) qdisc and class to
-// limit the outbound traffic bandwidth as the egress qdisc inside the
-// container [4] and then add a fq_codel qdisc to limit head of line
-// blocking on the egress filter. The egress traffic control chain is
-// thus:
-//
-// root device: handle::EGRESS_ROOT ->
-//    htb egress qdisc: CONTAINER_TX_HTB_HANDLE ->
-//        htb rate limiting class: CONTAINER_TX_HTB_CLASS_ID ->
-//            buffer-bloat reduction: FQ_CODEL
-constexpr Handle CONTAINER_TX_HTB_HANDLE = Handle(1, 0);
-constexpr Handle CONTAINER_TX_HTB_CLASS_ID =
-    Handle(CONTAINER_TX_HTB_HANDLE, 1);
-
-
-// Finally we create a second fq_codel qdisc on the public interface
-// of the host [6] to reduce performance interference between
-// containers. We create independent flows for each container, and
-// one for the host, which ensures packets from each container are
-// guaranteed fair access to the host interface. This egress traffic
-// control chain for the host interface is thus:
-//
-// root device: handle::EGRESS_ROOT ->
-//    buffer-bloat reduction: FQ_CODEL
-constexpr Handle HOST_TX_FQ_CODEL_HANDLE = Handle(1, 0);
-
-
-// The primary priority used by each type of filter.
-static const uint8_t ARP_FILTER_PRIORITY = 1;
-static const uint8_t ICMP_FILTER_PRIORITY = 2;
-static const uint8_t IP_FILTER_PRIORITY = 3;
-static const uint8_t DEFAULT_FILTER_PRIORITY = 4;
-
-
-// The secondary priorities used by filters.
-static const uint8_t HIGH = 1;
-static const uint8_t NORMAL = 2;
-static const uint8_t LOW = 3;
-
-
-// We assign a separate flow on host eth0 egress for each container
-// (See MESOS-2422 for details). Host egress traffic is assigned to a
-// reserved flow (HOST_FLOWID). ARP and ICMP traffic from containers
-// are not heavy, so they can share the same flow.
-static const uint16_t HOST_FLOWID = 1;
-static const uint16_t ARP_FLOWID = 2;
-static const uint16_t ICMP_FLOWID = 2;
-static const uint16_t CONTAINER_MIN_FLOWID = 3;
-
-
-// The well known ports. Used for sanity check.
-// Returns the half-open interval [0, 1024): the lower bound 0 is
-// closed (included) and the upper bound 1024 is open (excluded).
-static Interval<uint16_t> WELL_KNOWN_PORTS()
-{
-  return (Bound<uint16_t>::closed(0), Bound<uint16_t>::open(1024));
-}
-
-
-/////////////////////////////////////////////////
-// Helper functions for the isolator.
-/////////////////////////////////////////////////
-
-// Given an integer x, find the largest integer t such that t <= x and
-// t is a power of 2. Returns 0 when x is 0.
-static uint32_t roundDownToPowerOfTwo(uint32_t x)
-{
-  // Mutate x from binary 00001XXX to binary 00001111, i.e., set every
-  // bit below the most significant set bit.
-
-  // The most significant set bit is a 1, so OR-ing in (x >> 1) makes
-  // sure the first 2 most significant bits are 1s.
-  x = x | (x >> 1);
-
-  // Now that the 2 most significant bits are 1s, make sure the first
-  // 4 most significant bits are 1s, too.
-  x = x | (x >> 2);
-
-  // We keep going. Note that the 0s left to the MSB are never turned
-  // to 1s.
-  x = x | (x >> 4);
-  x = x | (x >> 8);
-
-  // Now we have covered all 32 bits.
-  x = x | (x >> 16);
-
-  // binary 00001111 - (binary 00001111 >> 1) == binary 00001000, i.e.,
-  // only the most significant set bit of the original x remains.
-  return x - (x >> 1);
-}
-
-
-// Returns the name of the host end of the virtual ethernet pair for a
-// given container. The kernel restricts link name to 16 characters or
-// less, so we cannot put container ID into the device name. Instead,
-// we use the pid of the executor process forked by the slave to
-// uniquely name the device for each container. It's safe because we
-// cannot have two active containers having the same pid for the
-// executor process.
-//
-// The name is PORT_MAPPING_VETH_PREFIX() followed by the stringified
-// pid; 'getPidFromVeth' performs the reverse mapping.
-static string veth(pid_t pid)
-{
-  return PORT_MAPPING_VETH_PREFIX() + stringify(pid);
-}
-
-
-// Extracts the pid from the given veth name. Returns None if the name
-// does not start with PORT_MAPPING_VETH_PREFIX(), or if what follows
-// the prefix cannot be parsed as a pid.
-static Option<pid_t> getPidFromVeth(const string& veth)
-{
-  if (strings::startsWith(veth, PORT_MAPPING_VETH_PREFIX())) {
-    // Strip the prefix and parse the remainder as a number.
-    Try<pid_t> pid = numify<pid_t>(
-        strings::remove(veth, PORT_MAPPING_VETH_PREFIX(), strings::PREFIX));
-
-    if (pid.isSome()) {
-      return pid.get();
-    }
-  }
-
-  return None();
-}
-
-
-// Extracts the container ID from the symlink that points to the
-// network namespace handle. The following is the layout of the bind
-// mount root and bind mount symlink root:
-//  <PORT_MAPPING_BIND_MOUNT_ROOT()>
-//    |--- 3945 (pid)                           <-|
-//                                                |
-//  <PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT()>      |
-//    |--- ecf293e7-e6e8-4cbc-aaee-4d6c958aa276 --|
-//         (symlink: container ID -> pid)
-//
-// Returns an Error if 'symlink' is not a symbolic link. The container
-// ID is taken from the basename of the symlink path itself; the
-// symlink target is not followed here.
-static Try<ContainerID> getContainerIdFromSymlink(const string& symlink)
-{
-  if (!os::stat::islink(symlink)) {
-    return Error("Not a symlink");
-  }
-
-  string _containerId = Path(symlink).basename();
-
-  ContainerID containerId;
-  containerId.set_value(_containerId);
-
-  return containerId;
-}
-
-
-// Extracts the pid from the network namespace handle. Returns None if
-// the handle is clearly not created by us (its basename is not a
-// number), and an Error if the handle is a symlink.
-static Result<pid_t> getPidFromNamespaceHandle(const string& handle)
-{
-  if (os::stat::islink(handle)) {
-    return Error("Not expecting a symlink");
-  }
-
-  // The handle is named after the pid (see 'getNamespaceHandlePath').
-  string _pid = Path(handle).basename();
-
-  Try<pid_t> pid = numify<pid_t>(_pid);
-  if (pid.isError()) {
-    return None();
-  }
-
-  return pid.get();
-}
-
-
-// Extracts the pid from the symlink that points to the network
-// namespace handle. Returns None if it's a dangling symlink, and an
-// Error if 'symlink' is not a symlink or cannot be resolved. Once the
-// target is resolved, the result is whatever
-// 'getPidFromNamespaceHandle' returns for it.
-static Result<pid_t> getPidFromSymlink(const string& symlink)
-{
-  if (!os::stat::islink(symlink)) {
-    return Error("Not a symlink");
-  }
-
-  Result<string> target = os::realpath(symlink);
-  if (target.isError()) {
-    return Error("Failed to follow the symlink: " + target.error());
-  } else if (target.isNone()) {
-    // This is a dangling symlink.
-    return None();
-  }
-
-  return getPidFromNamespaceHandle(target.get());
-}
-
-
-// Returns the path of the symlink for the given container:
-// <PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT()>/<stringified container ID>.
-static string getSymlinkPath(const ContainerID& containerId)
-{
-  return path::join(
-      PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT(),
-      stringify(containerId));
-}
-
-
-// Returns the path of the network namespace handle for the given pid:
-// <PORT_MAPPING_BIND_MOUNT_ROOT()>/<stringified pid>.
-static string getNamespaceHandlePath(pid_t pid)
-{
-  return path::join(
-      PORT_MAPPING_BIND_MOUNT_ROOT(),
-      stringify(pid));
-}
-
-
-// Converts from value ranges to interval set. Each range contributes
-// the closed interval [begin, end]; intervals are accumulated with
-// '+=' on the set.
-// NOTE(review): the bounds are converted to uint16_t; presumably the
-// protobuf fields are wider, in which case values above 65535 would be
-// silently truncated. Confirm callers only pass valid port numbers.
-static IntervalSet<uint16_t> getIntervalSet(const Value::Ranges& ranges)
-{
-  IntervalSet<uint16_t> set;
-
-  for (int i = 0; i < ranges.range_size(); i++) {
-    set += (Bound<uint16_t>::closed(ranges.range(i).begin()),
-            Bound<uint16_t>::closed(ranges.range(i).end()));
-  }
-
-  return set;
-}
-
-/////////////////////////////////////////////////
-// Implementation for PortMappingUpdate.
-/////////////////////////////////////////////////
-
-const char* PortMappingUpdate::NAME = "update";
-
-
-// Registers the command line flags of the 'update' operation. None of
-// the flags is given a default here; 'PortMappingUpdate::execute'
-// checks which ones were supplied.
-PortMappingUpdate::Flags::Flags()
-{
-  add(&eth0_name,
-      "eth0_name",
-      "The name of the public network interface (e.g., eth0)");
-
-  add(&lo_name,
-      "lo_name",
-      "The name of the loopback network interface (e.g., lo)");
-
-  add(&pid,
-      "pid",
-      "The pid of the process whose namespaces we will enter");
-
-  add(&ports_to_add,
-      "ports_to_add",
-      "A collection of port ranges (formatted as a JSON object)\n"
-      "for which to add IP filters. E.g.,\n"
-      "--ports_to_add={\"range\":[{\"begin\":4,\"end\":8}]}");
-
-  add(&ports_to_remove,
-      "ports_to_remove",
-      "A collection of port ranges (formatted as a JSON object)\n"
-      "for which to remove IP filters. E.g.,\n"
-      "--ports_to_remove={\"range\":[{\"begin\":4,\"end\":8}]}");
-}
-
-
-// The following two helper functions allow us to convert from a
-// collection of port ranges to a JSON object and vice versa. They
-// will be used for the port mapping update operation.
-//
-// 'json' copies the begin/end of every PortRange in 'ranges' into a
-// Value::Ranges protobuf and returns that protobuf rendered as a JSON
-// object. 'Iterable' is any collection of PortRange that 'foreach'
-// can iterate over.
-template <typename Iterable>
-JSON::Object json(const Iterable& ranges)
-{
-  Value::Ranges values;
-  foreach (const PortRange& range, ranges) {
-    Value::Range value;
-    value.set_begin(range.begin());
-    value.set_end(range.end());
-
-    values.add_range()->CopyFrom(value);
-  }
-  return JSON::Protobuf(values);
-}
-
-
-// The inverse of 'json': parses a JSON object into a Value::Ranges
-// protobuf and converts every range into a PortRange, preserving the
-// order. Returns an Error if the JSON cannot be parsed or if any
-// begin/end pair is rejected by 'PortRange::fromBeginEnd'; no partial
-// result is returned in that case.
-static Try<vector<PortRange>> parse(const JSON::Object& object)
-{
-  Try<Value::Ranges> parsing = protobuf::parse<Value::Ranges>(object);
-  if (parsing.isError()) {
-    return Error("Failed to parse JSON: " + parsing.error());
-  }
-
-  vector<PortRange> ranges;
-  Value::Ranges values = parsing.get();
-  for (int i = 0; i < values.range_size(); i++) {
-    const Value::Range& value = values.range(i);
-    Try<PortRange> range = PortRange::fromBeginEnd(value.begin(), value.end());
-    if (range.isError()) {
-      return Error("Invalid port range: " + range.error());
-    }
-
-    ranges.push_back(range.get());
-  }
-  return ranges;
-}
-
-
-// Helper function to set up IP filters inside the container for a
-// given port range. Two filters are created, in this order:
-//   1. a 'terminal' filter on 'lo' (priority IP_FILTER_PRIORITY/HIGH);
-//   2. a redirect filter from 'eth0' to 'lo' for the loopback IP
-//      (priority IP_FILTER_PRIORITY/NORMAL).
-// Returns an Error if creating either filter fails or if either
-// filter already exists. Note that the first filter is not removed
-// when the second one fails.
-static Try<Nothing> addContainerIPFilters(
-    const PortRange& range,
-    const string& eth0,
-    const string& lo)
-{
-  // Add an IP packet filter on lo such that local traffic inside a
-  // container will not be redirected to eth0.
-  Try<bool> loTerminal = filter::ip::create(
-      lo,
-      ingress::HANDLE,
-      ip::Classifier(None(), None(), None(), range),
-      Priority(IP_FILTER_PRIORITY, HIGH),
-      action::Terminal());
-
-  if (loTerminal.isError()) {
-    return Error(
-        "Failed to create an IP packet filter on " + lo +
-        " which stops packets from being sent to " + eth0 +
-        ": " + loTerminal.error());
-  } else if (!loTerminal.get()) {
-    // 'false' means the filter was already there.
-    return Error(
-        "The IP packet filter on " + lo +
-        " which stops packets from being sent to " +
-        eth0 + " already exists");
-  }
-
-  // Add an IP packet filter (for loopback IP) from eth0 to lo to
-  // redirect all loopback IP traffic to lo.
-  Try<bool> eth0ToLoLoopback = filter::ip::create(
-      eth0,
-      ingress::HANDLE,
-      ip::Classifier(
-          None(),
-          net::IPNetwork::LOOPBACK_V4().address(),
-          None(),
-          range),
-      Priority(IP_FILTER_PRIORITY, NORMAL),
-      action::Redirect(lo));
-
-  if (eth0ToLoLoopback.isError()) {
-    return Error(
-        "Failed to create an IP packet filter (for loopback IP) from " +
-        eth0 + " to " + lo + ": " + eth0ToLoLoopback.error());
-  } else if (!eth0ToLoLoopback.get()) {
-    // 'false' means the filter was already there.
-    return Error(
-        "The IP packet filter (for loopback IP) from " +
-        eth0 + " to " + lo + " already exists");
-  }
-
-  return Nothing();
-}
-
-
-// Helper function to remove IP filters inside the container for a
-// given port range. This undoes 'addContainerIPFilters': it removes
-// the 'terminal' filter on 'lo' first and then the loopback redirect
-// filter from 'eth0' to 'lo', using the same classifiers. Returns an
-// Error if removing either filter fails or if either filter does not
-// exist; the second removal is not attempted when the first fails.
-static Try<Nothing> removeContainerIPFilters(
-    const PortRange& range,
-    const string& eth0,
-    const string& lo)
-{
-  // Remove the 'terminal' IP packet filter on lo.
-  Try<bool> loTerminal = filter::ip::remove(
-      lo,
-      ingress::HANDLE,
-      ip::Classifier(None(), None(), None(), range));
-
-  if (loTerminal.isError()) {
-    return Error(
-        "Failed to remove the IP packet filter on " + lo +
-        " which stops packets from being sent to " + eth0 +
-        ": " + loTerminal.error());
-  } else if (!loTerminal.get()) {
-    // 'false' means there was no such filter.
-    return Error(
-        "The IP packet filter on " + lo +
-        " which stops packets from being sent to " + eth0 +
-        " does not exist");
-  }
-
-  // Remove the IP packet filter (for loopback IP) from eth0 to lo.
-  Try<bool> eth0ToLoLoopback = filter::ip::remove(
-      eth0,
-      ingress::HANDLE,
-      ip::Classifier(
-          None(),
-          net::IPNetwork::LOOPBACK_V4().address(),
-          None(),
-          range));
-
-  if (eth0ToLoLoopback.isError()) {
-    return Error(
-        "Failed to remove the IP packet filter (for loopback IP) from " +
-        eth0 + " to " + lo + ": " + eth0ToLoLoopback.error());
-  } else if (!eth0ToLoLoopback.get()) {
-    // 'false' means there was no such filter.
-    return Error(
-        "The IP packet filter (for loopback IP) from " +
-        eth0 + " to " + lo + " does not exist");
-  }
-
-  return Nothing();
-}
-
-
-// Entry point of the 'update' operation: enters the network namespace
-// of the given pid and adds and/or removes the container IP filters
-// for the requested port ranges.
-//
-// Returns 0 on success (or after printing the usage for --help) and 1
-// on any failure; failures are reported on stderr. Port ranges are
-// added before any are removed, and the first failing range aborts
-// the operation without undoing the changes already made.
-int PortMappingUpdate::execute()
-{
-  if (flags.help) {
-    cerr << "Usage: " << name() << " [OPTIONS]" << endl << endl
-         << "Supported options:" << endl
-         << flags.usage();
-    return 0;
-  }
-
-  // Validate that all the required flags have been supplied.
-  if (flags.eth0_name.isNone()) {
-    cerr << "The public interface name (e.g., eth0) is not specified" << endl;
-    return 1;
-  }
-
-  if (flags.lo_name.isNone()) {
-    cerr << "The loopback interface name (e.g., lo) is not specified" << endl;
-    return 1;
-  }
-
-  if (flags.pid.isNone()) {
-    cerr << "The pid is not specified" << endl;
-    return 1;
-  }
-
-  // At least one of the two port collections has to be present.
-  if (flags.ports_to_add.isNone() && flags.ports_to_remove.isNone()) {
-    cerr << "Nothing to update" << endl;
-    return 1;
-  }
-
-  Option<vector<PortRange>> portsToAdd;
-  Option<vector<PortRange>> portsToRemove;
-
-  // Parse both collections up front so that malformed input is
-  // rejected before the namespace is entered or any filter is touched.
-  if (flags.ports_to_add.isSome()) {
-    Try<vector<PortRange>> parsing = parse(flags.ports_to_add.get());
-    if (parsing.isError()) {
-      cerr << "Parsing 'ports_to_add' failed: " << parsing.error() << endl;
-      return 1;
-    }
-    portsToAdd = parsing.get();
-  }
-
-  if (flags.ports_to_remove.isSome()) {
-    Try<vector<PortRange>> parsing = parse(flags.ports_to_remove.get());
-    if (parsing.isError()) {
-      cerr << "Parsing 'ports_to_remove' failed: " << parsing.error() << endl;
-      return 1;
-    }
-    portsToRemove = parsing.get();
-  }
-
-  // Enter the network namespace.
-  Try<Nothing> setns = ns::setns(flags.pid.get(), "net");
-  if (setns.isError()) {
-    cerr << "Failed to enter the network namespace of pid " << flags.pid.get()
-         << ": " << setns.error() << endl;
-    return 1;
-  }
-
-  // Update IP packet filters.
-  const string eth0 = flags.eth0_name.get();
-  const string lo = flags.lo_name.get();
-
-  if (portsToAdd.isSome()) {
-    foreach (const PortRange& range, portsToAdd.get()) {
-      Try<Nothing> add = addContainerIPFilters(range, eth0, lo);
-      if (add.isError()) {
-        cerr << "Failed to add IP filters: " << add.error() << endl;
-        return 1;
-      }
-    }
-  }
-
-  if (portsToRemove.isSome()) {
-    foreach (const PortRange& range, portsToRemove.get()) {
-      Try<Nothing> remove = removeContainerIPFilters(range, eth0, lo);
-      if (remove.isError()) {
-        cerr << "Failed to remove IP filters: " << remove.error() << endl;
-        return 1;
-      }
-    }
-  }
-
-  return 0;
-}
-
-/////////////////////////////////////////////////
-// Implementation for PortMappingStatistics.
-/////////////////////////////////////////////////
-
-const char* PortMappingStatistics::NAME = "statistics";
-
-
-// Registers the command line flags of the 'statistics' operation.
-// 'eth0_name' and 'pid' have no default and are checked by
-// 'PortMappingStatistics::execute'; the two socket statistics
-// switches default to false.
-PortMappingStatistics::Flags::Flags()
-{
-  add(&eth0_name,
-      "eth0_name",
-      "The name of the public network interface (e.g., eth0)");
-
-  add(&pid,
-      "pid",
-      "The pid of the process whose namespaces we will enter");
-
-  add(&enable_socket_statistics_summary,
-      "enable_socket_statistics_summary",
-      "Whether to collect socket statistics summary for this container\n",
-      false);
-
-  add(&enable_socket_statistics_details,
-      "enable_socket_statistics_details",
-      "Whether to collect socket statistics details (e.g., TCP RTT)\n"
-      "for this container.",
-      false);
-}
-
-
-// A helper that copies the traffic control statistics from the
-// statistics hashmap into the ResourceStatistics protocol buffer.
-//
-// A new TrafficControlStatistics entry tagged with 'id' is always
-// appended to 'result'. Each counter is set only if its key is
-// present in 'statistics'; absent counters are left unset.
-static void addTrafficControlStatistics(
-    const string& id,
-    const hashmap<string, uint64_t>& statistics,
-    ResourceStatistics* result)
-{
-  TrafficControlStatistics *tc = result->add_net_traffic_control_statistics();
-
-  tc->set_id(id);
-
-  // TODO(pbrett) Use protobuf reflection here.
-  if (statistics.contains(BACKLOG)) {
-    tc->set_backlog(statistics.at(BACKLOG));
-  }
-  if (statistics.contains(BYTES)) {
-    tc->set_bytes(statistics.at(BYTES));
-  }
-  if (statistics.contains(DROPS)) {
-    tc->set_drops(statistics.at(DROPS));
-  }
-  if (statistics.contains(OVERLIMITS)) {
-    tc->set_overlimits(statistics.at(OVERLIMITS));
-  }
-  if (statistics.contains(PACKETS)) {
-    tc->set_packets(statistics.at(PACKETS));
-  }
-  if (statistics.contains(QLEN)) {
-    tc->set_qlen(statistics.at(QLEN));
-  }
-  if (statistics.contains(RATE_BPS)) {
-    tc->set_ratebps(statistics.at(RATE_BPS));
-  }
-  if (statistics.contains(RATE_PPS)) {
-    tc->set_ratepps(statistics.at(RATE_PPS));
-  }
-  if (statistics.contains(REQUEUES)) {
-    tc->set_requeues(statistics.at(REQUEUES));
-  }
-}
-
-
-// Entry point of the 'statistics' operation: enters the network
-// namespace of the given pid, collects the optional socket statistics
-// and the traffic control statistics of the container, and prints
-// them to stdout as a JSON rendering of a ResourceStatistics protobuf.
-//
-// Returns 0 on success (or after printing the usage for --help) and 1
-// if a required flag is missing, the namespace cannot be entered,
-// /proc/net/sockstat cannot be read, or the socket information cannot
-// be retrieved. Failures to obtain the htb/fq_codel statistics are
-// only reported on stderr and do not change the exit status.
-int PortMappingStatistics::execute()
-{
-  if (flags.help) {
-    cerr << "Usage: " << name() << " [OPTIONS]" << endl << endl
-         << "Supported options:" << endl
-         << flags.usage();
-    return 0;
-  }
-
-  if (flags.pid.isNone()) {
-    cerr << "The pid is not specified" << endl;
-    return 1;
-  }
-
-  if (flags.eth0_name.isNone()) {
-    cerr << "The public interface name (e.g., eth0) is not specified" << endl;
-    return 1;
-  }
-
-  // Enter the network namespace.
-  Try<Nothing> setns = ns::setns(flags.pid.get(), "net");
-  if (setns.isError()) {
-    // This could happen if the executor exits before this function is
-    // invoked. We do not log here to avoid spurious logging.
-    return 1;
-  }
-
-  ResourceStatistics result;
-
-  // NOTE: We use a dummy value here since this field will be cleared
-  // before the result is sent to the containerizer.
-  result.set_timestamp(0);
-
-  if (flags.enable_socket_statistics_summary) {
-    // Collections for socket statistics summary are below.
-
-    // For TCP, get the number of ACTIVE and TIME_WAIT connections,
-    // from reading /proc/net/sockstat (/proc/net/sockstat6 for IPV6).
-    // This is not as expensive in the kernel because only counter
-    // values are accessed instead of a dump of all the sockets.
-    // Example output:
-
-    // $ cat /proc/net/sockstat
-    // sockets: used 1391
-    // TCP: inuse 33 orphan 0 tw 0 alloc 37 mem 6
-    // UDP: inuse 15 mem 7
-    // UDPLITE: inuse 0
-    // RAW: inuse 0
-    // FRAG: inuse 0 memory 0
-
-    Try<string> value = os::read("/proc/net/sockstat");
-    if (value.isError()) {
-      cerr << "Failed to read /proc/net/sockstat: " << value.error() << endl;
-      return 1;
-    }
-
-    foreach (const string& line, strings::tokenize(value.get(), "\n")) {
-      // Only the line that starts with "TCP" is of interest.
-      if (!strings::startsWith(line, "TCP")) {
-        continue;
-      }
-
-      // The line is a sequence of "<name> <value>" pairs; look for the
-      // "inuse" and "tw" names and parse the token that follows them.
-      vector<string> tokens = strings::tokenize(line, " ");
-      for (size_t i = 0; i < tokens.size(); i++) {
-        if (tokens[i] == "inuse") {
-          if (i + 1 >= tokens.size()) {
-            cerr << "Unexpected output from /proc/net/sockstat" << endl;
-            // Be a bit forgiving here since the /proc file
-            // output format can change, though not very likely.
-            continue;
-          }
-
-          // Set number of active TCP connections.
-          Try<size_t> inuse = numify<size_t>(tokens[i+1]);
-          if (inuse.isError()) {
-            cerr << "Failed to parse the number of tcp connections in use: "
-                 << inuse.error() << endl;
-            continue;
-          }
-
-          result.set_net_tcp_active_connections(inuse.get());
-        } else if (tokens[i] == "tw") {
-          if (i + 1 >= tokens.size()) {
-            cerr << "Unexpected output from /proc/net/sockstat" << endl;
-            // Be a bit forgiving here since the /proc file
-            // output format can change, though not very likely.
-            continue;
-          }
-
-          // Set number of TIME_WAIT TCP connections.
-          Try<size_t> tw = numify<size_t>(tokens[i+1]);
-          if (tw.isError()) {
-            cerr << "Failed to parse the number of tcp connections in"
-                 << " TIME_WAIT: " << tw.error() << endl;
-            continue;
-          }
-
-          result.set_net_tcp_time_wait_connections(tw.get());
-        }
-      }
-    }
-  }
-
-  if (flags.enable_socket_statistics_details) {
-    // Collections for socket statistics details are below.
-
-    // NOTE: If the underlying library uses the older version of
-    // kernel API, the family argument passed in may not be honored.
-    Try<vector<diagnosis::socket::Info>> infos =
-      diagnosis::socket::infos(AF_INET, diagnosis::socket::state::ALL);
-
-    // NOTE(review): the error detail (infos.error()) is not printed
-    // here; consider including it in the message.
-    if (infos.isError()) {
-      cerr << "Failed to retrieve the socket information" << endl;
-      return 1;
-    }
-
-    vector<uint32_t> RTTs;
-    foreach (const diagnosis::socket::Info& info, infos.get()) {
-      // We double check on family regardless.
-      if (info.family != AF_INET) {
-        continue;
-      }
-
-      // We consider all sockets that have non-zero rtt value.
-      if (info.tcpInfo.isSome() && info.tcpInfo.get().tcpi_rtt != 0) {
-        RTTs.push_back(info.tcpInfo.get().tcpi_rtt);
-      }
-    }
-
-    // Only print to stdout when we have results.
-    if (RTTs.size() > 0) {
-      std::sort(RTTs.begin(), RTTs.end());
-
-      // NOTE: The size of RTTs is usually within 1 million so we
-      // don't need to worry about overflow here.
-      // TODO(jieyu): Right now, we choose to use "Nearest rank" for
-      // simplicity. Consider directly using the Statistics abstraction
-      // which computes "Linear interpolation between closest ranks".
-      // http://en.wikipedia.org/wiki/Percentile
-      // Each index is size * p / 100 with p < 100 and integer
-      // division, so it is always smaller than RTTs.size().
-      size_t p50 = RTTs.size() * 50 / 100;
-      size_t p90 = RTTs.size() * 90 / 100;
-      size_t p95 = RTTs.size() * 95 / 100;
-      size_t p99 = RTTs.size() * 99 / 100;
-
-      result.set_net_tcp_rtt_microsecs_p50(RTTs[p50]);
-      result.set_net_tcp_rtt_microsecs_p90(RTTs[p90]);
-      result.set_net_tcp_rtt_microsecs_p95(RTTs[p95]);
-      result.set_net_tcp_rtt_microsecs_p99(RTTs[p99]);
-    }
-  }
-
-  // Collect traffic statistics for the container from the container
-  // virtual interface and export them in JSON.
-  const string& eth0 = flags.eth0_name.get();
-
-  // Overlimits are reported on the HTB qdisc at the egress root.
-  Result<hashmap<string, uint64_t>> statistics =
-    htb::statistics(eth0, EGRESS_ROOT);
-
-  if (statistics.isSome()) {
-    addTrafficControlStatistics(
-        NET_ISOLATOR_BW_LIMIT,
-        statistics.get(),
-        &result);
-  } else if (statistics.isNone()) {
-    // Traffic control statistics are only available when the
-    // container is created on a slave when the egress rate limit is
-    // on (i.e., egress_rate_limit_per_container flag is set). We
-    // can't just test for that flag here however, since the slave may
-    // have been restarted with different flags since the container
-    // was created. It is also possible that isolator statistics are
-    // unavailable because the container is in the process of being
-    // created or destroyed. Hence we do not report a lack of network
-    // statistics as an error.
-  } else if (statistics.isError()) {
-    // NOTE(review): the error detail (statistics.error()) is not
-    // printed here; consider including it in the message.
-    cerr << "Failed to get htb qdisc statistics on " << eth0
-         << " in namespace " << flags.pid.get() << endl;
-  }
-
-  // Drops due to the bandwidth limit should be reported at the leaf.
-  statistics = fq_codel::statistics(eth0, CONTAINER_TX_HTB_CLASS_ID);
-  if (statistics.isSome()) {
-    addTrafficControlStatistics(
-        NET_ISOLATOR_BLOAT_REDUCTION,
-        statistics.get(),
-        &result);
-  } else if (statistics.isNone()) {
-    // See discussion on network isolator statistics above.
-  } else if (statistics.isError()) {
-    // NOTE(review): the error detail (statistics.error()) is not
-    // printed here; consider including it in the message.
-    cerr << "Failed to get fq_codel qdisc statistics on " << eth0
-         << " in namespace " << flags.pid.get() << endl;
-  }
-
-  // Emit whatever has been collected; this is reached even when the
-  // traffic control statistics above were unavailable.
-  cout << stringify(JSON::Protobuf(result));
-  return 0;
-}
-
-
-/////////////////////////////////////////////////
-// Implementation for the isolator.
-/////////////////////////////////////////////////
-
-PortMappingIsolatorProcess::Metrics::Metrics()
-  : adding_eth0_ip_filters_errors(
-        "port_mapping/adding_eth0_ip_filters_errors"),
-    adding_eth0_ip_filters_already_exist(
-        "port_mapping/adding_eth0_ip_filters_already_exist"),
-    adding_eth0_egress_filters_errors(
-        "port_mapping/adding_eth0_egress_filters_errors"),
-    adding_eth0_egress_filters_already_exist(
-        "port_mapping/adding_eth0_egress_filters_already_exist"),
-    adding_lo_ip_filters_errors(
-        "port_mapping/adding_lo_ip_filters_errors"),
-    adding_lo_ip_filters_already_exist(
-        "port_mapping/adding_lo_ip_filters_already_exist"),
-    adding_veth_ip_filters_errors(
-        "port_mapping/adding_veth_ip_filters_errors"),
-    adding_veth_ip_filters_already_exist(
-        "port_mapping/adding_veth_ip_filters_already_exist"),
-    adding_veth_icmp_filters_errors(
-        "port_mapping/adding_veth_icmp_filters_errors"),
-    adding_veth_icmp_filters_already_exist(
-        "port_mapping/adding_veth_icmp_filters_already_exist"),
-    adding_veth_arp_filters_errors(
-        "port_mapping/adding_veth_arp_filters_errors"),
-    adding_veth_arp_filters_already_exist(
-        "port_mapping/adding_veth_arp_filters_already_exist"),
-    adding_eth0_icmp_filters_errors(
-        "port_mapping/adding_eth0_icmp_filters_errors"),
-    adding_eth0_icmp_filters_already_exist(
-        "port_mapping/adding_eth0_icmp_filters_already_exist"),
-    adding_eth0_arp_filters_errors(
-        "port_mapping/adding_eth0_arp_filters_errors"),
-    adding_eth0_arp_filters_already_exist(
-        "port_mapping/adding_eth0_arp_filters_already_exist"),
-    removing_eth0_ip_filters_errors(
-        "port_mapping/removing_eth0_ip_filters_errors"),
-    removing_eth0_ip_filters_do_not_exist(
-        "port_mapping/removing_eth0_ip_filters_do_not_exist"),
-    removing_eth0_egress_filters_errors(
-        "port_mapping/removing_eth0_egress_filters_errors"),
-    removing_eth0_egress_filters_do_not_exist(
-        "port_mapping/removinging_eth0_egress_filters_do_not_exist"),
-    removing_lo_ip_filters_errors(
-        "port_mapping/removing_lo_ip_filters_errors"),
-    removing_lo_ip_filters_do_not_exist(
-        "port_mapping/removing_lo_ip_filters_do_not_exist"),
-    removing_veth_ip_filters_errors(
-        "port_mapping/removing_veth_ip_filters_errors"),
-    removing_veth_ip_filters_do_not_exist(
-        "port_mapping/removing_veth_ip_filters_do_not_exist"),
-    removing_eth0_icmp_filters_errors(
-        "port_mapping/removing_eth0_icmp_filters_errors"),
-    removing_eth0_icmp_filters_do_not_exist(
-        "port_mapping/removing_eth0_icmp_filters_do_not_exist"),
-    removing_eth0_arp_filters_errors(
-        "port_mapping/removing_eth0_arp_filters_errors"),
-    removing_eth0_arp_filters_do_not_exist(
-        "port_mapping/removing_eth0_arp_filters_do_not_exist"),
-    updating_eth0_icmp_filters_errors(
-        "port_mapping/updating_eth0_icmp_filters_errors"),
-    updating_eth0_icmp_filters_already_exist(
-        "port_mapping/updating_eth0_icmp_filters_already_exist"),
-    updating_eth0_icmp_filters_do_not_exist(
-        "port_mapping/updating_eth0_icmp_filters_do_not_exist"),
-    updating_eth0_arp_filters_errors(
-        "port_mapping/updating_eth0_arp_filters_errors"),
-    updating_eth0_arp_filters_already_exist(
-        "port_mapping/updating_eth0_arp_filters_already_exist"),
-    updating_eth0_arp_filters_do_not_exist(
-        "port_mapping/updating_eth0_arp_filters_do_not_exist"),
-    updating_container_ip_filters_errors(
-        "port_mapping/updating_container_ip_filters_errors")
-{
-  process::metrics::add(adding_eth0_ip_filters_errors);
-  process::metrics::add(adding_eth0_ip_filters_already_exist);
-  process::metrics::add(adding_lo_ip_filters_errors);
-  process::metrics::add(adding_lo_ip_filters_already_exist);
-  process::metrics::add(adding_veth_ip_filters_errors);
-  process::metrics::add(adding_veth_ip_filters_already_exist);
-  process::metrics::add(adding_veth_icmp_filters_errors);
-  process::metrics::add(adding_veth_icmp_filters_already_exist);
-  process::metrics::add(adding_veth_arp_filters_errors);
-  process::metrics::add(adding_veth_arp_filters_already_exist);
-  process::metrics::add(adding_eth0_icmp_filters_errors);
-  process::metrics::add(adding_eth0_icmp_filters_already_exist);
-  process::metrics::add(adding_eth0_arp_filters_errors);
-  process::metrics::add(adding_eth0_arp_filters_already_exist);
-  process::metrics::add(removing_eth0_ip_filters_errors);
-  process::metrics::add(removing_eth0_ip_filters_do_not_exist);
-  process::metrics::add(removing_lo_ip_filters_errors);
-  process::metrics::add(removing_lo_ip_filters_do_not_exist);
-  process::metrics::add(removing_veth_ip_filters_errors);
-  process::metrics::add(removing_veth_ip_filters_do_not_exist);
-  process::metrics::add(removing_eth0_icmp_filters_errors);
-  process::metrics::add(removing_eth0_icmp_filters_do_not_exist);
-  process::metrics::add(removing_eth0_arp_filters_errors);
-  process::metrics::add(removing_eth0_arp_filters_do_not_exist);
-  process::metrics::add(updating_eth0_icmp_filters_errors);
-  process::metrics::add(updating_eth0_icmp_filters_already_exist);
-  process::metrics::add(updating_eth0_icmp_filters_do_not_exist);
-  process::metrics::add(updating_eth0_arp_filters_errors);
-  process::metrics::add(updating_eth0_arp_filters_already_exist);
-  process::metrics::add(updating_eth0_arp_filters_do_not_exist);
-  process::metrics::add(updating_container_ip_filters_errors);
-}
-
-
-PortMappingIsolatorProcess::Metrics::~Metrics()
-{ // Unregister every counter that the constructor body registered.
-  process::metrics::remove(adding_eth0_ip_filters_errors);
-  process::metrics::remove(adding_eth0_ip_filters_already_exist);
-  process::metrics::remove(adding_lo_ip_filters_errors);
-  process::metrics::remove(adding_lo_ip_filters_already_exist);
-  process::metrics::remove(adding_veth_ip_filters_errors);
-  process::metrics::remove(adding_veth_ip_filters_already_exist);
-  process::metrics::remove(adding_veth_icmp_filters_errors);
-  process::metrics::remove(adding_veth_icmp_filters_already_exist);
-  process::metrics::remove(adding_veth_arp_filters_errors);
-  process::metrics::remove(adding_veth_arp_filters_already_exist);
-  process::metrics::remove(adding_eth0_icmp_filters_errors);
-  process::metrics::remove(adding_eth0_icmp_filters_already_exist);
-  process::metrics::remove(adding_eth0_arp_filters_errors);
-  process::metrics::remove(adding_eth0_arp_filters_already_exist);
-  process::metrics::remove(removing_eth0_ip_filters_errors);
-  process::metrics::remove(removing_eth0_ip_filters_do_not_exist);
-  process::metrics::remove(removing_lo_ip_filters_errors);
-  process::metrics::remove(removing_lo_ip_filters_do_not_exist);
-  process::metrics::remove(removing_veth_ip_filters_errors);
-  process::metrics::remove(removing_veth_ip_filters_do_not_exist);
-  process::metrics::remove(removing_eth0_icmp_filters_errors);
-  process::metrics::remove(removing_eth0_icmp_filters_do_not_exist);
-  process::metrics::remove(removing_eth0_arp_filters_errors);
-  process::metrics::remove(removing_eth0_arp_filters_do_not_exist);
-  process::metrics::remove(updating_eth0_icmp_filters_errors);
-  process::metrics::remove(updating_eth0_icmp_filters_already_exist);
-  process::metrics::remove(updating_eth0_icmp_filters_do_not_exist);
-  process::metrics::remove(updating_eth0_arp_filters_errors);
-  process::metrics::remove(updating_eth0_arp_filters_already_exist);
-  process::metrics::remove(updating_eth0_arp_filters_do_not_exist);
-  process::metrics::remove(updating_container_ip_filters_errors);
-} // NOTE(review): eth0 egress filter counters are never added or removed.
-
-
-// Validates the host environment and, if everything checks out,
-// creates a port mapping network isolator.
-//
-// The checks are done in the following order: root permission,
-// network namespace support, the routing library, the 'tc' and 'ip'
-// commands, the 'ports' and 'ephemeral_ports' resources given in
-// '--resources', the public (eth0) and loopback (lo) interfaces, and
-// the optional egress rate limit. Any failure is reported by
-// returning an Error; nothing is thrown.
-//
-// NOTE: This function changes host state as a side effect. It sets
-// the MAC address and the MTU of the host loopback interface, creates
-// queueing disciplines on the host interfaces, writes several
-// '/proc/sys/net/ipv4/conf' entries, and self bind mounts
-// PORT_MAPPING_BIND_MOUNT_ROOT(). These changes are not rolled back
-// if a later step fails.
-//
-// On success, the returned Isolator wraps a newly allocated
-// PortMappingIsolatorProcess.
-Try<Isolator*> PortMappingIsolatorProcess::create(const Flags& flags)
-{
-  // Check for root permission.
-  if (geteuid() != 0) {
-    return Error("Using network isolator requires root permissions");
-  }
-
-  // Verify that the network namespace is available by checking the
-  // existence of the network namespace handle of the current process.
-  if (ns::namespaces().count("net") == 0) {
-    return Error(
-        "Using network isolator requires network namespace. "
-        "Make sure your kernel is newer than 3.4");
-  }
-
-  // Check the routing library.
-  Try<Nothing> check = routing::check();
-  if (check.isError()) {
-    return Error(
-        "Routing library check failed: " +
-        check.error());
-  }
-
-  // Check the availability of a few Linux commands that we will use.
-  // We use the blocking os::shell here because 'create' will only be
-  // invoked during initialization.
-  Try<string> checkCommandTc = os::shell("tc filter show");
-  if (checkCommandTc.isError()) {
-    return Error("Check command 'tc' failed: " + checkCommandTc.error());
-  }
-
-  Try<string> checkCommandIp = os::shell("ip link show");
-  if (checkCommandIp.isError()) {
-    return Error("Check command 'ip' failed: " + checkCommandIp.error());
-  }
-
-  Try<Resources> resources = Resources::parse(
-      flags.resources.getOrElse(""),
-      flags.default_role);
-
-  if (resources.isError()) {
-    return Error("Failed to parse --resources: " + resources.error());
-  }
-
-  // Get 'ports' resource from 'resources' flag. These ports will be
-  // treated as non-ephemeral ports.
-  IntervalSet<uint16_t> nonEphemeralPorts;
-  if (resources.get().ports().isSome()) {
-    nonEphemeralPorts = getIntervalSet(resources.get().ports().get());
-  }
-
-  // Get 'ephemeral_ports' resource from 'resources' flag. These ports
-  // will be allocated to each container as ephemeral ports.
-  IntervalSet<uint16_t> ephemeralPorts;
-  if (resources.get().ephemeral_ports().isSome()) {
-    ephemeralPorts = getIntervalSet(resources.get().ephemeral_ports().get());
-  }
-
-  // Each container requires at least one ephemeral port for slave
-  // executor communication. If no 'ephemeral_ports' resource is
-  // found, we will return error.
-  if (ephemeralPorts.empty()) {
-    return Error("Ephemeral ports are not specified");
-  }
-
-  // Sanity check to make sure that the ephemeral ports specified do
-  // not intersect with the specified non-ephemeral ports.
-  if (ephemeralPorts.intersects(nonEphemeralPorts)) {
-    return Error(
-        "The specified ephemeral ports " + stringify(ephemeralPorts) +
-        " intersect with the specified non-ephemeral ports " +
-        stringify(nonEphemeralPorts));
-  }
-
-  // This is a sanity check to make sure that the ephemeral ports
-  // specified do not intersect with the well known ports.
-  if (ephemeralPorts.intersects(WELL_KNOWN_PORTS())) {
-    return Error(
-        "The specified ephemeral ports " + stringify(ephemeralPorts) +
-        " intersect with well known ports " + stringify(WELL_KNOWN_PORTS()));
-  }
-
-  // Obtain the host ephemeral port range by reading the proc file
-  // system ('ip_local_port_range').
-  Try<string> value = os::read("/proc/sys/net/ipv4/ip_local_port_range");
-  if (value.isError()) {
-    return Error("Failed to read host ip_local_port_range: " + value.error());
-  }
-
-  // The file is expected to hold exactly two tab separated numbers:
-  // the begin and the end of the range.
-  vector<string> split = strings::split(strings::trim(value.get()), "\t");
-  if (split.size() != 2) {
-    return Error(
-        "Unexpected format from host ip_local_port_range: " + value.get());
-  }
-
-  Try<uint16_t> begin = numify<uint16_t>(split[0]);
-  if (begin.isError()) {
-    return Error(
-        "Failed to parse the begin of host ip_local_port_range: " + split[0]);
-  }
-
-  Try<uint16_t> end = numify<uint16_t>(split[1]);
-  if (end.isError()) {
-    return Error(
-        "Failed to parse the end of host ip_local_port_range: " + split[1]);
-  }
-
-  Interval<uint16_t> hostEphemeralPorts =
-    (Bound<uint16_t>::closed(begin.get()),
-     Bound<uint16_t>::closed(end.get()));
-
-  // Sanity check to make sure the specified ephemeral ports do not
-  // intersect with the ephemeral ports used by the host.
-  if (ephemeralPorts.intersects(hostEphemeralPorts)) {
-    return Error(
-        "The specified ephemeral ports " + stringify(ephemeralPorts) +
-        " intersect with the ephemeral ports used by the host " +
-        stringify(hostEphemeralPorts));
-  }
-
-  // TODO(chzhcn): Cross check ephemeral ports with used ports on the
-  // host (e.g., using port scan).
-
-  // Initialize the ephemeral ports allocator.
-
-  // In theory, any positive integer can be broken up into a few
-  // numbers that are power of 2 aligned. We choose to not allow this
-  // for now so that each container has a fixed (one) number of
-  // filters for ephemeral ports. This makes it easy to debug and
-  // infer performance.
-  if (roundDownToPowerOfTwo(flags.ephemeral_ports_per_container) !=
-      flags.ephemeral_ports_per_container) {
-    return Error(
-        "The number of ephemeral ports for each container (" +
-        stringify(flags.ephemeral_ports_per_container) +
-        ") is not a power of 2");
-  }
-
-  if (ephemeralPorts.size() < flags.ephemeral_ports_per_container) {
-    return Error(
-        "Network Isolator is given ephemeral ports of size: " +
-        stringify(ephemeralPorts.size()) + ", but asked to allocate " +
-        stringify(flags.ephemeral_ports_per_container) +
-        " ephemeral ports for a container");
-  }
-
-  if (flags.ephemeral_ports_per_container < MIN_EPHEMERAL_PORTS_SIZE) {
-    return Error(
-        "Each container has only " +
-        stringify(flags.ephemeral_ports_per_container) +
-        " ephemeral ports. The minimum required is: " +
-        stringify(MIN_EPHEMERAL_PORTS_SIZE));
-  }
-
-  Owned<EphemeralPortsAllocator> ephemeralPortsAllocator(
-      new EphemeralPortsAllocator(
-        ephemeralPorts,
-        flags.ephemeral_ports_per_container));
-
-  // Get the name of the public interface (e.g., eth0). If it is not
-  // specified, try to derive its name from the routing library.
-  Result<string> eth0 = link::eth0();
-  if (flags.eth0_name.isSome()) {
-    eth0 = flags.eth0_name.get();
-
-    // Check if the given public interface exists.
-    Try<bool> hostEth0Exists = link::exists(eth0.get());
-    if (hostEth0Exists.isError()) {
-      return Error(
-          "Failed to check if " + eth0.get() + " exists: " +
-          hostEth0Exists.error());
-    } else if (!hostEth0Exists.get()) {
-      return Error("The public interface " + eth0.get() + " does not exist");
-    }
-  } else if (!eth0.isSome()) {
-    // eth0 is not specified in the flag and we did not get a valid
-    // eth0 from the library. The result is either an error or none,
-    // and 'error()' must only be called in the former case.
-    return Error(
-        "Network Isolator failed to find a public interface: " +
-        (eth0.isError() ?
-            eth0.error() :
-            "no public interface was detected"));
-  }
-
-  LOG(INFO) << "Using " << eth0.get() << " as the public interface";
-
-  // Get the name of the loopback interface. If it is not specified,
-  // try to derive its name based on the loopback IP address.
-  Result<string> lo = link::lo();
-  if (flags.lo_name.isSome()) {
-    lo = flags.lo_name.get();
-
-    // Check if the given loopback interface exists.
-    Try<bool> hostLoExists = link::exists(lo.get());
-    if (hostLoExists.isError()) {
-      return Error(
-          "Failed to check if " + lo.get() + " exists: " +
-          hostLoExists.error());
-    } else if (!hostLoExists.get()) {
-      return Error("The loopback interface " + lo.get() + " does not exist");
-    }
-  } else if (!lo.isSome()) {
-    // lo is not specified in the flag and we did not get a valid
-    // lo from the library. The result is either an error or none,
-    // and 'error()' must only be called in the former case.
-    return Error(
-        "Network Isolator failed to find a loopback interface: " +
-        (lo.isError() ?
-            lo.error() :
-            "no loopback interface was detected"));
-  }
-
-  LOG(INFO) << "Using " << lo.get() << " as the loopback interface";
-
-  // If egress rate limit is provided, do a sanity check that it is
-  // not greater than the host physical link speed.
-  Option<Bytes> egressRateLimitPerContainer;
-  if (flags.egress_rate_limit_per_container.isSome()) {
-    // Read host physical link speed from /sys/class/net/eth0/speed.
-    // This value is in MBits/s.
-    Try<string> value =
-      os::read(path::join("/sys/class/net", eth0.get(), "speed"));
-
-    if (value.isError()) {
-      return Error(
-          "Failed to read " +
-          path::join("/sys/class/net", eth0.get(), "speed") +
-          ": " + value.error());
-    }
-
-    // The content of the file comes from the host, so a parse failure
-    // is reported like any other environment error instead of
-    // aborting the process.
-    Try<uint64_t> hostLinkSpeed = numify<uint64_t>(strings::trim(value.get()));
-    if (hostLinkSpeed.isError()) {
-      return Error(
-          "Failed to parse the link speed of " + eth0.get() +
-          ": " + hostLinkSpeed.error());
-    }
-
-    // It could be possible that the nic driver doesn't support
-    // reporting physical link speed. In that case, report error.
-    if (hostLinkSpeed.get() == 0xFFFFFFFF) {
-      return Error(
-          "Network Isolator failed to determine link speed for " + eth0.get());
-    }
-
-    // Convert host link speed to Bytes/s for comparison.
-    if (hostLinkSpeed.get() * 1000000 / 8 <
-        flags.egress_rate_limit_per_container.get().bytes()) {
-      return Error(
-          "The given egress traffic limit for containers " +
-          stringify(flags.egress_rate_limit_per_container.get().bytes()) +
-          " Bytes/s is greater than the host link speed " +
-          stringify(hostLinkSpeed.get() * 1000000 / 8) + " Bytes/s");
-    }
-
-    if (flags.egress_rate_limit_per_container.get() != Bytes(0)) {
-      egressRateLimitPerContainer = flags.egress_rate_limit_per_container.get();
-    } else {
-      LOG(WARNING) << "Ignoring the given zero egress rate limit";
-    }
-  }
-
-  // Get the host IP network, MAC and default gateway.
-  Result<net::IPNetwork> hostIPNetwork =
-    net::IPNetwork::fromLinkDevice(eth0.get(), AF_INET);
-
-  if (!hostIPNetwork.isSome()) {
-    return Error(
-        "Failed to get the public IP network of " + eth0.get() + ": " +
-        (hostIPNetwork.isError() ?
-            hostIPNetwork.error() :
-            "does not have an IPv4 network"));
-  }
-
-  Result<net::MAC> hostMAC = net::mac(eth0.get());
-  if (!hostMAC.isSome()) {
-    return Error(
-        "Failed to get the MAC address of " + eth0.get() + ": " +
-        (hostMAC.isError() ? hostMAC.error() : "does not have a MAC address"));
-  }
-
-  Result<net::IP> hostDefaultGateway = route::defaultGateway();
-  if (!hostDefaultGateway.isSome()) {
-    return Error(
-        "Failed to get the default gateway of the host: " +
-        (hostDefaultGateway.isError() ? hostDefaultGateway.error()
-        : "The default gateway of the host does not exist"));
-  }
-
-  // Set the MAC address of the host loopback interface (lo) so that
-  // it matches that of the host public interface (eth0).  A fairly
-  // recent kernel patch is needed for this operation to succeed:
-  // https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/:
-  // 25f929fbff0d1bcebf2e92656d33025cd330cbf8
-  Try<bool> setHostLoMAC = link::setMAC(lo.get(), hostMAC.get());
-  if (setHostLoMAC.isError()) {
-    return Error(
-        "Failed to set the MAC address of " + lo.get() +
-        ": " + setHostLoMAC.error());
-  }
-
-  // Set the MTU of the host loopback interface (lo) so that it
-  // matches that of the host public interface (eth0).
-  Result<unsigned int> hostEth0MTU = link::mtu(eth0.get());
-  if (hostEth0MTU.isError()) {
-    return Error(
-        "Failed to get the MTU of " + eth0.get() +
-        ": " + hostEth0MTU.error());
-  }
-
-  // The host public interface should exist since we just checked it.
-  CHECK_SOME(hostEth0MTU);
-
-  Try<bool> setHostLoMTU = link::setMTU(lo.get(), hostEth0MTU.get());
-  if (setHostLoMTU.isError()) {
-    return Error(
-        "Failed to set the MTU of " + lo.get() +
-        ": " + setHostLoMTU.error());
-  }
-
-  // Prepare the ingress queueing disciplines on host public interface
-  // (eth0) and host loopback interface (lo).
-  Try<bool> createHostEth0IngressQdisc = ingress::create(eth0.get());
-  if (createHostEth0IngressQdisc.isError()) {
-    return Error(
-        "Failed to create the ingress qdisc on " + eth0.get() +
-        ": " + createHostEth0IngressQdisc.error());
-  }
-
-  set<uint16_t> freeFlowIds;
-  if (flags.egress_unique_flow_per_container) {
-    // Prepare a fq_codel queueing discipline on host public interface
-    // (eth0) for egress flow classification.
-    //
-    // TODO(cwang): Maybe we can continue when some other egress qdisc
-    // exists because this is not a necessary qdisc for network
-    // isolation, but we don't want inconsistency, so we just fail in
-    // this case. See details in MESOS-2370.
-    Try<bool> createHostEth0EgressQdisc = fq_codel::create(
-        eth0.get(),
-        EGRESS_ROOT,
-        HOST_TX_FQ_CODEL_HANDLE);
-    if (createHostEth0EgressQdisc.isError()) {
-      return Error(
-          "Failed to create the egress qdisc on " + eth0.get() +
-          ": " + createHostEth0EgressQdisc.error());
-    }
-
-    // TODO(cwang): Make sure DEFAULT_FLOWS is large enough so that
-    // it's unlikely to run out of free flow IDs.
-    for (uint16_t i = CONTAINER_MIN_FLOWID; i < fq_codel::DEFAULT_FLOWS; i++) {
-      freeFlowIds.insert(i);
-    }
-  }
-
-  Try<bool> createHostLoQdisc = ingress::create(lo.get());
-  if (createHostLoQdisc.isError()) {
-    return Error(
-        "Failed to create the ingress qdisc on " + lo.get() +
-        ": " + createHostLoQdisc.error());
-  }
-
-  // Enable 'route_localnet' on host loopback interface (lo). This
-  // enables the use of 127.0.0.1/8 for local routing purpose. This
-  // feature only exists on kernel 3.6 or newer.
-  const string loRouteLocalnet =
-    path::join("/proc/sys/net/ipv4/conf", lo.get(), "route_localnet");
-
-  if (!os::exists(loRouteLocalnet)) {
-    // TODO(jieyu): Consider supporting running the isolator if this
-    // feature is not available. We need to conditionally disable
-    // routing for 127.0.0.1/8, and ask the tasks to use the public IP
-    // for container to container and container to host communication.
-    return Error("The kernel does not support 'route_localnet'");
-  }
-
-  Try<Nothing> write = os::write(loRouteLocalnet, "1");
-  if (write.isError()) {
-    return Error(
-        "Failed to enable route_localnet for " + lo.get() +
-        ": " + write.error());
-  }
-
-  // We disable 'rp_filter' and 'send_redirects' for host loopback
-  // interface (lo) to work around a kernel bug, which was only
-  // recently addressed in upstream in the following 3 commits.
-  // https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/:
-  //   6a662719c9868b3d6c7d26b3a085f0cd3cc15e64
-  //   0d5edc68739f1c1e0519acbea1d3f0c1882a15d7
-  //   e374c618b1465f0292047a9f4c244bd71ab5f1f0
-  // The workaround ensures packets don't get dropped at lo.
-  write = os::write("/proc/sys/net/ipv4/conf/all/rp_filter", "0");
-  if (write.isError()) {
-    return Error(
-        "Failed to disable rp_filter for all: " + write.error());
-  }
-
-  write = os::write(path::join(
-      "/proc/sys/net/ipv4/conf", lo.get(), "rp_filter"), "0");
-  if (write.isError()) {
-    return Error(
-        "Failed to disable rp_filter for " + lo.get() +
-        ": " + write.error());
-  }
-
-  write = os::write("/proc/sys/net/ipv4/conf/all/send_redirects", "0");
-  if (write.isError()) {
-    return Error(
-        "Failed to disable send_redirects for all: " + write.error());
-  }
-
-  write = os::write(path::join(
-      "/proc/sys/net/ipv4/conf", lo.get(), "send_redirects"), "0");
-  if (write.isError()) {
-    return Error(
-        "Failed to disable send_redirects for " + lo.get() +
-        ": " + write.error());
-  }
-
-  // We need to enable accept_local on host loopback interface (lo)
-  // for kernels older than 3.6. Refer to the following:
-  // https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/:
-  //   7a9bc9b81a5bc6e44ebc80ef781332e4385083f2
-  // https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt
-  write = os::write(path::join(
-      "/proc/sys/net/ipv4/conf", lo.get(), "accept_local"), "1");
-  if (write.isError()) {
-    return Error(
-        "Failed to enable accept_local for " + lo.get() +
-        ": " + write.error());
-  }
-
-  // Reading host network configurations. Each container will match
-  // these configurations.
-  hashset<string> procs;
-
-  // TODO(jieyu): The following is a partial list of all the
-  // configurations. In the future, we may want to expose these
-  // configurations using ContainerInfo.
-
-  // The kernel will use a default value for the following
-  // configurations inside a container. Therefore, we need to set them
-  // in the container to match that on the host.
-  procs.insert("/proc/sys/net/core/somaxconn");
-
-  // As of kernel 3.10, the following configurations are shared
-  // between host and containers, and therefore are not required to be
-  // set in containers. We keep them here just in case the kernel
-  // changes in the future.
-  procs.insert("/proc/sys/net/core/netdev_max_backlog");
-  procs.insert("/proc/sys/net/core/rmem_max");
-  procs.insert("/proc/sys/net/core/wmem_max");
-  procs.insert("/proc/sys/net/ipv4/tcp_keepalive_time");
-  procs.insert("/proc/sys/net/ipv4/tcp_keepalive_intvl");
-  procs.insert("/proc/sys/net/ipv4/tcp_keepalive_probes");
-  procs.insert("/proc/sys/net/ipv4/tcp_max_syn_backlog");
-  procs.insert("/proc/sys/net/ipv4/tcp_rmem");
-  procs.insert("/proc/sys/net/ipv4/tcp_retries2");
-  procs.insert("/proc/sys/net/ipv4/tcp_synack_retries");
-  procs.insert("/proc/sys/net/ipv4/tcp_wmem");
-  procs.insert("/proc/sys/net/ipv4/neigh/default/gc_thresh1");
-  procs.insert("/proc/sys/net/ipv4/neigh/default/gc_thresh2");
-  procs.insert("/proc/sys/net/ipv4/neigh/default/gc_thresh3");
-
-  // Entries that cannot be read on this host are silently skipped.
-  hashmap<string, string> hostNetworkConfigurations;
-  foreach (const string& proc, procs) {
-    Try<string> value = os::read(proc);
-    if (value.isSome()) {
-      LOG(INFO) << proc << " = '" << strings::trim(value.get()) << "'";
-      hostNetworkConfigurations[proc] = strings::trim(value.get());
-    }
-  }
-
-  // Self bind mount PORT_MAPPING_BIND_MOUNT_ROOT(). Since we use a
-  // new mount namespace for each container, for this mount point, we
-  // set '--make-rshared' on the host and set '--make-rslave' inside
-  // each container. This is important because when we unmount the
-  // network namespace handles on the host, those handles will be
-  // unmounted in the containers as well, but NOT vice versa.
-
-  // We first create the bind mount directory if it does not exist.
-  Try<Nothing> mkdir = os::mkdir(PORT_MAPPING_BIND_MOUNT_ROOT());
-  if (mkdir.isError()) {
-    return Error(
-        "Failed to create the bind mount root directory at " +
-        PORT_MAPPING_BIND_MOUNT_ROOT() + ": " + mkdir.error());
-  }
-
-  // Now, check '/proc/mounts' to see if
-  // PORT_MAPPING_BIND_MOUNT_ROOT() has already been self mounted.
-  Try<fs::MountTable> mountTable = fs::MountTable::read("/proc/mounts");
-  if (mountTable.isError()) {
-    return Error(
-        "Failed to read the mount table at '/proc/mounts': " +
-        mountTable.error());
-  }
-
-  Option<fs::MountTable::Entry> bindMountRoot;
-  foreach (const fs::MountTable::Entry& entry, mountTable.get().entries) {
-    if (entry.dir == PORT_MAPPING_BIND_MOUNT_ROOT()) {
-      bindMountRoot = entry;
-    }
-  }
-
-  // Self bind mount PORT_MAPPING_BIND_MOUNT_ROOT().
-  if (bindMountRoot.isNone()) {
-    // NOTE: Instead of using fs::mount to perform the bind mount, we
-    // use the shell command here because the syscall 'mount' does not
-    // update the mount table (i.e., /etc/mtab), which could cause
-    // issues for the shell command 'mount --make-rslave' inside the
-    // container. It's OK to use the blocking os::shell here because
-    // 'create' will only be invoked during initialization.
-    Try<string> mount = os::shell(
-        "mount --bind %s %s",
-        PORT_MAPPING_BIND_MOUNT_ROOT().c_str(),
-        PORT_MAPPING_BIND_MOUNT_ROOT().c_str());
-
-    if (mount.isError()) {
-      return Error(
-          "Failed to self bind mount '" + PORT_MAPPING_BIND_MOUNT_ROOT() +
-          "': " + mount.error());
-    }
-  }
-
-  // Mark the mount point PORT_MAPPING_BIND_MOUNT_ROOT() as
-  // recursively shared.
-  Try<string> mountShared = os::shell(
-      "mount --make-rshared %s",
-      PORT_MAPPING_BIND_MOUNT_ROOT().c_str());
-
-  if (mountShared.isError()) {
-    return Error(
-        "Failed to mark '" + PORT_MAPPING_BIND_MOUNT_ROOT() +
-        "' as recursively shared: " + mountShared.error());
-  }
-
-  // Create the network namespace handle symlink directory if it does
-  // not exist. It is used to host network namespace handle symlinks
-  // whose basename is a container ID. This allows us to recover
-  // container IDs for orphan containers (i.e., not known by the
-  // slave). This is introduced in 0.23.0.
-  mkdir = os::mkdir(PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT());
-  if (mkdir.isError()) {
-    return Error(
-        "Failed to create the bind mount symlink root directory at " +
-        PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT() + ": " + mkdir.error());
-  }
-
-  return new MesosIsolator(Owned<MesosIsolatorProcess>(
-      new PortMappingIsolatorProcess(
-          flags,
-          eth0.get(),
-          lo.get(),
-          hostMAC.get(),
-          hostIPNetwork.get(),
-          hostEth0MTU.get(),
-          hostDefaultGateway.get(),
-          hostNetworkConfigurations,
-          egressRateLimitPerContainer,
-          nonEphemeralPorts,
-          ephemeralPortsAllocator,
-          freeFlowIds)));
-}
-
-
-Future<Nothing> PortMappingIsolatorProcess::recover(
-    const list<ContainerState>& states,
-    const hashset<ContainerID>& orphans)
-{
-  // Extract pids from virtual device names (veth). This tells us
-  // about all the potential live containers on this slave.
-  Try<set<string>> links = net::links();
-  if (links.isError()) {
-    return Failure("Failed to get all the links: " + links.error());
-  }
-
-  hashset<pid_t> pids;
-  foreach (const string& name, links.get()) {
-    Option<pid_t> pid = getPidFromVeth(name);
-    // Not all links follow the naming: mesos{pid}, so we simply
-    // continue, e.g., eth0.
-    if (pid.isNone()) {
-      continue;
-    } else if (pids.contains(pid.get())) {
-      return Failure("Two virtual devices have the same name '" + name + "'");
-    }
-
-    pids.insert(pid.get());
-  }
-
-  // Scan the bind mount root to cleanup all stale network namespace
-  // handles that do not have an active veth associated with.
-  Try<list<string>> entries = os::ls(PORT_MAPPING_BIND_MOUNT_ROOT());
-  if (entries.isError()) {
-    return Failure(
-        "Failed to list bind mount root '" +
-        PORT_MAPPING_BIND_MOUNT_ROOT() +
-        "': " + entries.error());
-  }
-
-  foreach (const string& entry, entries.get()) {
-    const string path = path::join(PORT_MAPPING_BIND_MOUNT_ROOT(), entry);
-
-    // NOTE: We expect all regular files whose names are numbers under
-    // the bind mount root are network namespace handles.
-    Result<pid_t> pid = getPidFromNamespaceHandle(path);
-    if (pid.isError()) {
-      return Failure(
-          "Failed to get pid from network namespace handle '" +
-          path + "': " + pid.error());
-    } else if (pid.isNone()) {
-      // We ignore files that are clearly not network namespace
-      // handles created by us. It's likely that those are created by
-      // users or other tools.
-      LOG(WARNING) << "Unrecognized network namespace handle '" << path << "'";
-      continue;
-    }
-
-    // We cleanup the network namespace handle if the associated
-    // containers have clearly exited (i.e., the veth has gone). The
-    // cleanup here is best effort.
-    if (!pids.contains(pid.get())) {
-      LOG(INFO) << "Removing stale network namespace handle '" << path << "'";
-
-      Try<Nothing> unmount = fs::unmount(path, MNT_DETACH);
-      if (unmount.isError()) {
-        LOG(WARNING) << "Failed to unmount stale network namespace handle '"
-                     << path << "': " << unmount.error();
-      }
-
-      Try<Nothing> rm = os::rm(path);
-      if (rm.isError()) {
-        LOG(WARNING) << "Failed to remove stale network namespace handle '"
-                     << path << "': " << rm.error();
-      }
-    }
-  }
-
-  // Scan the bind mount symlink root for container IDs. This allows us
-  // to recover container IDs for orphan containers (i.e., not known
-  // by the slave). This is introduced in 0.23.0.
-  entries = os::ls(PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT());
-  if (entries.isError()) {
-    return Failure(
-        "Failed to list bind mount symlink root '" +
-        PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT() +
-        "': " + entries.error());
-  }
-
-  // This map stores the mapping between pids and container IDs
-  // recovered from the bind mount root that have valid veth links. We
-  // use a multihashmap here because multiple container IDs can map to
-  // the same pid if the removal of a symlink fails in '_cleanup()'
-  // and the pid is reused by a new container.
-  multihashmap<pid_t, ContainerID> linkers;
-
-  foreach (const string& entry, entries.get()) {
-    const string path =
-      path::join(PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT(), entry);
-
-    // We only create symlinks in this directory and assume
-    // non-symlink files are created by other users or tools,
-    // therefore will be ignored.
-    if (!os::stat::islink(path)) {
-      LOG(WARNING) << "Ignored non-symlink file '" << path
-                   << "' under bind mount symlink root '"
-                   << PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT() << "'";
-      continue;
-    }
-
-    // NOTE: We expect all symlinks under the bind mount symlink root
-    // to be container ID symlinks.
-
-    Try<ContainerID> containerId = getContainerIdFromSymlink(path);
-    if (containerId.isError()) {
-      return Failure(
-          "Failed to get container ID from network namespace handle symlink '" +
-          path + "': " + containerId.error());
-    }
-
-    Result<pid_t> pid = getPidFromSymlink(path);
-    if (pid.isError()) {
-      return Failure(
-          "Failed to get pid from network namespace handle symlink '" + path +
-          "': " + pid.error());
-    }
-
-    // We remove the symlink if it's dangling or the associated
-    // containers have clearly exited (i.e., the veth has gone). The
-    // cleanup here is best effort.
-    if (pid.isNone() || !pids.contains(pid.get())) {
-      LOG(INFO) << "Removing stale network namespace handle symlink '"
-                << path << "'";
-
-      Try<Nothing> rm = os::rm(path);
-      if (rm.isError()) {
-        LOG(WARNING) << "Failed to remove stale network namespace handle "
-                     << " symlink '" << path << "': " << rm.error();
-      }
-    } else {
-      LOG(INFO) << "Discovered network namespace handle symlink "
-                << containerId.get() << " -> " << pid.get();
-
-      linkers.put(pid.get(), containerId.get());
-    }
-  }
-
-  // If multiple container IDs point to the same pid, we remove both
-  // symlinks for safety (as if we cannot derive the container ID for
-  // orphans, which is OK because it'll be treated the same as those
-  // containers that are created by older (pre 0.23.0) versions). Note
-  // that it's possible that multiple container IDs map to the same
-  // pid if the removal of a symlink fails in '_cleanup()' and the pid
-  // is reused by a new container.
-  foreach (pid_t pid, linkers.keys()) {
-    list<ContainerID> containerIds = linkers.get(pid);
-    if (containerIds.size() > 1) {
-      foreach (const ContainerID& containerId, containerIds) {
-        const string linker = getSymlinkPath(containerId);
-
-        LOG(WARNING) << "Removing duplicated network namespace handle symlink '"
-                     << linker << "'";
-
-        Try<Nothing> rm = os::rm(linker);
-        if (rm.isError()) {
-          LOG(WARNING) << "Failed to remove duplicated network namespace "
-                       << "handle symlink '" << linker << "': " << rm.error();
-        }
-      }
-
-      linkers.remove(pid);
-    }
-  }
-
-  // Now, actually recover the isolator from slave's state.
-  foreach (const ContainerState& state, states) {
-    const ContainerID& containerId = state.container_id();
-    pid_t pid = state.pid();
-
-    VLOG(1) << "Recovering network isolator for container "
-            << containerId << " with pid " << pid;
-
-    if (!pids.contains(pid)) {
-      // There are two possible cases here:
-      //
-      // 1) The container was launched by the slave with network
-      //    isolation disabled, so the pid could not be found in the
-      //    device names in the system.
-      //
-      // 2) The container was launched by the slave with network
-      //    isolation enabled, but veth is removed (because the
-      //    corresponding container is destroyed), but the slave
-      //    restarts before it is able to write the sentinel file.
-      //
-      // In both cases, we treat the container as unmanaged. For case
-      // (2), it's safe to do so because the container has already
-      // been destroyed.
-      VLOG(1) << "Skipped recovery for container " << containerId
-              << "with pid " << pid << " as either it was not managed by "
-              << "the network isolator or it has already been destroyed";
-
-      unmanaged.insert(containerId);
-      continue;
-    }
-
-    Try<Info*> recover = _recover(pid);
-    if (recover.isError()) {
-      foreachvalue (Info* info, infos) {
-        delete info;
-      }
-
-      return Failure(
-          "Failed to recover container " + stringify(containerId) +
-          " with pid " + stringify(pid) + ": " + recover.error());
-    }
-
-    infos[containerId] = recover.get();
-
-    // Remove the successfully recovered pid.
-    pids.erase(pid);
-  }
-
-  // Recover orphans. Known orphans will be destroyed by containerizer
-  // using the normal cleanup path (refer to MESOS-2367 for details).
-  // Unknown orphans will be cleaned up immediately. The recovery will
-  // fail if there is some unknown orphan that cannot be cleaned up.
-  vector<Info*> unknownOrphans;
-
-  foreach (pid_t pid, pids) {
-    Try<Info*> recover = _recover(pid);
-    if (recover.isError()) {
-      foreachvalue (Info* info, infos) {
-        delete info;
-      }
-      foreach (Info* info, unknownOrphans) {
-        delete info;
-      }
-
-      return Failure(
-          "Failed to recover orphaned container with pid " +
-          stringify(pid) + ": " + recover.error());
-    }
-
-    if (linkers.get(pid).size() == 1) {
-      const ContainerID containerId = linkers.get(pid).front();
-      CHECK(!infos.contains(containerId));
-
-      if (orphans.contains(containerId)) {
-        infos[containerId] = recover.get();
-        continue;
-      }
-    }
-
-    unknownOrphans.push_back(recover.get());
-  }
-
-  foreach (Info* info, unknownOrphans) {
-    CHECK_SOME(info->pid);
-    pid_t pid = info->pid.get();
-
-    Option<ContainerID> containerId;
-    if (linkers.get(pid).size() == 1) {
-      containerId = linkers.get(pid).front();
-    }
-
-    // NOTE: If 'infos' is empty (means there is no regular container
-    // or known orphan), the '_cleanup' below will remove the ICMP and
-    // ARP packet filters on host eth0. This will cause subsequent
-    // calls to '_cleanup' for unknown orphans to fail. However, this
-    // is OK because when slave restarts and tries to recover again,
-    // it'll try to remove the remaining unknown orphans.
-    // TODO(jieyu): Consider call '_cleanup' for all the unknown
-    // orphans before returning even if error occurs.
-    Try<Nothing> cleanup = _cleanup(info, containerId);
-    if (cleanup.isError()) {
-      foreachvalue (Info* info, infos) {
-        delete info;
-      }
-
-      // TODO(jieyu): Also delete 'info' in unknownOrphans. Notice
-      // that some 'info' in unknownOrphans might have already been
-      // deleted in '_cleanup' above.
-
-      return Failure(
-          "Failed to cleanup orphaned container with pid " +
-          stringify(pid) + ": " + cleanup.error());
-    }
-  }
-
-  // TODO(cwang): Consider removing unrecognized flow classifiers from
-  // host eth0 egress.
-
-  LOG(INFO) << "Network isolator recovery complete";
-
-  return Nothing();
-}
-
-
-Try<PortMappingIsolatorProcess::Info*>
-PortMappingIsolatorProcess::_recover(pid_t pid)
-{
-  // Get all the IP filters on veth.
-  // NOTE: We only look at veth devices to recover port ranges
-  // assigned to each container. That's the reason why we need to make
-  // sure that we add filters to veth before adding filters to host
-  // eth0 and host lo. Also, we need to make sure we remove filters
-  // from host eth0 and host lo before removing filters from veth.
-  Result<vector<ip::Classifier>> vethIngressClassifiers =
-    ip::classifiers(veth(pid), ingress::HANDLE);
-
-  if (vethIngressClassifiers.isError()) {
-    return Error(
-        "Failed to get all the IP filters on " + veth(pid) +
-        ": " + vethIngressClassifiers.error());
-  } else if (vethIngressClassifiers.isNone()) {
-    return Error(
-        "Failed to get all the IP filters on " + veth(pid) +
-        ": link does not exist");
-  }
-
-  hashmap<PortRange, uint16_t> flowIds;
-
-  if (flags.egress_unique_flow_per_container) {
-    // Get all egress IP flow classifiers on eth0.
-    Result<vector<filter::Filter<ip::Classifier>>> eth0EgressFilters =
-      ip::filters(eth0, HOST_TX_FQ_CODEL_HANDLE);
-
-    if (eth0EgressFilters.isError()) {
-      return Error(
-          "Failed to get all the IP flow classifiers on " + eth0 +
-          ": " + eth0EgressFilters.error());
-    } else if (eth0EgressFilters.isNone()) {
-      return Error(
-          "Failed to get all the IP flow classifiers on " + eth0 +
-          ": link does not exist");
-    }
-
-    // Construct a port range to flow ID mapping from host eth0
-    // egress. This map will be used later.
-    foreach (const filter::Filter<ip::Classifier>& filter,
-             eth0EgressFilters.get()) {
-      const Option<PortRange> sourcePorts = filter.classifier.sourcePorts;
-      const Option<Handle> classid = filter.classid;
-
-      if (sourcePorts.isNone()) {
-        return Error("Missing source ports for filters on egress of " + eth0);
-      }
-
-      if (classid.isNone()) {
-        return Error("Missing classid for filters on egress of " + eth0);
-      }
-
-      if (flowIds.contains(sourcePorts.get())) {
-        return Error(
-          "Duplicated port range " + stringify(sourcePorts.get()) +
-          " detected on egress of " + eth0);
-      }
-
-      flowIds[sourcePorts.get()] = classid.get().secondary();
-    }
-  }
-
-  IntervalSet<uint16_t> nonEphemeralPorts;
-  IntervalSet<uint16_t> ephemeralPorts;
-  Option<uint16_t> flowId;
-
-  foreach (const ip::Classifier& classifier, vethIngressClassifiers.get()) {
-    const Option<PortRange> sourcePorts = classifier.sourcePorts;
-    const Option<PortRange> destinationPorts = classifier.destinationPorts;
-
-    // All the IP filters on veth used by us only have source ports.
-    if (sourcePorts.isNone() || destinationPorts.isSome()) {
-      return Error("Unexpected IP filter detected on " + veth(pid));
-    }
-
-    if (flowIds.contains(sourcePorts.get())) {
-      if (flowId.isNone()) {
-        flowId = flowIds.get(sourcePorts.get());
-      } else if (flowId != flowIds.get(sourcePorts.get())) {
-        return Error(
-            "A container is associated with multiple flows "
-            "on egress of " + eth0);
-      }
-    } else if (flowId.isSome()) {
-      // This is the case where some port range of a container is
-      // assigned to a flow while some isn't. This could happen if
-      // slave crashes while those filters are created. However, this
-      // is OK for us because packets by default go to the host flow.
-      LOG(WARNING) << "Container port range " << sourcePorts.get()
-                   << " does not have flow id " << flowId.get()
-                   << " assigned";
-    }
-
-    Interval<uint16_t> ports =
-      (Bound<uint16_t>::closed(sourcePorts.get().begin()),
-       Bound<uint16_t>::closed(sourcePorts.get().end()));
-
-    if (managedNonEphemeralPorts.contains(ports)) {
-      nonEphemeralPorts += ports;
-    } else if (ephemeralPortsAllocator->isManaged(ports)) {
-      // We have duplicate here because we have two IP filters with
-      // the same ephemeral port range (one for eth0 and one for lo).
-      // But we should never have two intersecting port ranges.
-      if (!ephemeralPorts.contains(ports) && ephemeralPorts.intersects(ports)) {
-        return Error("Unexpected intersected ephemeral port ranges");
-      }
-
-      ephemeralPorts += ports;
-    } else {
-      return Error("Unexpected IP filter detected on " + veth(pid));
-    }
-  }
-
-  Info* info = NULL;
-
-  if (ephemeralPorts.empty()) {
-    // NOTE: This is possible because the slave may crash while
-    // calling 'isolate()', leaving a partially isolated container. To
-    // clean up this partially isolated container, we still create an
-    // Info struct here and let the 'cleanup' function clean it up
-    // later.
-    LOG(WARNING) << "No ephemeral ports found for container with pid "
-                 << stringify(pid) << ". This could happen if slave crashes "
-                 << "while isolating a container";
-
-    info = new Info(nonEphemeralPorts, Interval<uint16_t>(), pid);
-  } else {
-    if (ephemeralPorts.intervalCount() != 1) {
-      return Error("Each container should have only one ephemeral port range");
-    }
-
-    // Tell the allocator that this ephemeral port range is used.
-    ephemeralPortsAllocator->allocate(*ephemeralPorts.begin());
-
-    info = new Info(nonEphemeralPorts, *ephemeralPorts.begin(), pid);
-
-    VLOG(1) << "Recovered network isolator for container with pid " << pid
-            << " non-ephemeral port ranges " << nonEphemeralPorts
-            << " and ephemeral port range " << *ephemeralPorts.begin();
-  }
-
-  if (flowId.isSome()) {
-    freeFlowIds.erase(flowId.get());
-    info->flowId = flowId.get();
-  }
-
-  return CHECK_NOTNULL(info);
-}
-
-
-Future<Option<ContainerPrepareInfo>> PortMappingIsolatorProcess::prepare(
-    const ContainerID& containerId,
-    const ExecutorInfo& executorInfo,
-    const string& directory,
-    const Option<string>& user)
-{
-  if (unmanaged.contains(containerId)) {
-    return Failure("Asked to prepare an unmanaged container");
-  }
-
-  if (infos.contains(containerId)) {
-    return Failure("Container has already been prepared");
-  }
-
-  Resources resources(executorInfo.resources());
-
-  IntervalSet<uint16_t> nonEphemeralPorts;
-
-  if (resources.ports().isSome()) {
-    nonEphemeralPorts = getIntervalSet(resources.ports().get());
-
-    // Sanity check to make sure that the assigned non-ephemeral ports
-    // for the container are part of the non-ephemeral ports specified
-    // by the slave.
-    if (!managedNonEphemeralPorts.contains(nonEphemeralPorts)) {
-        return Failure(
-            "Some non-ephemeral ports specified in " +
-            stringify(nonEphemeralPorts) +
-            " are not managed by the slave");
-    }
-  }
-
-  // TODO(jieyu): For now, we simply ignore the 'ephemeral_ports'
-  // specified in the executor info. However, this behavior needs to
-  // be changed once the master can make default allocations for
-  // ephemeral ports.
-  if (resources.ephemeral_ports().isSome()) {
-    LOG(WARNING) << "Ignoring the specified ephemeral_ports '"
-                 << resources.ephemeral_ports().get()
-                 << "' for container" << containerId
-                 << " of executor " << executorInfo.executor_id();
-  }
-
-  // Allocate the ephemeral ports used by this container.
-  Try<Interval<uint16_t>> ephemeralPorts = ephemeralPortsAllocator->allocate();
-  if (ephemeralPorts.isError()) {
-    return Failure(
-        "Failed to allocate ephemeral ports: " + ephemeralPorts.error());
-  }
-
-  infos[containerId] = new Info(nonEphemeralPorts, ephemeralPorts.get());
-
-  LOG(INFO) << "Using non-ephemeral ports " << nonEphemeralPorts
-            << " and ephemeral ports " << ephemeralPorts.get()
-            << " for container " << containerId << " of executor "
-            << executorInfo.executor_id();
-
-  ContainerPrepareInfo prepareInfo;
-  prepareInfo.add_commands()->set_value(scripts(infos[containerId]));
-
-  // NOTE: the port mapping isolator itself doesn't require mount
-  // namespace. However, if mount namespace is enabled because of
-  // other isolators, we need to set mount sharing accordingly for
-  // PORT_MAPPING_BIND_MOUNT_ROOT to avoid races described in
-  // MESOS-1558. So we turn on mount namespace here for consistency.
-  prepareInfo.set_namespaces(CLONE_NEWNET | CLONE_NEWNS);
-
-  return prepareInfo;
-}
-
-
-Future<Nothing> PortMappingIsolatorProcess::isolate(
-    const ContainerID& containerId,
-    pid_t pid)
-{
-  if (unmanaged.contains(containerId)) {
-    return Failure("Asked to isolate an unmanaged container");
-  }
-
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  Info* info = CHECK_NOTNULL(infos[containerId]);
-
-  if (info->pid.isSome()) {
-    return Failure("The container has already been isolated");
-  }
-
-  info->pid = pid;
-
-  if (flags.egress_unique_flow_per_container) {
-    info->flowId = getNextFlowId();
-  }
-
-  // Bind mount the network namespace handle of the process 'pid' to a
-  // directory to hold an extra reference to the network namespace
-  // which will be released in 'cleanup'. By holding the extra
-  // reference, the network namespace will not be destroyed even if
-  // the process 'pid' is gone, which allows us to explicitly control
-  // the network namespace life cycle.
-  const string source = path::join("/proc", stringify(pid), "ns", "net");
-  const string target = getNamespaceHandlePath(pid);
-
-  Try<Nothing> touch = os::touch(target);
-  if (touch.isError()) {
-    return Failure("Failed to create the bind mount point: " + touch.error());
-  }
-
-  Try<Nothing> mount = fs::mount(source, target, None(), MS_BIND, NULL);
-  if (mount.isError()) {
-    return Failure(
-        "Failed to mount the network namespace handle from '" +
-        source + "' to '" + target + "': " + mount.error());
-  }
-
-  LOG(INFO) << "Bind mounted '" << source << "' to '" << target
-            << "' for container " << containerId;
-
-  // Since 0.23.0, we create a symlink to the network namespace handle
-  // using the container ID. This serves two purposes. First, it
-  // allows us to recover the container ID later when slave restarts
-  // even if slave's checkpointed meta data is deleted. Second, it
-  // makes the debugging easier. See MESOS-2528 for details.
-  const string linker = getSymlinkPath(containerId);
-  Try<Nothing> symlink = ::fs::symlink(target, linker);
-  if (symlink.isError()) {
-    return Failure(
-        "Failed to symlink the network namespace handle '" +
-        linker + "' -> '" + target + "': " + symlink.error());
-  }
-
-  LOG(INFO) << "Created network namespace handle symlink '"
-            << linker << "' -> '" << target << "'";
-
-  // Create a virtual ethernet pair for this container.
-  Try<bool> createVethPair = link::create(veth(pid), eth0, pid);
-  if (createVethPair.isError()) {
-    return Failure(
-        "Failed to create virtual ethernet pair: " +
-        createVethPair.error());
-  }
-
-  // Disable IPv6 for veth as IPv6 packets won't be forwarded anyway.
-  const string disableIPv6 =
-    path::join("/proc/sys/net/ipv6/conf", veth(pid), "disable_ipv6");
-
-  if (os::exists(disableIPv6)) {
-    Try<Nothing> write = os::write(disableIPv6, "1");
-    if (write.isError()) {
-      return Failure(
-          "Failed to disable IPv6 for " + veth(pid) +
-          ": " + write.error());
-    }
-  }
-
-  // Sets the MAC address of veth to match the MAC address of the host
-  // public interface (eth0).
-  Try<bool> setVethMAC = link::setMAC(veth(pid), hostMAC);
-  if (setVethMAC.isError()) {
-    return Failure(
-        "Failed to set the MAC address of " + veth(pid) +
-        ": " + setVethMAC.error());
-  }
-
-  // Prepare the ingress queueing disciplines on veth.
-  Try<bool> createQdisc = ingress::create(veth(pid));
-  if (createQdisc.isError()) {
-    return Failure(
-        "Failed to create the ingress qdisc on " + veth(pid) +
-        ": " + createQdisc.error());
-  }
-
-  // Veth device should exist since we just created it.
-  CHECK(createQdisc.get());
-
-  // For each port range, add a set of IP packet filters to properly
-  // redirect IP traffic to/from containers.
-  foreach (const PortRange& range,
-           getPortRanges(info->nonEphemeralPorts + info->ephemeralPorts)) {
-    if (info->flowId.isSome()) {
-      LOG(INFO) << "Adding IP packet filters with ports " << range
-                << " with flow ID " << info->flowId.get()
-                << " for container " << containerId;
-    } else {
-      LOG(INFO) << "Adding IP packet filters with ports " << range
-                << " for container " << containerId;
-    }
-
-    Try<Nothing> add = addHostIPFilters(range, info->flowId, veth(pid));
-    if (add.isError()) {
-      return Failure(
-          "Failed to add IP packet filter with ports " +
-          stringify(range) + " for container with pid " +
-          stringify(pid) + ": " + add.error());
-    }
-  }
-
-  // Relay ICMP packets from veth of the container to host eth0.
-  Try<bool> icmpVethToEth0 = filter::icmp::create(
-      veth(pid),
-      ingress::HANDLE,
-      icmp::Classifier(None()),
-      Priority(ICMP_FILTER_PRIORITY, NORMAL),
-      action::Redirect(eth0));
-
-  if (icmpVethToEth0.isError()) {
-    ++metrics.adding_veth_icmp_filters_errors;
-
-    return Failure(
-        "Failed to create an ICMP packet filter from " + veth(pid) +
-        " to host " + eth0 + ": " + icmpVethToEth0.error());
-  } else if (!icmpVethToEth0.get()) {
-    ++metrics.adding_veth_icmp_filters_already_exist;
-
-    return Failure(
-        "The ICMP packet filter from " + veth(pid) +
-        " to host " + eth0 + " already exists");
-  }
-
-  // Relay ARP packets from veth of the container to host eth0.
-  Try<bool> arpVethToEth0 = filter::basic::create(
-      veth(pid),
-      ingress::HANDLE,
-      ETH_P_ARP,
-      Priority(ARP_FILTER_PRIORITY, NORMAL),
-      action::Redirect(eth0));
-
-  if (arpVethToEth0.isError()) {
-    ++metrics.adding_veth_arp_filters_errors;
-
-    return Failure(
-        "Failed to create an ARP packet filter from " + veth(pid) +
-        " to host " + eth0 + ": " + arpVethToEth0.error());
-  } else if (!arpVethToEth0.get()) {
-    ++metrics.adding_veth_arp_filters_already_exist;
-
-    return Failure(
-        "The ARP packet filter from " + veth(pid) +
-        " to host " + eth0 + " already exists");
-  }
-
-  // Setup filters for ICMP and ARP packets. We mirror ICMP and ARP
-  // packets from host eth0 to veths of all the containers. We also
-  // setup flow classifiers for host eth0 egress.
-  set<string> targets;
-  foreachvalue (Info* info, infos) {
-    if (info->pid.isSome()) {
-      targets.insert(veth(info->pid.get()));
-    }
-  }
-
-  if (targets.size() == 1) {
-    // We just create the first container in which case we should
-    // create filters for ICMP and ARP packets.
-
-    // Create a new ICMP filter on host eth0 ingress for mirroring
-    // packets from host eth0 to veth.
-    Try<bool> icmpEth0ToVeth = filter::icmp::create(
-        eth0,
-        ingress::HANDLE,
-        icmp::Classifier(hostIPNetwork.address()),
-        Priority(ICMP_FILTER_PRIORITY, NORMAL),
-        action::Mirror(targets));
-
-    if (icmpEth0ToVeth.isError()) {
-      ++metrics.adding_eth0_icmp_filters_errors;
-
-      return Failure(
-          "Failed to create an ICMP packet filter from host " + eth0 +
-          " to " + veth(pid) + ": " + icmpEth0ToVeth.error());
-    } else if (!icmpEth0ToVeth.get()) {
-      ++metrics.adding_eth0_icmp_filters_already_exist;
-
-      return Failure(
-          "The ICMP packet filter on host " + eth0 + " already exists");
-    }
-
-    // Create a new ARP filter on host eth0 ingress for mirroring
-    // packets from host eth0 to veth.
-    Try<bool> arpEth0ToVeth = filter::basic::create(
-        eth0,
-        ingress::HANDLE,
-        ETH_P_ARP,
-        Priority(ARP_FILTER_PRIORITY, NORMAL),
-        action::Mirror(targets));
-
-    if (arpEth0ToVeth.isError()) {
-      ++metrics.adding_eth0_arp_filters_errors;
-
-      return Failure(
-          "Failed to create an ARP packet filter from host " + eth0 +
-          " to " + veth(pid) + ": " + arpEth0ToVeth.error());
-    } else if (!arpEth0ToVeth.get()) {
-      ++metrics.adding_eth0_arp_filters_already_exist;
-
-      return Failure(
-          "The ARP packet filter on host " + eth0 + " already exists");
-    }
-
-    if (flags.egress_unique_flow_per_container) {
-      // Create a new ICMP filter on host eth0 egress for classifying
-      // packets into a reserved flow.
-      Try<bool> icmpEth0Egress = filter::icmp::create(
-          eth0,
-          HOST_TX_FQ_CODEL_HANDLE,
-          icmp::Classifier(None()),
-          Priority(ICMP_FILTER_PRIORITY, NORMAL),
-          Handle(HOST_TX_FQ_CODEL_HANDLE, ICMP_FLOWID));
-
-      if (icmpEth0Egress.isError()) {
-        ++metrics.adding_eth0_egress_filters_errors;
-
-        return Failure(
-            "Failed to create the ICMP flow classifier on host " +
-            eth0 + ": " + icmpEth0Egress.error());
-      } else if (!icmpEth0Egress.get()) {
-        ++metrics.adding_eth0_egress_filters_already_exist;
-
-        return Failure(
-            "The ICMP flow classifier on host " + eth0 + " already exists");
-      }
-
-      // Create a new ARP filter on host eth0 egress for classifying
-      // packets into a reserved flow.
-      Try<bool> arpEth0Egress = filter::basic::create(
-          eth0,
-          HOST_TX_FQ_CODEL_HANDLE,
-          ETH_P_ARP,
-          Priority(ARP_FILTER_PRIORITY, NORMAL),
-          Handle(HOST_TX_FQ_CODEL_HANDLE, ARP_FLOWID));
-
-      if (arpEth0Egress.isError()) {
-        ++metrics.adding_eth0_egress_filters_errors;
-
-        return Failure(
-            "Failed to create the ARP flow classifier on host " +
-            eth0 + ": " + arpEth0Egress.error());
-      } else if (!arpEth0Egress.get()) {
-        ++metrics.adding_eth0_egress_filters_already_exist;
-
-        return Failure(
-            "The ARP flow classifier on host " + eth0 + " already exists");
-      }
-
-      // Rest of the host packets go to a reserved flow.
-      Try<bool> defaultEth0Egress = filter::basic::create(
-          eth0,
-          HOST_TX_FQ_CODEL_HANDLE,
-          ETH_P_ALL,
-          Priority(DEFAULT_FILTER_PRIORITY, NORMAL),
-          Handle(HOST_TX_FQ_CODEL_HANDLE, HOST_FLOWID));
-
-      if (defaultEth0Egress.isError()) {
-        ++metrics.adding_eth0_egress_fi

<TRUNCATED>

[06/12] mesos git commit: Relocated MesosContainerizer specific files to the correct location.

Posted by ji...@apache.org.

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/network/port_mapping.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/network/port_mapping.cpp b/src/slave/containerizer/mesos/isolators/network/port_mapping.cpp
new file mode 100644
index 0000000..565f9cc
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/network/port_mapping.cpp
@@ -0,0 +1,3792 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <limits.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <vector>
+
+#include <glog/logging.h>
+
+#include <mesos/mesos.hpp>
+
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/io.hpp>
+#include <process/pid.hpp>
+#include <process/subprocess.hpp>
+
+#include <stout/error.hpp>
+#include <stout/foreach.hpp>
+#include <stout/fs.hpp>
+#include <stout/hashset.hpp>
+#include <stout/json.hpp>
+#include <stout/lambda.hpp>
+#include <stout/mac.hpp>
+#include <stout/multihashmap.hpp>
+#include <stout/numify.hpp>
+#include <stout/os.hpp>
+#include <stout/option.hpp>
+#include <stout/path.hpp>
+#include <stout/protobuf.hpp>
+#include <stout/result.hpp>
+#include <stout/stringify.hpp>
+#include <stout/strings.hpp>
+#include <stout/utils.hpp>
+
+#include <stout/os/exists.hpp>
+#include <stout/os/stat.hpp>
+
+#include "common/status_utils.hpp"
+
+#include "linux/fs.hpp"
+#include "linux/ns.hpp"
+
+#include "linux/routing/route.hpp"
+#include "linux/routing/utils.hpp"
+
+#include "linux/routing/diagnosis/diagnosis.hpp"
+
+#include "linux/routing/filter/basic.hpp"
+#include "linux/routing/filter/icmp.hpp"
+#include "linux/routing/filter/ip.hpp"
+
+#include "linux/routing/handle.hpp"
+
+#include "linux/routing/link/link.hpp"
+
+#include "linux/routing/queueing/fq_codel.hpp"
+#include "linux/routing/queueing/htb.hpp"
+#include "linux/routing/queueing/ingress.hpp"
+#include "linux/routing/queueing/statistics.hpp"
+
+#include "mesos/resources.hpp"
+
+#include "slave/constants.hpp"
+
+#include "slave/containerizer/mesos/isolators/network/port_mapping.hpp"
+
+using namespace mesos::internal;
+
+using namespace process;
+
+using namespace routing;
+using namespace routing::filter;
+using namespace routing::queueing;
+using namespace routing::queueing::statistics;
+
+using std::cerr;
+using std::cout;
+using std::dec;
+using std::endl;
+using std::hex;
+using std::list;
+using std::ostringstream;
+using std::set;
+using std::sort;
+using std::string;
+using std::vector;
+
+using filter::ip::PortRange;
+
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+// An old glibc might not have this symbol.
+#ifndef MNT_DETACH
+#define MNT_DETACH 2
+#endif
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// The minimum number of ephemeral ports a container should have.
+static const uint16_t MIN_EPHEMERAL_PORTS_SIZE = 16;
+
+// Linux traffic control is a combination of queueing disciplines,
+// filters and classes organized as a tree for the ingress (rx) and
+// egress (tx) flows for each interface. Each container provides two
+// networking interfaces, a virtual eth0 and a loopback interface. The
+// flow of packets from the external network to container is shown
+// below:
+//
+//   +----------------------+----------------------+
+//   |                   Container                 |
+//   |----------------------|----------------------|
+//   |       eth0           |          lo          |
+//   +----------------------+----------------------+
+//          ^   |         ^           |
+//      [3] |   | [4]     |           |
+//          |   |     [7] +-----------+ [10]
+//          |   |
+//          |   |     [8] +-----------+ [9]
+//      [2] |   | [5]     |           |
+//          |   v         v           v
+//   +----------------------+----------------------+
+//   |      veth0           |          lo          |
+//   +----------------------|----------------------+
+//   |                     Host                    |
+//   |----------------------|----------------------|
+//   |                    eth0                     |
+//   +----------------------+----------------------|
+//                    ^           |
+//                [1] |           | [6]
+//                    |           v
+//
+// Traffic flowing from outside the network into a container enters
+// the system via the host ingress interface [1] and is routed based
+// on destination port to the outbound interface for the matching
+// container [2], which forwards the packet to the container's inbound
+// virtual interface. Outbound traffic destined for the external
+// network flows along the reverse path [4,5,6]. Loopback traffic is
+// directed to the corresponding Ethernet interface, either [7,10] or
+// [8,9] where the same destination port routing can be applied as to
+// external traffic. We use traffic control filters at several of the
+// interfaces to create these packet paths.
+//
+// Linux provides only a very simple topology for ingress interfaces.
+// A root is provided on a fixed handle (handle::INGRESS_ROOT) under
+// which a single qdisc can be installed, with handle ingress::HANDLE.
+// Traffic control filters can then be attached to the ingress qdisc.
+// We install one or more ingress filters on the host eth0 [1] to
+// direct traffic to the correct container, and on the container
+// virtual eth0 [5] to direct traffic to other containers or out of
+// the box. Since we know the ip port assignments for each container,
+// we can direct traffic directly to the appropriate container.
+// However, for ICMP and ARP traffic where no equivalent to a port
+// exists, we send a copy of the packet to every container and rely on
+// the network stack to drop unexpected packets.
+//
+// We install a Hierarchical Token Bucket (HTB) qdisc and class to
+// limit the outbound traffic bandwidth as the egress qdisc inside the
+// container [4] and then add a fq_codel qdisc to limit head of line
+// blocking on the egress filter. The egress traffic control chain is
+// thus:
+//
+// root device: handle::EGRESS_ROOT ->
+//    htb egress qdisc: CONTAINER_TX_HTB_HANDLE ->
+//        htb rate limiting class: CONTAINER_TX_HTB_CLASS_ID ->
+//            buffer-bloat reduction: FQ_CODEL
+constexpr Handle CONTAINER_TX_HTB_HANDLE = Handle(1, 0);
+constexpr Handle CONTAINER_TX_HTB_CLASS_ID =
+    Handle(CONTAINER_TX_HTB_HANDLE, 1);
+
+
+// Finally we create a second fq_codel qdisc on the public interface
+// of the host [6] to reduce performance interference between
+// containers. We create independent flows for each container, and
+// one for the host, which ensures packets from each container are
+// guaranteed fair access to the host interface. This egress traffic
+// control chain for the host interface is thus:
+//
+// root device: handle::EGRESS_ROOT ->
+//    buffer-bloat reduction: FQ_CODEL
+constexpr Handle HOST_TX_FQ_CODEL_HANDLE = Handle(1, 0);
+
+
+// The primary priority used by each type of filter.
+static const uint8_t ARP_FILTER_PRIORITY = 1;
+static const uint8_t ICMP_FILTER_PRIORITY = 2;
+static const uint8_t IP_FILTER_PRIORITY = 3;
+static const uint8_t DEFAULT_FILTER_PRIORITY = 4;
+
+
+// The secondary priorities used by filters.
+static const uint8_t HIGH = 1;
+static const uint8_t NORMAL = 2;
+static const uint8_t LOW = 3;
+
+
+// We assign a separate flow on host eth0 egress for each container
+// (See MESOS-2422 for details). Host egress traffic is assigned to a
+// reserved flow (HOST_FLOWID). ARP and ICMP traffic from containers
+// are not heavy, so they can share the same flow.
+static const uint16_t HOST_FLOWID = 1;
+static const uint16_t ARP_FLOWID = 2;
+static const uint16_t ICMP_FLOWID = 2;
+static const uint16_t CONTAINER_MIN_FLOWID = 3;
+
+
+// The well known ports, i.e., the half-open interval [0, 1024). Used
+// for sanity check.
+// NOTE(review): The parenthesized 'Bound, Bound' expression below
+// presumably relies on an overloaded comma operator that builds an
+// Interval out of two bounds; confirm against the Interval header
+// before reformatting it.
+static Interval<uint16_t> WELL_KNOWN_PORTS()
+{
+  return (Bound<uint16_t>::closed(0), Bound<uint16_t>::open(1024));
+}
+
+
+/////////////////////////////////////////////////
+// Helper functions for the isolator.
+/////////////////////////////////////////////////
+
+// Returns the largest power of 2 that is less than or equal to 'x'.
+// Returns 0 when 'x' is 0.
+static uint32_t roundDownToPowerOfTwo(uint32_t x)
+{
+  // Smear the most significant set bit into every lower position,
+  // i.e., turn 00001XXX into 00001111. Doubling the shift distance on
+  // each step (1, 2, 4, 8, 16) covers all 32 bits, and the 0s to the
+  // left of the most significant set bit are never turned into 1s.
+  for (uint32_t shift = 1; shift < 32; shift <<= 1) {
+    x |= (x >> shift);
+  }
+
+  // Dropping everything but the highest bit leaves the power of 2:
+  // 00001111 - (00001111 >> 1) == 00001000.
+  return x - (x >> 1);
+}
+
+
+// Builds the name of the host end of the virtual ethernet pair that
+// belongs to a container. Link names are limited by the kernel to 16
+// characters or less, which rules out embedding the container ID.
+// The pid of the executor process forked by the slave is used
+// instead: no two active containers can share the same executor pid,
+// so the resulting device name is unique per container.
+static string veth(pid_t pid)
+{
+  const string suffix = stringify(pid);
+
+  return PORT_MAPPING_VETH_PREFIX() + suffix;
+}
+
+
+// Recovers the pid encoded in the given veth name. Returns None if
+// the name does not start with the veth prefix, or if what follows
+// the prefix cannot be parsed as a pid.
+static Option<pid_t> getPidFromVeth(const string& veth)
+{
+  if (!strings::startsWith(veth, PORT_MAPPING_VETH_PREFIX())) {
+    return None();
+  }
+
+  const string suffix =
+    strings::remove(veth, PORT_MAPPING_VETH_PREFIX(), strings::PREFIX);
+
+  Try<pid_t> pid = numify<pid_t>(suffix);
+  if (pid.isError()) {
+    return None();
+  }
+
+  return pid.get();
+}
+
+
+// Derives the container ID from the symlink that points to the
+// network namespace handle. Returns an error if the given path is
+// not a symlink. The bind mount root and the bind mount symlink root
+// are laid out as follows:
+//  <PORT_MAPPING_BIND_MOUNT_ROOT()>
+//    |--- 3945 (pid)                           <-|
+//                                                |
+//  <PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT()>      |
+//    |--- ecf293e7-e6e8-4cbc-aaee-4d6c958aa276 --|
+//         (symlink: container ID -> pid)
+static Try<ContainerID> getContainerIdFromSymlink(const string& symlink)
+{
+  if (!os::stat::islink(symlink)) {
+    return Error("Not a symlink");
+  }
+
+  // The container ID is the last component of the symlink path.
+  ContainerID containerId;
+  containerId.set_value(Path(symlink).basename());
+
+  return containerId;
+}
+
+
+// Derives the pid from the path of a network namespace handle, whose
+// last path component is expected to be the pid. Returns an error if
+// the handle is a symlink, and None if the last path component is
+// not a number (i.e., the handle is clearly not created by us).
+static Result<pid_t> getPidFromNamespaceHandle(const string& handle)
+{
+  if (os::stat::islink(handle)) {
+    return Error("Not expecting a symlink");
+  }
+
+  Try<pid_t> pid = numify<pid_t>(Path(handle).basename());
+  if (pid.isSome()) {
+    return pid.get();
+  }
+
+  return None();
+}
+
+
+// Derives the pid from the symlink that points to the network
+// namespace handle. Returns an error if the given path is not a
+// symlink or if it cannot be followed, and None if the symlink is
+// dangling.
+static Result<pid_t> getPidFromSymlink(const string& symlink)
+{
+  if (!os::stat::islink(symlink)) {
+    return Error("Not a symlink");
+  }
+
+  Result<string> target = os::realpath(symlink);
+
+  if (target.isSome()) {
+    return getPidFromNamespaceHandle(target.get());
+  }
+
+  if (target.isError()) {
+    return Error("Failed to follow the symlink: " + target.error());
+  }
+
+  // Neither a target nor an error: this is a dangling symlink.
+  return None();
+}
+
+
+// Returns the path, under the bind mount symlink root, of the
+// symlink named after the given container ID.
+static string getSymlinkPath(const ContainerID& containerId)
+{
+  const string name = stringify(containerId);
+
+  return path::join(PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT(), name);
+}
+
+
+// Returns the path, under the bind mount root, of the network
+// namespace handle named after the given pid.
+static string getNamespaceHandlePath(pid_t pid)
+{
+  const string name = stringify(pid);
+
+  return path::join(PORT_MAPPING_BIND_MOUNT_ROOT(), name);
+}
+
+
+// Builds an interval set of ports out of the given value ranges.
+// Each range contributes the closed interval [begin, end].
+static IntervalSet<uint16_t> getIntervalSet(const Value::Ranges& ranges)
+{
+  IntervalSet<uint16_t> ports;
+
+  foreach (const Value::Range& range, ranges.range()) {
+    ports += (Bound<uint16_t>::closed(range.begin()),
+              Bound<uint16_t>::closed(range.end()));
+  }
+
+  return ports;
+}
+
+/////////////////////////////////////////////////
+// Implementation for PortMappingUpdate.
+/////////////////////////////////////////////////
+
+const char* PortMappingUpdate::NAME = "update";
+
+
+// Registers the command line flags understood by the 'update'
+// operation: the names of the public and loopback interfaces, the
+// pid whose namespaces will be entered, and the port ranges (as JSON
+// objects) for which IP filters should be added and/or removed. None
+// of the flags is given a default value here.
+PortMappingUpdate::Flags::Flags()
+{
+  add(&eth0_name,
+      "eth0_name",
+      "The name of the public network interface (e.g., eth0)");
+
+  add(&lo_name,
+      "lo_name",
+      "The name of the loopback network interface (e.g., lo)");
+
+  add(&pid,
+      "pid",
+      "The pid of the process whose namespaces we will enter");
+
+  // The JSON format of the two flags below matches what the 'json'
+  // helper in this file produces and what 'parse' consumes.
+  add(&ports_to_add,
+      "ports_to_add",
+      "A collection of port ranges (formatted as a JSON object)\n"
+      "for which to add IP filters. E.g.,\n"
+      "--ports_to_add={\"range\":[{\"begin\":4,\"end\":8}]}");
+
+  add(&ports_to_remove,
+      "ports_to_remove",
+      "A collection of port ranges (formatted as a JSON object)\n"
+      "for which to remove IP filters. E.g.,\n"
+      "--ports_to_remove={\"range\":[{\"begin\":4,\"end\":8}]}");
+}
+
+
+// The following two helper functions allow us to convert from a
+// collection of port ranges to a JSON object and vice versa. They
+// will be used for the port mapping update operation.
+//
+// This one serializes any iterable collection of port ranges: each
+// port range becomes one entry of a Value::Ranges protobuf, which is
+// then rendered as a JSON object.
+template <typename Iterable>
+JSON::Object json(const Iterable& ranges)
+{
+  Value::Ranges values;
+
+  foreach (const PortRange& range, ranges) {
+    // Fill in the newly appended entry in place.
+    Value::Range* value = values.add_range();
+    value->set_begin(range.begin());
+    value->set_end(range.end());
+  }
+
+  return JSON::Protobuf(values);
+}
+
+
+// Converts a JSON object (formatted like the output of 'json') back
+// into a list of port ranges. Returns an error if the JSON object
+// cannot be parsed as a Value::Ranges protobuf, or if any of the
+// ranges is not a valid port range.
+static Try<vector<PortRange>> parse(const JSON::Object& object)
+{
+  Try<Value::Ranges> parsing = protobuf::parse<Value::Ranges>(object);
+  if (parsing.isError()) {
+    return Error("Failed to parse JSON: " + parsing.error());
+  }
+
+  // The largest valid port number (ports are 16-bit values).
+  const uint64_t MAX_PORT = 65535;
+
+  // Bind by reference to avoid copying the parsed protobuf.
+  const Value::Ranges& values = parsing.get();
+
+  vector<PortRange> ranges;
+  for (int i = 0; i < values.range_size(); i++) {
+    const Value::Range& value = values.range(i);
+
+    // Reject out-of-range values explicitly instead of letting them
+    // be silently narrowed into a different (but valid looking) port.
+    // NOTE(review): This assumes that the begin/end fields of
+    // Value::Range are wider than the 16-bit ports PortRange deals
+    // with; confirm against the protobuf definition.
+    if (value.begin() > MAX_PORT || value.end() > MAX_PORT) {
+      return Error(
+          "Invalid port range: [" + stringify(value.begin()) + "," +
+          stringify(value.end()) + "] exceeds the maximum port number");
+    }
+
+    Try<PortRange> range = PortRange::fromBeginEnd(value.begin(), value.end());
+    if (range.isError()) {
+      return Error("Invalid port range: " + range.error());
+    }
+
+    ranges.push_back(range.get());
+  }
+
+  return ranges;
+}
+
+
+// Installs, inside the container, the two IP packet filters needed
+// for the given port range: a 'terminal' filter on 'lo' and a
+// redirect filter (for the loopback IP) from 'eth0' to 'lo'. Returns
+// an error if either filter cannot be created or already exists.
+static Try<Nothing> addContainerIPFilters(
+    const PortRange& range,
+    const string& eth0,
+    const string& lo)
+{
+  // The filter on lo makes sure that local traffic inside a container
+  // will not be redirected to eth0.
+  Try<bool> terminal = filter::ip::create(
+      lo,
+      ingress::HANDLE,
+      ip::Classifier(None(), None(), None(), range),
+      Priority(IP_FILTER_PRIORITY, HIGH),
+      action::Terminal());
+
+  if (terminal.isError()) {
+    return Error(
+        "Failed to create an IP packet filter on " + lo +
+        " which stops packets from being sent to " + eth0 +
+        ": " + terminal.error());
+  }
+
+  if (!terminal.get()) {
+    return Error(
+        "The IP packet filter on " + lo +
+        " which stops packets from being sent to " +
+        eth0 + " already exists");
+  }
+
+  // The filter on eth0 matches the loopback IP and redirects all
+  // loopback IP traffic to lo.
+  const ip::Classifier loopback(
+      None(),
+      net::IPNetwork::LOOPBACK_V4().address(),
+      None(),
+      range);
+
+  Try<bool> redirect = filter::ip::create(
+      eth0,
+      ingress::HANDLE,
+      loopback,
+      Priority(IP_FILTER_PRIORITY, NORMAL),
+      action::Redirect(lo));
+
+  if (redirect.isError()) {
+    return Error(
+        "Failed to create an IP packet filter (for loopback IP) from " +
+        eth0 + " to " + lo + ": " + redirect.error());
+  }
+
+  if (!redirect.get()) {
+    return Error(
+        "The IP packet filter (for loopback IP) from " +
+        eth0 + " to " + lo + " already exists");
+  }
+
+  return Nothing();
+}
+
+
+// Removes, inside the container, the two IP packet filters that were
+// installed for the given port range: the 'terminal' filter on 'lo'
+// and the redirect filter (for the loopback IP) from 'eth0' to 'lo'.
+// Returns an error if either filter cannot be removed or does not
+// exist.
+static Try<Nothing> removeContainerIPFilters(
+    const PortRange& range,
+    const string& eth0,
+    const string& lo)
+{
+  // First, the 'terminal' IP packet filter on lo.
+  Try<bool> terminal = filter::ip::remove(
+      lo,
+      ingress::HANDLE,
+      ip::Classifier(None(), None(), None(), range));
+
+  if (terminal.isError()) {
+    return Error(
+        "Failed to remove the IP packet filter on " + lo +
+        " which stops packets from being sent to " + eth0 +
+        ": " + terminal.error());
+  }
+
+  if (!terminal.get()) {
+    return Error(
+        "The IP packet filter on " + lo +
+        " which stops packets from being sent to " + eth0 +
+        " does not exist");
+  }
+
+  // Second, the IP packet filter (for loopback IP) from eth0 to lo.
+  const ip::Classifier loopback(
+      None(),
+      net::IPNetwork::LOOPBACK_V4().address(),
+      None(),
+      range);
+
+  Try<bool> redirect = filter::ip::remove(
+      eth0,
+      ingress::HANDLE,
+      loopback);
+
+  if (redirect.isError()) {
+    return Error(
+        "Failed to remove the IP packet filter (for loopback IP) from " +
+        eth0 + " to " + lo + ": " + redirect.error());
+  }
+
+  if (!redirect.get()) {
+    return Error(
+        "The IP packet filter (for loopback IP) from " +
+        eth0 + " to " + lo + " does not exist");
+  }
+
+  return Nothing();
+}
+
+
+// Entry point of the 'update' operation: validates the flags, parses
+// the port ranges to add and to remove, enters the network namespace
+// of the given pid, and then adds and removes the container IP
+// filters accordingly. Returns 0 on success (or when only the usage
+// is requested) and 1 on any failure, after printing the reason to
+// stderr.
+int PortMappingUpdate::execute()
+{
+  if (flags.help) {
+    cerr << "Usage: " << name() << " [OPTIONS]" << endl << endl
+         << "Supported options:" << endl
+         << flags.usage();
+    return 0;
+  }
+
+  // Both interface names and the pid are mandatory.
+  if (flags.eth0_name.isNone()) {
+    cerr << "The public interface name (e.g., eth0) is not specified" << endl;
+    return 1;
+  }
+
+  if (flags.lo_name.isNone()) {
+    cerr << "The loopback interface name (e.g., lo) is not specified" << endl;
+    return 1;
+  }
+
+  if (flags.pid.isNone()) {
+    cerr << "The pid is not specified" << endl;
+    return 1;
+  }
+
+  // At least one of the two port collections has to be given.
+  const bool hasAdditions = flags.ports_to_add.isSome();
+  const bool hasRemovals = flags.ports_to_remove.isSome();
+
+  if (!hasAdditions && !hasRemovals) {
+    cerr << "Nothing to update" << endl;
+    return 1;
+  }
+
+  // A collection that is not specified simply stays empty.
+  vector<PortRange> additions;
+  vector<PortRange> removals;
+
+  if (hasAdditions) {
+    Try<vector<PortRange>> parsed = parse(flags.ports_to_add.get());
+    if (parsed.isError()) {
+      cerr << "Parsing 'ports_to_add' failed: " << parsed.error() << endl;
+      return 1;
+    }
+
+    additions = parsed.get();
+  }
+
+  if (hasRemovals) {
+    Try<vector<PortRange>> parsed = parse(flags.ports_to_remove.get());
+    if (parsed.isError()) {
+      cerr << "Parsing 'ports_to_remove' failed: " << parsed.error() << endl;
+      return 1;
+    }
+
+    removals = parsed.get();
+  }
+
+  // The filters live inside the container, so enter its network
+  // namespace before touching them.
+  Try<Nothing> entered = ns::setns(flags.pid.get(), "net");
+  if (entered.isError()) {
+    cerr << "Failed to enter the network namespace of pid " << flags.pid.get()
+         << ": " << entered.error() << endl;
+    return 1;
+  }
+
+  const string eth0 = flags.eth0_name.get();
+  const string lo = flags.lo_name.get();
+
+  // Additions are processed before removals.
+  foreach (const PortRange& range, additions) {
+    Try<Nothing> added = addContainerIPFilters(range, eth0, lo);
+    if (added.isError()) {
+      cerr << "Failed to add IP filters: " << added.error() << endl;
+      return 1;
+    }
+  }
+
+  foreach (const PortRange& range, removals) {
+    Try<Nothing> removed = removeContainerIPFilters(range, eth0, lo);
+    if (removed.isError()) {
+      cerr << "Failed to remove IP filters: " << removed.error() << endl;
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+/////////////////////////////////////////////////
+// Implementation for PortMappingStatistics.
+/////////////////////////////////////////////////
+
+const char* PortMappingStatistics::NAME = "statistics";
+
+
+// Registers the command line flags understood by the 'statistics'
+// operation: the name of the public interface, the pid whose
+// namespaces will be entered, and two switches (both defaulting to
+// false) that control which socket statistics are collected.
+PortMappingStatistics::Flags::Flags()
+{
+  add(&eth0_name,
+      "eth0_name",
+      "The name of the public network interface (e.g., eth0)");
+
+  add(&pid,
+      "pid",
+      "The pid of the process whose namespaces we will enter");
+
+  // NOTE(review): Unlike the other help strings, this one ends with a
+  // newline rather than a period; confirm whether that is intended.
+  add(&enable_socket_statistics_summary,
+      "enable_socket_statistics_summary",
+      "Whether to collect socket statistics summary for this container\n",
+      false);
+
+  add(&enable_socket_statistics_details,
+      "enable_socket_statistics_details",
+      "Whether to collect socket statistics details (e.g., TCP RTT)\n"
+      "for this container.",
+      false);
+}
+
+
+// Appends a new traffic control statistics entry, tagged with the
+// given id, to the ResourceStatistics protocol buffer and fills it
+// in from the statistics hashmap. Only the counters that are present
+// in the hashmap get set on the entry.
+static void addTrafficControlStatistics(
+    const string& id,
+    const hashmap<string, uint64_t>& statistics,
+    ResourceStatistics* result)
+{
+  TrafficControlStatistics* entry =
+    result->add_net_traffic_control_statistics();
+
+  entry->set_id(id);
+
+  // TODO(pbrett) Use protobuf reflection here.
+  if (statistics.contains(BACKLOG)) {
+    entry->set_backlog(statistics.at(BACKLOG));
+  }
+
+  if (statistics.contains(BYTES)) {
+    entry->set_bytes(statistics.at(BYTES));
+  }
+
+  if (statistics.contains(DROPS)) {
+    entry->set_drops(statistics.at(DROPS));
+  }
+
+  if (statistics.contains(OVERLIMITS)) {
+    entry->set_overlimits(statistics.at(OVERLIMITS));
+  }
+
+  if (statistics.contains(PACKETS)) {
+    entry->set_packets(statistics.at(PACKETS));
+  }
+
+  if (statistics.contains(QLEN)) {
+    entry->set_qlen(statistics.at(QLEN));
+  }
+
+  if (statistics.contains(RATE_BPS)) {
+    entry->set_ratebps(statistics.at(RATE_BPS));
+  }
+
+  if (statistics.contains(RATE_PPS)) {
+    entry->set_ratepps(statistics.at(RATE_PPS));
+  }
+
+  if (statistics.contains(REQUEUES)) {
+    entry->set_requeues(statistics.at(REQUEUES));
+  }
+}
+
+
+// Entry point of the 'statistics' operation: validates the flags,
+// enters the network namespace of the given pid, optionally collects
+// the socket statistics summary (TCP connection counts) and details
+// (TCP RTT percentiles), collects the traffic control statistics of
+// the given interface, and finally prints the result to stdout as
+// JSON. Returns 0 on success (or when only the usage is requested)
+// and 1 if the flags are incomplete, the namespace cannot be
+// entered, or the socket statistics cannot be retrieved. Failures to
+// get the traffic control statistics are reported on stderr but do
+// not fail the operation.
+int PortMappingStatistics::execute()
+{
+  if (flags.help) {
+    cerr << "Usage: " << name() << " [OPTIONS]" << endl << endl
+         << "Supported options:" << endl
+         << flags.usage();
+    return 0;
+  }
+
+  if (flags.pid.isNone()) {
+    cerr << "The pid is not specified" << endl;
+    return 1;
+  }
+
+  if (flags.eth0_name.isNone()) {
+    cerr << "The public interface name (e.g., eth0) is not specified" << endl;
+    return 1;
+  }
+
+  // Enter the network namespace.
+  Try<Nothing> setns = ns::setns(flags.pid.get(), "net");
+  if (setns.isError()) {
+    // This could happen if the executor exits before this function is
+    // invoked. We do not log here to avoid spurious logging.
+    return 1;
+  }
+
+  ResourceStatistics result;
+
+  // NOTE: We use a dummy value here since this field will be cleared
+  // before the result is sent to the containerizer.
+  result.set_timestamp(0);
+
+  if (flags.enable_socket_statistics_summary) {
+    // Collections for socket statistics summary are below.
+
+    // For TCP, get the number of ACTIVE and TIME_WAIT connections,
+    // from reading /proc/net/sockstat (/proc/net/sockstat6 for IPV6).
+    // This is not as expensive in the kernel because only counter
+    // values are accessed instead of a dump of all the sockets.
+    // Example output:
+
+    // $ cat /proc/net/sockstat
+    // sockets: used 1391
+    // TCP: inuse 33 orphan 0 tw 0 alloc 37 mem 6
+    // UDP: inuse 15 mem 7
+    // UDPLITE: inuse 0
+    // RAW: inuse 0
+    // FRAG: inuse 0 memory 0
+
+    Try<string> value = os::read("/proc/net/sockstat");
+    if (value.isError()) {
+      cerr << "Failed to read /proc/net/sockstat: " << value.error() << endl;
+      return 1;
+    }
+
+    foreach (const string& line, strings::tokenize(value.get(), "\n")) {
+      // Only the TCP line is of interest.
+      if (!strings::startsWith(line, "TCP")) {
+        continue;
+      }
+
+      // Each counter is a name token followed by its value token.
+      vector<string> tokens = strings::tokenize(line, " ");
+      for (size_t i = 0; i < tokens.size(); i++) {
+        if (tokens[i] == "inuse") {
+          if (i + 1 >= tokens.size()) {
+            cerr << "Unexpected output from /proc/net/sockstat" << endl;
+            // Be a bit forgiving here since the /proc file output
+            // format can change, though not very likely.
+            continue;
+          }
+
+          // Set number of active TCP connections.
+          Try<size_t> inuse = numify<size_t>(tokens[i+1]);
+          if (inuse.isError()) {
+            cerr << "Failed to parse the number of tcp connections in use: "
+                 << inuse.error() << endl;
+            continue;
+          }
+
+          result.set_net_tcp_active_connections(inuse.get());
+        } else if (tokens[i] == "tw") {
+          if (i + 1 >= tokens.size()) {
+            cerr << "Unexpected output from /proc/net/sockstat" << endl;
+            // Be a bit forgiving here since the /proc file output
+            // format can change, though not very likely.
+            continue;
+          }
+
+          // Set number of TIME_WAIT TCP connections.
+          Try<size_t> tw = numify<size_t>(tokens[i+1]);
+          if (tw.isError()) {
+            cerr << "Failed to parse the number of tcp connections in"
+                 << " TIME_WAIT: " << tw.error() << endl;
+            continue;
+          }
+
+          result.set_net_tcp_time_wait_connections(tw.get());
+        }
+      }
+    }
+  }
+
+  if (flags.enable_socket_statistics_details) {
+    // Collections for socket statistics details are below.
+
+    // NOTE: If the underlying library uses the older version of
+    // kernel API, the family argument passed in may not be honored.
+    Try<vector<diagnosis::socket::Info>> infos =
+      diagnosis::socket::infos(AF_INET, diagnosis::socket::state::ALL);
+
+    if (infos.isError()) {
+      // Include the underlying error so that the failure can be
+      // diagnosed from the output of this helper alone.
+      cerr << "Failed to retrieve the socket information: "
+           << infos.error() << endl;
+      return 1;
+    }
+
+    vector<uint32_t> RTTs;
+    foreach (const diagnosis::socket::Info& info, infos.get()) {
+      // We double check on family regardless.
+      if (info.family != AF_INET) {
+        continue;
+      }
+
+      // We consider all sockets that have non-zero rtt value.
+      if (info.tcpInfo.isSome() && info.tcpInfo.get().tcpi_rtt != 0) {
+        RTTs.push_back(info.tcpInfo.get().tcpi_rtt);
+      }
+    }
+
+    // Only report the percentiles when we have results.
+    if (!RTTs.empty()) {
+      std::sort(RTTs.begin(), RTTs.end());
+
+      // NOTE: The size of RTTs is usually within 1 million so we
+      // don't need to worry about overflow here. Since the
+      // percentages are all below 100, each index below is strictly
+      // less than the size of RTTs.
+      // TODO(jieyu): Right now, we choose to use "Nearest rank" for
+      // simplicity. Consider directly using the Statistics abstraction
+      // which computes "Linear interpolation between closest ranks".
+      // http://en.wikipedia.org/wiki/Percentile
+      size_t p50 = RTTs.size() * 50 / 100;
+      size_t p90 = RTTs.size() * 90 / 100;
+      size_t p95 = RTTs.size() * 95 / 100;
+      size_t p99 = RTTs.size() * 99 / 100;
+
+      result.set_net_tcp_rtt_microsecs_p50(RTTs[p50]);
+      result.set_net_tcp_rtt_microsecs_p90(RTTs[p90]);
+      result.set_net_tcp_rtt_microsecs_p95(RTTs[p95]);
+      result.set_net_tcp_rtt_microsecs_p99(RTTs[p99]);
+    }
+  }
+
+  // Collect traffic statistics for the container from the container
+  // virtual interface and export them in JSON.
+  const string& eth0 = flags.eth0_name.get();
+
+  // Overlimits are reported on the HTB qdisc at the egress root.
+  Result<hashmap<string, uint64_t>> statistics =
+    htb::statistics(eth0, EGRESS_ROOT);
+
+  if (statistics.isSome()) {
+    addTrafficControlStatistics(
+        NET_ISOLATOR_BW_LIMIT,
+        statistics.get(),
+        &result);
+  } else if (statistics.isNone()) {
+    // Traffic control statistics are only available when the
+    // container is created on a slave when the egress rate limit is
+    // on (i.e., egress_rate_limit_per_container flag is set). We
+    // can't just test for that flag here however, since the slave may
+    // have been restarted with different flags since the container
+    // was created. It is also possible that isolator statistics are
+    // unavailable because the container is in the process of being
+    // created or destroyed. Hence we do not report a lack of network
+    // statistics as an error.
+  } else if (statistics.isError()) {
+    cerr << "Failed to get htb qdisc statistics on " << eth0
+         << " in namespace " << flags.pid.get()
+         << ": " << statistics.error() << endl;
+  }
+
+  // Drops due to the bandwidth limit should be reported at the leaf.
+  statistics = fq_codel::statistics(eth0, CONTAINER_TX_HTB_CLASS_ID);
+  if (statistics.isSome()) {
+    addTrafficControlStatistics(
+        NET_ISOLATOR_BLOAT_REDUCTION,
+        statistics.get(),
+        &result);
+  } else if (statistics.isNone()) {
+    // A lack of statistics is not an error here either, for the same
+    // reasons as for the htb qdisc statistics.
+  } else if (statistics.isError()) {
+    cerr << "Failed to get fq_codel qdisc statistics on " << eth0
+         << " in namespace " << flags.pid.get()
+         << ": " << statistics.error() << endl;
+  }
+
+  cout << stringify(JSON::Protobuf(result));
+  return 0;
+}
+
+
+/////////////////////////////////////////////////
+// Implementation for the isolator.
+/////////////////////////////////////////////////
+
+PortMappingIsolatorProcess::Metrics::Metrics()
+  : adding_eth0_ip_filters_errors(
+        "port_mapping/adding_eth0_ip_filters_errors"),
+    adding_eth0_ip_filters_already_exist(
+        "port_mapping/adding_eth0_ip_filters_already_exist"),
+    adding_eth0_egress_filters_errors(
+        "port_mapping/adding_eth0_egress_filters_errors"),
+    adding_eth0_egress_filters_already_exist(
+        "port_mapping/adding_eth0_egress_filters_already_exist"),
+    adding_lo_ip_filters_errors(
+        "port_mapping/adding_lo_ip_filters_errors"),
+    adding_lo_ip_filters_already_exist(
+        "port_mapping/adding_lo_ip_filters_already_exist"),
+    adding_veth_ip_filters_errors(
+        "port_mapping/adding_veth_ip_filters_errors"),
+    adding_veth_ip_filters_already_exist(
+        "port_mapping/adding_veth_ip_filters_already_exist"),
+    adding_veth_icmp_filters_errors(
+        "port_mapping/adding_veth_icmp_filters_errors"),
+    adding_veth_icmp_filters_already_exist(
+        "port_mapping/adding_veth_icmp_filters_already_exist"),
+    adding_veth_arp_filters_errors(
+        "port_mapping/adding_veth_arp_filters_errors"),
+    adding_veth_arp_filters_already_exist(
+        "port_mapping/adding_veth_arp_filters_already_exist"),
+    adding_eth0_icmp_filters_errors(
+        "port_mapping/adding_eth0_icmp_filters_errors"),
+    adding_eth0_icmp_filters_already_exist(
+        "port_mapping/adding_eth0_icmp_filters_already_exist"),
+    adding_eth0_arp_filters_errors(
+        "port_mapping/adding_eth0_arp_filters_errors"),
+    adding_eth0_arp_filters_already_exist(
+        "port_mapping/adding_eth0_arp_filters_already_exist"),
+    removing_eth0_ip_filters_errors(
+        "port_mapping/removing_eth0_ip_filters_errors"),
+    removing_eth0_ip_filters_do_not_exist(
+        "port_mapping/removing_eth0_ip_filters_do_not_exist"),
+    removing_eth0_egress_filters_errors(
+        "port_mapping/removing_eth0_egress_filters_errors"),
+    removing_eth0_egress_filters_do_not_exist(
+        "port_mapping/removinging_eth0_egress_filters_do_not_exist"),
+    removing_lo_ip_filters_errors(
+        "port_mapping/removing_lo_ip_filters_errors"),
+    removing_lo_ip_filters_do_not_exist(
+        "port_mapping/removing_lo_ip_filters_do_not_exist"),
+    removing_veth_ip_filters_errors(
+        "port_mapping/removing_veth_ip_filters_errors"),
+    removing_veth_ip_filters_do_not_exist(
+        "port_mapping/removing_veth_ip_filters_do_not_exist"),
+    removing_eth0_icmp_filters_errors(
+        "port_mapping/removing_eth0_icmp_filters_errors"),
+    removing_eth0_icmp_filters_do_not_exist(
+        "port_mapping/removing_eth0_icmp_filters_do_not_exist"),
+    removing_eth0_arp_filters_errors(
+        "port_mapping/removing_eth0_arp_filters_errors"),
+    removing_eth0_arp_filters_do_not_exist(
+        "port_mapping/removing_eth0_arp_filters_do_not_exist"),
+    updating_eth0_icmp_filters_errors(
+        "port_mapping/updating_eth0_icmp_filters_errors"),
+    updating_eth0_icmp_filters_already_exist(
+        "port_mapping/updating_eth0_icmp_filters_already_exist"),
+    updating_eth0_icmp_filters_do_not_exist(
+        "port_mapping/updating_eth0_icmp_filters_do_not_exist"),
+    updating_eth0_arp_filters_errors(
+        "port_mapping/updating_eth0_arp_filters_errors"),
+    updating_eth0_arp_filters_already_exist(
+        "port_mapping/updating_eth0_arp_filters_already_exist"),
+    updating_eth0_arp_filters_do_not_exist(
+        "port_mapping/updating_eth0_arp_filters_do_not_exist"),
+    updating_container_ip_filters_errors(
+        "port_mapping/updating_container_ip_filters_errors")
+{
+  process::metrics::add(adding_eth0_ip_filters_errors);
+  process::metrics::add(adding_eth0_ip_filters_already_exist);
+  process::metrics::add(adding_lo_ip_filters_errors);
+  process::metrics::add(adding_lo_ip_filters_already_exist);
+  process::metrics::add(adding_veth_ip_filters_errors);
+  process::metrics::add(adding_veth_ip_filters_already_exist);
+  process::metrics::add(adding_veth_icmp_filters_errors);
+  process::metrics::add(adding_veth_icmp_filters_already_exist);
+  process::metrics::add(adding_veth_arp_filters_errors);
+  process::metrics::add(adding_veth_arp_filters_already_exist);
+  process::metrics::add(adding_eth0_icmp_filters_errors);
+  process::metrics::add(adding_eth0_icmp_filters_already_exist);
+  process::metrics::add(adding_eth0_arp_filters_errors);
+  process::metrics::add(adding_eth0_arp_filters_already_exist);
+  process::metrics::add(removing_eth0_ip_filters_errors);
+  process::metrics::add(removing_eth0_ip_filters_do_not_exist);
+  process::metrics::add(removing_lo_ip_filters_errors);
+  process::metrics::add(removing_lo_ip_filters_do_not_exist);
+  process::metrics::add(removing_veth_ip_filters_errors);
+  process::metrics::add(removing_veth_ip_filters_do_not_exist);
+  process::metrics::add(removing_eth0_icmp_filters_errors);
+  process::metrics::add(removing_eth0_icmp_filters_do_not_exist);
+  process::metrics::add(removing_eth0_arp_filters_errors);
+  process::metrics::add(removing_eth0_arp_filters_do_not_exist);
+  process::metrics::add(updating_eth0_icmp_filters_errors);
+  process::metrics::add(updating_eth0_icmp_filters_already_exist);
+  process::metrics::add(updating_eth0_icmp_filters_do_not_exist);
+  process::metrics::add(updating_eth0_arp_filters_errors);
+  process::metrics::add(updating_eth0_arp_filters_already_exist);
+  process::metrics::add(updating_eth0_arp_filters_do_not_exist);
+  process::metrics::add(updating_container_ip_filters_errors);
+}
+
+
+PortMappingIsolatorProcess::Metrics::~Metrics()
+{
+  // Unregister every counter that the constructor registered with
+  // 'process::metrics::add', in the same order.
+  //
+  // NOTE(review): The '*_eth0_egress_filters_*' counters are
+  // initialized by the constructor but are neither added there nor
+  // removed here. Confirm whether leaving them unregistered is
+  // intentional.
+  using process::metrics::remove;
+
+  // Counters for adding filters.
+  remove(adding_eth0_ip_filters_errors);
+  remove(adding_eth0_ip_filters_already_exist);
+  remove(adding_lo_ip_filters_errors);
+  remove(adding_lo_ip_filters_already_exist);
+  remove(adding_veth_ip_filters_errors);
+  remove(adding_veth_ip_filters_already_exist);
+  remove(adding_veth_icmp_filters_errors);
+  remove(adding_veth_icmp_filters_already_exist);
+  remove(adding_veth_arp_filters_errors);
+  remove(adding_veth_arp_filters_already_exist);
+  remove(adding_eth0_icmp_filters_errors);
+  remove(adding_eth0_icmp_filters_already_exist);
+  remove(adding_eth0_arp_filters_errors);
+  remove(adding_eth0_arp_filters_already_exist);
+
+  // Counters for removing filters.
+  remove(removing_eth0_ip_filters_errors);
+  remove(removing_eth0_ip_filters_do_not_exist);
+  remove(removing_lo_ip_filters_errors);
+  remove(removing_lo_ip_filters_do_not_exist);
+  remove(removing_veth_ip_filters_errors);
+  remove(removing_veth_ip_filters_do_not_exist);
+  remove(removing_eth0_icmp_filters_errors);
+  remove(removing_eth0_icmp_filters_do_not_exist);
+  remove(removing_eth0_arp_filters_errors);
+  remove(removing_eth0_arp_filters_do_not_exist);
+
+  // Counters for updating filters.
+  remove(updating_eth0_icmp_filters_errors);
+  remove(updating_eth0_icmp_filters_already_exist);
+  remove(updating_eth0_icmp_filters_do_not_exist);
+  remove(updating_eth0_arp_filters_errors);
+  remove(updating_eth0_arp_filters_already_exist);
+  remove(updating_eth0_arp_filters_do_not_exist);
+  remove(updating_container_ip_filters_errors);
+}
+
+
+// Creates the port mapping network isolator.
+//
+// Validates the environment (root permission, network namespace
+// support, routing library, 'tc' and 'ip' commands) and the relevant
+// flags (resources, ephemeral ports, interface names, egress rate
+// limit), then prepares the host: it sets the MAC and MTU of the
+// loopback interface, creates the queueing disciplines, writes a
+// number of '/proc/sys/net/ipv4/conf' settings and sets up the bind
+// mount root and symlink root directories.
+//
+// Returns the isolator on success, or an Error describing the first
+// check or host operation that failed.
+//
+// NOTE: This function uses blocking calls (e.g., os::shell) because
+// it is only expected to be invoked during initialization.
+Try<Isolator*> PortMappingIsolatorProcess::create(const Flags& flags)
+{
+  // Check for root permission.
+  if (geteuid() != 0) {
+    return Error("Using network isolator requires root permissions");
+  }
+
+  // Verify that the network namespace is available by checking the
+  // existence of the network namespace handle of the current process.
+  if (ns::namespaces().count("net") == 0) {
+    return Error(
+        "Using network isolator requires network namespace. "
+        "Make sure your kernel is newer than 3.4");
+  }
+
+  // Check the routing library.
+  Try<Nothing> check = routing::check();
+  if (check.isError()) {
+    return Error(
+        "Routing library check failed: " +
+        check.error());
+  }
+
+  // Check the availability of a few Linux commands that we will use.
+  // We use the blocking os::shell here because 'create' will only be
+  // invoked during initialization.
+  Try<string> checkCommandTc = os::shell("tc filter show");
+  if (checkCommandTc.isError()) {
+    return Error("Check command 'tc' failed: " + checkCommandTc.error());
+  }
+
+  Try<string> checkCommandIp = os::shell("ip link show");
+  if (checkCommandIp.isError()) {
+    return Error("Check command 'ip' failed: " + checkCommandIp.error());
+  }
+
+  Try<Resources> resources = Resources::parse(
+      flags.resources.getOrElse(""),
+      flags.default_role);
+
+  if (resources.isError()) {
+    return Error("Failed to parse --resources: " + resources.error());
+  }
+
+  // Get 'ports' resource from 'resources' flag. These ports will be
+  // treated as non-ephemeral ports.
+  IntervalSet<uint16_t> nonEphemeralPorts;
+  if (resources.get().ports().isSome()) {
+    nonEphemeralPorts = getIntervalSet(resources.get().ports().get());
+  }
+
+  // Get 'ephemeral_ports' resource from 'resources' flag. These ports
+  // will be allocated to each container as ephemeral ports.
+  IntervalSet<uint16_t> ephemeralPorts;
+  if (resources.get().ephemeral_ports().isSome()) {
+    ephemeralPorts = getIntervalSet(resources.get().ephemeral_ports().get());
+  }
+
+  // Each container requires at least one ephemeral port for slave
+  // executor communication. If no 'ephemeral_ports' resource is
+  // found, we will return error.
+  if (ephemeralPorts.empty()) {
+    return Error("Ephemeral ports are not specified");
+  }
+
+  // Sanity check to make sure that the ephemeral ports specified do
+  // not intersect with the specified non-ephemeral ports.
+  if (ephemeralPorts.intersects(nonEphemeralPorts)) {
+    return Error(
+        "The specified ephemeral ports " + stringify(ephemeralPorts) +
+        " intersect with the specified non-ephemeral ports " +
+        stringify(nonEphemeralPorts));
+  }
+
+  // This is a sanity check to make sure that the ephemeral ports
+  // specified do not intersect with the well known ports.
+  if (ephemeralPorts.intersects(WELL_KNOWN_PORTS())) {
+    return Error(
+        "The specified ephemeral ports " + stringify(ephemeralPorts) +
+        " intersect with well known ports " + stringify(WELL_KNOWN_PORTS()));
+  }
+
+  // Obtain the host ephemeral port range by reading the proc file
+  // system ('ip_local_port_range').
+  Try<string> value = os::read("/proc/sys/net/ipv4/ip_local_port_range");
+  if (value.isError()) {
+    return Error("Failed to read host ip_local_port_range: " + value.error());
+  }
+
+  vector<string> split = strings::split(strings::trim(value.get()), "\t");
+  if (split.size() != 2) {
+    return Error(
+        "Unexpected format from host ip_local_port_range: " + value.get());
+  }
+
+  Try<uint16_t> begin = numify<uint16_t>(split[0]);
+  if (begin.isError()) {
+    return Error(
+        "Failed to parse the begin of host ip_local_port_range: " + split[0]);
+  }
+
+  Try<uint16_t> end = numify<uint16_t>(split[1]);
+  if (end.isError()) {
+    return Error(
+        "Failed to parse the end of host ip_local_port_range: " + split[1]);
+  }
+
+  Interval<uint16_t> hostEphemeralPorts =
+    (Bound<uint16_t>::closed(begin.get()),
+     Bound<uint16_t>::closed(end.get()));
+
+  // Sanity check to make sure the specified ephemeral ports do not
+  // intersect with the ephemeral ports used by the host.
+  if (ephemeralPorts.intersects(hostEphemeralPorts)) {
+    return Error(
+        "The specified ephemeral ports " + stringify(ephemeralPorts) +
+        " intersect with the ephemeral ports used by the host " +
+        stringify(hostEphemeralPorts));
+  }
+
+  // TODO(chzhcn): Cross check ephemeral ports with used ports on the
+  // host (e.g., using port scan).
+
+  // Initialize the ephemeral ports allocator.
+
+  // In theory, any positive integer can be broken up into a few
+  // numbers that are power of 2 aligned. We choose to not allow this
+  // for now so that each container has a fixed (one) number of
+  // filters for ephemeral ports. This makes it easy to debug and
+  // infer performance.
+  if (roundDownToPowerOfTwo(flags.ephemeral_ports_per_container) !=
+      flags.ephemeral_ports_per_container) {
+    return Error(
+        "The number of ephemeral ports for each container (" +
+        stringify(flags.ephemeral_ports_per_container) +
+        ") is not a power of 2");
+  }
+
+  if (ephemeralPorts.size() < flags.ephemeral_ports_per_container) {
+    return Error(
+        "Network Isolator is given ephemeral ports of size: " +
+        stringify(ephemeralPorts.size()) + ", but asked to allocate " +
+        stringify(flags.ephemeral_ports_per_container) +
+        " ephemeral ports for a container");
+  }
+
+  if (flags.ephemeral_ports_per_container < MIN_EPHEMERAL_PORTS_SIZE) {
+    return Error(
+        "Each container has only " +
+        stringify(flags.ephemeral_ports_per_container) +
+        " ephemeral ports. The minimum required is: " +
+        stringify(MIN_EPHEMERAL_PORTS_SIZE));
+  }
+
+  Owned<EphemeralPortsAllocator> ephemeralPortsAllocator(
+      new EphemeralPortsAllocator(
+        ephemeralPorts,
+        flags.ephemeral_ports_per_container));
+
+  // Get the name of the public interface (e.g., eth0). If it is not
+  // specified, try to derive its name from the routing library.
+  Result<string> eth0 = link::eth0();
+  if (flags.eth0_name.isSome()) {
+    eth0 = flags.eth0_name.get();
+
+    // Check if the given public interface exists.
+    Try<bool> hostEth0Exists = link::exists(eth0.get());
+    if (hostEth0Exists.isError()) {
+      return Error(
+          "Failed to check if " + eth0.get() + " exists: " +
+          hostEth0Exists.error());
+    } else if (!hostEth0Exists.get()) {
+      return Error("The public interface " + eth0.get() + " does not exist");
+    }
+  } else if (!eth0.isSome()) {
+    // eth0 is not specified in the flag and we did not get a valid
+    // eth0 from the library. NOTE: The result can be None (rather
+    // than Error) here, in which case there is no error message to
+    // report and calling 'error()' would be invalid.
+    return Error(
+        "Network Isolator failed to find a public interface: " +
+        (eth0.isError()
+          ? eth0.error()
+          : "could not derive it from the routing library"));
+  }
+
+  LOG(INFO) << "Using " << eth0.get() << " as the public interface";
+
+  // Get the name of the loopback interface. If it is not specified,
+  // try to derive its name based on the loopback IP address.
+  Result<string> lo = link::lo();
+  if (flags.lo_name.isSome()) {
+    lo = flags.lo_name.get();
+
+    // Check if the given loopback interface exists.
+    Try<bool> hostLoExists = link::exists(lo.get());
+    if (hostLoExists.isError()) {
+      return Error(
+          "Failed to check if " + lo.get() + " exists: " +
+          hostLoExists.error());
+    } else if (!hostLoExists.get()) {
+      return Error("The loopback interface " + lo.get() + " does not exist");
+    }
+  } else if (!lo.isSome()) {
+    // lo is not specified in the flag and we did not get a valid
+    // lo from the library. NOTE: The result can be None (rather than
+    // Error) here, in which case there is no error message to report
+    // and calling 'error()' would be invalid.
+    return Error(
+        "Network Isolator failed to find a loopback interface: " +
+        (lo.isError()
+          ? lo.error()
+          : "could not derive it from the loopback IP address"));
+  }
+
+  LOG(INFO) << "Using " << lo.get() << " as the loopback interface";
+
+  // If egress rate limit is provided, do a sanity check that it is
+  // not greater than the host physical link speed.
+  Option<Bytes> egressRateLimitPerContainer;
+  if (flags.egress_rate_limit_per_container.isSome()) {
+    // Read host physical link speed from /sys/class/net/eth0/speed.
+    // This value is in MBits/s.
+    const string speedPath =
+      path::join("/sys/class/net", eth0.get(), "speed");
+
+    Try<string> speed = os::read(speedPath);
+    if (speed.isError()) {
+      return Error("Failed to read " + speedPath + ": " + speed.error());
+    }
+
+    // The content of the file is not under our control, so report a
+    // parse failure as an error instead of aborting the process.
+    Try<uint64_t> hostLinkSpeed = numify<uint64_t>(strings::trim(speed.get()));
+    if (hostLinkSpeed.isError()) {
+      return Error(
+          "Failed to parse the link speed '" + strings::trim(speed.get()) +
+          "' of " + eth0.get() + ": " + hostLinkSpeed.error());
+    }
+
+    // It could be possible that the nic driver doesn't support
+    // reporting physical link speed. In that case, report error.
+    if (hostLinkSpeed.get() == 0xFFFFFFFF) {
+      return Error(
+          "Network Isolator failed to determine link speed for " + eth0.get());
+    }
+
+    // Convert host link speed to Bytes/s for comparison.
+    if (hostLinkSpeed.get() * 1000000 / 8 <
+        flags.egress_rate_limit_per_container.get().bytes()) {
+      return Error(
+          "The given egress traffic limit for containers " +
+          stringify(flags.egress_rate_limit_per_container.get().bytes()) +
+          " Bytes/s is greater than the host link speed " +
+          stringify(hostLinkSpeed.get() * 1000000 / 8) + " Bytes/s");
+    }
+
+    if (flags.egress_rate_limit_per_container.get() != Bytes(0)) {
+      egressRateLimitPerContainer = flags.egress_rate_limit_per_container.get();
+    } else {
+      LOG(WARNING) << "Ignoring the given zero egress rate limit";
+    }
+  }
+
+  // Get the host IP network, MAC and default gateway.
+  Result<net::IPNetwork> hostIPNetwork =
+    net::IPNetwork::fromLinkDevice(eth0.get(), AF_INET);
+
+  if (!hostIPNetwork.isSome()) {
+    return Error(
+        "Failed to get the public IP network of " + eth0.get() + ": " +
+        (hostIPNetwork.isError() ?
+            hostIPNetwork.error() :
+            "does not have an IPv4 network"));
+  }
+
+  Result<net::MAC> hostMAC = net::mac(eth0.get());
+  if (!hostMAC.isSome()) {
+    return Error(
+        "Failed to get the MAC address of " + eth0.get() + ": " +
+        (hostMAC.isError() ? hostMAC.error() : "does not have a MAC address"));
+  }
+
+  Result<net::IP> hostDefaultGateway = route::defaultGateway();
+  if (!hostDefaultGateway.isSome()) {
+    return Error(
+        "Failed to get the default gateway of the host: " +
+        (hostDefaultGateway.isError() ? hostDefaultGateway.error()
+        : "The default gateway of the host does not exist"));
+  }
+
+  // Set the MAC address of the host loopback interface (lo) so that
+  // it matches that of the host public interface (eth0).  A fairly
+  // recent kernel patch is needed for this operation to succeed:
+  // https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/:
+  // 25f929fbff0d1bcebf2e92656d33025cd330cbf8
+  Try<bool> setHostLoMAC = link::setMAC(lo.get(), hostMAC.get());
+  if (setHostLoMAC.isError()) {
+    return Error(
+        "Failed to set the MAC address of " + lo.get() +
+        ": " + setHostLoMAC.error());
+  }
+
+  // Set the MTU of the host loopback interface (lo) so that it
+  // matches that of the host public interface (eth0).
+  Result<unsigned int> hostEth0MTU = link::mtu(eth0.get());
+  if (hostEth0MTU.isError()) {
+    return Error(
+        "Failed to get the MTU of " + eth0.get() +
+        ": " + hostEth0MTU.error());
+  }
+
+  // The host public interface should exist since we just checked it.
+  CHECK_SOME(hostEth0MTU);
+
+  Try<bool> setHostLoMTU = link::setMTU(lo.get(), hostEth0MTU.get());
+  if (setHostLoMTU.isError()) {
+    return Error(
+        "Failed to set the MTU of " + lo.get() +
+        ": " + setHostLoMTU.error());
+  }
+
+  // Prepare the ingress queueing disciplines on host public interface
+  // (eth0) and host loopback interface (lo).
+  Try<bool> createHostEth0IngressQdisc = ingress::create(eth0.get());
+  if (createHostEth0IngressQdisc.isError()) {
+    return Error(
+        "Failed to create the ingress qdisc on " + eth0.get() +
+        ": " + createHostEth0IngressQdisc.error());
+  }
+
+  set<uint16_t> freeFlowIds;
+  if (flags.egress_unique_flow_per_container) {
+    // Prepare a fq_codel queueing discipline on host public interface
+    // (eth0) for egress flow classification.
+    //
+    // TODO(cwang): Maybe we can continue when some other egress qdisc
+    // exists because this is not a necessary qdisc for network
+    // isolation, but we don't want inconsistency, so we just fail in
+    // this case. See details in MESOS-2370.
+    Try<bool> createHostEth0EgressQdisc = fq_codel::create(
+        eth0.get(),
+        EGRESS_ROOT,
+        HOST_TX_FQ_CODEL_HANDLE);
+    if (createHostEth0EgressQdisc.isError()) {
+      return Error(
+          "Failed to create the egress qdisc on " + eth0.get() +
+          ": " + createHostEth0EgressQdisc.error());
+    }
+
+    // TODO(cwang): Make sure DEFAULT_FLOWS is large enough so that
+    // it's unlikely to run out of free flow IDs.
+    for (uint16_t i = CONTAINER_MIN_FLOWID; i < fq_codel::DEFAULT_FLOWS; i++) {
+      freeFlowIds.insert(i);
+    }
+  }
+
+  Try<bool> createHostLoQdisc = ingress::create(lo.get());
+  if (createHostLoQdisc.isError()) {
+    return Error(
+        "Failed to create the ingress qdisc on " + lo.get() +
+        ": " + createHostLoQdisc.error());
+  }
+
+  // Enable 'route_localnet' on host loopback interface (lo). This
+  // enables the use of 127.0.0.1/8 for local routing purpose. This
+  // feature only exists on kernel 3.6 or newer.
+  const string loRouteLocalnet =
+    path::join("/proc/sys/net/ipv4/conf", lo.get(), "route_localnet");
+
+  if (!os::exists(loRouteLocalnet)) {
+    // TODO(jieyu): Consider supporting running the isolator if this
+    // feature is not available. We need to conditionally disable
+    // routing for 127.0.0.1/8, and ask the tasks to use the public IP
+    // for container to container and container to host communication.
+    return Error("The kernel does not support 'route_localnet'");
+  }
+
+  Try<Nothing> write = os::write(loRouteLocalnet, "1");
+  if (write.isError()) {
+    return Error(
+        "Failed to enable route_localnet for " + lo.get() +
+        ": " + write.error());
+  }
+
+  // We disable 'rp_filter' and 'send_redirects' for host loopback
+  // interface (lo) to work around a kernel bug, which was only
+  // recently addressed in upstream in the following 3 commits.
+  // https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/:
+  //   6a662719c9868b3d6c7d26b3a085f0cd3cc15e64
+  //   0d5edc68739f1c1e0519acbea1d3f0c1882a15d7
+  //   e374c618b1465f0292047a9f4c244bd71ab5f1f0
+  // The workaround ensures packets don't get dropped at lo.
+  write = os::write("/proc/sys/net/ipv4/conf/all/rp_filter", "0");
+  if (write.isError()) {
+    return Error(
+        "Failed to disable rp_filter for all: " + write.error());
+  }
+
+  write = os::write(path::join(
+      "/proc/sys/net/ipv4/conf", lo.get(), "rp_filter"), "0");
+  if (write.isError()) {
+    return Error(
+        "Failed to disable rp_filter for " + lo.get() +
+        ": " + write.error());
+  }
+
+  write = os::write("/proc/sys/net/ipv4/conf/all/send_redirects", "0");
+  if (write.isError()) {
+    return Error(
+        "Failed to disable send_redirects for all: " + write.error());
+  }
+
+  write = os::write(path::join(
+      "/proc/sys/net/ipv4/conf", lo.get(), "send_redirects"), "0");
+  if (write.isError()) {
+    return Error(
+        "Failed to disable send_redirects for " + lo.get() +
+        ": " + write.error());
+  }
+
+  // We need to enable accept_local on host loopback interface (lo)
+  // for kernels older than 3.6. Refer to the following:
+  // https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/:
+  //   7a9bc9b81a5bc6e44ebc80ef781332e4385083f2
+  // https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt
+  write = os::write(path::join(
+      "/proc/sys/net/ipv4/conf", lo.get(), "accept_local"), "1");
+  if (write.isError()) {
+    return Error(
+        "Failed to enable accept_local for " + lo.get() +
+        ": " + write.error());
+  }
+
+  // Reading host network configurations. Each container will match
+  // these configurations.
+  hashset<string> procs;
+
+  // TODO(jieyu): The following is a partial list of all the
+  // configurations. In the future, we may want to expose these
+  // configurations using ContainerInfo.
+
+  // The kernel will use a default value for the following
+  // configurations inside a container. Therefore, we need to set them
+  // in the container to match that on the host.
+  procs.insert("/proc/sys/net/core/somaxconn");
+
+  // As of kernel 3.10, the following configurations are shared
+  // between host and containers, and therefore are not required to be
+  // set in containers. We keep them here just in case the kernel
+  // changes in the future.
+  procs.insert("/proc/sys/net/core/netdev_max_backlog");
+  procs.insert("/proc/sys/net/core/rmem_max");
+  procs.insert("/proc/sys/net/core/wmem_max");
+  procs.insert("/proc/sys/net/ipv4/tcp_keepalive_time");
+  procs.insert("/proc/sys/net/ipv4/tcp_keepalive_intvl");
+  procs.insert("/proc/sys/net/ipv4/tcp_keepalive_probes");
+  procs.insert("/proc/sys/net/ipv4/tcp_max_syn_backlog");
+  procs.insert("/proc/sys/net/ipv4/tcp_rmem");
+  procs.insert("/proc/sys/net/ipv4/tcp_retries2");
+  procs.insert("/proc/sys/net/ipv4/tcp_synack_retries");
+  procs.insert("/proc/sys/net/ipv4/tcp_wmem");
+  procs.insert("/proc/sys/net/ipv4/neigh/default/gc_thresh1");
+  procs.insert("/proc/sys/net/ipv4/neigh/default/gc_thresh2");
+  procs.insert("/proc/sys/net/ipv4/neigh/default/gc_thresh3");
+
+  // Configurations that cannot be read on this host are skipped.
+  hashmap<string, string> hostNetworkConfigurations;
+  foreach (const string& proc, procs) {
+    Try<string> value = os::read(proc);
+    if (value.isSome()) {
+      LOG(INFO) << proc << " = '" << strings::trim(value.get()) << "'";
+      hostNetworkConfigurations[proc] = strings::trim(value.get());
+    }
+  }
+
+  // Self bind mount PORT_MAPPING_BIND_MOUNT_ROOT(). Since we use a
+  // new mount namespace for each container, for this mount point, we
+  // set '--make-rshared' on the host and set '--make-rslave' inside
+  // each container. This is important because when we unmount the
+  // network namespace handles on the host, those handles will be
+  // unmounted in the containers as well, but NOT vice versa.
+
+  // We first create the bind mount directory if it does not exist.
+  Try<Nothing> mkdir = os::mkdir(PORT_MAPPING_BIND_MOUNT_ROOT());
+  if (mkdir.isError()) {
+    return Error(
+        "Failed to create the bind mount root directory at " +
+        PORT_MAPPING_BIND_MOUNT_ROOT() + ": " + mkdir.error());
+  }
+
+  // Now, check '/proc/mounts' to see if
+  // PORT_MAPPING_BIND_MOUNT_ROOT() has already been self mounted.
+  Try<fs::MountTable> mountTable = fs::MountTable::read("/proc/mounts");
+  if (mountTable.isError()) {
+    return Error(
+        "Failed to the read the mount table at '/proc/mounts': " +
+        mountTable.error());
+  }
+
+  Option<fs::MountTable::Entry> bindMountRoot;
+  foreach (const fs::MountTable::Entry& entry, mountTable.get().entries) {
+    if (entry.dir == PORT_MAPPING_BIND_MOUNT_ROOT()) {
+      bindMountRoot = entry;
+    }
+  }
+
+  // Self bind mount PORT_MAPPING_BIND_MOUNT_ROOT().
+  if (bindMountRoot.isNone()) {
+    // NOTE: Instead of using fs::mount to perform the bind mount, we
+    // use the shell command here because the syscall 'mount' does not
+    // update the mount table (i.e., /etc/mtab), which could cause
+    // issues for the shell command 'mount --make-rslave' inside the
+    // container. It's OK to use the blocking os::shell here because
+    // 'create' will only be invoked during initialization.
+    Try<string> mount = os::shell(
+        "mount --bind %s %s",
+        PORT_MAPPING_BIND_MOUNT_ROOT().c_str(),
+        PORT_MAPPING_BIND_MOUNT_ROOT().c_str());
+
+    if (mount.isError()) {
+      return Error(
+          "Failed to self bind mount '" + PORT_MAPPING_BIND_MOUNT_ROOT() +
+          "': " + mount.error());
+    }
+  }
+
+  // Mark the mount point PORT_MAPPING_BIND_MOUNT_ROOT() as
+  // recursively shared.
+  Try<string> mountShared = os::shell(
+      "mount --make-rshared %s",
+      PORT_MAPPING_BIND_MOUNT_ROOT().c_str());
+
+  if (mountShared.isError()) {
+    return Error(
+        "Failed to mark '" + PORT_MAPPING_BIND_MOUNT_ROOT() +
+        "' as recursively shared: " + mountShared.error());
+  }
+
+  // Create the network namespace handle symlink directory if it does
+  // not exist. It is used to host network namespace handle symlinks
+  // whose basename is a container ID. This allows us to recover
+  // container IDs for orphan containers (i.e., not known by the
+  // slave). This is introduced in 0.23.0.
+  mkdir = os::mkdir(PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT());
+  if (mkdir.isError()) {
+    return Error(
+        "Failed to create the bind mount symlink root directory at " +
+        PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT() + ": " + mkdir.error());
+  }
+
+  return new MesosIsolator(Owned<MesosIsolatorProcess>(
+      new PortMappingIsolatorProcess(
+          flags,
+          eth0.get(),
+          lo.get(),
+          hostMAC.get(),
+          hostIPNetwork.get(),
+          hostEth0MTU.get(),
+          hostDefaultGateway.get(),
+          hostNetworkConfigurations,
+          egressRateLimitPerContainer,
+          nonEphemeralPorts,
+          ephemeralPortsAllocator,
+          freeFlowIds)));
+}
+
+
+Future<Nothing> PortMappingIsolatorProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  // Extract pids from virtual device names (veth). This tells us
+  // about all the potential live containers on this slave.
+  Try<set<string>> links = net::links();
+  if (links.isError()) {
+    return Failure("Failed to get all the links: " + links.error());
+  }
+
+  hashset<pid_t> pids;
+  foreach (const string& name, links.get()) {
+    Option<pid_t> pid = getPidFromVeth(name);
+    // Not all links follow the naming: mesos{pid}, so we simply
+    // continue, e.g., eth0.
+    if (pid.isNone()) {
+      continue;
+    } else if (pids.contains(pid.get())) {
+      return Failure("Two virtual devices have the same name '" + name + "'");
+    }
+
+    pids.insert(pid.get());
+  }
+
+  // Scan the bind mount root to cleanup all stale network namespace
+  // handles that do not have an active veth associated with.
+  Try<list<string>> entries = os::ls(PORT_MAPPING_BIND_MOUNT_ROOT());
+  if (entries.isError()) {
+    return Failure(
+        "Failed to list bind mount root '" +
+        PORT_MAPPING_BIND_MOUNT_ROOT() +
+        "': " + entries.error());
+  }
+
+  foreach (const string& entry, entries.get()) {
+    const string path = path::join(PORT_MAPPING_BIND_MOUNT_ROOT(), entry);
+
+    // NOTE: We expect all regular files whose names are numbers under
+    // the bind mount root are network namespace handles.
+    Result<pid_t> pid = getPidFromNamespaceHandle(path);
+    if (pid.isError()) {
+      return Failure(
+          "Failed to get pid from network namespace handle '" +
+          path + "': " + pid.error());
+    } else if (pid.isNone()) {
+      // We ignore files that are clearly not network namespace
+      // handles created by us. It's likely that those are created by
+      // users or other tools.
+      LOG(WARNING) << "Unrecognized network namespace handle '" << path << "'";
+      continue;
+    }
+
+    // We cleanup the network namespace handle if the associated
+    // containers have clearly exited (i.e., the veth has gone). The
+    // cleanup here is best effort.
+    if (!pids.contains(pid.get())) {
+      LOG(INFO) << "Removing stale network namespace handle '" << path << "'";
+
+      Try<Nothing> unmount = fs::unmount(path, MNT_DETACH);
+      if (unmount.isError()) {
+        LOG(WARNING) << "Failed to unmount stale network namespace handle '"
+                     << path << "': " << unmount.error();
+      }
+
+      Try<Nothing> rm = os::rm(path);
+      if (rm.isError()) {
+        LOG(WARNING) << "Failed to remove stale network namespace handle '"
+                     << path << "': " << rm.error();
+      }
+    }
+  }
+
+  // Scan the bind mount symlink root for container IDs. This allows us
+  // to recover container IDs for orphan containers (i.e., not known
+  // by the slave). This is introduced in 0.23.0.
+  entries = os::ls(PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT());
+  if (entries.isError()) {
+    return Failure(
+        "Failed to list bind mount symlink root '" +
+        PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT() +
+        "': " + entries.error());
+  }
+
+  // This map stores the mapping between pids and container IDs
+  // recovered from the bind mount root that have valid veth links. We
+  // use a multihashmap here because multiple container IDs can map to
+  // the same pid if the removal of a symlink fails in '_cleanup()'
+  // and the pid is reused by a new container.
+  multihashmap<pid_t, ContainerID> linkers;
+
+  foreach (const string& entry, entries.get()) {
+    const string path =
+      path::join(PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT(), entry);
+
+    // We only create symlinks in this directory and assume
+    // non-symlink files are created by other users or tools,
+    // therefore will be ignored.
+    if (!os::stat::islink(path)) {
+      LOG(WARNING) << "Ignored non-symlink file '" << path
+                   << "' under bind mount symlink root '"
+                   << PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT() << "'";
+      continue;
+    }
+
+    // NOTE: We expect all symlinks under the bind mount symlink root
+    // to be container ID symlinks.
+
+    Try<ContainerID> containerId = getContainerIdFromSymlink(path);
+    if (containerId.isError()) {
+      return Failure(
+          "Failed to get container ID from network namespace handle symlink '" +
+          path + "': " + containerId.error());
+    }
+
+    Result<pid_t> pid = getPidFromSymlink(path);
+    if (pid.isError()) {
+      return Failure(
+          "Failed to get pid from network namespace handle symlink '" + path +
+          "': " + pid.error());
+    }
+
+    // We remove the symlink if it's dangling or the associated
+    // containers have clearly exited (i.e., the veth has gone). The
+    // cleanup here is best effort.
+    if (pid.isNone() || !pids.contains(pid.get())) {
+      LOG(INFO) << "Removing stale network namespace handle symlink '"
+                << path << "'";
+
+      Try<Nothing> rm = os::rm(path);
+      if (rm.isError()) {
+        LOG(WARNING) << "Failed to remove stale network namespace handle "
+                     << " symlink '" << path << "': " << rm.error();
+      }
+    } else {
+      LOG(INFO) << "Discovered network namespace handle symlink "
+                << containerId.get() << " -> " << pid.get();
+
+      linkers.put(pid.get(), containerId.get());
+    }
+  }
+
+  // If multiple container IDs point to the same pid, we remove both
+  // symlinks for safety (as if we cannot derive the container ID for
+  // orphans, which is OK because it'll be treated the same as those
+  // containers that are created by older (pre 0.23.0) versions). Note
+  // that it's possible that multiple container IDs map to the same
+  // pid if the removal of a symlink fails in '_cleanup()' and the pid
+  // is reused by a new container.
+  foreach (pid_t pid, linkers.keys()) {
+    list<ContainerID> containerIds = linkers.get(pid);
+    if (containerIds.size() > 1) {
+      foreach (const ContainerID& containerId, containerIds) {
+        const string linker = getSymlinkPath(containerId);
+
+        LOG(WARNING) << "Removing duplicated network namespace handle symlink '"
+                     << linker << "'";
+
+        Try<Nothing> rm = os::rm(linker);
+        if (rm.isError()) {
+          LOG(WARNING) << "Failed to remove duplicated network namespace "
+                       << "handle symlink '" << linker << "': " << rm.error();
+        }
+      }
+
+      linkers.remove(pid);
+    }
+  }
+
+  // Now, actually recover the isolator from slave's state.
+  foreach (const ContainerState& state, states) {
+    const ContainerID& containerId = state.container_id();
+    pid_t pid = state.pid();
+
+    VLOG(1) << "Recovering network isolator for container "
+            << containerId << " with pid " << pid;
+
+    if (!pids.contains(pid)) {
+      // There are two possible cases here:
+      //
+      // 1) The container was launched by the slave with network
+      //    isolation disabled, so the pid could not be found in the
+      //    device names in the system.
+      //
+      // 2) The container was launched by the slave with network
+      //    isolation enabled, but veth is removed (because the
+      //    corresponding container is destroyed), but the slave
+      //    restarts before it is able to write the sentinel file.
+      //
+      // In both cases, we treat the container as unmanaged. For case
+      // (2), it's safe to do so because the container has already
+      // been destroyed.
+      VLOG(1) << "Skipped recovery for container " << containerId
+              << "with pid " << pid << " as either it was not managed by "
+              << "the network isolator or it has already been destroyed";
+
+      unmanaged.insert(containerId);
+      continue;
+    }
+
+    Try<Info*> recover = _recover(pid);
+    if (recover.isError()) {
+      foreachvalue (Info* info, infos) {
+        delete info;
+      }
+
+      return Failure(
+          "Failed to recover container " + stringify(containerId) +
+          " with pid " + stringify(pid) + ": " + recover.error());
+    }
+
+    infos[containerId] = recover.get();
+
+    // Remove the successfully recovered pid.
+    pids.erase(pid);
+  }
+
+  // Recover orphans. Known orphans will be destroyed by containerizer
+  // using the normal cleanup path (refer to MESOS-2367 for details).
+  // Unknown orphans will be cleaned up immediately. The recovery will
+  // fail if there is some unknown orphan that cannot be cleaned up.
+  vector<Info*> unknownOrphans;
+
+  foreach (pid_t pid, pids) {
+    Try<Info*> recover = _recover(pid);
+    if (recover.isError()) {
+      foreachvalue (Info* info, infos) {
+        delete info;
+      }
+      foreach (Info* info, unknownOrphans) {
+        delete info;
+      }
+
+      return Failure(
+          "Failed to recover orphaned container with pid " +
+          stringify(pid) + ": " + recover.error());
+    }
+
+    if (linkers.get(pid).size() == 1) {
+      const ContainerID containerId = linkers.get(pid).front();
+      CHECK(!infos.contains(containerId));
+
+      if (orphans.contains(containerId)) {
+        infos[containerId] = recover.get();
+        continue;
+      }
+    }
+
+    unknownOrphans.push_back(recover.get());
+  }
+
+  foreach (Info* info, unknownOrphans) {
+    CHECK_SOME(info->pid);
+    pid_t pid = info->pid.get();
+
+    Option<ContainerID> containerId;
+    if (linkers.get(pid).size() == 1) {
+      containerId = linkers.get(pid).front();
+    }
+
+    // NOTE: If 'infos' is empty (means there is no regular container
+    // or known orphan), the '_cleanup' below will remove the ICMP and
+    // ARP packet filters on host eth0. This will cause subsequent
+    // calls to '_cleanup' for unknown orphans to fail. However, this
+    // is OK because when slave restarts and tries to recover again,
+    // it'll try to remove the remaining unknown orphans.
+    // TODO(jieyu): Consider call '_cleanup' for all the unknown
+    // orphans before returning even if error occurs.
+    Try<Nothing> cleanup = _cleanup(info, containerId);
+    if (cleanup.isError()) {
+      foreachvalue (Info* info, infos) {
+        delete info;
+      }
+
+      // TODO(jieyu): Also delete 'info' in unknownOrphans. Notice
+      // that some 'info' in unknownOrphans might have already been
+      // deleted in '_cleanup' above.
+
+      return Failure(
+          "Failed to cleanup orphaned container with pid " +
+          stringify(pid) + ": " + cleanup.error());
+    }
+  }
+
+  // TODO(cwang): Consider removing unrecognized flow classifiers from
+  // host eth0 egress.
+
+  LOG(INFO) << "Network isolator recovery complete";
+
+  return Nothing();
+}
+
+
+// Reconstructs the 'Info' of an already isolated container from
+// kernel state only: the IP filters installed on the ingress of the
+// veth derived from 'pid' and, when the
+// 'egress_unique_flow_per_container' flag is set, the flow
+// classifiers installed on the egress of host eth0.
+//
+// On success a newly allocated 'Info' (never NULL) is returned that
+// holds the recovered non-ephemeral ports, the recovered ephemeral
+// port range (a default-constructed interval if none was found),
+// 'pid' and, if one was found, the flow ID. As a side effect, the
+// recovered ephemeral port range is reported to
+// 'ephemeralPortsAllocator' and the recovered flow ID is erased from
+// 'freeFlowIds'.
+//
+// An Error is returned if the veth or eth0 link does not exist, or if
+// the filters found are unexpected or inconsistent with each other.
+//
+// NOTE(review): the returned 'Info' is presumably owned by the
+// caller; confirm against the call sites.
+Try<PortMappingIsolatorProcess::Info*>
+PortMappingIsolatorProcess::_recover(pid_t pid)
+{
+  // Get all the IP filters on veth.
+  // NOTE: We only look at veth devices to recover port ranges
+  // assigned to each container. That's the reason why we need to make
+  // sure that we add filters to veth before adding filters to host
+  // eth0 and host lo. Also, we need to make sure we remove filters
+  // from host eth0 and host lo before removing filters from veth.
+  Result<vector<ip::Classifier>> vethIngressClassifiers =
+    ip::classifiers(veth(pid), ingress::HANDLE);
+
+  // A 'None' result is reported as a missing link.
+  if (vethIngressClassifiers.isError()) {
+    return Error(
+        "Failed to get all the IP filters on " + veth(pid) +
+        ": " + vethIngressClassifiers.error());
+  } else if (vethIngressClassifiers.isNone()) {
+    return Error(
+        "Failed to get all the IP filters on " + veth(pid) +
+        ": link does not exist");
+  }
+
+  // Maps each source port range found on host eth0 egress to the
+  // secondary part of its classid. It stays empty when per-container
+  // egress flows are disabled.
+  hashmap<PortRange, uint16_t> flowIds;
+
+  if (flags.egress_unique_flow_per_container) {
+    // Get all egress IP flow classifiers on eth0.
+    Result<vector<filter::Filter<ip::Classifier>>> eth0EgressFilters =
+      ip::filters(eth0, HOST_TX_FQ_CODEL_HANDLE);
+
+    if (eth0EgressFilters.isError()) {
+      return Error(
+          "Failed to get all the IP flow classifiers on " + eth0 +
+          ": " + eth0EgressFilters.error());
+    } else if (eth0EgressFilters.isNone()) {
+      return Error(
+          "Failed to get all the IP flow classifiers on " + eth0 +
+          ": link does not exist");
+    }
+
+    // Construct a port range to flow ID mapping from host eth0
+    // egress. This map will be used later.
+    foreach (const filter::Filter<ip::Classifier>& filter,
+             eth0EgressFilters.get()) {
+      const Option<PortRange> sourcePorts = filter.classifier.sourcePorts;
+      const Option<Handle> classid = filter.classid;
+
+      if (sourcePorts.isNone()) {
+        return Error("Missing source ports for filters on egress of " + eth0);
+      }
+
+      if (classid.isNone()) {
+        return Error("Missing classid for filters on egress of " + eth0);
+      }
+
+      if (flowIds.contains(sourcePorts.get())) {
+        return Error(
+          "Duplicated port range " + stringify(sourcePorts.get()) +
+          " detected on egress of " + eth0);
+      }
+
+      flowIds[sourcePorts.get()] = classid.get().secondary();
+    }
+  }
+
+  IntervalSet<uint16_t> nonEphemeralPorts;
+  IntervalSet<uint16_t> ephemeralPorts;
+
+  // The single flow ID shared by the port ranges of this container,
+  // if any of them has a flow classifier on host eth0 egress.
+  Option<uint16_t> flowId;
+
+  foreach (const ip::Classifier& classifier, vethIngressClassifiers.get()) {
+    const Option<PortRange> sourcePorts = classifier.sourcePorts;
+    const Option<PortRange> destinationPorts = classifier.destinationPorts;
+
+    // All the IP filters on veth used by us only have source ports.
+    if (sourcePorts.isNone() || destinationPorts.isSome()) {
+      return Error("Unexpected IP filter detected on " + veth(pid));
+    }
+
+    // The first port range with a known flow ID decides the flow ID
+    // of the container; any later range must agree with it.
+    if (flowIds.contains(sourcePorts.get())) {
+      if (flowId.isNone()) {
+        flowId = flowIds.get(sourcePorts.get());
+      } else if (flowId != flowIds.get(sourcePorts.get())) {
+        return Error(
+            "A container is associated with multiple flows "
+            "on egress of " + eth0);
+      }
+    } else if (flowId.isSome()) {
+      // This is the case where some port range of a container is
+      // assigned to a flow while some isn't. This could happen if
+      // slave crashes while those filters are created. However, this
+      // is OK for us because packets by default go to the host flow.
+      LOG(WARNING) << "Container port range " << sourcePorts.get()
+                   << " does not have flow id " << flowId.get()
+                   << " assigned";
+    }
+
+    // Express the port range as an interval with closed bounds at
+    // both its begin and its end.
+    Interval<uint16_t> ports =
+      (Bound<uint16_t>::closed(sourcePorts.get().begin()),
+       Bound<uint16_t>::closed(sourcePorts.get().end()));
+
+    // Classify the range as non-ephemeral or ephemeral. A range that
+    // is neither is treated as an unexpected filter.
+    if (managedNonEphemeralPorts.contains(ports)) {
+      nonEphemeralPorts += ports;
+    } else if (ephemeralPortsAllocator->isManaged(ports)) {
+      // We have duplicate here because we have two IP filters with
+      // the same ephemeral port range (one for eth0 and one for lo).
+      // But we should never have two intersecting port ranges.
+      if (!ephemeralPorts.contains(ports) && ephemeralPorts.intersects(ports)) {
+        return Error("Unexpected intersected ephemeral port ranges");
+      }
+
+      ephemeralPorts += ports;
+    } else {
+      return Error("Unexpected IP filter detected on " + veth(pid));
+    }
+  }
+
+  Info* info = NULL;
+
+  if (ephemeralPorts.empty()) {
+    // NOTE: This is possible because the slave may crash while
+    // calling 'isolate()', leaving a partially isolated container. To
+    // clean up this partially isolated container, we still create an
+    // Info struct here and let the 'cleanup' function clean it up
+    // later.
+    LOG(WARNING) << "No ephemeral ports found for container with pid "
+                 << stringify(pid) << ". This could happen if slave crashes "
+                 << "while isolating a container";
+
+    info = new Info(nonEphemeralPorts, Interval<uint16_t>(), pid);
+  } else {
+    // This check happens before 'info' is allocated, so returning
+    // here does not leak.
+    if (ephemeralPorts.intervalCount() != 1) {
+      return Error("Each container should have only one ephemeral port range");
+    }
+
+    // Tell the allocator that this ephemeral port range is used.
+    ephemeralPortsAllocator->allocate(*ephemeralPorts.begin());
+
+    info = new Info(nonEphemeralPorts, *ephemeralPorts.begin(), pid);
+
+    VLOG(1) << "Recovered network isolator for container with pid " << pid
+            << " non-ephemeral port ranges " << nonEphemeralPorts
+            << " and ephemeral port range " << *ephemeralPorts.begin();
+  }
+
+  // The recovered flow ID is in use by this container, so take it out
+  // of the set of free flow IDs and remember it in 'info'.
+  if (flowId.isSome()) {
+    freeFlowIds.erase(flowId.get());
+    info->flowId = flowId.get();
+  }
+
+  return CHECK_NOTNULL(info);
+}
+
+
+// Prepares network isolation for a container that is about to be
+// launched.
+//
+// The non-ephemeral ports are taken from the 'ports' resource of
+// 'executorInfo' and must all be managed by the slave. An ephemeral
+// port range is then allocated from 'ephemeralPortsAllocator' and
+// both are recorded in a new 'Info' stored in 'infos' under
+// 'containerId'. Any 'ephemeral_ports' resource in 'executorInfo' is
+// ignored (with a warning).
+//
+// Returns a ContainerPrepareInfo that carries the command produced by
+// 'scripts()' for the new 'Info' and requests the CLONE_NEWNET and
+// CLONE_NEWNS namespaces.
+//
+// Fails if the container is unmanaged, if it has already been
+// prepared, if it requests ports that are not managed by the slave,
+// or if no ephemeral port range can be allocated. The 'directory' and
+// 'user' parameters are not used by this isolator.
+Future<Option<ContainerPrepareInfo>> PortMappingIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user)
+{
+  if (unmanaged.contains(containerId)) {
+    return Failure("Asked to prepare an unmanaged container");
+  }
+
+  if (infos.contains(containerId)) {
+    return Failure("Container has already been prepared");
+  }
+
+  Resources resources(executorInfo.resources());
+
+  IntervalSet<uint16_t> nonEphemeralPorts;
+
+  if (resources.ports().isSome()) {
+    nonEphemeralPorts = getIntervalSet(resources.ports().get());
+
+    // Sanity check to make sure that the assigned non-ephemeral ports
+    // for the container are part of the non-ephemeral ports specified
+    // by the slave.
+    if (!managedNonEphemeralPorts.contains(nonEphemeralPorts)) {
+      return Failure(
+          "Some non-ephemeral ports specified in " +
+          stringify(nonEphemeralPorts) +
+          " are not managed by the slave");
+    }
+  }
+
+  // TODO(jieyu): For now, we simply ignore the 'ephemeral_ports'
+  // specified in the executor info. However, this behavior needs to
+  // be changed once the master can make default allocations for
+  // ephemeral ports.
+  if (resources.ephemeral_ports().isSome()) {
+    LOG(WARNING) << "Ignoring the specified ephemeral_ports '"
+                 << resources.ephemeral_ports().get()
+                 << "' for container " << containerId
+                 << " of executor " << executorInfo.executor_id();
+  }
+
+  // Allocate the ephemeral ports used by this container.
+  Try<Interval<uint16_t>> ephemeralPorts = ephemeralPortsAllocator->allocate();
+  if (ephemeralPorts.isError()) {
+    return Failure(
+        "Failed to allocate ephemeral ports: " + ephemeralPorts.error());
+  }
+
+  // No pid is known yet at this point; it is filled in by 'isolate()'.
+  Info* info = new Info(nonEphemeralPorts, ephemeralPorts.get());
+  infos[containerId] = info;
+
+  LOG(INFO) << "Using non-ephemeral ports " << nonEphemeralPorts
+            << " and ephemeral ports " << ephemeralPorts.get()
+            << " for container " << containerId << " of executor "
+            << executorInfo.executor_id();
+
+  ContainerPrepareInfo prepareInfo;
+  prepareInfo.add_commands()->set_value(scripts(info));
+
+  // NOTE: the port mapping isolator itself doesn't require mount
+  // namespace. However, if mount namespace is enabled because of
+  // other isolators, we need to set mount sharing accordingly for
+  // PORT_MAPPING_BIND_MOUNT_ROOT to avoid races described in
+  // MESOS-1558. So we turn on mount namespace here for consistency.
+  prepareInfo.set_namespaces(CLONE_NEWNET | CLONE_NEWNS);
+
+  return prepareInfo;
+}
+
+
+Future<Nothing> PortMappingIsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  if (unmanaged.contains(containerId)) {
+    return Failure("Asked to isolate an unmanaged container");
+  }
+
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  if (info->pid.isSome()) {
+    return Failure("The container has already been isolated");
+  }
+
+  info->pid = pid;
+
+  if (flags.egress_unique_flow_per_container) {
+    info->flowId = getNextFlowId();
+  }
+
+  // Bind mount the network namespace handle of the process 'pid' to a
+  // directory to hold an extra reference to the network namespace
+  // which will be released in 'cleanup'. By holding the extra
+  // reference, the network namespace will not be destroyed even if
+  // the process 'pid' is gone, which allows us to explicitly control
+  // the network namespace life cycle.
+  const string source = path::join("/proc", stringify(pid), "ns", "net");
+  const string target = getNamespaceHandlePath(pid);
+
+  Try<Nothing> touch = os::touch(target);
+  if (touch.isError()) {
+    return Failure("Failed to create the bind mount point: " + touch.error());
+  }
+
+  Try<Nothing> mount = fs::mount(source, target, None(), MS_BIND, NULL);
+  if (mount.isError()) {
+    return Failure(
+        "Failed to mount the network namespace handle from '" +
+        source + "' to '" + target + "': " + mount.error());
+  }
+
+  LOG(INFO) << "Bind mounted '" << source << "' to '" << target
+            << "' for container " << containerId;
+
+  // Since 0.23.0, we create a symlink to the network namespace handle
+  // using the container ID. This serves two purposes. First, it
+  // allows us to recover the container ID later when slave restarts
+  // even if slave's checkpointed meta data is deleted. Second, it
+  // makes the debugging easier. See MESOS-2528 for details.
+  const string linker = getSymlinkPath(containerId);
+  Try<Nothing> symlink = ::fs::symlink(target, linker);
+  if (symlink.isError()) {
+    return Failure(
+        "Failed to symlink the network namespace handle '" +
+        linker + "' -> '" + target + "': " + symlink.error());
+  }
+
+  LOG(INFO) << "Created network namespace handle symlink '"
+            << linker << "' -> '" << target << "'";
+
+  // Create a virtual ethernet pair for this container.
+  Try<bool> createVethPair = link::create(veth(pid), eth0, pid);
+  if (createVethPair.isError()) {
+    return Failure(
+        "Failed to create virtual ethernet pair: " +
+        createVethPair.error());
+  }
+
+  // Disable IPv6 for veth as IPv6 packets won't be forwarded anyway.
+  const string disableIPv6 =
+    path::join("/proc/sys/net/ipv6/conf", veth(pid), "disable_ipv6");
+
+  if (os::exists(disableIPv6)) {
+    Try<Nothing> write = os::write(disableIPv6, "1");
+    if (write.isError()) {
+      return Failure(
+          "Failed to disable IPv6 for " + veth(pid) +
+          ": " + write.error());
+    }
+  }
+
+  // Sets the MAC address of veth to match the MAC address of the host
+  // public interface (eth0).
+  Try<bool> setVethMAC = link::setMAC(veth(pid), hostMAC);
+  if (setVethMAC.isError()) {
+    return Failure(
+        "Failed to set the MAC address of " + veth(pid) +
+        ": " + setVethMAC.error());
+  }
+
+  // Prepare the ingress queueing disciplines on veth.
+  Try<bool> createQdisc = ingress::create(veth(pid));
+  if (createQdisc.isError()) {
+    return Failure(
+        "Failed to create the ingress qdisc on " + veth(pid) +
+        ": " + createQdisc.error());
+  }
+
+  // Veth device should exist since we just created it.
+  CHECK(createQdisc.get());
+
+  // For each port range, add a set of IP packet filters to properly
+  // redirect IP traffic to/from containers.
+  foreach (const PortRange& range,
+           getPortRanges(info->nonEphemeralPorts + info->ephemeralPorts)) {
+    if (info->flowId.isSome()) {
+      LOG(INFO) << "Adding IP packet filters with ports " << range
+                << " with flow ID " << info->flowId.get()
+                << " for container " << containerId;
+    } else {
+      LOG(INFO) << "Adding IP packet filters with ports " << range
+                << " for container " << containerId;
+    }
+
+    Try<Nothing> add = addHostIPFilters(range, info->flowId, veth(pid));
+    if (add.isError()) {
+      return Failure(
+          "Failed to add IP packet filter with ports " +
+          stringify(range) + " for container with pid " +
+          stringify(pid) + ": " + add.error());
+    }
+  }
+
+  // Relay ICMP packets from veth of the container to host eth0.
+  Try<bool> icmpVethToEth0 = filter::icmp::create(
+      veth(pid),
+      ingress::HANDLE,
+      icmp::Classifier(None()),
+      Priority(ICMP_FILTER_PRIORITY, NORMAL),
+      action::Redirect(eth0));
+
+  if (icmpVethToEth0.isError()) {
+    ++metrics.adding_veth_icmp_filters_errors;
+
+    return Failure(
+        "Failed to create an ICMP packet filter from " + veth(pid) +
+        " to host " + eth0 + ": " + icmpVethToEth0.error());
+  } else if (!icmpVethToEth0.get()) {
+    ++metrics.adding_veth_icmp_filters_already_exist;
+
+    return Failure(
+        "The ICMP packet filter from " + veth(pid) +
+        " to host " + eth0 + " already exists");
+  }
+
+  // Relay ARP packets from veth of the container to host eth0.
+  Try<bool> arpVethToEth0 = filter::basic::create(
+      veth(pid),
+      ingress::HANDLE,
+      ETH_P_ARP,
+      Priority(ARP_FILTER_PRIORITY, NORMAL),
+      action::Redirect(eth0));
+
+  if (arpVethToEth0.isError()) {
+    ++metrics.adding_veth_arp_filters_errors;
+
+    return Failure(
+        "Failed to create an ARP packet filter from " + veth(pid) +
+        " to host " + eth0 + ": " + arpVethToEth0.error());
+  } else if (!arpVethToEth0.get()) {
+    ++metrics.adding_veth_arp_filters_already_exist;
+
+    return Failure(
+        "The ARP packet filter from " + veth(pid) +
+        " to host " + eth0 + " already exists");
+  }
+
+  // Setup filters for ICMP and ARP packets. We mirror ICMP and ARP
+  // packets from host eth0 to veths of all the containers. We also
+  // setup flow classifiers for host eth0 egress.
+  set<string> targets;
+  foreachvalue (Info* info, infos) {
+    if (info->pid.isSome()) {
+      targets.insert(veth(info->pid.get()));
+    }
+  }
+
+  if (targets.size() == 1) {
+    // We just create the first container in which case we should
+    // create filters for ICMP and ARP packets.
+
+    // Create a new ICMP filter on host eth0 ingress for mirroring
+    // packets from host eth0 to veth.
+    Try<bool> icmpEth0ToVeth = filter::icmp::create(
+        eth0,
+        ingress::HANDLE,
+        icmp::Classifier(hostIPNetwork.address()),
+        Priority(ICMP_FILTER_PRIORITY, NORMAL),
+        action::Mirror(targets));
+
+    if (icmpEth0ToVeth.isError()) {
+      ++metrics.adding_eth0_icmp_filters_errors;
+
+      return Failure(
+          "Failed to create an ICMP packet filter from host " + eth0 +
+          " to " + veth(pid) + ": " + icmpEth0ToVeth.error());
+    } else if (!icmpEth0ToVeth.get()) {
+      ++metrics.adding_eth0_icmp_filters_already_exist;
+
+      return Failure(
+          "The ICMP packet filter on host " + eth0 + " already exists");
+    }
+
+    // Create a new ARP filter on host eth0 ingress for mirroring
+    // packets from host eth0 to veth.
+    Try<bool> arpEth0ToVeth = filter::basic::create(
+        eth0,
+        ingress::HANDLE,
+        ETH_P_ARP,
+        Priority(ARP_FILTER_PRIORITY, NORMAL),
+        action::Mirror(targets));
+
+    if (arpEth0ToVeth.isError()) {
+      ++metrics.adding_eth0_arp_filters_errors;
+
+      return Failure(
+          "Failed to create an ARP packet filter from host " + eth0 +
+          " to " + veth(pid) + ": " + arpEth0ToVeth.error());
+    } else if (!arpEth0ToVeth.get()) {
+      ++metrics.adding_eth0_arp_filters_already_exist;
+
+      return Failure(
+          "The ARP packet filter on host " + eth0 + " already exists");
+    }
+
+    if (flags.egress_unique_flow_per_container) {
+      // Create a new ICMP filter on host eth0 egress for classifying
+      // packets into a reserved flow.
+      Try<bool> icmpEth0Egress = filter::icmp::create(
+          eth0,
+          HOST_TX_FQ_CODEL_HANDLE,
+          icmp::Classifier(None()),
+          Priority(ICMP_FILTER_PRIORITY, NORMAL),
+          Handle(HOST_TX_FQ_CODEL_HANDLE, ICMP_FLOWID));
+
+      if (icmpEth0Egress.isError()) {
+        ++metrics.adding_eth0_egress_filters_errors;
+
+        return Failure(
+            "Failed to create the ICMP flow classifier on host " +
+            eth0 + ": " + icmpEth0Egress.error());
+      } else if (!icmpEth0Egress.get()) {
+        ++metrics.adding_eth0_egress_filters_already_exist;
+
+        return Failure(
+            "The ICMP flow classifier on host " + eth0 + " already exists");
+      }
+
+      // Create a new ARP filter on host eth0 egress for classifying
+      // packets into a reserved flow.
+      Try<bool> arpEth0Egress = filter::basic::create(
+          eth0,
+          HOST_TX_FQ_CODEL_HANDLE,
+          ETH_P_ARP,
+          Priority(ARP_FILTER_PRIORITY, NORMAL),
+          Handle(HOST_TX_FQ_CODEL_HANDLE, ARP_FLOWID));
+
+      if (arpEth0Egress.isError()) {
+        ++metrics.adding_eth0_egress_filters_errors;
+
+        return Failure(
+            "Failed to create the ARP flow classifier on host " +
+            eth0 + ": " + arpEth0Egress.error());
+      } else if (!arpEth0Egress.get()) {
+        ++metrics.adding_eth0_egress_filters_already_exist;
+
+        return Failure(
+            "The ARP flow classifier on host " + eth0 + " already exists");
+      }
+
+      // Rest of the host packets go to a reserved flow.
+      Try<bool> defaultEth0Egress = filter::basic::create(
+          eth0,
+          HOST_TX_FQ_CODEL_HANDLE,
+          ETH_P_ALL,
+          Priority(DEFAULT_FILTER_PRIORITY, NORMAL),
+          Handle(HOST_TX_FQ_CODEL_HANDLE, HOST_FLOWID));
+
+      if (defaultEth0Egress.isError()) {
+        ++met

<TRUNCATED>

[07/12] mesos git commit: Relocated MesosContainerizer specific files to the correct location.

Posted by ji...@apache.org.

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/filesystem/posix.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/filesystem/posix.cpp b/src/slave/containerizer/mesos/isolators/filesystem/posix.cpp
new file mode 100644
index 0000000..cc54e90
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/filesystem/posix.cpp
@@ -0,0 +1,294 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <list>
+#include <string>
+
+#include <stout/fs.hpp>
+#include <stout/os.hpp>
+#include <stout/path.hpp>
+
+#include "slave/paths.hpp"
+
+#include "slave/containerizer/mesos/isolators/filesystem/posix.hpp"
+
+using namespace process;
+
+using std::list;
+using std::string;
+
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Initializes the 'flags' member from the given slave flags.
+PosixFilesystemIsolatorProcess::PosixFilesystemIsolatorProcess(
+    const Flags& _flags)
+  : flags(_flags)
+{
+}
+
+
+// The destructor body is intentionally empty.
+PosixFilesystemIsolatorProcess::~PosixFilesystemIsolatorProcess()
+{
+}
+
+
+// Factory for the POSIX filesystem isolator: constructs a new
+// PosixFilesystemIsolatorProcess from 'flags', wraps it in a
+// MesosIsolator and returns the wrapper. This implementation has no
+// error path.
+Try<Isolator*> PosixFilesystemIsolatorProcess::create(const Flags& flags)
+{
+  process::Owned<MesosIsolatorProcess> isolatorProcess(
+      new PosixFilesystemIsolatorProcess(flags));
+
+  Isolator* isolator = new MesosIsolator(isolatorProcess);
+
+  return isolator;
+}
+
+
+// Registers an 'Info' in 'infos' for every container listed in
+// 'states', keyed by its container ID and constructed from the
+// directory recorded in its state. The 'orphans' parameter is not
+// used.
+Future<Nothing> PosixFilesystemIsolatorProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  foreach (const ContainerState& recovered, states) {
+    const Owned<Info> info(new Info(recovered.directory()));
+
+    infos.put(recovered.container_id(), info);
+  }
+
+  return Nothing();
+}
+
+
+// Registers a new container with the given sandbox 'directory' and
+// applies the resources of 'executorInfo' to it through 'update()'.
+// On success the resulting future holds 'None'.
+//
+// Fails if the container has already been prepared or if the executor
+// asks for a container image. The 'user' parameter is not used.
+Future<Option<ContainerPrepareInfo>> PosixFilesystemIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user)
+{
+  if (infos.contains(containerId)) {
+    return Failure("Container has already been prepared");
+  }
+
+  // A container that changes its filesystem root cannot be supported
+  // here because the symlinks would become invalid in the new root.
+  if (executorInfo.has_container()) {
+    CHECK_EQ(executorInfo.container().type(), ContainerInfo::MESOS);
+
+    const bool hasImage = executorInfo.container().mesos().has_image();
+    if (hasImage) {
+      return Failure("Container root filesystems not supported");
+    }
+
+    // TODO(jieyu): Also return a failure if there exists images in
+    // the specified volumes.
+  }
+
+  infos.put(containerId, Owned<Info>(new Info(directory)));
+
+  // The continuation discards the 'Nothing' produced by 'update()'.
+  auto none = []() -> Future<Option<ContainerPrepareInfo>> { return None(); };
+
+  Future<Nothing> updated = update(containerId, executorInfo.resources());
+
+  return updated.then(none);
+}
+
+
+// This isolator does nothing at isolation time; both parameters are
+// ignored and 'Nothing' is returned right away.
+Future<Nothing> PosixFilesystemIsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  return Nothing();
+}
+
+
+// This isolator never reports a limitation: a default-constructed
+// future is returned and 'containerId' is ignored.
+// NOTE(review): presumably a default-constructed future stays pending
+// forever; confirm against libprocess.
+Future<ContainerLimitation> PosixFilesystemIsolatorProcess::watch(
+    const ContainerID& containerId)
+{
+  Future<ContainerLimitation> limitation;
+
+  return limitation;
+}
+
+
+// Reconciles the persistent volume symlinks in the sandbox of the
+// container ('info->directory') with 'resources'.
+//
+// Symlinks of persistent volumes that the container currently holds
+// but that are absent from 'resources' are removed. For every
+// persistent volume in 'resources' that the container does not
+// currently hold, the ownership of the volume is changed to match the
+// sandbox directory and a symlink to it is created in the sandbox,
+// unless the symlink already exists, in which case it is only checked
+// to resolve to the same path as the volume. Persistent volumes whose
+// container path contains a '/' are skipped with a warning. On
+// success 'info->resources' is replaced by 'resources'.
+//
+// Fails if the container is unknown, if any stat/chown/rm/symlink/
+// realpath step fails, or if an existing symlink resolves to a path
+// different from the volume it is supposed to point to.
+Future<Nothing> PosixFilesystemIsolatorProcess::update(
+    const ContainerID& containerId,
+    const Resources& resources)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  const Owned<Info>& info = infos[containerId];
+
+  // TODO(jieyu): Currently, we only allow non-nested relative
+  // container paths for volumes. This is enforced by the master. For
+  // those volumes, we create symlinks in the executor directory.
+  Resources current = info->resources;
+
+  // We first remove unneeded persistent volumes.
+  foreach (const Resource& resource, current.persistentVolumes()) {
+    // This is enforced by the master.
+    CHECK(resource.disk().has_volume());
+
+    // Ignore absolute and nested paths.
+    const string& containerPath = resource.disk().volume().container_path();
+    if (strings::contains(containerPath, "/")) {
+      LOG(WARNING) << "Skipping updating symlink for persistent volume "
+                   << resource << " of container " << containerId
+                   << " because the container path '" << containerPath
+                   << "' contains slash";
+      continue;
+    }
+
+    // The volume is still needed, so its symlink stays.
+    if (resources.contains(resource)) {
+      continue;
+    }
+
+    string link = path::join(info->directory, containerPath);
+
+    LOG(INFO) << "Removing symlink '" << link << "' for persistent volume "
+              << resource << " of container " << containerId;
+
+    Try<Nothing> rm = os::rm(link);
+    if (rm.isError()) {
+      return Failure(
+          "Failed to remove the symlink for the unneeded "
+          "persistent volume at '" + link + "': " + rm.error());
+    }
+  }
+
+  // We then link additional persistent volumes.
+  foreach (const Resource& resource, resources.persistentVolumes()) {
+    // This is enforced by the master.
+    CHECK(resource.disk().has_volume());
+
+    // Ignore absolute and nested paths.
+    const string& containerPath = resource.disk().volume().container_path();
+    if (strings::contains(containerPath, "/")) {
+      LOG(WARNING) << "Skipping updating symlink for persistent volume "
+                   << resource << " of container " << containerId
+                   << " because the container path '" << containerPath
+                   << "' contains slash";
+      continue;
+    }
+
+    // The volume is already held by the container.
+    if (current.contains(resource)) {
+      continue;
+    }
+
+    string original = paths::getPersistentVolumePath(
+        flags.work_dir,
+        resource.role(),
+        resource.disk().persistence().id());
+
+    // Set the ownership of the persistent volume to match that of the
+    // sandbox directory.
+    //
+    // NOTE: Currently, persistent volumes in Mesos are exclusive,
+    // meaning that if a persistent volume is used by one task or
+    // executor, it cannot be concurrently used by other task or
+    // executor. But if we allow multiple executors to use same
+    // persistent volume at the same time in the future, the ownership
+    // of the persistent volume may conflict here.
+    //
+    // TODO(haosdent): Consider letting the frameworks specify the
+    // user/group of the persistent volumes.
+    struct stat s;
+    if (::stat(info->directory.c_str(), &s) < 0) {
+      return Failure(
+          "Failed to get ownership for '" + info->directory +
+          "': " + strerror(errno));
+    }
+
+    LOG(INFO) << "Changing the ownership of the persistent volume at '"
+              << original << "' with uid " << s.st_uid
+              << " and gid " << s.st_gid;
+
+    // NOTE(review): the trailing 'true' presumably requests a
+    // recursive chown; confirm against 'os::chown'.
+    Try<Nothing> chown = os::chown(s.st_uid, s.st_gid, original, true);
+    if (chown.isError()) {
+      return Failure(
+          "Failed to change the ownership of the persistent volume at '" +
+          original + "' with uid " + stringify(s.st_uid) +
+          " and gid " + stringify(s.st_gid) + ": " + chown.error());
+    }
+
+    string link = path::join(info->directory, containerPath);
+
+    if (os::exists(link)) {
+      // NOTE: This is possible because 'info->resources' will be
+      // reset when slave restarts and recovers. When the slave calls
+      // 'containerizer->update' after the executor re-registers,
+      // we'll try to relink all the already symlinked volumes.
+      Result<string> realpath = os::realpath(link);
+      if (!realpath.isSome()) {
+        return Failure(
+            "Failed to get the realpath of symlink '" + link + "': " +
+            (realpath.isError() ? realpath.error() : "No such directory"));
+      }
+
+      // A sanity check to make sure the target of the symlink does
+      // not change. In fact, this is not supposed to happen.
+      // NOTE: Here, we compare the realpaths because 'original' might
+      // contain symbolic links.
+      Result<string> _original = os::realpath(original);
+      if (!_original.isSome()) {
+        return Failure(
+            "Failed to get the realpath of volume '" + original + "': " +
+            (_original.isError() ? _original.error() : "No such directory"));
+      }
+
+      // 'realpath' is where the existing symlink resolves to, while
+      // '_original' is where the volume (the new target) resolves to.
+      if (realpath.get() != _original.get()) {
+        return Failure(
+            "The existing symlink '" + link + "' points to '" +
+            realpath.get() + "' and the new target is '" +
+            _original.get() + "'");
+      }
+    } else {
+      LOG(INFO) << "Adding symlink from '" << original << "' to '"
+                << link << "' for persistent volume " << resource
+                << " of container " << containerId;
+
+      Try<Nothing> symlink = ::fs::symlink(original, link);
+      if (symlink.isError()) {
+        return Failure(
+            "Failed to symlink persistent volume from '" +
+            original + "' to '" + link + "': " + symlink.error());
+      }
+    }
+  }
+
+  // Store the updated resources.
+  info->resources = resources;
+
+  return Nothing();
+}
+
+
+// This isolator gathers no usage data: default-constructed statistics
+// are returned and 'containerId' is ignored.
+Future<ResourceStatistics> PosixFilesystemIsolatorProcess::usage(
+    const ContainerID& containerId)
+{
+  ResourceStatistics statistics;
+
+  return statistics;
+}
+
+
+// Forgets the bookkeeping kept for 'containerId'. Erasing a container
+// that was never registered is not treated as an error here.
+Future<Nothing> PosixFilesystemIsolatorProcess::cleanup(
+    const ContainerID& containerId)
+{
+  // No explicit removal of the persistent volume symlinks is done
+  // here: they go away when the work directory is GC'ed.
+  infos.erase(containerId);
+
+  return Nothing();
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/filesystem/posix.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/filesystem/posix.hpp b/src/slave/containerizer/mesos/isolators/filesystem/posix.hpp
new file mode 100644
index 0000000..61b81dd
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/filesystem/posix.hpp
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __POSIX_FILESYSTEM_ISOLATOR_HPP__
+#define __POSIX_FILESYSTEM_ISOLATOR_HPP__
+
+#include <mesos/resources.hpp>
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+class PosixFilesystemIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+  static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+  virtual ~PosixFilesystemIsolatorProcess();
+
+  virtual process::Future<Nothing> recover(
+      const std::list<mesos::slave::ContainerState>& states,
+      const hashset<ContainerID>& orphans);
+
+  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+      const ContainerID& containerId,
+      const ExecutorInfo& executorInfo,
+      const std::string& directory,
+      const Option<std::string>& user);
+
+  virtual process::Future<Nothing> isolate(
+      const ContainerID& containerId,
+      pid_t pid);
+
+  virtual process::Future<mesos::slave::ContainerLimitation> watch(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> update(
+      const ContainerID& containerId,
+      const Resources& resources);
+
+  virtual process::Future<ResourceStatistics> usage(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> cleanup(
+      const ContainerID& containerId);
+
+private:
+  PosixFilesystemIsolatorProcess(const Flags& flags);
+
+  const Flags flags;
+
+  struct Info
+  {
+    explicit Info(const std::string& _directory)
+      : directory(_directory) {}
+
+    const std::string directory;
+
+    // Resources stored by update(); lets us unlink unneeded volumes.
+    Resources resources;
+  };
+
+  hashmap<ContainerID, process::Owned<Info>> infos;  // Per container.
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __POSIX_FILESYSTEM_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/filesystem/shared.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/filesystem/shared.cpp b/src/slave/containerizer/mesos/isolators/filesystem/shared.cpp
new file mode 100644
index 0000000..4702211
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/filesystem/shared.cpp
@@ -0,0 +1,268 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <set>
+
+#include "linux/ns.hpp"
+
+#include "slave/containerizer/mesos/isolators/filesystem/shared.hpp"
+
+using namespace process;
+
+using std::list;
+using std::set;
+using std::string;
+
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+SharedFilesystemIsolatorProcess::SharedFilesystemIsolatorProcess(
+    const Flags& _flags)
+  : flags(_flags) {}
+
+
+SharedFilesystemIsolatorProcess::~SharedFilesystemIsolatorProcess() {}
+
+
+Try<Isolator*> SharedFilesystemIsolatorProcess::create(const Flags& flags)
+{
+  // Only root may use this isolator.
+  const Result<string> username = os::user();
+  if (username.isError() || username.isNone()) {
+    return Error(
+        "Failed to determine user: " +
+        (username.isError() ? username.error() : "username not found"));
+  }
+  if (username.get() != "root") {
+    return Error("SharedFilesystemIsolator requires root privileges");
+  }
+
+  // Hand the new process over to a MesosIsolator wrapper.
+  return new MesosIsolator(process::Owned<MesosIsolatorProcess>(
+      new SharedFilesystemIsolatorProcess(flags)));
+}
+
+
+Future<Nothing> SharedFilesystemIsolatorProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  // Both arguments are ignored: there is nothing to recover because
+  // we keep no state, monitor no usage and do nothing on cleanup.
+  return Nothing();
+}
+
+
+// Validates the container's volumes and returns a ContainerPrepareInfo
+// requesting a mount namespace and one bind mount command per volume.
+Future<Option<ContainerPrepareInfo>> SharedFilesystemIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user)
+{
+  if (executorInfo.has_container() &&
+      executorInfo.container().type() != ContainerInfo::MESOS) {
+    return Failure("Can only prepare filesystem for a MESOS container");
+  }
+
+  LOG(INFO) << "Preparing shared filesystem for container: "
+            << stringify(containerId);
+
+  if (!executorInfo.has_container()) {
+    // Not an error: there is nothing to do so we return None.
+    return None();
+  }
+
+  // We don't support mounting to a container path which is a parent
+  // to another container path as this can mask entries. We'll keep
+  // track of all container paths so we can check this.
+  set<string> containerPaths;
+  containerPaths.insert(directory);
+
+  // Returns 'path' with exactly one trailing '/', so that a prefix
+  // match between two results only matches whole path components.
+  auto asDirectory = [](const string& path) -> string {
+    const string::size_type end = path.find_last_not_of('/');
+    return path.substr(0, end == string::npos ? 0 : end + 1) + "/";
+  };
+
+  ContainerPrepareInfo prepareInfo;
+  prepareInfo.set_namespaces(CLONE_NEWNS);
+
+  foreach (const Volume& volume, executorInfo.container().volumes()) {
+    // Because the filesystem is shared we require the container path
+    // already exist, otherwise containers can create arbitrary paths
+    // outside their sandbox.
+    if (!os::exists(volume.container_path())) {
+      return Failure("Volume with container path '" +
+                     volume.container_path() +
+                     "' must exist on host for shared filesystem isolator");
+    }
+
+    // Host path must be provided.
+    if (!volume.has_host_path()) {
+      return Failure("Volume with container path '" +
+                     volume.container_path() +
+                     "' must specify host path for shared filesystem isolator");
+    }
+
+    // Check we won't mask another volume. Whole path components are
+    // compared so that siblings such as '/lib' and '/lib64' pass.
+    // NOTE: Assuming here that the container path is absolute, see
+    // Volume protobuf.
+    // TODO(idownes): This test is unnecessarily strict and could be
+    // relaxed if mounts could be re-ordered.
+    const string volumePath = asDirectory(volume.container_path());
+    foreach (const string& containerPath, containerPaths) {
+      const string existingPath = asDirectory(containerPath);
+
+      if (strings::startsWith(volumePath, existingPath)) {
+        return Failure(
+            "Cannot mount volume to '" + volume.container_path() +
+            "' because it is under volume '" + containerPath + "'");
+      }
+
+      if (strings::startsWith(existingPath, volumePath)) {
+        return Failure(
+            "Cannot mount volume to '" + containerPath +
+            "' because it is under volume '" + volume.container_path() + "'");
+      }
+    }
+    containerPaths.insert(volume.container_path());
+
+    // A relative host path will be created in the container's work
+    // directory, otherwise check it already exists.
+    string hostPath;
+    if (!strings::startsWith(volume.host_path(), "/")) {
+      hostPath = path::join(directory, volume.host_path());
+
+      // Do not support any relative components in the resulting path.
+      // There should be no links in the work directory to resolve.
+      if (strings::contains(hostPath, "/./") ||
+          strings::contains(hostPath, "/../")) {
+        return Failure("Relative host path '" + hostPath +
+                       "' cannot contain relative components");
+      }
+
+      Try<Nothing> mkdir = os::mkdir(hostPath, true);
+      if (mkdir.isError()) {
+        return Failure(
+            "Failed to create host_path '" + hostPath +
+            "' for mount to '" + volume.container_path() +
+            "': " + mkdir.error());
+      }
+
+      // Set the ownership and permissions to match the container path
+      // as these are inherited from host path on bind mount.
+      struct stat stat;
+      if (::stat(volume.container_path().c_str(), &stat) < 0) {
+        return Failure("Failed to get permissions on '" +
+                        volume.container_path() + "'" +
+                        ": " + strerror(errno));
+      }
+
+      Try<Nothing> chmod = os::chmod(hostPath, stat.st_mode);
+      if (chmod.isError()) {
+        return Failure(
+            "Failed to chmod hostPath '" + hostPath + "': " + chmod.error());
+      }
+
+      Try<Nothing> chown = os::chown(stat.st_uid, stat.st_gid, hostPath, false);
+      if (chown.isError()) {
+        return Failure(
+            "Failed to chown hostPath '" + hostPath + "': " + chown.error());
+      }
+    } else {
+      hostPath = volume.host_path();
+
+      if (!os::exists(hostPath)) {
+        return Failure(
+            "Volume with container path '" + volume.container_path() +
+            "' must have host path '" + hostPath +
+            "' present on host for shared filesystem isolator");
+      }
+    }
+
+    // NOTE(review): the paths are not shell-quoted here; confirm how
+    // the command is run and that they cannot contain metacharacters.
+    prepareInfo.add_commands()->set_value(
+        "mount -n --bind " + hostPath + " " + volume.container_path());
+  }
+
+  return prepareInfo;
+}
+
+
+Future<Nothing> SharedFilesystemIsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  // No-op: the isolation comes from the new mount namespace and the
+  // bind mount commands requested in prepare().
+  return Nothing();
+}
+
+
+Future<ContainerLimitation> SharedFilesystemIsolatorProcess::watch(
+    const ContainerID& containerId)
+{
+  // No-op, for now: the returned future is never set by this
+  // isolator.
+  return Future<ContainerLimitation>();
+}
+
+
+Future<Nothing> SharedFilesystemIsolatorProcess::update(
+    const ContainerID& containerId,
+    const Resources& resources)
+{
+  // No-op: no resource limits are enforced by this isolator.
+
+  return Nothing();
+}
+
+
+Future<ResourceStatistics> SharedFilesystemIsolatorProcess::usage(
+    const ContainerID& containerId)
+{
+  // No-op: no usage is gathered, so default statistics are returned.
+
+  return ResourceStatistics();
+}
+
+
+Future<Nothing> SharedFilesystemIsolatorProcess::cleanup(
+    const ContainerID& containerId)
+{
+  // Cleanup of mounts is done automatically by the kernel when
+  // the mount namespace is destroyed after the last process
+  // terminates.
+
+  return Nothing();
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/filesystem/shared.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/filesystem/shared.hpp b/src/slave/containerizer/mesos/isolators/filesystem/shared.hpp
new file mode 100644
index 0000000..3a2f7db
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/filesystem/shared.hpp
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SHARED_FILESYSTEM_ISOLATOR_HPP__
+#define __SHARED_FILESYSTEM_ISOLATOR_HPP__
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// This isolator is to be used when all containers share the host's
+// filesystem. It supports bind mounting "volumes" from the host
+// into each container's mount namespace. In particular, this can be
+// used to give each container a "private" system directory, such as
+// /tmp and /var/tmp.
+class SharedFilesystemIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+  static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+  virtual ~SharedFilesystemIsolatorProcess();
+
+  virtual process::Future<Nothing> recover(
+      const std::list<mesos::slave::ContainerState>& states,
+      const hashset<ContainerID>& orphans);
+
+  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+      const ContainerID& containerId,
+      const ExecutorInfo& executorInfo,
+      const std::string& directory,
+      const Option<std::string>& user);
+
+  virtual process::Future<Nothing> isolate(
+      const ContainerID& containerId,
+      pid_t pid);
+
+  virtual process::Future<mesos::slave::ContainerLimitation> watch(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> update(
+      const ContainerID& containerId,
+      const Resources& resources);
+
+  virtual process::Future<ResourceStatistics> usage(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> cleanup(
+      const ContainerID& containerId);
+
+private:
+  SharedFilesystemIsolatorProcess(const Flags& flags);
+
+  const Flags flags;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __SHARED_FILESYSTEM_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/namespaces/pid.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/namespaces/pid.cpp b/src/slave/containerizer/mesos/isolators/namespaces/pid.cpp
new file mode 100644
index 0000000..d41efe6
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/namespaces/pid.cpp
@@ -0,0 +1,266 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <sys/mount.h>
+
+#include <list>
+#include <set>
+#include <string>
+
+#include <stout/os.hpp>
+
+#include <stout/os/exists.hpp>
+#include <stout/os/ls.hpp>
+#include <stout/os/stat.hpp>
+
+#include "linux/fs.hpp"
+#include "linux/ns.hpp"
+
+#include "slave/containerizer/mesos/isolators/namespaces/pid.hpp"
+
+using namespace process;
+
+using std::list;
+using std::set;
+using std::string;
+
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// The root directory where we bind mount all the namespace handles.
+static const char PID_NS_BIND_MOUNT_ROOT[] = "/var/run/mesos/pidns";
+
+
+// The empty directory that we'll use to mask the namespace handles
+// inside each container. This mount ensures they cannot determine the
+// namespace of another container.
+static const char PID_NS_BIND_MOUNT_MASK_DIR[] = "/var/empty/mesos";
+
+
+// Returns "/proc/<pid>/ns/pid", the path to a pid's namespace file.
+inline string nsProcFile(pid_t pid)
+{
+  return path::join("/proc", stringify(pid), "ns", "pid");
+}
+
+
+// Returns the path under PID_NS_BIND_MOUNT_ROOT, named after the
+// container, of the extra reference to its pid namespace.
+inline string nsExtraReference(const ContainerID& containerId)
+{
+  return path::join(PID_NS_BIND_MOUNT_ROOT, stringify(containerId));
+}
+
+
+Try<Isolator*> NamespacesPidIsolatorProcess::create(const Flags& flags)
+{
+  // Check for root permission.
+  if (geteuid() != 0) {
+    return Error("The pid namespace isolator requires root permissions");
+  }
+
+  // Verify that pid namespaces are available on this kernel.
+  if (ns::namespaces().count("pid") == 0) {
+    return Error("Pid namespaces are not supported by this kernel");
+  }
+
+  // Create the directory where bind mounts of the pid namespace will
+  // be placed.
+  Try<Nothing> mkdir = os::mkdir(PID_NS_BIND_MOUNT_ROOT);
+  if (mkdir.isError()) {
+    return Error(
+        "Failed to create the bind mount root directory at " +
+        string(PID_NS_BIND_MOUNT_ROOT) + ": " + mkdir.error());
+  }
+
+  // Create the empty directory that will be used to mask the bind
+  // mounts inside each container.
+  mkdir = os::mkdir(PID_NS_BIND_MOUNT_MASK_DIR);
+  if (mkdir.isError()) {
+    return Error(
+        "Failed to create the bind mount mask directory at " +
+        string(PID_NS_BIND_MOUNT_MASK_DIR) + ": " + mkdir.error());
+  }
+
+  return new MesosIsolator(Owned<MesosIsolatorProcess>(
+      new NamespacesPidIsolatorProcess()));
+}
+
+
+Result<ino_t> NamespacesPidIsolatorProcess::getNamespace(
+    const ContainerID& containerId)
+{
+  // The namespace is reported as the inode of the extra reference.
+  const string reference = nsExtraReference(containerId);
+  if (!os::exists(reference)) {
+    return None();
+  }
+
+  return os::stat::inode(reference);
+}
+
+
+Future<Nothing> NamespacesPidIsolatorProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  hashset<ContainerID> known;
+  foreach (const ContainerState& state, states) {
+    known.insert(state.container_id());
+  }
+
+  // Every entry in the bind mount root is named after a container.
+  // Unknown ones are leftovers that we clean up here, while known
+  // orphans are destroyed by the containerizer using the normal
+  // cleanup path. See MESOS-2367 for details.
+  Try<list<string>> handles = os::ls(PID_NS_BIND_MOUNT_ROOT);
+  if (handles.isError()) {
+    return Failure("Failed to list existing containers in '" +
+                   string(PID_NS_BIND_MOUNT_ROOT) + "': " + handles.error());
+  }
+
+  foreach (const string& handle, handles.get()) {
+    ContainerID containerId;
+    containerId.set_value(handle);
+    if (known.contains(containerId) || orphans.contains(containerId)) {
+      continue;
+    }
+    cleanup(containerId);
+  }
+
+  return Nothing();
+}
+
+
+Future<Option<ContainerPrepareInfo>> NamespacesPidIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user)
+{
+  ContainerPrepareInfo prepareInfo;
+  prepareInfo.set_namespaces(CLONE_NEWPID | CLONE_NEWNS);
+
+  // Mask the bind mount root directory in each container so
+  // containers cannot see the namespace bind mount of other
+  // containers.
+  prepareInfo.add_commands()->set_value(
+      "mount -n --bind " + string(PID_NS_BIND_MOUNT_MASK_DIR) +
+      " " + string(PID_NS_BIND_MOUNT_ROOT));
+
+  // Mount /proc for the container's pid namespace to show the
+  // container's pids (and other /proc files), not the parent's. We
+  // first recursively make the mount private because /proc is usually
+  // marked explicitly as shared (see /proc/self/mountinfo) and
+  // changes would propagate to the parent's /proc mount otherwise. We
+  // then mount /proc with the standard options. This technique was
+  // taken from unshare.c in util-linux for --mount-proc. We use the
+  // -n flag so the mount is not added to the mtab where it will not
+  // be correctly removed when the namespace terminates.
+  prepareInfo.add_commands()->set_value(
+      "mount none /proc --make-private -o rec");
+  prepareInfo.add_commands()->set_value(
+      "mount -n -t proc proc /proc -o nosuid,noexec,nodev");
+
+  return prepareInfo;
+}
+
+
+Future<Nothing> NamespacesPidIsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  const string handle = nsProcFile(pid);
+  const string reference = nsExtraReference(containerId);
+
+  // Bind mount the namespace handle of 'pid' onto a touched file. The
+  // mount gives us control over the lifetime of the pid namespace and
+  // lets us identify it even if the leading pid has exited.
+  Try<Nothing> created = os::touch(reference);
+  if (created.isError()) {
+    return Failure("Failed to create bind mount target: " + created.error());
+  }
+
+  Try<Nothing> mounted = fs::mount(handle, reference, None(), MS_BIND, NULL);
+  if (mounted.isError()) {
+    return Failure(
+        "Failed to mount pid namespace handle from " +
+        handle + " to " + reference + ": " + mounted.error());
+  }
+
+  return Nothing();
+}
+
+
+Future<ContainerLimitation> NamespacesPidIsolatorProcess::watch(
+    const ContainerID& containerId)
+{
+  return Future<ContainerLimitation>();  // No-op, nothing is watched.
+}
+
+
+Future<Nothing> NamespacesPidIsolatorProcess::update(
+    const ContainerID& containerId,
+    const Resources& resources)
+{
+  return Nothing();  // No-op, nothing is enforced.
+}
+
+
+Future<ResourceStatistics> NamespacesPidIsolatorProcess::usage(
+    const ContainerID& containerId)
+{
+  return ResourceStatistics();  // No-op, no usage gathered.
+}
+
+
+// An old glibc might not have this symbol.
+#ifndef MNT_DETACH
+#define MNT_DETACH 2
+#endif
+
+
+Future<Nothing> NamespacesPidIsolatorProcess::cleanup(
+    const ContainerID& containerId)
+{
+  const string target = nsExtraReference(containerId);
+
+  if (os::exists(target)) {
+    // We don't expect anyone to have a reference to target but do a
+    // lazy umount in case. We do not want to force the umount; it
+    // will not cause an issue if this umount is delayed.
+    Try<Nothing> unmount = fs::unmount(target, MNT_DETACH);
+
+    // This will fail if the unmount hasn't completed yet but this
+    // only leaks a uniquely named empty file that will be cleaned up as
+    // an orphan on recovery.
+    os::rm(target);
+  }
+
+  return Nothing();
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/namespaces/pid.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/namespaces/pid.hpp b/src/slave/containerizer/mesos/isolators/namespaces/pid.hpp
new file mode 100644
index 0000000..87270d0
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/namespaces/pid.hpp
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NAMESPACES_PID_ISOLATOR_HPP__
+#define __NAMESPACES_PID_ISOLATOR_HPP__
+
+#include <sys/types.h>
+
+#include <string>
+
+#include <stout/result.hpp>
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// This isolator requests new pid and mount namespaces for each
+// container (see prepare()) and keeps track of a container's pid
+// namespace through a bind mount, which is exposed by
+// getNamespace().
+class NamespacesPidIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+  static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+  // Return the pid namespace of the container. Returns None if the
+  // container was not created in a separate pid namespace, i.e.,
+  // processes are in the same namespace as the slave. This is used by
+  // the LinuxLauncher to determine if it can kill the leading process
+  // in the container and let the kernel kill the remaining processes.
+  // A container may not have a pid namespace if it was created
+  // without the namespaces/pid isolator and the slave was
+  // subsequently restarted with namespaces/pid enabled.
+  static Result<ino_t> getNamespace(const ContainerID& container);
+
+  NamespacesPidIsolatorProcess() {}
+
+  virtual ~NamespacesPidIsolatorProcess() {}
+
+  virtual process::Future<Nothing> recover(
+      const std::list<mesos::slave::ContainerState>& states,
+      const hashset<ContainerID>& orphans);
+
+  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+      const ContainerID& containerId,
+      const ExecutorInfo& executorInfo,
+      const std::string& directory,
+      const Option<std::string>& user);
+
+  virtual process::Future<Nothing> isolate(
+      const ContainerID& containerId,
+      pid_t pid);
+
+  virtual process::Future<mesos::slave::ContainerLimitation> watch(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> update(
+      const ContainerID& containerId,
+      const Resources& resources);
+
+  virtual process::Future<ResourceStatistics> usage(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> cleanup(
+      const ContainerID& containerId);
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __NAMESPACES_PID_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/network/helper.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/network/helper.cpp b/src/slave/containerizer/mesos/isolators/network/helper.cpp
new file mode 100644
index 0000000..d851195
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/network/helper.cpp
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stout/none.hpp>
+#include <stout/subcommand.hpp>
+
+#include "slave/containerizer/mesos/isolators/network/port_mapping.hpp"
+
+using namespace mesos::internal::slave;
+
+
+int main(int argc, char** argv)
+{
+  return Subcommand::dispatch(
+      None(),
+      argc,
+      argv,
+      new PortMappingUpdate(),
+      new PortMappingStatistics());
+}

[12/12] mesos git commit: Relocated MesosContainerizer specific files to the correct location.

Posted by ji...@apache.org.

Relocated MesosContainerizer specific files to the correct location.

Review: https://reviews.apache.org/r/39360


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/9a722d74
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/9a722d74
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/9a722d74

Branch: refs/heads/master
Commit: 9a722d742b2897ea3c6d5574273010c3b1546a58
Parents: 974906b
Author: Gilbert Song <gi...@mesoshere.io>
Authored: Mon Oct 26 10:52:02 2015 -0700
Committer: Jie Yu <yu...@gmail.com>
Committed: Mon Oct 26 10:52:07 2015 -0700

----------------------------------------------------------------------
 src/Makefile.am                                 |  118 +-
 src/docker/docker.cpp                           |    4 +-
 src/examples/test_isolator_module.cpp           |    2 +-
 src/slave/containerizer/docker.cpp              |    2 +-
 .../isolators/cgroups/constants.hpp             |   44 -
 .../isolators/cgroups/cpushare.cpp              |  572 ---
 .../isolators/cgroups/cpushare.hpp              |  122 -
 .../containerizer/isolators/cgroups/mem.cpp     |  733 ----
 .../containerizer/isolators/cgroups/mem.hpp     |  141 -
 .../isolators/cgroups/perf_event.cpp            |  435 --
 .../isolators/cgroups/perf_event.hpp            |  127 -
 .../isolators/filesystem/linux.cpp              |  893 -----
 .../isolators/filesystem/linux.hpp              |  149 -
 .../isolators/filesystem/posix.cpp              |  294 --
 .../isolators/filesystem/posix.hpp              |   89 -
 .../isolators/filesystem/shared.cpp             |  268 --
 .../isolators/filesystem/shared.hpp             |   79 -
 .../containerizer/isolators/namespaces/pid.cpp  |  266 --
 .../containerizer/isolators/namespaces/pid.hpp  |   91 -
 .../containerizer/isolators/network/helper.cpp  |   35 -
 .../isolators/network/port_mapping.cpp          | 3792 ------------------
 .../isolators/network/port_mapping.hpp          |  403 --
 src/slave/containerizer/isolators/posix.hpp     |  213 -
 .../containerizer/isolators/posix/disk.cpp      |  525 ---
 .../containerizer/isolators/posix/disk.hpp      |  151 -
 src/slave/containerizer/linux_launcher.cpp      |    2 +-
 src/slave/containerizer/mesos/containerizer.cpp |   23 +-
 .../mesos/isolators/cgroups/constants.hpp       |   44 +
 .../mesos/isolators/cgroups/cpushare.cpp        |  572 +++
 .../mesos/isolators/cgroups/cpushare.hpp        |  122 +
 .../mesos/isolators/cgroups/mem.cpp             |  733 ++++
 .../mesos/isolators/cgroups/mem.hpp             |  141 +
 .../mesos/isolators/cgroups/perf_event.cpp      |  435 ++
 .../mesos/isolators/cgroups/perf_event.hpp      |  127 +
 .../mesos/isolators/filesystem/linux.cpp        |  893 +++++
 .../mesos/isolators/filesystem/linux.hpp        |  149 +
 .../mesos/isolators/filesystem/posix.cpp        |  294 ++
 .../mesos/isolators/filesystem/posix.hpp        |   89 +
 .../mesos/isolators/filesystem/shared.cpp       |  268 ++
 .../mesos/isolators/filesystem/shared.hpp       |   79 +
 .../mesos/isolators/namespaces/pid.cpp          |  266 ++
 .../mesos/isolators/namespaces/pid.hpp          |   91 +
 .../mesos/isolators/network/helper.cpp          |   35 +
 .../mesos/isolators/network/port_mapping.cpp    | 3792 ++++++++++++++++++
 .../mesos/isolators/network/port_mapping.hpp    |  403 ++
 .../containerizer/mesos/isolators/posix.hpp     |  213 +
 .../mesos/isolators/posix/disk.cpp              |  525 +++
 .../mesos/isolators/posix/disk.hpp              |  151 +
 .../mesos/provisioner/appc/paths.cpp            |   85 +
 .../mesos/provisioner/appc/paths.hpp            |   83 +
 .../mesos/provisioner/appc/spec.cpp             |  104 +
 .../mesos/provisioner/appc/spec.hpp             |   54 +
 .../mesos/provisioner/appc/store.cpp            |  288 ++
 .../mesos/provisioner/appc/store.hpp            |   63 +
 .../containerizer/mesos/provisioner/backend.cpp |   62 +
 .../containerizer/mesos/provisioner/backend.hpp |   67 +
 .../mesos/provisioner/backends/bind.cpp         |  250 ++
 .../mesos/provisioner/backends/bind.hpp         |   75 +
 .../mesos/provisioner/backends/copy.cpp         |  203 +
 .../mesos/provisioner/backends/copy.hpp         |   69 +
 .../mesos/provisioner/docker/local_puller.cpp   |  355 ++
 .../mesos/provisioner/docker/local_puller.hpp   |   66 +
 .../mesos/provisioner/docker/message.hpp        |  116 +
 .../mesos/provisioner/docker/message.proto      |  100 +
 .../provisioner/docker/metadata_manager.cpp     |  232 ++
 .../provisioner/docker/metadata_manager.hpp     |  106 +
 .../mesos/provisioner/docker/paths.cpp          |  115 +
 .../mesos/provisioner/docker/paths.hpp          |   98 +
 .../mesos/provisioner/docker/puller.cpp         |   46 +
 .../mesos/provisioner/docker/puller.hpp         |   68 +
 .../provisioner/docker/registry_client.cpp      |  641 +++
 .../provisioner/docker/registry_client.hpp      |  164 +
 .../mesos/provisioner/docker/spec.cpp           |   91 +
 .../mesos/provisioner/docker/spec.hpp           |   50 +
 .../mesos/provisioner/docker/store.cpp          |  277 ++
 .../mesos/provisioner/docker/store.hpp          |   68 +
 .../mesos/provisioner/docker/token_manager.cpp  |  362 ++
 .../mesos/provisioner/docker/token_manager.hpp  |  179 +
 .../containerizer/mesos/provisioner/paths.cpp   |  192 +
 .../containerizer/mesos/provisioner/paths.hpp   |   80 +
 .../mesos/provisioner/provisioner.cpp           |  444 ++
 .../mesos/provisioner/provisioner.hpp           |   94 +
 .../containerizer/mesos/provisioner/store.cpp   |   79 +
 .../containerizer/mesos/provisioner/store.hpp   |   75 +
 .../containerizer/provisioner/appc/paths.cpp    |   85 -
 .../containerizer/provisioner/appc/paths.hpp    |   83 -
 .../containerizer/provisioner/appc/spec.cpp     |  104 -
 .../containerizer/provisioner/appc/spec.hpp     |   54 -
 .../containerizer/provisioner/appc/store.cpp    |  288 --
 .../containerizer/provisioner/appc/store.hpp    |   63 -
 src/slave/containerizer/provisioner/backend.cpp |   62 -
 src/slave/containerizer/provisioner/backend.hpp |   67 -
 .../containerizer/provisioner/backends/bind.cpp |  250 --
 .../containerizer/provisioner/backends/bind.hpp |   75 -
 .../containerizer/provisioner/backends/copy.cpp |  203 -
 .../containerizer/provisioner/backends/copy.hpp |   69 -
 .../provisioner/docker/local_puller.cpp         |  355 --
 .../provisioner/docker/local_puller.hpp         |   66 -
 .../provisioner/docker/message.hpp              |  116 -
 .../provisioner/docker/message.proto            |  100 -
 .../provisioner/docker/metadata_manager.cpp     |  232 --
 .../provisioner/docker/metadata_manager.hpp     |  106 -
 .../containerizer/provisioner/docker/paths.cpp  |  115 -
 .../containerizer/provisioner/docker/paths.hpp  |   98 -
 .../containerizer/provisioner/docker/puller.cpp |   46 -
 .../containerizer/provisioner/docker/puller.hpp |   68 -
 .../provisioner/docker/registry_client.cpp      |  641 ---
 .../provisioner/docker/registry_client.hpp      |  164 -
 .../containerizer/provisioner/docker/spec.cpp   |   91 -
 .../containerizer/provisioner/docker/spec.hpp   |   50 -
 .../containerizer/provisioner/docker/store.cpp  |  277 --
 .../containerizer/provisioner/docker/store.hpp  |   68 -
 .../provisioner/docker/token_manager.cpp        |  362 --
 .../provisioner/docker/token_manager.hpp        |  179 -
 src/slave/containerizer/provisioner/paths.cpp   |  192 -
 src/slave/containerizer/provisioner/paths.hpp   |   80 -
 .../containerizer/provisioner/provisioner.cpp   |  444 --
 .../containerizer/provisioner/provisioner.hpp   |   94 -
 src/slave/containerizer/provisioner/store.cpp   |   79 -
 src/slave/containerizer/provisioner/store.hpp   |   75 -
 .../containerizer/filesystem_isolator_tests.cpp |    4 +-
 src/tests/containerizer/isolator_tests.cpp      |   12 +-
 src/tests/containerizer/port_mapping_tests.cpp  |    2 +-
 src/tests/containerizer/provisioner.hpp         |    2 +-
 .../containerizer/provisioner_appc_tests.cpp    |    8 +-
 .../containerizer/provisioner_backend_tests.cpp |    4 +-
 .../containerizer/provisioner_docker_tests.cpp  |   12 +-
 src/tests/disk_quota_tests.cpp                  |    2 +-
 128 files changed, 15021 insertions(+), 15022 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/Makefile.am
----------------------------------------------------------------------
diff --git a/src/Makefile.am b/src/Makefile.am
index 98cbafc..5675b91 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -275,8 +275,8 @@ BUILT_SOURCES += $(STATE_PROTOS)
 CLEANFILES += $(STATE_PROTOS)
 
 DOCKER_PROVISIONER_PROTOS =                                             \
-  slave/containerizer/provisioner/docker/message.pb.cc			\
-  slave/containerizer/provisioner/docker/message.pb.h
+  slave/containerizer/mesos/provisioner/docker/message.pb.cc		\
+  slave/containerizer/mesos/provisioner/docker/message.pb.h
 
 BUILT_SOURCES += $(DOCKER_PROVISIONER_PROTOS)
 CLEANFILES += $(DOCKER_PROVISIONER_PROTOS)
@@ -541,28 +541,28 @@ libmesos_no_3rdparty_la_SOURCES =					\
 	slave/containerizer/external_containerizer.cpp			\
 	slave/containerizer/fetcher.cpp					\
 	slave/containerizer/isolator.cpp				\
-	slave/containerizer/isolators/filesystem/posix.cpp		\
-	slave/containerizer/isolators/posix/disk.cpp			\
 	slave/containerizer/launcher.cpp				\
 	slave/containerizer/mesos/containerizer.cpp			\
+	slave/containerizer/mesos/isolators/filesystem/posix.cpp	\
+	slave/containerizer/mesos/isolators/posix/disk.cpp		\
 	slave/containerizer/mesos/launch.cpp				\
-        slave/containerizer/provisioner/paths.cpp			\
-	slave/containerizer/provisioner/provisioner.cpp			\
-        slave/containerizer/provisioner/store.cpp			\
-	slave/containerizer/provisioner/appc/paths.cpp			\
-	slave/containerizer/provisioner/appc/spec.cpp			\
-	slave/containerizer/provisioner/appc/store.cpp			\
-	slave/containerizer/provisioner/backend.cpp			\
-	slave/containerizer/provisioner/backends/copy.cpp		\
-	slave/containerizer/provisioner/docker/local_puller.cpp		\
-	slave/containerizer/provisioner/docker/message.proto		\
-	slave/containerizer/provisioner/docker/metadata_manager.cpp	\
-	slave/containerizer/provisioner/docker/paths.cpp		\
-	slave/containerizer/provisioner/docker/puller.cpp		\
-	slave/containerizer/provisioner/docker/registry_client.cpp	\
-	slave/containerizer/provisioner/docker/spec.cpp			\
-	slave/containerizer/provisioner/docker/store.cpp		\
-	slave/containerizer/provisioner/docker/token_manager.cpp	\
+	slave/containerizer/mesos/provisioner/paths.cpp			\
+	slave/containerizer/mesos/provisioner/provisioner.cpp		\
+	slave/containerizer/mesos/provisioner/store.cpp			\
+	slave/containerizer/mesos/provisioner/appc/paths.cpp		\
+	slave/containerizer/mesos/provisioner/appc/spec.cpp		\
+	slave/containerizer/mesos/provisioner/appc/store.cpp		\
+	slave/containerizer/mesos/provisioner/backend.cpp		\
+	slave/containerizer/mesos/provisioner/backends/copy.cpp		\
+	slave/containerizer/mesos/provisioner/docker/local_puller.cpp	\
+	slave/containerizer/mesos/provisioner/docker/message.proto	\
+	slave/containerizer/mesos/provisioner/docker/metadata_manager.cpp	\
+	slave/containerizer/mesos/provisioner/docker/paths.cpp		\
+	slave/containerizer/mesos/provisioner/docker/puller.cpp		\
+	slave/containerizer/mesos/provisioner/docker/registry_client.cpp	\
+	slave/containerizer/mesos/provisioner/docker/spec.cpp			\
+	slave/containerizer/mesos/provisioner/docker/store.cpp		\
+	slave/containerizer/mesos/provisioner/docker/token_manager.cpp	\
 	slave/resource_estimators/noop.cpp				\
 	usage/usage.cpp							\
 	v1/attributes.cpp						\
@@ -730,14 +730,14 @@ if OS_LINUX
   libmesos_no_3rdparty_la_SOURCES += linux/fs.cpp
   libmesos_no_3rdparty_la_SOURCES += linux/perf.cpp
   libmesos_no_3rdparty_la_SOURCES += linux/systemd.cpp
-  libmesos_no_3rdparty_la_SOURCES += slave/containerizer/isolators/cgroups/cpushare.cpp
-  libmesos_no_3rdparty_la_SOURCES += slave/containerizer/isolators/cgroups/mem.cpp
-  libmesos_no_3rdparty_la_SOURCES += slave/containerizer/isolators/cgroups/perf_event.cpp
-  libmesos_no_3rdparty_la_SOURCES += slave/containerizer/isolators/namespaces/pid.cpp
-  libmesos_no_3rdparty_la_SOURCES += slave/containerizer/isolators/filesystem/linux.cpp
-  libmesos_no_3rdparty_la_SOURCES += slave/containerizer/isolators/filesystem/shared.cpp
   libmesos_no_3rdparty_la_SOURCES += slave/containerizer/linux_launcher.cpp
-  libmesos_no_3rdparty_la_SOURCES += slave/containerizer/provisioner/backends/bind.cpp
+  libmesos_no_3rdparty_la_SOURCES += slave/containerizer/mesos/isolators/cgroups/cpushare.cpp
+  libmesos_no_3rdparty_la_SOURCES += slave/containerizer/mesos/isolators/cgroups/mem.cpp
+  libmesos_no_3rdparty_la_SOURCES += slave/containerizer/mesos/isolators/cgroups/perf_event.cpp
+  libmesos_no_3rdparty_la_SOURCES += slave/containerizer/mesos/isolators/namespaces/pid.cpp
+  libmesos_no_3rdparty_la_SOURCES += slave/containerizer/mesos/isolators/filesystem/linux.cpp
+  libmesos_no_3rdparty_la_SOURCES += slave/containerizer/mesos/isolators/filesystem/shared.cpp
+  libmesos_no_3rdparty_la_SOURCES += slave/containerizer/mesos/provisioner/backends/bind.cpp
 else
   EXTRA_DIST += linux/cgroups.cpp
   EXTRA_DIST += linux/fs.cpp
@@ -781,8 +781,8 @@ if WITH_NETWORK_ISOLATOR
 	linux/routing/queueing/statistics.hpp
 
   libmesos_no_3rdparty_la_SOURCES +=					\
-	slave/containerizer/isolators/network/port_mapping.cpp		\
-	slave/containerizer/isolators/network/port_mapping.hpp
+	slave/containerizer/mesos/isolators/network/port_mapping.cpp	\
+	slave/containerizer/mesos/isolators/network/port_mapping.hpp
 endif
 
 libmesos_no_3rdparty_la_SOURCES +=					\
@@ -852,36 +852,36 @@ libmesos_no_3rdparty_la_SOURCES +=					\
 	slave/containerizer/isolator.hpp				\
 	slave/containerizer/launcher.hpp				\
 	slave/containerizer/linux_launcher.hpp				\
-        slave/containerizer/provisioner/paths.hpp			\
-	slave/containerizer/provisioner/provisioner.hpp			\
-	slave/containerizer/provisioner/store.hpp			\
-	slave/containerizer/provisioner/appc/paths.hpp			\
-	slave/containerizer/provisioner/appc/spec.hpp			\
-	slave/containerizer/provisioner/appc/store.hpp			\
-	slave/containerizer/provisioner/backend.hpp			\
-	slave/containerizer/provisioner/backends/bind.hpp		\
-	slave/containerizer/provisioner/backends/copy.hpp		\
-	slave/containerizer/provisioner/docker/local_puller.hpp		\
-	slave/containerizer/provisioner/docker/message.hpp		\
-	slave/containerizer/provisioner/docker/metadata_manager.hpp	\
-	slave/containerizer/provisioner/docker/paths.hpp		\
-	slave/containerizer/provisioner/docker/puller.hpp		\
-	slave/containerizer/provisioner/docker/registry_client.hpp	\
-	slave/containerizer/provisioner/docker/spec.hpp			\
-	slave/containerizer/provisioner/docker/store.hpp		\
-	slave/containerizer/provisioner/docker/token_manager.hpp	\
-	slave/containerizer/isolators/posix.hpp				\
-	slave/containerizer/isolators/posix/disk.hpp			\
-	slave/containerizer/isolators/cgroups/constants.hpp		\
-	slave/containerizer/isolators/cgroups/cpushare.hpp		\
-	slave/containerizer/isolators/cgroups/mem.hpp			\
-	slave/containerizer/isolators/cgroups/perf_event.hpp		\
-	slave/containerizer/isolators/namespaces/pid.hpp		\
-	slave/containerizer/isolators/filesystem/linux.hpp		\
-	slave/containerizer/isolators/filesystem/posix.hpp		\
-	slave/containerizer/isolators/filesystem/shared.hpp		\
 	slave/containerizer/mesos/containerizer.hpp			\
+	slave/containerizer/mesos/isolators/posix.hpp			\
+	slave/containerizer/mesos/isolators/posix/disk.hpp		\
+	slave/containerizer/mesos/isolators/cgroups/constants.hpp	\
+	slave/containerizer/mesos/isolators/cgroups/cpushare.hpp	\
+	slave/containerizer/mesos/isolators/cgroups/mem.hpp		\
+	slave/containerizer/mesos/isolators/cgroups/perf_event.hpp	\
+	slave/containerizer/mesos/isolators/namespaces/pid.hpp		\
+	slave/containerizer/mesos/isolators/filesystem/linux.hpp	\
+	slave/containerizer/mesos/isolators/filesystem/posix.hpp	\
+	slave/containerizer/mesos/isolators/filesystem/shared.hpp	\
 	slave/containerizer/mesos/launch.hpp				\
+	slave/containerizer/mesos/provisioner/paths.hpp			\
+	slave/containerizer/mesos/provisioner/provisioner.hpp		\
+	slave/containerizer/mesos/provisioner/store.hpp			\
+	slave/containerizer/mesos/provisioner/appc/paths.hpp		\
+	slave/containerizer/mesos/provisioner/appc/spec.hpp		\
+	slave/containerizer/mesos/provisioner/appc/store.hpp		\
+	slave/containerizer/mesos/provisioner/backend.hpp		\
+	slave/containerizer/mesos/provisioner/backends/bind.hpp		\
+	slave/containerizer/mesos/provisioner/backends/copy.hpp		\
+	slave/containerizer/mesos/provisioner/docker/local_puller.hpp	\
+	slave/containerizer/mesos/provisioner/docker/message.hpp	\
+	slave/containerizer/mesos/provisioner/docker/metadata_manager.hpp	\
+	slave/containerizer/mesos/provisioner/docker/paths.hpp		\
+	slave/containerizer/mesos/provisioner/docker/puller.hpp		\
+	slave/containerizer/mesos/provisioner/docker/registry_client.hpp	\
+	slave/containerizer/mesos/provisioner/docker/spec.hpp			\
+	slave/containerizer/mesos/provisioner/docker/store.hpp		\
+	slave/containerizer/mesos/provisioner/docker/token_manager.hpp	\
 	slave/qos_controllers/noop.hpp					\
 	slave/resource_estimators/noop.hpp				\
 	tests/cluster.hpp						\
@@ -1120,7 +1120,7 @@ mesos_containerizer_LDADD = libmesos.la $(LDADD)
 
 if WITH_NETWORK_ISOLATOR
 pkglibexec_PROGRAMS += mesos-network-helper
-mesos_network_helper_SOURCES = slave/containerizer/isolators/network/helper.cpp
+mesos_network_helper_SOURCES = slave/containerizer/mesos/isolators/network/helper.cpp
 mesos_network_helper_CPPFLAGS = $(MESOS_CPPFLAGS)
 mesos_network_helper_LDADD = libmesos.la $(LDADD)
 endif

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/docker/docker.cpp
----------------------------------------------------------------------
diff --git a/src/docker/docker.cpp b/src/docker/docker.cpp
index 56d63dc..4ebca66 100755
--- a/src/docker/docker.cpp
+++ b/src/docker/docker.cpp
@@ -38,8 +38,8 @@
 #include "linux/cgroups.hpp"
 #endif // __linux__
 
-#include "slave/containerizer/isolators/cgroups/cpushare.hpp"
-#include "slave/containerizer/isolators/cgroups/mem.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/cpushare.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/mem.hpp"
 
 #include "slave/constants.hpp"
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/examples/test_isolator_module.cpp
----------------------------------------------------------------------
diff --git a/src/examples/test_isolator_module.cpp b/src/examples/test_isolator_module.cpp
index 577dfca..8123603 100644
--- a/src/examples/test_isolator_module.cpp
+++ b/src/examples/test_isolator_module.cpp
@@ -25,7 +25,7 @@
 
 #include <stout/try.hpp>
 
-#include "slave/containerizer/isolators/posix.hpp"
+#include "slave/containerizer/mesos/isolators/posix.hpp"
 #include "slave/flags.hpp"
 
 using namespace mesos;

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/docker.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/docker.cpp b/src/slave/containerizer/docker.cpp
index 7022958..276b6e0 100644
--- a/src/slave/containerizer/docker.cpp
+++ b/src/slave/containerizer/docker.cpp
@@ -48,7 +48,7 @@
 #include "slave/containerizer/docker.hpp"
 #include "slave/containerizer/fetcher.hpp"
 
-#include "slave/containerizer/isolators/cgroups/constants.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/constants.hpp"
 
 #include "usage/usage.hpp"
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/cgroups/constants.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/cgroups/constants.hpp b/src/slave/containerizer/isolators/cgroups/constants.hpp
deleted file mode 100644
index db9dde0..0000000
--- a/src/slave/containerizer/isolators/cgroups/constants.hpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __CGROUPS_ISOLATOR_CONSTANTS_HPP__
-#define __CGROUPS_ISOLATOR_CONSTANTS_HPP__
-
-#include <stout/bytes.hpp>
-#include <stout/duration.hpp>
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// CPU subsystem constants.
-const uint64_t CPU_SHARES_PER_CPU = 1024;
-const uint64_t CPU_SHARES_PER_CPU_REVOCABLE = 10;
-const uint64_t MIN_CPU_SHARES = 2; // Linux constant.
-const Duration CPU_CFS_PERIOD = Milliseconds(100); // Linux default.
-const Duration MIN_CPU_CFS_QUOTA = Milliseconds(1);
-
-
-// Memory subsystem constants.
-const Bytes MIN_MEMORY = Megabytes(32);
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __CGROUPS_ISOLATOR_CONSTANTS_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/cgroups/cpushare.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/cgroups/cpushare.cpp b/src/slave/containerizer/isolators/cgroups/cpushare.cpp
deleted file mode 100644
index ba748c6..0000000
--- a/src/slave/containerizer/isolators/cgroups/cpushare.cpp
+++ /dev/null
@@ -1,572 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdint.h>
-
-#include <mesos/type_utils.hpp>
-#include <mesos/values.hpp>
-
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/pid.hpp>
-
-#include <stout/bytes.hpp>
-#include <stout/check.hpp>
-#include <stout/error.hpp>
-#include <stout/foreach.hpp>
-#include <stout/hashset.hpp>
-#include <stout/nothing.hpp>
-#include <stout/os.hpp>
-#include <stout/path.hpp>
-#include <stout/stringify.hpp>
-#include <stout/try.hpp>
-
-#include "linux/cgroups.hpp"
-
-#include "slave/containerizer/isolators/cgroups/cpushare.hpp"
-
-using namespace process;
-
-using std::list;
-using std::set;
-using std::string;
-using std::vector;
-
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::ContainerState;
-using mesos::slave::Isolator;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-CgroupsCpushareIsolatorProcess::CgroupsCpushareIsolatorProcess(
-    const Flags& _flags,
-    const hashmap<string, string>& _hierarchies,
-    const vector<string>& _subsystems)
-  : flags(_flags),
-    hierarchies(_hierarchies),
-    subsystems(_subsystems) {}
-
-
-CgroupsCpushareIsolatorProcess::~CgroupsCpushareIsolatorProcess() {}
-
-
-Try<Isolator*> CgroupsCpushareIsolatorProcess::create(const Flags& flags)
-{
-  Try<string> hierarchyCpu = cgroups::prepare(
-        flags.cgroups_hierarchy,
-        "cpu",
-        flags.cgroups_root);
-
-  if (hierarchyCpu.isError()) {
-    return Error(
-        "Failed to prepare hierarchy for cpu subsystem: " +
-        hierarchyCpu.error());
-  }
-
-  Try<string> hierarchyCpuacct = cgroups::prepare(
-        flags.cgroups_hierarchy,
-        "cpuacct",
-        flags.cgroups_root);
-
-  if (hierarchyCpuacct.isError()) {
-    return Error(
-        "Failed to prepare hierarchy for cpuacct subsystem: " +
-        hierarchyCpuacct.error());
-  }
-
-  hashmap<string, string> hierarchies;
-  vector<string> subsystems;
-
-  hierarchies["cpu"] = hierarchyCpu.get();
-  hierarchies["cpuacct"] = hierarchyCpuacct.get();
-
-  if (hierarchyCpu.get() == hierarchyCpuacct.get()) {
-    // Subsystem cpu and cpuacct are co-mounted (e.g., systemd).
-    hierarchies["cpu,cpuacct"] = hierarchyCpu.get();
-    subsystems.push_back("cpu,cpuacct");
-
-    // Ensure that no other subsystem is attached to the hierarchy.
-    Try<set<string>> _subsystems = cgroups::subsystems(hierarchyCpu.get());
-    if (_subsystems.isError()) {
-      return Error(
-          "Failed to get the list of attached subsystems for hierarchy " +
-          hierarchyCpu.get());
-    } else if (_subsystems.get().size() != 2) {
-      return Error(
-          "Unexpected subsystems found attached to the hierarchy " +
-          hierarchyCpu.get());
-    }
-  } else {
-    // Subsystem cpu and cpuacct are mounted separately.
-    subsystems.push_back("cpu");
-    subsystems.push_back("cpuacct");
-
-    // Ensure that no other subsystem is attached to each of the
-    // hierarchies.
-    Try<set<string>> _subsystems = cgroups::subsystems(hierarchyCpu.get());
-    if (_subsystems.isError()) {
-      return Error(
-          "Failed to get the list of attached subsystems for hierarchy " +
-          hierarchyCpu.get());
-    } else if (_subsystems.get().size() != 1) {
-      return Error(
-          "Unexpected subsystems found attached to the hierarchy " +
-          hierarchyCpu.get());
-    }
-
-    _subsystems = cgroups::subsystems(hierarchyCpuacct.get());
-    if (_subsystems.isError()) {
-      return Error(
-          "Failed to get the list of attached subsystems for hierarchy " +
-          hierarchyCpuacct.get());
-    } else if (_subsystems.get().size() != 1) {
-      return Error(
-          "Unexpected subsystems found attached to the hierarchy " +
-          hierarchyCpuacct.get());
-    }
-  }
-
-  if (flags.cgroups_enable_cfs) {
-    Try<bool> exists = cgroups::exists(
-        hierarchies["cpu"],
-        flags.cgroups_root,
-        "cpu.cfs_quota_us");
-
-    if (exists.isError() || !exists.get()) {
-      return Error(
-          "Failed to find 'cpu.cfs_quota_us'. Your kernel "
-          "might be too old to use the CFS cgroups feature.");
-    }
-  }
-
-  process::Owned<MesosIsolatorProcess> process(
-      new CgroupsCpushareIsolatorProcess(flags, hierarchies, subsystems));
-
-  return new MesosIsolator(process);
-}
-
-
-Future<Nothing> CgroupsCpushareIsolatorProcess::recover(
-    const list<ContainerState>& states,
-    const hashset<ContainerID>& orphans)
-{
-  foreach (const ContainerState& state, states) {
-    const ContainerID& containerId = state.container_id();
-    const string cgroup = path::join(flags.cgroups_root, containerId.value());
-
-    Try<bool> exists = cgroups::exists(hierarchies["cpu"], cgroup);
-    if (exists.isError()) {
-      foreachvalue (Info* info, infos) {
-        delete info;
-      }
-      infos.clear();
-      return Failure(
-          "Failed to check cgroup for container " + stringify(containerId));
-    }
-
-    if (!exists.get()) {
-      // This may occur if the executor has exited and the isolator
-      // has destroyed the cgroup but the slave dies before noticing
-      // this. This will be detected when the containerizer tries to
-      // monitor the executor's pid.
-      LOG(WARNING) << "Couldn't find cgroup for container " << containerId;
-      continue;
-    }
-
-    infos[containerId] = new Info(containerId, cgroup);
-  }
-
-  // Remove orphan cgroups.
-  foreach (const string& subsystem, subsystems) {
-    Try<vector<string>> cgroups = cgroups::get(
-        hierarchies[subsystem],
-        flags.cgroups_root);
-
-    if (cgroups.isError()) {
-      foreachvalue (Info* info, infos) {
-        delete info;
-      }
-      infos.clear();
-      return Failure(cgroups.error());
-    }
-
-    foreach (const string& cgroup, cgroups.get()) {
-      // Ignore the slave cgroup (see the --slave_subsystems flag).
-      // TODO(idownes): Remove this when the cgroups layout is
-      // updated, see MESOS-1185.
-      if (cgroup == path::join(flags.cgroups_root, "slave")) {
-        continue;
-      }
-
-      ContainerID containerId;
-      containerId.set_value(Path(cgroup).basename());
-
-      if (infos.contains(containerId)) {
-        continue;
-      }
-
-      // Known orphan cgroups will be destroyed by the containerizer
-      // using the normal cleanup path. See MESOS-2367 for details.
-      if (orphans.contains(containerId)) {
-        infos[containerId] = new Info(containerId, cgroup);
-        continue;
-      }
-
-      LOG(INFO) << "Removing unknown orphaned cgroup '"
-                << path::join(subsystem, cgroup) << "'";
-
-      // We don't wait on the destroy as we don't want to block recovery.
-      cgroups::destroy(
-          hierarchies[subsystem],
-          cgroup,
-          cgroups::DESTROY_TIMEOUT);
-    }
-  }
-
-  return Nothing();
-}
-
-
-Future<Option<ContainerPrepareInfo>> CgroupsCpushareIsolatorProcess::prepare(
-    const ContainerID& containerId,
-    const ExecutorInfo& executorInfo,
-    const string& directory,
-    const Option<string>& user)
-{
-  if (infos.contains(containerId)) {
-    return Failure("Container has already been prepared");
-  }
-
-  // TODO(bmahler): Don't insert into 'infos' unless we create the
-  // cgroup successfully. It's safe for now because 'cleanup' gets
-  // called if we return a Failure, but cleanup will fail because the
-  // cgroup does not exist when cgroups::destroy is called.
-  Info* info = new Info(
-      containerId, path::join(flags.cgroups_root, containerId.value()));
-
-  infos[containerId] = info;
-
-  foreach (const string& subsystem, subsystems) {
-    Try<bool> exists = cgroups::exists(hierarchies[subsystem], info->cgroup);
-    if (exists.isError()) {
-      return Failure("Failed to prepare isolator: " + exists.error());
-    } else if (exists.get()) {
-      return Failure("Failed to prepare isolator: cgroup already exists");
-    }
-
-    Try<Nothing> create = cgroups::create(hierarchies[subsystem], info->cgroup);
-    if (create.isError()) {
-      return Failure("Failed to prepare isolator: " + create.error());
-    }
-
-    // Chown the cgroup so the executor can create nested cgroups. Do
-    // not recurse so the control files are still owned by the slave
-    // user and thus cannot be changed by the executor.
-    if (user.isSome()) {
-      Try<Nothing> chown = os::chown(
-          user.get(),
-          path::join(hierarchies[subsystem], info->cgroup),
-          false);
-      if (chown.isError()) {
-        return Failure("Failed to prepare isolator: " + chown.error());
-      }
-    }
-  }
-
-  return update(containerId, executorInfo.resources())
-    .then([]() -> Future<Option<ContainerPrepareInfo>> {
-      return None();
-    });
-}
-
-
-Future<Nothing> CgroupsCpushareIsolatorProcess::isolate(
-    const ContainerID& containerId,
-    pid_t pid)
-{
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  Info* info = CHECK_NOTNULL(infos[containerId]);
-
-  CHECK_NONE(info->pid);
-  info->pid = pid;
-
-  foreach (const string& subsystem, subsystems) {
-    Try<Nothing> assign = cgroups::assign(
-        hierarchies[subsystem],
-        info->cgroup,
-        pid);
-
-    if (assign.isError()) {
-      LOG(ERROR) << "Failed to assign container '" << info->containerId
-                 << " to its own cgroup '"
-                 << path::join(hierarchies[subsystem], info->cgroup)
-                 << "' : " << assign.error();
-
-      return Failure("Failed to isolate container: " + assign.error());
-    }
-  }
-
-  return Nothing();
-}
-
-
-Future<ContainerLimitation> CgroupsCpushareIsolatorProcess::watch(
-    const ContainerID& containerId)
-{
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  CHECK_NOTNULL(infos[containerId]);
-
-  return infos[containerId]->limitation.future();
-}
-
-
-Future<Nothing> CgroupsCpushareIsolatorProcess::update(
-    const ContainerID& containerId,
-    const Resources& resources)
-{
-  if (resources.cpus().isNone()) {
-    return Failure("No cpus resource given");
-  }
-
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  const Option<string>& hierarchy = hierarchies.get("cpu");
-  if (hierarchy.isNone()) {
-    return Failure("No 'cpu' hierarchy");
-  }
-
-  Info* info = CHECK_NOTNULL(infos[containerId]);
-  info->resources = resources;
-
-  double cpus = resources.cpus().get();
-
-  // Always set cpu.shares.
-  uint64_t shares;
-
-  if (flags.revocable_cpu_low_priority &&
-      resources.revocable().cpus().isSome()) {
-    shares = std::max(
-        (uint64_t) (CPU_SHARES_PER_CPU_REVOCABLE * cpus),
-        MIN_CPU_SHARES);
-  } else {
-    shares = std::max(
-        (uint64_t) (CPU_SHARES_PER_CPU * cpus),
-        MIN_CPU_SHARES);
-  }
-
-  Try<Nothing> write = cgroups::cpu::shares(
-      hierarchy.get(),
-      info->cgroup,
-      shares);
-
-  if (write.isError()) {
-    return Failure("Failed to update 'cpu.shares': " + write.error());
-  }
-
-  LOG(INFO) << "Updated 'cpu.shares' to " << shares
-            << " (cpus " << cpus << ")"
-            << " for container " << containerId;
-
-  // Set cfs quota if enabled.
-  if (flags.cgroups_enable_cfs) {
-    write = cgroups::cpu::cfs_period_us(
-        hierarchy.get(),
-        info->cgroup,
-        CPU_CFS_PERIOD);
-
-    if (write.isError()) {
-      return Failure("Failed to update 'cpu.cfs_period_us': " + write.error());
-    }
-
-    Duration quota = std::max(CPU_CFS_PERIOD * cpus, MIN_CPU_CFS_QUOTA);
-
-    write = cgroups::cpu::cfs_quota_us(hierarchy.get(), info->cgroup, quota);
-    if (write.isError()) {
-      return Failure("Failed to update 'cpu.cfs_quota_us': " + write.error());
-    }
-
-    LOG(INFO) << "Updated 'cpu.cfs_period_us' to " << CPU_CFS_PERIOD
-              << " and 'cpu.cfs_quota_us' to " << quota
-              << " (cpus " << cpus << ")"
-              << " for container " << containerId;
-  }
-
-  return Nothing();
-}
-
-
-Future<ResourceStatistics> CgroupsCpushareIsolatorProcess::usage(
-    const ContainerID& containerId)
-{
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  Info* info = CHECK_NOTNULL(infos[containerId]);
-
-  ResourceStatistics result;
-
-  // TODO(chzhcn): Getting the number of processes and threads is
-  // available as long as any cgroup subsystem is used so this best
-  // not be tied to a specific cgroup isolator. A better place is
-  // probably Linux Launcher, which uses the cgroup freezer subsystem.
-  // That requires some change for it to adopt the new semantics of
-  // reporting subsystem-independent cgroup usage.
-  // Note: The complexity of this operation is linear to the number of
-  // processes and threads in a container: the kernel has to allocate
-  // memory to contain the list of pids or tids; the userspace has to
-  // parse the cgroup files to get the size. If this proves to be a
-  // performance bottleneck, some kind of rate limiting mechanism
-  // needs to be employed.
-  if (flags.cgroups_cpu_enable_pids_and_tids_count) {
-    Try<std::set<pid_t>> pids =
-      cgroups::processes(hierarchies["cpuacct"], info->cgroup);
-    if (pids.isError()) {
-      return Failure("Failed to get number of processes: " + pids.error());
-    }
-
-    result.set_processes(pids.get().size());
-
-    Try<std::set<pid_t>> tids =
-      cgroups::threads(hierarchies["cpuacct"], info->cgroup);
-    if (tids.isError()) {
-      return Failure("Failed to get number of threads: " + tids.error());
-    }
-
-    result.set_threads(tids.get().size());
-  }
-
-  // Get the number of clock ticks, used for cpu accounting.
-  static long ticks = sysconf(_SC_CLK_TCK);
-
-  PCHECK(ticks > 0) << "Failed to get sysconf(_SC_CLK_TCK)";
-
-  // Add the cpuacct.stat information.
-  Try<hashmap<string, uint64_t>> stat = cgroups::stat(
-      hierarchies["cpuacct"],
-      info->cgroup,
-      "cpuacct.stat");
-
-  if (stat.isError()) {
-    return Failure("Failed to read cpuacct.stat: " + stat.error());
-  }
-
-  // TODO(bmahler): Add namespacing to cgroups to enforce the expected
-  // structure, e.g., cgroups::cpuacct::stat.
-  Option<uint64_t> user = stat.get().get("user");
-  Option<uint64_t> system = stat.get().get("system");
-
-  if (user.isSome() && system.isSome()) {
-    result.set_cpus_user_time_secs((double) user.get() / (double) ticks);
-    result.set_cpus_system_time_secs((double) system.get() / (double) ticks);
-  }
-
-  // Add the cpu.stat information only if CFS is enabled.
-  if (flags.cgroups_enable_cfs) {
-    stat = cgroups::stat(hierarchies["cpu"], info->cgroup, "cpu.stat");
-    if (stat.isError()) {
-      return Failure("Failed to read cpu.stat: " + stat.error());
-    }
-
-    Option<uint64_t> nr_periods = stat.get().get("nr_periods");
-    if (nr_periods.isSome()) {
-      result.set_cpus_nr_periods(nr_periods.get());
-    }
-
-    Option<uint64_t> nr_throttled = stat.get().get("nr_throttled");
-    if (nr_throttled.isSome()) {
-      result.set_cpus_nr_throttled(nr_throttled.get());
-    }
-
-    Option<uint64_t> throttled_time = stat.get().get("throttled_time");
-    if (throttled_time.isSome()) {
-      result.set_cpus_throttled_time_secs(
-          Nanoseconds(throttled_time.get()).secs());
-    }
-  }
-
-  return result;
-}
-
-
-Future<Nothing> CgroupsCpushareIsolatorProcess::cleanup(
-    const ContainerID& containerId)
-{
-  // Multiple calls may occur during test clean up.
-  if (!infos.contains(containerId)) {
-    VLOG(1) << "Ignoring cleanup request for unknown container: "
-            << containerId;
-
-    return Nothing();
-  }
-
-  Info* info = CHECK_NOTNULL(infos[containerId]);
-
-  list<Future<Nothing>> futures;
-  foreach (const string& subsystem, subsystems) {
-    futures.push_back(cgroups::destroy(
-        hierarchies[subsystem],
-        info->cgroup,
-        cgroups::DESTROY_TIMEOUT));
-  }
-
-  return collect(futures)
-    .onAny(defer(PID<CgroupsCpushareIsolatorProcess>(this),
-                &CgroupsCpushareIsolatorProcess::_cleanup,
-                containerId,
-                lambda::_1))
-    .then([]() { return Nothing(); });
-}
-
-
-Future<list<Nothing>> CgroupsCpushareIsolatorProcess::_cleanup(
-    const ContainerID& containerId,
-    const Future<list<Nothing>>& future)
-{
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  CHECK_NOTNULL(infos[containerId]);
-
-  if (!future.isReady()) {
-    return Failure(
-        "Failed to clean up container " + stringify(containerId) +
-        " : " + (future.isFailed() ? future.failure() : "discarded"));
-  }
-
-  delete infos[containerId];
-  infos.erase(containerId);
-
-  return future;
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/cgroups/cpushare.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/cgroups/cpushare.hpp b/src/slave/containerizer/isolators/cgroups/cpushare.hpp
deleted file mode 100644
index 54b83a7..0000000
--- a/src/slave/containerizer/isolators/cgroups/cpushare.hpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __CPUSHARE_ISOLATOR_HPP__
-#define __CPUSHARE_ISOLATOR_HPP__
-
-#include <sys/types.h>
-
-#include <string>
-#include <vector>
-
-#include <process/future.hpp>
-
-#include <stout/hashmap.hpp>
-#include <stout/option.hpp>
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-#include "slave/containerizer/isolators/cgroups/constants.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// Use the Linux cpu cgroup controller for cpu isolation which uses the
-// Completely Fair Scheduler (CFS).
-// - cpushare implements proportionally weighted scheduling.
-// - cfs implements hard quota based scheduling.
-class CgroupsCpushareIsolatorProcess : public MesosIsolatorProcess
-{
-public:
-  static Try<mesos::slave::Isolator*> create(const Flags& flags);
-
-  virtual ~CgroupsCpushareIsolatorProcess();
-
-  virtual process::Future<Nothing> recover(
-      const std::list<mesos::slave::ContainerState>& states,
-      const hashset<ContainerID>& orphans);
-
-  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& user);
-
-  virtual process::Future<Nothing> isolate(
-      const ContainerID& containerId,
-      pid_t pid);
-
-  virtual process::Future<mesos::slave::ContainerLimitation> watch(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> update(
-      const ContainerID& containerId,
-      const Resources& resources);
-
-  virtual process::Future<ResourceStatistics> usage(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> cleanup(
-      const ContainerID& containerId);
-
-private:
-  CgroupsCpushareIsolatorProcess(
-      const Flags& flags,
-      const hashmap<std::string, std::string>& hierarchies,
-      const std::vector<std::string>& subsystems);
-
-  virtual process::Future<std::list<Nothing>> _cleanup(
-      const ContainerID& containerId,
-      const process::Future<std::list<Nothing>>& future);
-
-  struct Info
-  {
-    Info(const ContainerID& _containerId, const std::string& _cgroup)
-      : containerId(_containerId), cgroup(_cgroup) {}
-
-    const ContainerID containerId;
-    const std::string cgroup;
-    Option<pid_t> pid;
-    Option<Resources> resources;
-
-    process::Promise<mesos::slave::ContainerLimitation> limitation;
-  };
-
-  const Flags flags;
-
-  // Map from subsystem to hierarchy.
-  hashmap<std::string, std::string> hierarchies;
-
-  // Subsystems used for this isolator. Typically, there are two
-  // elements in the vector: 'cpu' and 'cpuacct'. If cpu and cpuacct
-  // systems are co-mounted (e.g., systems using systemd), then there
-  // will be only one element in the vector which is 'cpu,cpuacct'.
-  std::vector<std::string> subsystems;
-
-  // TODO(bmahler): Use Owned<Info>.
-  hashmap<ContainerID, Info*> infos;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __CPUSHARE_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/cgroups/mem.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/cgroups/mem.cpp b/src/slave/containerizer/isolators/cgroups/mem.cpp
deleted file mode 100644
index 55fa6f4..0000000
--- a/src/slave/containerizer/isolators/cgroups/mem.cpp
+++ /dev/null
@@ -1,733 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdint.h>
-
-#include <list>
-#include <vector>
-
-#include <mesos/type_utils.hpp>
-#include <mesos/values.hpp>
-
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/pid.hpp>
-
-#include <stout/bytes.hpp>
-#include <stout/check.hpp>
-#include <stout/error.hpp>
-#include <stout/foreach.hpp>
-#include <stout/hashset.hpp>
-#include <stout/lambda.hpp>
-#include <stout/os.hpp>
-#include <stout/path.hpp>
-#include <stout/stringify.hpp>
-#include <stout/try.hpp>
-
-#include "common/protobuf_utils.hpp"
-
-#include "slave/containerizer/isolators/cgroups/constants.hpp"
-#include "slave/containerizer/isolators/cgroups/mem.hpp"
-
-using namespace process;
-
-using cgroups::memory::pressure::Level;
-using cgroups::memory::pressure::Counter;
-
-using std::list;
-using std::ostringstream;
-using std::set;
-using std::string;
-using std::vector;
-
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::ContainerState;
-using mesos::slave::Isolator;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-static const vector<Level> levels()
-{
-  return {Level::LOW, Level::MEDIUM, Level::CRITICAL};
-}
-
-
-CgroupsMemIsolatorProcess::CgroupsMemIsolatorProcess(
-    const Flags& _flags,
-    const string& _hierarchy,
-    const bool _limitSwap)
-  : flags(_flags),
-    hierarchy(_hierarchy),
-    limitSwap(_limitSwap) {}
-
-
-CgroupsMemIsolatorProcess::~CgroupsMemIsolatorProcess() {}
-
-
-Try<Isolator*> CgroupsMemIsolatorProcess::create(const Flags& flags)
-{
-  Try<string> hierarchy = cgroups::prepare(
-      flags.cgroups_hierarchy,
-      "memory",
-      flags.cgroups_root);
-
-  if (hierarchy.isError()) {
-    return Error("Failed to create memory cgroup: " + hierarchy.error());
-  }
-
-  // Ensure that no other subsystem is attached to the hierarchy.
-  Try<set<string>> subsystems = cgroups::subsystems(hierarchy.get());
-  if (subsystems.isError()) {
-    return Error(
-        "Failed to get the list of attached subsystems for hierarchy " +
-        hierarchy.get());
-  } else if (subsystems.get().size() != 1) {
-    return Error(
-        "Unexpected subsystems found attached to the hierarchy " +
-        hierarchy.get());
-  }
-
-  // Make sure the kernel OOM-killer is enabled.
-  // The Mesos OOM handler, as implemented, is not capable of handling
-  // the oom condition by itself safely given the limitations Linux
-  // imposes on this code path.
-  Try<Nothing> enable = cgroups::memory::oom::killer::enable(
-      hierarchy.get(), flags.cgroups_root);
-
-  if (enable.isError()) {
-    return Error(enable.error());
-  }
-
-  // Test if memory pressure listening is enabled. We test that on the
-  // root cgroup. We rely on 'Counter::create' to test if memory
-  // pressure listening is enabled or not. The created counters will
-  // be destroyed immediately.
-  foreach (Level level, levels()) {
-    Try<Owned<Counter>> counter = Counter::create(
-        hierarchy.get(),
-        flags.cgroups_root,
-        level);
-
-    if (counter.isError()) {
-      return Error("Failed to listen on " + stringify(level) +
-                   " memory events: " + counter.error());
-    }
-  }
-
-  // Determine whether to limit swap or not.
-  bool limitSwap = false;
-
-  if (flags.cgroups_limit_swap) {
-    Result<Bytes> check = cgroups::memory::memsw_limit_in_bytes(
-        hierarchy.get(), flags.cgroups_root);
-
-    if (check.isError()) {
-      return Error(
-          "Failed to read 'memory.memsw.limit_in_bytes': " +
-          check.error());
-    } else if (check.isNone()) {
-      return Error("'memory.memsw.limit_in_bytes' is not available");
-    }
-
-    limitSwap = true;
-  }
-
-  process::Owned<MesosIsolatorProcess> process(
-      new CgroupsMemIsolatorProcess(flags, hierarchy.get(), limitSwap));
-
-  return new MesosIsolator(process);
-}
-
-
-Future<Nothing> CgroupsMemIsolatorProcess::recover(
-    const list<ContainerState>& states,
-    const hashset<ContainerID>& orphans)
-{
-  foreach (const ContainerState& state, states) {
-    const ContainerID& containerId = state.container_id();
-    const string cgroup = path::join(flags.cgroups_root, containerId.value());
-
-    Try<bool> exists = cgroups::exists(hierarchy, cgroup);
-    if (exists.isError()) {
-      foreachvalue (Info* info, infos) {
-        delete info;
-      }
-      infos.clear();
-      return Failure("Failed to check cgroup for container '" +
-                     stringify(containerId) + "'");
-    }
-
-    if (!exists.get()) {
-      VLOG(1) << "Couldn't find cgroup for container " << containerId;
-      // This may occur if the executor has exited and the isolator
-      // has destroyed the cgroup but the slave dies before noticing
-      // this. This will be detected when the containerizer tries to
-      // monitor the executor's pid.
-      continue;
-    }
-
-    infos[containerId] = new Info(containerId, cgroup);
-
-    oomListen(containerId);
-    pressureListen(containerId);
-  }
-
-  // Remove orphan cgroups.
-  Try<vector<string>> cgroups = cgroups::get(hierarchy, flags.cgroups_root);
-  if (cgroups.isError()) {
-    foreachvalue (Info* info, infos) {
-      delete info;
-    }
-    infos.clear();
-    return Failure(cgroups.error());
-  }
-
-  foreach (const string& cgroup, cgroups.get()) {
-    // Ignore the slave cgroup (see the --slave_subsystems flag).
-    // TODO(idownes): Remove this when the cgroups layout is updated,
-    // see MESOS-1185.
-    if (cgroup == path::join(flags.cgroups_root, "slave")) {
-      continue;
-    }
-
-    ContainerID containerId;
-    containerId.set_value(Path(cgroup).basename());
-
-    if (infos.contains(containerId)) {
-      continue;
-    }
-
-    // Known orphan cgroups will be destroyed by the containerizer
-    // using the normal cleanup path. See MESOS-2367 for details.
-    if (orphans.contains(containerId)) {
-      infos[containerId] = new Info(containerId, cgroup);
-      continue;
-    }
-
-    LOG(INFO) << "Removing unknown orphaned cgroup '" << cgroup << "'";
-
-    // We don't wait on the destroy as we don't want to block recovery.
-    cgroups::destroy(hierarchy, cgroup, cgroups::DESTROY_TIMEOUT);
-  }
-
-  return Nothing();
-}
-
-
-Future<Option<ContainerPrepareInfo>> CgroupsMemIsolatorProcess::prepare(
-    const ContainerID& containerId,
-    const ExecutorInfo& executorInfo,
-    const string& directory,
-    const Option<string>& user)
-{
-  if (infos.contains(containerId)) {
-    return Failure("Container has already been prepared");
-  }
-
-  // TODO(bmahler): Don't insert into 'infos' unless we create the
-  // cgroup successfully. It's safe for now because 'cleanup' gets
-  // called if we return a Failure, but cleanup will fail because the
-  // cgroup does not exist when cgroups::destroy is called.
-  Info* info = new Info(
-      containerId, path::join(flags.cgroups_root, containerId.value()));
-
-  infos[containerId] = info;
-
-  // Create a cgroup for this container.
-  Try<bool> exists = cgroups::exists(hierarchy, info->cgroup);
-
-  if (exists.isError()) {
-    return Failure("Failed to prepare isolator: " + exists.error());
-  } else if (exists.get()) {
-    return Failure("Failed to prepare isolator: cgroup already exists");
-  }
-
-  Try<Nothing> create = cgroups::create(hierarchy, info->cgroup);
-  if (create.isError()) {
-    return Failure("Failed to prepare isolator: " + create.error());
-  }
-
-  // Chown the cgroup so the executor can create nested cgroups. Do
-  // not recurse so the control files are still owned by the slave
-  // user and thus cannot be changed by the executor.
-  if (user.isSome()) {
-    Try<Nothing> chown = os::chown(
-        user.get(),
-        path::join(hierarchy, info->cgroup),
-        false);
-    if (chown.isError()) {
-      return Failure("Failed to prepare isolator: " + chown.error());
-    }
-  }
-
-  oomListen(containerId);
-  pressureListen(containerId);
-
-  return update(containerId, executorInfo.resources())
-    .then([]() -> Future<Option<ContainerPrepareInfo>> {
-      return None();
-    });
-}
-
-
-Future<Nothing> CgroupsMemIsolatorProcess::isolate(
-    const ContainerID& containerId,
-    pid_t pid)
-{
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  Info* info = CHECK_NOTNULL(infos[containerId]);
-
-  CHECK_NONE(info->pid);
-  info->pid = pid;
-
-  Try<Nothing> assign = cgroups::assign(hierarchy, info->cgroup, pid);
-  if (assign.isError()) {
-    return Failure("Failed to assign container '" +
-                   stringify(info->containerId) + "' to its own cgroup '" +
-                   path::join(hierarchy, info->cgroup) +
-                   "' : " + assign.error());
-  }
-
-  return Nothing();
-}
-
-
-Future<ContainerLimitation> CgroupsMemIsolatorProcess::watch(
-    const ContainerID& containerId)
-{
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  CHECK_NOTNULL(infos[containerId]);
-
-  return infos[containerId]->limitation.future();
-}
-
-
-Future<Nothing> CgroupsMemIsolatorProcess::update(
-    const ContainerID& containerId,
-    const Resources& resources)
-{
-  if (resources.mem().isNone()) {
-    return Failure("No memory resource given");
-  }
-
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  Info* info = CHECK_NOTNULL(infos[containerId]);
-
-  // New limit.
-  Bytes mem = resources.mem().get();
-  Bytes limit = std::max(mem, MIN_MEMORY);
-
-  // Always set the soft limit.
-  Try<Nothing> write =
-    cgroups::memory::soft_limit_in_bytes(hierarchy, info->cgroup, limit);
-
-  if (write.isError()) {
-    return Failure(
-        "Failed to set 'memory.soft_limit_in_bytes': " + write.error());
-  }
-
-  LOG(INFO) << "Updated 'memory.soft_limit_in_bytes' to " << limit
-            << " for container " << containerId;
-
-  // Read the existing limit.
-  Try<Bytes> currentLimit =
-    cgroups::memory::limit_in_bytes(hierarchy, info->cgroup);
-
-  // NOTE: If limitSwap is (has been) used then both limit_in_bytes
-  // and memsw.limit_in_bytes will always be set to the same value.
-  if (currentLimit.isError()) {
-    return Failure(
-        "Failed to read 'memory.limit_in_bytes': " + currentLimit.error());
-  }
-
-  // Determine whether to set the hard limit. If this is the first
-  // time (info->pid.isNone()), or we're raising the existing limit,
-  // then we can update the hard limit safely. Otherwise, if we need
-  // to decrease 'memory.limit_in_bytes' we may induce an OOM if too
-  // much memory is in use. As a result, we only update the soft limit
-  // when the memory reservation is being reduced. This is probably
-  // okay if the machine has available resources.
-  // TODO(benh): Introduce a MemoryWatcherProcess which monitors the
-  // discrepancy between usage and soft limit and introduces a "manual
-  // oom" if necessary.
-  if (info->pid.isNone() || limit > currentLimit.get()) {
-    // We always set limit_in_bytes first and optionally set
-    // memsw.limit_in_bytes if limitSwap is true.
-    Try<Nothing> write = cgroups::memory::limit_in_bytes(
-        hierarchy, info->cgroup, limit);
-
-    if (write.isError()) {
-      return Failure(
-          "Failed to set 'memory.limit_in_bytes': " + write.error());
-    }
-
-    LOG(INFO) << "Updated 'memory.limit_in_bytes' to " << limit
-              << " for container " << containerId;
-
-    if (limitSwap) {
-      Try<bool> write = cgroups::memory::memsw_limit_in_bytes(
-          hierarchy, info->cgroup, limit);
-
-      if (write.isError()) {
-        return Failure(
-            "Failed to set 'memory.memsw.limit_in_bytes': " + write.error());
-      }
-
-      LOG(INFO) << "Updated 'memory.memsw.limit_in_bytes' to " << limit
-                << " for container " << containerId;
-    }
-  }
-
-  return Nothing();
-}
-
-
-Future<ResourceStatistics> CgroupsMemIsolatorProcess::usage(
-    const ContainerID& containerId)
-{
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  Info* info = CHECK_NOTNULL(infos[containerId]);
-
-  ResourceStatistics result;
-
-  // The rss from memory.stat is wrong in two dimensions:
-  //   1. It does not include child cgroups.
-  //   2. It does not include any file backed pages.
-  Try<Bytes> usage = cgroups::memory::usage_in_bytes(hierarchy, info->cgroup);
-  if (usage.isError()) {
-    return Failure("Failed to parse memory.usage_in_bytes: " + usage.error());
-  }
-
-  result.set_mem_total_bytes(usage.get().bytes());
-
-  if (limitSwap) {
-    Try<Bytes> usage =
-      cgroups::memory::memsw_usage_in_bytes(hierarchy, info->cgroup);
-    if (usage.isError()) {
-      return Failure(
-        "Failed to parse memory.memsw.usage_in_bytes: " + usage.error());
-    }
-
-    result.set_mem_total_memsw_bytes(usage.get().bytes());
-  }
-
-  // TODO(bmahler): Add namespacing to cgroups to enforce the expected
-  // structure, e.g, cgroups::memory::stat.
-  Try<hashmap<string, uint64_t>> stat =
-    cgroups::stat(hierarchy, info->cgroup, "memory.stat");
-  if (stat.isError()) {
-    return Failure("Failed to read memory.stat: " + stat.error());
-  }
-
-  Option<uint64_t> total_cache = stat.get().get("total_cache");
-  if (total_cache.isSome()) {
-    // TODO(chzhcn): mem_file_bytes is deprecated in 0.23.0 and will
-    // be removed in 0.24.0.
-    result.set_mem_file_bytes(total_cache.get());
-
-    result.set_mem_cache_bytes(total_cache.get());
-  }
-
-  Option<uint64_t> total_rss = stat.get().get("total_rss");
-  if (total_rss.isSome()) {
-    // TODO(chzhcn): mem_anon_bytes is deprecated in 0.23.0 and will
-    // be removed in 0.24.0.
-    result.set_mem_anon_bytes(total_rss.get());
-
-    result.set_mem_rss_bytes(total_rss.get());
-  }
-
-  Option<uint64_t> total_mapped_file = stat.get().get("total_mapped_file");
-  if (total_mapped_file.isSome()) {
-    result.set_mem_mapped_file_bytes(total_mapped_file.get());
-  }
-
-  Option<uint64_t> total_swap = stat.get().get("total_swap");
-  if (total_swap.isSome()) {
-    result.set_mem_swap_bytes(total_swap.get());
-  }
-
-  Option<uint64_t> total_unevictable = stat.get().get("total_unevictable");
-  if (total_unevictable.isSome()) {
-    result.set_mem_unevictable_bytes(total_unevictable.get());
-  }
-
-  // Get pressure counter readings.
-  list<Level> levels;
-  list<Future<uint64_t>> values;
-  foreachpair (Level level,
-               const Owned<Counter>& counter,
-               info->pressureCounters) {
-    levels.push_back(level);
-    values.push_back(counter->value());
-  }
-
-  return await(values)
-    .then(defer(PID<CgroupsMemIsolatorProcess>(this),
-                &CgroupsMemIsolatorProcess::_usage,
-                containerId,
-                result,
-                levels,
-                lambda::_1));
-}
-
-
-Future<ResourceStatistics> CgroupsMemIsolatorProcess::_usage(
-    const ContainerID& containerId,
-    ResourceStatistics result,
-    const list<Level>& levels,
-    const list<Future<uint64_t>>& values)
-{
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  list<Level>::const_iterator iterator = levels.begin();
-  foreach (const Future<uint64_t>& value, values) {
-    if (value.isReady()) {
-      switch (*iterator) {
-        case Level::LOW:
-          result.set_mem_low_pressure_counter(value.get());
-          break;
-        case Level::MEDIUM:
-          result.set_mem_medium_pressure_counter(value.get());
-          break;
-        case Level::CRITICAL:
-          result.set_mem_critical_pressure_counter(value.get());
-          break;
-      }
-    } else {
-      LOG(ERROR) << "Failed to listen on " << stringify(*iterator)
-                 << " pressure events for container " << containerId << ": "
-                 << (value.isFailed() ? value.failure() : "discarded");
-    }
-
-    ++iterator;
-  }
-
-  return result;
-}
-
-
-Future<Nothing> CgroupsMemIsolatorProcess::cleanup(
-    const ContainerID& containerId)
-{
-  // Multiple calls may occur during test clean up.
-  if (!infos.contains(containerId)) {
-    VLOG(1) << "Ignoring cleanup request for unknown container: "
-            << containerId;
-    return Nothing();
-  }
-
-  Info* info = CHECK_NOTNULL(infos[containerId]);
-
-  if (info->oomNotifier.isPending()) {
-    info->oomNotifier.discard();
-  }
-
-  return cgroups::destroy(hierarchy, info->cgroup, cgroups::DESTROY_TIMEOUT)
-    .onAny(defer(PID<CgroupsMemIsolatorProcess>(this),
-                 &CgroupsMemIsolatorProcess::_cleanup,
-                 containerId,
-                 lambda::_1));
-}
-
-
-Future<Nothing> CgroupsMemIsolatorProcess::_cleanup(
-    const ContainerID& containerId,
-    const Future<Nothing>& future)
-{
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  CHECK_NOTNULL(infos[containerId]);
-
-  if (!future.isReady()) {
-    return Failure("Failed to clean up container " + stringify(containerId) +
-                   " : " + (future.isFailed() ? future.failure()
-                                              : "discarded"));
-  }
-
-  delete infos[containerId];
-  infos.erase(containerId);
-
-  return Nothing();
-}
-
-
-void CgroupsMemIsolatorProcess::oomListen(
-    const ContainerID& containerId)
-{
-  CHECK(infos.contains(containerId));
-  Info* info = CHECK_NOTNULL(infos[containerId]);
-
-  info->oomNotifier = cgroups::memory::oom::listen(hierarchy, info->cgroup);
-
-  // If the listening fails immediately, something very wrong
-  // happened.  Therefore, we report a fatal error here.
-  if (info->oomNotifier.isFailed()) {
-    LOG(FATAL) << "Failed to listen for OOM events for container "
-               << containerId << ": "
-               << info->oomNotifier.failure();
-  }
-
-  LOG(INFO) << "Started listening for OOM events for container "
-            << containerId;
-
-  info->oomNotifier.onReady(defer(
-      PID<CgroupsMemIsolatorProcess>(this),
-      &CgroupsMemIsolatorProcess::oomWaited,
-      containerId,
-      lambda::_1));
-}
-
-
-void CgroupsMemIsolatorProcess::oomWaited(
-    const ContainerID& containerId,
-    const Future<Nothing>& future)
-{
-  if (future.isDiscarded()) {
-    LOG(INFO) << "Discarded OOM notifier for container "
-              << containerId;
-  } else if (future.isFailed()) {
-    LOG(ERROR) << "Listening on OOM events failed for container "
-               << containerId << ": " << future.failure();
-  } else {
-    // Out-of-memory event happened, call the handler.
-    LOG(INFO) << "OOM notifier is triggered for container " << containerId;
-    oom(containerId);
-  }
-}
-
-
-void CgroupsMemIsolatorProcess::oom(const ContainerID& containerId)
-{
-  if (!infos.contains(containerId)) {
-    // It is likely that process exited is executed before this
-    // function (e.g.  The kill and OOM events happen at the same
-    // time, and the process exit event arrives first.) Therefore, we
-    // should not report a fatal error here.
-    LOG(INFO) << "OOM detected for an already terminated executor";
-    return;
-  }
-
-  Info* info = CHECK_NOTNULL(infos[containerId]);
-
-  LOG(INFO) << "OOM detected for container " << containerId;
-
-  // Construct a "message" string to describe why the isolator
-  // destroyed the executor's cgroup (in order to assist in
-  // debugging).
-  ostringstream message;
-  message << "Memory limit exceeded: ";
-
-  // Output the requested memory limit.
-  // NOTE: If limitSwap is (has been) used then both limit_in_bytes
-  // and memsw.limit_in_bytes will always be set to the same value.
-  Try<Bytes> limit = cgroups::memory::limit_in_bytes(hierarchy, info->cgroup);
-
-  if (limit.isError()) {
-    LOG(ERROR) << "Failed to read 'memory.limit_in_bytes': "
-               << limit.error();
-  } else {
-    message << "Requested: " << limit.get() << " ";
-  }
-
-  // Output the maximum memory usage.
-  Try<Bytes> usage = cgroups::memory::max_usage_in_bytes(
-      hierarchy, info->cgroup);
-
-  if (usage.isError()) {
-    LOG(ERROR) << "Failed to read 'memory.max_usage_in_bytes': "
-               << usage.error();
-  } else {
-    message << "Maximum Used: " << usage.get() << "\n";
-  }
-
-  // Output 'memory.stat' of the cgroup to help with debugging.
-  // NOTE: With Kernel OOM-killer enabled these stats may not reflect
-  // memory state at time of OOM.
-  Try<string> read = cgroups::read(hierarchy, info->cgroup, "memory.stat");
-  if (read.isError()) {
-    LOG(ERROR) << "Failed to read 'memory.stat': " << read.error();
-  } else {
-    message << "\nMEMORY STATISTICS: \n" << read.get() << "\n";
-  }
-
-  LOG(INFO) << strings::trim(message.str()); // Trim the extra '\n' at the end.
-
-  // TODO(jieyu): This is not accurate if the memory resource is from
-  // a non-star role or spans roles (e.g., "*" and "role"). Ideally,
-  // we should save the resources passed in and report it here.
-  Resources mem = Resources::parse(
-      "mem",
-      stringify(usage.isSome() ? usage.get().megabytes() : 0),
-      "*").get();
-
-  info->limitation.set(
-      protobuf::slave::createContainerLimitation(
-          mem,
-          message.str(),
-          TaskStatus::REASON_CONTAINER_LIMITATION_MEMORY));
-}
-
-
-void CgroupsMemIsolatorProcess::pressureListen(
-    const ContainerID& containerId)
-{
-  CHECK(infos.contains(containerId));
-  Info* info = CHECK_NOTNULL(infos[containerId]);
-
-  foreach (Level level, levels()) {
-    Try<Owned<Counter>> counter = Counter::create(
-        hierarchy,
-        info->cgroup,
-        level);
-
-    if (counter.isError()) {
-      LOG(ERROR) << "Failed to listen on " << level << " memory pressure "
-                 << "events for container " << containerId << ": "
-                 << counter.error();
-    } else {
-      info->pressureCounters[level] = counter.get();
-
-      LOG(INFO) << "Started listening on " << level << " memory pressure "
-                << "events for container " << containerId;
-    }
-  }
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/cgroups/mem.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/cgroups/mem.hpp b/src/slave/containerizer/isolators/cgroups/mem.hpp
deleted file mode 100644
index 47f73c3..0000000
--- a/src/slave/containerizer/isolators/cgroups/mem.hpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __MEM_ISOLATOR_HPP__
-#define __MEM_ISOLATOR_HPP__
-
-#include <sys/types.h>
-
-#include <process/future.hpp>
-#include <process/owned.hpp>
-
-#include <stout/hashmap.hpp>
-#include <stout/nothing.hpp>
-#include <stout/option.hpp>
-
-#include "linux/cgroups.hpp"
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-class CgroupsMemIsolatorProcess : public MesosIsolatorProcess
-{
-public:
-  static Try<mesos::slave::Isolator*> create(const Flags& flags);
-
-  virtual ~CgroupsMemIsolatorProcess();
-
-  virtual process::Future<Nothing> recover(
-      const std::list<mesos::slave::ContainerState>& states,
-      const hashset<ContainerID>& orphans);
-
-  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& user);
-
-  virtual process::Future<Nothing> isolate(
-      const ContainerID& containerId,
-      pid_t pid);
-
-  virtual process::Future<mesos::slave::ContainerLimitation> watch(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> update(
-      const ContainerID& containerId,
-      const Resources& resources);
-
-  virtual process::Future<ResourceStatistics> usage(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> cleanup(
-      const ContainerID& containerId);
-
-private:
-  CgroupsMemIsolatorProcess(
-      const Flags& flags,
-      const std::string& hierarchy,
-      bool limitSwap);
-
-  process::Future<ResourceStatistics> _usage(
-      const ContainerID& containerId,
-      ResourceStatistics result,
-      const std::list<cgroups::memory::pressure::Level>& levels,
-      const std::list<process::Future<uint64_t>>& values);
-
-  process::Future<Nothing> _cleanup(
-      const ContainerID& containerId,
-      const process::Future<Nothing>& future);
-
-  struct Info
-  {
-    Info(const ContainerID& _containerId, const std::string& _cgroup)
-      : containerId(_containerId), cgroup(_cgroup) {}
-
-    const ContainerID containerId;
-    const std::string cgroup;
-    Option<pid_t> pid;
-
-    process::Promise<mesos::slave::ContainerLimitation> limitation;
-
-    // Used to cancel the OOM listening.
-    process::Future<Nothing> oomNotifier;
-
-    hashmap<cgroups::memory::pressure::Level,
-            process::Owned<cgroups::memory::pressure::Counter>>
-      pressureCounters;
-  };
-
-  // Start listening on OOM events. This function will create an
-  // eventfd and start polling on it.
-  void oomListen(const ContainerID& containerId);
-
-  // This function is invoked when the polling on eventfd has a
-  // result.
-  void oomWaited(
-      const ContainerID& containerId,
-      const process::Future<Nothing>& future);
-
-  // This function is invoked when the OOM event happens.
-  void oom(const ContainerID& containerId);
-
-  // Start listening on memory pressure events.
-  void pressureListen(const ContainerID& containerId);
-
-  const Flags flags;
-
-  // The path to the cgroups subsystem hierarchy root.
-  const std::string hierarchy;
-
-  const bool limitSwap;
-
-  // TODO(bmahler): Use Owned<Info>.
-  hashmap<ContainerID, Info*> infos;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __MEM_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/cgroups/perf_event.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/cgroups/perf_event.cpp b/src/slave/containerizer/isolators/cgroups/perf_event.cpp
deleted file mode 100644
index 03035df..0000000
--- a/src/slave/containerizer/isolators/cgroups/perf_event.cpp
+++ /dev/null
@@ -1,435 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdint.h>
-
-#include <vector>
-
-#include <google/protobuf/descriptor.h>
-#include <google/protobuf/message.h>
-
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/delay.hpp>
-#include <process/io.hpp>
-#include <process/pid.hpp>
-#include <process/reap.hpp>
-#include <process/subprocess.hpp>
-
-#include <stout/bytes.hpp>
-#include <stout/check.hpp>
-#include <stout/error.hpp>
-#include <stout/foreach.hpp>
-#include <stout/hashset.hpp>
-#include <stout/lambda.hpp>
-#include <stout/os.hpp>
-#include <stout/path.hpp>
-#include <stout/stringify.hpp>
-#include <stout/try.hpp>
-
-#include "linux/cgroups.hpp"
-#include "linux/perf.hpp"
-
-#include "slave/containerizer/isolators/cgroups/perf_event.hpp"
-
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::ContainerState;
-using mesos::slave::Isolator;
-
-using std::list;
-using std::set;
-using std::string;
-using std::vector;
-
-using process::Clock;
-using process::Failure;
-using process::Future;
-using process::PID;
-using process::Time;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-Try<Isolator*> CgroupsPerfEventIsolatorProcess::create(const Flags& flags)
-{
-  LOG(INFO) << "Creating PerfEvent isolator";
-
-  if (!perf::supported()) {
-    return Error("Perf is not supported");
-  }
-
-  if (flags.perf_duration > flags.perf_interval) {
-    return Error("Sampling perf for duration (" +
-                 stringify(flags.perf_duration) +
-                 ") > interval (" +
-                 stringify(flags.perf_interval) +
-                 ") is not supported.");
-  }
-
-  if (!flags.perf_events.isSome()) {
-    return Error("No perf events specified.");
-  }
-
-  set<string> events;
-  foreach (const string& event,
-           strings::tokenize(flags.perf_events.get(), ",")) {
-    events.insert(event);
-  }
-
-  if (!perf::valid(events)) {
-    return Error("Failed to create PerfEvent isolator, invalid events: " +
-                 stringify(events));
-  }
-
-  Try<string> hierarchy = cgroups::prepare(
-      flags.cgroups_hierarchy,
-      "perf_event",
-      flags.cgroups_root);
-
-  if (hierarchy.isError()) {
-    return Error("Failed to create perf_event cgroup: " + hierarchy.error());
-  }
-
-  LOG(INFO) << "PerfEvent isolator will profile for " << flags.perf_duration
-            << " every " << flags.perf_interval
-            << " for events: " << stringify(events);
-
-  process::Owned<MesosIsolatorProcess> process(
-      new CgroupsPerfEventIsolatorProcess(flags, hierarchy.get(), events));
-
-  return new MesosIsolator(process);
-}
-
-
-CgroupsPerfEventIsolatorProcess::~CgroupsPerfEventIsolatorProcess() {}
-
-
-void CgroupsPerfEventIsolatorProcess::initialize()
-{
-  // Start sampling.
-  sample();
-}
-
-
-Future<Nothing> CgroupsPerfEventIsolatorProcess::recover(
-    const list<ContainerState>& states,
-    const hashset<ContainerID>& orphans)
-{
-  foreach (const ContainerState& state, states) {
-    const ContainerID& containerId = state.container_id();
-    const string cgroup = path::join(flags.cgroups_root, containerId.value());
-
-    Try<bool> exists = cgroups::exists(hierarchy, cgroup);
-    if (exists.isError()) {
-      foreachvalue (Info* info, infos) {
-        delete info;
-      }
-
-      infos.clear();
-      return Failure("Failed to check cgroup " + cgroup +
-                     " for container '" + stringify(containerId) + "'");
-    }
-
-    if (!exists.get()) {
-      // This may occur if the executor is exiting and the isolator has
-      // destroyed the cgroup but the slave dies before noticing this. This
-      // will be detected when the containerizer tries to monitor the
-      // executor's pid.
-      // NOTE: This could also occur if this isolator is now enabled for a
-      // container that was started without this isolator. For this
-      // particular isolator it is acceptable to continue running this
-      // container without a perf_event cgroup because we don't ever
-      // query it and the destroy will succeed immediately.
-      VLOG(1) << "Couldn't find perf event cgroup for container " << containerId
-              << ", perf statistics will not be available";
-      continue;
-    }
-
-    infos[containerId] = new Info(containerId, cgroup);
-  }
-
-  // Remove orphan cgroups.
-  Try<vector<string>> cgroups = cgroups::get(hierarchy, flags.cgroups_root);
-  if (cgroups.isError()) {
-    foreachvalue (Info* info, infos) {
-      delete info;
-    }
-    infos.clear();
-    return Failure(cgroups.error());
-  }
-
-  foreach (const string& cgroup, cgroups.get()) {
-    // Ignore the slave cgroup (see the --slave_subsystems flag).
-    // TODO(idownes): Remove this when the cgroups layout is updated,
-    // see MESOS-1185.
-    if (cgroup == path::join(flags.cgroups_root, "slave")) {
-      continue;
-    }
-
-    ContainerID containerId;
-    containerId.set_value(Path(cgroup).basename());
-
-    if (infos.contains(containerId)) {
-      continue;
-    }
-
-    // Known orphan cgroups will be destroyed by the containerizer
-    // using the normal cleanup path. See details in MESOS-2367.
-    if (orphans.contains(containerId)) {
-      infos[containerId] = new Info(containerId, cgroup);
-      continue;
-    }
-
-    LOG(INFO) << "Removing unknown orphaned cgroup '" << cgroup << "'";
-
-    // We don't wait on the destroy as we don't want to block recovery.
-    cgroups::destroy(hierarchy, cgroup, cgroups::DESTROY_TIMEOUT);
-  }
-
-  return Nothing();
-}
-
-
-Future<Option<ContainerPrepareInfo>> CgroupsPerfEventIsolatorProcess::prepare(
-    const ContainerID& containerId,
-    const ExecutorInfo& executorInfo,
-    const string& directory,
-    const Option<string>& user)
-{
-  if (infos.contains(containerId)) {
-    return Failure("Container has already been prepared");
-  }
-
-  LOG(INFO) << "Preparing perf event cgroup for " << containerId;
-
-  Info* info = new Info(
-      containerId,
-      path::join(flags.cgroups_root, containerId.value()));
-
-  infos[containerId] = CHECK_NOTNULL(info);
-
-  // Create a cgroup for this container.
-  Try<bool> exists = cgroups::exists(hierarchy, info->cgroup);
-
-  if (exists.isError()) {
-    return Failure("Failed to prepare isolator: " + exists.error());
-  }
-
-  if (exists.get()) {
-    return Failure("Failed to prepare isolator: cgroup already exists");
-  }
-
-  if (!exists.get()) {
-    Try<Nothing> create = cgroups::create(hierarchy, info->cgroup);
-    if (create.isError()) {
-      return Failure("Failed to prepare isolator: " + create.error());
-    }
-  }
-
-  // Chown the cgroup so the executor can create nested cgroups. Do
-  // not recurse so the control files are still owned by the slave
-  // user and thus cannot be changed by the executor.
-  if (user.isSome()) {
-    Try<Nothing> chown = os::chown(
-        user.get(),
-        path::join(hierarchy, info->cgroup),
-        false);
-    if (chown.isError()) {
-      return Failure("Failed to prepare isolator: " + chown.error());
-    }
-  }
-
-  return None();
-}
-
-
-Future<Nothing> CgroupsPerfEventIsolatorProcess::isolate(
-    const ContainerID& containerId,
-    pid_t pid)
-{
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  Info* info = CHECK_NOTNULL(infos[containerId]);
-
-  Try<Nothing> assign = cgroups::assign(hierarchy, info->cgroup, pid);
-  if (assign.isError()) {
-    return Failure("Failed to assign container '" +
-                   stringify(info->containerId) + "' to its own cgroup '" +
-                   path::join(hierarchy, info->cgroup) +
-                   "' : " + assign.error());
-  }
-
-  return Nothing();
-}
-
-
-Future<ContainerLimitation> CgroupsPerfEventIsolatorProcess::watch(
-    const ContainerID& containerId)
-{
-  // No resources are limited.
-  return Future<ContainerLimitation>();
-}
-
-
-Future<Nothing> CgroupsPerfEventIsolatorProcess::update(
-    const ContainerID& containerId,
-    const Resources& resources)
-{
-  // Nothing to update.
-  return Nothing();
-}
-
-
-Future<ResourceStatistics> CgroupsPerfEventIsolatorProcess::usage(
-    const ContainerID& containerId)
-{
-  if (!infos.contains(containerId)) {
-    // Return an empty ResourceStatistics, i.e., without
-    // PerfStatistics, if we don't know about this container.
-    return ResourceStatistics();
-  }
-
-  CHECK_NOTNULL(infos[containerId]);
-
-  ResourceStatistics statistics;
-  statistics.mutable_perf()->CopyFrom(infos[containerId]->statistics);
-
-  return statistics;
-}
-
-
-Future<Nothing> CgroupsPerfEventIsolatorProcess::cleanup(
-    const ContainerID& containerId)
-{
-  // Tolerate clean up attempts for unknown containers which may arise from
-  // repeated clean up attempts (during test cleanup).
-  if (!infos.contains(containerId)) {
-    VLOG(1) << "Ignoring cleanup request for unknown container: "
-            << containerId;
-    return Nothing();
-  }
-
-  Info* info = CHECK_NOTNULL(infos[containerId]);
-
-  info->destroying = true;
-
-  return cgroups::destroy(hierarchy, info->cgroup)
-    .then(defer(PID<CgroupsPerfEventIsolatorProcess>(this),
-                &CgroupsPerfEventIsolatorProcess::_cleanup,
-                containerId));
-}
-
-
-Future<Nothing> CgroupsPerfEventIsolatorProcess::_cleanup(
-    const ContainerID& containerId)
-{
-  if (!infos.contains(containerId)) {
-    return Nothing();
-  }
-
-  delete infos[containerId];
-  infos.erase(containerId);
-
-  return Nothing();
-}
-
-
-Future<hashmap<string, PerfStatistics>> discardSample(
-    Future<hashmap<string, PerfStatistics>> future,
-    const Duration& duration,
-    const Duration& timeout)
-{
-  LOG(ERROR) << "Perf sample of " << stringify(duration)
-             << " failed to complete within " << stringify(timeout)
-             << "; sampling will be halted";
-
-  future.discard();
-
-  return future;
-}
-
-
-void CgroupsPerfEventIsolatorProcess::sample()
-{
-  // Collect a perf sample for all cgroups that are not being
-  // destroyed. Since destroyal is asynchronous, 'perf stat' may
-  // fail if the cgroup is destroyed before running perf.
-  set<string> cgroups;
-
-  foreachvalue (Info* info, infos) {
-    CHECK_NOTNULL(info);
-
-    if (!info->destroying) {
-      cgroups.insert(info->cgroup);
-    }
-  }
-
-  // The discard timeout includes an allowance of twice the
-  // reaper interval to ensure we see the perf process exit.
-  Duration timeout = flags.perf_duration + process::MAX_REAP_INTERVAL() * 2;
-
-  perf::sample(events, cgroups, flags.perf_duration)
-    .after(timeout,
-           lambda::bind(&discardSample,
-                        lambda::_1,
-                        flags.perf_duration,
-                        timeout))
-    .onAny(defer(PID<CgroupsPerfEventIsolatorProcess>(this),
-                 &CgroupsPerfEventIsolatorProcess::_sample,
-                 Clock::now() + flags.perf_interval,
-                 lambda::_1));
-}
-
-
-void CgroupsPerfEventIsolatorProcess::_sample(
-    const Time& next,
-    const Future<hashmap<string, PerfStatistics>>& statistics)
-{
-  if (!statistics.isReady()) {
-    // In case the failure is transient or this is due to a timeout,
-    // we continue sampling. Note that since sampling is done on an
-    // interval, it should be ok if this is a non-transient failure.
-    LOG(ERROR) << "Failed to get perf sample: "
-               << (statistics.isFailed()
-                   ? statistics.failure()
-                   : "discarded due to timeout");
-  } else {
-    // Store the latest statistics, note that cgroups added in the
-    // interim will be picked up by the next sample.
-    foreachvalue (Info* info, infos) {
-      CHECK_NOTNULL(info);
-
-      if (statistics->contains(info->cgroup)) {
-        info->statistics = statistics->get(info->cgroup).get();
-      }
-    }
-  }
-
-  // Schedule sample for the next time.
-  delay(next - Clock::now(),
-        PID<CgroupsPerfEventIsolatorProcess>(this),
-        &CgroupsPerfEventIsolatorProcess::sample);
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

[05/12] mesos git commit: Relocated MesosContainerizer specific files to the correct location.

Posted by ji...@apache.org.

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/network/port_mapping.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/network/port_mapping.hpp b/src/slave/containerizer/mesos/isolators/network/port_mapping.hpp
new file mode 100644
index 0000000..ae53c1b
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/network/port_mapping.hpp
@@ -0,0 +1,403 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PORT_MAPPING_ISOLATOR_HPP__
+#define __PORT_MAPPING_ISOLATOR_HPP__
+
+#include <stdint.h>
+
+#include <sys/types.h>
+
+#include <set>
+#include <string>
+#include <vector>
+
+#include <process/owned.hpp>
+#include <process/subprocess.hpp>
+
+#include <process/metrics/metrics.hpp>
+#include <process/metrics/counter.hpp>
+
+#include <stout/bytes.hpp>
+#include <stout/hashmap.hpp>
+#include <stout/hashset.hpp>
+#include <stout/ip.hpp>
+#include <stout/interval.hpp>
+#include <stout/mac.hpp>
+#include <stout/none.hpp>
+#include <stout/option.hpp>
+#include <stout/subcommand.hpp>
+
+#include "linux/routing/filter/ip.hpp"
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// The prefix this isolator uses for the virtual ethernet devices.
+// NOTE: This constant is exposed for testing.
+inline std::string PORT_MAPPING_VETH_PREFIX() { return "mesos"; }
+
+
+// The root directory where we bind mount all the namespace handles.
+// We choose the directory '/var/run/netns' so that we can use the
+// iproute2 suite (e.g., ip netns show/exec) to inspect or enter the
+// network namespace. This is very useful for debugging purposes.
+// NOTE: This constant is exposed for testing.
+inline std::string PORT_MAPPING_BIND_MOUNT_ROOT() { return "/var/run/netns"; }
+
+// The root directory where we keep all the namespace handle
+// symlinks. This is introduced in 0.23.0.
+// NOTE: This constant is exposed for testing.
+inline std::string PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT()
+{
+  return "/var/run/mesos/netns";
+}
+
+
+// These names are used to identify the traffic control statistics
+// output for each of the Linux Traffic Control Qdiscs we report.
+constexpr char NET_ISOLATOR_BW_LIMIT[] = "bw_limit";
+constexpr char NET_ISOLATOR_BLOAT_REDUCTION[] = "bloat_reduction";
+
+
+// Responsible for allocating ephemeral ports for the port mapping
+// network isolator. This class is exposed mainly for unit testing.
+class EphemeralPortsAllocator
+{
+public:
+  EphemeralPortsAllocator(
+      const IntervalSet<uint16_t>& total,
+      size_t _portsPerContainer)
+    : free(total),
+      portsPerContainer_(_portsPerContainer) {};
+
+  // Returns the number of ephemeral ports for each container.
+  size_t portsPerContainer() const { return portsPerContainer_; }
+
+  // Allocate an ephemeral port range for a container. The allocator
+  // will automatically find one port range with the given container
+  // size. Returns error if the allocation cannot be fulfilled (e.g.,
+  // exhausting available ephemeral ports).
+  Try<Interval<uint16_t>> allocate();
+
+  // Mark the specified ephemeral port range as allocated.
+  void allocate(const Interval<uint16_t>& ports);
+
+  // Deallocate the specified ephemeral port range.
+  void deallocate(const Interval<uint16_t>& ports);
+
+  // Return true if the specified ephemeral port range is managed by
+  // the allocator, regardless of whether it has been allocated or not.
+  bool isManaged(const Interval<uint16_t>& ports)
+  {
+    return (free + used).contains(ports);
+  }
+
+private:
+  // Given an integer x, return the smallest integer t such that t >=
+  // x and t % m == 0.
+  static uint32_t nextMultipleOf(uint32_t x, uint32_t m);
+
+  IntervalSet<uint16_t> free;
+  IntervalSet<uint16_t> used;
+
+  // The number of ephemeral ports for each container.
+  size_t portsPerContainer_;
+};
+
+
+// For the specified ports, generate a set of port ranges each of
+// which can be used by a single IP filter. In other words, each port
+// range needs to satisfy the following two conditions: 1) the size of
+// the range is 2^n (n=0,1,2...); 2) the start of the range is
+// size-aligned (i.e., begin % size == 0). This function is exposed
+// mainly for unit testing.
+std::vector<routing::filter::ip::PortRange> getPortRanges(
+    const IntervalSet<uint16_t>& ports);
+
+
+// Provides network isolation using port mapping. Each container is
+// assigned a fixed set of ports (including ephemeral ports). The
+// isolator will set up filters on the host such that network traffic
+// to the host will be properly redirected to the corresponding
+// container depending on the destination ports. The network traffic
+// from containers will also be properly relayed to the host. This
+// isolator is useful when the operator wants to reuse the host IP for
+// all containers running on the host (e.g., there are insufficient
+// IPs).
+class PortMappingIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+  static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+  virtual ~PortMappingIsolatorProcess() {}
+
+  virtual process::Future<Nothing> recover(
+      const std::list<mesos::slave::ContainerState>& states,
+      const hashset<ContainerID>& orphans);
+
+  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+      const ContainerID& containerId,
+      const ExecutorInfo& executorInfo,
+      const std::string& directory,
+      const Option<std::string>& user);
+
+  virtual process::Future<Nothing> isolate(
+      const ContainerID& containerId,
+      pid_t pid);
+
+  virtual process::Future<mesos::slave::ContainerLimitation> watch(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> update(
+      const ContainerID& containerId,
+      const Resources& resources);
+
+  virtual process::Future<ResourceStatistics> usage(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> cleanup(
+      const ContainerID& containerId);
+
+private:
+  struct Info
+  {
+    Info(const IntervalSet<uint16_t>& _nonEphemeralPorts,
+         const Interval<uint16_t>& _ephemeralPorts,
+         const Option<pid_t>& _pid = None())
+      : nonEphemeralPorts(_nonEphemeralPorts),
+        ephemeralPorts(_ephemeralPorts),
+        pid(_pid) {}
+
+    // Non-ephemeral ports used by the container. It's possible that a
+    // container does not use any non-ephemeral ports. In that case,
+    // 'nonEphemeralPorts' will be empty. This variable could change
+    // upon 'update'.
+    IntervalSet<uint16_t> nonEphemeralPorts;
+
+    // Each container has one and only one range of ephemeral ports.
+    // It cannot have more than one range of ephemeral ports because
+    // we need to set up the ip_local_port_range (which only accepts a
+    // single interval) inside the container to restrict the ephemeral
+    // ports used by the container.
+    const Interval<uint16_t> ephemeralPorts;
+
+    Option<pid_t> pid;
+    Option<uint16_t> flowId;
+  };
+
+  // Define the metrics used by the port mapping network isolator.
+  struct Metrics
+  {
+    Metrics();
+    ~Metrics();
+
+    process::metrics::Counter adding_eth0_ip_filters_errors;
+    process::metrics::Counter adding_eth0_ip_filters_already_exist;
+    process::metrics::Counter adding_eth0_egress_filters_errors;
+    process::metrics::Counter adding_eth0_egress_filters_already_exist;
+    process::metrics::Counter adding_lo_ip_filters_errors;
+    process::metrics::Counter adding_lo_ip_filters_already_exist;
+    process::metrics::Counter adding_veth_ip_filters_errors;
+    process::metrics::Counter adding_veth_ip_filters_already_exist;
+    process::metrics::Counter adding_veth_icmp_filters_errors;
+    process::metrics::Counter adding_veth_icmp_filters_already_exist;
+    process::metrics::Counter adding_veth_arp_filters_errors;
+    process::metrics::Counter adding_veth_arp_filters_already_exist;
+    process::metrics::Counter adding_eth0_icmp_filters_errors;
+    process::metrics::Counter adding_eth0_icmp_filters_already_exist;
+    process::metrics::Counter adding_eth0_arp_filters_errors;
+    process::metrics::Counter adding_eth0_arp_filters_already_exist;
+    process::metrics::Counter removing_eth0_ip_filters_errors;
+    process::metrics::Counter removing_eth0_ip_filters_do_not_exist;
+    process::metrics::Counter removing_eth0_egress_filters_errors;
+    process::metrics::Counter removing_eth0_egress_filters_do_not_exist;
+    process::metrics::Counter removing_lo_ip_filters_errors;
+    process::metrics::Counter removing_lo_ip_filters_do_not_exist;
+    process::metrics::Counter removing_veth_ip_filters_errors;
+    process::metrics::Counter removing_veth_ip_filters_do_not_exist;
+    process::metrics::Counter removing_eth0_icmp_filters_errors;
+    process::metrics::Counter removing_eth0_icmp_filters_do_not_exist;
+    process::metrics::Counter removing_eth0_arp_filters_errors;
+    process::metrics::Counter removing_eth0_arp_filters_do_not_exist;
+    process::metrics::Counter updating_eth0_icmp_filters_errors;
+    process::metrics::Counter updating_eth0_icmp_filters_already_exist;
+    process::metrics::Counter updating_eth0_icmp_filters_do_not_exist;
+    process::metrics::Counter updating_eth0_arp_filters_errors;
+    process::metrics::Counter updating_eth0_arp_filters_already_exist;
+    process::metrics::Counter updating_eth0_arp_filters_do_not_exist;
+    process::metrics::Counter updating_container_ip_filters_errors;
+  } metrics;
+
+  PortMappingIsolatorProcess(
+      const Flags& _flags,
+      const std::string& _eth0,
+      const std::string& _lo,
+      const net::MAC& _hostMAC,
+      const net::IPNetwork& _hostIPNetwork,
+      const size_t _hostEth0MTU,
+      const net::IP& _hostDefaultGateway,
+      const hashmap<std::string, std::string>& _hostNetworkConfigurations,
+      const Option<Bytes>& _egressRateLimitPerContainer,
+      const IntervalSet<uint16_t>& _managedNonEphemeralPorts,
+      const process::Owned<EphemeralPortsAllocator>& _ephemeralPortsAllocator,
+      const std::set<uint16_t>& _flowIDs)
+    : flags(_flags),
+      eth0(_eth0),
+      lo(_lo),
+      hostMAC(_hostMAC),
+      hostIPNetwork(_hostIPNetwork),
+      hostEth0MTU(_hostEth0MTU),
+      hostDefaultGateway(_hostDefaultGateway),
+      hostNetworkConfigurations(_hostNetworkConfigurations),
+      egressRateLimitPerContainer(_egressRateLimitPerContainer),
+      managedNonEphemeralPorts(_managedNonEphemeralPorts),
+      ephemeralPortsAllocator(_ephemeralPortsAllocator),
+      freeFlowIds(_flowIDs) {}
+
+  // Continuations.
+  Try<Nothing> _cleanup(Info* info, const Option<ContainerID>& containerId);
+  Try<Info*> _recover(pid_t pid);
+
+  void _update(
+      const ContainerID& containerId,
+      const process::Future<Option<int>>& status);
+
+  process::Future<ResourceStatistics> _usage(
+      const ResourceStatistics& result,
+      const process::Subprocess& s);
+
+  process::Future<ResourceStatistics> __usage(
+      ResourceStatistics result,
+      const process::Future<std::string>& out);
+
+  // Helper functions.
+  Try<Nothing> addHostIPFilters(
+      const routing::filter::ip::PortRange& range,
+      const Option<uint16_t>& flowId,
+      const std::string& veth);
+
+  Try<Nothing> removeHostIPFilters(
+      const routing::filter::ip::PortRange& range,
+      const std::string& veth,
+      bool removeFiltersOnVeth = true);
+
+  // Return the scripts that will be executed in the child context.
+  std::string scripts(Info* info);
+
+  uint16_t getNextFlowId();
+
+  const Flags flags;
+
+  const std::string eth0;
+  const std::string lo;
+  const net::MAC hostMAC;
+  const net::IPNetwork hostIPNetwork;
+  const size_t hostEth0MTU;
+  const net::IP hostDefaultGateway;
+
+  // Describe the host network configurations. It is a map between
+  // configure proc files (e.g., /proc/sys/net/core/somaxconn) and
+  // values of the configure proc files.
+  const hashmap<std::string, std::string> hostNetworkConfigurations;
+
+  // The optional throughput limit to containers' egress traffic.
+  const Option<Bytes> egressRateLimitPerContainer;
+
+  // All the non-ephemeral ports managed by the slave, as passed in
+  // via flags.resources.
+  const IntervalSet<uint16_t> managedNonEphemeralPorts;
+
+  process::Owned<EphemeralPortsAllocator> ephemeralPortsAllocator;
+
+  // Store a set of unused flow IDs on this slave.
+  std::set<uint16_t> freeFlowIds;
+
+  hashmap<ContainerID, Info*> infos;
+
+  // Recovered containers from a previous run that weren't managed by
+  // the network isolator.
+  hashset<ContainerID> unmanaged;
+};
+
+
+// Defines the subcommand for 'update' that needs to be executed by a
+// subprocess to update the filters inside a container.
+class PortMappingUpdate : public Subcommand
+{
+public:
+  static const char* NAME;
+
+  struct Flags : public flags::FlagsBase
+  {
+    Flags();
+
+    Option<std::string> eth0_name;
+    Option<std::string> lo_name;
+    Option<pid_t> pid;
+    Option<JSON::Object> ports_to_add;
+    Option<JSON::Object> ports_to_remove;
+  };
+
+  PortMappingUpdate() : Subcommand(NAME) {}
+
+  Flags flags;
+
+protected:
+  virtual int execute();
+  virtual flags::FlagsBase* getFlags() { return &flags; }
+};
+
+
+// Defines the subcommand for 'statistics' that needs to be executed
+// by a subprocess to retrieve network statistics from inside a
+// container.
+class PortMappingStatistics : public Subcommand
+{
+public:
+  static const char* NAME;
+
+  struct Flags : public flags::FlagsBase
+  {
+    Flags();
+
+    Option<std::string> eth0_name;
+    Option<pid_t> pid;
+    bool enable_socket_statistics_summary;
+    bool enable_socket_statistics_details;
+  };
+
+  PortMappingStatistics() : Subcommand(NAME) {}
+
+  Flags flags;
+
+protected:
+  virtual int execute();
+  virtual flags::FlagsBase* getFlags() { return &flags; }
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PORT_MAPPING_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/posix.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/posix.hpp b/src/slave/containerizer/mesos/isolators/posix.hpp
new file mode 100644
index 0000000..00df902
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/posix.hpp
@@ -0,0 +1,213 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __POSIX_ISOLATOR_HPP__
+#define __POSIX_ISOLATOR_HPP__
+
+#include <process/future.hpp>
+
+#include <stout/hashmap.hpp>
+#include <stout/os.hpp>
+
+#include <stout/os/pstree.hpp>
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+#include "usage/usage.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// A basic MesosIsolatorProcess that keeps track of the pid but
+// doesn't do any resource isolation. Subclasses must implement
+// usage() for their appropriate resource(s).
+class PosixIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+  virtual process::Future<Nothing> recover(
+      const std::list<mesos::slave::ContainerState>& state,
+      const hashset<ContainerID>& orphans)
+  {
+    foreach (const mesos::slave::ContainerState& run, state) {
+      // This should (almost) never occur: see comment in
+      // PosixLauncher::recover().
+      if (pids.contains(run.container_id())) {
+        return process::Failure("Container already recovered");
+      }
+
+      pids.put(run.container_id(), run.pid());
+
+      process::Owned<process::Promise<mesos::slave::ContainerLimitation>>
+        promise(new process::Promise<mesos::slave::ContainerLimitation>());
+      promises.put(run.container_id(), promise);
+    }
+
+    return Nothing();
+  }
+
+  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+      const ContainerID& containerId,
+      const ExecutorInfo& executorInfo,
+      const std::string& directory,
+      const Option<std::string>& user)
+  {
+    if (promises.contains(containerId)) {
+      return process::Failure("Container " + stringify(containerId) +
+                              " has already been prepared");
+    }
+
+    process::Owned<process::Promise<mesos::slave::ContainerLimitation>> promise(
+        new process::Promise<mesos::slave::ContainerLimitation>());
+    promises.put(containerId, promise);
+
+    return None();
+  }
+
+  virtual process::Future<Nothing> isolate(
+      const ContainerID& containerId,
+      pid_t pid)
+  {
+    if (!promises.contains(containerId)) {
+      return process::Failure("Unknown container: " + stringify(containerId));
+    }
+
+    pids.put(containerId, pid);
+
+    return Nothing();
+  }
+
+  virtual process::Future<mesos::slave::ContainerLimitation> watch(
+      const ContainerID& containerId)
+  {
+    if (!promises.contains(containerId)) {
+      return process::Failure("Unknown container: " + stringify(containerId));
+    }
+
+    return promises[containerId]->future();
+  }
+
+  virtual process::Future<Nothing> update(
+      const ContainerID& containerId,
+      const Resources& resources)
+  {
+    if (!promises.contains(containerId)) {
+      return process::Failure("Unknown container: " + stringify(containerId));
+    }
+
+    // No resources are actually isolated so nothing to do.
+    return Nothing();
+  }
+
+  virtual process::Future<Nothing> cleanup(const ContainerID& containerId)
+  {
+    if (!promises.contains(containerId)) {
+      return process::Failure("Unknown container: " + stringify(containerId));
+    }
+
+    // TODO(idownes): We should discard the container's promise here to signal
+    // to anyone that holds the future from watch().
+    promises.erase(containerId);
+
+    pids.erase(containerId);
+
+    return Nothing();
+  }
+
+protected:
+  hashmap<ContainerID, pid_t> pids;
+  hashmap<ContainerID,
+          process::Owned<process::Promise<mesos::slave::ContainerLimitation>>>
+    promises;
+};
+
+
+class PosixCpuIsolatorProcess : public PosixIsolatorProcess
+{
+public:
+  static Try<mesos::slave::Isolator*> create(const Flags& flags)
+  {
+    process::Owned<MesosIsolatorProcess> process(
+        new PosixCpuIsolatorProcess());
+
+    return new MesosIsolator(process);
+  }
+
+  virtual process::Future<ResourceStatistics> usage(
+      const ContainerID& containerId)
+  {
+    if (!pids.contains(containerId)) {
+      LOG(WARNING) << "No resource usage for unknown container '"
+                   << containerId << "'";
+      return ResourceStatistics();
+    }
+
+    // Use 'mesos-usage' but only request 'cpus_' values.
+    Try<ResourceStatistics> usage =
+      mesos::internal::usage(pids.get(containerId).get(), false, true);
+    if (usage.isError()) {
+      return process::Failure(usage.error());
+    }
+    return usage.get();
+  }
+
+private:
+  PosixCpuIsolatorProcess() {}
+};
+
+
+class PosixMemIsolatorProcess : public PosixIsolatorProcess
+{
+public:
+  static Try<mesos::slave::Isolator*> create(const Flags& flags)
+  {
+    process::Owned<MesosIsolatorProcess> process(
+        new PosixMemIsolatorProcess());
+
+    return new MesosIsolator(process);
+  }
+
+  virtual process::Future<ResourceStatistics> usage(
+      const ContainerID& containerId)
+  {
+    if (!pids.contains(containerId)) {
+      LOG(WARNING) << "No resource usage for unknown container '"
+                   << containerId << "'";
+      return ResourceStatistics();
+    }
+
+    // Use 'mesos-usage' but only request 'mem_' values.
+    Try<ResourceStatistics> usage =
+      mesos::internal::usage(pids.get(containerId).get(), true, false);
+    if (usage.isError()) {
+      return process::Failure(usage.error());
+    }
+    return usage.get();
+  }
+
+private:
+  PosixMemIsolatorProcess() {}
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __POSIX_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/posix/disk.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/posix/disk.cpp b/src/slave/containerizer/mesos/isolators/posix/disk.cpp
new file mode 100644
index 0000000..ec25156
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/posix/disk.cpp
@@ -0,0 +1,525 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <signal.h>
+
+#ifdef __linux__
+#include <sys/prctl.h>
+#endif
+#include <sys/types.h>
+
+#include <deque>
+#include <tuple>
+
+#include <glog/logging.h>
+
+#include <process/check.hpp>
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/delay.hpp>
+#include <process/io.hpp>
+#include <process/subprocess.hpp>
+
+#include <stout/check.hpp>
+#include <stout/foreach.hpp>
+#include <stout/lambda.hpp>
+#include <stout/numify.hpp>
+#include <stout/strings.hpp>
+
+#include <stout/os/exists.hpp>
+#include <stout/os/killtree.hpp>
+
+#include "common/protobuf_utils.hpp"
+
+#include "slave/containerizer/mesos/isolators/posix/disk.hpp"
+
+using namespace process;
+
+using std::deque;
+using std::list;
+using std::string;
+using std::vector;
+
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+Try<Isolator*> PosixDiskIsolatorProcess::create(const Flags& flags)
+{
+  // TODO(jieyu): Check the availability of command 'du'.
+
+  return new MesosIsolator(process::Owned<MesosIsolatorProcess>(
+        new PosixDiskIsolatorProcess(flags)));
+}
+
+
+PosixDiskIsolatorProcess::Info::PathInfo::~PathInfo()
+{
+  usage.discard();
+}
+
+
+PosixDiskIsolatorProcess::PosixDiskIsolatorProcess(const Flags& _flags)
+  : flags(_flags), collector(flags.container_disk_watch_interval) {}
+
+
+PosixDiskIsolatorProcess::~PosixDiskIsolatorProcess() {}
+
+
+Future<Nothing> PosixDiskIsolatorProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  foreach (const ContainerState& state, states) {
+    // Since we checkpoint the executor after we create its working
+    // directory, the working directory should definitely exist.
+    CHECK(os::exists(state.directory()))
+      << "Executor work directory " << state.directory() << " doesn't exist";
+
+    infos.put(state.container_id(), Owned<Info>(new Info(state.directory())));
+  }
+
+  return Nothing();
+}
+
+
+Future<Option<ContainerPrepareInfo>> PosixDiskIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user)
+{
+  if (infos.contains(containerId)) {
+    return Failure("Container has already been prepared");
+  }
+
+  infos.put(containerId, Owned<Info>(new Info(directory)));
+
+  return None();
+}
+
+
+Future<Nothing> PosixDiskIsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  return Nothing();
+}
+
+
+Future<ContainerLimitation> PosixDiskIsolatorProcess::watch(
+    const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  return infos[containerId]->limitation.future();
+}
+
+
+Future<Nothing> PosixDiskIsolatorProcess::update(
+    const ContainerID& containerId,
+    const Resources& resources)
+{
+  if (!infos.contains(containerId)) {
+    LOG(WARNING) << "Ignoring update for unknown container " << containerId;
+    return Nothing();
+  }
+
+  LOG(INFO) << "Updating the disk resources for container "
+            << containerId << " to " << resources;
+
+  const Owned<Info>& info = infos[containerId];
+
+  // This stores the updated quotas.
+  hashmap<string, Resources> quotas;
+
+  foreach (const Resource& resource, resources) {
+    if (resource.name() != "disk") {
+      continue;
+    }
+
+    // The path at which we will collect disk usage and enforce quota.
+    string path;
+
+    // NOTE: We do not allow the case where has_disk() is true but
+    // with nothing set inside DiskInfo. The master will enforce it.
+    if (!resource.has_disk()) {
+      // Regular disk used for executor working directory.
+      path = info->directory;
+    } else {
+      // TODO(jieyu): Support persistent volumes as well.
+      LOG(ERROR) << "Enforcing disk quota unsupported for " << resource;
+      continue;
+    }
+
+    quotas[path] += resource;
+  }
+
+  // Update the quota for paths. For each new path, we also initiate
+  // the disk usage collection.
+  foreachpair (const string& path, const Resources& quota, quotas) {
+    if (!info->paths.contains(path)) {
+      info->paths[path].usage = collector.usage(path)
+        .onAny(defer(
+            PID<PosixDiskIsolatorProcess>(this),
+            &PosixDiskIsolatorProcess::_collect,
+            containerId,
+            path,
+            lambda::_1));
+    }
+
+    info->paths[path].quota = quota;
+  }
+
+  // Remove paths that we are no longer interested in.
+  foreach (const string& path, info->paths.keys()) {
+    if (!quotas.contains(path)) {
+      info->paths.erase(path);
+    }
+  }
+
+  return Nothing();
+}
+
+
+void PosixDiskIsolatorProcess::_collect(
+    const ContainerID& containerId,
+    const string& path,
+    const Future<Bytes>& future)
+{
+  if (future.isDiscarded()) {
+    LOG(INFO) << "Checking disk usage at '" << path << "' for container "
+              << containerId << " has been cancelled";
+  } else if (future.isFailed()) {
+    LOG(ERROR) << "Checking disk usage at '" << path << "' for container "
+               << containerId << " has failed: " << future.failure();
+  }
+
+  if (!infos.contains(containerId)) {
+    // The container might have just been destroyed.
+    return;
+  }
+
+  const Owned<Info>& info = infos[containerId];
+
+  if (!info->paths.contains(path)) {
+    // The path might have just been removed from this container's
+    // resources.
+    return;
+  }
+
+  // Check if the disk usage exceeds the quota. If yes, report the
+  // limitation. We keep collecting the disk usage for 'path' by
+  // initiating another round of disk usage check. The check will be
+  // throttled by DiskUsageCollector.
+  if (future.isReady()) {
+    // Save the last disk usage.
+    info->paths[path].lastUsage = future.get();
+
+    if (flags.enforce_container_disk_quota) {
+      Option<Bytes> quota = info->paths[path].quota.disk();
+      CHECK_SOME(quota);
+
+      if (future.get() > quota.get()) {
+        info->limitation.set(
+            protobuf::slave::createContainerLimitation(
+                Resources(info->paths[path].quota),
+                "Disk usage (" + stringify(future.get()) +
+                ") exceeds quota (" + stringify(quota.get()) + ")",
+                TaskStatus::REASON_CONTAINER_LIMITATION_DISK));
+      }
+    }
+  }
+
+  info->paths[path].usage = collector.usage(path)
+    .onAny(defer(
+        PID<PosixDiskIsolatorProcess>(this),
+        &PosixDiskIsolatorProcess::_collect,
+        containerId,
+        path,
+        lambda::_1));
+}
+
+
+Future<ResourceStatistics> PosixDiskIsolatorProcess::usage(
+    const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  ResourceStatistics result;
+
+  const Owned<Info>& info = infos[containerId];
+
+  if (info->paths.contains(info->directory)) {
+    Option<Bytes> quota = info->paths[info->directory].quota.disk();
+    CHECK_SOME(quota);
+
+    result.set_disk_limit_bytes(quota.get().bytes());
+
+    // NOTE: There may be a large delay (# of containers * interval)
+    // until an initial cached value is returned here!
+    if (info->paths[info->directory].lastUsage.isSome()) {
+      result.set_disk_used_bytes(
+          info->paths[info->directory].lastUsage.get().bytes());
+    }
+  }
+
+  return result;
+}
+
+
+Future<Nothing> PosixDiskIsolatorProcess::cleanup(
+    const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    LOG(WARNING) << "Ignoring cleanup for unknown container " << containerId;
+    return Nothing();
+  }
+
+  infos.erase(containerId);
+
+  return Nothing();
+}
+
+
+class DiskUsageCollectorProcess : public Process<DiskUsageCollectorProcess>
+{
+public:
+  DiskUsageCollectorProcess(const Duration& _interval) : interval(_interval) {}
+  virtual ~DiskUsageCollectorProcess() {}
+
+  Future<Bytes> usage(const string& path)
+  {
+    foreach (const Owned<Entry>& entry, entries) {
+      if (entry->path == path) {
+        return entry->promise.future();
+      }
+    }
+
+    entries.push_back(Owned<Entry>(new Entry(path)));
+
+    // Install onDiscard callback.
+    Future<Bytes> future = entries.back()->promise.future();
+    future.onDiscard(defer(self(), &Self::discard, path));
+
+    return future;
+  }
+
+protected:
+  void initialize()
+  {
+    schedule();
+  }
+
+  void finalize()
+  {
+    foreach (const Owned<Entry>& entry, entries) {
+      if (entry->du.isSome() && entry->du.get().status().isPending()) {
+        os::killtree(entry->du.get().pid(), SIGKILL);
+      }
+
+      entry->promise.fail("DiskUsageCollector is destroyed");
+    }
+  }
+
+private:
+  // Describe a single pending check.
+  struct Entry
+  {
+    explicit Entry(const string& _path) : path(_path) {}
+
+    string path;
+    Option<Subprocess> du;
+    Promise<Bytes> promise;
+  };
+
+  // This function is invoked right before each 'du' is exec'ed. Note
+  // that this function needs to be async signal safe.
+  static int setupChild()
+  {
+#ifdef __linux__
+    // Kill the child process if the parent exits.
+    // NOTE: This function should never return non-zero because we
+    // are passing in a valid signal.
+    return ::prctl(PR_SET_PDEATHSIG, SIGKILL);
+#else
+    return 0;
+#endif
+  }
+
+  void discard(const string& path)
+  {
+    for (auto it = entries.begin(); it != entries.end(); ++it) {
+      // We only cancel those checks whose 'du' haven't been launched.
+      if ((*it)->path == path && (*it)->du.isNone()) {
+        (*it)->promise.discard();
+        entries.erase(it);
+        break;
+      }
+    }
+  }
+
+  // Schedule a 'du' to be invoked. The current implementation does
+  // not allow multiple 'du's running concurrently. The minimal
+  // interval between two subsequent 'du's is controlled by 'interval'
+  // for throttling purpose.
+  void schedule()
+  {
+    if (entries.empty()) {
+      delay(interval, self(), &Self::schedule);
+      return;
+    }
+
+    const Owned<Entry>& entry = entries.front();
+
+    // Invoke 'du' and report number of 1K-byte blocks. We fix the
+    // block size here so that we can get consistent results on all
+    // platforms (e.g., OS X uses 512 byte blocks).
+    //
+    // NOTE: The 'du' processes are run in the slave's cgroup and it
+    // will be that cgroup that is charged for (a) memory to cache the
+    // fs data structures, (b) disk I/O to read those structures, and
+    // (c) the cpu time to traverse.
+    Try<Subprocess> s = subprocess(
+        "du",
+        vector<string>({"du", "-k", "-s", entry->path}),
+        Subprocess::PATH("/dev/null"),
+        Subprocess::PIPE(),
+        Subprocess::PIPE(),
+        None(),
+        None(),
+        setupChild);
+
+    if (s.isError()) {
+      entry->promise.fail("Failed to exec 'du': " + s.error());
+
+      entries.pop_front();
+      delay(interval, self(), &Self::schedule);
+      return;
+    }
+
+    entry->du = s.get();
+
+    await(s.get().status(),
+          io::read(s.get().out().get()),
+          io::read(s.get().err().get()))
+      .onAny(defer(self(), &Self::_schedule, lambda::_1));
+  }
+
+  void _schedule(const Future<std::tuple<
+      Future<Option<int>>,
+      Future<string>,
+      Future<string>>>& future)
+  {
+    CHECK_READY(future);
+    CHECK(!entries.empty());
+
+    const Owned<Entry>& entry = entries.front();
+    CHECK_SOME(entry->du);
+
+    Future<Option<int>> status = std::get<0>(future.get());
+
+    if (!status.isReady()) {
+      entry->promise.fail(
+          "Failed to perform 'du': " +
+          (status.isFailed() ? status.failure() : "discarded"));
+    } else if (status.get().isNone()) {
+      entry->promise.fail("Failed to reap the status of 'du'");
+    } else if (status.get().get() != 0) {
+      Future<string> error = std::get<2>(future.get());
+      if (!error.isReady()) {
+        entry->promise.fail(
+            "Failed to perform 'du'. Reading stderr failed: " +
+            (error.isFailed() ? error.failure() : "discarded"));
+      } else {
+        entry->promise.fail("Failed to perform 'du': " + error.get());
+      }
+    } else {
+      Future<string> output = std::get<1>(future.get());
+      if (!output.isReady()) {
+        entry->promise.fail(
+            "Failed to read stdout from 'du': " +
+            (output.isFailed() ? output.failure() : "discarded"));
+      } else {
+        // Parsing the output from 'du'. The following is a sample
+        // output. Tab is used as the delimiter between the number of
+        // blocks and the checked path.
+        // $ du -k -s /var/lib/mesos/.../runs/container_id
+        // 1024   /var/lib/mesos/.../runs/container_id
+        vector<string> tokens = strings::tokenize(output.get(), " \t");
+        if (tokens.empty()) {
+          entry->promise.fail("Unexpected output from 'du': " + output.get());
+        } else {
+          Try<size_t> value = numify<size_t>(tokens[0]);
+          if (value.isError()) {
+            entry->promise.fail("Unexpected output from 'du': " + output.get());
+          } else {
+            // Notify the callers.
+            entry->promise.set(Kilobytes(value.get()));
+          }
+        }
+      }
+    }
+
+    entries.pop_front();
+    delay(interval, self(), &Self::schedule);
+  }
+
+  const Duration interval;
+
+  // A queue of pending checks.
+  deque<Owned<Entry>> entries;
+};
+
+
+DiskUsageCollector::DiskUsageCollector(const Duration& interval)
+{
+  process = new DiskUsageCollectorProcess(interval);
+  spawn(process);
+}
+
+
+DiskUsageCollector::~DiskUsageCollector()
+{
+  terminate(process);
+  wait(process);
+  delete process;
+}
+
+
+Future<Bytes> DiskUsageCollector::usage(const string& path)
+{
+  return dispatch(process, &DiskUsageCollectorProcess::usage, path);
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/posix/disk.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/posix/disk.hpp b/src/slave/containerizer/mesos/isolators/posix/disk.hpp
new file mode 100644
index 0000000..85df5d2
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/posix/disk.hpp
@@ -0,0 +1,151 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __POSIX_DISK_ISOLATOR_HPP__
+#define __POSIX_DISK_ISOLATOR_HPP__
+
+#include <string>
+
+#include <process/owned.hpp>
+
+#include <stout/bytes.hpp>
+#include <stout/duration.hpp>
+#include <stout/hashmap.hpp>
+
+#include "slave/flags.hpp"
+#include "slave/state.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Forward declarations.
+class DiskUsageCollectorProcess;
+
+
+// Responsible for collecting disk usage for paths, while ensuring
+// that an interval elapses between each collection.
+class DiskUsageCollector
+{
+public:
+  DiskUsageCollector(const Duration& interval);
+  ~DiskUsageCollector();
+
+  // Returns the disk usage rooted at 'path'. The user can discard the
+  // returned future to cancel the check.
+  process::Future<Bytes> usage(const std::string& path);
+
+private:
+  DiskUsageCollectorProcess* process;
+};
+
+
+// This isolator monitors the disk usage for containers, and reports
+// ContainerLimitation when a container exceeds its disk quota. This
+// leverages the DiskUsageCollector to ensure that we don't induce too
+// much CPU usage and disk caching effects from running 'du' too
+// often.
+//
+// NOTE: Currently all containers are processed in the same queue,
+// which means that when a container starts, it could take many disk
+// collection intervals until any data is available in the resource
+// usage statistics!
+//
+// TODO(jieyu): Consider handling each container independently, or
+// triggering an initial collection when the container starts, to
+// ensure that we have usage statistics without a large delay.
+class PosixDiskIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+  static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+  virtual ~PosixDiskIsolatorProcess();
+
+  virtual process::Future<Nothing> recover(
+      const std::list<mesos::slave::ContainerState>& states,
+      const hashset<ContainerID>& orphans);
+
+  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+      const ContainerID& containerId,
+      const ExecutorInfo& executorInfo,
+      const std::string& directory,
+      const Option<std::string>& user);
+
+  virtual process::Future<Nothing> isolate(
+      const ContainerID& containerId,
+      pid_t pid);
+
+  virtual process::Future<mesos::slave::ContainerLimitation> watch(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> update(
+      const ContainerID& containerId,
+      const Resources& resources);
+
+  virtual process::Future<ResourceStatistics> usage(
+      const ContainerID& containerId);
+
+  virtual process::Future<Nothing> cleanup(
+      const ContainerID& containerId);
+
+private:
+  PosixDiskIsolatorProcess(const Flags& flags);
+
+  void _collect(
+      const ContainerID& containerId,
+      const std::string& path,
+      const process::Future<Bytes>& future);
+
+  const Flags flags;
+  DiskUsageCollector collector;
+
+  struct Info
+  {
+    explicit Info(const std::string& _directory) : directory(_directory) {}
+
+    // We save executor working directory here so that we know where
+    // to collect disk usage for disk resources without DiskInfo.
+    const std::string directory;
+
+    process::Promise<mesos::slave::ContainerLimitation> limitation;
+
+    // The keys of the hashmaps contain the executor working directory
+    // above, and optionally paths of volumes used by the container.
+    // For each path, we maintain its quota and its last usage.
+    struct PathInfo
+    {
+      ~PathInfo();
+
+      Resources quota;
+      process::Future<Bytes> usage;
+      Option<Bytes> lastUsage;
+    };
+
+    hashmap<std::string, PathInfo> paths;
+  };
+
+  hashmap<ContainerID, process::Owned<Info>> infos;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __POSIX_DISK_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/appc/paths.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/appc/paths.cpp b/src/slave/containerizer/mesos/provisioner/appc/paths.cpp
new file mode 100644
index 0000000..c898cd3
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/appc/paths.cpp
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <list>
+
+#include <glog/logging.h>
+
+#include <stout/path.hpp>
+
+#include "slave/containerizer/mesos/provisioner/appc/paths.hpp"
+
+using std::list;
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace appc {
+namespace paths {
+
+string getStagingDir(const string& storeDir)
+{
+  return path::join(storeDir, "staging");
+}
+
+
+string getImagesDir(const string& storeDir)
+{
+  return path::join(storeDir, "images");
+}
+
+
+string getImagePath(const string& storeDir, const string& imageId)
+{
+  return path::join(getImagesDir(storeDir), imageId);
+}
+
+
+string getImageRootfsPath(
+    const string& storeDir,
+    const string& imageId)
+{
+  return path::join(getImagePath(storeDir, imageId), "rootfs");
+}
+
+
+string getImageRootfsPath(const string& imagePath)
+{
+  return path::join(imagePath, "rootfs");
+}
+
+
+string getImageManifestPath(
+    const string& storeDir,
+    const string& imageId)
+{
+  return path::join(getImagePath(storeDir, imageId), "manifest");
+}
+
+
+string getImageManifestPath(const string& imagePath)
+{
+  return path::join(imagePath, "manifest");
+}
+
+} // namespace paths {
+} // namespace appc {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/appc/paths.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/appc/paths.hpp b/src/slave/containerizer/mesos/provisioner/appc/paths.hpp
new file mode 100644
index 0000000..7c36d67
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/appc/paths.hpp
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_APPC_PATHS_HPP__
+#define __PROVISIONER_APPC_PATHS_HPP__
+
+#include <string>
+
+#include <mesos/mesos.hpp>
+
+#include <stout/hashmap.hpp>
+#include <stout/try.hpp>
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace appc {
+namespace paths {
+
+// The appc store file system layout is as follows:
+//
+// <store_dir> ('--appc_store_dir' flag)
+// |--staging (contains temp directories for staging downloads)
+// |
+// |--images (stores validated images)
+//    |--<image_id> (in the form of "sha512-<128_character_hash_sum>")
+//       |--manifest
+//       |--rootfs
+//          |--... (according to the ACI spec)
+//
+// TODO(xujyan): The staging directory is unused for now (it's
+// externally managed) but implemented to illustrate the need for a
+// separate 'images' directory. Complete the layout diagram when the
+// staging directory is utilized by the provisioner.
+
+std::string getStagingDir(const std::string& storeDir);
+
+
+std::string getImagesDir(const std::string& storeDir);
+
+
+std::string getImagePath(
+    const std::string& storeDir,
+    const std::string& imageId);
+
+
+std::string getImageRootfsPath(
+    const std::string& storeDir,
+    const std::string& imageId);
+
+
+std::string getImageRootfsPath(const std::string& imagePath);
+
+
+std::string getImageManifestPath(
+    const std::string& storeDir,
+    const std::string& imageId);
+
+
+std::string getImageManifestPath(const std::string& imagePath);
+
+} // namespace paths {
+} // namespace appc {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_APPC_PATHS_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/appc/spec.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/appc/spec.cpp b/src/slave/containerizer/mesos/provisioner/appc/spec.cpp
new file mode 100644
index 0000000..5361c24
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/appc/spec.cpp
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stout/os/stat.hpp>
+#include <stout/protobuf.hpp>
+#include <stout/strings.hpp>
+
+#include "slave/containerizer/mesos/provisioner/appc/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/appc/spec.hpp"
+
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace appc {
+namespace spec {
+
+Option<Error> validateManifest(const AppcImageManifest& manifest)
+{
+  // TODO(idownes): Validate that required fields are present when
+  // this cannot be expressed in the protobuf specification, e.g.,
+  // repeated fields that must contain at least one element.
+  // TODO(xujyan): More thorough type validation:
+  // https://github.com/appc/spec/blob/master/spec/types.md
+  if (manifest.ackind() != "ImageManifest") {
+    return Error("Incorrect acKind field: " + manifest.ackind());
+  }
+
+  return None();
+}
+
+
+Option<Error> validateImageID(const string& imageId)
+{
+  if (!strings::startsWith(imageId, "sha512-")) {
+    return Error("Image ID needs to start with sha512-");
+  }
+
+  string hash = strings::remove(imageId, "sha512-", strings::PREFIX);
+  if (hash.length() != 128) {
+    return Error("Invalid hash length for: " + hash);
+  }
+
+  return None();
+}
+
+
+Option<Error> validateLayout(const string& imagePath)
+{
+  if (!os::stat::isdir(paths::getImageRootfsPath(imagePath))) {
+    return Error("No rootfs directory found in image layout");
+  }
+
+  if (!os::stat::isfile(paths::getImageManifestPath(imagePath))) {
+    return Error("No manifest found in image layout");
+  }
+
+  return None();
+}
+
+
+Try<AppcImageManifest> parse(const string& value)
+{
+  Try<JSON::Object> json = JSON::parse<JSON::Object>(value);
+  if (json.isError()) {
+    return Error("JSON parse failed: " + json.error());
+  }
+
+  Try<AppcImageManifest> manifest =
+    protobuf::parse<AppcImageManifest>(json.get());
+
+  if (manifest.isError()) {
+    return Error("Protobuf parse failed: " + manifest.error());
+  }
+
+  Option<Error> error = validateManifest(manifest.get());
+  if (error.isSome()) {
+    return Error("Schema validation failed: " + error.get().message);
+  }
+
+  return manifest.get();
+}
+
+} // namespace spec {
+} // namespace appc {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/appc/spec.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/appc/spec.hpp b/src/slave/containerizer/mesos/provisioner/appc/spec.hpp
new file mode 100644
index 0000000..2bc8c6f
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/appc/spec.hpp
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_APPC_SPEC_HPP__
+#define __PROVISIONER_APPC_SPEC_HPP__
+
+#include <string>
+
+#include <stout/error.hpp>
+#include <stout/option.hpp>
+
+#include <mesos/mesos.hpp>
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace appc {
+namespace spec {
+
+// Validate if the specified image manifest conforms to the Appc spec.
+Option<Error> validateManifest(const AppcImageManifest& manifest);
+
+// Validate if the specified image ID conforms to the Appc spec.
+Option<Error> validateImageID(const std::string& imageId);
+
+// Validate if the specified image has the disk layout that conforms
+// to the Appc spec.
+Option<Error> validateLayout(const std::string& imagePath);
+
+// Parse the AppcImageManifest in the specified JSON string.
+Try<AppcImageManifest> parse(const std::string& value);
+
+} // namespace spec {
+} // namespace appc {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_APPC_SPEC_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/appc/store.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/appc/store.cpp b/src/slave/containerizer/mesos/provisioner/appc/store.cpp
new file mode 100644
index 0000000..a2f36f1
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/appc/store.cpp
@@ -0,0 +1,288 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <list>
+
+#include <glog/logging.h>
+
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+
+#include <stout/check.hpp>
+#include <stout/hashmap.hpp>
+#include <stout/os.hpp>
+#include <stout/path.hpp>
+
+#include "slave/containerizer/mesos/provisioner/appc/paths.hpp"
+#include "slave/containerizer/mesos/provisioner/appc/spec.hpp"
+#include "slave/containerizer/mesos/provisioner/appc/store.hpp"
+
+using namespace process;
+
+using std::list;
+using std::string;
+using std::vector;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace appc {
+
+// Defines a locally cached image (which has passed validation).
+struct CachedImage
+{
+  static Try<CachedImage> create(const string& imagePath);
+
+  CachedImage(
+      const AppcImageManifest& _manifest,
+      const string& _id,
+      const string& _path)
+    : manifest(_manifest), id(_id), path(_path) {}
+
+  string rootfs() const
+  {
+    return path::join(path, "rootfs");
+  }
+
+  const AppcImageManifest manifest;
+
+  // Image ID of the format "sha512-value" where "value" is the hex
+  // encoded string of the sha512 digest of the uncompressed tar file
+  // of the image.
+  const string id;
+
+  // Absolute path to the extracted image.
+  const string path;
+};
+
+
+Try<CachedImage> CachedImage::create(const string& imagePath)
+{
+  Option<Error> error = spec::validateLayout(imagePath);
+  if (error.isSome()) {
+    return Error("Invalid image layout: " + error.get().message);
+  }
+
+  string imageId = Path(imagePath).basename();
+
+  error = spec::validateImageID(imageId);
+  if (error.isSome()) {
+    return Error("Invalid image ID: " + error.get().message);
+  }
+
+  Try<string> read = os::read(paths::getImageManifestPath(imagePath));
+  if (read.isError()) {
+    return Error("Failed to read manifest: " + read.error());
+  }
+
+  Try<AppcImageManifest> manifest = spec::parse(read.get());
+  if (manifest.isError()) {
+    return Error("Failed to parse manifest: " + manifest.error());
+  }
+
+  return CachedImage(manifest.get(), imageId, imagePath);
+}
+
+
+// Helper that implements this:
+// https://github.com/appc/spec/blob/master/spec/aci.md#dependency-matching
+static bool matches(Image::Appc requirements, const CachedImage& candidate)
+{
+  // The name must match.
+  if (candidate.manifest.name() != requirements.name()) {
+    return false;
+  }
+
+  // If an id is specified the candidate must match.
+  if (requirements.has_id() && (candidate.id != requirements.id())) {
+    return false;
+  }
+
+  // Extract labels for easier comparison; this also weeds out duplicates.
+  // TODO(xujyan): Detect duplicate labels in image manifest validation
+  // and Image::Appc validation.
+  hashmap<string, string> requiredLabels;
+  foreach (const Label& label, requirements.labels().labels()) {
+    requiredLabels[label.key()] = label.value();
+  }
+
+  hashmap<string, string> candidateLabels;
+  foreach (const AppcImageManifest::Label& label,
+           candidate.manifest.labels()) {
+    candidateLabels[label.name()] = label.value();
+  }
+
+  // Any label specified must be present and match in the candidate.
+  foreachpair (const string& name,
+               const string& value,
+               requiredLabels) {
+    if (!candidateLabels.contains(name) ||
+        candidateLabels.get(name).get() != value) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+
+class StoreProcess : public Process<StoreProcess>
+{
+public:
+  StoreProcess(const string& rootDir);
+
+  ~StoreProcess() {}
+
+  Future<Nothing> recover();
+
+  Future<vector<string>> get(const Image& image);
+
+private:
+  // Absolute path to the root directory of the store as defined by
+  // --appc_store_dir.
+  const string rootDir;
+
+  // Mappings: name -> id -> image.
+  hashmap<string, hashmap<string, CachedImage>> images;
+};
+
+
+Try<Owned<slave::Store>> Store::create(const Flags& flags)
+{
+  Try<Nothing> mkdir = os::mkdir(paths::getImagesDir(flags.appc_store_dir));
+  if (mkdir.isError()) {
+    return Error("Failed to create the images directory: " + mkdir.error());
+  }
+
+  // Make sure the root path is canonical so all image paths derived
+  // from it are canonical too.
+  Result<string> rootDir = os::realpath(flags.appc_store_dir);
+  if (!rootDir.isSome()) {
+    // The above mkdir call recursively creates the store directory
+    // if necessary so it cannot be None here.
+    CHECK_ERROR(rootDir);
+
+    return Error(
+        "Failed to get the realpath of the store root directory: " +
+        rootDir.error());
+  }
+
+  return Owned<slave::Store>(new Store(
+      Owned<StoreProcess>(new StoreProcess(rootDir.get()))));
+}
+
+
+Store::Store(Owned<StoreProcess> _process)
+  : process(_process)
+{
+  spawn(CHECK_NOTNULL(process.get()));
+}
+
+
+Store::~Store()
+{
+  terminate(process.get());
+  wait(process.get());
+}
+
+
+Future<Nothing> Store::recover()
+{
+  return dispatch(process.get(), &StoreProcess::recover);
+}
+
+
+Future<vector<string>> Store::get(const Image& image)
+{
+  return dispatch(process.get(), &StoreProcess::get, image);
+}
+
+
+StoreProcess::StoreProcess(const string& _rootDir) : rootDir(_rootDir) {}
+
+
+Future<Nothing> StoreProcess::recover()
+{
+  // Recover everything in the store.
+  Try<list<string>> imageIds = os::ls(paths::getImagesDir(rootDir));
+  if (imageIds.isError()) {
+    return Failure(
+        "Failed to list images under '" +
+        paths::getImagesDir(rootDir) + "': " +
+        imageIds.error());
+  }
+
+  foreach (const string& imageId, imageIds.get()) {
+    string path = paths::getImagePath(rootDir, imageId);
+    if (!os::stat::isdir(path)) {
+      LOG(WARNING) << "Unexpected entry in storage: " << imageId;
+      continue;
+    }
+
+    Try<CachedImage> image = CachedImage::create(path);
+    if (image.isError()) {
+      LOG(WARNING) << "Unexpected entry in storage: " << image.error();
+      continue;
+    }
+
+    LOG(INFO) << "Restored image '" << image.get().manifest.name() << "'";
+
+    images[image.get().manifest.name()].put(image.get().id, image.get());
+  }
+
+  return Nothing();
+}
+
+
+Future<vector<string>> StoreProcess::get(const Image& image)
+{
+  if (image.type() != Image::APPC) {
+    return Failure("Not an Appc image: " + stringify(image.type()));
+  }
+
+  const Image::Appc& appc = image.appc();
+
+  if (!images.contains(appc.name())) {
+    return Failure("No Appc image named '" + appc.name() + "' can be found");
+  }
+
+  // Get local candidates.
+  vector<CachedImage> candidates;
+  foreach (const CachedImage& candidate, images[appc.name()].values()) {
+    // The first match is returned.
+    // TODO(xujyan): Some tie-breaking rules are necessary.
+    if (matches(appc, candidate)) {
+      LOG(INFO) << "Found match for Appc image '" << appc.name()
+                << "' in the store";
+
+      // The Appc store currently doesn't support dependencies and this
+      // is enforced by manifest validation: if the image's manifest
+      // contains dependencies it would fail the validation and
+      // wouldn't be stored in the store.
+      return vector<string>({candidate.rootfs()});
+    }
+  }
+
+  return Failure("No Appc image named '" + appc.name() +
+                 "' can match the requirements");
+}
+
+} // namespace appc {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/appc/store.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/appc/store.hpp b/src/slave/containerizer/mesos/provisioner/appc/store.hpp
new file mode 100644
index 0000000..18dbf63
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/appc/store.hpp
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_APPC_STORE_HPP__
+#define __PROVISIONER_APPC_STORE_HPP__
+
+#include "slave/containerizer/mesos/provisioner/store.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+namespace appc {
+
+// Forward declaration.
+class StoreProcess;
+
+
+class Store : public slave::Store
+{
+public:
+  static Try<process::Owned<slave::Store>> create(const Flags& flags);
+
+  ~Store();
+
+  virtual process::Future<Nothing> recover();
+
+  // TODO(xujyan): Fetching remotely is not implemented for now and
+  // until then the future fails directly if the image is not in the
+  // local cache.
+  // TODO(xujyan): The store currently doesn't support images that
+  // have dependencies and we should add it later.
+  virtual process::Future<std::vector<std::string>> get(const Image& image);
+
+private:
+  Store(process::Owned<StoreProcess> process);
+
+  Store(const Store&) = delete; // Not copyable.
+  Store& operator=(const Store&) = delete; // Not assignable.
+
+  process::Owned<StoreProcess> process;
+};
+
+} // namespace appc {
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_APPC_STORE_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/backend.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/backend.cpp b/src/slave/containerizer/mesos/provisioner/backend.cpp
new file mode 100644
index 0000000..35ced4b
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/backend.cpp
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <glog/logging.h>
+
+#include <stout/os.hpp>
+
+#include "slave/containerizer/mesos/provisioner/backend.hpp"
+
+#include "slave/containerizer/mesos/provisioner/backends/bind.hpp"
+#include "slave/containerizer/mesos/provisioner/backends/copy.hpp"
+
+using namespace process;
+
+using std::string;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+hashmap<string, Owned<Backend>> Backend::create(const Flags& flags)
+{
+  hashmap<string, Try<Owned<Backend>>(*)(const Flags&)> creators;
+
+#ifdef __linux__
+  creators.put("bind", &BindBackend::create);
+#endif // __linux__
+  creators.put("copy", &CopyBackend::create);
+
+  hashmap<string, Owned<Backend>> backends;
+
+  foreachkey (const string& name, creators) {
+    Try<Owned<Backend>> backend = creators[name](flags);
+    if (backend.isError()) {
+      LOG(WARNING) << "Failed to create '" << name << "' backend: "
+                   << backend.error();
+      continue;
+    }
+    backends.put(name, backend.get());
+  }
+
+  return backends;
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/backend.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/backend.hpp b/src/slave/containerizer/mesos/provisioner/backend.hpp
new file mode 100644
index 0000000..1c80b79
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/backend.hpp
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_BACKEND_HPP__
+#define __PROVISIONER_BACKEND_HPP__
+
+#include <string>
+#include <vector>
+
+#include <process/future.hpp>
+#include <process/owned.hpp>
+
+#include <stout/hashmap.hpp>
+#include <stout/try.hpp>
+
+#include "slave/flags.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Provision a root filesystem for a container.
+class Backend
+{
+public:
+  virtual ~Backend() {}
+
+  // Return a map of all supported backends keyed by their names. Note
+  // that Backends that failed to be created due to incorrect flags are
+  // simply not added to the result.
+  static hashmap<std::string, process::Owned<Backend>> create(
+      const Flags& flags);
+
+  // Provision a root filesystem for a container into the specified 'rootfs'
+  // directory by applying the specified list of root filesystem layers in
+  // the list order, i.e., files in a layer can overwrite/shadow those from
+  // another layer earlier in the list.
+  virtual process::Future<Nothing> provision(
+      const std::vector<std::string>& layers,
+      const std::string& rootfs) = 0;
+
+  // Destroy the root filesystem provisioned at the specified 'rootfs'
+  // directory. Return false if there is no provisioned root filesystem
+  // to destroy for the given directory.
+  virtual process::Future<bool> destroy(const std::string& rootfs) = 0;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_BACKEND_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/backends/bind.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/backends/bind.cpp b/src/slave/containerizer/mesos/provisioner/backends/bind.cpp
new file mode 100644
index 0000000..22ca3c1
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/backends/bind.cpp
@@ -0,0 +1,250 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <process/dispatch.hpp>
+#include <process/process.hpp>
+
+#include <process/metrics/counter.hpp>
+#include <process/metrics/metrics.hpp>
+
+#include <stout/foreach.hpp>
+#include <stout/os.hpp>
+
+#include "linux/fs.hpp"
+
+#include "slave/containerizer/mesos/provisioner/backends/bind.hpp"
+
+using namespace process;
+
+using std::string;
+using std::vector;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+class BindBackendProcess : public Process<BindBackendProcess>
+{
+public:
+  Future<Nothing> provision(const vector<string>& layers, const string& rootfs);
+
+  Future<bool> destroy(const string& rootfs);
+
+  struct Metrics
+  {
+    Metrics();
+    ~Metrics();
+
+    process::metrics::Counter remove_rootfs_errors;
+  } metrics;
+};
+
+
+Try<Owned<Backend>> BindBackend::create(const Flags&)
+{
+  Result<string> user = os::user();
+  if (!user.isSome()) {
+    return Error("Failed to determine user: " +
+                 (user.isError() ? user.error() : "username not found"));
+  }
+
+  if (user.get() != "root") {
+    return Error("BindBackend requires root privileges");
+  }
+
+  return Owned<Backend>(new BindBackend(
+      Owned<BindBackendProcess>(new BindBackendProcess())));
+}
+
+
+BindBackend::~BindBackend()
+{
+  terminate(process.get());
+  wait(process.get());
+}
+
+
+BindBackend::BindBackend(Owned<BindBackendProcess> _process)
+  : process(_process)
+{
+  spawn(CHECK_NOTNULL(process.get()));
+}
+
+
+Future<Nothing> BindBackend::provision(
+    const vector<string>& layers,
+    const string& rootfs)
+{
+  return dispatch(
+      process.get(), &BindBackendProcess::provision, layers, rootfs);
+}
+
+
+Future<bool> BindBackend::destroy(const string& rootfs)
+{
+  return dispatch(process.get(), &BindBackendProcess::destroy, rootfs);
+}
+
+
+Future<Nothing> BindBackendProcess::provision(
+    const vector<string>& layers,
+    const string& rootfs)
+{
+  if (layers.size() > 1) {
+    return Failure(
+        "Multiple layers are not supported by the bind backend");
+  }
+
+  if (layers.size() == 0) {
+    return Failure("No filesystem layer provided");
+  }
+
+  Try<Nothing> mkdir = os::mkdir(rootfs);
+  if (mkdir.isError()) {
+    return Failure("Failed to create container rootfs at " + rootfs);
+  }
+
+  // TODO(xujyan): Use MS_REC? Does any provisioner use mounts within
+  // its image store in a single layer?
+  Try<Nothing> mount = fs::mount(
+      layers.front(),
+      rootfs,
+      None(),
+      MS_BIND,
+      NULL);
+
+  if (mount.isError()) {
+    return Failure(
+        "Failed to bind mount rootfs '" + layers.front() +
+        "' to '" + rootfs + "': " + mount.error());
+  }
+
+  // And remount it read-only.
+  mount = fs::mount(
+      None(), // Ignored.
+      rootfs,
+      None(),
+      MS_BIND | MS_RDONLY | MS_REMOUNT,
+      NULL);
+
+  if (mount.isError()) {
+    return Failure(
+        "Failed to remount rootfs '" + rootfs + "' read-only: " +
+        mount.error());
+  }
+
+  // Mark the mount as shared+slave.
+  mount = fs::mount(
+      None(),
+      rootfs,
+      None(),
+      MS_SLAVE,
+      NULL);
+
+  if (mount.isError()) {
+    return Failure(
+        "Failed to mark mount '" + rootfs +
+        "' as a slave mount: " + mount.error());
+  }
+
+  mount = fs::mount(
+      None(),
+      rootfs,
+      None(),
+      MS_SHARED,
+      NULL);
+
+  if (mount.isError()) {
+    return Failure(
+        "Failed to mark mount '" + rootfs +
+        "' as a shared mount: " + mount.error());
+  }
+
+  return Nothing();
+}
+
+
+Future<bool> BindBackendProcess::destroy(const string& rootfs)
+{
+  Try<fs::MountInfoTable> mountTable = fs::MountInfoTable::read();
+
+  if (mountTable.isError()) {
+    return Failure("Failed to read mount table: " + mountTable.error());
+  }
+
+  foreach (const fs::MountInfoTable::Entry& entry, mountTable.get().entries) {
+    // TODO(xujyan): If MS_REC was used in 'provision()' we would need
+    // to check `strings::startsWith(entry.target, rootfs)` here to
+    // unmount all nested mounts.
+    if (entry.target == rootfs) {
+      // NOTE: This would fail if the rootfs is still in use.
+      Try<Nothing> unmount = fs::unmount(entry.target);
+      if (unmount.isError()) {
+        return Failure(
+            "Failed to destroy bind-mounted rootfs '" + rootfs + "': " +
+            unmount.error());
+      }
+
+      // TODO(jieyu): If 'rmdir' here returns EBUSY, we still return
+      // a success. This is currently possible because the parent
+      // mount of 'rootfs' might not be a shared mount. Thus,
+      // containers in different mount namespaces might hold extra
+      // references to this mount. It is OK to ignore the EBUSY error
+      // because the provisioner will later try to delete all the
+      // rootfses for the terminated containers.
+      if (::rmdir(rootfs.c_str()) != 0) {
+        string message =
+          "Failed to remove rootfs mount point '" + rootfs +
+          "': " + strerror(errno);
+
+        if (errno == EBUSY) {
+          LOG(ERROR) << message;
+          ++metrics.remove_rootfs_errors;
+        } else {
+          return Failure(message);
+        }
+      }
+
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
+BindBackendProcess::Metrics::Metrics()
+  : remove_rootfs_errors(
+      "containerizer/mesos/provisioner/bind/remove_rootfs_errors")
+{
+  process::metrics::add(remove_rootfs_errors);
+}
+
+
+BindBackendProcess::Metrics::~Metrics()
+{
+  process::metrics::remove(remove_rootfs_errors);
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/backends/bind.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/backends/bind.hpp b/src/slave/containerizer/mesos/provisioner/backends/bind.hpp
new file mode 100644
index 0000000..7747810
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/backends/bind.hpp
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_BACKENDS_BIND_HPP__
+#define __PROVISIONER_BACKENDS_BIND_HPP__
+
+#include "slave/containerizer/mesos/provisioner/backend.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Forward declaration.
+class BindBackendProcess;
+
+
+// This is a specialized backend that may be useful for deployments
+// using large (multi-GB) single-layer images *and* where more recent
+// kernel features such as overlayfs are not available (overlayfs-based
+// backend tracked by MESOS-2971). For small images (10's to 100's of MB)
+// the copy backend may be sufficient. NOTE:
+// 1) BindBackend supports only a single layer. Multi-layer images will
+//    fail to provision and the container will fail to launch!
+// 2) The filesystem is read-only because all containers using this
+//    image share the source. Select writable areas can be achieved by
+//    mounting read-write volumes to places like /tmp, /var/tmp,
+//    /home, etc. using the ContainerInfo. These can be relative to
+//    the executor work directory.
+//    N.B. Since the filesystem is read-only, '--sandbox_directory' must
+//    already exist within the filesystem because the filesystem isolator
+//    is unable to create it!
+// 3) It's fast because the bind mount requires (nearly) zero IO.
+class BindBackend : public Backend
+{
+public:
+  virtual ~BindBackend();
+
+  // BindBackend doesn't use any flag.
+  static Try<process::Owned<Backend>> create(const Flags&);
+
+  virtual process::Future<Nothing> provision(
+      const std::vector<std::string>& layers,
+      const std::string& rootfs);
+
+  virtual process::Future<bool> destroy(const std::string& rootfs);
+
+private:
+  explicit BindBackend(process::Owned<BindBackendProcess> process);
+
+  BindBackend(const BindBackend&); // Not copyable.
+  BindBackend& operator=(const BindBackend&); // Not assignable.
+
+  process::Owned<BindBackendProcess> process;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_BACKENDS_BIND_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/backends/copy.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/backends/copy.cpp b/src/slave/containerizer/mesos/provisioner/backends/copy.cpp
new file mode 100644
index 0000000..59c5a27
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/backends/copy.cpp
@@ -0,0 +1,203 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <list>
+
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/dispatch.hpp>
+#include <process/io.hpp>
+#include <process/process.hpp>
+#include <process/subprocess.hpp>
+
+
+#include <stout/foreach.hpp>
+#include <stout/os.hpp>
+
+#include "common/status_utils.hpp"
+
+#include "slave/containerizer/mesos/provisioner/backends/copy.hpp"
+
+
+using namespace process;
+
+using std::string;
+using std::list;
+using std::vector;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+class CopyBackendProcess : public Process<CopyBackendProcess>
+{
+public:
+  Future<Nothing> provision(const vector<string>& layers, const string& rootfs);
+
+  Future<bool> destroy(const string& rootfs);
+
+private:
+  Future<Nothing> _provision(string layer, const string& rootfs);
+};
+
+
+Try<Owned<Backend>> CopyBackend::create(const Flags&)
+{
+  return Owned<Backend>(new CopyBackend(
+      Owned<CopyBackendProcess>(new CopyBackendProcess())));
+}
+
+
+CopyBackend::~CopyBackend()
+{
+  terminate(process.get());
+  wait(process.get());
+}
+
+
+CopyBackend::CopyBackend(Owned<CopyBackendProcess> _process)
+  : process(_process)
+{
+  spawn(CHECK_NOTNULL(process.get()));
+}
+
+
+Future<Nothing> CopyBackend::provision(
+    const vector<string>& layers,
+    const string& rootfs)
+{
+  return dispatch(
+      process.get(), &CopyBackendProcess::provision, layers, rootfs);
+}
+
+
+Future<bool> CopyBackend::destroy(const string& rootfs)
+{
+  return dispatch(process.get(), &CopyBackendProcess::destroy, rootfs);
+}
+
+
+Future<Nothing> CopyBackendProcess::provision(
+    const vector<string>& layers,
+    const string& rootfs)
+{
+  if (layers.size() == 0) {
+    return Failure("No filesystem layers provided");
+  }
+
+  if (os::exists(rootfs)) {
+    return Failure("Rootfs is already provisioned");
+  }
+
+  Try<Nothing> mkdir = os::mkdir(rootfs);
+  if (mkdir.isError()) {
+    return Failure("Failed to create rootfs directory: " + mkdir.error());
+  }
+
+  list<Future<Nothing>> futures{Nothing()};
+
+  foreach (const string layer, layers) {
+    futures.push_back(
+        futures.back().then(
+            defer(self(), &Self::_provision, layer, rootfs)));
+  }
+
+  return collect(futures)
+    .then([]() -> Future<Nothing> { return Nothing(); });
+}
+
+
+Future<Nothing> CopyBackendProcess::_provision(
+  string layer,
+  const string& rootfs)
+{
+  VLOG(1) << "Copying layer path '" << layer << "' to rootfs '" << rootfs
+          << "'";
+
+#ifdef __APPLE__
+  if (!strings::endsWith(layer, "/")) {
+    layer += "/";
+  }
+
+  // OSX cp doesn't support -T flag, but supports source trailing
+  // slash so we only copy the content but not the folder.
+  vector<string> args{"cp", "-a", layer, rootfs};
+#else
+  vector<string> args{"cp", "-aT", layer, rootfs};
+#endif // __APPLE__
+
+  Try<Subprocess> s = subprocess(
+      "cp",
+      args,
+      Subprocess::PATH("/dev/null"),
+      Subprocess::PATH("/dev/null"),
+      Subprocess::PIPE());
+
+  if (s.isError()) {
+    return Failure("Failed to create 'cp' subprocess: " + s.error());
+  }
+
+  Subprocess cp = s.get();
+
+  return cp.status()
+    .then([cp](const Option<int>& status) -> Future<Nothing> {
+      if (status.isNone()) {
+        return Failure("Failed to reap subprocess to copy image");
+      } else if (status.get() != 0) {
+        return io::read(cp.err().get())
+          .then([](const string& err) -> Future<Nothing> {
+            return Failure("Failed to copy layer: " + err);
+          });
+      }
+
+      return Nothing();
+    });
+}
+
+
+Future<bool> CopyBackendProcess::destroy(const string& rootfs)
+{
+  vector<string> argv{"rm", "-rf", rootfs};
+
+  Try<Subprocess> s = subprocess(
+      "rm",
+      argv,
+      Subprocess::PATH("/dev/null"),
+      Subprocess::FD(STDOUT_FILENO),
+      Subprocess::FD(STDERR_FILENO));
+
+  if (s.isError()) {
+    return Failure("Failed to create 'rm' subprocess: " + s.error());
+  }
+
+  return s.get().status()
+    .then([](const Option<int>& status) -> Future<bool> {
+      if (status.isNone()) {
+        return Failure("Failed to reap subprocess to destroy rootfs");
+      } else if (status.get() != 0) {
+        return Failure("Failed to destroy rootfs, exit status: " +
+                       WSTRINGIFY(status.get()));
+      }
+
+      return true;
+    });
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/provisioner/backends/copy.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/provisioner/backends/copy.hpp b/src/slave/containerizer/mesos/provisioner/backends/copy.hpp
new file mode 100644
index 0000000..0d3e7d4
--- /dev/null
+++ b/src/slave/containerizer/mesos/provisioner/backends/copy.hpp
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PROVISIONER_BACKENDS_COPY_HPP__
+#define __PROVISIONER_BACKENDS_COPY_HPP__
+
+#include "slave/containerizer/mesos/provisioner/backend.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Forward declaration.
+class CopyBackendProcess;
+
+
+// The backend implementation that copies the layers to the target.
+// NOTE: Using this backend currently has a few implications:
+// 1) The disk space used by the provisioned rootfs is not counted
+//    towards either the usage by the executor/task or the store
+//    cache, which can interfere with the slave host's disk space
+//    allocation.
+// 2) The task can write unrestrictedly into the provisioned rootfs
+//    which is not accounted for (in terms of disk usage) either.
+class CopyBackend : public Backend
+{
+public:
+  virtual ~CopyBackend();
+
+  // CopyBackend doesn't use any flag.
+  static Try<process::Owned<Backend>> create(const Flags&);
+
+  // Provisions a rootfs given the layers' paths and target rootfs
+  // path.
+  virtual process::Future<Nothing> provision(
+      const std::vector<std::string>& layers,
+      const std::string& rootfs);
+
+  virtual process::Future<bool> destroy(const std::string& rootfs);
+
+private:
+  explicit CopyBackend(process::Owned<CopyBackendProcess> process);
+
+  CopyBackend(const CopyBackend&); // Not copyable.
+  CopyBackend& operator=(const CopyBackend&); // Not assignable.
+
+  process::Owned<CopyBackendProcess> process;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __PROVISIONER_BACKENDS_COPY_HPP__

[09/12] mesos git commit: Relocated MesosContainerizer specific files to the correct location.

Posted by ji...@apache.org.

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/network/port_mapping.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/network/port_mapping.hpp b/src/slave/containerizer/isolators/network/port_mapping.hpp
deleted file mode 100644
index ae53c1b..0000000
--- a/src/slave/containerizer/isolators/network/port_mapping.hpp
+++ /dev/null
@@ -1,403 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __PORT_MAPPING_ISOLATOR_HPP__
-#define __PORT_MAPPING_ISOLATOR_HPP__
-
-#include <stdint.h>
-
-#include <sys/types.h>
-
-#include <set>
-#include <string>
-#include <vector>
-
-#include <process/owned.hpp>
-#include <process/subprocess.hpp>
-
-#include <process/metrics/metrics.hpp>
-#include <process/metrics/counter.hpp>
-
-#include <stout/bytes.hpp>
-#include <stout/hashmap.hpp>
-#include <stout/hashset.hpp>
-#include <stout/ip.hpp>
-#include <stout/interval.hpp>
-#include <stout/mac.hpp>
-#include <stout/none.hpp>
-#include <stout/option.hpp>
-#include <stout/subcommand.hpp>
-
-#include "linux/routing/filter/ip.hpp"
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// The prefix this isolator uses for the virtual ethernet devices.
-// NOTE: This constant is exposed for testing.
-inline std::string PORT_MAPPING_VETH_PREFIX() { return "mesos"; }
-
-
-// The root directory where we bind mount all the namespace handles.
-// We choose the directory '/var/run/netns' so that we can use
-// iproute2 suite (e.g., ip netns show/exec) to inspect or enter the
-// network namespace. This is very useful for debugging purposes.
-// NOTE: This constant is exposed for testing.
-inline std::string PORT_MAPPING_BIND_MOUNT_ROOT() { return "/var/run/netns"; }
-
-// The root directory where we keep all the namespace handle
-// symlinks. This is introduced in 0.23.0.
-// NOTE: This constant is exposed for testing.
-inline std::string PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT()
-{
-  return "/var/run/mesos/netns";
-}
-
-
-// These names are used to identify the traffic control statistics
-// output for each of the Linux Traffic Control Qdiscs we report.
-constexpr char NET_ISOLATOR_BW_LIMIT[] = "bw_limit";
-constexpr char NET_ISOLATOR_BLOAT_REDUCTION[] = "bloat_reduction";
-
-
-// Responsible for allocating ephemeral ports for the port mapping
-// network isolator. This class is exposed mainly for unit testing.
-class EphemeralPortsAllocator
-{
-public:
-  EphemeralPortsAllocator(
-      const IntervalSet<uint16_t>& total,
-      size_t _portsPerContainer)
-    : free(total),
-      portsPerContainer_(_portsPerContainer) {};
-
-  // Returns the number of ephemeral ports for each container.
-  size_t portsPerContainer() const { return portsPerContainer_; }
-
-  // Allocate an ephemeral port range for a container. The allocator
-  // will automatically find one port range with the given container
-  // size. Returns error if the allocation cannot be fulfilled (e.g.,
-  // exhausting available ephemeral ports).
-  Try<Interval<uint16_t>> allocate();
-
-  // Mark the specified ephemeral port range as allocated.
-  void allocate(const Interval<uint16_t>& ports);
-
-  // Deallocate the specified ephemeral port range.
-  void deallocate(const Interval<uint16_t>& ports);
-
-  // Return true if the specified ephemeral port range is managed by
-  // the allocator, regardless it has been allocated to use or not.
-  bool isManaged(const Interval<uint16_t>& ports)
-  {
-    return (free + used).contains(ports);
-  }
-
-private:
-  // Given an integer x, return the smallest integer t such that t >=
-  // x and t % m == 0.
-  static uint32_t nextMultipleOf(uint32_t x, uint32_t m);
-
-  IntervalSet<uint16_t> free;
-  IntervalSet<uint16_t> used;
-
-  // The number of ephemeral ports for each container.
-  size_t portsPerContainer_;
-};
-
-
-// For the specified ports, generate a set of port ranges each of
-// which can be used by a single IP filter. In other words, each port
-// range needs to satisfy the following two conditions: 1) the size of
-// the range is 2^n (n=0,1,2...); 2) the begin of the range is size
-// aligned (i.e., begin % size == 0). This function is exposed mainly
-// for unit testing.
-std::vector<routing::filter::ip::PortRange> getPortRanges(
-    const IntervalSet<uint16_t>& ports);
-
-
-// Provides network isolation using port mapping. Each container is
-// assigned a fixed set of ports (including ephemeral ports). The
-// isolator will set up filters on the host such that network traffic
-// to the host will be properly redirected to the corresponding
-// container depending on the destination ports. The network traffic
-// from containers will also be properly relayed to the host. This
-// isolator is useful when the operator wants to reuse the host IP for
-// all containers running on the host (e.g., there are insufficient
-// IPs).
-class PortMappingIsolatorProcess : public MesosIsolatorProcess
-{
-public:
-  static Try<mesos::slave::Isolator*> create(const Flags& flags);
-
-  virtual ~PortMappingIsolatorProcess() {}
-
-  virtual process::Future<Nothing> recover(
-      const std::list<mesos::slave::ContainerState>& states,
-      const hashset<ContainerID>& orphans);
-
-  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& user);
-
-  virtual process::Future<Nothing> isolate(
-      const ContainerID& containerId,
-      pid_t pid);
-
-  virtual process::Future<mesos::slave::ContainerLimitation> watch(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> update(
-      const ContainerID& containerId,
-      const Resources& resources);
-
-  virtual process::Future<ResourceStatistics> usage(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> cleanup(
-      const ContainerID& containerId);
-
-private:
-  struct Info
-  {
-    Info(const IntervalSet<uint16_t>& _nonEphemeralPorts,
-         const Interval<uint16_t>& _ephemeralPorts,
-         const Option<pid_t>& _pid = None())
-      : nonEphemeralPorts(_nonEphemeralPorts),
-        ephemeralPorts(_ephemeralPorts),
-        pid(_pid) {}
-
-    // Non-ephemeral ports used by the container. It's possible that a
-    // container does not use any non-ephemeral ports. In that case,
-    // 'nonEphemeralPorts' will be empty. This variable could change
-    // upon 'update'.
-    IntervalSet<uint16_t> nonEphemeralPorts;
-
-    // Each container has one and only one range of ephemeral ports.
-    // It cannot have more than one ranges of ephemeral ports because
-    // we need to setup the ip_local_port_range (which only accepts a
-    // single interval) inside the container to restrict the ephemeral
-    // ports used by the container.
-    const Interval<uint16_t> ephemeralPorts;
-
-    Option<pid_t> pid;
-    Option<uint16_t> flowId;
-  };
-
-  // Define the metrics used by the port mapping network isolator.
-  struct Metrics
-  {
-    Metrics();
-    ~Metrics();
-
-    process::metrics::Counter adding_eth0_ip_filters_errors;
-    process::metrics::Counter adding_eth0_ip_filters_already_exist;
-    process::metrics::Counter adding_eth0_egress_filters_errors;
-    process::metrics::Counter adding_eth0_egress_filters_already_exist;
-    process::metrics::Counter adding_lo_ip_filters_errors;
-    process::metrics::Counter adding_lo_ip_filters_already_exist;
-    process::metrics::Counter adding_veth_ip_filters_errors;
-    process::metrics::Counter adding_veth_ip_filters_already_exist;
-    process::metrics::Counter adding_veth_icmp_filters_errors;
-    process::metrics::Counter adding_veth_icmp_filters_already_exist;
-    process::metrics::Counter adding_veth_arp_filters_errors;
-    process::metrics::Counter adding_veth_arp_filters_already_exist;
-    process::metrics::Counter adding_eth0_icmp_filters_errors;
-    process::metrics::Counter adding_eth0_icmp_filters_already_exist;
-    process::metrics::Counter adding_eth0_arp_filters_errors;
-    process::metrics::Counter adding_eth0_arp_filters_already_exist;
-    process::metrics::Counter removing_eth0_ip_filters_errors;
-    process::metrics::Counter removing_eth0_ip_filters_do_not_exist;
-    process::metrics::Counter removing_eth0_egress_filters_errors;
-    process::metrics::Counter removing_eth0_egress_filters_do_not_exist;
-    process::metrics::Counter removing_lo_ip_filters_errors;
-    process::metrics::Counter removing_lo_ip_filters_do_not_exist;
-    process::metrics::Counter removing_veth_ip_filters_errors;
-    process::metrics::Counter removing_veth_ip_filters_do_not_exist;
-    process::metrics::Counter removing_eth0_icmp_filters_errors;
-    process::metrics::Counter removing_eth0_icmp_filters_do_not_exist;
-    process::metrics::Counter removing_eth0_arp_filters_errors;
-    process::metrics::Counter removing_eth0_arp_filters_do_not_exist;
-    process::metrics::Counter updating_eth0_icmp_filters_errors;
-    process::metrics::Counter updating_eth0_icmp_filters_already_exist;
-    process::metrics::Counter updating_eth0_icmp_filters_do_not_exist;
-    process::metrics::Counter updating_eth0_arp_filters_errors;
-    process::metrics::Counter updating_eth0_arp_filters_already_exist;
-    process::metrics::Counter updating_eth0_arp_filters_do_not_exist;
-    process::metrics::Counter updating_container_ip_filters_errors;
-  } metrics;
-
-  PortMappingIsolatorProcess(
-      const Flags& _flags,
-      const std::string& _eth0,
-      const std::string& _lo,
-      const net::MAC& _hostMAC,
-      const net::IPNetwork& _hostIPNetwork,
-      const size_t _hostEth0MTU,
-      const net::IP& _hostDefaultGateway,
-      const hashmap<std::string, std::string>& _hostNetworkConfigurations,
-      const Option<Bytes>& _egressRateLimitPerContainer,
-      const IntervalSet<uint16_t>& _managedNonEphemeralPorts,
-      const process::Owned<EphemeralPortsAllocator>& _ephemeralPortsAllocator,
-      const std::set<uint16_t>& _flowIDs)
-    : flags(_flags),
-      eth0(_eth0),
-      lo(_lo),
-      hostMAC(_hostMAC),
-      hostIPNetwork(_hostIPNetwork),
-      hostEth0MTU(_hostEth0MTU),
-      hostDefaultGateway(_hostDefaultGateway),
-      hostNetworkConfigurations(_hostNetworkConfigurations),
-      egressRateLimitPerContainer(_egressRateLimitPerContainer),
-      managedNonEphemeralPorts(_managedNonEphemeralPorts),
-      ephemeralPortsAllocator(_ephemeralPortsAllocator),
-      freeFlowIds(_flowIDs) {}
-
-  // Continuations.
-  Try<Nothing> _cleanup(Info* info, const Option<ContainerID>& containerId);
-  Try<Info*> _recover(pid_t pid);
-
-  void _update(
-      const ContainerID& containerId,
-      const process::Future<Option<int>>& status);
-
-  process::Future<ResourceStatistics> _usage(
-      const ResourceStatistics& result,
-      const process::Subprocess& s);
-
-  process::Future<ResourceStatistics> __usage(
-      ResourceStatistics result,
-      const process::Future<std::string>& out);
-
-  // Helper functions.
-  Try<Nothing> addHostIPFilters(
-      const routing::filter::ip::PortRange& range,
-      const Option<uint16_t>& flowId,
-      const std::string& veth);
-
-  Try<Nothing> removeHostIPFilters(
-      const routing::filter::ip::PortRange& range,
-      const std::string& veth,
-      bool removeFiltersOnVeth = true);
-
-  // Return the scripts that will be executed in the child context.
-  std::string scripts(Info* info);
-
-  uint16_t getNextFlowId();
-
-  const Flags flags;
-
-  const std::string eth0;
-  const std::string lo;
-  const net::MAC hostMAC;
-  const net::IPNetwork hostIPNetwork;
-  const size_t hostEth0MTU;
-  const net::IP hostDefaultGateway;
-
-  // Describe the host network configurations. It is a map between
-  // configure proc files (e.g., /proc/sys/net/core/somaxconn) and
-  // values of the configure proc files.
-  const hashmap<std::string, std::string> hostNetworkConfigurations;
-
-  // The optional throughput limit to containers' egress traffic.
-  const Option<Bytes> egressRateLimitPerContainer;
-
-  // All the non-ephemeral ports managed by the slave, as passed in
-  // via flags.resources.
-  const IntervalSet<uint16_t> managedNonEphemeralPorts;
-
-  process::Owned<EphemeralPortsAllocator> ephemeralPortsAllocator;
-
-  // Store a set of unused flow ID's on this slave.
-  std::set<uint16_t> freeFlowIds;
-
-  hashmap<ContainerID, Info*> infos;
-
-  // Recovered containers from a previous run that weren't managed by
-  // the network isolator.
-  hashset<ContainerID> unmanaged;
-};
-
-
-// Defines the subcommand for 'update' that needs to be executed by a
-// subprocess to update the filters inside a container.
-class PortMappingUpdate : public Subcommand
-{
-public:
-  static const char* NAME;
-
-  struct Flags : public flags::FlagsBase
-  {
-    Flags();
-
-    Option<std::string> eth0_name;
-    Option<std::string> lo_name;
-    Option<pid_t> pid;
-    Option<JSON::Object> ports_to_add;
-    Option<JSON::Object> ports_to_remove;
-  };
-
-  PortMappingUpdate() : Subcommand(NAME) {}
-
-  Flags flags;
-
-protected:
-  virtual int execute();
-  virtual flags::FlagsBase* getFlags() { return &flags; }
-};
-
-
-// Defines the subcommand for 'statistics' that needs to be executed
-// by a subprocess to retrieve newtork statistics from inside a
-// container.
-class PortMappingStatistics : public Subcommand
-{
-public:
-  static const char* NAME;
-
-  struct Flags : public flags::FlagsBase
-  {
-    Flags();
-
-    Option<std::string> eth0_name;
-    Option<pid_t> pid;
-    bool enable_socket_statistics_summary;
-    bool enable_socket_statistics_details;
-  };
-
-  PortMappingStatistics() : Subcommand(NAME) {}
-
-  Flags flags;
-
-protected:
-  virtual int execute();
-  virtual flags::FlagsBase* getFlags() { return &flags; }
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __PORT_MAPPING_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/posix.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/posix.hpp b/src/slave/containerizer/isolators/posix.hpp
deleted file mode 100644
index 00df902..0000000
--- a/src/slave/containerizer/isolators/posix.hpp
+++ /dev/null
@@ -1,213 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __POSIX_ISOLATOR_HPP__
-#define __POSIX_ISOLATOR_HPP__
-
-#include <process/future.hpp>
-
-#include <stout/hashmap.hpp>
-#include <stout/os.hpp>
-
-#include <stout/os/pstree.hpp>
-
-#include "slave/flags.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-#include "usage/usage.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// A basic MesosIsolatorProcess that keeps track of the pid but
-// doesn't do any resource isolation. Subclasses must implement
-// usage() for their appropriate resource(s).
-class PosixIsolatorProcess : public MesosIsolatorProcess
-{
-public:
-  virtual process::Future<Nothing> recover(
-      const std::list<mesos::slave::ContainerState>& state,
-      const hashset<ContainerID>& orphans)
-  {
-    foreach (const mesos::slave::ContainerState& run, state) {
-      // This should (almost) never occur: see comment in
-      // PosixLauncher::recover().
-      if (pids.contains(run.container_id())) {
-        return process::Failure("Container already recovered");
-      }
-
-      pids.put(run.container_id(), run.pid());
-
-      process::Owned<process::Promise<mesos::slave::ContainerLimitation>>
-        promise(new process::Promise<mesos::slave::ContainerLimitation>());
-      promises.put(run.container_id(), promise);
-    }
-
-    return Nothing();
-  }
-
-  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& user)
-  {
-    if (promises.contains(containerId)) {
-      return process::Failure("Container " + stringify(containerId) +
-                              " has already been prepared");
-    }
-
-    process::Owned<process::Promise<mesos::slave::ContainerLimitation>> promise(
-        new process::Promise<mesos::slave::ContainerLimitation>());
-    promises.put(containerId, promise);
-
-    return None();
-  }
-
-  virtual process::Future<Nothing> isolate(
-      const ContainerID& containerId,
-      pid_t pid)
-  {
-    if (!promises.contains(containerId)) {
-      return process::Failure("Unknown container: " + stringify(containerId));
-    }
-
-    pids.put(containerId, pid);
-
-    return Nothing();
-  }
-
-  virtual process::Future<mesos::slave::ContainerLimitation> watch(
-      const ContainerID& containerId)
-  {
-    if (!promises.contains(containerId)) {
-      return process::Failure("Unknown container: " + stringify(containerId));
-    }
-
-    return promises[containerId]->future();
-  }
-
-  virtual process::Future<Nothing> update(
-      const ContainerID& containerId,
-      const Resources& resources)
-  {
-    if (!promises.contains(containerId)) {
-      return process::Failure("Unknown container: " + stringify(containerId));
-    }
-
-    // No resources are actually isolated so nothing to do.
-    return Nothing();
-  }
-
-  virtual process::Future<Nothing> cleanup(const ContainerID& containerId)
-  {
-    if (!promises.contains(containerId)) {
-      return process::Failure("Unknown container: " + stringify(containerId));
-    }
-
-    // TODO(idownes): We should discard the container's promise here to signal
-    // to anyone that holds the future from watch().
-    promises.erase(containerId);
-
-    pids.erase(containerId);
-
-    return Nothing();
-  }
-
-protected:
-  hashmap<ContainerID, pid_t> pids;
-  hashmap<ContainerID,
-          process::Owned<process::Promise<mesos::slave::ContainerLimitation>>>
-    promises;
-};
-
-
-class PosixCpuIsolatorProcess : public PosixIsolatorProcess
-{
-public:
-  static Try<mesos::slave::Isolator*> create(const Flags& flags)
-  {
-    process::Owned<MesosIsolatorProcess> process(
-        new PosixCpuIsolatorProcess());
-
-    return new MesosIsolator(process);
-  }
-
-  virtual process::Future<ResourceStatistics> usage(
-      const ContainerID& containerId)
-  {
-    if (!pids.contains(containerId)) {
-      LOG(WARNING) << "No resource usage for unknown container '"
-                   << containerId << "'";
-      return ResourceStatistics();
-    }
-
-    // Use 'mesos-usage' but only request 'cpus_' values.
-    Try<ResourceStatistics> usage =
-      mesos::internal::usage(pids.get(containerId).get(), false, true);
-    if (usage.isError()) {
-      return process::Failure(usage.error());
-    }
-    return usage.get();
-  }
-
-private:
-  PosixCpuIsolatorProcess() {}
-};
-
-
-class PosixMemIsolatorProcess : public PosixIsolatorProcess
-{
-public:
-  static Try<mesos::slave::Isolator*> create(const Flags& flags)
-  {
-    process::Owned<MesosIsolatorProcess> process(
-        new PosixMemIsolatorProcess());
-
-    return new MesosIsolator(process);
-  }
-
-  virtual process::Future<ResourceStatistics> usage(
-      const ContainerID& containerId)
-  {
-    if (!pids.contains(containerId)) {
-      LOG(WARNING) << "No resource usage for unknown container '"
-                   << containerId << "'";
-      return ResourceStatistics();
-    }
-
-    // Use 'mesos-usage' but only request 'mem_' values.
-    Try<ResourceStatistics> usage =
-      mesos::internal::usage(pids.get(containerId).get(), true, false);
-    if (usage.isError()) {
-      return process::Failure(usage.error());
-    }
-    return usage.get();
-  }
-
-private:
-  PosixMemIsolatorProcess() {}
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __POSIX_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/posix/disk.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/posix/disk.cpp b/src/slave/containerizer/isolators/posix/disk.cpp
deleted file mode 100644
index 73e62a2..0000000
--- a/src/slave/containerizer/isolators/posix/disk.cpp
+++ /dev/null
@@ -1,525 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <signal.h>
-
-#ifdef __linux__
-#include <sys/prctl.h>
-#endif
-#include <sys/types.h>
-
-#include <deque>
-#include <tuple>
-
-#include <glog/logging.h>
-
-#include <process/check.hpp>
-#include <process/collect.hpp>
-#include <process/defer.hpp>
-#include <process/delay.hpp>
-#include <process/io.hpp>
-#include <process/subprocess.hpp>
-
-#include <stout/check.hpp>
-#include <stout/foreach.hpp>
-#include <stout/lambda.hpp>
-#include <stout/numify.hpp>
-#include <stout/strings.hpp>
-
-#include <stout/os/exists.hpp>
-#include <stout/os/killtree.hpp>
-
-#include "common/protobuf_utils.hpp"
-
-#include "slave/containerizer/isolators/posix/disk.hpp"
-
-using namespace process;
-
-using std::deque;
-using std::list;
-using std::string;
-using std::vector;
-
-using mesos::slave::ContainerLimitation;
-using mesos::slave::ContainerPrepareInfo;
-using mesos::slave::ContainerState;
-using mesos::slave::Isolator;
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-Try<Isolator*> PosixDiskIsolatorProcess::create(const Flags& flags)
-{
-  // TODO(jieyu): Check the availability of command 'du'.
-
-  return new MesosIsolator(process::Owned<MesosIsolatorProcess>(
-        new PosixDiskIsolatorProcess(flags)));
-}
-
-
-PosixDiskIsolatorProcess::Info::PathInfo::~PathInfo()
-{
-  usage.discard();
-}
-
-
-PosixDiskIsolatorProcess::PosixDiskIsolatorProcess(const Flags& _flags)
-  : flags(_flags), collector(flags.container_disk_watch_interval) {}
-
-
-PosixDiskIsolatorProcess::~PosixDiskIsolatorProcess() {}
-
-
-Future<Nothing> PosixDiskIsolatorProcess::recover(
-    const list<ContainerState>& states,
-    const hashset<ContainerID>& orphans)
-{
-  foreach (const ContainerState& state, states) {
-    // Since we checkpoint the executor after we create its working
-    // directory, the working directory should definitely exist.
-    CHECK(os::exists(state.directory()))
-      << "Executor work directory " << state.directory() << " doesn't exist";
-
-    infos.put(state.container_id(), Owned<Info>(new Info(state.directory())));
-  }
-
-  return Nothing();
-}
-
-
-Future<Option<ContainerPrepareInfo>> PosixDiskIsolatorProcess::prepare(
-    const ContainerID& containerId,
-    const ExecutorInfo& executorInfo,
-    const string& directory,
-    const Option<string>& user)
-{
-  if (infos.contains(containerId)) {
-    return Failure("Container has already been prepared");
-  }
-
-  infos.put(containerId, Owned<Info>(new Info(directory)));
-
-  return None();
-}
-
-
-Future<Nothing> PosixDiskIsolatorProcess::isolate(
-    const ContainerID& containerId,
-    pid_t pid)
-{
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  return Nothing();
-}
-
-
-Future<ContainerLimitation> PosixDiskIsolatorProcess::watch(
-    const ContainerID& containerId)
-{
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  return infos[containerId]->limitation.future();
-}
-
-
-Future<Nothing> PosixDiskIsolatorProcess::update(
-    const ContainerID& containerId,
-    const Resources& resources)
-{
-  if (!infos.contains(containerId)) {
-    LOG(WARNING) << "Ignoring update for unknown container " << containerId;
-    return Nothing();
-  }
-
-  LOG(INFO) << "Updating the disk resources for container "
-            << containerId << " to " << resources;
-
-  const Owned<Info>& info = infos[containerId];
-
-  // This stores the updated quotas.
-  hashmap<string, Resources> quotas;
-
-  foreach (const Resource& resource, resources) {
-    if (resource.name() != "disk") {
-      continue;
-    }
-
-    // The path at which we will collect disk usage and enforce quota.
-    string path;
-
-    // NOTE: We do not allow the case where has_disk() is true but
-    // with nothing set inside DiskInfo. The master will enforce it.
-    if (!resource.has_disk()) {
-      // Regular disk used for executor working directory.
-      path = info->directory;
-    } else {
-      // TODO(jieyu): Support persistent volmes as well.
-      LOG(ERROR) << "Enforcing disk quota unsupported for " << resource;
-      continue;
-    }
-
-    quotas[path] += resource;
-  }
-
-  // Update the quota for paths. For each new path, we also initiate
-  // the disk usage collection.
-  foreachpair (const string& path, const Resources& quota, quotas) {
-    if (!info->paths.contains(path)) {
-      info->paths[path].usage = collector.usage(path)
-        .onAny(defer(
-            PID<PosixDiskIsolatorProcess>(this),
-            &PosixDiskIsolatorProcess::_collect,
-            containerId,
-            path,
-            lambda::_1));
-    }
-
-    info->paths[path].quota = quota;
-  }
-
-  // Remove paths that we no longer interested in.
-  foreach (const string& path, info->paths.keys()) {
-    if (!quotas.contains(path)) {
-      info->paths.erase(path);
-    }
-  }
-
-  return Nothing();
-}
-
-
-void PosixDiskIsolatorProcess::_collect(
-    const ContainerID& containerId,
-    const string& path,
-    const Future<Bytes>& future)
-{
-  if (future.isDiscarded()) {
-    LOG(INFO) << "Checking disk usage at '" << path << "' for container "
-              << containerId << " has been cancelled";
-  } else if (future.isFailed()) {
-    LOG(ERROR) << "Checking disk usage at '" << path << "' for container "
-               << containerId << " has failed: " << future.failure();
-  }
-
-  if (!infos.contains(containerId)) {
-    // The container might have just been destroyed.
-    return;
-  }
-
-  const Owned<Info>& info = infos[containerId];
-
-  if (!info->paths.contains(path)) {
-    // The path might have just been removed from this container's
-    // resources.
-    return;
-  }
-
-  // Check if the disk usage exceeds the quota. If yes, report the
-  // limitation. We keep collecting the disk usage for 'path' by
-  // initiating another round of disk usage check. The check will be
-  // throttled by DiskUsageCollector.
-  if (future.isReady()) {
-    // Save the last disk usage.
-    info->paths[path].lastUsage = future.get();
-
-    if (flags.enforce_container_disk_quota) {
-      Option<Bytes> quota = info->paths[path].quota.disk();
-      CHECK_SOME(quota);
-
-      if (future.get() > quota.get()) {
-        info->limitation.set(
-            protobuf::slave::createContainerLimitation(
-                Resources(info->paths[path].quota),
-                "Disk usage (" + stringify(future.get()) +
-                ") exceeds quota (" + stringify(quota.get()) + ")",
-                TaskStatus::REASON_CONTAINER_LIMITATION_DISK));
-      }
-    }
-  }
-
-  info->paths[path].usage = collector.usage(path)
-    .onAny(defer(
-        PID<PosixDiskIsolatorProcess>(this),
-        &PosixDiskIsolatorProcess::_collect,
-        containerId,
-        path,
-        lambda::_1));
-}
-
-
-Future<ResourceStatistics> PosixDiskIsolatorProcess::usage(
-    const ContainerID& containerId)
-{
-  if (!infos.contains(containerId)) {
-    return Failure("Unknown container");
-  }
-
-  ResourceStatistics result;
-
-  const Owned<Info>& info = infos[containerId];
-
-  if (info->paths.contains(info->directory)) {
-    Option<Bytes> quota = info->paths[info->directory].quota.disk();
-    CHECK_SOME(quota);
-
-    result.set_disk_limit_bytes(quota.get().bytes());
-
-    // NOTE: There may be a large delay (# of containers * interval)
-    // until an initial cached value is returned here!
-    if (info->paths[info->directory].lastUsage.isSome()) {
-      result.set_disk_used_bytes(
-          info->paths[info->directory].lastUsage.get().bytes());
-    }
-  }
-
-  return result;
-}
-
-
-Future<Nothing> PosixDiskIsolatorProcess::cleanup(
-    const ContainerID& containerId)
-{
-  if (!infos.contains(containerId)) {
-    LOG(WARNING) << "Ignoring cleanup for unknown container " << containerId;
-    return Nothing();
-  }
-
-  infos.erase(containerId);
-
-  return Nothing();
-}
-
-
-class DiskUsageCollectorProcess : public Process<DiskUsageCollectorProcess>
-{
-public:
-  DiskUsageCollectorProcess(const Duration& _interval) : interval(_interval) {}
-  virtual ~DiskUsageCollectorProcess() {}
-
-  Future<Bytes> usage(const string& path)
-  {
-    foreach (const Owned<Entry>& entry, entries) {
-      if (entry->path == path) {
-        return entry->promise.future();
-      }
-    }
-
-    entries.push_back(Owned<Entry>(new Entry(path)));
-
-    // Install onDiscard callback.
-    Future<Bytes> future = entries.back()->promise.future();
-    future.onDiscard(defer(self(), &Self::discard, path));
-
-    return future;
-  }
-
-protected:
-  void initialize()
-  {
-    schedule();
-  }
-
-  void finalize()
-  {
-    foreach (const Owned<Entry>& entry, entries) {
-      if (entry->du.isSome() && entry->du.get().status().isPending()) {
-        os::killtree(entry->du.get().pid(), SIGKILL);
-      }
-
-      entry->promise.fail("DiskUsageCollector is destroyed");
-    }
-  }
-
-private:
-  // Describe a single pending check.
-  struct Entry
-  {
-    explicit Entry(const string& _path) : path(_path) {}
-
-    string path;
-    Option<Subprocess> du;
-    Promise<Bytes> promise;
-  };
-
-  // This function is invoked right before each 'du' is exec'ed. Note
-  // that this function needs to be async signal safe.
-  static int setupChild()
-  {
-#ifdef __linux__
-    // Kill the child process if the parent exits.
-    // NOTE: This function should never returns non-zero because we
-    // are passing in a valid signal.
-    return ::prctl(PR_SET_PDEATHSIG, SIGKILL);
-#else
-    return 0;
-#endif
-  }
-
-  void discard(const string& path)
-  {
-    for (auto it = entries.begin(); it != entries.end(); ++it) {
-      // We only cancel those checks whose 'du' haven't been launched.
-      if ((*it)->path == path && (*it)->du.isNone()) {
-        (*it)->promise.discard();
-        entries.erase(it);
-        break;
-      }
-    }
-  }
-
-  // Schedule a 'du' to be invoked. The current implementation does
-  // not allow multiple 'du's running concurrently. The minimal
-  // interval between two subsequent 'du's is controlled by 'interval'
-  // for throttling purpose.
-  void schedule()
-  {
-    if (entries.empty()) {
-      delay(interval, self(), &Self::schedule);
-      return;
-    }
-
-    const Owned<Entry>& entry = entries.front();
-
-    // Invoke 'du' and report number of 1K-byte blocks. We fix the
-    // block size here so that we can get consistent results on all
-    // platforms (e.g., OS X uses 512 byte blocks).
-    //
-    // NOTE: The 'du' processes are run in the slave's cgroup and it
-    // will be that cgroup that is charged for (a) memory to cache the
-    // fs data structures, (b) disk I/O to read those structures, and
-    // (c) the cpu time to traverse.
-    Try<Subprocess> s = subprocess(
-        "du",
-        vector<string>({"du", "-k", "-s", entry->path}),
-        Subprocess::PATH("/dev/null"),
-        Subprocess::PIPE(),
-        Subprocess::PIPE(),
-        None(),
-        None(),
-        setupChild);
-
-    if (s.isError()) {
-      entry->promise.fail("Failed to exec 'du': " + s.error());
-
-      entries.pop_front();
-      delay(interval, self(), &Self::schedule);
-      return;
-    }
-
-    entry->du = s.get();
-
-    await(s.get().status(),
-          io::read(s.get().out().get()),
-          io::read(s.get().err().get()))
-      .onAny(defer(self(), &Self::_schedule, lambda::_1));
-  }
-
-  void _schedule(const Future<std::tuple<
-      Future<Option<int>>,
-      Future<string>,
-      Future<string>>>& future)
-  {
-    CHECK_READY(future);
-    CHECK(!entries.empty());
-
-    const Owned<Entry>& entry = entries.front();
-    CHECK_SOME(entry->du);
-
-    Future<Option<int>> status = std::get<0>(future.get());
-
-    if (!status.isReady()) {
-      entry->promise.fail(
-          "Failed to perform 'du': " +
-          (status.isFailed() ? status.failure() : "discarded"));
-    } else if (status.get().isNone()) {
-      entry->promise.fail("Failed to reap the status of 'du'");
-    } else if (status.get().get() != 0) {
-      Future<string> error = std::get<2>(future.get());
-      if (!error.isReady()) {
-        entry->promise.fail(
-            "Failed to perform 'du'. Reading stderr failed: " +
-            (error.isFailed() ? error.failure() : "discarded"));
-      } else {
-        entry->promise.fail("Failed to perform 'du': " + error.get());
-      }
-    } else {
-      Future<string> output = std::get<1>(future.get());
-      if (!output.isReady()) {
-        entry->promise.fail(
-            "Failed to read stdout from 'du': " +
-            (output.isFailed() ? output.failure() : "discarded"));
-      } else {
-        // Parsing the output from 'du'. The following is a sample
-        // output. Tab is used as the delimiter between the number of
-        // blocks and the checked path.
-        // $ du /var/lib/mesos/.../runs/container_id
-        // 1024   /var/lib/mesos/.../runs/container_id
-        vector<string> tokens = strings::tokenize(output.get(), " \t");
-        if (tokens.empty()) {
-          entry->promise.fail("Unexpected output from 'du': " + output.get());
-        } else {
-          Try<size_t> value = numify<size_t>(tokens[0]);
-          if (value.isError()) {
-            entry->promise.fail("Unexpected output from 'du': " + output.get());
-          } else {
-            // Notify the callers.
-            entry->promise.set(Kilobytes(value.get()));
-          }
-        }
-      }
-    }
-
-    entries.pop_front();
-    delay(interval, self(), &Self::schedule);
-  }
-
-  const Duration interval;
-
-  // A queue of pending checks.
-  deque<Owned<Entry>> entries;
-};
-
-
-DiskUsageCollector::DiskUsageCollector(const Duration& interval)
-{
-  process = new DiskUsageCollectorProcess(interval);
-  spawn(process);
-}
-
-
-DiskUsageCollector::~DiskUsageCollector()
-{
-  terminate(process);
-  wait(process);
-  delete process;
-}
-
-
-Future<Bytes> DiskUsageCollector::usage(const string& path)
-{
-  return dispatch(process, &DiskUsageCollectorProcess::usage, path);
-}
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/isolators/posix/disk.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/posix/disk.hpp b/src/slave/containerizer/isolators/posix/disk.hpp
deleted file mode 100644
index 85df5d2..0000000
--- a/src/slave/containerizer/isolators/posix/disk.hpp
+++ /dev/null
@@ -1,151 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __POSIX_DISK_ISOLATOR_HPP__
-#define __POSIX_DISK_ISOLATOR_HPP__
-
-#include <string>
-
-#include <process/owned.hpp>
-
-#include <stout/bytes.hpp>
-#include <stout/duration.hpp>
-#include <stout/hashmap.hpp>
-
-#include "slave/flags.hpp"
-#include "slave/state.hpp"
-
-#include "slave/containerizer/isolator.hpp"
-
-namespace mesos {
-namespace internal {
-namespace slave {
-
-// Forward declarations.
-class DiskUsageCollectorProcess;
-
-
-// Responsible for collecting disk usage for paths, while ensuring
-// that an interval elapses between each collection.
-class DiskUsageCollector
-{
-public:
-  DiskUsageCollector(const Duration& interval);
-  ~DiskUsageCollector();
-
-  // Returns the disk usage rooted at 'path'. The user can discard the
-  // returned future to cancel the check.
-  process::Future<Bytes> usage(const std::string& path);
-
-private:
-  DiskUsageCollectorProcess* process;
-};
-
-
-// This isolator monitors the disk usage for containers, and reports
-// ContainerLimitation when a container exceeds its disk quota. This
-// leverages the DiskUsageCollector to ensure that we don't induce too
-// much CPU usage and disk caching effects from running 'du' too
-// often.
-//
-// NOTE: Currently all containers are processed in the same queue,
-// which means that when a container starts, it could take many disk
-// collection intervals until any data is available in the resource
-// usage statistics!
-//
-// TODO(jieyu): Consider handling each container independently, or
-// triggering an initial collection when the container starts, to
-// ensure that we have usage statistics without a large delay.
-class PosixDiskIsolatorProcess : public MesosIsolatorProcess
-{
-public:
-  static Try<mesos::slave::Isolator*> create(const Flags& flags);
-
-  virtual ~PosixDiskIsolatorProcess();
-
-  virtual process::Future<Nothing> recover(
-      const std::list<mesos::slave::ContainerState>& states,
-      const hashset<ContainerID>& orphans);
-
-  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
-      const ContainerID& containerId,
-      const ExecutorInfo& executorInfo,
-      const std::string& directory,
-      const Option<std::string>& user);
-
-  virtual process::Future<Nothing> isolate(
-      const ContainerID& containerId,
-      pid_t pid);
-
-  virtual process::Future<mesos::slave::ContainerLimitation> watch(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> update(
-      const ContainerID& containerId,
-      const Resources& resources);
-
-  virtual process::Future<ResourceStatistics> usage(
-      const ContainerID& containerId);
-
-  virtual process::Future<Nothing> cleanup(
-      const ContainerID& containerId);
-
-private:
-  PosixDiskIsolatorProcess(const Flags& flags);
-
-  void _collect(
-      const ContainerID& containerId,
-      const std::string& path,
-      const process::Future<Bytes>& future);
-
-  const Flags flags;
-  DiskUsageCollector collector;
-
-  struct Info
-  {
-    explicit Info(const std::string& _directory) : directory(_directory) {}
-
-    // We save executor working directory here so that we know where
-    // to collect disk usage for disk resources without DiskInfo.
-    const std::string directory;
-
-    process::Promise<mesos::slave::ContainerLimitation> limitation;
-
-    // The keys of the hashmaps contain the executor working directory
-    // above, and optionally paths of volumes used by the container.
-    // For each path, we maintain its quota and its last usage.
-    struct PathInfo
-    {
-      ~PathInfo();
-
-      Resources quota;
-      process::Future<Bytes> usage;
-      Option<Bytes> lastUsage;
-    };
-
-    hashmap<std::string, PathInfo> paths;
-  };
-
-  hashmap<ContainerID, process::Owned<Info>> infos;
-};
-
-} // namespace slave {
-} // namespace internal {
-} // namespace mesos {
-
-#endif // __POSIX_DISK_ISOLATOR_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/linux_launcher.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/linux_launcher.cpp b/src/slave/containerizer/linux_launcher.cpp
index c03b89e..654f0e1 100644
--- a/src/slave/containerizer/linux_launcher.cpp
+++ b/src/slave/containerizer/linux_launcher.cpp
@@ -40,7 +40,7 @@
 
 #include "slave/containerizer/linux_launcher.hpp"
 
-#include "slave/containerizer/isolators/namespaces/pid.hpp"
+#include "slave/containerizer/mesos/isolators/namespaces/pid.hpp"
 
 using namespace process;
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/containerizer.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/containerizer.cpp b/src/slave/containerizer/mesos/containerizer.cpp
index d1fc5a4..0664d00 100644
--- a/src/slave/containerizer/mesos/containerizer.cpp
+++ b/src/slave/containerizer/mesos/containerizer.cpp
@@ -47,36 +47,35 @@
 #include "slave/containerizer/linux_launcher.hpp"
 #endif
 
-#include "slave/containerizer/isolators/posix.hpp"
+#include "slave/containerizer/mesos/isolators/posix.hpp"
 
-#include "slave/containerizer/isolators/posix/disk.hpp"
+#include "slave/containerizer/mesos/isolators/posix/disk.hpp"
 
 #ifdef __linux__
-#include "slave/containerizer/isolators/cgroups/cpushare.hpp"
-#include "slave/containerizer/isolators/cgroups/mem.hpp"
-#include "slave/containerizer/isolators/cgroups/perf_event.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/cpushare.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/mem.hpp"
+#include "slave/containerizer/mesos/isolators/cgroups/perf_event.hpp"
 #endif
 
 #ifdef __linux__
-#include "slave/containerizer/isolators/filesystem/linux.hpp"
+#include "slave/containerizer/mesos/isolators/filesystem/linux.hpp"
 #endif
-#include "slave/containerizer/isolators/filesystem/posix.hpp"
+#include "slave/containerizer/mesos/isolators/filesystem/posix.hpp"
 #ifdef __linux__
-#include "slave/containerizer/isolators/filesystem/shared.hpp"
+#include "slave/containerizer/mesos/isolators/filesystem/shared.hpp"
 #endif
 
 #ifdef __linux__
-#include "slave/containerizer/isolators/namespaces/pid.hpp"
+#include "slave/containerizer/mesos/isolators/namespaces/pid.hpp"
 #endif
 
 #ifdef WITH_NETWORK_ISOLATOR
-#include "slave/containerizer/isolators/network/port_mapping.hpp"
+#include "slave/containerizer/mesos/isolators/network/port_mapping.hpp"
 #endif
 
 #include "slave/containerizer/mesos/containerizer.hpp"
 #include "slave/containerizer/mesos/launch.hpp"
-
-#include "slave/containerizer/provisioner/provisioner.hpp"
+#include "slave/containerizer/mesos/provisioner/provisioner.hpp"
 
 using std::list;
 using std::map;

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/cgroups/constants.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/constants.hpp b/src/slave/containerizer/mesos/isolators/cgroups/constants.hpp
new file mode 100644
index 0000000..db9dde0
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/cgroups/constants.hpp
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CGROUPS_ISOLATOR_CONSTANTS_HPP__
+#define __CGROUPS_ISOLATOR_CONSTANTS_HPP__
+
+#include <stout/bytes.hpp>
+#include <stout/duration.hpp>
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// CPU subsystem constants.
+const uint64_t CPU_SHARES_PER_CPU = 1024;
+const uint64_t CPU_SHARES_PER_CPU_REVOCABLE = 10;
+const uint64_t MIN_CPU_SHARES = 2; // Linux constant.
+const Duration CPU_CFS_PERIOD = Milliseconds(100); // Linux default.
+const Duration MIN_CPU_CFS_QUOTA = Milliseconds(1);
+
+
+// Memory subsystem constants.
+const Bytes MIN_MEMORY = Megabytes(32);
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __CGROUPS_ISOLATOR_CONSTANTS_HPP__

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/cgroups/cpushare.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/cpushare.cpp b/src/slave/containerizer/mesos/isolators/cgroups/cpushare.cpp
new file mode 100644
index 0000000..84a64e5
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/cgroups/cpushare.cpp
@@ -0,0 +1,572 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+#include <mesos/type_utils.hpp>
+#include <mesos/values.hpp>
+
+#include <process/collect.hpp>
+#include <process/defer.hpp>
+#include <process/pid.hpp>
+
+#include <stout/bytes.hpp>
+#include <stout/check.hpp>
+#include <stout/error.hpp>
+#include <stout/foreach.hpp>
+#include <stout/hashset.hpp>
+#include <stout/nothing.hpp>
+#include <stout/os.hpp>
+#include <stout/path.hpp>
+#include <stout/stringify.hpp>
+#include <stout/try.hpp>
+
+#include "linux/cgroups.hpp"
+
+#include "slave/containerizer/mesos/isolators/cgroups/cpushare.hpp"
+
+using namespace process;
+
+using std::list;
+using std::set;
+using std::string;
+using std::vector;
+
+using mesos::slave::ContainerLimitation;
+using mesos::slave::ContainerPrepareInfo;
+using mesos::slave::ContainerState;
+using mesos::slave::Isolator;
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+CgroupsCpushareIsolatorProcess::CgroupsCpushareIsolatorProcess(
+    const Flags& _flags,
+    const hashmap<string, string>& _hierarchies,
+    const vector<string>& _subsystems)
+  : flags(_flags),
+    hierarchies(_hierarchies),
+    subsystems(_subsystems) {}
+
+
+CgroupsCpushareIsolatorProcess::~CgroupsCpushareIsolatorProcess() {}
+
+
+Try<Isolator*> CgroupsCpushareIsolatorProcess::create(const Flags& flags)
+{
+  Try<string> hierarchyCpu = cgroups::prepare(
+        flags.cgroups_hierarchy,
+        "cpu",
+        flags.cgroups_root);
+
+  if (hierarchyCpu.isError()) {
+    return Error(
+        "Failed to prepare hierarchy for cpu subsystem: " +
+        hierarchyCpu.error());
+  }
+
+  Try<string> hierarchyCpuacct = cgroups::prepare(
+        flags.cgroups_hierarchy,
+        "cpuacct",
+        flags.cgroups_root);
+
+  if (hierarchyCpuacct.isError()) {
+    return Error(
+        "Failed to prepare hierarchy for cpuacct subsystem: " +
+        hierarchyCpuacct.error());
+  }
+
+  hashmap<string, string> hierarchies;
+  vector<string> subsystems;
+
+  hierarchies["cpu"] = hierarchyCpu.get();
+  hierarchies["cpuacct"] = hierarchyCpuacct.get();
+
+  if (hierarchyCpu.get() == hierarchyCpuacct.get()) {
+    // Subsystem cpu and cpuacct are co-mounted (e.g., systemd).
+    hierarchies["cpu,cpuacct"] = hierarchyCpu.get();
+    subsystems.push_back("cpu,cpuacct");
+
+    // Ensure that no other subsystem is attached to the hierarchy.
+    Try<set<string>> _subsystems = cgroups::subsystems(hierarchyCpu.get());
+    if (_subsystems.isError()) {
+      return Error(
+          "Failed to get the list of attached subsystems for hierarchy " +
+          hierarchyCpu.get());
+    } else if (_subsystems.get().size() != 2) {
+      return Error(
+          "Unexpected subsystems found attached to the hierarchy " +
+          hierarchyCpu.get());
+    }
+  } else {
+    // Subsystem cpu and cpuacct are mounted separately.
+    subsystems.push_back("cpu");
+    subsystems.push_back("cpuacct");
+
+    // Ensure that no other subsystem is attached to each of the
+    // hierarchies.
+    Try<set<string>> _subsystems = cgroups::subsystems(hierarchyCpu.get());
+    if (_subsystems.isError()) {
+      return Error(
+          "Failed to get the list of attached subsystems for hierarchy " +
+          hierarchyCpu.get());
+    } else if (_subsystems.get().size() != 1) {
+      return Error(
+          "Unexpected subsystems found attached to the hierarchy " +
+          hierarchyCpu.get());
+    }
+
+    _subsystems = cgroups::subsystems(hierarchyCpuacct.get());
+    if (_subsystems.isError()) {
+      return Error(
+          "Failed to get the list of attached subsystems for hierarchy " +
+          hierarchyCpuacct.get());
+    } else if (_subsystems.get().size() != 1) {
+      return Error(
+          "Unexpected subsystems found attached to the hierarchy " +
+          hierarchyCpuacct.get());
+    }
+  }
+
+  if (flags.cgroups_enable_cfs) {
+    Try<bool> exists = cgroups::exists(
+        hierarchies["cpu"],
+        flags.cgroups_root,
+        "cpu.cfs_quota_us");
+
+    if (exists.isError() || !exists.get()) {
+      return Error(
+          "Failed to find 'cpu.cfs_quota_us'. Your kernel "
+          "might be too old to use the CFS cgroups feature.");
+    }
+  }
+
+  process::Owned<MesosIsolatorProcess> process(
+      new CgroupsCpushareIsolatorProcess(flags, hierarchies, subsystems));
+
+  return new MesosIsolator(process);
+}
+
+// Rebuilds 'infos' from 'states', then deals with leftover (orphan) cgroups.
+Future<Nothing> CgroupsCpushareIsolatorProcess::recover(
+    const list<ContainerState>& states,
+    const hashset<ContainerID>& orphans)
+{
+  foreach (const ContainerState& state, states) {
+    const ContainerID& containerId = state.container_id();
+    const string cgroup = path::join(flags.cgroups_root, containerId.value());
+    // Existence is only probed in the 'cpu' hierarchy.
+    Try<bool> exists = cgroups::exists(hierarchies["cpu"], cgroup);
+    if (exists.isError()) {
+      foreachvalue (Info* info, infos) {
+        delete info;
+      }
+      infos.clear();
+      return Failure(
+          "Failed to check cgroup for container " + stringify(containerId));
+    }
+
+    if (!exists.get()) {
+      // This may occur if the executor has exited and the isolator
+      // has destroyed the cgroup but the slave dies before noticing
+      // this. This will be detected when the containerizer tries to
+      // monitor the executor's pid.
+      LOG(WARNING) << "Couldn't find cgroup for container " << containerId;
+      continue;
+    }
+    // Track the container; the Info is freed in '_cleanup' or on failure here.
+    infos[containerId] = new Info(containerId, cgroup);
+  }
+
+  // Remove orphan cgroups (cgroups under the root with no recovered Info).
+  foreach (const string& subsystem, subsystems) {
+    Try<vector<string>> cgroups = cgroups::get(
+        hierarchies[subsystem],
+        flags.cgroups_root);
+
+    if (cgroups.isError()) {
+      foreachvalue (Info* info, infos) {
+        delete info;
+      }
+      infos.clear();
+      return Failure(cgroups.error());
+    }
+
+    foreach (const string& cgroup, cgroups.get()) {
+      // Ignore the slave cgroup (see the --slave_subsystems flag).
+      // TODO(idownes): Remove this when the cgroups layout is
+      // updated, see MESOS-1185.
+      if (cgroup == path::join(flags.cgroups_root, "slave")) {
+        continue;
+      }
+      // The cgroup's basename is taken to be the container ID.
+      ContainerID containerId;
+      containerId.set_value(Path(cgroup).basename());
+      // Already tracked (recovered above or seen in another subsystem).
+      if (infos.contains(containerId)) {
+        continue;
+      }
+
+      // Known orphan cgroups will be destroyed by the containerizer
+      // using the normal cleanup path. See MESOS-2367 for details.
+      if (orphans.contains(containerId)) {
+        infos[containerId] = new Info(containerId, cgroup);
+        continue;
+      }
+
+      LOG(INFO) << "Removing unknown orphaned cgroup '"
+                << path::join(subsystem, cgroup) << "'";
+
+      // We don't wait on the destroy as we don't want to block recovery.
+      cgroups::destroy(
+          hierarchies[subsystem],
+          cgroup,
+          cgroups::DESTROY_TIMEOUT);
+    }
+  }
+
+  return Nothing();
+}
+
+// Creates the container's cgroup in each subsystem, then applies resources.
+Future<Option<ContainerPrepareInfo>> CgroupsCpushareIsolatorProcess::prepare(
+    const ContainerID& containerId,
+    const ExecutorInfo& executorInfo,
+    const string& directory,
+    const Option<string>& user)
+{
+  if (infos.contains(containerId)) {
+    return Failure("Container has already been prepared");
+  }
+
+  // TODO(bmahler): Don't insert into 'infos' unless we create the
+  // cgroup successfully. It's safe for now because 'cleanup' gets
+  // called if we return a Failure, but cleanup will fail because the
+  // cgroup does not exist when cgroups::destroy is called.
+  Info* info = new Info(
+      containerId, path::join(flags.cgroups_root, containerId.value()));
+
+  infos[containerId] = info;
+
+  foreach (const string& subsystem, subsystems) {
+    Try<bool> exists = cgroups::exists(hierarchies[subsystem], info->cgroup);
+    if (exists.isError()) {
+      return Failure("Failed to prepare isolator: " + exists.error());
+    } else if (exists.get()) {
+      return Failure("Failed to prepare isolator: cgroup already exists");
+    }
+
+    Try<Nothing> create = cgroups::create(hierarchies[subsystem], info->cgroup);
+    if (create.isError()) {
+      return Failure("Failed to prepare isolator: " + create.error());
+    }
+
+    // Chown the cgroup so the executor can create nested cgroups. Do
+    // not recurse so the control files are still owned by the slave
+    // user and thus cannot be changed by the executor.
+    if (user.isSome()) {
+      Try<Nothing> chown = os::chown(
+          user.get(),
+          path::join(hierarchies[subsystem], info->cgroup),
+          false);
+      if (chown.isError()) {
+        return Failure("Failed to prepare isolator: " + chown.error());
+      }
+    }
+  }
+  // Apply the executor's resources; this isolator returns no prepare info.
+  return update(containerId, executorInfo.resources())
+    .then([]() -> Future<Option<ContainerPrepareInfo>> {
+      return None();
+    });
+}
+
+// Records 'pid' and assigns it to the container's cgroup in each subsystem.
+Future<Nothing> CgroupsCpushareIsolatorProcess::isolate(
+    const ContainerID& containerId,
+    pid_t pid)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+  // The pid must not have been recorded yet for this container.
+  CHECK_NONE(info->pid);
+  info->pid = pid;
+
+  foreach (const string& subsystem, subsystems) {
+    Try<Nothing> assign = cgroups::assign(
+        hierarchies[subsystem],
+        info->cgroup,
+        pid);
+
+    if (assign.isError()) {
+      LOG(ERROR) << "Failed to assign container '" << info->containerId
+                 << "' to its own cgroup '"
+                 << path::join(hierarchies[subsystem], info->cgroup)
+                 << "' : " << assign.error();
+
+      return Failure("Failed to isolate container: " + assign.error());
+    }
+  }
+
+  return Nothing();
+}
+
+// Returns the future of the container's 'limitation' promise.
+Future<ContainerLimitation> CgroupsCpushareIsolatorProcess::watch(
+    const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  CHECK_NOTNULL(infos[containerId]);
+  // NOTE(review): no visible code satisfies 'limitation'; confirm intent.
+  return infos[containerId]->limitation.future();
+}
+
+// Writes 'cpu.shares' (and CFS period/quota if enabled) for the container.
+Future<Nothing> CgroupsCpushareIsolatorProcess::update(
+    const ContainerID& containerId,
+    const Resources& resources)
+{
+  if (resources.cpus().isNone()) {
+    return Failure("No cpus resource given");
+  }
+
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  const Option<string>& hierarchy = hierarchies.get("cpu");
+  if (hierarchy.isNone()) {
+    return Failure("No 'cpu' hierarchy");
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+  info->resources = resources;
+
+  double cpus = resources.cpus().get();
+
+  // Always set cpu.shares, never going below MIN_CPU_SHARES.
+  uint64_t shares;
+  // Revocable cpus use a separate per-cpu share constant when so flagged.
+  if (flags.revocable_cpu_low_priority &&
+      resources.revocable().cpus().isSome()) {
+    shares = std::max(
+        (uint64_t) (CPU_SHARES_PER_CPU_REVOCABLE * cpus),
+        MIN_CPU_SHARES);
+  } else {
+    shares = std::max(
+        (uint64_t) (CPU_SHARES_PER_CPU * cpus),
+        MIN_CPU_SHARES);
+  }
+
+  Try<Nothing> write = cgroups::cpu::shares(
+      hierarchy.get(),
+      info->cgroup,
+      shares);
+
+  if (write.isError()) {
+    return Failure("Failed to update 'cpu.shares': " + write.error());
+  }
+
+  LOG(INFO) << "Updated 'cpu.shares' to " << shares
+            << " (cpus " << cpus << ")"
+            << " for container " << containerId;
+
+  // Set the cfs period and quota (at least MIN_CPU_CFS_QUOTA) if enabled.
+  if (flags.cgroups_enable_cfs) {
+    write = cgroups::cpu::cfs_period_us(
+        hierarchy.get(),
+        info->cgroup,
+        CPU_CFS_PERIOD);
+
+    if (write.isError()) {
+      return Failure("Failed to update 'cpu.cfs_period_us': " + write.error());
+    }
+
+    Duration quota = std::max(CPU_CFS_PERIOD * cpus, MIN_CPU_CFS_QUOTA);
+
+    write = cgroups::cpu::cfs_quota_us(hierarchy.get(), info->cgroup, quota);
+    if (write.isError()) {
+      return Failure("Failed to update 'cpu.cfs_quota_us': " + write.error());
+    }
+
+    LOG(INFO) << "Updated 'cpu.cfs_period_us' to " << CPU_CFS_PERIOD
+              << " and 'cpu.cfs_quota_us' to " << quota
+              << " (cpus " << cpus << ")"
+              << " for container " << containerId;
+  }
+
+  return Nothing();
+}
+
+// Gathers cpu usage statistics for the container from its cgroups.
+Future<ResourceStatistics> CgroupsCpushareIsolatorProcess::usage(
+    const ContainerID& containerId)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  ResourceStatistics result;
+
+  // TODO(chzhcn): Getting the number of processes and threads is
+  // available as long as any cgroup subsystem is used so this best
+  // not be tied to a specific cgroup isolator. A better place is
+  // probably Linux Launcher, which uses the cgroup freezer subsystem.
+  // That requires some change for it to adopt the new semantics of
+  // reporting subsystem-independent cgroup usage.
+  // Note: The complexity of this operation is linear to the number of
+  // processes and threads in a container: the kernel has to allocate
+  // memory to contain the list of pids or tids; the userspace has to
+  // parse the cgroup files to get the size. If this proves to be a
+  // performance bottleneck, some kind of rate limiting mechanism
+  // needs to be employed.
+  if (flags.cgroups_cpu_enable_pids_and_tids_count) {
+    Try<std::set<pid_t>> pids =
+      cgroups::processes(hierarchies["cpuacct"], info->cgroup);
+    if (pids.isError()) {
+      return Failure("Failed to get number of processes: " + pids.error());
+    }
+
+    result.set_processes(pids.get().size());
+
+    Try<std::set<pid_t>> tids =
+      cgroups::threads(hierarchies["cpuacct"], info->cgroup);
+    if (tids.isError()) {
+      return Failure("Failed to get number of threads: " + tids.error());
+    }
+
+    result.set_threads(tids.get().size());
+  }
+
+  // Get the number of clock ticks, used for cpu accounting.
+  static long ticks = sysconf(_SC_CLK_TCK);
+
+  PCHECK(ticks > 0) << "Failed to get sysconf(_SC_CLK_TCK)";
+
+  // Add the cpuacct.stat information.
+  Try<hashmap<string, uint64_t>> stat = cgroups::stat(
+      hierarchies["cpuacct"],
+      info->cgroup,
+      "cpuacct.stat");
+
+  if (stat.isError()) {
+    return Failure("Failed to read cpuacct.stat: " + stat.error());
+  }
+
+  // TODO(bmahler): Add namespacing to cgroups to enforce the expected
+  // structure, e.g., cgroups::cpuacct::stat.
+  Option<uint64_t> user = stat.get().get("user");
+  Option<uint64_t> system = stat.get().get("system");
+  // Convert from clock ticks to seconds; skipped unless both keys exist.
+  if (user.isSome() && system.isSome()) {
+    result.set_cpus_user_time_secs((double) user.get() / (double) ticks);
+    result.set_cpus_system_time_secs((double) system.get() / (double) ticks);
+  }
+
+  // Add the cpu.stat information only if CFS is enabled.
+  if (flags.cgroups_enable_cfs) {
+    stat = cgroups::stat(hierarchies["cpu"], info->cgroup, "cpu.stat");
+    if (stat.isError()) {
+      return Failure("Failed to read cpu.stat: " + stat.error());
+    }
+
+    Option<uint64_t> nr_periods = stat.get().get("nr_periods");
+    if (nr_periods.isSome()) {
+      result.set_cpus_nr_periods(nr_periods.get());
+    }
+
+    Option<uint64_t> nr_throttled = stat.get().get("nr_throttled");
+    if (nr_throttled.isSome()) {
+      result.set_cpus_nr_throttled(nr_throttled.get());
+    }
+
+    Option<uint64_t> throttled_time = stat.get().get("throttled_time");
+    if (throttled_time.isSome()) {
+      result.set_cpus_throttled_time_secs(
+          Nanoseconds(throttled_time.get()).secs());
+    }
+  }
+
+  return result;
+}
+
+// Destroys the container's cgroups in every subsystem; see '_cleanup'.
+Future<Nothing> CgroupsCpushareIsolatorProcess::cleanup(
+    const ContainerID& containerId)
+{
+  // Multiple calls may occur during test clean up.
+  if (!infos.contains(containerId)) {
+    VLOG(1) << "Ignoring cleanup request for unknown container: "
+            << containerId;
+
+    return Nothing();
+  }
+
+  Info* info = CHECK_NOTNULL(infos[containerId]);
+
+  list<Future<Nothing>> futures;
+  foreach (const string& subsystem, subsystems) {
+    futures.push_back(cgroups::destroy(
+        hierarchies[subsystem],
+        info->cgroup,
+        cgroups::DESTROY_TIMEOUT));
+  }
+  // When the collected future transitions, '_cleanup' runs on this process.
+  return collect(futures)
+    .onAny(defer(PID<CgroupsCpushareIsolatorProcess>(this),
+                &CgroupsCpushareIsolatorProcess::_cleanup,
+                containerId,
+                lambda::_1))
+    .then([]() { return Nothing(); });
+}
+
+// Continuation of 'cleanup': frees the Info only if all destroys succeeded.
+Future<list<Nothing>> CgroupsCpushareIsolatorProcess::_cleanup(
+    const ContainerID& containerId,
+    const Future<list<Nothing>>& future)
+{
+  if (!infos.contains(containerId)) {
+    return Failure("Unknown container");
+  }
+
+  CHECK_NOTNULL(infos[containerId]);
+
+  if (!future.isReady()) {
+    return Failure(
+        "Failed to clean up container " + stringify(containerId) +
+        " : " + (future.isFailed() ? future.failure() : "discarded"));
+  }
+  // All cgroups were destroyed; release the bookkeeping for this container.
+  delete infos[containerId];
+  infos.erase(containerId);
+
+  return future;
+}
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {

http://git-wip-us.apache.org/repos/asf/mesos/blob/9a722d74/src/slave/containerizer/mesos/isolators/cgroups/cpushare.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/mesos/isolators/cgroups/cpushare.hpp b/src/slave/containerizer/mesos/isolators/cgroups/cpushare.hpp
new file mode 100644
index 0000000..4ce37bf
--- /dev/null
+++ b/src/slave/containerizer/mesos/isolators/cgroups/cpushare.hpp
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CPUSHARE_ISOLATOR_HPP__
+#define __CPUSHARE_ISOLATOR_HPP__
+
+#include <sys/types.h>
+
+#include <string>
+#include <vector>
+
+#include <process/future.hpp>
+
+#include <stout/hashmap.hpp>
+#include <stout/option.hpp>
+
+#include "slave/flags.hpp"
+
+#include "slave/containerizer/isolator.hpp"
+
+#include "slave/containerizer/mesos/isolators/cgroups/constants.hpp"
+
+namespace mesos {
+namespace internal {
+namespace slave {
+
+// Use the Linux cpu cgroup controller for cpu isolation which uses the
+// Completely Fair Scheduler (CFS).
+// - cpushare implements proportionally weighted scheduling.
+// - cfs implements hard quota based scheduling.
+class CgroupsCpushareIsolatorProcess : public MesosIsolatorProcess
+{
+public:
+  static Try<mesos::slave::Isolator*> create(const Flags& flags);
+
+  virtual ~CgroupsCpushareIsolatorProcess();
+  // Re-tracks known containers and removes unknown orphaned cgroups.
+  virtual process::Future<Nothing> recover(
+      const std::list<mesos::slave::ContainerState>& states,
+      const hashset<ContainerID>& orphans);
+  // Creates the container's cgroups and applies the executor's resources.
+  virtual process::Future<Option<mesos::slave::ContainerPrepareInfo>> prepare(
+      const ContainerID& containerId,
+      const ExecutorInfo& executorInfo,
+      const std::string& directory,
+      const Option<std::string>& user);
+  // Assigns 'pid' to the container's cgroup in every subsystem.
+  virtual process::Future<Nothing> isolate(
+      const ContainerID& containerId,
+      pid_t pid);
+  // Returns the future of the container's 'limitation' promise.
+  virtual process::Future<mesos::slave::ContainerLimitation> watch(
+      const ContainerID& containerId);
+  // Writes cpu.shares and, if CFS is enabled, the cfs period and quota.
+  virtual process::Future<Nothing> update(
+      const ContainerID& containerId,
+      const Resources& resources);
+  // Reports cpu statistics read from the container's cgroups.
+  virtual process::Future<ResourceStatistics> usage(
+      const ContainerID& containerId);
+  // Destroys the container's cgroups and, on success, forgets the container.
+  virtual process::Future<Nothing> cleanup(
+      const ContainerID& containerId);
+
+private:
+  CgroupsCpushareIsolatorProcess(
+      const Flags& flags,
+      const hashmap<std::string, std::string>& hierarchies,
+      const std::vector<std::string>& subsystems);
+  // Continuation of 'cleanup', invoked once the cgroup destroys complete.
+  virtual process::Future<std::list<Nothing>> _cleanup(
+      const ContainerID& containerId,
+      const process::Future<std::list<Nothing>>& future);
+  // Per-container bookkeeping; instances are heap-allocated into 'infos'.
+  struct Info
+  {
+    Info(const ContainerID& _containerId, const std::string& _cgroup)
+      : containerId(_containerId), cgroup(_cgroup) {}
+
+    const ContainerID containerId;
+    const std::string cgroup;
+    Option<pid_t> pid;
+    Option<Resources> resources;
+    // Its future is what 'watch' hands back to the caller.
+    process::Promise<mesos::slave::ContainerLimitation> limitation;
+  };
+
+  const Flags flags;
+
+  // Map from subsystem to hierarchy.
+  hashmap<std::string, std::string> hierarchies;
+
+  // Subsystems used for this isolator. Typically, there are two
+  // elements in the vector: 'cpu' and 'cpuacct'. If cpu and cpuacct
+  // systems are co-mounted (e.g., systems using systemd), then there
+  // will be only one element in the vector which is 'cpu,cpuacct'.
+  std::vector<std::string> subsystems;
+
+  // TODO(bmahler): Use Owned<Info>.
+  hashmap<ContainerID, Info*> infos;
+};
+
+} // namespace slave {
+} // namespace internal {
+} // namespace mesos {
+
+#endif // __CPUSHARE_ISOLATOR_HPP__