You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ji...@apache.org on 2014/09/17 19:52:35 UTC

git commit: Added per-container egress rate limit for port mapping network isolator.

Repository: mesos
Updated Branches:
  refs/heads/master a47398bec -> 190e87c51


Added per-container egress rate limit for port mapping network isolator.

When the limit is set, the network isolator installs HTB qdiscs in
containers so that their egress traffic throughput does not exceed the
specified value.

Review: https://reviews.apache.org/r/25488


Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/190e87c5
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/190e87c5
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/190e87c5

Branch: refs/heads/master
Commit: 190e87c51d25646fa501ffca0bf7150157982050
Parents: a47398b
Author: Chi Zhang <ch...@gmail.com>
Authored: Wed Sep 17 10:22:16 2014 -0700
Committer: Jie Yu <yu...@gmail.com>
Committed: Wed Sep 17 10:36:23 2014 -0700

----------------------------------------------------------------------
 .../isolators/network/port_mapping.cpp          |  62 ++++++++++
 .../isolators/network/port_mapping.hpp          |   6 +
 src/slave/flags.hpp                             |   9 ++
 src/tests/port_mapping_tests.cpp                | 121 +++++++++++++++++++
 4 files changed, 198 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mesos/blob/190e87c5/src/slave/containerizer/isolators/network/port_mapping.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/network/port_mapping.cpp b/src/slave/containerizer/isolators/network/port_mapping.cpp
index 9248460..2766a00 100644
--- a/src/slave/containerizer/isolators/network/port_mapping.cpp
+++ b/src/slave/containerizer/isolators/network/port_mapping.cpp
@@ -815,6 +815,49 @@ Try<Isolator*> PortMappingIsolatorProcess::create(const Flags& flags)
 
   LOG(INFO) << "Using " << lo.get() << " as the loopback interface";
 
+  // If egress rate limit is provided, do a sanity check that it is
+  // not greater than the host physical link speed.
+  Option<Bytes> egressRateLimitPerContainer;
+  if (flags.egress_rate_limit_per_container.isSome()) {
+    // Read host physical link speed from /sys/class/net/eth0/speed.
+    // This value is in MBits/s.
+    Try<string> value =
+      os::read(path::join("/sys/class/net", eth0.get(), "speed"));
+
+    if (value.isError()) {
+      return Error(
+          "Failed to read " +
+          path::join("/sys/class/net", eth0.get(), "speed") +
+          ": " + value.error());
+    }
+
+    Try<uint64_t> hostLinkSpeed = numify<uint64_t>(strings::trim(value.get()));
+    CHECK_SOME(hostLinkSpeed);
+
+    // It could be possible that the nic driver doesn't support
+    // reporting physical link speed. In that case, report error.
+    if (hostLinkSpeed.get() == 0xFFFFFFFF) {
+      return Error(
+          "Network Isolator failed to determine link speed for " + eth0.get());
+    }
+
+    // Convert host link speed to Bytes/s for comparison.
+    if (hostLinkSpeed.get() * 1000000 / 8 <
+        flags.egress_rate_limit_per_container.get().bytes()) {
+      return Error(
+          "The given egress traffic limit for containers " +
+          stringify(flags.egress_rate_limit_per_container.get().bytes()) +
+          " Bytes/s is greater than the host link speed " +
+          stringify(hostLinkSpeed.get() * 1000000 / 8) + " Bytes/s");
+    }
+
+    if (flags.egress_rate_limit_per_container.get() != Bytes(0)) {
+      egressRateLimitPerContainer = flags.egress_rate_limit_per_container.get();
+    } else {
+      LOG(WARNING) << "Ignoring the given zero egress rate limit";
+    }
+  }
+
   // Get the host IP, MAC and default gateway.
   Result<net::IP> hostIP = net::ip(eth0.get());
   if (!hostIP.isSome()) {
@@ -1036,6 +1079,7 @@ Try<Isolator*> PortMappingIsolatorProcess::create(const Flags& flags)
           hostIP.get(),
           hostEth0MTU.get(),
           hostDefaultGateway.get(),
+          egressRateLimitPerContainer,
           nonEphemeralPorts,
           ephemeralPortsAllocator)));
 }
@@ -2423,6 +2467,24 @@ string PortMappingIsolatorProcess::scripts(Info* info)
   script << "tc filter show dev " << eth0 << " parent ffff:\n";
   script << "tc filter show dev " << lo << " parent ffff:\n";
 
+  // If throughput limit for container egress traffic exists, use HTB
+  // qdisc to achieve traffic shaping.
+  // TBF has some known issues with GSO packets.
+  // https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/:
+  // e43ac79a4bc6ca90de4ba10983b4ca39cd215b4b
+  // Additionally, HTB has a simpler interface for just capping the
+  // throughput. TBF requires other parameters such as 'burst' that
+  // HTB already has default values for.
+  if (egressRateLimitPerContainer.isSome()) {
+    script << "tc qdisc add dev " << eth0 << " root handle 1: htb default 1\n";
+    script << "tc class add dev " << eth0 << " parent 1: classid 1:1 htb rate "
+           << egressRateLimitPerContainer.get().bytes() * 8 << "bit\n";
+
+    // Display the htb qdisc and class created on eth0.
+    script << "tc qdisc show dev " << eth0 << "\n";
+    script << "tc class show dev " << eth0 << "\n";
+  }
+
   return script.str();
 }
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/190e87c5/src/slave/containerizer/isolators/network/port_mapping.hpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/network/port_mapping.hpp b/src/slave/containerizer/isolators/network/port_mapping.hpp
index a417f55..b624c4d 100644
--- a/src/slave/containerizer/isolators/network/port_mapping.hpp
+++ b/src/slave/containerizer/isolators/network/port_mapping.hpp
@@ -31,6 +31,7 @@
 #include <process/metrics/metrics.hpp>
 #include <process/metrics/counter.hpp>
 
+#include <stout/bytes.hpp>
 #include <stout/hashmap.hpp>
 #include <stout/hashset.hpp>
 #include <stout/interval.hpp>
@@ -224,6 +225,7 @@ private:
       const net::IP& _hostIP,
       const size_t _hostEth0MTU,
       const net::IP& _hostDefaultGateway,
+      const Option<Bytes>& _egressRateLimitPerContainer,
       const IntervalSet<uint16_t>& _managedNonEphemeralPorts,
       const process::Owned<EphemeralPortsAllocator>& _ephemeralPortsAllocator)
     : flags(_flags),
@@ -233,6 +235,7 @@ private:
       hostIP(_hostIP),
       hostEth0MTU(_hostEth0MTU),
       hostDefaultGateway(_hostDefaultGateway),
+      egressRateLimitPerContainer(_egressRateLimitPerContainer),
       managedNonEphemeralPorts(_managedNonEphemeralPorts),
       ephemeralPortsAllocator(_ephemeralPortsAllocator) {}
 
@@ -266,6 +269,9 @@ private:
   const size_t hostEth0MTU;
   const net::IP hostDefaultGateway;
 
+  // The optional throughput limit to containers' egress traffic.
+  const Option<Bytes> egressRateLimitPerContainer;
+
   // All the non-ephemeral ports managed by the slave, as passed in
   // via flags.resources.
   const IntervalSet<uint16_t> managedNonEphemeralPorts;

http://git-wip-us.apache.org/repos/asf/mesos/blob/190e87c5/src/slave/flags.hpp
----------------------------------------------------------------------
diff --git a/src/slave/flags.hpp b/src/slave/flags.hpp
index 21e0021..32e51d2 100644
--- a/src/slave/flags.hpp
+++ b/src/slave/flags.hpp
@@ -21,6 +21,7 @@
 
 #include <string>
 
+#include <stout/bytes.hpp>
 #include <stout/duration.hpp>
 #include <stout/flags.hpp>
 #include <stout/option.hpp>
@@ -313,6 +314,13 @@ public:
         "lo_name",
         "The name of the loopback network interface (e.g., lo). If it is\n"
         "not specified, the network isolator will try to guess it.");
+
+    add(&Flags::egress_rate_limit_per_container,
+        "egress_rate_limit_per_container",
+        "The limit of the egress traffic for each container, in Bytes/s.\n"
+        "If not specified or specified as zero, the network isolator will\n"
+        "impose no limits to containers' egress traffic throughput.\n"
+        "This flag uses the Bytes type, defined in stout.");
 #endif // WITH_NETWORK_ISOLATOR
   }
 
@@ -359,6 +367,7 @@ public:
   uint16_t ephemeral_ports_per_container;
   Option<std::string> eth0_name;
   Option<std::string> lo_name;
+  Option<Bytes> egress_rate_limit_per_container;
 #endif
 };
 

http://git-wip-us.apache.org/repos/asf/mesos/blob/190e87c5/src/tests/port_mapping_tests.cpp
----------------------------------------------------------------------
diff --git a/src/tests/port_mapping_tests.cpp b/src/tests/port_mapping_tests.cpp
index fdb23b2..0389f40 100644
--- a/src/tests/port_mapping_tests.cpp
+++ b/src/tests/port_mapping_tests.cpp
@@ -24,7 +24,9 @@
 
 #include <process/future.hpp>
 #include <process/reap.hpp>
+#include <process/subprocess.hpp>
 
+#include <stout/bytes.hpp>
 #include <stout/gtest.hpp>
 #include <stout/json.hpp>
 #include <stout/net.hpp>
@@ -1218,6 +1220,125 @@ TEST_F(PortMappingIsolatorTest, ROOT_TooManyContainersTest)
 }
 
 
+// Test that a very small egress rate limit configured on the
+// PortMappingIsolator actually throttles a container's traffic.
+TEST_F(PortMappingIsolatorTest, ROOT_SmallEgressLimitTest)
+{
+  // Note that the underlying rate limiting mechanism usually has a
+  // small allowance for burst. Empirically, at least 10x of the rate
+  // limit amount of data is required to make sure the burst is an
+  // insignificant factor of the transmission time.
+
+  // To-be-tested egress rate limit, in Bytes/s.
+  const Bytes rate = 1000;
+  // Size of the data to send, in Bytes.
+  const Bytes size = 20480;
+
+  // Use a very small egress limit.
+  flags.egress_rate_limit_per_container = rate;
+
+  Try<Isolator*> isolator = PortMappingIsolatorProcess::create(flags);
+  CHECK_SOME(isolator);
+
+  Try<Launcher*> launcher = LinuxLauncher::create(flags);
+  CHECK_SOME(launcher);
+
+  // Open a nc server on the host side. Note that 'errorPort' is in
+  // neither 'ports' nor 'ephemeral_ports', which makes it a good port
+  // to use on the host.
+  Try<Subprocess> s = subprocess(
+      "nc -l localhost " + stringify(errorPort) + " > /dev/null");
+  CHECK_SOME(s);
+
+  // Set the executor's resources.
+  ExecutorInfo executorInfo;
+  executorInfo.mutable_resources()->CopyFrom(
+      Resources::parse(container1Ports).get());
+
+  ContainerID containerId;
+  containerId.set_value("container1");
+
+  Future<Option<CommandInfo> > preparation1 =
+    isolator.get()->prepare(containerId, executorInfo);
+  AWAIT_READY(preparation1);
+  ASSERT_SOME(preparation1.get());
+
+  // Fill 'size' bytes of data. The actual content does not matter.
+  // A string is used so that the payload has a well defined length.
+  const string data(size.bytes(), 'a');
+
+  ostringstream command1;
+  const string transmissionTime = path::join(os::getcwd(), "transmission_time");
+
+  command1 << "echo 'Sending " << size.bytes()
+           << " bytes of data under egress rate limit " << rate.bytes()
+           << "Bytes/s...';";
+
+  command1 << "{ time -p echo " << data << " | nc localhost "
+           << errorPort << " ; } 2> " << transmissionTime << " && ";
+
+  // Touch the guard file.
+  command1 << "touch " << container1Ready;
+
+  int pipes[2];
+  ASSERT_NE(-1, ::pipe(pipes));
+
+  Try<pid_t> pid = launchHelper(
+      launcher.get(),
+      pipes,
+      containerId,
+      command1.str(),
+      preparation1.get());
+  ASSERT_SOME(pid);
+
+  // Reap the forked child.
+  Future<Option<int> > reap = process::reap(pid.get());
+
+  // Continue in the parent.
+  ::close(pipes[0]);
+
+  // Isolate the forked child.
+  AWAIT_READY(isolator.get()->isolate(containerId, pid.get()));
+
+  // Now signal the child to continue.
+  char dummy = 0;
+  ASSERT_LT(0, ::write(pipes[1], &dummy, sizeof(dummy)));
+  ::close(pipes[1]);
+
+  // Wait for the command to finish.
+  while (!os::exists(container1Ready));
+
+  Try<string> read = os::read(transmissionTime);
+  CHECK_SOME(read);
+
+  // Get the real elapsed time from `time` output. Sample output:
+  // real 12.37
+  // user 0.00
+  // sys 0.00
+  vector<string> lines = strings::split(strings::trim(read.get()), "\n");
+  ASSERT_EQ(3u, lines.size());
+  vector<string> split = strings::split(lines[0], " ");
+  ASSERT_EQ(2u, split.size());
+  Try<float> time = numify<float>(split[1]);
+  ASSERT_SOME(time);
+  ASSERT_GT(time.get(), (size.bytes() / rate.bytes()));
+
+  // Make sure the nc server exits normally.
+  Future<Option<int> > status = s.get().status();
+  AWAIT_READY(status);
+  EXPECT_SOME_EQ(0, status.get());
+
+  // Ensure all processes are killed.
+  AWAIT_READY(launcher.get()->destroy(containerId));
+
+  // Let the isolator clean up.
+  AWAIT_READY(isolator.get()->cleanup(containerId));
+
+  delete isolator.get();
+  delete launcher.get();
+}
+
+
 class PortMappingMesosTest : public ContainerizerTest<MesosContainerizer>
 {
 public: