You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by ji...@apache.org on 2015/06/08 21:27:14 UTC
[3/3] mesos git commit: Documented and consolidated qdisc handles.
Documented and consolidated qdisc handles.
Review: https://reviews.apache.org/r/35152
Project: http://git-wip-us.apache.org/repos/asf/mesos/repo
Commit: http://git-wip-us.apache.org/repos/asf/mesos/commit/a6a19bc0
Tree: http://git-wip-us.apache.org/repos/asf/mesos/tree/a6a19bc0
Diff: http://git-wip-us.apache.org/repos/asf/mesos/diff/a6a19bc0
Branch: refs/heads/master
Commit: a6a19bc0abddd437875267c48eefebed53aaf61f
Parents: 1ea1fdb
Author: Paul Brett <pa...@twopensource.com>
Authored: Mon Jun 8 12:25:30 2015 -0700
Committer: Jie Yu <yu...@gmail.com>
Committed: Mon Jun 8 12:26:37 2015 -0700
----------------------------------------------------------------------
.../isolators/network/port_mapping.cpp | 114 +++++++++++++++++--
1 file changed, 103 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mesos/blob/a6a19bc0/src/slave/containerizer/isolators/network/port_mapping.cpp
----------------------------------------------------------------------
diff --git a/src/slave/containerizer/isolators/network/port_mapping.cpp b/src/slave/containerizer/isolators/network/port_mapping.cpp
index 871e9cf..df2de60 100644
--- a/src/slave/containerizer/isolators/network/port_mapping.cpp
+++ b/src/slave/containerizer/isolators/network/port_mapping.cpp
@@ -126,6 +126,86 @@ using mesos::slave::Limitation;
// The minimum number of ephemeral ports a container should have.
static const uint16_t MIN_EPHEMERAL_PORTS_SIZE = 16;
+// Linux traffic control is a combination of queueing disciplines,
+// filters and classes organized as a tree for the ingress (rx) and
+// egress (tx) flows for each interface. Each container provides two
+// networking interfaces, a virtual eth0 and a loopback interface. The
+// flow of packets from the external network to container is shown
+// below:
+//
+// +----------------------+----------------------+
+// | Container |
+// |----------------------|----------------------|
+// | eth0 | lo |
+// +----------------------+----------------------+
+// ^ | ^ |
+// [3] | | [4] | |
+// | | [7] +-----------+ [10]
+// | |
+// | | [8] +-----------+ [9]
+// [2] | | [5] | |
+// | v v v
+// +----------------------+----------------------+
+// | veth0 | lo |
+// +----------------------|----------------------+
+// | Host |
+// |----------------------|----------------------|
+// | eth0 |
+// +----------------------+----------------------|
+// ^ |
+// [1] | | [6]
+// | v
+//
+// Traffic flowing from outside the network into a container enters
+// the system via the host ingress interface [1] and is routed based
+// on destination port to the outbound interface for the matching
+// container [2], which forwards the packet to the container's inbound
+// virtual interface [3]. Outbound traffic destined for the external
+// network flows along the reverse path [4,5,6]. Loopback traffic is
+// directed to the corresponding Ethernet interface, either [7,10] or
+// [8,9] where the same destination port routing can be applied as to
+// external traffic. We use traffic control filters at several of the
+// interfaces to create these packet paths.
+//
+// Linux provides only a very simple topology for ingress interfaces.
+// A root is provided on a fixed handle (handle::INGRESS_ROOT) under
+// which a single qdisc can be installed, with handle ingress::HANDLE.
+// Traffic control filters can then be attached to the ingress qdisc.
+// We install one or more ingress filters on the host eth0 [1] to
+// direct traffic to the correct container, and on the container
+// virtual eth0 [5] to direct traffic to other containers or out of
+// the box. Since we know the IP port assignments for each container,
+// we can direct traffic directly to the appropriate container.
+// However, for ICMP and ARP traffic where no equivalent to a port
+// exists, we send a copy of the packet to every container and rely on
+// the network stack to drop unexpected packets.
+//
+// We install a Hierarchical Token Bucket (HTB) qdisc and class to
+// limit the outbound traffic bandwidth as the egress qdisc inside the
+// container [4] and then add a fq_codel qdisc to limit head of line
+// blocking on the egress filter. The egress traffic control chain is
+// thus:
+//
+// root device: handle::EGRESS_ROOT ->
+// htb egress qdisc: CONTAINER_TX_HTB_HANDLE ->
+// htb rate limiting class: CONTAINER_TX_HTB_CLASS_ID ->
+// buffer-bloat reduction: FQ_CODEL
+constexpr Handle CONTAINER_TX_HTB_HANDLE = Handle(1, 0);
+constexpr Handle CONTAINER_TX_HTB_CLASS_ID =
+ Handle(CONTAINER_TX_HTB_HANDLE, 1);
+
+
+// Finally we create a second fq_codel qdisc on the public interface
+// of the host [6] to reduce performance interference between
+// containers. We create independent flows for each container, and
+// one for the host, which ensures packets from each container are
+// guaranteed fair access to the host interface. This egress traffic
+// control chain for the host interface is thus:
+//
+// root device: handle::EGRESS_ROOT ->
+// buffer-bloat reduction: FQ_CODEL
+constexpr Handle HOST_TX_FQ_CODEL_HANDLE = Handle(1, 0);
+
// The primary priority used by each type of filter.
static const uint8_t ARP_FILTER_PRIORITY = 1;
@@ -3409,13 +3489,15 @@ string PortMappingIsolatorProcess::scripts(Info* info)
// Allow talking between containers and from container to host.
// TODO(chzhcn): Consider merging the following two filters.
- script << "tc filter add dev " << lo << " parent ffff: protocol ip"
+ script << "tc filter add dev " << lo << " parent " << ingress::HANDLE
+ << " protocol ip"
<< " prio " << Priority(IP_FILTER_PRIORITY, NORMAL).get() << " u32"
<< " flowid ffff:0"
<< " match ip dst " << hostIPNetwork.address()
<< " action mirred egress redirect dev " << eth0 << "\n";
- script << "tc filter add dev " << lo << " parent ffff: protocol ip"
+ script << "tc filter add dev " << lo << " parent " << ingress::HANDLE
+ << " protocol ip"
<< " prio " << Priority(IP_FILTER_PRIORITY, NORMAL).get() << " u32"
<< " flowid ffff:0"
<< " match ip dst "
@@ -3425,7 +3507,8 @@ string PortMappingIsolatorProcess::scripts(Info* info)
foreach (const PortRange& range,
getPortRanges(info->nonEphemeralPorts + info->ephemeralPorts)) {
// Local traffic inside a container will not be redirected to eth0.
- script << "tc filter add dev " << lo << " parent ffff: protocol ip"
+ script << "tc filter add dev " << lo << " parent " << ingress::HANDLE
+ << " protocol ip"
<< " prio " << Priority(IP_FILTER_PRIORITY, HIGH).get() << " u32"
<< " flowid ffff:0"
<< " match ip dport " << range.begin() << " "
@@ -3433,7 +3516,8 @@ string PortMappingIsolatorProcess::scripts(Info* info)
// Traffic going to host loopback IP and ports assigned to this
// container will be redirected to lo.
- script << "tc filter add dev " << eth0 << " parent ffff: protocol ip"
+ script << "tc filter add dev " << eth0 << " parent " << ingress::HANDLE
+ << " protocol ip"
<< " prio " << Priority(IP_FILTER_PRIORITY, NORMAL).get() << " u32"
<< " flowid ffff:0"
<< " match ip dst "
@@ -3444,13 +3528,15 @@ string PortMappingIsolatorProcess::scripts(Info* info)
}
// Do not forward the ICMP packet if the destination IP is self.
- script << "tc filter add dev " << lo << " parent ffff: protocol ip"
+ script << "tc filter add dev " << lo << " parent " << ingress::HANDLE
+ << " protocol ip"
<< " prio " << Priority(ICMP_FILTER_PRIORITY, NORMAL).get() << " u32"
<< " flowid ffff:0"
<< " match ip protocol 1 0xff"
<< " match ip dst " << hostIPNetwork.address() << "\n";
- script << "tc filter add dev " << lo << " parent ffff: protocol ip"
+ script << "tc filter add dev " << lo << " parent " << ingress::HANDLE
+ << " protocol ip"
<< " prio " << Priority(ICMP_FILTER_PRIORITY, NORMAL).get() << " u32"
<< " flowid ffff:0"
<< " match ip protocol 1 0xff"
@@ -3458,8 +3544,10 @@ string PortMappingIsolatorProcess::scripts(Info* info)
<< net::IPNetwork::LOOPBACK_V4().address() << "\n";
// Display the filters created on eth0 and lo.
- script << "tc filter show dev " << eth0 << " parent ffff:\n";
- script << "tc filter show dev " << lo << " parent ffff:\n";
+ script << "tc filter show dev " << eth0
+ << " parent " << ingress::HANDLE << "\n";
+ script << "tc filter show dev " << lo
+ << " parent " << ingress::HANDLE << "\n";
// If throughput limit for container egress traffic exists, use HTB
// qdisc to achieve traffic shaping.
@@ -3470,8 +3558,11 @@ string PortMappingIsolatorProcess::scripts(Info* info)
// throughput. TBF requires other parameters such as 'burst' that
// HTB already has default values for.
if (egressRateLimitPerContainer.isSome()) {
- script << "tc qdisc add dev " << eth0 << " root handle 1: htb default 1\n";
- script << "tc class add dev " << eth0 << " parent 1: classid 1:1 htb rate "
+ script << "tc qdisc add dev " << eth0 << " root handle "
+ << CONTAINER_TX_HTB_HANDLE << " htb default 1\n";
+ script << "tc class add dev " << eth0 << " parent "
+ << CONTAINER_TX_HTB_HANDLE << " classid "
+ << CONTAINER_TX_HTB_CLASS_ID << " htb rate "
<< egressRateLimitPerContainer.get().bytes() * 8 << "bit\n";
// Packets are buffered at the leaf qdisc if we send them faster
@@ -3480,7 +3571,8 @@ string PortMappingIsolatorProcess::scripts(Info* info)
// fq_codel, which has a larger buffer and better control on
// buffer bloat.
// TODO(cwang): Verify that fq_codel qdisc is available.
- script << "tc qdisc add dev " << eth0 << " parent 1:1 fq_codel\n";
+ script << "tc qdisc add dev " << eth0
+ << " parent " << CONTAINER_TX_HTB_CLASS_ID << " fq_codel\n";
// Display the htb qdisc and class created on eth0.
script << "tc qdisc show dev " << eth0 << "\n";