You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@aurora.apache.org by ma...@apache.org on 2016/05/31 01:44:04 UTC

aurora git commit: Adding support for GPU resource

Repository: aurora
Updated Branches:
  refs/heads/master b76e38f9a -> edd47d08f


Adding support for GPU resource

Reviewed at https://reviews.apache.org/r/47869/


Project: http://git-wip-us.apache.org/repos/asf/aurora/repo
Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/edd47d08
Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/edd47d08
Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/edd47d08

Branch: refs/heads/master
Commit: edd47d08f57a816d524f4a353a1cb619ef093b89
Parents: b76e38f
Author: Maxim Khutornenko <ma...@apache.org>
Authored: Mon May 30 18:43:47 2016 -0700
Committer: Maxim Khutornenko <ma...@apache.org>
Committed: Mon May 30 18:43:47 2016 -0700

----------------------------------------------------------------------
 RELEASE-NOTES.md                                | 17 ++++++++
 .../thrift/org/apache/aurora/gen/api.thrift     | 11 +++++-
 docs/features/resource-isolation.md             | 12 ++++++
 docs/reference/configuration.md                 |  1 +
 .../mesos_config/etc_mesos-slave/resources      |  2 +-
 examples/vagrant/upstart/aurora-scheduler.conf  |  3 +-
 .../apache/aurora/scheduler/app/AppModule.java  |  6 ++-
 .../aurora/scheduler/base/TaskTestUtil.java     |  1 +
 .../configuration/ConfigurationManager.java     | 14 ++++++-
 .../scheduler/resources/ResourceType.java       | 15 +++++++
 src/main/python/apache/aurora/config/thrift.py  |  8 +++-
 .../python/apache/thermos/config/schema_base.py |  1 +
 .../apache/thermos/config/schema_helpers.py     | 17 +++++---
 .../scheduler/assets/configSummary.html         | 15 ++++---
 .../resources/scheduler/assets/js/filters.js    | 39 ++++++++++++++-----
 .../configuration/ConfigurationManagerTest.java | 35 +++++++++++++----
 .../resources/ResourceManagerTest.java          |  2 +
 .../aurora/scheduler/thrift/ThriftIT.java       |  3 +-
 .../python/apache/aurora/config/test_thrift.py  |  3 +-
 .../python/apache/thermos/config/test_schema.py | 33 ++++++++++------
 .../apache/aurora/e2e/http/http_example.aurora  | 41 ++++++++++++--------
 .../http/http_example_bad_healthcheck.aurora    | 41 ++++++++++++--------
 .../aurora/e2e/http/http_example_updated.aurora | 41 ++++++++++++--------
 .../sh/org/apache/aurora/e2e/test_end_to_end.sh | 11 ++++--
 24 files changed, 269 insertions(+), 103 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/RELEASE-NOTES.md
----------------------------------------------------------------------
diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md
index 6e878c5..21a141f 100644
--- a/RELEASE-NOTES.md
+++ b/RELEASE-NOTES.md
@@ -24,6 +24,21 @@
   reference to either a `Docker` or `Mesos` container.
 - New scheduler command line argument `-ip` to control what ip address to bind the schedulers http
   server to.
+- Added experimental support for Mesos GPU resource. This feature will be available in Mesos 0.29.0
+  and is disabled by default. Use `-allow_gpu_resource` flag to enable it.
+
+  **IMPORTANT: once this feature is enabled, creating jobs with GPU resource will make scheduler
+  snapshot backwards incompatible. Scheduler will be unable to read snapshot if rolled back to
+  previous version. If rollback is absolutely necessary, perform the following steps:**
+  1. Set `-allow_gpu_resource` to false
+  2. Delete all jobs with GPU resource (including cron job schedules if applicable)
+  3. Wait until GPU task history is pruned. You may speed it up by changing the history retention
+    flags, e.g.: `-history_prune_threshold=1mins` and `-history_max_per_job_threshold=0`
+  4. In case there were GPU job updates created, prune job update history for affected jobs from
+    `/h2console` endpoint or reduce job update pruning thresholds, e.g.:
+    `-job_update_history_pruning_threshold=1mins` and `-job_update_history_per_job_threshold=0`
+  5. Ensure a new snapshot is created by running `aurora_admin scheduler_snapshot <cluster>`
+  6. Roll back to the previous version
 
 ### Deprecations and removals:
 
@@ -37,6 +52,8 @@
   sandbox.
 - Setting the `container` property of a `Job` to a `Container` holder is deprecated in favor of
   setting it directly to the appropriate (i.e. `Docker` or `Mesos`) container type.
+- Deprecated `numCpus`, `ramMb` and `diskMb` fields in `TaskConfig` and `ResourceAggregate` thrift
+  structs. Use `set<Resource> resources` to specify task resources or quota values.
 
 0.13.0
 ------

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/api/src/main/thrift/org/apache/aurora/gen/api.thrift
----------------------------------------------------------------------
diff --git a/api/src/main/thrift/org/apache/aurora/gen/api.thrift b/api/src/main/thrift/org/apache/aurora/gen/api.thrift
index a99889c..ed94f24 100644
--- a/api/src/main/thrift/org/apache/aurora/gen/api.thrift
+++ b/api/src/main/thrift/org/apache/aurora/gen/api.thrift
@@ -223,18 +223,22 @@ union Resource {
   2: i64 ramMb
   3: i64 diskMb
   4: string namedPort
+  5: i64 numGpus
 }
 
 /** Description of the tasks contained within a job. */
 struct TaskConfig {
  /** Job task belongs to. */
  28: JobKey job
- // TODO(maxim): Remove in 0.7.0. (AURORA-749)
+ // TODO(maxim): Deprecated. See AURORA-749.
  /** contains the role component of JobKey */
  17: Identity owner
   7: bool isService
+  // TODO(maxim): Deprecated. See AURORA-1707.
   8: double numCpus
+  // TODO(maxim): Deprecated. See AURORA-1707.
   9: i64 ramMb
+  // TODO(maxim): Deprecated. See AURORA-1707.
  10: i64 diskMb
  11: i32 priority
  13: i32 maxTaskFailures
@@ -267,10 +271,13 @@ struct TaskConfig {
 }
 
 struct ResourceAggregate {
+  // TODO(maxim): Deprecated. See AURORA-1707.
   /** Number of CPU cores allotted. */
   1: double numCpus
+  // TODO(maxim): Deprecated. See AURORA-1707.
   /** Megabytes of RAM allotted. */
   2: i64 ramMb
+  // TODO(maxim): Deprecated. See AURORA-1707.
   /** Megabytes of disk space allotted. */
   3: i64 diskMb
   /** Aggregated resource values. */
@@ -299,7 +306,7 @@ struct JobConfiguration {
    * used to construct it server-side.
    */
   9: JobKey key
-  // TODO(maxim): Remove in 0.7.0. (AURORA-749)
+  // TODO(maxim): Deprecated. See AURORA-749.
   /** Owner of this job. */
   7: Identity owner
   /**

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/docs/features/resource-isolation.md
----------------------------------------------------------------------
diff --git a/docs/features/resource-isolation.md b/docs/features/resource-isolation.md
index d08b59f..59da823 100644
--- a/docs/features/resource-isolation.md
+++ b/docs/features/resource-isolation.md
@@ -101,6 +101,14 @@ are still available but you shouldn't count on them being so.
 application can write above its quota without getting an ENOSPC, but it
 will be killed shortly after. This is subject to change.
 
+### GPU Isolation
+
+GPU isolation will be supported for Nvidia devices starting from Mesos 0.29.0.
+Access to the allocated units will be exclusive with no sharing between tasks
+allowed (e.g. no fractional GPU allocation). Until official documentation is released,
+see [Mesos design document](https://docs.google.com/document/d/10GJ1A80x4nIEo8kfdeo9B11PIbS1xJrrB4Z373Ifkpo/edit#heading=h.w84lz7p4eexl)
+for more details.
+
 ### Other Resources
 
 Other resources, such as network bandwidth, do not have any performance
@@ -141,6 +149,10 @@ add the maximum size of the Java heap to your disk space requirement, in
 order to account for an out of memory error dumping the heap
 into the application's sandbox space.
 
+## GPU Sizing
+
+GPU sizing is highly dependent on your application requirements and is limited
+only by the number of physical GPU units available on a target box.
 
 Oversubscription
 ----------------

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/docs/reference/configuration.md
----------------------------------------------------------------------
diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md
index e77ee60..c56ace0 100644
--- a/docs/reference/configuration.md
+++ b/docs/reference/configuration.md
@@ -321,6 +321,7 @@ resources are allocated.
   ```cpu```  | Float   | Fractional number of cores required by the task.
   ```ram```  | Integer | Bytes of RAM required by the task.
   ```disk``` | Integer | Bytes of disk required by the task.
+  ```gpu```  | Integer | Number of GPU cores required by the task.
 
 
 Job Schema

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/examples/vagrant/mesos_config/etc_mesos-slave/resources
----------------------------------------------------------------------
diff --git a/examples/vagrant/mesos_config/etc_mesos-slave/resources b/examples/vagrant/mesos_config/etc_mesos-slave/resources
index 5bfe779..aa0e97e 100644
--- a/examples/vagrant/mesos_config/etc_mesos-slave/resources
+++ b/examples/vagrant/mesos_config/etc_mesos-slave/resources
@@ -1 +1 @@
-cpus(aurora-role):0.5;cpus(*):3.5;mem(aurora-role):1024;disk:20000
+cpus(aurora-role):0.5;cpus(*):3.5;mem(aurora-role):1024;disk:20000;gpus(*):4

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/examples/vagrant/upstart/aurora-scheduler.conf
----------------------------------------------------------------------
diff --git a/examples/vagrant/upstart/aurora-scheduler.conf b/examples/vagrant/upstart/aurora-scheduler.conf
index 3d9e706..954ddb4 100644
--- a/examples/vagrant/upstart/aurora-scheduler.conf
+++ b/examples/vagrant/upstart/aurora-scheduler.conf
@@ -51,4 +51,5 @@ exec bin/aurora-scheduler \
   -tier_config=/home/vagrant/aurora/src/main/resources/org/apache/aurora/scheduler/tiers.json \
   -mesos_role=aurora-role \
   -populate_discovery_info=true \
-  -receive_revocable_resources=true
+  -receive_revocable_resources=true \
+  -allow_gpu_resource=true

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/java/org/apache/aurora/scheduler/app/AppModule.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/aurora/scheduler/app/AppModule.java b/src/main/java/org/apache/aurora/scheduler/app/AppModule.java
index c81bd79..6c7c75a 100644
--- a/src/main/java/org/apache/aurora/scheduler/app/AppModule.java
+++ b/src/main/java/org/apache/aurora/scheduler/app/AppModule.java
@@ -94,6 +94,9 @@ public class AppModule extends AbstractModule {
       help = "If false, Docker tasks may run without an executor (EXPERIMENTAL)")
   private static final Arg<Boolean> REQUIRE_DOCKER_USE_EXECUTOR = Arg.create(true);
 
+  @CmdLine(name = "allow_gpu_resource", help = "Allow jobs to request Mesos GPU resource.")
+  private static final Arg<Boolean> ALLOW_GPU_RESOURCE = Arg.create(false);
+
   private final ConfigurationManagerSettings configurationManagerSettings;
 
   @VisibleForTesting
@@ -106,7 +109,8 @@ public class AppModule extends AbstractModule {
         ImmutableSet.copyOf(ALLOWED_CONTAINER_TYPES.get()),
         ENABLE_DOCKER_PARAMETERS.get(),
         DEFAULT_DOCKER_PARAMETERS.get(),
-        REQUIRE_DOCKER_USE_EXECUTOR.get()));
+        REQUIRE_DOCKER_USE_EXECUTOR.get(),
+        ALLOW_GPU_RESOURCE.get()));
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/java/org/apache/aurora/scheduler/base/TaskTestUtil.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/aurora/scheduler/base/TaskTestUtil.java b/src/main/java/org/apache/aurora/scheduler/base/TaskTestUtil.java
index 221417f..e431b58 100644
--- a/src/main/java/org/apache/aurora/scheduler/base/TaskTestUtil.java
+++ b/src/main/java/org/apache/aurora/scheduler/base/TaskTestUtil.java
@@ -68,6 +68,7 @@ public final class TaskTestUtil {
           ImmutableSet.of(_Fields.MESOS),
           false,
           ImmutableMultimap.of(),
+          true,
           true);
   public static final ConfigurationManager CONFIGURATION_MANAGER =
       new ConfigurationManager(CONFIGURATION_MANAGER_SETTINGS, DEV_TIER_MANAGER);

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/java/org/apache/aurora/scheduler/configuration/ConfigurationManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/aurora/scheduler/configuration/ConfigurationManager.java b/src/main/java/org/apache/aurora/scheduler/configuration/ConfigurationManager.java
index 4ef202c..0e95620 100644
--- a/src/main/java/org/apache/aurora/scheduler/configuration/ConfigurationManager.java
+++ b/src/main/java/org/apache/aurora/scheduler/configuration/ConfigurationManager.java
@@ -51,6 +51,7 @@ import org.apache.aurora.scheduler.storage.log.ThriftBackfill;
 
 import static java.util.Objects.requireNonNull;
 
+import static org.apache.aurora.scheduler.resources.ResourceType.GPUS;
 import static org.apache.aurora.scheduler.resources.ResourceType.PORTS;
 
 /**
@@ -114,17 +115,20 @@ public class ConfigurationManager {
     private final boolean allowDockerParameters;
     private final Multimap<String, String> defaultDockerParameters;
     private final boolean requireDockerUseExecutor;
+    private final boolean allowGpuResource;
 
     public ConfigurationManagerSettings(
         ImmutableSet<Container._Fields> allowedContainerTypes,
         boolean allowDockerParameters,
         Multimap<String, String> defaultDockerParameters,
-        boolean requireDockerUseExecutor) {
+        boolean requireDockerUseExecutor,
+        boolean allowGpuResource) {
 
       this.allowedContainerTypes = requireNonNull(allowedContainerTypes);
       this.allowDockerParameters = allowDockerParameters;
       this.defaultDockerParameters = requireNonNull(defaultDockerParameters);
       this.requireDockerUseExecutor = requireDockerUseExecutor;
+      this.allowGpuResource = allowGpuResource;
     }
   }
 
@@ -321,6 +325,14 @@ public class ConfigurationManager {
       throw new TaskDescriptionException("Multiple resource values are not supported for " + types);
     }
 
+    if (!settings.allowGpuResource && config.getResources().stream()
+        .filter(r -> ResourceType.fromResource(r).equals(GPUS))
+        .findAny()
+        .isPresent()) {
+
+      throw new TaskDescriptionException("GPU resource support is disabled in this cluster.");
+    }
+
     maybeFillLinks(builder);
 
     return ITaskConfig.build(builder);

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/java/org/apache/aurora/scheduler/resources/ResourceType.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/aurora/scheduler/resources/ResourceType.java b/src/main/java/org/apache/aurora/scheduler/resources/ResourceType.java
index 6a4f110..bfe7583 100644
--- a/src/main/java/org/apache/aurora/scheduler/resources/ResourceType.java
+++ b/src/main/java/org/apache/aurora/scheduler/resources/ResourceType.java
@@ -100,6 +100,21 @@ public enum ResourceType implements TEnum {
       "count",
       1000,
       true,
+      false),
+
+  /**
+   * GPU resource.
+   */
+  GPUS(
+      _Fields.NUM_GPUS,
+      SCALAR,
+      "gpus",
+      LONG,
+      Optional.empty(),
+      "GPU",
+      "core(s)",
+      4,
+      false,
       false);
 
   /**

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/python/apache/aurora/config/thrift.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/config/thrift.py b/src/main/python/apache/aurora/config/thrift.py
index 81a5055..3539469 100644
--- a/src/main/python/apache/aurora/config/thrift.py
+++ b/src/main/python/apache/aurora/config/thrift.py
@@ -268,10 +268,14 @@ def convert(job, metadata=frozenset(), ports=frozenset()):
   if task.numCpus <= 0 or task.ramMb <= 0 or task.diskMb <= 0:
     raise InvalidConfig('Task has invalid resources.  cpu/ramMb/diskMb must all be positive: '
         'cpu:%r ramMb:%r diskMb:%r' % (task.numCpus, task.ramMb, task.diskMb))
+  numGpus = fully_interpolated(task_raw.resources().gpu())
 
   task.resources = frozenset(
-      [Resource(numCpus=task.numCpus), Resource(ramMb=task.ramMb), Resource(diskMb=task.diskMb)] +
-      [Resource(namedPort=p) for p in ports])
+      [Resource(numCpus=task.numCpus),
+       Resource(ramMb=task.ramMb),
+       Resource(diskMb=task.diskMb)]
+      + [Resource(namedPort=p) for p in ports]
+      + ([Resource(numGpus=numGpus)] if numGpus else []))  # parens: only the GPU entry is optional
 
   task.job = key
   task.owner = owner

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/python/apache/thermos/config/schema_base.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/thermos/config/schema_base.py b/src/main/python/apache/thermos/config/schema_base.py
index a6768e6..f9e96d9 100644
--- a/src/main/python/apache/thermos/config/schema_base.py
+++ b/src/main/python/apache/thermos/config/schema_base.py
@@ -51,6 +51,7 @@ class Resources(Struct):
   cpu  = Required(Float)
   ram  = Required(Integer)
   disk = Required(Integer)
+  gpu  = Default(Integer, 0)
 
 
 class Constraint(Struct):

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/python/apache/thermos/config/schema_helpers.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/thermos/config/schema_helpers.py b/src/main/python/apache/thermos/config/schema_helpers.py
index 46394bb..cecfd80 100644
--- a/src/main/python/apache/thermos/config/schema_helpers.py
+++ b/src/main/python/apache/thermos/config/schema_helpers.py
@@ -69,9 +69,13 @@ class Units(object):
     def add(r1, r2):
       return Resources(cpu=add_unit(r1.cpu(), r2.cpu()),
                        ram=add_unit(r1.ram(), r2.ram()),
-                       disk=add_unit(r1.disk(), r2.disk()))
+                       disk=add_unit(r1.disk(), r2.disk()),
+                       gpu=add_unit(r1.gpu(), r2.gpu()))
 
-    return reduce(add, map(cls.optional_resources, resources), Resources(cpu=0, ram=0, disk=0))
+    return reduce(
+        add,
+        map(cls.optional_resources, resources),
+        Resources(cpu=0, ram=0, disk=0, gpu=0))
 
   @classmethod
   def finalization_wait_sum(cls, waits):
@@ -88,10 +92,13 @@ class Units(object):
     def resource_max(r1, r2):
       return Resources(cpu=max_unit(r1.cpu(), r2.cpu()),
                        ram=max_unit(r1.ram(), r2.ram()),
-                       disk=max_unit(r1.disk(), r2.disk()))
+                       disk=max_unit(r1.disk(), r2.disk()),
+                       gpu=max_unit(r1.gpu(), r2.gpu()))
 
-    return reduce(resource_max,
-        map(cls.optional_resources, resources), Resources(cpu=0, ram=0, disk=0))
+    return reduce(
+        resource_max,
+        map(cls.optional_resources, resources),
+        Resources(cpu=0, ram=0, disk=0, gpu=0))
 
   @classmethod
   def finalization_wait_max(cls, waits):

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/resources/scheduler/assets/configSummary.html
----------------------------------------------------------------------
diff --git a/src/main/resources/scheduler/assets/configSummary.html b/src/main/resources/scheduler/assets/configSummary.html
index 36df616..86a87ab 100644
--- a/src/main/resources/scheduler/assets/configSummary.html
+++ b/src/main/resources/scheduler/assets/configSummary.html
@@ -17,10 +17,15 @@
       configuration details for instances {{group.label}}
     </th>
   <tr>
-    <td class="cellLabel" rowspan="3">resources</td>
+    <td class="cellLabel"
+        rowspan="{{group.summary.schedulingDetail.resources | toResourceValue}}">resources</td>
     <td>cpu</td>
     <td>{{group.summary.schedulingDetail.resources | toResourceValue:'CPUS'}}</td>
   </tr>
+  <tr ng-if='(group.summary.schedulingDetail.resources | toResourceValue:"GPUS").length > 0'>
+    <td>gpu</td>
+    <td>{{group.summary.schedulingDetail.resources | toResourceValue:'GPUS'}}</td>
+  </tr>
   <tr>
     <td>ram</td>
     <td>{{group.summary.schedulingDetail.resources | toResourceValue:'RAM_MB'}}</td>
@@ -29,6 +34,10 @@
     <td>disk</td>
     <td>{{group.summary.schedulingDetail.resources | toResourceValue:'DISK_MB'}}</td>
   </tr>
+  <tr ng-if='(group.summary.schedulingDetail.resources | toResourceValue:"PORTS").length > 0'>
+    <td>ports</td>
+    <td>{{group.summary.schedulingDetail.resources | toResourceValue:'PORTS'}}</td>
+  </tr>
   <tr>
     <td class="cellLabel">constraints</td>
     <td colspan="2">{{group.summary.schedulingDetail.constraints}}</td>
@@ -41,10 +50,6 @@
     <td class="cellLabel">service</td>
     <td colspan="2">{{group.summary.schedulingDetail.isService}}</td>
   </tr>
-  <tr ng-if='(group.summary.schedulingDetail.resources | toResourceValue:"PORTS").length > 0'>
-    <td class="cellLabel">ports</td>
-    <td colspan="2">{{group.summary.schedulingDetail.resources | toResourceValue:'PORTS'}}</td>
-  </tr>
   <tr ng-if='group.summary.schedulingDetail.metadata'>
     <td class="cellLabel">metadata</td>
     <td colspan="2">{{group.summary.schedulingDetail.metadata}}</td>

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/main/resources/scheduler/assets/js/filters.js
----------------------------------------------------------------------
diff --git a/src/main/resources/scheduler/assets/js/filters.js b/src/main/resources/scheduler/assets/js/filters.js
index 98f786e..ec35d81 100644
--- a/src/main/resources/scheduler/assets/js/filters.js
+++ b/src/main/resources/scheduler/assets/js/filters.js
@@ -95,31 +95,50 @@
     return function (resources, type) {
       var RESOURCE_MAP = {
         'CPUS': {
-          filter: function (e) { return e.numCpus !== null; },
-          format: function (v) { return _.first(v).numCpus + ' cores'; }
+          field: 'numCpus',
+          format: function (v) { return _.first(v)[this.field] + ' core(s)'; }
         },
         'RAM_MB': {
-          filter: function (e) { return e.ramMb !== null; },
-          format: function (v) { return formatMem(_.first(v).ramMb); }
+          field: 'ramMb',
+          format: function (v) { return formatMem(_.first(v)[this.field]); }
         },
         'DISK_MB': {
-          filter: function (e) { return e.diskMb !== null; },
-          format: function (v) { return formatMem(_.first(v).diskMb); }
+          field: 'diskMb',
+          format: function (v) { return formatMem(_.first(v)[this.field]); }
         },
         'PORTS': {
-          filter: function (e) { return e.namedPort !== null; },
+          field: 'namedPort',
           format: function (v) {
+            var field = this.field;
             return _.chain(v)
-                .map(function (r) { return r.namedPort; })
+                .map(function (r) { return r[field]; })
                 .sortBy()
                 .value()
                 .join(', ');
           }
+        },
+        'GPUS': {
+          field: 'numGpus',
+          format: function (v) { return _.first(v)[this.field] + ' core(s)'; }
         }
       };
 
-      if (RESOURCE_MAP.hasOwnProperty(type)) {
-        var match = _.filter(resources, RESOURCE_MAP[type].filter);
+      if (!type) {
+        return _.chain(resources)
+            .groupBy(function (r) {
+              for (var key in RESOURCE_MAP) {
+                var field = RESOURCE_MAP[key].field;
+                if (r.hasOwnProperty(field) && r[field] !== null) {
+                  return field;
+                }
+              }
+              return null;
+            })
+            .size()
+            .value();
+      } else if (RESOURCE_MAP.hasOwnProperty(type)) {
+        var field = RESOURCE_MAP[type].field;
+        var match = _.filter(resources, function (r) { return r[field] !== null; });
         if (match && !_.isEmpty(match)) {
           return RESOURCE_MAP[type].format(match);
         }

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/java/org/apache/aurora/scheduler/configuration/ConfigurationManagerTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/aurora/scheduler/configuration/ConfigurationManagerTest.java b/src/test/java/org/apache/aurora/scheduler/configuration/ConfigurationManagerTest.java
index ddf8143..2e322d2 100644
--- a/src/test/java/org/apache/aurora/scheduler/configuration/ConfigurationManagerTest.java
+++ b/src/test/java/org/apache/aurora/scheduler/configuration/ConfigurationManagerTest.java
@@ -45,6 +45,7 @@ import org.junit.rules.ExpectedException;
 import static org.apache.aurora.gen.Resource.diskMb;
 import static org.apache.aurora.gen.Resource.namedPort;
 import static org.apache.aurora.gen.Resource.numCpus;
+import static org.apache.aurora.gen.Resource.numGpus;
 import static org.apache.aurora.gen.Resource.ramMb;
 import static org.apache.aurora.gen.test.testConstants.INVALID_IDENTIFIERS;
 import static org.apache.aurora.gen.test.testConstants.VALID_IDENTIFIERS;
@@ -113,17 +114,19 @@ public class ConfigurationManagerTest {
 
   private static final ConfigurationManager CONFIGURATION_MANAGER = new ConfigurationManager(
       new ConfigurationManagerSettings(
-      ALL_CONTAINER_TYPES,
-      false,
-      ImmutableMultimap.of(),
-      true),
+          ALL_CONTAINER_TYPES,
+          false,
+          ImmutableMultimap.of(),
+          true,
+          false),
       TaskTestUtil.DEV_TIER_MANAGER);
   private static final ConfigurationManager DOCKER_CONFIGURATION_MANAGER = new ConfigurationManager(
       new ConfigurationManagerSettings(
-        ALL_CONTAINER_TYPES,
-        true,
-        ImmutableMultimap.of("foo", "bar"),
-        false),
+          ALL_CONTAINER_TYPES,
+          true,
+          ImmutableMultimap.of("foo", "bar"),
+          false,
+          true),
       TaskTestUtil.DEV_TIER_MANAGER);
 
   @Test
@@ -272,6 +275,22 @@ public class ConfigurationManagerTest {
   }
 
   @Test
+  public void testGpuResourcesNotAllowed() throws Exception {
+    TaskConfig builder = CONFIG_WITH_CONTAINER.newBuilder();
+    builder.addToResources(numGpus(2));
+
+    expectTaskDescriptionException("GPU resource support is disabled in this cluster.");
+    new ConfigurationManager(
+        new ConfigurationManagerSettings(
+            ALL_CONTAINER_TYPES,
+            true,
+            ImmutableMultimap.of("foo", "bar"),
+            false,
+            false),
+        TaskTestUtil.DEV_TIER_MANAGER).validateAndPopulate(ITaskConfig.build(builder));
+  }
+
+  @Test
   public void testTaskLinks() throws Exception {
     TaskConfig builder = CONFIG_WITH_CONTAINER.newBuilder();
     builder.addToResources(namedPort("health"));

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/java/org/apache/aurora/scheduler/resources/ResourceManagerTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/aurora/scheduler/resources/ResourceManagerTest.java b/src/test/java/org/apache/aurora/scheduler/resources/ResourceManagerTest.java
index a5dda25..14ac547 100644
--- a/src/test/java/org/apache/aurora/scheduler/resources/ResourceManagerTest.java
+++ b/src/test/java/org/apache/aurora/scheduler/resources/ResourceManagerTest.java
@@ -32,6 +32,7 @@ import org.junit.Test;
 import static org.apache.aurora.gen.Resource.diskMb;
 import static org.apache.aurora.gen.Resource.namedPort;
 import static org.apache.aurora.gen.Resource.numCpus;
+import static org.apache.aurora.gen.Resource.numGpus;
 import static org.apache.aurora.gen.Resource.ramMb;
 import static org.apache.aurora.scheduler.base.TaskTestUtil.JOB;
 import static org.apache.aurora.scheduler.base.TaskTestUtil.makeTask;
@@ -114,6 +115,7 @@ public class ResourceManagerTest {
   public void testGetTaskResourceTypes() {
     AssignedTask builder = makeTask("id", JOB).newBuilder().getAssignedTask();
     builder.getTask().addToResources(namedPort("health"));
+    builder.getTask().addToResources(numGpus(4));
 
     assertEquals(
         EnumSet.allOf(ResourceType.class),

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/java/org/apache/aurora/scheduler/thrift/ThriftIT.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/aurora/scheduler/thrift/ThriftIT.java b/src/test/java/org/apache/aurora/scheduler/thrift/ThriftIT.java
index 9cce641..a54d169 100644
--- a/src/test/java/org/apache/aurora/scheduler/thrift/ThriftIT.java
+++ b/src/test/java/org/apache/aurora/scheduler/thrift/ThriftIT.java
@@ -154,7 +154,8 @@ public class ThriftIT extends EasyMockTest {
         ImmutableSet.of(_Fields.DOCKER),
         true,
         ImmutableMultimap.of(),
-        false);
+        false,
+        true);
 
     createThrift(configurationManagerSettings);
 

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/python/apache/aurora/config/test_thrift.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/config/test_thrift.py b/src/test/python/apache/aurora/config/test_thrift.py
index 8769db3..e213184 100644
--- a/src/test/python/apache/aurora/config/test_thrift.py
+++ b/src/test/python/apache/aurora/config/test_thrift.py
@@ -45,7 +45,7 @@ HELLO_WORLD = Job(
   task=Task(
     name='main',
     processes=[Process(name='hello_world', cmdline='echo {{mesos.instance}}')],
-    resources=Resources(cpu=0.1, ram=64 * 1048576, disk=64 * 1048576),
+    resources=Resources(cpu=0.1, ram=64 * 1048576, disk=64 * 1048576, gpu=2),
   )
 )
 
@@ -77,6 +77,7 @@ def test_simple_config():
   assert Resource(ramMb=64) in list(tti.resources)
   assert Resource(diskMb=64) in list(tti.resources)
   assert Resource(namedPort='health') in list(tti.resources)
+  assert Resource(numGpus=2) in list(tti.resources)
 
 
 def test_config_with_tier():

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/python/apache/thermos/config/test_schema.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/thermos/config/test_schema.py b/src/test/python/apache/thermos/config/test_schema.py
index 0440ee5..4cae2de 100644
--- a/src/test/python/apache/thermos/config/test_schema.py
+++ b/src/test/python/apache/thermos/config/test_schema.py
@@ -51,17 +51,28 @@ def test_order():
 
 
 def test_add_resources():
-  assert Units.resources_sum(Resources(), Resources()) == Resources(cpu=0, ram=0, disk=0)
-
-  r100 = Resources(cpu=1, ram=0, disk=0)
-  r010 = Resources(cpu=0, ram=1, disk=0)
-  r001 = Resources(cpu=0, ram=0, disk=1)
-  r111 = Resources(cpu=1, ram=1, disk=1)
-  r222 = Resources(cpu=2, ram=2, disk=2)
-
-  assert reduce(Units.resources_sum, [r100, r010, r001]) == r111
-  assert Units.resources_sum(r111, r111) == r222
-  assert r222 == Units.resources_sum(r100, r010, r001, r111, Resources())
+  assert Units.resources_sum(Resources(), Resources()) == Resources(cpu=0, ram=0, disk=0, gpu=0)
+
+  r1000 = Resources(cpu=1, ram=0, disk=0, gpu=0)
+  r1001 = Resources(cpu=1, ram=0, disk=0, gpu=1)
+  r0100 = Resources(cpu=0, ram=1, disk=0, gpu=0)
+  r0010 = Resources(cpu=0, ram=0, disk=1, gpu=0)
+  r1110 = Resources(cpu=1, ram=1, disk=1, gpu=0)
+  r1101 = Resources(cpu=1, ram=1, disk=0, gpu=1)
+  r2220 = Resources(cpu=2, ram=2, disk=2, gpu=0)
+
+  assert reduce(Units.resources_sum, [r1000, r0100, r0010]) == r1110
+  assert Units.resources_sum(r1110, r1110) == r2220
+  assert r2220 == Units.resources_sum(r1000, r0100, r0010, r1110, Resources())
+  assert Units.resources_sum(r1001, r0100) == r1101
+
+
+def test_max_resources():
+  assert Resources(cpu=1, ram=2, disk=3, gpu=4) == Units.resources_max([
+      Resources(cpu=0, ram=2, disk=1, gpu=4),
+      Resources(cpu=1, ram=1, disk=2, gpu=0),
+      Resources(cpu=0, ram=1, disk=3, gpu=1)
+  ])
 
 
 def test_combine_tasks():

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora
----------------------------------------------------------------------
diff --git a/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora b/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora
index 219c40f..bf6ef69 100644
--- a/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora
+++ b/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora
@@ -14,7 +14,19 @@
 
 import getpass
 
-DEFAULT_CMD = 'cp /vagrant/src/test/sh/org/apache/aurora/e2e/http_example.py .'
+class DefaultProfile(Struct):
+  role=Default(String, getpass.getuser())
+  cmd=Default(String, 'cp /vagrant/src/test/sh/org/apache/aurora/e2e/http_example.py .')
+  gpu=Default(Integer, 0)
+
+
+ContainerProfile = DefaultProfile(
+  cmd = 'cp /tmp/http_example.py .'
+)
+
+GpuProfile = DefaultProfile(
+  gpu=1
+)
 
 echo_ports = Process(
   name = 'echo_ports',
@@ -27,12 +39,12 @@ run_server = Process(
 
 stage_server = Process(
   name = 'stage_server',
-  cmdline = '{{cmd}}'
+  cmdline = '{{profile.cmd}}'
 )
 
 test_task = Task(
   name = 'http_example',
-  resources = Resources(cpu=0.4, ram=32*MB, disk=64*MB),
+  resources = Resources(cpu=0.5, ram=32*MB, disk=64*MB, gpu='{{profile.gpu}}'),
   processes = [echo_ports, stage_server, run_server],
   constraints = order(echo_ports, stage_server, run_server))
 
@@ -45,9 +57,9 @@ job = Service(
   update_config = update_config,
   health_check_config = health_check_config,
   task = test_task,
-  role = getpass.getuser(),
+  role = '{{profile.role}}',
   environment = 'test',
-  contact = '{{role}}@localhost',
+  contact = '{{profile.role}}@localhost',
   announce = Announcer(
     portmap={'alias': 'http'},
   ),
@@ -56,25 +68,20 @@ job = Service(
 jobs = [
   job(
     name = 'http_example'
-  ).bind(
-    cmd = DEFAULT_CMD
-  ),
+  ).bind(profile=DefaultProfile()),
   job(
     name = 'http_example_revocable',
     tier = 'revocable'
-  ).bind(
-    cmd = DEFAULT_CMD
-  ),
+  ).bind(profile=DefaultProfile()),
   job(
     name = 'http_example_docker',
     container = Container(docker=Docker(image = 'http_example'))
-  ).bind(
-    cmd = 'cp /tmp/http_example.py .'
-  ),
+  ).bind(profile=ContainerProfile),
   job(
     name = 'http_example_appc',
     container = Mesos(image=AppcImage(name='http_example', image_id='{{appc_image_id}}'))
-  ).bind(
-    cmd = 'cp /tmp/http_example.py .'
-  )
+  ).bind(profile=ContainerProfile),
+  job(
+    name = 'http_example_gpu'
+  ).bind(profile=GpuProfile)
 ]

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora
----------------------------------------------------------------------
diff --git a/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora b/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora
index 08553e4..edeafbe 100644
--- a/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora
+++ b/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora
@@ -14,7 +14,19 @@
 
 import getpass
 
-DEFAULT_CMD = 'cp /vagrant/src/test/sh/org/apache/aurora/e2e/http_example.py .'
+class DefaultProfile(Struct):
+  role=Default(String, getpass.getuser())
+  cmd=Default(String, 'cp /vagrant/src/test/sh/org/apache/aurora/e2e/http_example.py .')
+  gpu=Default(Integer, 0)
+
+
+ContainerProfile = DefaultProfile(
+  cmd = 'cp /tmp/http_example.py .'
+)
+
+GpuProfile = DefaultProfile(
+  gpu=1
+)
 
 run_server = Process(
   name = 'run_server',
@@ -23,12 +35,12 @@ run_server = Process(
 
 stage_server = Process(
   name = 'stage_server',
-  cmdline = '{{cmd}}'
+  cmdline = '{{profile.cmd}}'
 )
 
 test_task = Task(
   name = 'http_example',
-  resources = Resources(cpu=0.5, ram=32*MB, disk=64*MB),
+  resources = Resources(cpu=0.5, ram=32*MB, disk=64*MB, gpu='{{profile.gpu}}'),
   processes = [stage_server, run_server],
   constraints = order(stage_server, run_server)
 )
@@ -53,34 +65,29 @@ job = Service(
   update_config = update_config,
   health_check_config = health_check_config,
   task = test_task,
-  role = getpass.getuser(),
+  role = '{{profile.role}}',
   environment = 'test',
-  contact = '{{role}}@localhost',
+  contact = '{{profile.role}}@localhost',
   announce = Announcer(),
 )
 
 jobs = [
   job(
     name = 'http_example'
-  ).bind(
-    cmd = DEFAULT_CMD
-  ),
+  ).bind(profile=DefaultProfile()),
   job(
     name = 'http_example_revocable',
     tier = 'revocable'
-  ).bind(
-    cmd = DEFAULT_CMD
-  ),
+  ).bind(profile=DefaultProfile()),
   job(
     name = 'http_example_docker',
     container = Container(docker=Docker(image = 'http_example'))
-  ).bind(
-    cmd = 'cp /tmp/http_example.py .'
-  ),
+  ).bind(profile=ContainerProfile),
   job(
     name = 'http_example_appc',
     container = Mesos(image=AppcImage(name='http_example', image_id='{{appc_image_id}}'))
-  ).bind(
-    cmd = 'cp /tmp/http_example.py .'
-  )
+  ).bind(profile=ContainerProfile),
+  job(
+    name = 'http_example_gpu'
+  ).bind(profile=GpuProfile)
 ]

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora
----------------------------------------------------------------------
diff --git a/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora b/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora
index 8b3a50b..9569eec 100644
--- a/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora
+++ b/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora
@@ -14,7 +14,19 @@
 
 import getpass
 
-DEFAULT_CMD = 'cp /vagrant/src/test/sh/org/apache/aurora/e2e/http_example.py .'
+class DefaultProfile(Struct):
+  role=Default(String, getpass.getuser())
+  cmd=Default(String, 'cp /vagrant/src/test/sh/org/apache/aurora/e2e/http_example.py .')
+  gpu=Default(Integer, 0)
+
+
+ContainerProfile = DefaultProfile(
+  cmd = 'cp /tmp/http_example.py .'
+)
+
+GpuProfile = DefaultProfile(
+  gpu=1
+)
 
 run_server = Process(
   name = 'run_server',
@@ -22,12 +34,12 @@ run_server = Process(
 
 stage_server = Process(
   name = 'stage_server',
-  cmdline = '{{cmd}}'
+  cmdline = '{{profile.cmd}}'
 )
 
 test_task = SequentialTask(
   name = 'http_example',
-  resources = Resources(cpu=0.4, ram=34*MB, disk=64*MB),
+  resources = Resources(cpu=0.4, ram=34*MB, disk=64*MB, gpu='{{profile.gpu}}'),
   processes = [stage_server, run_server])
 
 update_config = UpdateConfig(watch_secs=10, batch_size=3)
@@ -39,34 +51,29 @@ job = Service(
   update_config = update_config,
   health_check_config = health_check_config,
   task = test_task,
-  role = getpass.getuser(),
+  role = '{{profile.role}}',
   environment = 'test',
-  contact = '{{role}}@localhost',
+  contact = '{{profile.role}}@localhost',
   announce = Announcer(),
 )
 
 jobs = [
   job(
     name = 'http_example'
-  ).bind(
-    cmd = DEFAULT_CMD
-  ),
+  ).bind(profile=DefaultProfile()),
   job(
     name = 'http_example_revocable',
     tier = 'revocable'
-  ).bind(
-    cmd = DEFAULT_CMD
-  ),
+  ).bind(profile=DefaultProfile()),
   job(
     name = 'http_example_docker',
     container = Container(docker=Docker(image = 'http_example'))
-  ).bind(
-    cmd = 'cp /tmp/http_example.py .'
-  ),
+  ).bind(profile=ContainerProfile),
   job(
     name = 'http_example_appc',
     container = Mesos(image=AppcImage(name='http_example', image_id='{{appc_image_id}}'))
-  ).bind(
-    cmd = 'cp /tmp/http_example.py .'
-  )
+  ).bind(profile=ContainerProfile),
+  job(
+    name = 'http_example_gpu'
+  ).bind(profile=GpuProfile)
 ]

http://git-wip-us.apache.org/repos/asf/aurora/blob/edd47d08/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh
----------------------------------------------------------------------
diff --git a/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh b/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh
index abe0ca7..c3c9e64 100755
--- a/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh
+++ b/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh
@@ -152,7 +152,7 @@ test_observer_ui() {
 test_restart() {
   local _jobkey=$1
 
-  aurora job restart --batch-size=2 $_jobkey
+  aurora job restart --batch-size=2 --watch-secs=10 $_jobkey
 }
 
 assert_update_state() {
@@ -343,7 +343,7 @@ test_http_example() {
   test_quota $_cluster $_role
 }
 
-test_http_revocable_example() {
+test_http_example_basic() {
   local _cluster=$1 _role=$2 _env=$3
   local _base_config=$4
   local _job=$7
@@ -429,6 +429,7 @@ TEST_ROLE=vagrant
 TEST_ENV=test
 TEST_JOB=http_example
 TEST_JOB_REVOCABLE=http_example_revocable
+TEST_JOB_GPU=http_example_gpu
 TEST_JOB_DOCKER=http_example_docker
 TEST_JOB_APPC=http_example_appc
 TEST_CONFIG_FILE=$EXAMPLE_DIR/http_example.aurora
@@ -450,6 +451,8 @@ TEST_JOB_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB")
 
 TEST_JOB_REVOCABLE_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB_REVOCABLE")
 
+TEST_JOB_GPU_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB_GPU")
+
 TEST_JOB_DOCKER_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB_DOCKER")
 
 TEST_ADMIN_ARGS=($TEST_CLUSTER)
@@ -469,7 +472,9 @@ test_version
 test_http_example "${TEST_JOB_ARGS[@]}"
 test_health_check
 
-test_http_revocable_example "${TEST_JOB_REVOCABLE_ARGS[@]}"
+test_http_example_basic "${TEST_JOB_REVOCABLE_ARGS[@]}"
+
+test_http_example_basic "${TEST_JOB_GPU_ARGS[@]}"
 
 # build the test docker image
 sudo docker build -t http_example ${TEST_ROOT}