You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@slider.apache.org by bi...@apache.org on 2014/05/30 18:47:16 UTC

svn commit: r1598665 [2/16] - in /incubator/slider/site/content: ./ architecture/ configuration/ css/ developing/ images/ images/logos/ images/profiles/ img/ js/ registry/ release_notes/ slider_specs/ specification/

Added: incubator/slider/site/content/architecture/rolehistory.html
URL: http://svn.apache.org/viewvc/incubator/slider/site/content/architecture/rolehistory.html?rev=1598665&view=auto
==============================================================================
--- incubator/slider/site/content/architecture/rolehistory.html (added)
+++ incubator/slider/site/content/architecture/rolehistory.html Fri May 30 16:47:13 2014
@@ -0,0 +1,874 @@
+<!DOCTYPE html>
+<!--
+ | Generated by Apache Maven Doxia at 2014-05-30
+ | Rendered using Apache Maven Fluido Skin 1.3.0
+-->
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <meta name="Date-Revision-yyyymmdd" content="20140530" />
+    <meta http-equiv="Content-Language" content="en" />
+    <title>Apache Slider 0.30 (incubating) - </title>
+    <link rel="stylesheet" href="../css/apache-maven-fluido-1.3.0.min.css" />
+    <link rel="stylesheet" href="../css/site.css" />
+    <link rel="stylesheet" href="../css/print.css" media="print" />
+
+      
+    <script type="text/javascript" src="../js/apache-maven-fluido-1.3.0.min.js"></script>
+
+    
+            </head>
+        <body class="topBarEnabled">
+          
+                        
+                    
+                
+
+    <div id="topbar" class="navbar navbar-fixed-top ">
+      <div class="navbar-inner">
+                                  <div class="container"><div class="nav-collapse">
+            
+                
+                                <ul class="nav">
+                          <li class="dropdown">
+        <a href="#" class="dropdown-toggle" data-toggle="dropdown">Project Documentation <b class="caret"></b></a>
+        <ul class="dropdown-menu">
+        
+                      <li class="dropdown-submenu">
+                                      <a href="../project-info.html"  title="Project Information">Project Information</a>
+              <ul class="dropdown-menu">
+                                  <li>      <a href="../index.html"  title="About">About</a>
+</li>
+                                  <li>      <a href="../plugin-management.html"  title="Plugin Management">Plugin Management</a>
+</li>
+                                  <li>      <a href="../distribution-management.html"  title="Distribution Management">Distribution Management</a>
+</li>
+                                  <li>      <a href="../dependency-info.html"  title="Dependency Information">Dependency Information</a>
+</li>
+                                  <li>      <a href="../dependency-convergence.html"  title="Dependency Convergence">Dependency Convergence</a>
+</li>
+                                  <li>      <a href="../source-repository.html"  title="Source Repository">Source Repository</a>
+</li>
+                                  <li>      <a href="../mail-lists.html"  title="Mailing Lists">Mailing Lists</a>
+</li>
+                                  <li>      <a href="../issue-tracking.html"  title="Issue Tracking">Issue Tracking</a>
+</li>
+                                  <li>      <a href="../integration.html"  title="Continuous Integration">Continuous Integration</a>
+</li>
+                                  <li>      <a href="../plugins.html"  title="Project Plugins">Project Plugins</a>
+</li>
+                                  <li>      <a href="../license.html"  title="Project License">Project License</a>
+</li>
+                                  <li>      <a href="../modules.html"  title="Project Modules">Project Modules</a>
+</li>
+                                  <li>      <a href="../dependency-management.html"  title="Dependency Management">Dependency Management</a>
+</li>
+                                  <li>      <a href="../team-list.html"  title="Project Team">Project Team</a>
+</li>
+                                  <li>      <a href="../project-summary.html"  title="Project Summary">Project Summary</a>
+</li>
+                                  <li>      <a href="../dependencies.html"  title="Dependencies">Dependencies</a>
+</li>
+                              </ul>
+            </li>
+                  
+                      <li class="dropdown-submenu">
+                                      <a href="../project-reports.html"  title="Project Reports">Project Reports</a>
+              <ul class="dropdown-menu">
+                                  <li>      <a href="../surefire-report.html"  title="Surefire Report">Surefire Report</a>
+</li>
+                              </ul>
+            </li>
+                          </ul>
+      </li>
+                <li class="dropdown">
+        <a href="#" class="dropdown-toggle" data-toggle="dropdown">Documents <b class="caret"></b></a>
+        <ul class="dropdown-menu">
+        
+                      <li>      <a href="../getting_started.html"  title="Getting Started">Getting Started</a>
+</li>
+                  
+                      <li>      <a href="../manpage.html"  title="manpage">manpage</a>
+</li>
+                  
+                      <li>      <a href="../troubleshooting.html"  title="Troubleshooting">Troubleshooting</a>
+</li>
+                  
+                      <li>      <a href="../architecture/index.html"  title="Architecture">Architecture</a>
+</li>
+                  
+                      <li>      <a href="../developing/index.html"  title="Developing">Developing</a>
+</li>
+                  
+                      <li>      <a href="../exitcodes.html"  title="Exitcodes">Exitcodes</a>
+</li>
+                          </ul>
+      </li>
+                  </ul>
+          
+          
+                                                              
+                   
+                      </div>
+          
+        </div>
+      </div>
+    </div>
+    
+        <div class="container">
+          <div id="banner">
+        <div class="pull-left">
+                                                  <a href=".././" id="bannerLeft">
+                <h2>Apache Slider (incubating)</h2>
+                </a>
+                      </div>
+        <div class="pull-right">              <div id="bannerRight">
+                                                                                        <img src="http://incubator.apache.org/images/apache-incubator-logo.png" />
+                </div>
+      </div>
+        <div class="clear"><hr/></div>
+      </div>
+
+      <div id="breadcrumbs">
+        <ul class="breadcrumb">
+                
+                    
+                  <li id="publishDate">Last Published: 2014-05-30</li>
+                      
+                
+                    
+                 <li id="projectVersion" class="pull-right">Version: 0.30</li>
+      
+                            </ul>
+      </div>
+
+      
+                
+        <div id="bodyColumn" >
+                                  
+            <!-- -
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. --><h1>Apache Slider Role History: how Slider brings back nodes in the same location</h1>
+<div class="section">
+<div class="section">
+<h3>Last updated 2013-12-06<a name="Last_updated_2013-12-06"></a></h3>
+
+<ul>
+  
+<li>This document uses the pre-Slider terminology of role/cluster rather than component and application instance.</li>
+</ul></div></div>
+<div class="section">
+<h2>Outstanding issues<a name="Outstanding_issues"></a></h2>
+
+<ol style="list-style-type: decimal">
+  
+<li>
+<p>Can we use the history to implement anti-affinity: for any role with this flag, use our knowledge of the cluster to ask for all nodes that aren&#x2019;t in use already</p></li>
+  
+<li>
+<p>How to add blacklisting here? We are tracking failures and startup failures per node (not persisted), but not using this in role placement requests yet.</p></li>
+</ol></div>
+<div class="section">
+<h2>Introduction<a name="Introduction"></a></h2>
+<p>Slider needs to bring up instances of a given role on the machine(s) on which they last ran -it should remember after shrinking or freezing a cluster which servers were last used for a role -and use this (persisted) data to select servers next time.</p>
+<p>It does this on the basis that the role instances prefer node-local access to data previously persisted to HDFS. This is precisely the case for Apache HBase, which can use Unix Domain Sockets to talk to the DataNode without using the TCP stack. The HBase master persists to HDFS the tables assigned to specific Region Servers, and when HBase is restarted its master tries to reassign the same tables back to Region Servers on the same machine.</p>
+<p>For this to work in a dynamic cluster, Slider needs to bring up Region Servers on the previously used hosts, so that the HBase Master can re-assign the same tables.</p>
+<p>Note that it does not need to care about the placement of other roles, such as the HBase masters -there anti-affinity between other instances is the key requirement.</p>
+<div class="section">
+<h3>Terminology<a name="Terminology"></a></h3>
+
+<ul>
+  
+<li><b>Role Instance</b> : a single instance of a role.</li>
+  
+<li><b>Node</b> : A server in the YARN Physical (or potentially virtual) Cluster of servers.</li>
+  
+<li><b>Slider Cluster</b>: The set of role instances deployed by Slider so as to  create a single aggregate application.</li>
+  
+<li><b>Slider AM</b>: The Application Master of Slider: the program deployed by YARN to manage its Slider Cluster.</li>
+  
+<li><b>RM</b> YARN Resource Manager</li>
+</ul></div>
+<div class="section">
+<h3>Assumptions<a name="Assumptions"></a></h3>
+<p>Here are some assumptions in Slider&#x2019;s design</p>
+
+<ol style="list-style-type: decimal">
+  
+<li>
+<p>Instances of a specific role should preferably be deployed onto different servers. This enables Slider to only remember the set of server nodes onto which instances were created, rather than more complex facts such as &quot;two Region Servers were previously running on Node #17&quot;. On restart Slider can simply request one instance of a Region Server on a specific node, leaving the other instance to be arbitrarily deployed by YARN. This strategy should help reduce the <i>affinity</i> in the role deployment, and so increase their resilience to failure.</p></li>
+  
+<li>
+<p>There is no need to make sophisticated choices on which nodes to request re-assignment -such as recording the amount of data persisted by a previous instance and prioritizing nodes based on such data. More succinctly: the only priority needed when asking for nodes is <i>ask for the most recently used</i>.</p></li>
+  
+<li>
+<p>Different roles are independent: it is not an issue if instances of different roles (for example, an Accumulo Monitor and an Accumulo Tablet Server) are on the same host. This assumption allows Slider to only worry about affinity issues within a specific role, rather than across all roles.</p></li>
+  
+<li>
+<p>After a cluster has been started, the rate of change of the cluster is low: both node failures and cluster flexing happen at the rate of every few hours, rather than every few seconds. This allows Slider to avoid needing data structures and layout persistence code designed for regular and repeated changes.</p></li>
+  
+<li>
+<p>Instance placement is best-effort: if the previous placement cannot be satisfied, the application will still perform adequately with role instances deployed onto new servers. More specifically, if a previous server is unavailable for hosting a role instance due to lack of capacity or availability, Slider will not decrement the number of instances to deploy: instead it will rely on YARN to locate a new node -ideally on the same rack.</p></li>
+  
+<li>
+<p>If two instances of the same role do get assigned to the same server, it is not a failure condition. (This may be problematic for some roles -we may need a role-by-role policy here, so that master nodes can be anti-affine) [specifically, &gt;1 HBase master node will not come up on the same host]</p></li>
+  
+<li>
+<p>If a role instance fails on a specific node, asking for a container on that same node for the replacement instance is a valid recovery strategy. This contains assumptions about failure modes -some randomness here may be a valid tactic, especially for roles that do not care about locality.</p></li>
+  
+<li>
+<p>Tracking failure statistics of nodes may be a feature to add in future; designing the Role History datastructures to enable future collection of rolling statistics on recent failures would be a first step to this </p></li>
+</ol></div>
+<div class="section">
+<h3>The Role History<a name="The_Role_History"></a></h3>
+<p>The <tt>RoleHistory</tt> is a datastructure which models the role assignment, and can persist it to and restore it from the (shared) filesystem.</p>
+
+<ul>
+  
+<li>
+<p>For each role, there is a list of cluster nodes which have hosted this role in the past.</p></li>
+  
+<li>
+<p>This history is used when selecting a node for a role.</p></li>
+  
+<li>
+<p>This history remembers when nodes were allocated. These are re-requested when thawing a cluster.</p></li>
+  
+<li>
+<p>It must also remember when nodes were released -these are re-requested when returning the cluster size to a previous size during flex operations.</p></li>
+  
+<li>
+<p>It has to track nodes for which Slider has an outstanding container request with YARN. This ensures that the same node is not requested more than once due to outstanding requests.</p></li>
+  
+<li>
+<p>It does not retain a complete history of the role -and does not need to. All it needs to retain is the recent history for every node onto which a role instance has been deployed. Specifically, the last allocation or release operation on a node is all that needs to be persisted.</p></li>
+  
+<li>
+<p>On AM startup, all nodes in the history are considered candidates, even those nodes currently marked as active -as they were from the previous instance.</p></li>
+  
+<li>
+<p>On AM restart, nodes in the role history marked as active have to be considered still active -the YARN RM will have to provide the full list of which are not.</p></li>
+  
+<li>
+<p>During cluster flexing, nodes marked as released -and for which there is no outstanding request - are considered candidates for requesting new instances.</p></li>
+  
+<li>
+<p>When choosing a candidate node for hosting a role instance, it is taken from the head of the time-ordered list of nodes that last ran an instance of that role.</p></li>
+</ul></div>
+<div class="section">
+<h3>Persistence<a name="Persistence"></a></h3>
+<p>The state of the role is persisted to HDFS on changes -but not on cluster termination.</p>
+
+<ol style="list-style-type: decimal">
+  
+<li>When nodes are allocated, the Role History is marked as dirty</li>
+  
+<li>When container release callbacks are received, the Role History is marked as dirty</li>
+  
+<li>When nodes are requested or a release request made, the Role History is <i>not</i>  marked as dirty. This information is not relevant on AM restart.</li>
+</ol>
+<p>As at startup, a large number of allocations may arrive in a short period of time, the Role History may be updated very rapidly -yet as the containers are only recently activated, it is not likely that an immediately restarted Slider cluster would gain by re-requesting containers on them -their historical value is more important than their immediate past.</p>
+<p>Accordingly, the role history may be persisted to HDFS asynchronously, with the dirty bit triggering a flush of the state to HDFS. The datastructure will still need to be synchronized for cross thread access, but the sync operation will not be a major bottleneck, compared to saving the file on every container allocation response (which will actually be the initial implementation).</p>
+<p>There&#x2019;s no need to persist the format in a human-readable form; while protobuf might seem the approach most consistent with the rest of YARN, it&#x2019;s not an easy structure to work with.</p>
+<p>The initial implementation will use Apache Avro as the persistence format, with the data saved in JSON or compressed format.</p></div></div>
+<div class="section">
+<h2>Weaknesses in this design<a name="Weaknesses_in_this_design"></a></h2>
+<p><b>Blacklisting</b>: even if a node fails repeatedly, this design will still try to re-request instances on this node; there is no blacklisting. As a central blacklist for YARN has been proposed, it is hoped that this issue will be addressed centrally, without Slider having to remember which nodes are unreliable <i>for that particular Slider cluster</i>.</p>
+<p><b>Anti-affinity</b>: If multiple role instances are assigned to the same node, Slider has to choose on restart or flexing whether to ask for multiple containers on that node again, or to pick other nodes. The assumed policy is &#x201c;only ask for one node&#x201d;.</p>
+<p><b>Bias towards recent nodes over most-used</b>: re-requesting the most recent nodes, rather than those with the most history of use, may push Slider to requesting nodes that were only briefly in use -and so have only a small amount of local state, over nodes that have had long-lived instances. This is a problem that could perhaps be addressed by preserving more history of a node -maintaining some kind of moving average of node use and picking the heaviest used, or some other more-complex algorithm. This may be possible, but we&#x2019;d need evidence that the problem existed before trying to address it.</p>
+<h1>The NodeMap: the core of the Role History</h1>
+<p>The core data structure, the <tt>NodeMap</tt> is a map of every known node in the cluster, tracking how many containers are allocated to specific roles in it, and, when there are no active instances, when it was last used. This history is used to choose where to request new containers. Because of the asynchronous allocation and release of containers, the Role History also needs to track outstanding release requests &#x2013;and, more critically, outstanding allocation requests. If Slider has already requested a container for a specific role on a host, then asking for another container of that role would break anti-affinity requirements. Note that not tracking outstanding requests would radically simplify some aspects of the design, especially the complexity of correlating allocation responses with the original requests -and so the actual hosts originally requested.</p>
+
+<ol style="list-style-type: decimal">
+  
+<li>Slider builds up a map of which nodes have recently been used.</li>
+  
+<li>Every node counts the number of active containers in each role.</li>
+  
+<li>Nodes are only chosen for allocation requests when there are no active or requested containers on that node.</li>
+  
+<li>When choosing which instances to release, Slider could pick the node with the most containers on it. This would spread the load.</li>
+  
+<li>When there are no empty nodes to request containers on, a request would let YARN choose.</li>
+</ol>
+<div class="section">
+<div class="section">
+<h4>Strengths<a name="Strengths"></a></h4>
+
+<ul>
+  
+<li>Handles the multi-container on one node problem</li>
+  
+<li>By storing details about every role, cross-role decisions could be possible</li>
+  
+<li>Simple counters can track the state of pending add/release requests</li>
+  
+<li>Scales well to a rapidly flexing cluster</li>
+  
+<li>Simple to work with and persist</li>
+  
+<li>Easy to view and debug</li>
+  
+<li>Would support cross-role collection of node failures in future</li>
+</ul></div>
+<div class="section">
+<h4>Weaknesses<a name="Weaknesses"></a></h4>
+
+<ul>
+  
+<li>Size of the data structure is <tt>O(nodes * role-instances)</tt>. This could be mitigated by regular cleansing of the structure. For example, at thaw time (or intermittently) all unused nodes &gt; 2 weeks old could be dropped.</li>
+  
+<li>Locating a free node could take <tt>O(nodes)</tt> lookups -and if the criterion of &#x201c;newest&#x201d; is included, will take exactly <tt>O(nodes)</tt> lookups. As an optimization, a list of recently explicitly released nodes can be maintained.</li>
+  
+<li>Need to track outstanding requests against nodes, so that if a request was satisfied on a different node, the original node&#x2019;s request count is  decremented, <i>not that of the node actually allocated</i>.</li>
+  
+<li>In a virtual cluster, may fill with node entries that are no longer in the cluster. Slider should query the RM (or topology scripts?) to determine if nodes are still parts of the YARN cluster.</li>
+</ul></div></div></div>
+<div class="section">
+<h2>Data Structures<a name="Data_Structures"></a></h2>
+<div class="section">
+<h3>RoleHistory<a name="RoleHistory"></a></h3>
+
+<div class="source">
+<pre>startTime: long
+saveTime: long
+dirty: boolean
+nodemap: NodeMap
+roles: RoleStatus[]
+outstandingRequests: transient OutstandingRequestTracker
+availableNodes: transient List&lt;NodeInstance&gt;[]
+</pre></div>
+<p>This is the aggregate data structure that is persisted to/from file</p></div>
+<div class="section">
+<h3>NodeMap<a name="NodeMap"></a></h3>
+
+<div class="source">
+<pre>clusterNodes: Map: NodeId -&gt; NodeInstance
+clusterNodes(): Iterable&lt;NodeInstance&gt;
+getOrCreate(NodeId): NodeInstance
+</pre></div>
+<p>Maps a YARN NodeID record to a Slider <tt>NodeInstance</tt> structure</p></div>
+<div class="section">
+<h3>NodeInstance<a name="NodeInstance"></a></h3>
+<p>Every node in the cluster is modeled as a ragged array of <tt>NodeEntry</tt> instances, indexed by role index:</p>
+
+<div class="source">
+<pre>NodeEntry[roles]
+get(roleId): NodeEntry or null
+create(roleId): NodeEntry
+getNodeEntries(): NodeEntry[roles]
+getOrCreate(roleId): NodeEntry
+remove(roleId): NodeEntry
+</pre></div>
+<p>This could be implemented in a map or an indexed array; the array is more efficient but it does mandate that the number of roles is bounded and fixed.</p></div>
+<div class="section">
+<h3>NodeEntry<a name="NodeEntry"></a></h3>
+<p>Records the details about all of a role&#x2019;s containers on a node. The <tt>active</tt> field records the number of containers currently active.</p>
+
+<div class="source">
+<pre>active: int
+requested: transient int
+releasing: transient int
+last_used: long
+
+NodeEntry.available(): boolean = active - releasing == 0 &amp;&amp; requested == 0
+</pre></div>
+<p>The two fields <tt>releasing</tt> and <tt>requested</tt> are used to track the ongoing state of YARN requests; they do not need to be persisted across freeze/thaw cycles. They may be relevant across AM restart, but without other data structures in the AM, not enough to track what the AM was up to before it was restarted. The strategy will be to ignore unexpected allocation responses (which may come from pre-restart requests), while treating unexpected container release responses as failures.</p>
+<p>The <tt>active</tt> counter is only decremented after a container release response has been received.</p></div>
+<div class="section">
+<h3>RoleStatus<a name="RoleStatus"></a></h3>
+<p>This is the existing <tt>org.apache.hoya.yarn.appmaster.state.RoleStatus</tt> class</p></div>
+<div class="section">
+<h3>RoleList<a name="RoleList"></a></h3>
+<p>A list mapping role to int enum is needed to index NodeEntry elements in the NodeInstance arrays. Although such an enum is already implemented in the Slider Providers, explicitly serializing and deserializing it would make the persistent structure easier to parse in other tools, and resilient to changes in the number or position of roles.</p>
+<p>This list could also retain information about recently used/released nodes, so that the selection of containers to request could shortcut a search</p></div>
+<div class="section">
+<h3>ContainerPriority<a name="ContainerPriority"></a></h3>
+<p>The container priority field (a 32 bit integer) is used by Slider (0.5.x) to index the specific role in a container so as to determine which role has been offered in a container allocation message, and which role has been released on a release event.</p>
+<p>The Role History needs to track outstanding requests, so that when an allocation comes in, it can be mapped back to the original request. Simply looking up the nodes on the provided container and decrementing its request counter is not going to work -the container may be allocated on a different node from that requested.</p>
+<p><b>Proposal</b>: The priority field of a request is divided by Slider into 8 bits for <tt>roleID</tt> and 24 bits for <tt>requestID</tt>. The request ID will be a simple rolling integer -Slider will assume that after 2^24 requests per role, it can be rolled, -though as we will be retaining a list of outstanding requests, a clash should not occur. The main requirement is: not have &gt; 2^24 outstanding requests for instances of a specific role, which places an upper bound on the size of a Slider cluster.</p>
+<p>The splitting and merging will be implemented in a ContainerPriority class, for uniform access.</p></div>
+<div class="section">
+<h3>OutstandingRequest<a name="OutstandingRequest"></a></h3>
+<p>Tracks an outstanding request. This is used to correlate an allocation response (whose Container Priority field is used to locate this request), with the node and role used in the request.</p>
+
+<div class="source">
+<pre>  roleId:  int
+  requestID :  int
+  node: string (may be null)
+  requestedTime: long
+  priority: int = roleId &lt;&lt; 24 | requestID
+</pre></div>
+<p>The node identifier may be null -which indicates that a request was made without a specific target node</p></div>
+<div class="section">
+<h3>OutstandingRequestTracker<a name="OutstandingRequestTracker"></a></h3>
+<p>Contains a map from requestID to the specific <tt>OutstandingRequest</tt> made, and generates the request ID</p>
+
+<div class="source">
+<pre>nextRequestId: int
+requestMap(RequestID) -&gt; OutstandingRequest
+</pre></div>
+<p>Operations</p>
+
+<div class="source">
+<pre>addRequest(NodeInstance, RoleId): OutstandingRequest
+    (and an updated request Map with a new entry)
+lookup(RequestID): OutstandingRequest
+remove(RequestID): OutstandingRequest
+listRequestsForNode(ClusterID): [OutstandingRequest]
+</pre></div>
+<p>The list operation can be implemented inefficiently unless it is found to be important -if so a more complex structure will be needed.</p></div>
+<div class="section">
+<h3>AvailableNodes<a name="AvailableNodes"></a></h3>
+<p>This is a field in <tt>RoleHistory</tt></p>
+
+<div class="source">
+<pre>availableNodes: List&lt;NodeInstance&gt;[]
+</pre></div>
+<p>For each role, lists the nodes that are available for data-local allocation, ordered by most recently released, to accelerate node selection.</p>
+<p>The performance benefit is most significant when requesting multiple nodes, as the scan for M locations from N nodes is reduced from <tt>M*N</tt> comparisons to 1 Sort + M list lookups.</p>
+<p>Each list can be created off the Node Map by building for each role a sorted list of all Nodes which are available for an instance of that role, using a comparator that places the most recently released node ahead of older nodes.</p>
+<p>This list is not persisted -when a Slider Cluster is frozen it is moot, and when an AM is restarted this structure will be rebuilt.</p>
+
+<ol style="list-style-type: decimal">
+  
+<li>When a node is needed for a new request, this list is consulted first.</li>
+  
+<li>After the request is issued it can be removed from the list</li>
+  
+<li>Whenever a container is released, if the node is now available for requests for that role, it should be added to the front of the list for that role.</li>
+</ol>
+<p>If the list is empty during a container request operation, it means that the Role History does not know of any nodes in the cluster that have hosted instances of that role and which are not in use. There are then two possible strategies to select a node:</p>
+
+<ol style="list-style-type: decimal">
+  
+<li>Ask for an instance anywhere in the cluster (policy in Slider 0.5)</li>
+  
+<li>Search the node map to identify other nodes which are (now) known about, but which are not hosting instances of a specific role -this can be used as the target for the next resource request.</li>
+</ol>
+<p>Strategy #1 is simpler; Strategy #2 <i>may</i> decrease the affinity in the cluster, as the AM will be explicitly requesting an instance on a node which it knows is not running an instance of that role.</p>
+<div class="section">
+<h4>ISSUE What to do about failing nodes?<a name="ISSUE_What_to_do_about_failing_nodes"></a></h4>
+<p>Should a node whose container just failed be placed at the top of the stack, ready for the next request? </p>
+<p>If the container failed due to an unexpected crash in the application, asking for that container back <i>is the absolute right strategy</i> -it will bring back a new role instance on that machine. </p>
+<p>If the container failed because the node is now offline, the container request will not be satisfied by that node.</p>
+<p>If there is a problem with the node, such that containers repeatedly fail on it, then re-requesting containers on it will amplify the damage.</p></div></div></div>
+<div class="section">
+<h2>Actions<a name="Actions"></a></h2>
+<div class="section">
+<h3>Bootstrap<a name="Bootstrap"></a></h3>
+
+<ol style="list-style-type: decimal">
+  
+<li>Persistent Role History file not found; empty data structures created.</li>
+</ol></div>
+<div class="section">
+<h3>Thaw<a name="Thaw"></a></h3>
+<p>When thawing, the Role History should be loaded -if it is missing Slider must revert to the bootstrap actions.</p>
+<p>If found, the Role History will contain Slider&#x2019;s view of the Slider Cluster&#x2019;s state at the time the history was saved, explicitly recording the last-used time of all nodes no longer hosting a role&#x2019;s container. By noting which roles were actually being served, it implicitly notes which nodes have a <tt>last_used</tt> value greater than any of the <tt>last_used</tt> fields persisted in the file. That is: all node entries listed as having active nodes at the time the history was saved must have more recent data than those nodes listed as inactive.</p>
+<p>When rebuilding the data structures, the fact that nodes were active at save time must be converted into the data that indicates that the nodes were at least in use <i>at the time the data was saved</i>. The state of the cluster after the last save is unknown.</p>
+<p>1: Role History loaded; Failure =&gt; Bootstrap. 2: Future: if role list enum != current enum, remapping could take place. Until then: fail. 3: Set the <tt>last_used</tt> time of every node entry marked as active at save time to the save time of the history.</p>
+<p>//define a threshold of seven days (in milliseconds): <tt>threshold = rolehistory.saveTime - 7*24*60*60*1000</tt></p>
+
+<div class="source">
+<pre>for (clusterId, clusterNode) in rolehistory.clusterNodes().entries() :
+  for (role, nodeEntry) in clusterNode.getNodeEntries():
+    nodeEntry.requested = 0
+    nodeEntry.releasing = 0
+    if nodeEntry.active &gt; 0 :
+      nodeEntry.last_used = rolehistory.saveTime;
+    nodeEntry.active = 0
+    if nodeEntry.last_used &lt; threshold :
+      clusterNode.remove(role)
+    else:
+     availableNodes[role].add(clusterId)
+   if clusterNode.getNodeEntries() isEmpty :
+     rolehistory.clusterNodes.remove(clusterId)
+
+
+for availableNode in availableNodes:
+  sort(availableNode,new last_used_comparator())
+</pre></div>
+<p>After this operation, the structures are purged with all out of date entries, and the available node list contains a sorted list of the remainder.</p></div>
+<div class="section">
+<h3>AM Restart<a name="AM_Restart"></a></h3>
+<p>1: Create the initial data structures as in the thaw operation. 2: Update the structure with the list of live nodes, removing those nodes from the list of available nodes.</p>
+
+<div class="source">
+<pre>now = time()
+activeContainers = RM.getActiveContainers()
+
+for container in activeContainers:
+   nodeId = container.nodeId
+   clusterNode = roleHistory.nodemap.getOrCreate(nodeId)
+   role = extractRoleId(container.getPriority)
+   nodeEntry = clusterNode.getOrCreate(role)
+   nodeEntry.active++
+   nodeEntry.last_used = now
+   availableNodes[role].remove(nodeId)
+</pre></div>
+<p>There&#x2019;s no need to re-sort the available node list -all that has happened is that some entries have been removed.</p>
+<p><b>Issue</b>: what if requests come in for a <tt>(role, requestID)</tt> for the previous instance of the AM? Could we just always set the initial requestId counter to a random number and hope the collision rate is very, very low (2^24 * #(outstanding_requests)). If YARN-1041 ensures that a restarted AM does not receive outstanding requests, this issue goes away.</p></div>
+<div class="section">
+<h3>Teardown<a name="Teardown"></a></h3>
+
+<ol style="list-style-type: decimal">
+  
+<li>If dirty, save role history to its file.</li>
+  
+<li>Issue release requests</li>
+  
+<li>Maybe update data structures on responses, but do not mark Role History as dirty or flush it to disk.</li>
+</ol>
+<p>This strategy is designed to eliminate the expectation that there will ever be a clean shutdown -and so that the startup-time code should expect the Role History to have been written during shutdown. Instead the code should assume that the history was saved to disk at some point during the life of the Slider Cluster -ideally after the most recent change, and that the information in it is only an approximation of what the previous state of the cluster was.</p></div>
+<div class="section">
+<h3>Flex: Requesting a container in role <tt>role</tt><a name="Flex:_Requesting_a_container_in_role_role"></a></h3>
+
+<div class="source">
+<pre>node = availableNodes[roleId].pop() 
+if node != null :
+  node.nodeEntry[roleId].requested++;
+outstanding = outstandingRequestTracker.addRequest(node, roleId)
+request.node = node
+request.priority = outstanding.priority
+
+//update existing Slider role status
+roleStatus[roleId].incRequested();
+</pre></div>
+<p>There is a bias here towards previous nodes, even if the number of nodes in the cluster has changed. This is why a node is picked where the number of <tt>active-releasing == 0 and requested == 0</tt>, rather than where it is simply the lowest value of <tt>active + requested - releasing</tt>: if there is no node in the nodemap that is not running an instance of that role, it is left to the RM to decide where the role instance should be instantiated.</p>
+<p>This bias towards previously used nodes also means that (lax) requests will be made of nodes that are currently unavailable either because they are offline or simply overloaded with other work. In such circumstances, the node will have an active count of zero -so the search will find these nodes and request them -even though the requests cannot be satisfied. As a result, the request will be downgraded to a rack-local or cluster-wide request -an acceptable degradation on a cluster where all the other entries in the nodemap have instances of that specific role -but not when there are empty nodes. </p>
+<div class="section">
+<h4>Solutions<a name="Solutions"></a></h4>
+
+<ol style="list-style-type: decimal">
+  
+<li>
+<p>Add some randomness in the search of the datastructure, rather than simply iterate through the values. This would prevent the same unsatisfiable node from being requested first.</p></li>
+  
+<li>
+<p>Keep track of requests, perhaps through a last-requested counter -and use this in the selection process. This would radically complicate the selection algorithm, and would not even distinguish &#x201c;node recently released that was also the last requested&#x201d; from &#x201c;node that has not recently satisfied requests even though it was recently requested&#x201d;.</p></li>
+  
+<li>
+<p>Keep track of requests that weren&#x2019;t satisfied, so identify a node that isn&#x2019;t currently satisfying requests.</p></li>
+</ol></div>
+<div class="section">
+<h4>History Issues<a name="History_Issues"></a></h4>
+<p>Without using that history, there is a risk that a very old assignment is used in place of a recent one and the value of locality decreased.</p>
+<p>But there are consequences:</p>
+<p><b>Performance</b>:</p>
+<p>Using the history to pick a recent node may increase selection times on a large cluster, as for every instance needed, a scan of all nodes in the nodemap is required (unless there is some clever bulk assignment list being built up), or a sorted version of the nodemap is maintained, with a node placed at the front of this list whenever it is updated.</p>
+<p><b>Thaw-time problems</b></p>
+<p>There is also the risk that while thawing, the <tt>rolehistory.saved</tt> flag may be updated while the cluster flex is in progress, so making the saved nodes appear out of date. Perhaps the list of recently released nodes could be rebuilt at thaw time.</p>
+<p>The proposed <tt>recentlyReleasedList</tt> addresses this, though it creates another data structure to maintain and rebuild at cluster thaw time from the last-used fields in the node entries.</p></div></div>
+<div class="section">
+<h3>AM Callback : onContainersAllocated<a name="AM_Callback_:_onContainersAllocated"></a></h3>
+
+<div class="source">
+<pre>void onContainersAllocated(List&lt;Container&gt; allocatedContainers) 
+</pre></div>
+<p>This is the callback received when containers have been allocated. Due to (apparently) race conditions, the AM may receive duplicate container allocations -Slider already has to recognize this and currently simply discards any surplus.</p>
+<p>If the AM tracks outstanding requests made for specific hosts, it will need to correlate allocations with the original requests, so as to decrement the node-specific request count. Decrementing the request count on the allocated node will not work, as the allocation may not be to the node originally requested.</p>
+
+<div class="source">
+<pre>assignments = []
+operations =  []
+for container in allocatedContainers:
+  cid = container.getId();
+  roleId = container.priority &amp; 0xff
+  nodeId = container.nodeId
+  outstanding = outstandingRequestTracker.remove(container.priority)
+  roleStatus = lookupRoleStatus(container);
+  roleStatus.decRequested();
+  allocated = roleStatus.incActual();
+  if outstanding == null || allocated &gt; desired :
+    operations.add(new ContainerReleaseOperation(cid))
+    surplusNodes.add(cid);
+    surplusContainers++
+    roleStatus.decActual();
+  else:
+    assignments.add(new ContainerAssignment(container, role))
+    node = nodemap.getOrCreate(nodeId)
+    nodeentry = node.get(roleId)
+    if nodeentry == null :
+      nodeentry = new NodeEntry()
+      node[roleId] = nodeentry
+      nodeentry.active = 1
+    else:
+      if nodeentry.requested &gt; 0 :
+        nodeentry.requested--
+      nodeentry.active++
+    nodemap.dirty = true
+
+    // work back from request ID to node where the 
+    // request was outstanding
+    requestID = outstanding != null? outstanding.nodeId : null
+    if requestID != null:
+      reqNode = nodeMap.get(requestID)
+      reqNodeEntry = reqNode.get(roleId)
+      reqNodeEntry.requested--
+      if reqNodeEntry.available() :
+        availableNodeList.insert(reqNodeEntry)
+</pre></div>
+
+<ol style="list-style-type: decimal">
+  
+<li>
+<p>At end of this, there is a node in the nodemap, which has recorded that there is now an active node entry for that role. The outstanding request has been removed.</p></li>
+  
+<li>
+<p>If a callback comes in for which there is no outstanding request, it is rejected (logged, ignored, etc). This handles duplicate responses as well as any other sync problem.</p></li>
+  
+<li>
+<p>The node selected for the original request has its request for a role instance decremented, so that it may be viewed as available again. The node is also re-inserted into the AvailableNodes list -not at its head, but at its position in the total ordering of the list.</p></li>
+</ol></div>
+<div class="section">
+<h3>NMClientAsync Callback: onContainerStarted()<a name="NMClientAsync_Callback:_onContainerStarted"></a></h3>
+
+<div class="source">
+<pre>onContainerStarted(ContainerId containerId)
+</pre></div>
+<p>The AM uses this as a signal to remove the container from the list of starting containers, moving it into the map of live nodes; the counters in the associated <tt>RoleInstance</tt> are updated accordingly; the node entry adjusted to indicate it has one more live node and one less starting node.</p></div>
+<div class="section">
+<h3>NMClientAsync Callback: onContainerStartFailed()<a name="NMClientAsync_Callback:_onContainerStartFailed"></a></h3>
+<p>The AM uses this as a signal to remove the container from the list of starting containers -the count of starting containers for the relevant NodeEntry is decremented. If the node is now available for instances of this container, it is returned to the queue of available nodes.</p></div>
+<div class="section">
+<h3>Flex: Releasing a role instance from the cluster<a name="Flex:_Releasing_a_role_instance_from_the_cluster"></a></h3>
+<p>Simple strategy: find a node with at least one active container</p>
+
+<div class="source">
+<pre>select a node N in nodemap where for NodeEntry[roleId]: active &gt; releasing; 
+nodeentry = node.get(roleId)
+nodeentry.active--;
+</pre></div>
+<p>Advanced Strategy:</p>
+
+<div class="source">
+<pre>Scan through the map looking for a node where active &gt;1 &amp;&amp; active &gt; releasing.
+If none are found, fall back to the previous strategy
+</pre></div>
+<p>This is guaranteed to release a container on any node with &gt;1 container in use, if such a node exists. If not, the scan time has increased to #(nodes).</p>
+<p>Once a node has been identified</p>
+
+<ol style="list-style-type: decimal">
+  
+<li>a container on it is located (via the existing container map). This container must: be of the target role, and not already be queued for release.</li>
+  
+<li>A release operation is queued to trigger a request to the RM.</li>
+  
+<li>The (existing) <tt>containersBeingReleased</tt> Map has the container inserted into it</li>
+</ol>
+<p>After the AM processes the request, it triggers a callback</p></div>
+<div class="section">
+<h3>AM callback onContainersCompleted:<a name="AM_callback_onContainersCompleted:"></a></h3>
+
+<div class="source">
+<pre>void onContainersCompleted(List&lt;ContainerStatus&gt; completedContainers)
+</pre></div>
+<p>This callback returns a list of containers that have completed.</p>
+<p>These need to be split into successful completion of a release request and containers which have failed. </p>
+<p>This is currently done by tracking which containers have been queued for release, as well as which were rejected as surplus before even having any role allocated onto them.</p>
+<p>A container is considered to have failed if it was an active container which has completed although it wasn&#x2019;t on the list of containers to release</p>
+
+<div class="source">
+<pre>shouldReview = false
+for container in completedContainers:
+  containerId = container.containerId
+  nodeId = container.nodeId
+  node = nodemap.get(nodeId)
+  if node == null :
+    // unknown node
+    continue
+  roleId = node.roleId
+  nodeentry = node.get(roleId)
+  nodeentry.active--
+  nodemap.dirty = true
+  if getContainersBeingReleased().containsKey(containerId) :
+    // handle container completion
+    nodeentry.releasing --
+
+    // update existing Slider role status
+    roleStatus[roleId].decReleasing();
+    containersBeingReleased.remove(containerId)
+  else: 
+    //failure of a live node
+    roleStatus[roleId].decActual();
+    shouldReview = true
+
+  if nodeentry.available():
+    nodeentry.last_used = now()
+    availableNodes[roleId].insert(node)      
+  //trigger a comparison of actual vs desired
+if shouldReview :
+  reviewRequestAndReleaseNodes()
+</pre></div>
+<p>By calling <tt>reviewRequestAndReleaseNodes()</tt> the AM triggers a re-evaluation of how many instances of each node a cluster has, and how many it needs. If a container has failed and that freed up all role instances on that node, it will have been inserted at the front of the <tt>availableNodes</tt> list. As a result, it is highly likely that a new container will be requested on the same node. (The only way a node in the list would be newer is if other containers were completed in the same callback.)</p></div>
+<div class="section">
+<h3>Implementation Notes<a name="Implementation_Notes"></a></h3>
+<p>Notes made while implementing the design.</p>
+<p><tt>OutstandingRequestTracker</tt> should also track requests made with no target node; this makes seeing what is going on easier. <tt>ARMClientImpl</tt> is doing something similar, on a priority-by-priority basis -if many requests are made, each with their own priority, that base class&#x2019;s hash tables may get overloaded. (it assumes a limited set of priorities)</p>
+<p>Access to the role history datastructures was restricted to avoid synchronization problems. Protected access is permitted so that a test subclass can examine (and change?) the internals.</p>
+<p><tt>NodeEntries</tt> need to add a launching value separate from active so that when looking for nodes to release, no attempt is made to release a node that has been allocated but is not yet live.</p>
+<p>We can&#x2019;t reliably map from a request to a response. Does that matter? If we issue a request for a host and it comes in on a different port, do we care? Yes -but only because we are trying to track nodes which have requests outstanding so as not to issue new ones. But if we just pop the entry off the available list, that becomes moot.</p>
+<p>Proposal: don&#x2019;t track the requesting numbers in the node entries, just in the role status fields.</p>
+<p>but: this means that we never re-insert nodes onto the available list if a node on them was requested but not satisfied.</p>
+<p>Other issues: should we place nodes on the available list as soon as all the entries have been released? I.e. Before YARN has replied</p>
+<p>RoleStats were removed -left in app state. Although the rolestats would belong here, leaving them where they were reduced the amount of change in the <tt>AppState</tt> class, so risk of something breaking.</p></div></div>
+<div class="section">
+<h2>MiniYARNCluster node IDs<a name="MiniYARNCluster_node_IDs"></a></h2>
+<p>Mini YARN cluster NodeIDs all share the same hostname, at least when running against file://; so mini tests with &gt;1 NM don&#x2019;t have a 1:1 mapping of <tt>NodeId:NodeInstance</tt>. What will happen is that <tt>NodeInstance getOrCreateNodeInstance(Container container)</tt>
+will always return the same (now shared) <tt>NodeInstance</tt>.</p></div>
+<div class="section">
+<h2>Releasing Containers when shrinking a cluster<a name="Releasing_Containers_when_shrinking_a_cluster"></a></h2>
+<p>When identifying instances to release in a bulk downscale operation, the full list of targets must be identified together. This is not just to eliminate multiple scans of the data structures, but because the containers are not released until the queued list of actions are executed -the nodes&#x2019; release-in-progress counters will not be incremented until after all the targets have been identified.</p>
+<p>It also needs to handle the scenario where there are many role instances on a single server -it should prioritize those. </p>
+<p>The NodeMap/NodeInstance/NodeEntry structure is adequate for identifying nodes, at least provided there is a 1:1 mapping of hostname to NodeInstance. But it is not enough to track containers in need of release: the AppState needs to be able to work backwards from a NodeEntry to container(s) stored there.</p>
+<p>The <tt>AppState</tt> class currently stores this data in a <tt>ConcurrentMap&lt;ContainerId, RoleInstance&gt;</tt></p>
+<p>To map from NodeEntry/NodeInstance to containers to delete, means that either a new datastructure is created to identify containers in a role on a specific host (e.g a list of ContainerIds under each NodeEntry), or we add an index reference in a RoleInstance that identifies the node. We already effectively have that in the container</p>
+<div class="section">
+<h3>dropping any available nodes that are busy<a name="dropping_any_available_nodes_that_are_busy"></a></h3>
+<p>When scanning the available list, any nodes that are no longer idle for that role should be dropped from the list.</p>
+<p>This can happen when an instance was allocated on a different node from that requested.</p></div>
+<div class="section">
+<h3>Finding a node when a role has instances in the cluster but nothing
+known to be available<a name="Finding_a_node_when_a_role_has_instances_in_the_cluster_but_nothing"></a></h3>
+<p>One condition found during testing is the following: </p>
+
+<ol style="list-style-type: decimal">
+  
+<li>A role has one or more instances running in the cluster</li>
+  
+<li>A role has no entries in its available list: there is no history of the role ever being on nodes other than those which are currently in use.</li>
+  
+<li>A new instance is requested.</li>
+</ol>
+<p>In this situation, the <tt>findNodeForNewInstance</tt> method returns null: there is no recommended location for placement. However, this is untrue: all nodes in the cluster <tt>other</tt> than those in use are the recommended nodes. </p>
+<p>It would be possible to build up a list of all known nodes in the cluster that are not running this role and use that in the request, effectively telling the AM to pick one of the idle nodes. By not doing so, we increase the probability that another instance of the same role will be allocated on a node in use, a probability which (were there capacity on these nodes and placement random) would be <tt>1/(clustersize-roleinstances)</tt>. The smaller the cluster and the bigger the application, the higher the risk.</p>
+<p>This could be revisited, if YARN does not support anti-affinity between new requests at a given priority and existing ones: the solution would be to issue a relaxed placement request listing all nodes that are in the NodeMap and which are not running an instance of the specific role. [To be even more rigorous, the request would have to omit those nodes for which an allocation has already been made off the available list and yet for which no container has yet been granted]. </p></div></div>
+<div class="section">
+<h2>Reworked Outstanding Request Tracker<a name="Reworked_Outstanding_Request_Tracker"></a></h2>
+<p>The reworked request tracker behaves as follows</p>
+
+<ol style="list-style-type: decimal">
+  
+<li>outstanding requests with specific placements are tracked by <tt>(role, hostname)</tt></li>
+  
+<li>container assignments are attempted to be resolved against the same parameters.</li>
+  
+<li>If found: that request is considered satisfied <i>irrespective</i> of whether or not the request that satisfied the allocation was the one that requested that location.</li>
+  
+<li>When all instances of a specific role have been allocated, the hostnames of all outstanding requests are returned to the available node list on the basis that they have been satisfied elsewhere in the YARN cluster. This list is then sorted.</li>
+</ol>
+<p>This strategy returns unused hosts to the list of possible hosts, while retaining the ordering of that list in most-recent-first.</p>
+<div class="section">
+<h3>Weaknesses<a name="Weaknesses"></a></h3>
+<p>If one or more container requests cannot be satisfied, then all the hosts in the set of outstanding requests will be retained, so all these hosts will be considered unavailable for new location-specific requests. This may imply that new requests that could be explicitly placed will now only be randomly placed -however, it is moot on the basis that if there are outstanding container requests it means the RM cannot grant resources: new requests at the same priority (i.e. same Slider Role ID) will not be granted either.</p>
+<p>The only scenario where this would be different is if the resource requirements of instances of the target role were decreased during a cluster flex such that the placement could now be satisfied on the target host. This is not considered a significant problem.</p>
+<h2>Persistence<a name="Persistence"></a></h2>
+<p>The initial implementation uses the JSON-formatted Avro format; while significantly less efficient than a binary format, it is human-readable</p>
+<p>Here is a sequence of entries from a test run on a single node cluster, running 1 HBase Master and two region servers.</p>
+<p>Initial save; the instance of Role 1 (HBase master) is live, Role 2 (RS) is not.</p>
+
+<div class="source">
+<pre>{&quot;entry&quot;:{&quot;org.apache.hoya.avro.RoleHistoryHeader&quot;:{&quot;version&quot;:1,&quot;saved&quot;:1384183475949,&quot;savedx&quot;:&quot;14247c3aeed&quot;,&quot;roles&quot;:3}}}
+{&quot;entry&quot;:{&quot;org.apache.hoya.avro.NodeEntryRecord&quot;:{&quot;host&quot;:&quot;192.168.1.85&quot;,&quot;role&quot;:1,&quot;active&quot;:true,&quot;last_used&quot;:0}}}
+{&quot;entry&quot;:{&quot;org.apache.hoya.avro.NodeEntryRecord&quot;:{&quot;host&quot;:&quot;192.168.1.85&quot;,&quot;role&quot;:2,&quot;active&quot;:false,&quot;last_used&quot;:0}}}
+</pre></div>
+<p>At least one RS is live: </p>
+
+<div class="source">
+<pre>{&quot;entry&quot;:{&quot;org.apache.hoya.avro.RoleHistoryFooter&quot;:{&quot;count&quot;:2}}}{&quot;entry&quot;:{&quot;org.apache.hoya.avro.RoleHistoryHeader&quot;:{&quot;version&quot;:1,&quot;saved&quot;:1384183476010,&quot;savedx&quot;:&quot;14247c3af2a&quot;,&quot;roles&quot;:3}}}
+{&quot;entry&quot;:{&quot;org.apache.hoya.avro.NodeEntryRecord&quot;:{&quot;host&quot;:&quot;192.168.1.85&quot;,&quot;role&quot;:1,&quot;active&quot;:true,&quot;last_used&quot;:0}}}
+{&quot;entry&quot;:{&quot;org.apache.hoya.avro.NodeEntryRecord&quot;:{&quot;host&quot;:&quot;192.168.1.85&quot;,&quot;role&quot;:2,&quot;active&quot;:true,&quot;last_used&quot;:0}}}
+</pre></div>
+<p>Another entry is saved -presumably the second RS is now live, which triggered another write</p>
+
+<div class="source">
+<pre>{&quot;entry&quot;:{&quot;org.apache.hoya.avro.RoleHistoryFooter&quot;:{&quot;count&quot;:2}}}{&quot;entry&quot;:{&quot;org.apache.hoya.avro.RoleHistoryHeader&quot;:{&quot;version&quot;:1,&quot;saved&quot;:1384183476028,&quot;savedx&quot;:&quot;14247c3af3c&quot;,&quot;roles&quot;:3}}}
+{&quot;entry&quot;:{&quot;org.apache.hoya.avro.NodeEntryRecord&quot;:{&quot;host&quot;:&quot;192.168.1.85&quot;,&quot;role&quot;:1,&quot;active&quot;:true,&quot;last_used&quot;:0}}}
+{&quot;entry&quot;:{&quot;org.apache.hoya.avro.NodeEntryRecord&quot;:{&quot;host&quot;:&quot;192.168.1.85&quot;,&quot;role&quot;:2,&quot;active&quot;:true,&quot;last_used&quot;:0}}}
+</pre></div>
+<p>At this point the cluster was frozen and thawed. Slider does not save the cluster state at freeze time, but does as it is rebuilt.</p>
+<p>When the cluster is restarted, every node that was active for a role at the time the file was saved <tt>1384183476028</tt> is given a last_used timestamp of that time. </p>
+<p>When the history is next saved, the master has come back onto the (single) node, it is active while its <tt>last_used</tt> timestamp is the previous file&#x2019;s timestamp. No region servers are yet live.</p>
+
+<div class="source">
+<pre>{&quot;entry&quot;:{&quot;org.apache.hoya.avro.RoleHistoryFooter&quot;:{&quot;count&quot;:2}}}{&quot;entry&quot;:{&quot;org.apache.hoya.avro.RoleHistoryHeader&quot;:{&quot;version&quot;:1,&quot;saved&quot;:1384183512173,&quot;savedx&quot;:&quot;14247c43c6d&quot;,&quot;roles&quot;:3}}}
+{&quot;entry&quot;:{&quot;org.apache.hoya.avro.NodeEntryRecord&quot;:{&quot;host&quot;:&quot;192.168.1.85&quot;,&quot;role&quot;:1,&quot;active&quot;:true,&quot;last_used&quot;:1384183476028}}}
+{&quot;entry&quot;:{&quot;org.apache.hoya.avro.NodeEntryRecord&quot;:{&quot;host&quot;:&quot;192.168.1.85&quot;,&quot;role&quot;:2,&quot;active&quot;:false,&quot;last_used&quot;:1384183476028}}}
+</pre></div>
+<p>Here a region server is live</p>
+
+<div class="source">
+<pre>{&quot;entry&quot;:{&quot;org.apache.hoya.avro.RoleHistoryFooter&quot;:{&quot;count&quot;:2}}}{&quot;entry&quot;:{&quot;org.apache.hoya.avro.RoleHistoryHeader&quot;:{&quot;version&quot;:1,&quot;saved&quot;:1384183512199,&quot;savedx&quot;:&quot;14247c43c87&quot;,&quot;roles&quot;:3}}}
+{&quot;entry&quot;:{&quot;org.apache.hoya.avro.NodeEntryRecord&quot;:{&quot;host&quot;:&quot;192.168.1.85&quot;,&quot;role&quot;:1,&quot;active&quot;:true,&quot;last_used&quot;:1384183476028}}}
+{&quot;entry&quot;:{&quot;org.apache.hoya.avro.NodeEntryRecord&quot;:{&quot;host&quot;:&quot;192.168.1.85&quot;,&quot;role&quot;:2,&quot;active&quot;:true,&quot;last_used&quot;:1384183476028}}}
+</pre></div>
+<p>And here, another region server has started. This does not actually change the contents of the file</p>
+
+<div class="source">
+<pre>{&quot;entry&quot;:{&quot;org.apache.hoya.avro.RoleHistoryFooter&quot;:{&quot;count&quot;:2}}}{&quot;entry&quot;:{&quot;org.apache.hoya.avro.RoleHistoryHeader&quot;:{&quot;version&quot;:1,&quot;saved&quot;:1384183512217,&quot;savedx&quot;:&quot;14247c43c99&quot;,&quot;roles&quot;:3}}}
+{&quot;entry&quot;:{&quot;org.apache.hoya.avro.NodeEntryRecord&quot;:{&quot;host&quot;:&quot;192.168.1.85&quot;,&quot;role&quot;:1,&quot;active&quot;:true,&quot;last_used&quot;:1384183476028}}}
+{&quot;entry&quot;:{&quot;org.apache.hoya.avro.NodeEntryRecord&quot;:{&quot;host&quot;:&quot;192.168.1.85&quot;,&quot;role&quot;:2,&quot;active&quot;:true,&quot;last_used&quot;:1384183476028}}}
+</pre></div>
+<p>The <tt>last_used</tt> timestamps will not be changed until the cluster is shrunk or thawed, as the <tt>active</tt> flag being set implies that the server is running both roles at the save time of <tt>1384183512217</tt>.</p></div></div>
+<div class="section">
+<h2>Resolved issues<a name="Resolved_issues"></a></h2>
+
+<blockquote>
+<p>How best to distinguish at thaw time nodes used just before thawing from nodes used some period before? Should the RoleHistory simply forget about nodes which are older than some threshold when reading in the history?</p>
+</blockquote>
+<p>we just track last used times</p>
+
+<blockquote>
+<p>Is there a way to avoid tracking the outstanding requests?</p>
+</blockquote>
+<p>No </p>
+
+<blockquote>
+<p>What will the strategy of picking the most-recently-used node do if that node creates the container and then fails to start it up. Do we need to add blacklisting too? Or actually monitor the container start time, and if a container hasn&#x2019;t been there for very long, don&#x2019;t pick it.</p>
+</blockquote>
+<p>Startup failures drop the node from the ready-to-use list; the node is no longer trusted. We don&#x2019;t blacklist it (yet)</p>
+
+<blockquote>
+<p>Should we prioritise a node that was used for a long session ahead of a node that was used more recently for a shorter session? Maybe, but it complicates selection as generating a strict order of nodes gets significantly harder.</p>
+</blockquote>
+<p>No: you need to start tracking aggregate execution time, for the last session. In a stable state, all servers recorded in the history will have spread the data amongst them, so it&#x2019;s irrelevant.</p></div>
+                  </div>
+          </div>
+
+    <hr/>
+
+    <footer>
+            <div class="container">
+              <div class="row span12">Copyright &copy;                    2014
+                        <a href="http://www.apache.org/">The Apache Software Foundation</a>.
+            All Rights Reserved.      
+                    
+      </div>
+
+                                                                  <?xml version="1.0" encoding="UTF-8"?>
+<div class="row-fluid">Apache Slider, Slider, Apache, and the Apache Incubator logo are trademarks of The Apache Software Foundation.</div>
+                  
+                <p id="poweredBy" class="pull-right">
+                          <a href="http://maven.apache.org/" title="Built by Maven" class="poweredBy">
+        <img class="builtBy" alt="Built by Maven" src="../images/logos/maven-feather.png" />
+      </a>
+              </p>
+        
+                </div>
+    </footer>
+  </body>
+</html>

Propchange: incubator/slider/site/content/architecture/rolehistory.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/slider/site/content/client-configuration.html
URL: http://svn.apache.org/viewvc/incubator/slider/site/content/client-configuration.html?rev=1598665&view=auto
==============================================================================
--- incubator/slider/site/content/client-configuration.html (added)
+++ incubator/slider/site/content/client-configuration.html Fri May 30 16:47:13 2014
@@ -0,0 +1,396 @@
+<!DOCTYPE html>
+<!--
+ | Generated by Apache Maven Doxia at 2014-05-30
+ | Rendered using Apache Maven Fluido Skin 1.3.0
+-->
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <meta name="Date-Revision-yyyymmdd" content="20140530" />
+    <meta http-equiv="Content-Language" content="en" />
+    <title>Apache Slider 0.30 (incubating) - Client Configuration</title>
+    <link rel="stylesheet" href="./css/apache-maven-fluido-1.3.0.min.css" />
+    <link rel="stylesheet" href="./css/site.css" />
+    <link rel="stylesheet" href="./css/print.css" media="print" />
+
+      
+    <script type="text/javascript" src="./js/apache-maven-fluido-1.3.0.min.js"></script>
+
+    
+            </head>
+        <body class="topBarEnabled">
+          
+                        
+                    
+                
+
+    <div id="topbar" class="navbar navbar-fixed-top ">
+      <div class="navbar-inner">
+                                  <div class="container"><div class="nav-collapse">
+            
+                
+                                <ul class="nav">
+                          <li class="dropdown">
+        <a href="#" class="dropdown-toggle" data-toggle="dropdown">Project Documentation <b class="caret"></b></a>
+        <ul class="dropdown-menu">
+        
+                      <li class="dropdown-submenu">
+                                      <a href="project-info.html"  title="Project Information">Project Information</a>
+              <ul class="dropdown-menu">
+                                  <li>      <a href="index.html"  title="About">About</a>
+</li>
+                                  <li>      <a href="plugin-management.html"  title="Plugin Management">Plugin Management</a>
+</li>
+                                  <li>      <a href="distribution-management.html"  title="Distribution Management">Distribution Management</a>
+</li>
+                                  <li>      <a href="dependency-info.html"  title="Dependency Information">Dependency Information</a>
+</li>
+                                  <li>      <a href="dependency-convergence.html"  title="Dependency Convergence">Dependency Convergence</a>
+</li>
+                                  <li>      <a href="source-repository.html"  title="Source Repository">Source Repository</a>
+</li>
+                                  <li>      <a href="mail-lists.html"  title="Mailing Lists">Mailing Lists</a>
+</li>
+                                  <li>      <a href="issue-tracking.html"  title="Issue Tracking">Issue Tracking</a>
+</li>
+                                  <li>      <a href="integration.html"  title="Continuous Integration">Continuous Integration</a>
+</li>
+                                  <li>      <a href="plugins.html"  title="Project Plugins">Project Plugins</a>
+</li>
+                                  <li>      <a href="license.html"  title="Project License">Project License</a>
+</li>
+                                  <li>      <a href="modules.html"  title="Project Modules">Project Modules</a>
+</li>
+                                  <li>      <a href="dependency-management.html"  title="Dependency Management">Dependency Management</a>
+</li>
+                                  <li>      <a href="team-list.html"  title="Project Team">Project Team</a>
+</li>
+                                  <li>      <a href="project-summary.html"  title="Project Summary">Project Summary</a>
+</li>
+                                  <li>      <a href="dependencies.html"  title="Dependencies">Dependencies</a>
+</li>
+                              </ul>
+            </li>
+                  
+                      <li class="dropdown-submenu">
+                                      <a href="project-reports.html"  title="Project Reports">Project Reports</a>
+              <ul class="dropdown-menu">
+                                  <li>      <a href="surefire-report.html"  title="Surefire Report">Surefire Report</a>
+</li>
+                              </ul>
+            </li>
+                          </ul>
+      </li>
+                <li class="dropdown">
+        <a href="#" class="dropdown-toggle" data-toggle="dropdown">Documents <b class="caret"></b></a>
+        <ul class="dropdown-menu">
+        
+                      <li>      <a href="getting_started.html"  title="Getting Started">Getting Started</a>
+</li>
+                  
+                      <li>      <a href="manpage.html"  title="manpage">manpage</a>
+</li>
+                  
+                      <li>      <a href="troubleshooting.html"  title="Troubleshooting">Troubleshooting</a>
+</li>
+                  
+                      <li>      <a href="architecture/index.html"  title="Architecture">Architecture</a>
+</li>
+                  
+                      <li>      <a href="developing/index.html"  title="Developing">Developing</a>
+</li>
+                  
+                      <li>      <a href="exitcodes.html"  title="Exitcodes">Exitcodes</a>
+</li>
+                          </ul>
+      </li>
+                  </ul>
+          
+          
+                                                              
+                   
+                      </div>
+          
+        </div>
+      </div>
+    </div>
+    
+        <div class="container">
+          <div id="banner">
+        <div class="pull-left">
+                                                  <a href="./" id="bannerLeft">
+                <h2>Apache Slider (incubating)</h2>
+                </a>
+                      </div>
+        <div class="pull-right">              <div id="bannerRight">
+                                                                                        <img src="http://incubator.apache.org/images/apache-incubator-logo.png" alt="Apache Incubator" />
+                </div>
+      </div>
+        <div class="clear"><hr/></div>
+      </div>
+
+      <div id="breadcrumbs">
+        <ul class="breadcrumb">
+                
+                    
+                  <li id="publishDate">Last Published: 2014-05-30</li>
+                      
+                
+                    
+                 <li id="projectVersion" class="pull-right">Version: 0.30</li>
+      
+                            </ul>
+      </div>
+
+      
+                
+        <div id="bodyColumn" >
+                                  
+            <!-- -
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. --><h1>Apache Slider Client Configuration</h1>
+<p>This document covers how the client application is itself configured.</p>
+<div class="section">
+<h2>Summary<a name="Summary"></a></h2>
+<p>The client application can be configured</p>
+
+<ol style="list-style-type: decimal">
+  
+<li>On the command line, which can set client options and JVM system properties.</li>
+  
+<li>With Hadoop-style configuration options in the file <tt>slider-client.xml</tt> in the configuration directory <tt>conf/</tt>.</li>
+  
+<li>Or, if the environment variable <tt>SLIDER_CONF_DIR</tt> is set, in the  file <tt>$SLIDER_CONF_DIR/slider-client.xml</tt></li>
+  
+<li>Logging is defined in the <tt>log4j.properties</tt> file in the same configuration directory.</li>
+  
+<li>JVM options can be defined in the environment variable <tt>SLIDER_JVM_OPTS</tt>.</li>
+</ol>
+<p>The options defined in a Slider cluster configuration are only used by the client when creating a cluster -not for the actual client itself.</p></div>
+<div class="section">
+<h2>Introduction<a name="Introduction"></a></h2>
+<p>The Slider client needs to be configured to talk to a Hadoop filesystem and a YARN resource manager (&#x201c;the RM&#x201d;). In a secure cluster it needs to be told the Kerberos identity, the <i>principal</i> of both the HDFS namenode and the YARN RM -and it may also need some JVM options set in order for Java&#x2019;s Kerberos module to correctly identify itself to these services.</p>
+<p>It cannot rely on local <tt>$HADOOP_PREFIX/conf/hadoop-site.xml</tt> and <tt>$YARN_PREFIX/conf/yarn-site.xml</tt> files -because it is designed to work on client machines that may not have Hadoop and YARN installed.</p>
+<p>Instead all client-side (non-JVM) options can be predefined in the configuration file <tt>slider-client.xml</tt>. </p></div>
+<div class="section">
+<h2>Setting Slider JVM options<a name="Setting_Slider_JVM_options"></a></h2>
+<p>Core JVM options can be set in the environment variable <tt>SLIDER_JVM_OPTS</tt>; if unset the <tt>bin/slider</tt> script will use the default values that were current when that version of Slider was released. These values may change across versions, and may in fact be different from those listed here.</p>
+<p>At the time of writing, the default values were:</p>
+
+<div class="source">
+<pre>&quot;-Djava.net.preferIPv4Stack=true -Djava.awt.headless=true -Xmx256m -Dslider.confdir=${confdir}&quot;
+</pre></div>
+<p>To allow some Java system properties to be set without editing this environment variable, such system properties may be set on the Slider command line through the <tt>-S</tt> parameter. For example, the following two operations are equivalent in terms of setting the system property <tt>java.security.krb5.realm</tt> to the value <tt>LOCAL</tt>.</p>
+
+<div class="source">
+<pre>export SLIDER_JVM_OPTS=&quot;-Djava.security.krb5.realm=LOCAL&quot;
+</pre></div>
+<p>and</p>
+
+<div class="source">
+<pre>slider -S java.security.krb5.realm=LOCAL
+</pre></div>
+<p>Note that the first declaration invalidates all default JVM options; if any of those were desired, they should be included in the new definition.</p>
+<p>Multiple system property declarations are allowed on the command line -including duplicate declarations. In such a case the order of assignment is undefined.</p>
+<p>For any system property that the user expects to have to issue on every command -including any kerberos-related properties, adding them to the JVM options environment variable guarantees that they are always set.</p></div>
+<div class="section">
+<h2>Setting Slider client options on the command line with the <tt>-D</tt> parameter<a name="Setting_Slider_client_options_on_the_command_line_with_the_-D_parameter"></a></h2>
+<p>The Slider client is configured via Hadoop-style configuration options. To be precise, all standard hadoop-common, hadoop-hdfs client and hadoop-yarn client-side options control how Slider communicates with the Hadoop YARN cluster.</p>
+<p>There are extra options specific to Slider itself, options which are again set as Hadoop configuration parameters.</p>
+<p>All Hadoop and Slider options can be set on the command line using the <tt>-D</tt> parameter followed by the appropriate <tt>key=value</tt> argument.</p>
+<p>For example, here is a definition of the default Hadoop filesystem:</p>
+
+<div class="source">
+<pre>-D fs.defaultFS=hdfs://namenode:9000
+</pre></div>
+<p>Multiple definitions are of course allowed on the command line </p>
+
+<div class="source">
+<pre>-D fs.defaultFS=hdfs://namenode:9000 -D dfs.namenode.kerberos.principal=hdfs/namenode@LOCAL
+</pre></div>
+<p>Slider-specific options can be made the same way</p>
+
+<div class="source">
+<pre>-D slider.kerberos.principal=
+</pre></div>
+<p>If duplicate declarations are made the order of assignment is undefined.</p>
+<h2>Setting common options through specific command-line arguments</h2>
+<p>Some Hadoop and Slider options are so common that they have specific shortcut commands to aid their use</p>
+<p><tt>-m</tt>, <tt>--manager</tt> : sets the YARN resource manager. Equivalent to setting the <tt>yarn.resourcemanager.address</tt> option</p>
+<p><tt>--fs</tt>, <tt>--filesystem</tt>: defines the filesystem. Equivalent to setting the <tt>fs.defaultFS</tt> option</p>
+<p>If these shortcuts are used and the options are also defined via <tt>-D</tt> declarations, the order of assignment is undefined.</p>
+<h2>Defining Hadoop and Slider Options in the <tt>slider-client.xml</tt> file</h2>
+<p>In the Slider installation, alongside the <tt>bin/slider</tt> script is a configuration directory <tt>conf</tt>. This contains the files:</p>
+
+<ol style="list-style-type: decimal">
+  
+<li><tt>log4j.properties</tt></li>
+  
+<li><tt>slider-client.xml</tt></li>
+</ol>
+<p>The <tt>log4j.properties</tt> file is not covered here -it is a standard Log4J file. At the time of writing, this log configuration file is used on both the client and the server.</p>
+<p>The <tt>slider-client.xml</tt> file is a Hadoop-formatted XML options file, which is read by the Slider client -but not by the Slider Application Master.</p>
+<p>Here is an example file:</p>
+
+<div class="source">
+<pre>&lt;property&gt;
+  &lt;name&gt;yarn.resourcemanager.address&lt;/name&gt;
+  &lt;value&gt;namenode:8033&lt;/value&gt;
+&lt;/property&gt;
+
+&lt;property&gt;
+  &lt;name&gt;fs.defaultFS&lt;/name&gt;
+  &lt;value&gt;hdfs://namenode:9000&lt;/value&gt;
+&lt;/property&gt;
+
+&lt;property&gt;
+  &lt;name&gt;ipc.client.fallback-to-simple-auth-allowed&lt;/name&gt;
+  &lt;value&gt;false&lt;/value&gt;
+&lt;/property&gt;
+</pre></div>
+<p>This defines both the filesystem and the YARN RM, and so obviates the need to declare either on the command line.</p>
+<p>If an option is defined in the <tt>slider-client.xml</tt> file and on the command line -be it by a <tt>-D key=value</tt> declaration or a <tt>--manager</tt> or <tt>--filesystem</tt> definition- the command-line value takes precedence (this holds even if the value is declared with <tt>&lt;final&gt;true&lt;/final&gt;</tt>).</p></div>
+<div class="section">
+<h2>Selecting an alternate Slider configuration directory<a name="Selecting_an_alternate_Slider_configuration_directory"></a></h2>
+<p>The environment variable <tt>SLIDER_CONF_DIR</tt> can be used to declare an alternate configuration directory. If set, the directory it identifies will be used as the source of the <tt>log4j.properties</tt> and <tt>slider-client.xml</tt> files.</p></div>
+<div class="section">
+<h2>Slider Client Configuration options<a name="Slider_Client_Configuration_options"></a></h2>
+<p>As well as standard YARN and Hadoop configuration options, Slider supports a limited number of slider-specific configuration parameters.</p>
+
+<div class="source">
+<pre>&lt;property&gt;
+  &lt;name&gt;slider.zookeeper.quorum&lt;/name&gt;
+  &lt;value&gt;localhost:2181,zookeeper2:4545&lt;/value&gt;
+&lt;/property&gt;
+
+&lt;property&gt;
+  &lt;name&gt;slider.yarn.queue&lt;/name&gt;
+  &lt;value&gt;default&lt;/value&gt;
+&lt;/property&gt;
+
+&lt;property&gt;
+  &lt;name&gt;slider.security.enabled&lt;/name&gt;
+  &lt;value&gt;false&lt;/value&gt;
+&lt;/property&gt;
+
+&lt;property&gt;
+  &lt;name&gt;slider.yarn.queue&lt;/name&gt;
+  &lt;value&gt;default&lt;/value&gt;
+&lt;/property&gt;
+
+&lt;property&gt;
+  &lt;name&gt;slider.yarn.queue.priority&lt;/name&gt;
+  &lt;value&gt;1&lt;/value&gt;
+&lt;/property&gt;
+
+&lt;property&gt;
+  &lt;name&gt;slider.yarn.restart.limit&lt;/name&gt;
+  &lt;value&gt;5&lt;/value&gt;
+  &lt;description&gt;How many times to start/restart the Slider AM&lt;/description&gt;
+&lt;/property&gt;
+
+&lt;property&gt;
+  &lt;name&gt;slider.cluster.directory.permissions&lt;/name&gt;
+  &lt;value&gt;750&lt;/value&gt;
+&lt;/property&gt;
+
+&lt;property&gt;
+  &lt;name&gt;slider.data.directory.permissions&lt;/name&gt;
+  &lt;value&gt;750&lt;/value&gt;
+&lt;/property&gt;
+</pre></div>
+<div class="section">
+<h3><tt>slider.zookeeper.quorum</tt> - the zookeeper quorum.<a name="slider.zookeeper.quorum_-_the_zookeeper_quorum."></a></h3>
+<p>This defines the zookeeper quorum for this YARN cluster. </p>
+<p>It is used to locate the service registry, enable running instances to publish information about their application, and for clients to query this. </p>
+<p>It is also used as the default zookeeper binding for any application that uses zookeeper in its configuration -the value set when the application is defined will be copied into the instance definition file.</p></div>
+<div class="section">
+<h3><tt>&quot;slider.registry.path&quot;</tt> - the zookeeper path for the service registry<a name="aslider.registry.path_-_the_zookeeper_path_for_the_service_registry"></a></h3>
+<p>This declares the zookeeper path for the service registry. </p></div>
+<div class="section">
+<h3><tt>slider.security.enabled</tt> - enable security.<a name="slider.security.enabled_-_enable_security."></a></h3>
+<p>This turns security on; consult <a href="security.html">Security</a> for more information.</p></div>
+<div class="section">
+<h3><tt>slider.yarn.restart.limit</tt> - set limit on Application Master Restarts<a name="slider.yarn.restart.limit_-_set_limit_on_Application_Master_Restarts"></a></h3>
+<p>This limits how many times YARN should start a failed application master.</p>
+<p>A short restart limit is useful when initially creating a cluster, as it ensures that YARN does not repeatedly try to restart a failing application.</p>
+<p>In production, however, a large number prevents YARN from halting a Slider application merely because failures in the underlying YARN cluster have triggered restarts.</p>
+<p><i>Important:</i> The cluster-wide limit of <tt>yarn.resourcemanager.am.max-attempts</tt> places an upper limit on the number of retries that any application can request. If the application fails after fewer restarts than requested, check this cluster setting.</p></div>
+<div class="section">
+<h3><tt>slider.yarn.queue</tt> - the name of the YARN queue for the cluster.<a name="slider.yarn.queue_-_the_name_of_the_YARN_queue_for_the_cluster."></a></h3>
+<p>This identifies the queue to submit the application creation request to, which can define the priority, resource limits and other values of an application. All containers created in the Slider cluster will share this same queue.</p>
+<p>Default value: <tt>default</tt>.</p></div>
+<div class="section">
+<h3><tt>slider.yarn.queue.priority</tt> - the priority of the application within the YARN queue.<a name="slider.yarn.queue.priority_-_the_name_of_the_YARN_queue_for_the_cluster."></a></h3>
+<p>This identifies the priority within the queue. The lower the value, the higher the priority.</p>
+<p>Default value: <tt>1</tt>.</p>
+
+<div class="source">
+<pre>bin/slider thaw cl1 -D slider.yarn.queue.priority=5
+</pre></div>
+<div class="section">
+<h4><tt>slider.cluster.directory.permissions</tt><a name="slider.cluster.directory.permissions"></a></h4>
+<p>An octal-format (<tt>chmod</tt>-style) permissions mask for the directory that contains the cluster specification <tt>${user.home}/.slider/clusters/${clustername}</tt></p>
+
+<div class="source">
+<pre>&lt;property&gt;
+  &lt;name&gt;slider.cluster.directory.permissions&lt;/name&gt;
+  &lt;value&gt;750&lt;/value&gt;
+&lt;/property&gt;
+</pre></div></div>
+<div class="section">
+<h4><tt>slider.data.directory.permissions</tt><a name="slider.data.directory.permissions"></a></h4>
+<p>An octal-format (<tt>chmod</tt>-style) permissions mask for the directory that contains the application data <tt>${user.home}/.slider/clusters/${clustername}/database</tt></p>
+
+<div class="source">
+<pre>&lt;property&gt;
+  &lt;name&gt;slider.data.directory.permissions&lt;/name&gt;
+  &lt;value&gt;750&lt;/value&gt;
+&lt;/property&gt;
+</pre></div></div></div></div>
+<div class="section">
+<h2>Debugging configuration issues<a name="Debugging_configuration_issues"></a></h2>
+<p>If the slider packages are set to log at debug level in the log4j configuration file, details on properties will be part of the copious output.</p></div>
+<div class="section">
+<h2>How client options are passed down to created clusters.<a name="How_client_options_are_passed_down_to_created_clusters."></a></h2>
+<p>Apart from the filesystem bindings, client configuration options are not passed down to the XML site specification of the created cluster.</p>
+<p>The sole options passed down are the HDFS bindings: <tt>fs.defaultFS</tt>, which is passed down both as that property and as <tt>fs.default.name</tt>, and, in a secure cluster, the security flag (<tt>slider.security.enabled</tt>) and the HDFS Kerberos principal.</p></div>
+                  </div>
+          </div>
+
+    <hr/>
+
+    <footer>
+            <div class="container">
+              <div class="row span12">Copyright &copy;                    2014
+                        <a href="http://www.apache.org/">The Apache Software Foundation</a>.
+            All Rights Reserved.      
+                    
+      </div>
+
+
+<div class="row-fluid">Apache Slider, Slider, Apache, and the Apache Incubator logo are trademarks of The Apache Software Foundation.</div>
+                  
+                <p id="poweredBy" class="pull-right">
+                          <a href="http://maven.apache.org/" title="Built by Maven" class="poweredBy">
+        <img class="builtBy" alt="Built by Maven" src="./images/logos/maven-feather.png" />
+      </a>
+              </p>
+        
+                </div>
+    </footer>
+  </body>
+</html>

Propchange: incubator/slider/site/content/client-configuration.html
------------------------------------------------------------------------------
    svn:eol-style = native