You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by ni...@apache.org on 2008/09/12 16:57:53 UTC
svn commit: r694702 [4/4] - in /hadoop/core/trunk: docs/ src/contrib/hod/
src/contrib/hod/hodlib/Hod/ src/contrib/hod/hodlib/NodePools/
src/docs/src/documentation/content/xdocs/
Modified: hadoop/core/trunk/src/contrib/hod/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/CHANGES.txt?rev=694702&r1=694701&r2=694702&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/CHANGES.txt (original)
+++ hadoop/core/trunk/src/contrib/hod/CHANGES.txt Fri Sep 12 07:57:52 2008
@@ -22,6 +22,11 @@
HADOOP-4060. Modified HOD to rotate log files on the client side.
(Vinod Kumar Vavilapalli via yhemanth)
+ IMPROVEMENTS
+
+ HADOOP-4145. Add an accounting plugin (script) for HOD.
+ (Hemanth Yamijala via nigel)
+
BUG FIXES
HADOOP-4161. Fixed bug in HOD cleanup that had the potential to
Modified: hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py?rev=694702&r1=694701&r2=694702&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py Fri Sep 12 07:57:52 2008
@@ -451,8 +451,43 @@
raise Exception("Invalid state: Node pool is not initialized to delete the given job.")
return ret
+ def is_valid_account(self):
+ """Verify if the account being used to submit the job is a valid account.
+ This code looks for a file <install-dir>/bin/verify-account.
+ If the file is present, it executes the file, passing as argument
+ the account name. It returns the exit code and output from the
+ script on non-zero exit code."""
+
+ accountValidationScript = os.path.abspath('./verify-account')
+ if not os.path.exists(accountValidationScript):
+ return (0, None)
+
+ account = self.__nodePool.getAccountString()
+ exitCode = 0
+ errMsg = None
+ try:
+ accountValidationCmd = simpleCommand('Account Validation Command',\
+ '%s %s' % (accountValidationScript,
+ account))
+ accountValidationCmd.start()
+ accountValidationCmd.wait()
+ accountValidationCmd.join()
+ exitCode = accountValidationCmd.exit_code()
+ self.__log.debug('account validation script is run %d' \
+ % exitCode)
+ errMsg = None
+ if exitCode is not 0:
+ errMsg = accountValidationCmd.output()
+ except Exception, e:
+ exitCode = 0
+ self.__log.warn('Error executing account script: %s ' \
+ 'Accounting is disabled.' \
+ % get_exception_error_string())
+ self.__log.debug(get_exception_string())
+ return (exitCode, errMsg)
+
def allocate(self, clusterDir, min, max=None):
- status = 0
+ status = 0
self.__svcrgyClient = self.__get_svcrgy_client()
self.__log.debug("allocate %s %s %s" % (clusterDir, min, max))
Modified: hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py?rev=694702&r1=694701&r2=694702&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py Fri Sep 12 07:57:52 2008
@@ -252,7 +252,6 @@
self.__cfg['ringmaster']['max-master-failures'] = \
min(maxFailures, maxFailedNodes)
-
def _op_allocate(self, args):
operation = "allocate"
argLength = len(args)
@@ -313,6 +312,21 @@
return
self.__setup_cluster_logger(clusterDir)
+
+ (status, message) = self.__cluster.is_valid_account()
+ if status is not 0:
+ if message:
+ for line in message:
+ self.__log.critical("verify-account output: %s" % line)
+ self.__log.critical("Cluster cannot be allocated because account verification failed. " \
+ + "verify-account returned exit code: %s." % status)
+ self.__opCode = 4
+ return
+ else:
+ self.__log.debug("verify-account returned zero exit code.")
+ if message:
+ self.__log.debug("verify-account output: %s" % message)
+
if re.match('\d+-\d+', nodes):
(min, max) = nodes.split("-")
min = int(min)
Modified: hadoop/core/trunk/src/contrib/hod/hodlib/Hod/nodePool.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/Hod/nodePool.py?rev=694702&r1=694701&r2=694702&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/Hod/nodePool.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/Hod/nodePool.py Fri Sep 12 07:57:52 2008
@@ -116,6 +116,10 @@
"""Update information about the workers started by this NodePool."""
raise NotImplementedError
+ def getAccountString(self):
+ """Return the account string for this job"""
+ raise NotImplementedError
+
def getNextNodeSetId(self):
id = self.nextNodeSetId
self.nextNodeSetId += 1
Modified: hadoop/core/trunk/src/contrib/hod/hodlib/NodePools/torque.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/NodePools/torque.py?rev=694702&r1=694701&r2=694702&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/NodePools/torque.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/NodePools/torque.py Fri Sep 12 07:57:52 2008
@@ -51,6 +51,12 @@
self.__torque = torqueInterface(
self._cfg['resource_manager']['batch-home'], environ, self._log)
+ def getAccountString(self):
+ account = ''
+ if self._cfg['resource_manager'].has_key('pbs-account'):
+ account = self._cfg['resource_manager']['pbs-account']
+ return account
+
def __gen_submit_params(self, nodeSet, walltime = None, qosLevel = None,
account = None):
argList = []
Modified: hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_admin_guide.xml
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_admin_guide.xml?rev=694702&r1=694701&r2=694702&view=diff
==============================================================================
--- hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_admin_guide.xml (original)
+++ hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_admin_guide.xml Fri Sep 12 07:57:52 2008
@@ -351,6 +351,37 @@
it is better not to run this inside a tight loop without sleeping.</p>
</section>
</section>
+
+ <section>
+ <title>verify-account - Script to verify an account under which
+ jobs are submitted</title>
+ <p>Production systems use accounting packages to charge users for using
+ shared compute resources. HOD supports a parameter
+ <em>resource_manager.pbs-account</em> to allow users to identify the
+ account under which they would like to submit jobs. It may be necessary
+ to verify that this account is a valid one configured in an accounting
+ system. The <em>hod-install-dir/bin/verify-account</em> script
+ provides a mechanism to plug in a custom script that can do this
+ verification.</p>
+
+ <section>
+ <title>Integrating the verify-account script with HOD</title>
+ <p>HOD runs the <em>verify-account</em> script passing in the
+ <em>resource_manager.pbs-account</em> value as argument to the script,
+ before allocating a cluster. Sites can write a script that verifies this
+ account against their accounting systems. Returning a non-zero exit
+ code from this script will cause HOD to fail allocation. Also, in
+ case of an error, HOD will print the output of the script to the user.
+ Any descriptive error message can be passed to the user from the
+ script in this manner.</p>
+ <p>The default script that comes with the HOD installation does not
+ do any validation, and returns a zero exit code.</p>
+ <p>If the verify-account script is not found, then HOD will treat
+ verification as disabled, and continue allocation as is.</p>
+ </section>
+ </section>
+
</section>
+
</body>
</document>
Modified: hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_user_guide.xml
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_user_guide.xml?rev=694702&r1=694701&r2=694702&view=diff
==============================================================================
--- hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_user_guide.xml (original)
+++ hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_user_guide.xml Fri Sep 12 07:57:52 2008
@@ -412,7 +412,8 @@
<td> 5 </td>
<td> Job execution failure </td>
<td> 1. Torque Job was deleted from outside. Execute the Torque <code>qstat</code> command to see if you have any jobs in the <code>R</code> (Running) state. If none exist, try re-executing HOD. <br />
- 2. Torque problems such as the server momentarily going down, or becoming unresponsive. Contact system administrator. </td>
+ 2. Torque problems such as the server momentarily going down, or becoming unresponsive. Contact system administrator. <br/>
+ 3. The system administrator might have configured account verification, and an invalid account is specified. Contact system administrator.</td>
</tr>
<tr>
<td> 6 </td>