You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by ni...@apache.org on 2008/09/12 16:57:53 UTC

svn commit: r694702 [4/4] - in /hadoop/core/trunk: docs/ src/contrib/hod/ src/contrib/hod/hodlib/Hod/ src/contrib/hod/hodlib/NodePools/ src/docs/src/documentation/content/xdocs/

Modified: hadoop/core/trunk/src/contrib/hod/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/CHANGES.txt?rev=694702&r1=694701&r2=694702&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/CHANGES.txt (original)
+++ hadoop/core/trunk/src/contrib/hod/CHANGES.txt Fri Sep 12 07:57:52 2008
@@ -22,6 +22,11 @@
     HADOOP-4060. Modified HOD to rotate log files on the client side.
     (Vinod Kumar Vavilapalli via yhemanth)
 
+  IMPROVEMENTS
+
+    HADOOP-4145. Add an accounting plugin (script) for HOD.
+    (Hemanth Yamijala via nigel)
+
   BUG FIXES
 
     HADOOP-4161. Fixed bug in HOD cleanup that had the potential to

Modified: hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py?rev=694702&r1=694701&r2=694702&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py Fri Sep 12 07:57:52 2008
@@ -451,8 +451,43 @@
       raise Exception("Invalid state: Node pool is not initialized to delete the given job.")
     return ret
          
+  def is_valid_account(self):
+    """Verify if the account being used to submit the job is a valid account.
+       Looks for a 'verify-account' script in the current working directory
+       and, if present, runs it with the account name as its only argument.
+       Returns (exitCode, output), output being set only on a non-zero exit,
+       or (0, None) if the script is missing or could not be executed."""
+
+    accountValidationScript = os.path.abspath('./verify-account')
+    if not os.path.exists(accountValidationScript):
+      return (0, None)
+
+    account = self.__nodePool.getAccountString()
+    exitCode = 0
+    errMsg = None
+    try:
+      accountValidationCmd = simpleCommand('Account Validation Command',\
+                                             '%s %s' % (accountValidationScript,
+                                                        account))
+      accountValidationCmd.start()
+      accountValidationCmd.wait()
+      accountValidationCmd.join()
+      exitCode = accountValidationCmd.exit_code()
+      self.__log.debug('account validation script is run %d' % exitCode)
+      # Compare by value: 'is not 0' only works through interpreter int caching.
+      if exitCode != 0:
+        errMsg = accountValidationCmd.output()
+    except Exception:
+      # A script that cannot be run must not block allocation: report success.
+      exitCode = 0
+      self.__log.warn('Error executing account script: %s ' \
+                         'Accounting is disabled.' \
+                          % get_exception_error_string())
+      self.__log.debug(get_exception_string())
+    return (exitCode, errMsg)
+    
   def allocate(self, clusterDir, min, max=None):
-    status = 0  
+    status = 0
     self.__svcrgyClient = self.__get_svcrgy_client()
         
     self.__log.debug("allocate %s %s %s" % (clusterDir, min, max))

Modified: hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py?rev=694702&r1=694701&r2=694702&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py Fri Sep 12 07:57:52 2008
@@ -252,7 +252,6 @@
     self.__cfg['ringmaster']['max-master-failures'] = \
                               min(maxFailures, maxFailedNodes)
 
-    
   def _op_allocate(self, args):
     operation = "allocate"
     argLength = len(args)
@@ -313,6 +312,21 @@
           return
  
       self.__setup_cluster_logger(clusterDir)
+
+      (status, message) = self.__cluster.is_valid_account()
+      if status is not 0:
+        if message:
+          for line in message:
+            self.__log.critical("verify-account output: %s" % line)
+        self.__log.critical("Cluster cannot be allocated because account verification failed. " \
+                              + "verify-account returned exit code: %s." % status)
+        self.__opCode = 4
+        return
+      else:
+        self.__log.debug("verify-account returned zero exit code.")
+        if message:
+          self.__log.debug("verify-account output: %s" % message)
+
       if re.match('\d+-\d+', nodes):
         (min, max) = nodes.split("-")
         min = int(min)

Modified: hadoop/core/trunk/src/contrib/hod/hodlib/Hod/nodePool.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/Hod/nodePool.py?rev=694702&r1=694701&r2=694702&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/Hod/nodePool.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/Hod/nodePool.py Fri Sep 12 07:57:52 2008
@@ -116,6 +116,10 @@
     """Update information about the workers started by this NodePool."""
     raise NotImplementedError
 
+  def getAccountString(self):
+    """Return the account string for this job. This base implementation only raises NotImplementedError."""
+    raise NotImplementedError
+
   def getNextNodeSetId(self):
     id = self.nextNodeSetId
     self.nextNodeSetId += 1

Modified: hadoop/core/trunk/src/contrib/hod/hodlib/NodePools/torque.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/NodePools/torque.py?rev=694702&r1=694701&r2=694702&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/NodePools/torque.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/NodePools/torque.py Fri Sep 12 07:57:52 2008
@@ -51,6 +51,12 @@
     self.__torque = torqueInterface(
       self._cfg['resource_manager']['batch-home'], environ, self._log)
 
+  def getAccountString(self):
+    """Return the resource_manager 'pbs-account' setting, or '' if it is not set."""
+    if not self._cfg['resource_manager'].has_key('pbs-account'):
+      return ''
+    return self._cfg['resource_manager']['pbs-account']
+
   def __gen_submit_params(self, nodeSet, walltime = None, qosLevel = None, 
                           account = None):
     argList = []

Modified: hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_admin_guide.xml
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_admin_guide.xml?rev=694702&r1=694701&r2=694702&view=diff
==============================================================================
--- hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_admin_guide.xml (original)
+++ hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_admin_guide.xml Fri Sep 12 07:57:52 2008
@@ -351,6 +351,37 @@
         it is better not to run this inside a tight loop without sleeping.</p>
       </section>
     </section>
+
+    <section>
+      <title>verify-account - Script to verify an account under which 
+             jobs are submitted</title>
+      <p>Production systems use accounting packages to charge users for using
+      shared compute resources. HOD supports a parameter 
+      <em>resource_manager.pbs-account</em> to allow users to identify the
+      account under which they would like to submit jobs. It may be necessary
+      to verify that this account is a valid one configured in an accounting
+      system. The <em>hod-install-dir/bin/verify-account</em> script 
+      provides a mechanism to plug in a custom script that can do this
+      verification.</p>
+      
+      <section>
+        <title>Integrating the verify-account script with HOD</title>
+        <p>HOD runs the <em>verify-account</em> script passing in the
+        <em>resource_manager.pbs-account</em> value as argument to the script,
+        before allocating a cluster. Sites can write a script that verifies this 
+        account against their accounting systems. Returning a non-zero exit 
+        code from this script will cause HOD to fail allocation. Also, in
+        case of an error, HOD will print the output of script to the user.
+        Any descriptive error message can be passed to the user from the
+        script in this manner.</p>
+        <p>The default script that comes with the HOD installation does not
+        do any validation, and returns a zero exit code.</p>
+        <p>If the verify-account script is not found, then HOD will assume
+        that verification is disabled, and continue allocation as is.</p>
+      </section>
+    </section>
+
   </section>
+
 </body>
 </document>

Modified: hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_user_guide.xml
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_user_guide.xml?rev=694702&r1=694701&r2=694702&view=diff
==============================================================================
--- hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_user_guide.xml (original)
+++ hadoop/core/trunk/src/docs/src/documentation/content/xdocs/hod_user_guide.xml Fri Sep 12 07:57:52 2008
@@ -412,7 +412,8 @@
         <td> 5 </td>
         <td> Job execution failure </td>
         <td> 1. Torque Job was deleted from outside. Execute the Torque <code>qstat</code> command to see if you have any jobs in the <code>R</code> (Running) state. If none exist, try re-executing HOD. <br />
-          2. Torque problems such as the server momentarily going down, or becoming unresponsive. Contact system administrator. </td>
+          2. Torque problems such as the server momentarily going down, or becoming unresponsive. Contact system administrator. <br/>
+          3. The system administrator might have configured account verification, and an invalid account is specified. Contact system administrator.</td>
       </tr>
       <tr>
         <td> 6 </td>