You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by ac...@apache.org on 2008/10/17 02:22:33 UTC
svn commit: r705422 - in /hadoop/core/branches/branch-0.19: ./
src/contrib/hod/CHANGES.txt src/contrib/hod/bin/hod
src/contrib/hod/hodlib/Hod/hadoop.py
src/contrib/hod/hodlib/NodePools/torque.py
Author: acmurthy
Date: Thu Oct 16 17:22:33 2008
New Revision: 705422
URL: http://svn.apache.org/viewvc?rev=705422&view=rev
Log:
Merge -r 705419:705420 from trunk to branch-0.19 to fix HADOOP-3217.
Modified:
hadoop/core/branches/branch-0.19/ (props changed)
hadoop/core/branches/branch-0.19/src/contrib/hod/CHANGES.txt
hadoop/core/branches/branch-0.19/src/contrib/hod/bin/hod
hadoop/core/branches/branch-0.19/src/contrib/hod/hodlib/Hod/hadoop.py
hadoop/core/branches/branch-0.19/src/contrib/hod/hodlib/NodePools/torque.py
Propchange: hadoop/core/branches/branch-0.19/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Thu Oct 16 17:22:33 2008
@@ -1 +1 @@
-/hadoop/core/trunk:697306,698176,699056,699098,699415,699424,699444,699490,699517,700163,700628,700923,701273,701398,703923,704203,704261,704701,704703,704707,704712,704732,704748,704989,705391
+/hadoop/core/trunk:697306,698176,699056,699098,699415,699424,699444,699490,699517,700163,700628,700923,701273,701398,703923,704203,704261,704701,704703,704707,704712,704732,704748,704989,705391,705420
Modified: hadoop/core/branches/branch-0.19/src/contrib/hod/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.19/src/contrib/hod/CHANGES.txt?rev=705422&r1=705421&r2=705422&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.19/src/contrib/hod/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.19/src/contrib/hod/CHANGES.txt Thu Oct 16 17:22:33 2008
@@ -110,6 +110,13 @@
HADOOP-3703. Fixes logcondense.py to use the new format of hadoop dfs -lsr
command line output format. (Vinod Kumar Vavilapalli via yhemanth)
+Release 0.17.3 - Unreleased
+
+ BUG FIXES
+
+ HADOOP-3217. Decrease the rate at which the hod queries the resource
+ manager for job status. (Hemanth Yamijala via acmurthy)
+
Release 0.17.0 - 2008-05-18
INCOMPATIBLE CHANGES
Modified: hadoop/core/branches/branch-0.19/src/contrib/hod/bin/hod
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.19/src/contrib/hod/bin/hod?rev=705422&r1=705421&r2=705422&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.19/src/contrib/hod/bin/hod (original)
+++ hadoop/core/branches/branch-0.19/src/contrib/hod/bin/hod Thu Oct 16 17:22:33 2008
@@ -159,7 +159,19 @@
True, 10, False, True, 'W'),
('log-rollover-count', 'pos_int', 'Specifies the number of rolled-over log files of HOD client. A zero value disables rollover.',
- True, 5, False, True, 'L')),
+ True, 5, False, True, 'L'),
+
+ ('job-status-query-interval', 'pos_int', 'Specifies the time between checking for job status',
+ False, 30, False, True),
+
+ ('job-command-failure-interval', 'pos_int', 'Specifies the time between checking for failed job status or submission commands',
+ False, 10, False, True),
+
+ ('job-status-query-failure-retries', 'pos_int', 'Specifies the number of times job status failure queries are retried',
+ False, 3, False, True),
+
+ ('job-submission-failure-retries', 'pos_int', 'Specifies the number of times job submission failure queries are retried',
+ False, 3, False, True)),
'resource_manager' : (
('id', 'string', 'Batch scheduler ID: torque|condor.',
Modified: hadoop/core/branches/branch-0.19/src/contrib/hod/hodlib/Hod/hadoop.py
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.19/src/contrib/hod/hodlib/Hod/hadoop.py?rev=705422&r1=705421&r2=705422&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.19/src/contrib/hod/hodlib/Hod/hadoop.py (original)
+++ hadoop/core/branches/branch-0.19/src/contrib/hod/hodlib/Hod/hadoop.py Thu Oct 16 17:22:33 2008
@@ -180,25 +180,29 @@
return serviceData
def __check_job_status(self):
- initWaitCount = 20
- count = 0
+ failureCount = 0
status = False
state = 'Q'
userLimitsFirstFlag = True
- while state == 'Q':
+ while (state=='Q') or (state==False):
if hodInterrupt.isSet():
raise HodInterruptException()
jobInfo = self.__nodePool.getJobInfo()
state = jobInfo['job_state']
- if (state==False) or (state!='Q'):
+ self.__log.debug('job state %s' % state)
+ if state == False:
+ failureCount += 1
+ if (failureCount >= self.__cfg['hod']['job-status-query-failure-retries']):
+ self.__log.debug('Number of retries reached max limit while querying job status')
+ break
+ time.sleep(self.__cfg['hod']['job-command-failure-interval'])
+ elif state!='Q':
break
- count = count + 1
- if count < initWaitCount:
- time.sleep(0.5)
else:
- time.sleep(10)
+ self.__log.debug('querying for job status after job-status-query-interval')
+ time.sleep(self.__cfg['hod']['job-status-query-interval'])
if self.__cfg['hod'].has_key('job-feasibility-attr') and \
self.__cfg['hod']['job-feasibility-attr']:
@@ -255,7 +259,7 @@
time.sleep(1)
count = count + 1
# check to see if the job exited by any chance in that time:
- if (count % 10 == 0):
+ if (count % self.__cfg['hod']['job-status-query-interval'] == 0):
if not self.__check_job_status():
break
return ringmasterXRS
@@ -273,9 +277,9 @@
serviceAddress = xmlrpcClient.getServiceAddr(serviceName)
if serviceAddress:
if serviceAddress == 'not found':
- time.sleep(.5)
+ time.sleep(1)
# check to see if the job exited by any chance in that time:
- if (i % 10 == 0):
+ if ((i+1) % self.__cfg['hod']['job-status-query-interval'] == 0):
if not self.__check_job_status():
break
else:
@@ -486,6 +490,7 @@
def allocate(self, clusterDir, min, max=None):
status = 0
+ failureCount = 0
self.__svcrgyClient = self.__get_svcrgy_client()
self.__log.debug("allocate %s %s %s" % (clusterDir, min, max))
@@ -499,6 +504,23 @@
if self.__cfg['hod'].has_key('walltime'):
walltime = self.__cfg['hod']['walltime']
self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet, walltime)
+ # if the job submission returned an error other than no resources
+ # retry a couple of times
+ while (self.jobId is False) and (exitCode != 188):
+ if hodInterrupt.isSet():
+ raise HodInterruptException()
+
+ failureCount += 1
+ if (failureCount >= self.__cfg['hod']['job-status-query-failure-retries']):
+ self.__log.debug("failed submitting job more than the retries. exiting")
+ break
+ else:
+ # wait a bit before retrying
+ time.sleep(self.__cfg['hod']['job-command-failure-interval'])
+ if hodInterrupt.isSet():
+ raise HodInterruptException()
+ self.jobId, exitCode = self.__nodePool.submitNodeSet(nodeSet, walltime)
+
if self.jobId:
jobStatus = None
try:
@@ -632,7 +654,7 @@
if exitCode == 188:
self.__log.critical("Request execeeded maximum resource allocation.")
else:
- self.__log.critical("Insufficient resources available.")
+ self.__log.critical("Job submission failed with exit code %s" % exitCode)
status = 4
else:
self.__log.critical("Scheduler failure, allocation failed.\n\n")
Modified: hadoop/core/branches/branch-0.19/src/contrib/hod/hodlib/NodePools/torque.py
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.19/src/contrib/hod/hodlib/NodePools/torque.py?rev=705422&r1=705421&r2=705422&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.19/src/contrib/hod/hodlib/NodePools/torque.py (original)
+++ hadoop/core/branches/branch-0.19/src/contrib/hod/hodlib/NodePools/torque.py Thu Oct 16 17:22:33 2008
@@ -276,33 +276,22 @@
return id
def getJobInfo(self, jobId=None):
- #torque error code when credentials fail, a temporary condition sometimes.
- credFailureErrorCode = 171
+
jobNonExistentErrorCode = 153
- credFailureRetries = 10
- i = 0
- self.__jobInfo = None
+ self.__jobInfo = { 'job_state' : False }
if jobId == None:
jobId = self.getServiceId()
- while i < credFailureRetries:
- qstatInfo, exitCode = self.__torque.qstat(jobId)
- if exitCode == 0:
- self.__jobInfo = qstatInfo
- break
- elif exitCode == jobNonExistentErrorCode:
- # This really means that the job completed
- # However, setting only job_state for now, not
- # any other attributes, as none seem required.
- self.__jobInfo = { 'job_state' : 'C' }
- break
- else:
- if exitCode == credFailureErrorCode:
- time.sleep(1)
- i = i+1
- else:
- break
+ qstatInfo, exitCode = self.__torque.qstat(jobId)
+ if exitCode == 0:
+ self.__jobInfo = qstatInfo
+ elif exitCode == jobNonExistentErrorCode:
+ # This really means that the job completed
+ # However, setting only job_state for now, not
+ # any other attributes, as none seem required.
+ self.__jobInfo = { 'job_state' : 'C' }
+
return self.__jobInfo
def deleteJob(self, jobId):