''' run processes asynchronously on various machines, with a callback on process exit. '''

# to do someday:
#
# This feature should really be broken into 3 parts:
# - remotely execute on another machine
# - track concurrent execution
# - reserve resource usage
#
# To start a process, ask for a resource reservation.  (Currently, the
# only resource we track is CPUs.)  If we don't get a reservation, we
# don't run right away.
#
# If we do, use the remote exec to run the process on the target machine.
# This is the part that knows hosts.ini.  (We also use hosts.ini to declare
# resource availability.)
#
# When the process finishes, release the resource reservation.
#

import subprocess
import time
import datetime
import os
import os.path
import traceback
import sys
import errno

import ConfigParser

debug = 0

#####

class struct :
    pass

#####

class run_exception(Exception) :
    pass


class runner(object):

    # dict of all current running processes, indexed by node name
    all_procs = None

    # index of nodes
    node_index = None

    # cache of host entries already read from hosts.ini
    host_info_cache = None

    # dict of how many commands we have running on each machine
    howmany = None

    #####
    #
    def __init__( self, nodes ) :
        self.all_procs = { }
        self.node_index = nodes
        self.load_host_info()
        self.host_info_cache = { }
        self.howmany = { }

    #####
    # start a process
    def run( self, node, run_name, logfile_name, no_run = False ):
        try :
            try :
                args = self.get_host_info(node.host)
            except Exception, e :
                log_traceback()
                print "ERROR: do not know how to run on %s" % node.host
                print e
                raise

            hostname = args['hostname']

            if 'maxproc' in args :
                n = int(self.howmany.get(hostname, 0))
                if n >= int(args['maxproc']) :
                    # print "decline to run %s - %d others already running" % (node.name, n)
                    return False
                n = n + 1
                self.howmany[hostname] = n
                # print "running %s %s %d" % (hostname, node.name, n)
            else :
                # print "running %s %s no maxproc" % (hostname, node.name)
                pass

            if debug :
                print "run", node.name
            if debug :
                print "....%s:%s/%s\n" % (node.host, node.table, node.cmd)

            node.running = 1

            args = args.copy()
            args.update(
                script=node.script,
                script_type=node.script_type,
                host=node.host,
                table=node.table,
                cmd=node.cmd,
                node=node.name,
                runname=run_name,
                )

            if debug :
                print "ARGS"
                for x in sorted([x for x in args]) :
                    print '%s=%s' % (x, args[x])

            args['script'] = args['script'] % args

            if args['script_type'] == 'r' :
                run = args['run']
            elif args['script_type'] == 'l' :
                run = args['local']
            else :
                raise Exception("unknown script_type: %s" % args['script_type'])

            t = [ ]
            for x in run :
                # bug: what to do in case of a KeyError here (missing substitution)?
                t.append( x % args )
            run = t

            if debug :
                print "RUN", run

            try :
                os.makedirs( os.path.dirname(logfile_name) )
            except OSError, e :
                if e.errno == errno.EEXIST :
                    pass
                else :
                    raise

            # open the log file, write initial notes
            print "LOGFILE", logfile_name
            logfile = open(logfile_name, "w")
            logfile.write('%s %s\n' % (datetime.datetime.now(), run))
            logfile.flush()

            # debug - just say the name of the node we would run
            if no_run :
                run = [ 'echo', 'no_run - node=', node.name ]

            # start running the process
            if debug :
                print "RUN", run
            p = subprocess.Popen(args=run, stdout=logfile, stderr=subprocess.STDOUT,
                                 shell=False, close_fds=True)

            # remember the popen object for the process; remember the open log file
            n = struct()
            n.proc = p
            n.logfile = logfile
            n.logfile_name = logfile_name

            # remember the process is running
            self.all_procs[node.name] = n

            return True

        except Exception, e :
            log_traceback()
            txt = "ERROR RUNNING %s" % node.name
            raise run_exception(txt)

    #####
    # callback when a node finishes
    def finish( self, node_name, status ):
        node = self.node_index[node_name]
        args = self.get_host_info(node.host)
        hostname = args['hostname']
        n = self.howmany[hostname] - 1
        self.howmany[hostname] = n
        if debug :
            print "finish %s %s %d" % (hostname, node_name, n)

        # note the termination of the process at the end of the log file
        logfile = self.all_procs[node_name].logfile
        logfile.seek(0, 2)      # end of file
        logfile.write('\n%s exit=%s\n' % (datetime.datetime.now(), status))
        logfile.close()

        # note the completion of the command
        if debug :
            print "finish", node.name
        node.running = 0
        node.finished = 1
        node.exit_status = status

    #####
    # poll for exited child processes - this whole thing could
    # be event driven, but I don't care to work out the details right
    # now.
    def poll( self ) :
        # look at all active processes
        for name in self.all_procs :
            # see if name has finished
            p = self.all_procs[name].proc
            n = p.poll()
            if n is not None :
                # mark the node finished
                self.finish(name, n)
                #
                status = p.returncode
                # remove it from the list of pending processes
                del self.all_procs[name]
                # Return the identity of the exited process.
                # There may be more, but we will come back and poll again.
                return ( name, status )
        return None

    #####
    def display_procs( self ) :
        # display currently active child processes
        print "procs:"
        for x in sorted(self.all_procs) :
            print "    ", x
        print ""

    #####
    def _host_get_names( self, cfg, section ) :
        d = { }
        # pick all the variables out of this section
        try :
            for name, value in cfg.items(section) :
                if value.startswith('[') :
                    # it is a list
                    d[name] = eval(value)
                else :
                    # everything else is plain text
                    d[name] = value
            return d
        except ConfigParser.NoSectionError :
            print "No config section in hosts.ini: %s" % section
            return { }

    def load_host_info( self, filename=None ) :
        # read the config file
        if filename is None :
            filename = os.path.dirname(__file__) + '/hosts.ini'
        self.cfg = ConfigParser.RawConfigParser()
        self.cfg.read(filename)

    def get_host_info(self, host) :
        if not host in self.host_info_cache :
            d = self._host_get_names(self.cfg, host)
            if 'like' in d :
                # get the dict of what this entry is like, copy it,
                # and update it with the values for this entry
                d1 = self.get_host_info(d['like'])
                d1 = d1.copy()
                d1.update(d)
                d = d1
                del d['like']
            # default hostname is the name from the section header
            if not 'hostname' in d :
                d['hostname'] = host
            # default maximum processes is 1
            if not 'maxproc' in d :
                d['maxproc'] = 1
            self.host_info_cache[host] = d
        return self.host_info_cache[host]

#####
# The traceback interface is awkward in python; here is something I copied from pyetc:

def log_traceback() :
    # You would think that the python traceback module contains
    # something useful to do this, but it always returns multi-line
    # strings.  I want each line of output logged separately so the log
    # file remains easy to process, so I reverse engineered this out of
    # the logging module.
    try:
        etype, value, tb = sys.exc_info()
        tbex = traceback.extract_tb( tb )
        for filename, lineno, name, line in tbex :
            print '%s:%d, in %s' % (filename, lineno, name)
            if line :
                print '    %s' % line.strip()
        for x in traceback.format_exception_only( etype, value ) :
            print ": %s" % x
        print "---"
    finally:
        # If you don't clear these guys, you can make loops that
        # the garbage collector has to work hard to eliminate.
        etype = value = tb = None
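
#####
# Minimal usage sketch (an illustrative example, not part of the library above).
# It assumes a hosts.ini next to this module containing a section like the
# hypothetical one below; the host name "localexec", the command templates,
# and the demo_node attribute values are made up for this sketch.  The node
# object only needs the attributes that run() reads.
#
#   [localexec]
#   maxproc = 2
#   local = ['/bin/sh', '-c', '%(cmd)s']
#   run = ['ssh', '%(hostname)s', '/bin/sh', '-c', '%(cmd)s']
#
if __name__ == '__main__' :

    class demo_node(object) :
        def __init__( self, name ) :
            self.name = name
            self.host = 'localexec'         # hosts.ini section to run under
            self.table = 'demo'
            self.cmd = 'echo hello from %s' % name
            self.script = ''
            self.script_type = 'l'          # 'l' picks the "local" template
            self.running = 0
            self.finished = 0

    nodes = { }
    for x in ( 'n1', 'n2', 'n3' ) :
        nodes[x] = demo_node(x)
    r = runner(nodes)

    pending = sorted(nodes)
    while pending or r.all_procs :
        # try to start whatever is still pending; run() returns False when
        # the host is already at its maxproc limit, so just retry later
        for name in list(pending) :
            if r.run(nodes[name], 'demo_run', 'logs/%s.log' % name) :
                pending.remove(name)
        # reap at most one exited process per poll(); None means nothing
        # has finished since the last call
        done = r.poll()
        if done is not None :
            print "finished", done[0], "exit status", done[1]
        else :
            time.sleep(0.5)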