diff options
author | sienkiew <sienkiew@d34015c8-bcbb-4646-8ac8-8ba5febf221d> | 2011-09-27 11:46:09 -0400 |
---|---|---|
committer | sienkiew <sienkiew@d34015c8-bcbb-4646-8ac8-8ba5febf221d> | 2011-09-27 11:46:09 -0400 |
commit | 61285fb53a2b871d52e27e4e8ecc4d7da6e09e1b (patch) | |
tree | 942866a36819b21731f74c157bda1ffb19081c51 | |
parent | ff102cd2b89daf9a0feea2e10503e780d2454e29 (diff) | |
download | steuermann-61285fb53a2b871d52e27e4e8ecc4d7da6e09e1b.tar.gz |
checkpoint
git-svn-id: https://svn.stsci.edu/svn/ssb/etal/steuermann/trunk@430 d34015c8-bcbb-4646-8ac8-8ba5febf221d
-rw-r--r-- | README | 4 | ||||
-rw-r--r-- | dev.sm | 189 | ||||
-rw-r--r-- | init.sm | 4 | ||||
-rw-r--r-- | scripts/steuermann_report.cgi | 56 | ||||
-rw-r--r-- | steuermann/config.py | 4 | ||||
-rw-r--r-- | steuermann/db.sql | 9 | ||||
-rw-r--r-- | steuermann/hosts.ini | 11 | ||||
-rw-r--r-- | steuermann/nodes.py | 20 | ||||
-rw-r--r-- | steuermann/report.py | 53 | ||||
-rw-r--r-- | steuermann/run.py | 284 | ||||
-rw-r--r-- | steuermann/run_all.py | 116 |
11 files changed, 557 insertions, 193 deletions
@@ -7,7 +7,5 @@ make python setup.py install -smc test.sm -? - +smc [ -a ] [ -r run_name ] file.sm @@ -1,5 +1,11 @@ +## TODO: +## add builds on ssbwebv1 for those things that we actually care about -TABLE assemble HOST rhe5-64 +#################### +#################### + +# arzach assembles all the source code +TABLE assemble HOST arzach CMD dev.stsci_python RUN "assemble_stsci_python dev" AFTER init/* @@ -12,7 +18,16 @@ TABLE assemble HOST rhe5-64 CMD dev.hstcal RUN "assemble_hstcal dev" AFTER init/* -TABLE build HOST rhe4-32 rhe4-64 rhe5-64 leopard snow-leopard + CMD nop RUN "sleep 1" + AFTER init/* + +#################### +#################### + +# install stsci_python into default environment +# build hstcal +# - everywhere +TABLE build HOST herbert thor arzach bond cadeau CMD dev.py2.7 RUN "build_stsci_python dev 2.7" AFTER init/* AFTER *:assemble/dev.stsci_python @@ -20,7 +35,9 @@ TABLE build HOST rhe4-32 rhe4-64 rhe5-64 leopard snow-leopard AFTER init/* AFTER *:assemble/dev.hstcal -TABLE build HOST rhe5-64 +# older python environments +# - arzach only +TABLE build HOST arzach CMD dev.py2.6 RUN "build_stsci_python dev 2.6" AFTER init/* AFTER *:assemble/dev.stsci_python @@ -29,7 +46,9 @@ TABLE build HOST rhe5-64 AFTER init/* AFTER *:assemble/dev.stsci_python -TABLE build HOST rhe4-32 leopard +# stsdas and friends +# - 32 bit only +TABLE build HOST herbert bond CMD dev.axe RUN "build_axe dev" AFTER init/* AFTER *:assemble/dev.axe @@ -48,14 +67,166 @@ TABLE build HOST rhe4-32 leopard AFTER build/dev.stsci_iraf -TABLE build HOST rhe4-64 rhe5-64 +# stsdas for 64 bit machines - get it from a related 32 bit system +TABLE build HOST thor arzach CMD dev.stsci_iraf_64hack RUN "build_stsci_iraf_64hack dev herbert" - AFTER rhe4-32:build/dev.stsci_iraf* + AFTER herbert:build/dev.stsci_iraf* -TABLE build HOST snow-leopard +TABLE build HOST cadeau CMD dev.stsci_iraf_64hack RUN "build_stsci_iraf_64hack dev cadeau" - AFTER rhe4-32:build/dev.stsci_iraf* + AFTER bond:build/dev.stsci_iraf* -TABLE build HOST rhe5-64 +# stsci_python documentation +# - one machine only +TABLE build HOST arzach CMD dev.stsci_python_sphinxdocs RUN "build_sphinxdocs dev 2.7" AFTER build/dev.py2.7 + +# old epydoc documentation - only works on thor; hope we can get rid of +# epydoc sooner than we have to do anything about this. +TABLE build HOST thor + CMD dev.stsci_python_epydoc RUN "/thor/data2/iraf/epydoc_test/nightly" + AFTER build/dev.py2.7 + +# stamp the IRAF banner file when the builds are complete +TABLE stamp HOST herbert thor arzach bond cadeau + CMD dev RUN "build_stamp dev" + AFTER build/* + +#################### +#################### + +# regular distributions + +TABLE distribute HOST herbert thor arzach + CMD dev.iraf RUN "synctool - irafdev" + AFTER stamp/dev + CMD dev.pyssg RUN "synctool - pyssgdev" + AFTER stamp/dev + CMD dev.stsci_iraf RUN "synctool - stsci_iraf_dev" + AFTER stamp/dev + CMD dev.hstcal RUN "synctool - hstcal_dev" + AFTER stamp/dev + CMD dev.motd RUN "synctool - irafdev/iraf/unix/hlib/motd" + AFTER distribute/dev.iraf + +TABLE distribute HOST bond cadeau + CMD irafdev.pkg RUN "cd $HOME/daily_build/mac_package; ./clean ; ./build dev " AFTER stamp/dev + CMD irafdev.dmg RUN "cd $HOME/daily_build/mac_package; ./distribute dev" AFTER irafdev.pkg + +# wads of special cases + +# jwcalibdev has local disk - some day it may do its own builds +TABLE distribute HOST arzach + CMD jwcalibdev.iraf RUN "synctool jwcalibdev: irafdev" + AFTER stamp/dev + CMD jwcalibdev.pyssg RUN "synctool jwcalibdev: pyssgdev" + AFTER stamp/dev + CMD jwcalibdev.stsci_iraf RUN "synctool jwcalibdev: stsci_iraf_dev" + AFTER stamp/dev + CMD jwcalibdev.hstcal RUN "synctool jwcalibdev: hstcal_dev" + AFTER stamp/dev + CMD jwcalibdev.motd RUN "synctool jwcalibdev: irafdev/iraf/unix/hlib/motd" + AFTER jwcalibdev.iraf + +# goods - has RHE 5 only now + +TABLE distribute_other HOST arzach + CMD goods.iraf RUN "synctool goods12: irafdev" + AFTER stamp/dev + CMD goods.pyssg RUN "synctool goods12: pyssgdev" + AFTER stamp/dev + CMD goods.stsci_iraf RUN "synctool goods12: stsci_iraf_dev" + AFTER stamp/dev + CMD goods.hstcal RUN "synctool goods12: hstcal_dev" + AFTER stamp/dev + CMD goods.motd RUN "synctool goods12: irafdev/iraf/unix/hlib/motd" + AFTER goods.iraf + +# witserv1 - who are these guys? + +TABLE distribute_other HOST arzach + CMD witserv1.iraf RUN "synctool witserv1: irafdev" + AFTER stamp/dev + CMD witserv1.pyssg RUN "synctool witserv1: pyssgdev" + AFTER stamp/dev + CMD witserv1.stsci_iraf RUN "synctool witserv1: stsci_iraf_dev" + AFTER stamp/dev + CMD witserv1.hstcal RUN "synctool witserv1: hstcal_dev" + AFTER stamp/dev + CMD witserv1.motd RUN "synctool witserv1: irafdev/iraf/unix/hlib/motd" + AFTER witserv1.iraf + +# dmsinsvm - have a pipeline and irafx/irafdev on the same machine for INS + +TABLE distribute_other HOST arzach + CMD dmsinsvm.iraf RUN "synctool dmsinsvm: irafdev" + AFTER stamp/dev + CMD dmsinsvm.pyssg RUN "synctool dmsinsvm: pyssgdev" + AFTER stamp/dev + CMD dmsinsvm.stsci_iraf RUN "synctool dmsinsvm: stsci_iraf_dev" + AFTER stamp/dev + CMD dmsinsvm.hstcal RUN "synctool dmsinsvm: hstcal_dev" + AFTER stamp/dev + CMD dmsinsvm.motd RUN "synctool dmsinsvm: irafdev/iraf/unix/hlib/motd" + AFTER dmsinsvm.iraf + +# UDF - another funded project with their own machines + +TABLE distribute_other HOST thor + CMD udf1.iraf RUN "synctool udf1: irafdev" + AFTER stamp/dev + CMD udf1.pyssg RUN "synctool udf1: pyssgdev" + AFTER stamp/dev + CMD udf1.stsci_iraf RUN "synctool udf1: stsci_iraf_dev" + AFTER stamp/dev + CMD udf1.hstcal RUN "synctool udf1: hstcal_dev" + AFTER stamp/dev + CMD udf1.motd RUN "synctool udf1: irafdev/iraf/unix/hlib/motd" + AFTER udf1.iraf + +# royal - a beowulf cluster + +TABLE distribute_other HOST thor + CMD royal.iraf RUN "synctool royal: irafdev" + AFTER stamp/dev + CMD royal.pyssg RUN "synctool royal: pyssgdev" + AFTER stamp/dev + CMD royal.stsci_iraf RUN "synctool royal: stsci_iraf_dev" + AFTER stamp/dev + CMD royal.hstcal RUN "synctool royal: hstcal_dev" + AFTER stamp/dev + CMD royal.motd RUN "synctool royal: irafdev/iraf/unix/hlib/motd" + AFTER royal.iraf + +#################### +#################### + +# This structure is meant to serialize the regtests so that nothing else +# is going on at the same time. The regtests can consume all the CPUs. +TABLE regtest HOST herbert thor arzach bond cadeau + CMD okify RUN "test_okify" + AFTER distribute/* + CMD dev.test2.7 RUN "test_rt dev 2.7" + AFTER okify + +TABLE regtest HOST arzach + CMD dev.test2.6 RUN "test_rt dev 2.6" + AFTER dev.test2.7 + CMD dev.test2.5 RUN "test_rt dev 2.5" + AFTER dev.test2.6 + +TABLE regtest HOST thor + CMD dev.contact RUN "test_contact" + AFTER dev.test2.7 + + # change this to have each test batch import independently +TABLE regtest_import HOST ssb + CMD dev.import RUN "test_import daily" + AFTER *:regtest/* + CMD dev.notify RUN "test_notify daily" + AFTER dev.import + CMD dev.sqlite_watch RUN "test_sqlite_watch" + AFTER dev.import + CMD dev.mysql_watch RUN "test_mysql_watch" + AFTER dev.import @@ -1,9 +1,9 @@ -TABLE init HOST leopard snow-leopard rhe5-64 rhe4-64 rhe4-32 +TABLE init HOST bond cadeau arzach thor herbert CMD sendscripts LOCAL "/eng/ssb/auto/steuermann_scripts/init_sendscripts %(hostname)s %(workdir)s" CMD sysstat RUN "sysstat" AFTER sendscripts -TABLE init HOST rhe5-64 +TABLE init HOST arzach CMD svnsync RUN "assemble_svnsync" AFTER OPT init/irafx_update AFTER sendscripts diff --git a/scripts/steuermann_report.cgi b/scripts/steuermann_report.cgi index a25d407..abf371b 100644 --- a/scripts/steuermann_report.cgi +++ b/scripts/steuermann_report.cgi @@ -35,18 +35,23 @@ def sqltime(arg) : return d +########## +# if no action specified, show the list of runs +# if not 'action' in form : print 'content-type: text/html' print '' db = steuermann.config.open_db() c = db.cursor() - c.execute('SELECT DISTINCT run FROM status ORDER BY run DESC') + c.execute('SELECT DISTINCT run FROM sm_status ORDER BY run DESC') for run, in c : print "<a href=%s?action=status&run=%s>%s</a><br>"%(cginame, run, run) sys.exit(0) action = form['action'].value - +########## +# status means show the status of a particular run +# if action == 'status' : db = steuermann.config.open_db() import steuermann.report @@ -57,6 +62,9 @@ if action == 'status' : print steuermann.report.report_html( db, run, info_callback=steuermann.report.info_callback_gui ) sys.exit(0) +########## +# log means show the result of a particular node from a run +# elif action == 'log' : print 'content-type: text/plain' print '' @@ -71,7 +79,7 @@ elif action == 'log' : db = steuermann.config.open_db() c = db.cursor() - c.execute("SELECT status, start_time, end_time, notes FROM status WHERE run = ? AND host = ? AND tablename = ? AND cmd = ?",( + c.execute("SELECT status, start_time, end_time, notes FROM sm_status WHERE run = ? AND host = ? AND tablename = ? AND cmd = ?",( run, host, table, cmd ) ) x = c.fetchone() if x is None : @@ -95,17 +103,43 @@ elif action == 'log' : for x in [ ' ' + x for x in notes.split('\n') ] : print x print "" - print "--------------------" filename = '%s/%s/%s:%s.%s.log'%(steuermann.config.logdir,run,host,table,cmd) - f=open(filename,'r') - while 1 : - x = f.read(65536) - if x == '' : - break - sys.stdout.write(x) + try : + f=open(filename,'r') + except IOError: + print "No log file %s" %filename + f = None + print "--------------------" + + if f : + while 1 : + x = f.read(65536) + if x == '' : + break + sys.stdout.write(x) + sys.exit(0) +########## +# info means show information about the system +# +elif action == 'info' : + print 'content-type: text/html\n' + print 'db credentials: ',steuermann.config.db_creds,'<br>' + print 'logdir: ',steuermann.config.logdir,'<br>' + db = steuermann.config.open_db() + cur = db.cursor() + cur.execute("select count(*) from sm_status") + l = cur.fetchone() + print "database records: %s\n"%l[0],'<br>' + cur.execute("select count(*) from sm_runs") + l = cur.fetchone() + print "runs: %s\n"%l[0],'<br>' + sys.exit(0) + +########## + print 'content-type: text/html' print '' -print 'no action?' +print 'no recognized action?' diff --git a/steuermann/config.py b/steuermann/config.py index 7419703..b10881b 100644 --- a/steuermann/config.py +++ b/steuermann/config.py @@ -1,5 +1,7 @@ +db_creds = '/ssbwebv1/data2/steuermann/steuermann.db' + def open_db() : import sqlite3 - return sqlite3.connect('/ssbwebv1/data2/steuermann/steuermann.db') + return sqlite3.connect(db_creds) logdir = '/ssbwebv1/data2/steuermann/logs' diff --git a/steuermann/db.sql b/steuermann/db.sql index dd6a569..b6d2ae6 100644 --- a/steuermann/db.sql +++ b/steuermann/db.sql @@ -2,7 +2,7 @@ -- Before we start running anything, we insert a record for every -- command in the test run. The initial status is 'S'. -CREATE TABLE status ( +CREATE TABLE sm_status ( run VARCHAR(100), -- name of this run @@ -19,6 +19,7 @@ CREATE TABLE status ( -- R = started, not finished -- S = skipped -- P = prereq not satisfied, so not attempted + -- E = error internal to steuermann -- 0-255 = exit code start_time VARCHAR(30), @@ -37,13 +38,13 @@ CREATE TABLE status ( ); -create unique index idx_status_1 on status ( run, host, tablename, cmd ); +create unique index sm_status_idx1 on sm_status ( run, host, tablename, cmd ); -- table lists all run names in the system -CREATE TABLE runs ( +CREATE TABLE sm_runs ( run VARCHAR(100) ); -CREATE UNIQUE INDEX idx_runs_run ON runs(run); +CREATE UNIQUE INDEX sm_runs_idx1 ON sm_runs(run); diff --git a/steuermann/hosts.ini b/steuermann/hosts.ini index 0f0c043..f8c2e3b 100644 --- a/steuermann/hosts.ini +++ b/steuermann/hosts.ini @@ -17,9 +17,12 @@ [all] hostname=no_such_machine -local=[ 'sh', '-c', '%(script)s' ] maxproc=2 +; local really applies the same to all the machines, but this is a +; convenient place to stash it for now. +local=[ 'sh', '-c', '%(script)s' ] + [linux:csh] ; for CSH ; @@ -74,6 +77,12 @@ like=linux:csh workdir=/arzach/data1/iraf/steuermann maxproc=4 +[ssb] +hostname=ssbwebv1 +like=linux:csh +workdir=/ssbwebv1/data1/iraf/work +maxproc=2 + [bond] hostname=bond like=mac:csh diff --git a/steuermann/nodes.py b/steuermann/nodes.py index 5e9852a..0f24e36 100644 --- a/steuermann/nodes.py +++ b/steuermann/nodes.py @@ -110,11 +110,21 @@ class command_tree(object): # crack open host:table/cmd def crack_name(name) : - t = name.split(':') - host = t[0] - t = t[1].split('/') - table = t[0] - cmd = t[1] + if ':' in name : + t = name.split(':') + host = t[0] + name = t[1] + else : + host = '*' + + if '/' in name : + t = name.split('/') + table = t[0] + cmd = t[1] + else : + table = '*' + cmd = name + return (host, table, cmd) ##### diff --git a/steuermann/report.py b/steuermann/report.py index 82da984..cc90604 100644 --- a/steuermann/report.py +++ b/steuermann/report.py @@ -9,6 +9,21 @@ import pandokia.text_table as text_table import pandokia.common import StringIO +# maybe the output is html 3.2 - in any case, it is way simpler than +# more recent standards. +html_header='''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"> +<HTML> +<HEAD> +<TITLE>%(title)s</TITLE> +</HEAD> +<BODY> +''' + +html_trailer=''' +</BODY> +</HTML> +''' + # this will be reset by the cgi main program if we are in a real cgi cginame = 'arf.cgi' @@ -16,7 +31,7 @@ cginame = 'arf.cgi' def info_callback_status( db, run, tablename, host, cmd ) : c = db.cursor() - c.execute("SELECT status FROM status WHERE run = ? AND host = ? AND tablename = ? AND cmd = ?",( + c.execute("SELECT status FROM sm_status WHERE run = ? AND host = ? AND tablename = ? AND cmd = ?",( run, host, tablename, cmd ) ) status, = c.fetchone() return status @@ -29,7 +44,7 @@ simple_status = ( 'N', 'P', 'S', 'W' ) def info_callback_gui( db, run, tablename, host, cmd ) : c = db.cursor() - c.execute("SELECT status, start_time, end_time FROM status WHERE run = ? AND host = ? AND tablename = ? AND cmd = ?",( + c.execute("SELECT status, start_time, end_time FROM sm_status WHERE run = ? AND host = ? AND tablename = ? AND cmd = ?",( run, host, tablename, cmd ) ) x = c.fetchone() if x is None : @@ -81,7 +96,7 @@ def info_callback_debug_table_cell( db, run, tablename, cmd, host ) : def get_table_list( db, run_name ) : c = db.cursor() - c.execute("select max(depth) as d, tablename from status where run = ? group by tablename order by d asc",(run_name,)) + c.execute("select max(depth) as d, tablename from sm_status where run = ? group by tablename order by d asc",(run_name,)) table_list = [ x for x in c ] # table_list contains ( depth, tablename ) return table_list @@ -97,25 +112,28 @@ def get_table( db, run_name, tablename, info_callback, showdepth=0 ) : t.define_column('depth') c = db.cursor() - c.execute("select distinct host from status where tablename = ? and run = ? order by host asc",(tablename, run_name)) + c.execute("select distinct host from sm_status where tablename = ? and run = ? order by host asc",(tablename, run_name)) for host, in c : t.define_column(host) - c.execute("""select cmd, host, depth, status, start_time, end_time, notes from status - where tablename = ? and run = ? order by depth, cmd asc + c.execute("select cmd, max(depth) as d from sm_status where tablename = ? and run = ? group by cmd order by d asc",(tablename, run_name)) + row = -1 + cmd_to_row = { } + for cmd, depth in c : + row = row + 1 + cmd_to_row[cmd] = row + t.set_value(row, 0, cmd) + if showdepth : + t.set_value(row, 'depth', depth) + + c.execute("""select cmd, host, status, start_time, end_time, notes from sm_status + where tablename = ? and run = ? order by cmd asc """, ( tablename, run_name ) ) - row = -1 - prev_cmd = None + row = 0 for x in c : - cmd, host, depth, status, start_time, end_time, notes = x - if cmd != prev_cmd : - row = row + 1 - t.set_value(row, 0, cmd) - if showdepth : - t.set_value(row, 'depth', depth) - prev_cmd = cmd - + cmd, host, status, start_time, end_time, notes = x + row = cmd_to_row[cmd] info = info_callback( db, run_name, tablename, host, cmd ) if isinstance(info, tuple) : t.set_value( row, host, text=info[0], html=info[1] ) @@ -149,6 +167,7 @@ def report_text( db, run_name, info_callback = info_callback_status ) : def report_html( db, run_name, info_callback = info_callback_status, hlevel=1 ) : s = StringIO.StringIO() + s.write(html_header % { 'title' : run_name } ) s.write('<h%d>%s</h%d>\n'%(hlevel,run_name,hlevel)) hlevel = hlevel + 1 @@ -160,6 +179,8 @@ def report_html( db, run_name, info_callback = info_callback_status, hlevel=1 ) t = get_table( db, run_name, tablename, info_callback, showdepth=1 ) s.write(t.get_html()) + s.write(html_trailer) + return s.getvalue() # diff --git a/steuermann/run.py b/steuermann/run.py index 36cda80..af52a02 100644 --- a/steuermann/run.py +++ b/steuermann/run.py @@ -7,6 +7,8 @@ import subprocess import time import datetime import os +import traceback +import sys import ConfigParser @@ -20,6 +22,9 @@ class struct : ##### +class run_exception(Exception) : + pass + class runner(object): # dict of all current running processes, indexed by node name @@ -51,100 +56,112 @@ class runner(object): ##### # start a process - def run( self, node, run_name ): + def run( self, node, run_name, no_run = False ): try : - args = self.get_host_info(node.host) - except : - print "ERROR: do not know how to run on %s"%node.host - raise + try : + args = self.get_host_info(node.host) + except Exception, e : + log_traceback() + print "ERROR: do not know how to run on %s"%node.host + print e + raise - if 'maxproc' in args : hostname = args['hostname'] + if 'maxproc' in args : - n = int(self.howmany.get(hostname,0)) - if n >= int(args['maxproc']) : - print "decline to run %s - %d other already running"%(node.name,n) - return False - - n = n + 1 - self.howmany[hostname] = n - print "running %s %s %d"%(hostname,node.name, n) - else : - print "running %s %s no maxproc"%(hostname, node.name) - - if debug : - print "run",node.name - if debug : - print "....%s:%s/%s\n"%(node.host, node.table, node.cmd) - - node.running = 1 - - args = args.copy() - args.update( - script=node.script, - script_type=node.script_type, - host=node.host, - table=node.table, - cmd=node.cmd, - node=node.name, - ) - - if debug : - print "ARGS" - for x in sorted([x for x in args]) : - print '%s=%s'%(x,args[x]) - - args['script'] = args['script'] % args - - if args['script_type'] == 'r' : - run = args['run'] - elif args['script_type'] == 'l' : - run = args['local'] - else : - raise Exception() - - t = [ ] - for x in run : - # bug: what to do in case of keyerror - t.append( x % args ) - - run = t + n = int(self.howmany.get(hostname,0)) + if n >= int(args['maxproc']) : + print "decline to run %s - %d other already running"%(node.name,n) + return False - if debug : - print "RUN",run - - # make sure the log directory is there - logdir= self.logdir + "/%s"%run_name - try : - os.makedirs(logdir) - except OSError: - pass - - # create a name for the log file, but do not use / in the name - logfile_name = "%s/%s.log"%( logdir, node.name.replace('/','.') ) - - # open the log file, write initial notes - logfile=open(logfile_name,"w") - logfile.write('%s %s\n'%(datetime.datetime.now(),run)) - logfile.flush() - - # start running the process - p = subprocess.Popen(args=run, - stdout=logfile, - stderr=subprocess.STDOUT, - shell=False, close_fds=True) - - # remember the popen object for the process; remember the open log file - n = struct() - n.proc = p - n.logfile = logfile - n.logfile_name = logfile_name - - # remember the process is running - self.all_procs[node.name] = n - - return True + n = n + 1 + self.howmany[hostname] = n + print "running %s %s %d"%(hostname,node.name, n) + else : + print "running %s %s no maxproc"%(hostname, node.name) + + if debug : + print "run",node.name + if debug : + print "....%s:%s/%s\n"%(node.host, node.table, node.cmd) + + node.running = 1 + + args = args.copy() + args.update( + script=node.script, + script_type=node.script_type, + host=node.host, + table=node.table, + cmd=node.cmd, + node=node.name, + ) + + if debug : + print "ARGS" + for x in sorted([x for x in args]) : + print '%s=%s'%(x,args[x]) + + args['script'] = args['script'] % args + + if args['script_type'] == 'r' : + run = args['run'] + elif args['script_type'] == 'l' : + run = args['local'] + else : + raise Exception() + + t = [ ] + for x in run : + # bug: what to do in case of keyerror + t.append( x % args ) + + run = t + + if debug : + print "RUN",run + + # make sure the log directory is there + logdir= self.logdir + "/%s"%run_name + try : + os.makedirs(logdir) + except OSError: + pass + + # create a name for the log file, but do not use / in the name + logfile_name = "%s/%s.log"%( logdir, node.name.replace('/','.') ) + + # open the log file, write initial notes + logfile=open(logfile_name,"w") + logfile.write('%s %s\n'%(datetime.datetime.now(),run)) + logfile.flush() + + # debug - just say the name of the node we would run + if no_run : + run = [ 'echo', 'no_run - node=', node.name ] + + # start running the process + p = subprocess.Popen(args=run, + stdout=logfile, + stderr=subprocess.STDOUT, + shell=False, close_fds=True) + + # remember the popen object for the process; remember the open log file + n = struct() + n.proc = p + n.logfile = logfile + n.logfile_name = logfile_name + + # remember the process is running + self.all_procs[node.name] = n + + return True + + except Exception, e : + log_traceback() + txt= "ERROR RUNNING %s"%node.name + raise run_exception(txt) ##### # callback when a node finishes @@ -155,16 +172,12 @@ class runner(object): args = self.get_host_info(node.host) - if 'maxproc' in args : - hostname = args['hostname'] + hostname = args['hostname'] - n = int(self.howmany.get(hostname,0)) - n = n - 1 + n = self.howmany[hostname] - 1 + self.howmany[hostname] = n - self.howmany[hostname] = n - print "finish %s %s %d"%(hostname,node_name,n) - else : - print "finish %s %s no maxproc"%(hostname,node_name) + print "finish %s %s %d"%(hostname,node_name,n) # note the termination of the process at the end of the log file logfile = self.all_procs[node_name].logfile @@ -225,14 +238,18 @@ class runner(object): def _host_get_names( self, cfg, section ) : d = { } # pick all the variables out of this section - for name, value in cfg.items(section) : - if value.startswith('[') : - # it is a list - d[name] = eval(value) - else : - # everything else is plain text - d[name] = value - return d + try : + for name, value in cfg.items(section) : + if value.startswith('[') : + # it is a list + d[name] = eval(value) + else : + # everything else is plain text + d[name] = value + return d + except ConfigParser.NoSectionError : + print "No config section in hosts.ini: %s"%section + return { } def load_host_info( self, filename=None ) : @@ -243,26 +260,55 @@ class runner(object): self.cfg.read(filename) def get_host_info(self, host) : - if debug: - print "enter get_host_info",host if not host in self.host_info_cache : - d = self._host_get_names(self.cfg, host) - if debug: - print "in get_host_info, got names for ",host, d + if 'like' in d : - if debug: - print "has like", d['like'] - d1 = self.get_host_info(d['like']).copy() - del d['like'] + # get the dict of what this entry is like, copy it, + # and update it with the values for this entry + d1 = self.get_host_info(d['like']) + d1 = d1.copy() d1.update(d) - self.host_info_cache[host] = d1 - else : - print "end of chain",host,d - self.host_info_cache[host] = d + d = d1 + print d + del d['like'] - if debug: - print "leave get_host_info",host, self.host_info_cache[host] + # default hostname is the name from the section header + if not 'hostname' in d : + d['hostname'] = host + + # default maximum processes is 1 + if not 'maxproc' in d : + d['maxproc'] = 1 + + self.host_info_cache[host] = d return self.host_info_cache[host] ##### + +# The traceback interface is awkward in python; here is something I copied from pyetc: + +def log_traceback() : + # You would think that the python traceback module contains + # something useful to do this, but it always returns multi-line + # strings. I want each line of output logged separately so the log + # file remains easy to process, so I reverse engineered this out of + # the logging module. + try: + etype, value, tb = sys.exc_info() + tbex = traceback.extract_tb( tb ) + for filename, lineno, name, line in tbex : + print '%s:%d, in %s'%(filename,lineno,name) + if line: + print ' %s'%line.strip() + + for x in traceback.format_exception_only( etype, value ) : + print ": %s",x + + print "---" + + finally: + # If you don't clear these guys, you can make loops that + # the garbage collector has to work hard to eliminate. + etype = value = tb = None + diff --git a/steuermann/run_all.py b/steuermann/run_all.py index e7bff61..f670abe 100644 --- a/steuermann/run_all.py +++ b/steuermann/run_all.py @@ -26,6 +26,7 @@ except ImportError : def main() : global xnodes + global no_run # read all the input files if readline : @@ -37,19 +38,38 @@ def main() : import atexit atexit.register(readline.write_history_file, history) - opt, args = easyargs.get( { '-a' : '--all', - '--all' : '-a', + +# easyargs spec definition: +# +# '-v' : '', # arg takes no parameter, opt['-v'] is +# # how many times it occurred +# '-f' : '=', # arg takes a parameter +# '-mf' : '=+', # arg takes a parameter, may be specified +# # several times to get a list +# '--verbose' : '-v', # arg is an alias for some other arg + + opt, args = easyargs.get( { + '--all' : '-a' , + '-a' : '' , # run all nodes non-interactively + '-r' : '=' , # give run name + '-n' : '' , # do not actually execute any processes } ) # # - all = '--all' in opt + all = opt['-a'] + no_run = opt['-n'] di_nodes = nodes.read_file_list( args ) xnodes = di_nodes.node_index - run_name = str(datetime.datetime.now()).replace(' ','_') + + if '-r' in opt : + run_name = opt['-r'] + else : + run_name = str(datetime.datetime.now()).replace(' ','_') + db = steuermann.config.open_db() if all : @@ -59,6 +79,16 @@ def main() : # +def find_wild_names( xnodes, name ) : + print "find_wild",name + l = [ ] + for x in xnodes : + if nodes.wildcard_name( name, x ) : + print "...",x + l.append(x) + return l +# + def do_flag( xnodes, name, recursive, fn, verbose ) : if verbose : verbose = verbose + 1 @@ -84,7 +114,6 @@ def do_flag( xnodes, name, recursive, fn, verbose ) : else : if verbose : print ' '*verbose, "not in list", name - raise Exception() def set_want( node ) : # if we said we want it, mark it as wanted and don't skip @@ -111,6 +140,17 @@ def cmd_flagging( l, xnodes, func ) : for x in l : do_flag( xnodes, x, recursive, func, 1 ) + +# +def print_node(xnodes, x, print_recursive, print_all, indent=0): + print ' '*indent, xnodes[x].wanted, xnodes[x].finished, xnodes[x].skip, x + if print_all : + l = [ a.name for a in xnodes[x].predecessors ] + print ' '*indent, " AFTER", ' '.join(l) + if print_recursive : + for x in l : + print_node( xnodes, x, print_recursive, print_all, indent=indent+8) + # helpstr = """ @@ -128,6 +168,9 @@ pre node show what must come before a node def run_interactive( xnodes, run_name, db) : + org_run_name = run_name + run_count = 0 + register_database(db, run_name, xnodes) runner = run.runner( xnodes, steuermann.config.logdir ) @@ -195,18 +238,37 @@ def run_interactive( xnodes, run_name, db) : for x in xnodes : xnodes[x].finished = 0 - run_name = str(datetime.datetime.now()).replace(' ','_') + run_name = org_run_name + '.%d'%run_count + run_count = run_count + 1 print "new run name",run_name register_database(db, run_name, xnodes) elif n == 'list' : - print_all = '-a' in l - l = sorted ( [ x for x in xnodes ] ) + l = l[1:] + if len(l) > 0 and l[0] == '-a' : + l = l[1:] + print_all = 1 + else : + print_all = 0 + + if len(l) > 0 and l[0] == '-r' : + l = l[1:] + print_recursive=1 + else : + print_recursive=0 + + if len(l) == 0 : + all = [ x for x in xnodes ] + else : + all = [ ] + for x in l : + all = all + find_wild_names( xnodes, x ) + + all = sorted(all) + print "recursive",print_recursive print "w f s name" - for x in l : - print xnodes[x].wanted, xnodes[x].finished, xnodes[x].skip, x - if print_all : - print " AFTER", ' '.join([ a.name for a in xnodes[x].predecessors ]) + for x in all : + print_node(xnodes, x, print_recursive, all) elif n == 'wait' : c = db.cursor() @@ -214,7 +276,7 @@ def run_interactive( xnodes, run_name, db) : host, tablename, cmd = nodes.crack_name(x) if xnodes[x].wanted : status = 'W' - c.execute("UPDATE status SET status = 'W' WHERE run = ? AND host = ? AND tablename = ? AND cmd = ? AND status = 'N'", + c.execute("UPDATE sm_status SET status = 'W' WHERE run = ? AND host = ? AND tablename = ? AND cmd = ? AND status = 'N'", (run_name, host, tablename, cmd) ) db.commit() @@ -286,13 +348,13 @@ def print_pre(who, xnodes, depth) : def register_database(db, run, xnodes ) : c = db.cursor() - c.execute('INSERT INTO runs ( run ) VALUES ( ? )', ( run, ) ) + c.execute('INSERT INTO sm_runs ( run ) VALUES ( ? )', ( run, ) ) c = db.cursor() for x in xnodes : host, tablename, cmd = nodes.crack_name(x) depth = xnodes[x].depth - c.execute("INSERT INTO status ( run, host, tablename, cmd, depth, status ) VALUES " + c.execute("INSERT INTO sm_status ( run, host, tablename, cmd, depth, status ) VALUES " "( ?, ?, ?, ?, ?, 'N' )", ( run, host, tablename, cmd, depth ) ) db.commit() @@ -365,16 +427,26 @@ def run_step( runner, xnodes, run_name, db ) : x.finished = 1 no_sleep = 1 keep_running = 1 - db.execute("UPDATE status SET start_time = ?, status = 'S' WHERE ( run = ? AND host = ? AND tablename = ? AND cmd = ? )", + db.execute("UPDATE sm_status SET start_time = ?, status = 'S' WHERE ( run = ? AND host = ? AND tablename = ? AND cmd = ? )", ( str(datetime.datetime.now()), run_name, host, table, cmd ) ) db.commit() else : - if runner.run(x, run_name) : - # returns true/false whether it actually ran it - it may not because of resource limits - db.execute("UPDATE status SET start_time = ?, status = 'R' WHERE ( run = ? AND host = ? AND tablename = ? AND cmd = ? )", - ( str(datetime.datetime.now()), run_name, host, table, cmd ) ) - db.commit() + try : + if runner.run(x, run_name, no_run=no_run) : + # returns true/false whether it actually ran it - it may not because of resource limits + db.execute("UPDATE sm_status SET start_time = ?, status = 'R' WHERE ( run = ? AND host = ? AND tablename = ? AND cmd = ? )", + ( str(datetime.datetime.now()), run_name, host, table, cmd ) ) + except run.run_exception, e : + now = str(datetime.datetime.now()) + db.execute("UPDATE sm_status SET start_time=?, end_time=?, status='E', notes=? WHERE ( run=? AND host=? AND tablename=? AND cmd=? )", + ( now, now, repr(e), run_name, host, table, cmd ) ) + x.finished = 1 + no_sleep = 1 + keep_running = 1 + + db.commit() + # if anything has exited, we process it and update the status in the database while 1 : @@ -390,7 +462,7 @@ def run_step( runner, xnodes, run_name, db ) : # note who and log it x_host, x_table, x_cmd = nodes.crack_name(who_exited[0]) - db.execute("UPDATE status SET end_time = ?, status = ? WHERE ( run = ? AND host = ? AND tablename = ? AND cmd = ? )", + db.execute("UPDATE sm_status SET end_time = ?, status = ? WHERE ( run = ? AND host = ? AND tablename = ? AND cmd = ? )", ( str(datetime.datetime.now()), who_exited[1], run_name, x_host, x_table, x_cmd ) ) db.commit() |