checkpoint

git-svn-id: https://svn.stsci.edu/svn/ssb/etal/steuermann/trunk@430 d34015c8-bcbb-4646-8ac8-8ba5febf221d
author: sienkiew <sienkiew@d34015c8-bcbb-4646-8ac8-8ba5febf221d> 2011-09-27 11:46:09 -0400
committer: sienkiew <sienkiew@d34015c8-bcbb-4646-8ac8-8ba5febf221d> 2011-09-27 11:46:09 -0400
commit: 61285fb53a2b871d52e27e4e8ecc4d7da6e09e1b (patch)
tree: 942866a36819b21731f74c157bda1ffb19081c51
parent: ff102cd2b89daf9a0feea2e10503e780d2454e29 (diff)
download: steuermann-61285fb53a2b871d52e27e4e8ecc4d7da6e09e1b.tar.gz
11 files changed, 557 insertions, 193 deletions
diff --git a/README b/README
index b0cd1d7..8054a62 100644
--- a/README
+++ b/README
@@ -7,7 +7,5 @@ make
 
 python setup.py install
 
-smc test.sm
-?
-
+smc [ -a ] [ -r run_name ] file.sm
 
diff --git a/dev.sm b/dev.sm
index 1def284..cb55c8e 100644
--- a/dev.sm
+++ b/dev.sm
@@ -1,5 +1,11 @@
+## TODO:
+##  add builds on ssbwebv1 for those things that we actually care about
 
-TABLE assemble HOST rhe5-64
+####################
+####################
+
+# arzach assembles all the source code
+TABLE assemble HOST arzach
 	CMD dev.stsci_python RUN "assemble_stsci_python dev"
 		AFTER init/*
 
@@ -12,7 +18,16 @@ TABLE assemble HOST rhe5-64
 	CMD dev.hstcal RUN "assemble_hstcal dev"
 		AFTER init/*
 
-TABLE build HOST rhe4-32 rhe4-64 rhe5-64 leopard snow-leopard
+	CMD nop RUN "sleep 1"
+		AFTER init/*
+
+####################
+####################
+
+# install stsci_python into default environment
+# build hstcal
+#	- everywhere
+TABLE build HOST herbert thor arzach bond cadeau
 	CMD dev.py2.7 RUN "build_stsci_python dev 2.7"
 		AFTER init/*
 		AFTER *:assemble/dev.stsci_python
@@ -20,7 +35,9 @@ TABLE build HOST rhe4-32 rhe4-64 rhe5-64 leopard snow-leopard
 		AFTER init/*
 		AFTER *:assemble/dev.hstcal
 
-TABLE build HOST rhe5-64 
+# older python environments
+#	- arzach only
+TABLE build HOST arzach 
 	CMD dev.py2.6 RUN "build_stsci_python dev 2.6"
 		AFTER init/*
 		AFTER *:assemble/dev.stsci_python
@@ -29,7 +46,9 @@ TABLE build HOST rhe5-64
 		AFTER init/*
 		AFTER *:assemble/dev.stsci_python
 
-TABLE build HOST rhe4-32 leopard
+# stsdas and friends
+#	- 32 bit only
+TABLE build HOST herbert bond
 	CMD dev.axe RUN "build_axe dev"
 		AFTER init/*
 		AFTER *:assemble/dev.axe
@@ -48,14 +67,166 @@ TABLE build HOST rhe4-32 leopard
 		AFTER build/dev.stsci_iraf
 
 
-TABLE build HOST rhe4-64 rhe5-64
+# stsdas for 64 bit machines - get it from a related 32 bit system
+TABLE build HOST thor arzach
 	CMD dev.stsci_iraf_64hack RUN "build_stsci_iraf_64hack dev herbert"
-	AFTER rhe4-32:build/dev.stsci_iraf*
+	AFTER herbert:build/dev.stsci_iraf*
 
-TABLE build HOST snow-leopard
+TABLE build HOST cadeau
 	CMD dev.stsci_iraf_64hack RUN "build_stsci_iraf_64hack dev cadeau"
-	AFTER rhe4-32:build/dev.stsci_iraf*
+	AFTER bond:build/dev.stsci_iraf*
 
-TABLE build HOST rhe5-64
+# stsci_python documentation
+#	- one machine only
+TABLE build HOST arzach
 	CMD dev.stsci_python_sphinxdocs RUN "build_sphinxdocs dev 2.7"
 	AFTER build/dev.py2.7
+
+# old epydoc documentation - only works on thor; hope we can get rid of
+# epydoc sooner than we have to do anything about this.
+TABLE build HOST thor
+	CMD dev.stsci_python_epydoc RUN "/thor/data2/iraf/epydoc_test/nightly"
+	AFTER build/dev.py2.7
+
+# stamp the IRAF banner file when the builds are complete
+TABLE stamp HOST herbert thor arzach bond cadeau
+	CMD dev RUN "build_stamp dev"
+		AFTER build/*
+
+####################
+####################
+
+# regular distributions
+
+TABLE distribute HOST herbert thor arzach
+	CMD dev.iraf RUN "synctool - irafdev"
+		AFTER stamp/dev
+	CMD dev.pyssg RUN "synctool - pyssgdev"
+		AFTER stamp/dev
+	CMD dev.stsci_iraf RUN "synctool - stsci_iraf_dev"
+		AFTER stamp/dev
+	CMD dev.hstcal RUN "synctool - hstcal_dev"
+		AFTER stamp/dev
+	CMD dev.motd RUN "synctool - irafdev/iraf/unix/hlib/motd"
+		AFTER distribute/dev.iraf
+
+TABLE distribute HOST bond cadeau
+	CMD irafdev.pkg RUN "cd $HOME/daily_build/mac_package; ./clean ; ./build dev " AFTER stamp/dev
+	CMD irafdev.dmg RUN "cd $HOME/daily_build/mac_package; ./distribute dev" AFTER irafdev.pkg
+
+# wads of special cases
+
+# jwcalibdev has local disk - some day it may do its own builds
+TABLE distribute HOST arzach
+	CMD jwcalibdev.iraf RUN "synctool jwcalibdev: irafdev"
+		AFTER stamp/dev
+	CMD jwcalibdev.pyssg RUN "synctool jwcalibdev: pyssgdev"
+		AFTER stamp/dev
+	CMD jwcalibdev.stsci_iraf RUN "synctool jwcalibdev: stsci_iraf_dev"
+		AFTER stamp/dev
+	CMD jwcalibdev.hstcal RUN "synctool jwcalibdev: hstcal_dev"
+		AFTER stamp/dev
+	CMD jwcalibdev.motd RUN "synctool jwcalibdev: irafdev/iraf/unix/hlib/motd"
+		AFTER jwcalibdev.iraf
+
+# goods - has RHE 5 only now
+
+TABLE distribute_other HOST arzach
+	CMD goods.iraf RUN "synctool goods12: irafdev"
+		AFTER stamp/dev
+	CMD goods.pyssg RUN "synctool goods12: pyssgdev"
+		AFTER stamp/dev
+	CMD goods.stsci_iraf RUN "synctool goods12: stsci_iraf_dev"
+		AFTER stamp/dev
+	CMD goods.hstcal RUN "synctool goods12: hstcal_dev"
+		AFTER stamp/dev
+	CMD goods.motd RUN "synctool goods12: irafdev/iraf/unix/hlib/motd"
+		AFTER goods.iraf
+
+# witserv1 - who are these guys?
+
+TABLE distribute_other HOST arzach
+	CMD witserv1.iraf RUN "synctool witserv1: irafdev"
+		AFTER stamp/dev
+	CMD witserv1.pyssg RUN "synctool witserv1: pyssgdev"
+		AFTER stamp/dev
+	CMD witserv1.stsci_iraf RUN "synctool witserv1: stsci_iraf_dev"
+		AFTER stamp/dev
+	CMD witserv1.hstcal RUN "synctool witserv1: hstcal_dev"
+		AFTER stamp/dev
+	CMD witserv1.motd RUN "synctool witserv1: irafdev/iraf/unix/hlib/motd"
+		AFTER witserv1.iraf
+
+# dmsinsvm - have a pipeline and irafx/irafdev on the same machine for INS
+
+TABLE distribute_other HOST arzach
+	CMD dmsinsvm.iraf RUN "synctool dmsinsvm: irafdev"
+		AFTER stamp/dev
+	CMD dmsinsvm.pyssg RUN "synctool dmsinsvm: pyssgdev"
+		AFTER stamp/dev
+	CMD dmsinsvm.stsci_iraf RUN "synctool dmsinsvm: stsci_iraf_dev"
+		AFTER stamp/dev
+	CMD dmsinsvm.hstcal RUN "synctool dmsinsvm: hstcal_dev"
+		AFTER stamp/dev
+	CMD dmsinsvm.motd RUN "synctool dmsinsvm: irafdev/iraf/unix/hlib/motd"
+		AFTER dmsinsvm.iraf
+
+# UDF - another funded project with their own machines
+
+TABLE distribute_other HOST thor
+	CMD udf1.iraf RUN "synctool udf1: irafdev"
+		AFTER stamp/dev
+	CMD udf1.pyssg RUN "synctool udf1: pyssgdev"
+		AFTER stamp/dev
+	CMD udf1.stsci_iraf RUN "synctool udf1: stsci_iraf_dev"
+		AFTER stamp/dev
+	CMD udf1.hstcal RUN "synctool udf1: hstcal_dev"
+		AFTER stamp/dev
+	CMD udf1.motd RUN "synctool udf1: irafdev/iraf/unix/hlib/motd"
+		AFTER udf1.iraf
+
+# royal - a beowulf cluster
+
+TABLE distribute_other HOST thor
+	CMD royal.iraf RUN "synctool royal: irafdev"
+		AFTER stamp/dev
+	CMD royal.pyssg RUN "synctool royal: pyssgdev"
+		AFTER stamp/dev
+	CMD royal.stsci_iraf RUN "synctool royal: stsci_iraf_dev"
+		AFTER stamp/dev
+	CMD royal.hstcal RUN "synctool royal: hstcal_dev"
+		AFTER stamp/dev
+	CMD royal.motd RUN "synctool royal: irafdev/iraf/unix/hlib/motd"
+		AFTER royal.iraf
+
+####################
+####################
+
+# This structure is meant to serialize the regtests so that nothing else
+# is going on at the same time.  The regtests can consume all the CPUs.
+TABLE regtest HOST herbert thor arzach bond cadeau
+	CMD okify RUN "test_okify"
+		AFTER distribute/*
+	CMD dev.test2.7 RUN "test_rt dev 2.7"
+		AFTER okify
+
+TABLE regtest HOST arzach 
+	CMD dev.test2.6 RUN "test_rt dev 2.6"
+		AFTER dev.test2.7
+	CMD dev.test2.5 RUN "test_rt dev 2.5"
+		AFTER dev.test2.6
+
+TABLE regtest HOST thor
+	CMD dev.contact RUN "test_contact"
+		AFTER dev.test2.7
+
+	# change this to have each test batch import independently
+TABLE regtest_import HOST ssb
+	CMD dev.import RUN "test_import daily"
+		AFTER *:regtest/*
+	CMD dev.notify RUN "test_notify daily"
+		AFTER dev.import
+	CMD dev.sqlite_watch RUN "test_sqlite_watch"
+		AFTER dev.import
+	CMD dev.mysql_watch RUN "test_mysql_watch"
+		AFTER dev.import
diff --git a/init.sm b/init.sm
index 7297f85..af058f6 100644
--- a/init.sm
+++ b/init.sm
@@ -1,9 +1,9 @@
-TABLE init HOST leopard snow-leopard rhe5-64 rhe4-64 rhe4-32
+TABLE init HOST bond cadeau arzach thor herbert
 	CMD sendscripts LOCAL "/eng/ssb/auto/steuermann_scripts/init_sendscripts %(hostname)s %(workdir)s"
 	CMD sysstat RUN "sysstat"
 		AFTER sendscripts
 
-TABLE init HOST rhe5-64
+TABLE init HOST arzach
         CMD svnsync RUN "assemble_svnsync"
                 AFTER OPT init/irafx_update
 		AFTER sendscripts
diff --git a/scripts/steuermann_report.cgi b/scripts/steuermann_report.cgi
index a25d407..abf371b 100644
--- a/scripts/steuermann_report.cgi
+++ b/scripts/steuermann_report.cgi
@@ -35,18 +35,23 @@ def sqltime(arg) :
     return d
 
 
+##########
+# if no action specified, show the list of runs
+#
 if not 'action' in form :
     print 'content-type: text/html'
     print ''
     db = steuermann.config.open_db()
     c = db.cursor()
-    c.execute('SELECT DISTINCT run FROM status ORDER BY run DESC')
+    c.execute('SELECT DISTINCT run FROM sm_status ORDER BY run DESC')
     for run, in c :
         print "<a href=%s?action=status&run=%s>%s</a><br>"%(cginame, run, run)
     sys.exit(0)
 
 action = form['action'].value
-
+##########
+# status means show the status of a particular run
+#
 if action == 'status' :
     db = steuermann.config.open_db()
     import steuermann.report
@@ -57,6 +62,9 @@ if action == 'status' :
     print steuermann.report.report_html( db, run, info_callback=steuermann.report.info_callback_gui )
     sys.exit(0)
 
+##########
+# log means show the result of a particular node from a run
+#
 elif action == 'log' :
     print 'content-type: text/plain'
     print ''
@@ -71,7 +79,7 @@ elif action == 'log' :
 
     db = steuermann.config.open_db()
     c = db.cursor()
-    c.execute("SELECT status, start_time, end_time, notes FROM status WHERE run = ? AND host = ? AND tablename = ? AND cmd = ?",(
+    c.execute("SELECT status, start_time, end_time, notes FROM sm_status WHERE run = ? AND host = ? AND tablename = ? AND cmd = ?",(
             run, host, table, cmd ) )
     x = c.fetchone()
     if x is None :
@@ -95,17 +103,43 @@ elif action == 'log' :
         for x in [ '    ' + x for x in notes.split('\n') ] :
             print x
     print ""
-    print "--------------------"
     filename = '%s/%s/%s:%s.%s.log'%(steuermann.config.logdir,run,host,table,cmd)
-    f=open(filename,'r')
-    while 1 :
-        x = f.read(65536)
-        if x == '' :
-            break
-        sys.stdout.write(x)
+    try :
+        f=open(filename,'r')
+    except IOError:
+        print "No log file %s" %filename
+        f = None
+    print "--------------------"
+
+    if f :
+        while 1 :
+            x = f.read(65536)
+            if x == '' :
+                break
+            sys.stdout.write(x)
+
     sys.exit(0)
 
+##########
+# info means show information about the system
+#
+elif action == 'info' :
+    print 'content-type: text/html\n'
+    print 'db credentials: ',steuermann.config.db_creds,'<br>'
+    print 'logdir: ',steuermann.config.logdir,'<br>'
+    db = steuermann.config.open_db()
+    cur = db.cursor()
+    cur.execute("select count(*) from sm_status")
+    l = cur.fetchone()
+    print "database records: %s\n"%l[0],'<br>'
+    cur.execute("select count(*) from sm_runs")
+    l = cur.fetchone()
+    print "runs: %s\n"%l[0],'<br>'
+    sys.exit(0)
+
+##########
+
 print 'content-type: text/html'
 print ''
-print 'no action?'
+print 'no recognized action?'
 
diff --git a/steuermann/config.py b/steuermann/config.py
index 7419703..b10881b 100644
--- a/steuermann/config.py
+++ b/steuermann/config.py
@@ -1,5 +1,7 @@
+db_creds = '/ssbwebv1/data2/steuermann/steuermann.db'
+
 def open_db() :
     import sqlite3
-    return sqlite3.connect('/ssbwebv1/data2/steuermann/steuermann.db')
+    return sqlite3.connect(db_creds)
 
 logdir = '/ssbwebv1/data2/steuermann/logs'
diff --git a/steuermann/db.sql b/steuermann/db.sql
index dd6a569..b6d2ae6 100644
--- a/steuermann/db.sql
+++ b/steuermann/db.sql
@@ -2,7 +2,7 @@
 -- Before we start running anything, we insert a record for every
 -- command in the test run.  The initial status is 'S'.
 
-CREATE TABLE status ( 
+CREATE TABLE sm_status ( 
 	run	VARCHAR(100),
 		-- name of this run
 
@@ -19,6 +19,7 @@ CREATE TABLE status (
 		-- R = started, not finished
 		-- S = skipped
 		-- P = prereq not satisfied, so not attempted
+		-- E = error internal to steuermann
 		-- 0-255 = exit code
 
 	start_time	VARCHAR(30),
@@ -37,13 +38,13 @@ CREATE TABLE status (
 	);
 
 
-create unique index idx_status_1 on status ( run, host, tablename, cmd );
+create unique index sm_status_idx1 on sm_status ( run, host, tablename, cmd );
 
 
 -- table lists all run names in the system
-CREATE TABLE runs (
+CREATE TABLE sm_runs (
 	run	VARCHAR(100)
 	);
 
-CREATE UNIQUE INDEX idx_runs_run ON runs(run);
+CREATE UNIQUE INDEX sm_runs_idx1 ON sm_runs(run);
 
diff --git a/steuermann/hosts.ini b/steuermann/hosts.ini
index 0f0c043..f8c2e3b 100644
--- a/steuermann/hosts.ini
+++ b/steuermann/hosts.ini
@@ -17,9 +17,12 @@
 
 [all]
 hostname=no_such_machine
-local=[ 'sh', '-c', '%(script)s' ]
 maxproc=2
 
+; The "local" command is really the same for every machine, but this
+; section is a convenient place to stash it for now.
+local=[ 'sh', '-c', '%(script)s' ]
+
 [linux:csh]
 ; for CSH
 ;
@@ -74,6 +77,12 @@ like=linux:csh
 workdir=/arzach/data1/iraf/steuermann
 maxproc=4
 
+[ssb]
+hostname=ssbwebv1
+like=linux:csh
+workdir=/ssbwebv1/data1/iraf/work
+maxproc=2
+
 [bond]
 hostname=bond
 like=mac:csh
diff --git a/steuermann/nodes.py b/steuermann/nodes.py
index 5e9852a..0f24e36 100644
--- a/steuermann/nodes.py
+++ b/steuermann/nodes.py
@@ -110,11 +110,21 @@ class command_tree(object):
 
 # crack open host:table/cmd
 def crack_name(name) :
-    t = name.split(':')
-    host = t[0]
-    t = t[1].split('/')
-    table = t[0]
-    cmd = t[1]
+    if ':' in name :
+        t = name.split(':')
+        host = t[0]
+        name = t[1]
+    else :
+        host = '*'
+
+    if '/' in name :
+        t = name.split('/')
+        table = t[0]
+        cmd = t[1]
+    else :
+        table = '*'
+        cmd = name
+
     return (host, table, cmd)
 
 #####
diff --git a/steuermann/report.py b/steuermann/report.py
index 82da984..cc90604 100644
--- a/steuermann/report.py
+++ b/steuermann/report.py
@@ -9,6 +9,21 @@ import pandokia.text_table as text_table
 import pandokia.common
 import StringIO
 
+# The output may be valid HTML 3.2 - in any case, it is far simpler than
+# what more recent standards call for.
+html_header='''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
+<HTML>
+<HEAD>
+<TITLE>%(title)s</TITLE>
+</HEAD>
+<BODY>
+'''
+
+html_trailer='''
+</BODY>
+</HTML>
+'''
+
 # this will be reset by the cgi main program if we are in a real cgi
 cginame = 'arf.cgi'
 
@@ -16,7 +31,7 @@ cginame = 'arf.cgi'
 
 def info_callback_status( db, run, tablename, host, cmd ) :
     c = db.cursor()
-    c.execute("SELECT status FROM status WHERE run = ? AND host = ? AND tablename = ? AND cmd = ?",(
+    c.execute("SELECT status FROM sm_status WHERE run = ? AND host = ? AND tablename = ? AND cmd = ?",(
             run, host, tablename, cmd ) )
     status, = c.fetchone()
     return status
@@ -29,7 +44,7 @@ simple_status = ( 'N', 'P', 'S', 'W' )
 
 def info_callback_gui( db, run, tablename, host, cmd ) :
     c = db.cursor()
-    c.execute("SELECT status, start_time, end_time FROM status WHERE run = ? AND host = ? AND tablename = ? AND cmd = ?",(
+    c.execute("SELECT status, start_time, end_time FROM sm_status WHERE run = ? AND host = ? AND tablename = ? AND cmd = ?",(
             run, host, tablename, cmd ) )
     x = c.fetchone()
     if x is None :
@@ -81,7 +96,7 @@ def info_callback_debug_table_cell( db, run, tablename, cmd, host ) :
 
 def get_table_list( db, run_name ) :
     c = db.cursor()
-    c.execute("select max(depth) as d, tablename from status where run = ? group by tablename order by d asc",(run_name,))
+    c.execute("select max(depth) as d, tablename from sm_status where run = ? group by tablename order by d asc",(run_name,))
     table_list = [ x for x in c ]
         # table_list contains ( depth, tablename )
     return table_list
@@ -97,25 +112,28 @@ def get_table( db, run_name, tablename, info_callback, showdepth=0 ) :
         t.define_column('depth')
 
     c = db.cursor()
-    c.execute("select distinct host from status where tablename = ? and run = ? order by host asc",(tablename, run_name))
+    c.execute("select distinct host from sm_status where tablename = ? and run = ? order by host asc",(tablename, run_name))
     for host, in c :
         t.define_column(host)
 
-    c.execute("""select cmd, host, depth, status, start_time, end_time, notes from status
-        where tablename = ? and run = ?  order by depth, cmd asc
+    c.execute("select cmd, max(depth) as d from sm_status where tablename = ? and run = ? group by cmd order by d asc",(tablename, run_name))
+    row = -1
+    cmd_to_row = { }
+    for cmd, depth in c :
+        row = row + 1
+        cmd_to_row[cmd] = row
+        t.set_value(row, 0, cmd)
+        if showdepth :
+            t.set_value(row, 'depth', depth)
+
+    c.execute("""select cmd, host, status, start_time, end_time, notes from sm_status
+        where tablename = ? and run = ?  order by cmd asc
         """, ( tablename, run_name ) )
 
-    row = -1
-    prev_cmd = None
+    row = 0
     for x in c :
-        cmd, host, depth, status, start_time, end_time, notes = x
-        if cmd != prev_cmd :
-            row = row + 1
-            t.set_value(row, 0, cmd)
-            if showdepth :
-                t.set_value(row, 'depth', depth)
-            prev_cmd = cmd
-            
+        cmd, host, status, start_time, end_time, notes = x
+        row = cmd_to_row[cmd]
         info = info_callback( db, run_name, tablename, host, cmd )
         if isinstance(info, tuple) :
             t.set_value( row, host, text=info[0], html=info[1] )
@@ -149,6 +167,7 @@ def report_text( db, run_name, info_callback = info_callback_status ) :
 
 def report_html( db, run_name, info_callback = info_callback_status, hlevel=1 ) :
     s = StringIO.StringIO()
+    s.write(html_header % { 'title' : run_name } )
     s.write('<h%d>%s</h%d>\n'%(hlevel,run_name,hlevel))
 
     hlevel = hlevel + 1
@@ -160,6 +179,8 @@ def report_html( db, run_name, info_callback = info_callback_status, hlevel=1 )
         t = get_table( db, run_name, tablename, info_callback, showdepth=1 )
         s.write(t.get_html())
 
+    s.write(html_trailer)
+
     return s.getvalue()
 
 #
diff --git a/steuermann/run.py b/steuermann/run.py
index 36cda80..af52a02 100644
--- a/steuermann/run.py
+++ b/steuermann/run.py
@@ -7,6 +7,8 @@ import subprocess
 import time
 import datetime
 import os
+import traceback
+import sys
 
 import ConfigParser
 
@@ -20,6 +22,9 @@ class struct :
 
 #####
 
+class run_exception(Exception) :
+    pass
+
 class runner(object): 
 
     # dict of all current running processes, indexed by node name
@@ -51,100 +56,112 @@ class runner(object):
     #####
     # start a process
 
-    def run( self, node, run_name ):
+    def run( self, node, run_name, no_run = False ):
 
         try :
-            args = self.get_host_info(node.host)
-        except :
-            print "ERROR: do not know how to run on %s"%node.host
-            raise
+            try :
+                args = self.get_host_info(node.host)
+            except Exception, e :
+                log_traceback()
+                print "ERROR: do not know how to run on %s"%node.host
+                print e
+                raise
 
-        if 'maxproc' in args :
             hostname = args['hostname']
+            if 'maxproc' in args :
 
-            n = int(self.howmany.get(hostname,0))
-            if n >= int(args['maxproc']) :
-                print "decline to run %s - %d other already running"%(node.name,n)
-                return False
-
-            n = n + 1
-            self.howmany[hostname] = n
-            print "running %s %s %d"%(hostname,node.name, n)
-        else :
-            print "running %s %s no maxproc"%(hostname, node.name)
-
-        if debug :
-            print "run",node.name
-        if debug :
-            print "....%s:%s/%s\n"%(node.host, node.table, node.cmd)
-
-        node.running = 1
-
-        args = args.copy()
-        args.update( 
-            script=node.script,
-            script_type=node.script_type,
-            host=node.host,
-            table=node.table,
-            cmd=node.cmd,
-            node=node.name,
-            )
-
-        if debug :
-            print "ARGS"
-            for x in sorted([x for x in args]) :
-                print '%s=%s'%(x,args[x])
-
-        args['script'] = args['script'] % args
-
-        if args['script_type'] == 'r' :
-            run = args['run']
-        elif  args['script_type'] == 'l' :
-            run = args['local']
-        else :
-            raise Exception()
-
-        t = [ ]
-        for x in run :
-            # bug: what to do in case of keyerror
-            t.append( x % args )
-
-        run = t
+                n = int(self.howmany.get(hostname,0))
+                if n >= int(args['maxproc']) :
+                    print "decline to run %s - %d other already running"%(node.name,n)
+                    return False
 
-        if debug :
-            print "RUN",run
-
-        # make sure the log directory is there
-        logdir= self.logdir + "/%s"%run_name
-        try :
-            os.makedirs(logdir)
-        except OSError:
-            pass
-
-        # create a name for the log file, but do not use / in the name
-        logfile_name = "%s/%s.log"%( logdir, node.name.replace('/','.') )
-
-        # open the log file, write initial notes
-        logfile=open(logfile_name,"w")
-        logfile.write('%s %s\n'%(datetime.datetime.now(),run))
-        logfile.flush()
-        
-        # start running the process
-        p = subprocess.Popen(args=run,
-            stdout=logfile,
-            stderr=subprocess.STDOUT,
-            shell=False, close_fds=True)
-
-        # remember the popen object for the process; remember the open log file
-        n = struct()
-        n.proc = p
-        n.logfile = logfile
-        n.logfile_name = logfile_name
-
-        # remember the process is running
-        self.all_procs[node.name] = n
-
-        return True
+                n = n + 1
+                self.howmany[hostname] = n
+                print "running %s %s %d"%(hostname,node.name, n)
+            else :
+                print "running %s %s no maxproc"%(hostname, node.name)
+
+            if debug :
+                print "run",node.name
+            if debug :
+                print "....%s:%s/%s\n"%(node.host, node.table, node.cmd)
+
+            node.running = 1
+
+            args = args.copy()
+            args.update( 
+                script=node.script,
+                script_type=node.script_type,
+                host=node.host,
+                table=node.table,
+                cmd=node.cmd,
+                node=node.name,
+                )
+
+            if debug :
+                print "ARGS"
+                for x in sorted([x for x in args]) :
+                    print '%s=%s'%(x,args[x])
+
+            args['script'] = args['script'] % args
+
+            if args['script_type'] == 'r' :
+                run = args['run']
+            elif  args['script_type'] == 'l' :
+                run = args['local']
+            else :
+                raise Exception()
+
+            t = [ ]
+            for x in run :
+                # TODO: decide what to do when x names a key missing from args (the % raises KeyError)
+                t.append( x % args )
+
+            run = t
+
+            if debug :
+                print "RUN",run
+
+            # make sure the log directory is there
+            logdir= self.logdir + "/%s"%run_name
+            try :
+                os.makedirs(logdir)
+            except OSError:
+                pass
+
+            # create a name for the log file, but do not use / in the name
+            logfile_name = "%s/%s.log"%( logdir, node.name.replace('/','.') )
+
+            # open the log file, write initial notes
+            logfile=open(logfile_name,"w")
+            logfile.write('%s %s\n'%(datetime.datetime.now(),run))
+            logfile.flush()
+
+            # debug - just say the name of the node we would run
+            if no_run :
+                run = [ 'echo', 'no_run - node=', node.name ]
+            
+            # start running the process
+            p = subprocess.Popen(args=run,
+                stdout=logfile,
+                stderr=subprocess.STDOUT,
+                shell=False, close_fds=True)
+
+            # remember the popen object for the process; remember the open log file
+            n = struct()
+            n.proc = p
+            n.logfile = logfile
+            n.logfile_name = logfile_name
+
+            # remember the process is running
+            self.all_procs[node.name] = n
+
+            return True
+
+        except Exception, e :
+            log_traceback()
+            txt= "ERROR RUNNING %s"%node.name
+            raise run_exception(txt)
 
     #####
     # callback when a node finishes
@@ -155,16 +172,12 @@ class runner(object):
 
         args = self.get_host_info(node.host)
 
-        if 'maxproc' in args :
-            hostname = args['hostname']
+        hostname = args['hostname']
 
-            n = int(self.howmany.get(hostname,0))
-            n = n - 1
+        n = self.howmany[hostname] - 1
+        self.howmany[hostname] = n
 
-            self.howmany[hostname] = n
-            print "finish %s %s %d"%(hostname,node_name,n)
-        else :
-            print "finish %s %s no maxproc"%(hostname,node_name)
+        print "finish %s %s %d"%(hostname,node_name,n)
 
         # note the termination of the process at the end of the log file
         logfile  = self.all_procs[node_name].logfile
@@ -225,14 +238,18 @@ class runner(object):
     def _host_get_names( self, cfg, section ) :
         d = { }
         # pick all the variables out of this section
-        for name, value in cfg.items(section) :
-            if value.startswith('[') :
-                # it is a list
-                d[name] = eval(value)
-            else :
-                # everything else is plain text
-                d[name] = value
-        return d
+        try :
+            for name, value in cfg.items(section) :
+                if value.startswith('[') :
+                    # it is a list
+                    d[name] = eval(value)
+                else :
+                    # everything else is plain text
+                    d[name] = value
+            return d
+        except ConfigParser.NoSectionError :
+            print "No config section in hosts.ini: %s"%section
+            return { }
 
     def load_host_info( self, filename=None ) : 
 
@@ -243,26 +260,55 @@ class runner(object):
         self.cfg.read(filename)
 
     def get_host_info(self, host) :
-        if debug:
-            print "enter get_host_info",host
         if not host in self.host_info_cache :
-
             d = self._host_get_names(self.cfg, host)
-            if debug:
-                print "in get_host_info, got names for ",host, d
+
             if 'like' in d :
-                if debug:
-                    print "has like", d['like']
-                d1 = self.get_host_info(d['like']).copy()
-                del d['like']
+                # get the dict of what this entry is like, copy it,
+                # and update it with the values for this entry
+                d1 = self.get_host_info(d['like'])
+                d1 = d1.copy()
                 d1.update(d)
-                self.host_info_cache[host] = d1
-            else :
-                print "end of chain",host,d
-                self.host_info_cache[host] = d
+                d = d1
+                print d
+                del d['like']
 
-        if debug:
-            print "leave get_host_info",host, self.host_info_cache[host]
+            # default hostname is the name from the section header
+            if not 'hostname' in d :
+                d['hostname'] = host
+
+            # default maximum processes is 1
+            if not 'maxproc' in d :
+                d['maxproc'] = 1
+
+            self.host_info_cache[host] = d
 
         return self.host_info_cache[host]
     #####
+
+# The traceback interface is awkward in Python; this helper was copied from pyetc:
+
+def log_traceback() :
+    # You would think that the python traceback module contains
+    # something useful to do this, but it always returns multi-line
+    # strings.  I want each line of output logged separately so the log
+    # file remains easy to process, so I reverse engineered this out of
+    # the logging module.
+    try:
+        etype, value, tb = sys.exc_info()
+        tbex = traceback.extract_tb( tb )
+        for filename, lineno, name, line in tbex :
+            print '%s:%d, in %s'%(filename,lineno,name)
+            if line:
+                print '    %s'%line.strip()
+
+        for x in  traceback.format_exception_only( etype, value ) :
+            print ": %s",x
+
+        print "---"
+
+    finally:
+        # If you don't clear these references, the traceback can form reference
+        # cycles that the garbage collector has to work hard to eliminate.
+        etype = value = tb = None
+
diff --git a/steuermann/run_all.py b/steuermann/run_all.py
index e7bff61..f670abe 100644
--- a/steuermann/run_all.py
+++ b/steuermann/run_all.py
@@ -26,6 +26,7 @@ except ImportError :
 
 def main() :
     global xnodes
+    global no_run
     # read all the input files
     
     if readline :
@@ -37,19 +38,38 @@ def main() :
         import atexit
         atexit.register(readline.write_history_file, history)
 
-    opt, args = easyargs.get( { '-a' : '--all',
-        '--all' : '-a',
+
+# easyargs spec definition:
+#
+#        '-v' : '',              # arg takes no parameter, opt['-v'] is
+#                                # how many times it occurred
+#        '-f' : '=',             # arg takes a parameter
+#        '-mf' : '=+',           # arg takes a parameter, may be specified 
+#                                # several times to get a list
+#        '--verbose' : '-v',     # arg is an alias for some other arg
+
+    opt, args = easyargs.get( { 
+        '--all' : '-a'      ,
+        '-a'    : ''        ,   # run all nodes non-interactively
+        '-r'    : '='       ,   # give run name
+        '-n'    : ''        ,   # do not actually execute any processes
         } )
 
     #
     #
 
-    all = '--all' in opt
+    all = opt['-a']
+    no_run = opt['-n']
 
     di_nodes = nodes.read_file_list( args )
 
     xnodes = di_nodes.node_index
-    run_name = str(datetime.datetime.now()).replace(' ','_')
+
+    if '-r' in opt :
+        run_name = opt['-r']
+    else :
+        run_name = str(datetime.datetime.now()).replace(' ','_')
+
     db = steuermann.config.open_db()
 
     if all :
@@ -59,6 +79,16 @@ def main() :
 
 #
 
+def find_wild_names( xnodes, name ) :
+    print "find_wild",name
+    l = [ ]
+    for x in xnodes :
+        if nodes.wildcard_name( name, x ) :
+            print "...",x
+            l.append(x)
+    return l
+#
+
 def do_flag( xnodes, name, recursive, fn, verbose ) :
     if verbose :
         verbose = verbose + 1
@@ -84,7 +114,6 @@ def do_flag( xnodes, name, recursive, fn, verbose ) :
     else :
             if verbose :
                 print '  '*verbose, "not in list", name
-            raise Exception()
 
 def set_want( node ) :
     # if we said we want it, mark it as wanted and don't skip
@@ -111,6 +140,17 @@ def cmd_flagging( l, xnodes, func ) :
     for x in l :
         do_flag( xnodes, x, recursive, func, 1 )
 
+
+#
+def print_node(xnodes, x, print_recursive, print_all, indent=0):   # print the wanted/finished/skip flags and the name of node x, prefixed by indent spaces
+    print ' '*indent, xnodes[x].wanted, xnodes[x].finished, xnodes[x].skip,  x
+    if print_all :                                                 # when set, also print an AFTER line naming the predecessors of x
+        l = [ a.name for a in xnodes[x].predecessors ]             # names of the nodes x lists as predecessors
+        print ' '*indent, "       AFTER", '  '.join(l)
+        if print_recursive :                                       # only consulted when print_all is set (nested under it)
+            for x in l :                                           # rebinds the parameter x to each predecessor name in turn
+                print_node( xnodes, x, print_recursive, print_all, indent=indent+8)    # each level of predecessors is printed 8 columns deeper
+
 #
 
 helpstr = """
@@ -128,6 +168,9 @@ pre node            show what must come before a node
 
 def run_interactive( xnodes, run_name, db) :
 
+    org_run_name = run_name
+    run_count = 0
+
     register_database(db, run_name, xnodes)
 
     runner = run.runner( xnodes, steuermann.config.logdir )
@@ -195,18 +238,37 @@ def run_interactive( xnodes, run_name, db) :
             for x in xnodes :
                 xnodes[x].finished = 0
             
-            run_name = str(datetime.datetime.now()).replace(' ','_')
+            run_name = org_run_name + '.%d'%run_count
+            run_count = run_count + 1
             print "new run name",run_name
             register_database(db, run_name, xnodes)
 
         elif n == 'list' :
-            print_all = '-a' in l
-            l = sorted ( [ x for x in xnodes ] )
+            l = l[1:]
+            if len(l) > 0 and l[0] == '-a' :
+                l = l[1:]
+                print_all = 1
+            else :
+                print_all = 0
+
+            if len(l) > 0 and l[0] == '-r' :
+                l = l[1:]
+                print_recursive=1
+            else :
+                print_recursive=0
+
+            if len(l) == 0 :
+                all = [ x for x in xnodes ]
+            else :
+                all = [ ]
+                for x in l :
+                    all = all + find_wild_names( xnodes, x )
+
+            all = sorted(all)
+            print "recursive",print_recursive
             print "w f s name"
-            for x in l :
-                print xnodes[x].wanted, xnodes[x].finished, xnodes[x].skip,  x
-                if print_all :
-                    print "       AFTER", '  '.join([ a.name for a in xnodes[x].predecessors ])
+            for x in all :
+                print_node(xnodes, x, print_recursive, all)
 
         elif n == 'wait' :
             c = db.cursor()
@@ -214,7 +276,7 @@ def run_interactive( xnodes, run_name, db) :
                 host, tablename, cmd = nodes.crack_name(x)
                 if xnodes[x].wanted :
                     status = 'W'
-                    c.execute("UPDATE status SET status = 'W' WHERE run = ? AND host = ? AND tablename = ? AND cmd = ? AND status = 'N'",
+                    c.execute("UPDATE sm_status SET status = 'W' WHERE run = ? AND host = ? AND tablename = ? AND cmd = ? AND status = 'N'",
                         (run_name, host, tablename, cmd) )
 
             db.commit()
@@ -286,13 +348,13 @@ def print_pre(who, xnodes, depth) :
 
 def register_database(db, run, xnodes ) :
     c = db.cursor()
-    c.execute('INSERT INTO runs ( run ) VALUES ( ? )', ( run, ) )
+    c.execute('INSERT INTO sm_runs ( run ) VALUES ( ? )', ( run, ) )
     
     c = db.cursor()
     for x in xnodes :
         host, tablename, cmd = nodes.crack_name(x)
         depth = xnodes[x].depth
-        c.execute("INSERT INTO status ( run, host, tablename, cmd, depth, status ) VALUES "
+        c.execute("INSERT INTO sm_status ( run, host, tablename, cmd, depth, status ) VALUES "
             "( ?, ?, ?, ?, ?, 'N' )", ( run, host, tablename, cmd, depth ) )
 
     db.commit()
@@ -365,16 +427,26 @@ def run_step( runner, xnodes, run_name, db ) :
                     x.finished = 1
                     no_sleep = 1
                     keep_running = 1
-                    db.execute("UPDATE status SET start_time = ?, status = 'S' WHERE ( run = ? AND host = ? AND tablename = ? AND cmd = ? )",
+                    db.execute("UPDATE sm_status SET start_time = ?, status = 'S' WHERE ( run = ? AND host = ? AND tablename = ? AND cmd = ? )",
                             ( str(datetime.datetime.now()), run_name, host, table, cmd ) )
                     db.commit()
 
                 else :
-                    if runner.run(x, run_name) :
-                        # returns true/false whether it actually ran it - it may not because of resource limits
-                        db.execute("UPDATE status SET start_time = ?, status = 'R' WHERE ( run = ? AND host = ? AND tablename = ? AND cmd = ? )",
-                            ( str(datetime.datetime.now()), run_name, host, table, cmd ) )
-                        db.commit()
+                    try :
+                        if runner.run(x, run_name, no_run=no_run) :
+                            # returns true/false whether it actually ran it - it may not because of resource limits
+                            db.execute("UPDATE sm_status SET start_time = ?, status = 'R' WHERE ( run = ? AND host = ? AND tablename = ? AND cmd = ? )",
+                                ( str(datetime.datetime.now()), run_name, host, table, cmd ) )
+                    except run.run_exception, e :
+                        now = str(datetime.datetime.now())
+                        db.execute("UPDATE sm_status SET start_time=?, end_time=?, status='E', notes=? WHERE ( run=? AND host=? AND tablename=? AND cmd=? )",
+                                ( now, now, repr(e), run_name, host, table, cmd ) )
+                        x.finished = 1
+                        no_sleep = 1
+                        keep_running = 1
+
+                    db.commit()
+                        
 
         # if anything has exited, we process it and update the status in the database
         while 1 :
@@ -390,7 +462,7 @@ def run_step( runner, xnodes, run_name, db ) :
             # note who and log it
             x_host, x_table, x_cmd = nodes.crack_name(who_exited[0])
 
-            db.execute("UPDATE status SET end_time = ?, status = ?  WHERE ( run = ? AND host = ? AND tablename = ? AND cmd = ? )",
+            db.execute("UPDATE sm_status SET end_time = ?, status = ?  WHERE ( run = ? AND host = ? AND tablename = ? AND cmd = ? )",
                     ( str(datetime.datetime.now()), who_exited[1], run_name, x_host, x_table, x_cmd ) )
             db.commit()
author	sienkiew <sienkiew@d34015c8-bcbb-4646-8ac8-8ba5febf221d>	2011-09-27 11:46:09 -0400
committer	sienkiew <sienkiew@d34015c8-bcbb-4646-8ac8-8ba5febf221d>	2011-09-27 11:46:09 -0400
commit	61285fb53a2b871d52e27e4e8ecc4d7da6e09e1b (patch)
tree	942866a36819b21731f74c157bda1ffb19081c51
parent	ff102cd2b89daf9a0feea2e10503e780d2454e29 (diff)
download	steuermann-61285fb53a2b871d52e27e4e8ecc4d7da6e09e1b.tar.gz