| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409 |
#!/usr/bin/env python

# Mimics GNU timeout, but does some fancy tracking based on custom features in mono nunit24.

import argparse
import calendar
import copy
import json
import os.path
import re
import signal
import subprocess
import sys
import tempfile
import time
### Constants

# Here is how the communication with nunit works. It has to work with two constraints:
# - We don't invoke nunit directly. We invoke some Makefile which invokes some other Makefile
#   and at some point down the line someone calls nunit.
# - nunit has to be able to report back to us even if (especially if) it terminates improperly.
# To deal with all this, communication babysitter->nunit is done by environment variables,
# and communication nunit->babysitter is done by leaving behind files in known locations.

# Filenames (placed in a fresh temporary directory; see the `babysitting` setup below)
CURRENT_TEST_FILE = "babysitter_report_current_test_file.txt"
RAN_TEST_FILE = "babysitter_report_ran_test_file.txt"
FAILED_TEST_FILE = "babysitter_report_failed_test_file.txt"
LOGGING_FILE = "babysitter_report.json_lines"

# Environment keys

# Keys used for Babysitter<->Nunit IPC
CURRENT_TEST_KEY = 'MONO_BABYSITTER_NUNIT_CURRENT_TEST_FILE' # Tell nunit where to leave files
RAN_TEST_KEY = 'MONO_BABYSITTER_NUNIT_RAN_TEST_FILE'
FAILED_TEST_KEY = 'MONO_BABYSITTER_NUNIT_FAILED_TEST_FILE'
RUN_KEY = 'MONO_BABYSITTER_NUNIT_RUN_TEST' # Semicolon-separated list of test names
RUN_MODE_KEY = 'MONO_BABYSITTER_NUNIT_RUN_MODE' # Equal to either RUN or EXCLUDE (see run())

# Keys used for script configuration (see --help text)
LOG_FILE_KEY = 'MONO_BABYSITTER_LOG_FILE' # Path
RETRY_KEY = 'MONO_BABYSITTER_RETRY' # Equal to an integer: max reruns per failed test case
VERBOSE_KEY = 'MONO_BABYSITTER_VERBOSE' # "Undocumented"-- used for debugging babysitter

# JSON keys (used in the line-delimited JSON log written on exit)
DATE_JSON = 'date' # POSIX timestamp of test suite run
INVOKE_JSON = 'invocation'
COUNT_JSON = 'iteration' # How many times was command executed?
LIMIT_JSON = 'failure_max'
SUPPORT_JSON = 'babysitter_protocol' # Was the test suite running with a babysitter-aware nunit?
FINAL_CODE_JSON = 'final_code'
TESTS_JSON = 'tests' # Holds dictionary of (test case name)->(dict with TEST_ keys below)
TEST_FAILURES = 'normal_failures'
TEST_CRASH_FAILURES = 'crash_failures'
TEST_TIMEOUT_FAILURES = 'timeout_failures'
### Interpret arguments

scriptname = sys.argv[0]

# Argument handling is awkward: everything after the second positional argument
# belongs to the wrapped command, but argparse insists on parsing all of argv.
# The workaround is a parser subclass that can selectively swallow errors:
class Hesitate(Exception):
    """Raised by HesitantParser.error while the parser is still 'hesitating'."""
    pass
class HesitantParser(argparse.ArgumentParser):
    """An ArgumentParser that can swallow its own parse errors.

    While ``hesitating`` is True, any parse error raises Hesitate instead of
    printing a message and exiting; once it is set to False the parser
    behaves like a plain argparse.ArgumentParser again.
    """

    def __init__(self, *args, **kwargs):
        self.hesitating = True  # Errors are swallowed until this is set to False
        argparse.ArgumentParser.__init__(self, *args, **kwargs)

    def error(self, *args, **kwargs):
        if self.hesitating:
            # Bail out before argparse prints anything or exits.
            raise Hesitate()
        argparse.ArgumentParser.error(self, *args, **kwargs)
# Define args
argparser = HesitantParser(description="""\
Run a test suite with a timeout.\n
Durations are floating point numbers followed by an optional unit:\n
's' for seconds (the default)
'm' for minutes
'h' for hours
'd' for days\n
supported environment variables:
%s: File to write logs to (as line-delimited JSON)
%s: If set to a number, failed test cases will be rerun this many times (NUnit test suites only)""" %
        (LOG_FILE_KEY, RETRY_KEY),
    formatter_class=argparse.RawTextHelpFormatter)
argparser.add_argument('-s', '--signal', dest='signal', metavar='signal', default='TERM',
    help="Send this signal to the command on timeout, instead of TERM.")
argparser.add_argument('-k', '--kill-after-duration', dest='kill_after', metavar='duration',
    help="If process continues running after signal, send KILL after this duration.")
argparser.add_argument('duration',
    help="Time to run before sending signal.")
argparser.add_argument('command', nargs="+", help="Command+args to run.")

# Repeatedly parse argv, growing the prefix, until the shortest prefix that
# parses successfully is found; everything past it belongs to the command.
argc = len(sys.argv)
extra_args = []
for prefix_len in range(1, argc + 1):
    try:
        if prefix_len == argc:  # On the final pass, parse for real
            argparser.hesitating = False
        args = argparser.parse_args(sys.argv[1:prefix_len])
        # parse_args succeeded: the final parsed arg is the command,
        # and the remaining argv items are that command's own arguments.
        extra_args = sys.argv[prefix_len:]
        break
    except Hesitate:  # Before the final pass, a failed parse just means "grow the prefix"
        pass
argparser.hesitating = False  # Just act like a normal argparser from here
# Durations are an integer or decimal count followed by an optional unit.
# \Z anchors the match so trailing garbage ("5xyz") is rejected instead of
# being silently ignored, and the decimal part keeps the --help promise that
# durations may be floating point (the old (\d+) regex truncated "1.5" to 1).
durationre = re.compile(r'(\d+(?:\.\d+)?)([smhd]?)\Z')

# Seconds per duration-suffix unit; '' means a bare number, i.e. seconds.
_DURATION_UNITS = {'': 1, 's': 1, 'm': 60, 'h': 60 * 60, 'd': 60 * 60 * 24}

def parse_duration(duration):
    """Parse a duration string like "90", "1.5m", or "2h" into seconds.

    Returns the duration in seconds as a float. On malformed input, reports
    the problem through argparser.error (which exits the script).
    """
    match = durationre.match(duration)
    if not match:
        argparser.error("Could not understand duration %s" % duration)
    value, units = match.group(1), match.group(2)
    return float(value) * _DURATION_UNITS[units]
def parse_signal(sig):
    """Parse a signal given as a number ("9") or a name ("TERM", "KILL").

    Returns the signal number. On an unrecognized name, reports the problem
    through argparser.error (which exits the script).
    """
    if sig.isdigit():
        return int(sig)
    # .items() instead of the Python-2-only iteritems(), so this works on
    # Python 3 as well; scan for the matching SIG* attribute of the module.
    for name, value in signal.__dict__.items():
        if name == ("SIG%s" % sig):
            return value
    argparser.error("Could not understand signal name %s" % sig)
# Final interpretation of arguments
duration = parse_duration(args.duration)
kill_after = parse_duration(args.kill_after) if args.kill_after is not None else None
timeout_signal = parse_signal(args.signal)
command = args.command + extra_args

# Process environment
global_env = copy.deepcopy(os.environ)
verbose = VERBOSE_KEY in global_env
logging = LOG_FILE_KEY in global_env
logfile = global_env[LOG_FILE_KEY] if logging else None
crash_resuming = True # TODO: Consider exposing this option, or adding a retry_on_crash option.
failmax = int(global_env[RETRY_KEY]) if RETRY_KEY in global_env else 0
babysitting = True # If false, babysitter becomes a timeout clone with no env manipulation or anything.
if babysitting:
    # Point the (babysitter-aware) test suite at a fresh temp dir for its report files.
    babysitter_dir = tempfile.mkdtemp()
    for ipc_key, report_file in ((CURRENT_TEST_KEY, CURRENT_TEST_FILE),
                                 (RAN_TEST_KEY, RAN_TEST_FILE),
                                 (FAILED_TEST_KEY, FAILED_TEST_FILE)):
        global_env[ipc_key] = os.path.join(babysitter_dir, report_file)

# Capability probes: how (if at all) can we signal whole process groups here?
have_unix_process_groups = 'killpg' in os.__dict__
have_windows_process_groups = 'CREATE_NEW_PROCESS_GROUP' in subprocess.__dict__
### Timeout implementation

def wait(proc, duration):
    """Wait up to `duration` seconds for `proc` to exit.

    Returns the process's exit code, or None if it is still running when the
    deadline passes. (TODO: On Python 3.3+, Popen.wait(timeout=...) could
    replace this polling loop.)
    """
    deadline = time.time() + duration
    while True:
        exit_code = proc.poll()
        if exit_code is not None:
            return exit_code
        if time.time() > deadline:
            return None
        time.sleep(0.05)
# Popen and send_signal can't be called in their basic forms because we want to
# send signals to all children, not just the immediately spawned process.
# Unfortunately the way to do this varies by operating system.
def popen(*args, **kwargs):
    """subprocess.Popen wrapper that isolates the child in its own process group."""
    if have_unix_process_groups:
        # UNIX: make the child a process-group leader at spawn time
        kwargs['preexec_fn'] = os.setsid
    elif have_windows_process_groups:
        # Windows: request a new process group via creation flags
        kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
    return subprocess.Popen(*args, **kwargs)
def send_signal(proc, sig):
    """Deliver `sig` to proc and, where the platform allows, its whole process group."""
    if have_unix_process_groups:  # UNIX
        # For compatibility with GNU timeout, pre-send the signal to just the monitored process
        os.kill(proc.pid, sig)
        # Then signal the entire group
        os.killpg(proc.pid, sig)
        # For compatibility with GNU timeout, follow up with SIGCONT
        # (so delivery has a chance to occur even for stopped processes)
        if sig not in (signal.SIGKILL, signal.SIGCONT):
            os.kill(proc.pid, signal.SIGCONT)
    elif have_windows_process_groups:  # Windows with Python 2.7 or better
        # Because of CREATE_NEW_PROCESS_GROUP this reaches the entire group
        os.kill(proc.pid, sig)
    else:  # Windows with Python 2.6 -- CREATE_NEW_PROCESS_GROUP not supported
        proc.send_signal(sig)  # No way to contact the group; signal just the one process
### Utility functions

def attemptDelete(path):
    """Best-effort delete of `path`; a missing file or other OSError is ignored."""
    try:
        os.remove(path)
    except OSError:
        pass
def attemptLines(path):
    """Return the lines of the file at `path`, stripped of CR/LF.

    Returns [] if the file cannot be opened. Returns a real list rather than
    the lazy map object the old code produced -- on Python 3 a map object is
    always truthy and not subscriptable, which broke attemptFirstLine's
    `if lines` / `lines[0]` and pluralize's len().
    """
    try:
        with open(path) as f:
            return [line.strip('\r\n') for line in f]
    except (OSError, IOError):
        return []
def attemptFirstLine(path):
    """Return the first line of the file at `path`, or None if unavailable."""
    lines = attemptLines(path)
    return lines[0] if lines else None
def posixtime():
    """Current POSIX timestamp.

    Amazingly, time.time() does not guarantee an epoch in the docs;
    calendar.timegm over UTC struct_time does.
    """
    utc_now = time.gmtime()
    return calendar.timegm(utc_now)
failcount = {}  # test case name -> number of failures recorded so far

def failure_may_retry(test):
    """Record one failure of `test`; True while it is still under the failmax cap."""
    failcount[test] = failcount.get(test, 0) + 1
    return failcount[test] < failmax
def verbose_print(arg):
    """Print `arg` only when the babysitter's verbose flag is set."""
    if verbose:
        print(arg)
def failure_annotate(test):
    """Describe `test` together with its failure count versus the allowed maximum."""
    seen = failcount[test]
    return "%s (failure #%d of %d allowed)" % (test, seen, failmax)
def pluralize(lst):
    """Return "s" when `lst` warrants a plural suffix (more than one element)."""
    if len(lst) > 1:
        return "s"
    return ""
### Run command

def run(): # Returns exit code
    """Run the test suite, retrying failed/crashed test cases as allowed.

    Returns the exit code the babysitter should report. Communicates with a
    babysitter-aware nunit through the env vars / report files described at
    the top of this script, and appends a JSON log line on the way out.

    NOTE(review): indentation reconstructed from the side comments and control
    flow; the original formatting was lost in extraction.
    """
    resume_after = []        # Blacklist: test cases to skip on the next run (after a crash)
    retry_these = []         # Whitelist: test cases to rerun on the next run (after failures)
    ever_completed = False   # Has the suite ever finished without crashing?
    died_politely = False    # Do we know the child process is already dead?
    proc = None
    code = None

    # Set up logging
    log = {DATE_JSON: posixtime(), COUNT_JSON:0, LIMIT_JSON:failmax, SUPPORT_JSON:False,
        INVOKE_JSON: " ".join(command)}

    def log_value(key, set=None, add=None, target=log): # Call to add toplevel value to log
        if add is not None:
            if key not in target:
                target[key] = 0
            target[key] += add
        else:
            target[key] = set

    def log_test(testname, key, set=None, add=None): # Call to add test-case-level value to log
        if TESTS_JSON not in log:
            log[TESTS_JSON] = {}
        if testname not in log[TESTS_JSON]:
            log[TESTS_JSON][testname] = {}
        log_value(key, set=set, add=add, target=log[TESTS_JSON][testname])

    # Ready to run tests
    try:
        while True:
            env = copy.copy(global_env)
            if ever_completed:
                retry_next = []
            else: # Persist reported failures list until first non-crash run
                retry_next = retry_these

            log_value(COUNT_JSON, add=1)

            # Prepare environment/filesystem: clear stale report files, then
            # tell nunit (via env) which tests to exclude or to rerun.
            if babysitting:
                for key in [CURRENT_TEST_KEY, RAN_TEST_KEY, FAILED_TEST_KEY]:
                    attemptDelete(env[key])
                if resume_after:
                    env[RUN_KEY] = ";".join(resume_after)
                    env[RUN_MODE_KEY] = "EXCLUDE"
                elif retry_these:
                    env[RUN_KEY] = ";".join(retry_these)
                    env[RUN_MODE_KEY] = "RUN"

            # Run test suite
            try:
                proc = popen(command, env=env)
            except OSError:
                died_politely = True
                sys.stderr.write("%s: Could not execute command `%s`\n" % (scriptname, command[0]))
                sys.exit(127)

            code = wait(proc, duration)
            timed_out = code is None
            if timed_out: # Initial timeout
                send_signal(proc, timeout_signal)
                if kill_after is not None: # Kill-after timeout
                    code = wait(proc, kill_after)
                    if code is None:
                        send_signal(proc, signal.SIGKILL)
                code = proc.wait() # Waits forever
                sys.stderr.write("%s: Command `%s` timed out\n" % (scriptname, command[0]))
            died_politely = True

            # The test suite has now run, and what happens next varies:
            # 1. The suite either completed fully without failures, or timed out: Just quit.
            # 2. The suite crashed (halted without completing):
            #    Remember any failures for later and rerun, using a blacklist of testcases we have completed.
            # 3. The suite completed, but there were failures reported:
            #    Rerun, using a whitelist of only reported-failed testcases.
            # 4. The suite crashed partway through a run with a whitelist:
            #    Rerun, using a whitelist consisting of the previous whitelist minus successful testcases.
            crashed_at = attemptFirstLine(env[CURRENT_TEST_KEY])
            failed_tests = attemptLines(env[FAILED_TEST_KEY])
            ran_tests = attemptLines(env[RAN_TEST_KEY])
            bailout = False

            if crashed_at or failed_tests or ran_tests: # Test suite follows the babysitter protocol
                log_value(SUPPORT_JSON, True)

                if not crashed_at and not ever_completed: # The resume_after whitelist is only
                    resume_after = []                     # used before the first noncrashing run
                    ever_completed = True

            if timed_out: # Currently no retries after timeout
                bailout = True
                code = 124 # See GNU timeout manpage

            if code or crashed_at: # Process failures
                # Handle crash failures
                if crashed_at and not timed_out:
                    log_test(crashed_at, TEST_CRASH_FAILURES, add=1)

                    if not crash_resuming:
                        bailout = True

                    if failure_may_retry(crashed_at):
                        if ever_completed:           # Rerun with whitelist next time
                            for test in retry_these: # Prepopulate with last whitelist minus run testcases
                                if test == crashed_at or test not in ran_tests: # (plus crashed testcase)
                                    retry_next.append(test)
                        else:                        # Rerun with blacklist next time
                            for test in ran_tests:   # Add testcases we just ran to blacklist
                                if test != crashed_at: # (except for the crashed testcase)
                                    resume_after.append(test)
                    else:
                        bailout = True

                # Handle reported failures
                for test in failed_tests:
                    log_test(test, TEST_FAILURES, add=1)
                    if failure_may_retry(test):
                        retry_next.append(test)
                    else:
                        bailout = True

                # Report human-readable failures for stdout log.
                message = "%s:" % (scriptname)
                if timed_out:
                    message += " Saw timeout in test case %s (never allowed)." % (crashed_at)
                elif crashed_at:
                    message += " Saw crash in test case %s." % (failure_annotate(crashed_at))
                if failed_tests:
                    message += " Saw test failure in test case%s %s." % (pluralize(failed_tests), "; ".join(map(failure_annotate, failed_tests)))
                if not (timed_out or crashed_at or failed_tests):
                    message += " Test suite terminated with code %d, " % (code)
                    if log[SUPPORT_JSON]:
                        message += "but failure did not occur during a test case. Halting."
                    else:
                        message += "and suite cannot report test case data. Halting."
                elif bailout:
                    message += " Will halt testing."
                print(message)

            if bailout or not (resume_after or retry_next): # If not retrying
                return code

            # If we got here, a retry is occurring
            retry_these = retry_next

            # Report human-readable retry notice for stdout log.
            message = "%s: Will rerun test suite" % (scriptname)
            if log[COUNT_JSON] > 1:
                message += " (iteration #%d)" % (log[COUNT_JSON])
            if resume_after:
                message += ", resuming at crashed testcase %s." % (crashed_at)
            else:
                message += ", running previously failed testcase%s: %s." % (pluralize(retry_these), "; ".join(retry_these))
            print(message)
    finally:
        # Emergency: Ensure command does not outlive this script
        if proc is not None and not died_politely:
            send_signal(proc, signal.SIGKILL)

        # Write out logs
        log_value(FINAL_CODE_JSON, "EXCEPTION" if code is None else code)
        if logging:
            with open(logfile, "a") as f:
                f.write(json.dumps(log) + os.linesep)
sys.exit(run())
|