babysitter 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409
  1. #!/usr/bin/env python
  2. # Mimics GNU timeout, but does some fancy tracking based on custom features in mono nunit24.
  3. import argparse
  4. import subprocess
  5. import re
  6. import signal
  7. import time
  8. import sys
  9. import os.path
  10. import copy
  11. import tempfile
  12. import calendar
  13. import json
  14. ### Constants
  15. # Here is how the communication with nunit works. It has to work with two constraints:
  16. # - We don't invoke nunit directly. We invoke some Makefile which invokes some other Makefile
  17. # and at some point down the line someone calls nunit.
  18. # - nunit has to be able to report back to us even if (especially if) it terminates improperly.
  19. # To deal with all this, communication babysitter->nunit is done by environment variables,
  20. # and communication nunit->babysitter is done by leaving behind files in known locations.
  21. # Filenames
  22. CURRENT_TEST_FILE = "babysitter_report_current_test_file.txt"
  23. RAN_TEST_FILE = "babysitter_report_ran_test_file.txt"
  24. FAILED_TEST_FILE = "babysitter_report_failed_test_file.txt"
  25. LOGGING_FILE = "babysitter_report.json_lines"
  26. # Environment keys
  27. # Keys used for Babysitter<->Nunit IPC
  28. CURRENT_TEST_KEY = 'MONO_BABYSITTER_NUNIT_CURRENT_TEST_FILE' # Tell nunit where to leave files
  29. RAN_TEST_KEY = 'MONO_BABYSITTER_NUNIT_RAN_TEST_FILE'
  30. FAILED_TEST_KEY = 'MONO_BABYSITTER_NUNIT_FAILED_TEST_FILE'
  31. RUN_KEY = 'MONO_BABYSITTER_NUNIT_RUN_TEST' # Semicolon-separated list of test names
  32. RUN_MODE_KEY = 'MONO_BABYSITTER_NUNIT_RUN_MODE' # Equal to either RUN or AFTER
  33. # Keys used for script configuration (see --help text)
  34. LOG_FILE_KEY = 'MONO_BABYSITTER_LOG_FILE' # Path
  35. RETRY_KEY = 'MONO_BABYSITTER_RETRY' # Equal to an integer
  36. VERBOSE_KEY = 'MONO_BABYSITTER_VERBOSE' # "Undocumented"-- used for debugging babysitter
  37. # JSON keys
  38. DATE_JSON = 'date' # POSIX timestamp of test suite run
  39. INVOKE_JSON = 'invocation'
  40. COUNT_JSON = 'iteration' # How many times was command executed?
  41. LIMIT_JSON = 'failure_max'
  42. SUPPORT_JSON = 'babysitter_protocol' # Was the test suite running with a babysitter-aware nunit?
  43. FINAL_CODE_JSON = 'final_code'
  44. TESTS_JSON = 'tests' # Holds dictionary of (test case name)->(dict with TEST_ keys below)
  45. TEST_FAILURES = 'normal_failures'
  46. TEST_CRASH_FAILURES = 'crash_failures'
  47. TEST_TIMEOUT_FAILURES = 'timeout_failures'
  48. ### Interpret arguments
  49. scriptname = sys.argv[0]
  50. # This is very silly: This program needs to switch from argparse to manual parsing
  51. # after the second positional argument, but argparse doesn't let you do this.
  52. # I work around this with a custom subclass which can selectively swallow errors:
  53. class Hesitate(Exception):
  54. pass
  55. class HesitantParser(argparse.ArgumentParser):
  56. def __init__(s, *args, **kwargs):
  57. s.hesitating = True # Errors will be swallowed until this is set false
  58. argparse.ArgumentParser.__init__(s, *args, **kwargs)
  59. def error(s, *args, **kwargs):
  60. if s.hesitating:
  61. raise Hesitate() # Bail out before errors are printed.
  62. argparse.ArgumentParser.error(s, *args, **kwargs)
  63. # Define args
  64. argparser = HesitantParser(description="""\
  65. Run a test suite with a timeout.\n
  66. Durations are floating point numbers followed by an optional unit:\n
  67. 's' for seconds (the default)
  68. 'm' for minutes
  69. 'h' for hours
  70. 'd' for days\n
  71. supported environment variables:
  72. %s: File to write logs to (as line-delimited JSON)
  73. %s: If set to a number, failed test cases will be rerun this many times (NUnit test suites only)""" %
  74. (LOG_FILE_KEY, RETRY_KEY),
  75. formatter_class=argparse.RawTextHelpFormatter)
  76. argparser.add_argument('-s', '--signal', dest='signal', metavar='signal', default='TERM',
  77. help="Send this signal to the command on timeout, instead of TERM.")
  78. argparser.add_argument('-k', '--kill-after-duration', dest='kill_after', metavar='duration',
  79. help="If process continues running after signal, send KILL after this duration.")
  80. argparser.add_argument('duration',
  81. help="Time to run before sending signal.")
  82. argparser.add_argument('command', nargs="+", help="Command+args to run.")
  83. # Repeatedly parse the given args until we find the shortest prefix for which parsing succeeds.
  84. argc = len(sys.argv)
  85. extra_args = []
  86. for limit in range(1,argc+1):
  87. try:
  88. if limit == argc: # On the final pass, parse for real
  89. argparser.hesitating = False
  90. args = argparser.parse_args(sys.argv[1:limit])
  91. # If we're still here, parse_args succeeded.
  92. # The final parsed arg is the command; remaining argv items are its args.
  93. extra_args = sys.argv[limit:]
  94. break
  95. except Hesitate: # Before the final pass, if parse_args fails, skip
  96. pass
  97. argparser.hesitating = False # Just act like a normal argparser from here
  98. durationre = re.compile(r'(\d+)([smhd]?)')
  99. def parse_duration(duration): # Accept units
  100. match = durationre.match(duration)
  101. if not match:
  102. argparser.error("Could not understand duration %s" % duration)
  103. time, units = match.group(1), match.group(2)
  104. time = int(time)
  105. if units == 'm':
  106. time *= 60
  107. elif units == 'h':
  108. time *= 60*60
  109. elif units == 'd':
  110. time *= 60*60*24
  111. return time
  112. def parse_signal(sig): # Accept names
  113. if sig.isdigit():
  114. return int(sig)
  115. for k,v in signal.__dict__.iteritems():
  116. if k == ("SIG%s" % sig):
  117. return v
  118. argparser.error("Could not understand signal name %s" % sig)
  119. # Final interpretation of arguments
  120. duration = parse_duration(args.duration)
  121. kill_after = parse_duration(args.kill_after) if args.kill_after is not None else None
  122. timeout_signal = parse_signal(args.signal)
  123. command = args.command + extra_args
  124. # Process environment
  125. global_env = copy.deepcopy( os.environ )
  126. verbose = VERBOSE_KEY in global_env
  127. logging = LOG_FILE_KEY in global_env
  128. logfile = global_env[LOG_FILE_KEY] if logging else None
  129. crash_resuming = True # TODO: Consider exposing this option, or adding a retry_on_crash option.
  130. failmax = int(global_env[RETRY_KEY]) if RETRY_KEY in global_env else 0
  131. babysitting = True # If false, babysitter becomes a timeout clone with no env manipulation or anything.
  132. if babysitting:
  133. babysitter_dir = tempfile.mkdtemp()
  134. global_env[CURRENT_TEST_KEY] = os.path.join(babysitter_dir, CURRENT_TEST_FILE)
  135. global_env[RAN_TEST_KEY] = os.path.join(babysitter_dir, RAN_TEST_FILE)
  136. global_env[FAILED_TEST_KEY] = os.path.join(babysitter_dir, FAILED_TEST_FILE)
  137. have_unix_process_groups = 'killpg' in os.__dict__
  138. have_windows_process_groups = 'CREATE_NEW_PROCESS_GROUP' in subprocess.__dict__
  139. ### Timeout implementation
  140. def wait(proc, duration):
  141. # TODO: If we detect Python 3.3, Popen objects have a wait(timeout) method we can use
  142. start = time.time()
  143. while True:
  144. code = proc.poll()
  145. if code is not None:
  146. return code
  147. if time.time()-start > duration:
  148. return None
  149. time.sleep(0.05)
  150. # Popen and send_signal can't be called in their basic forms because we want to
  151. # send signals to all children, not just to the immediately spawned process.
  152. # Unfortunately the way to do this varies by operating system.
  153. def popen(*args, **kwargs):
  154. if have_unix_process_groups: # Call function on spawn to become process group leader
  155. kwargs['preexec_fn'] = os.setsid
  156. elif have_windows_process_groups: # Set magic flag for Windows process groups
  157. kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
  158. return subprocess.Popen(*args, **kwargs)
  159. def send_signal(proc, sig):
  160. if have_unix_process_groups: # UNIX
  161. # For compatibility with GNU timeout, pre-send the signal to just the monitored process
  162. os.kill(proc.pid, sig)
  163. # Send signal to entire group
  164. os.killpg(proc.pid, sig)
  165. # For compatibility with GNU Timeout, send a SIGCONT after the signal
  166. # (so delivery has a chance to occur even for stopped processes)
  167. if sig != signal.SIGKILL and sig != signal.SIGCONT:
  168. os.kill(proc.pid, signal.SIGCONT)
  169. elif have_windows_process_groups: # Windows with Python 2.7 or better
  170. os.kill(proc.pid, sig) # Becuase CREATE_NEW_PROCESS_GROUP, will go to entire group
  171. else: # Windows with Python 2.6-- CREATE_NEW_PROCESS_GROUP not supported
  172. proc.send_signal(sig) # No way to contact group, just kill process
  173. ### Utility functions
  174. def attemptDelete(path):
  175. try:
  176. os.remove(path)
  177. except OSError:
  178. pass
  179. def attemptLines(path):
  180. try:
  181. with open(path) as f:
  182. return map(lambda s: s.strip('\r\n'), f.readlines())
  183. except (OSError, IOError):
  184. return []
  185. def attemptFirstLine(path):
  186. lines = attemptLines(path)
  187. if lines:
  188. return lines[0]
  189. return None
  190. def posixtime(): # Amazingly, time.time() does not guarantee an epoch in the docs. However this does:
  191. return calendar.timegm(time.gmtime())
  192. failcount = {}
  193. def failure_may_retry(test):
  194. if test not in failcount:
  195. failcount[test] = 0
  196. failcount[test] += 1
  197. return failcount[test] < failmax
  198. def verbose_print(arg):
  199. if (verbose):
  200. print(arg)
  201. def failure_annotate(test):
  202. return "%s (failure #%d of %d allowed)" % (test, failcount[test], failmax)
  203. def pluralize(lst):
  204. return "s" if len(lst) > 1 else ""
### Run command

def run(): # Returns exit code
    """Run the suite, rerunning failed/crashed cases as policy allows.

    Repeatedly invokes `command`, consuming the report files the nunit side
    leaves behind, until the suite passes, exhausts its retries, times out,
    or fails in a way that cannot be retried. Returns the exit code to hand
    to sys.exit(); on the way out (even on exception) it kills any
    still-running child and appends a JSON record to the log file when
    logging is enabled.
    """
    resume_after = []      # Blacklist: completed cases to skip when resuming after a crash
    retry_these = []       # Whitelist: reported-failed cases to rerun
    ever_completed = False # Has the suite ever finished a run without crashing?
    died_politely = False  # True once the child process is known to be dead
    proc = None
    code = None

    # Set up logging
    log = {DATE_JSON: posixtime(), COUNT_JSON:0, LIMIT_JSON:failmax, SUPPORT_JSON:False,
        INVOKE_JSON: " ".join(command)}

    # NOTE: the `set` parameter shadows the builtin; it is only a keyword name.
    def log_value(key, set=None, add=None, target=log): # Call to add toplevel value to log
        if add is not None:
            if key not in target:
                target[key] = 0
            target[key] += add
        else:
            target[key] = set

    def log_test(testname, key, set=None, add=None): # Call to add test-case-level value to log
        if TESTS_JSON not in log:
            log[TESTS_JSON] = {}
        if testname not in log[TESTS_JSON]:
            log[TESTS_JSON][testname] = {}
        log_value(key, set=set, add=add, target=log[TESTS_JSON][testname])

    # Ready to run tests
    try:
        while True:
            env = copy.copy(global_env)
            if ever_completed:
                retry_next = []
            else: # Persist reported failures list until first non-crash run
                retry_next = retry_these

            log_value(COUNT_JSON, add=1)

            # Prepare environment/filesystem
            if babysitting:
                # Stale report files from a previous iteration would corrupt results.
                for key in [CURRENT_TEST_KEY, RAN_TEST_KEY, FAILED_TEST_KEY]:
                    attemptDelete(env[key])
                if resume_after:
                    env[RUN_KEY] = ";".join(resume_after)
                    env[RUN_MODE_KEY] = "EXCLUDE"
                elif retry_these:
                    env[RUN_KEY] = ";".join(retry_these)
                    env[RUN_MODE_KEY] = "RUN"

            # Run test suite
            try:
                proc = popen(command, env=env)
            except OSError:
                died_politely = True
                sys.stderr.write("%s: Could not execute command `%s`\n" % (scriptname, command[0]))
                sys.exit(127)

            code = wait(proc, duration)
            timed_out = code is None
            if timed_out: # Initial timeout
                send_signal(proc, timeout_signal)
                if kill_after is not None: # Kill-after timeout
                    code = wait(proc, kill_after)
                    if code is None:
                        send_signal(proc, signal.SIGKILL)
                code = proc.wait() # Waits forever
                sys.stderr.write("%s: Command `%s` timed out\n" % (scriptname, command[0]))
            died_politely = True # proc.wait()/wait() returned, so the child is gone.

            # The test suite has now run, and what happens next varies:
            # 1. The suite either completed fully without failures, or timed out: Just quit.
            # 2. The suite crashed (halted without completing):
            #      Remember any failures for later and rerun, using a blacklist of testcases we have completed.
            # 3. The suite completed, but there were failures reported:
            #      Rerun, using a whitelist of only reported-failed testcases.
            # 4. The suite crashed partway through a run with a whitelist:
            #      Rerun, using a whitelist consisting of the previous whitelist minus successful testcases.

            crashed_at = attemptFirstLine(env[CURRENT_TEST_KEY])
            failed_tests = attemptLines(env[FAILED_TEST_KEY])
            ran_tests = attemptLines(env[RAN_TEST_KEY])
            bailout = False

            if crashed_at or failed_tests or ran_tests: # Test suite follows the babysitter protocol
                log_value(SUPPORT_JSON, True)

            if not crashed_at and not ever_completed: # The resume_after exclusion list is only
                resume_after = []                     # used before the first noncrashing run
                ever_completed = True

            if timed_out: # Currently no retries after timeout
                bailout = True
                code = 124 # See GNU timeout manpage

            if code or crashed_at: # Process failures
                # Handle crash failures
                if crashed_at and not timed_out:
                    log_test(crashed_at, TEST_CRASH_FAILURES, add=1)

                    if not crash_resuming:
                        bailout = True

                    if failure_may_retry(crashed_at):
                        if ever_completed: # Rerun with whitelist next time
                            for test in retry_these:   # Prepopulate with last whitelist minus run testcases
                                if test == crashed_at or test not in ran_tests: # (plus crashed testcase)
                                    retry_next.append(test)
                        else:              # Rerun with blacklist next time
                            for test in ran_tests:     # Add testcases we just ran to blacklist
                                if test != crashed_at: # (except for the crashed testcase)
                                    resume_after.append(test)
                    else:
                        bailout = True

                # Handle reported failures
                for test in failed_tests:
                    log_test(test, TEST_FAILURES, add=1)
                    if failure_may_retry(test):
                        retry_next.append(test)
                    else:
                        bailout = True

                # Report human-readable failures for stdout log.
                message = "%s:" % (scriptname)
                if timed_out:
                    message += " Saw timeout in test case %s (never allowed)." % (crashed_at)
                elif crashed_at:
                    message += " Saw crash in test case %s." % (failure_annotate(crashed_at))
                if failed_tests:
                    message += " Saw test failure in test case%s %s." % (pluralize(failed_tests), "; ".join(map(failure_annotate, failed_tests)))
                if not (timed_out or crashed_at or failed_tests):
                    message += " Test suite terminated with code %d, " % (code)
                    if log[SUPPORT_JSON]:
                        message += "but failure did not occur during a test case. Halting."
                    else:
                        message += "and suite cannot report test case data. Halting."
                elif bailout:
                    message += " Will halt testing."
                print(message)

            if bailout or not (resume_after or retry_next): # If not retrying
                return code

            # If we got here, a retry is occurring
            retry_these = retry_next

            # Report human-readable retry notice for stdout log.
            message = "%s: Will rerun test suite" % (scriptname)
            if log[COUNT_JSON] > 1:
                message += " (iteration #%d)" % (log[COUNT_JSON])
            if resume_after:
                message += ", resuming at crashed testcase %s." % (crashed_at)
            else:
                message += ", running previously failed testcase%s: %s." % (pluralize(retry_these), "; ".join(retry_these))
            print(message)
    finally:
        # Emergency: Ensure command does not outlive this script
        if proc is not None and not died_politely:
            send_signal(proc, signal.SIGKILL)

        # Write out logs
        log_value(FINAL_CODE_JSON, "EXCEPTION" if code is None else code)
        if logging:
            with open(logfile, "a") as f:
                f.write(json.dumps(log) + os.linesep)
  349. sys.exit( run() )