Browse Source

Trying to reap processes betterer

msmith-techempower 8 years ago
parent
commit
ea96e73a7b

+ 8 - 8
toolset/benchmark/benchmarker.py

@@ -573,9 +573,9 @@ class Benchmarker:
           print "Error: Unable to recover port, cannot start test"
           return exit_with_code(1)
 
-        result, sid = test.start(out)
+        result, ppid = test.start(out)
         if result != 0:
-          self.__stop_test(sid, out)
+          self.__stop_test(ppid, out)
           time.sleep(5)
           out.write( "ERROR: Problem starting {name}\n".format(name=test.name) )
           out.flush()
@@ -613,13 +613,13 @@ class Benchmarker:
         ##########################
         out.write(header("Stopping %s" % test.name))
         out.flush()
-        self.__stop_test(sid, out)
+        self.__stop_test(ppid, out)
         out.flush()
         time.sleep(15)
 
         if self.__is_port_bound(test.port):
           # This can happen sometimes - let's try again
-          self.__stop_test(sid, out)
+          self.__stop_test(ppid, out)
           out.flush()
           time.sleep(15)
           if self.__is_port_bound(test.port):
@@ -664,7 +664,7 @@ class Benchmarker:
         traceback.print_exc(file=out)
         out.flush()
         try:
-          self.__stop_test(sid, out)
+          self.__stop_test(ppid, out)
         except (subprocess.CalledProcessError) as e:
           self.__write_intermediate_results(test.name,"<setup.py>#stop() raised an error")
           out.write(header("Subprocess Error: Test .stop() raised exception %s" % test.name))
@@ -675,7 +675,7 @@ class Benchmarker:
       # TODO - subprocess should not catch this exception!
       # Parent process should catch it and cleanup/exit
       except (KeyboardInterrupt) as e:
-        self.__stop_test(sid, out)
+        self.__stop_test(ppid, out)
         out.write(header("Cleaning up..."))
         out.flush()
         self.__finish()
@@ -692,9 +692,9 @@ class Benchmarker:
   # __stop_test(benchmarker)
   # Stops all running tests
   ############################################################
-  def __stop_test(self, sid, out):
+  def __stop_test(self, ppid, out):
     try:
-      subprocess.check_call('sudo kill -9 -%s' % sid, shell=True, stderr=out, stdout=out)
+      subprocess.check_call('pkill -P %s' % ppid, shell=True, stderr=out, stdout=out)
       retcode = 0
     except Exception:
       retcode = 1

+ 4 - 3
toolset/benchmark/framework_test.py

@@ -198,7 +198,8 @@ class FrameworkTest:
     os.chdir(os.path.dirname(self.troot))
     logging.info("Running setup module start (cwd=%s)", self.directory)
 
-    command = 'bash -exc "source %s && source %s.sh"' % (
+    command = '%s/TFBReaper "bash -exc \\\"source %s && source %s.sh\\\""' % (
+      self.install_root,
       bash_functions_path,
       os.path.join(self.troot, self.setup_file))
 
@@ -244,7 +245,7 @@ class FrameworkTest:
           stdout=subprocess.PIPE,
           stderr=subprocess.STDOUT,
           preexec_fn=os.setsid)
-    sid = os.getsid(p.pid)
+    pid = p.pid
     nbsr = setup_util.NonBlockingStreamReader(p.stdout,
       "%s: %s.sh and framework processes have terminated" % (self.name, self.setup_file))
 
@@ -341,7 +342,7 @@ class FrameworkTest:
     logging.info("Executed %s.sh, returning %s", self.setup_file, retcode)
     os.chdir(previousDir)
 
-    return retcode, sid
+    return retcode, pid
   ############################################################
   # End start
   ############################################################

+ 52 - 0
toolset/setup/linux/TFBReaper.c

@@ -0,0 +1,52 @@
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/prctl.h>
+#include <sys/syscall.h>
+#include <asm/unistd.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+// Entry point for TFBReaper.
+//
+// Joins argv[1..argc-1] into a single command line (each argument is
+// followed by one space), marks this process as a child subreaper, runs
+// the command through system(3), and then stays alive until every
+// remaining child of this process has been reaped.
+//
+// Returns the command's exit status; 128 + N if the command was killed
+// by signal N; EXIT_FAILURE if the command line could not be allocated
+// or system(3) itself failed.
+int main(int argc, char *argv[])
+{
+  // Every argument is followed by one space, and the string needs one
+  // extra byte for the terminating NUL that is written after the last
+  // space. Leaving that byte out makes the final store below land one
+  // past the end of the allocation.
+  size_t total_size = 1;
+  for( int i = 1; i < argc; i++ ) {
+    total_size += strlen(argv[i]) + 1;
+  }
+
+  char *command = malloc(total_size);
+  if( command == NULL ) {
+    perror("TFBReaper: malloc");
+    return EXIT_FAILURE;
+  }
+
+  char *ptr = command;
+  for( int i = 1; i < argc; i++ ) {
+    size_t length = strlen(argv[i]);
+    memcpy(ptr, argv[i], length);
+    ptr[length] = ' ';
+    ptr += length + 1;
+  }
+  *ptr = '\0';
+
+  // Here is the magic. This sets any child processes to
+  // use THIS process as a 'subreaper'. What that means is
+  // even if the process uses the fork-exit technique for
+  // running a daemon (which normally orphans the process
+  // and causes init(1) to adopt it, which is problematic
+  // for TFB because we cannot then generally kill the
+  // process since it has lost all context available to us)
+  // the child process will have the parent id of THIS
+  // process, allowing us to kill all the processes started
+  // by the suite in this way generally.
+  //
+  // See: http://man7.org/linux/man-pages/man2/prctl.2.html
+  if( prctl(PR_SET_CHILD_SUBREAPER, 1) != 0 ) {
+    // Best effort: the command can still run, but orphaned descendants
+    // will be adopted by init(1) instead of this process.
+    perror("TFBReaper: prctl(PR_SET_CHILD_SUBREAPER)");
+  }
+
+  int ret = system(command);
+  if( ret == -1 ) {
+    // Report before anything else can overwrite errno.
+    perror("TFBReaper: system");
+  }
+  free(command);
+
+  // system(3) has already reaped the shell it spawned; whatever is left
+  // are descendants that were re-parented to this process. Keep waiting
+  // until wait(2) reports that no children remain, otherwise the reaper
+  // would exit after the first one dies and the rest would be handed to
+  // init(1) after all.
+  int status;
+  while( wait(&status) > 0 ) {
+  }
+
+  if( ret == -1 ) {
+    return EXIT_FAILURE;
+  }
+
+  // system(3) returns a wait status, not an exit code. Returning it
+  // directly would keep only the low 8 bits, so e.g. exit status 1
+  // (encoded as 256) would be reported as success.
+  if( WIFEXITED(ret) ) {
+    return WEXITSTATUS(ret);
+  }
+  if( WIFSIGNALED(ret) ) {
+    return 128 + WTERMSIG(ret);
+  }
+  return EXIT_FAILURE;
+}
+

+ 2 - 0
toolset/setup/linux/prerequisites.sh

@@ -10,6 +10,8 @@ RETCODE=$(fw_exists fwbm_prereqs_installed)
   echo "Prerequisites installed!"; 
   return 0; }
 
+# Create TFBReaper application
+gcc -std=c99 -oTFBReaper $FWROOT/toolset/setup/linux/TFBReaper.c
 
 # Use a more recent version of Mongo shell
 sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 7F0CEB10