Browse Source

TFBReaper Improvements (#2847)

* Reaper update

Moved process killing to the TFBReaper.

* Added more ps checking

* Updated TFBReaper to address respawned slaves

When a slave process is killed before the master, the master can
(and very often does) respawn a new slave to take the killed one's
place. This update puts the TFBReaper into a loop until there are
no processes remaining in the lineage of the TFBReaper.

* Attempting to release defunct processes

* Loops for 30s while waiting for the reaper to die

* Realized this logging is now incorrect

* Removed some superfluous logging

Also, moved the stop test functionality into its own function

* Comments and code cleanup
Mike Smith 8 years ago
parent
commit
c4a1bee948
2 changed files with 77 additions and 66 deletions
  1. 34 42
      toolset/benchmark/benchmarker.py
  2. 43 24
      toolset/setup/linux/TFBReaper.c

+ 34 - 42
toolset/benchmark/benchmarker.py

@@ -550,10 +550,6 @@ class Benchmarker:
             out.flush()
             try:
                 self.__cleanup_leftover_processes_before_test()
-                ##########################
-                # Capturing PIDs before
-                ##########################
-                normalPIDs = subprocess.check_output(['ps -o pid,ppid,comm -u $(whoami)'], shell=True)
 
                 if self.__is_port_bound(test.port):
                     time.sleep(60)
@@ -579,11 +575,6 @@ class Benchmarker:
                 logging.info("Sleeping %s seconds to ensure framework is ready" % self.sleep)
                 time.sleep(self.sleep)
 
-                ##########################
-                # Capturing PIDs started
-                ##########################
-                startedPIDs = subprocess.check_output(['ps -aux'], shell=True)
-
                 ##########################
                 # Verify URLs
                 ##########################
@@ -610,37 +601,7 @@ class Benchmarker:
                 ##########################
                 # Stop this test
                 ##########################
-                out.write(header("Stopping %s" % test.name))
-                out.flush()
-                self.__process.terminate()
-                out.flush()
-                time.sleep(5)
-
-                if self.__is_port_bound(test.port):
-                    # This can happen sometimes - let's try again
-                    self.__process.terminate()
-                    out.flush()
-                    time.sleep(5)
-                    if self.__is_port_bound(test.port):
-                        leftovers = "  PID  PPID COMMAND" + os.linesep
-                        for line in subprocess.check_output(['ps -aux'], shell=True).splitlines():
-                            if line not in startedPIDs:
-                                leftovers += line + os.linesep
-
-                        started = "  PID  PPID COMMAND" + os.linesep
-                        for line in startedPIDs.splitlines():
-                            if line not in normalPIDs:
-                                started += line + os.linesep
-
-                        # We gave it our all
-                        self.__write_intermediate_results(test.name, "port " + str(test.port) + " was not released by stop")
-                        out.write(header("Error: Port %s was not released by stop - %s" % (test.port, test.name)))
-                        out.write(header("Processes Started"))
-                        out.write(started)
-                        out.write(header("Processes Not Killed"))
-                        out.write(leftovers)
-                        out.flush()
-                        return exit_with_code(1)
+                self.__stop_test(test, out)
 
                 out.write(header("Stopped %s" % test.name))
                 out.flush()
@@ -677,8 +638,7 @@ class Benchmarker:
                     print "Failed verify!"
                     return exit_with_code(1)
             except KeyboardInterrupt:
-                if self.__process is not None:
-                    self.__process.terminate()
+                self.__stop_test(test, out)
             except (OSError, IOError, subprocess.CalledProcessError) as e:
                 self.__write_intermediate_results(test.name,"<setup.py> raised an exception")
                 out.write(header("Subprocess Error %s" % test.name))
@@ -694,6 +654,38 @@ class Benchmarker:
     # End __run_tests
     ############################################################
 
+    ############################################################
+    # __stop_test
+    # Attempts to stop the running test.
+    ############################################################
+    def __stop_test(self, test, out):
+        # self.__process may not be set if the user hit ctrl+c prior to the test
+        # starting properly.
+        if self.__process is not None:
+            out.write(header("Stopping %s" % test.name))
+            out.flush()
+            # Ask TFBReaper to nicely terminate itself
+            self.__process.terminate()
+            slept = 0
+            returnCode = None
+            # Check once a second to see if TFBReaper has exited
+            while(slept < 30 and returnCode is None):
+                time.sleep(1)
+                slept += 1
+                returnCode = self.__process.poll()
+            
+            # If TFBReaper has not exited at this point, we have a problem
+            if returnCode is None:
+                self.__write_intermediate_results(test.name, "port " + str(test.port) + " was not released by stop")
+                out.write(header("Error: Port %s was not released by stop - %s" % (test.port, test.name)))
+                out.write(header("Running Processes"))
+                out.write(subprocess.check_output(['ps -aux'], shell=True))
+                out.flush()
+                return exit_with_code(1)
+    ############################################################
+    # End __stop_test
+    ############################################################
+
     def is_port_bound(self, port):
         return self.__is_port_bound(port)
 

+ 43 - 24
toolset/setup/linux/TFBReaper.c

@@ -4,6 +4,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
+#include <sys/wait.h>
 #include <sys/prctl.h>
 #include <string.h>
 
@@ -38,37 +39,55 @@ void reap(int signum)
   char command[256];
   sprintf(command, "findChilds() { for child in $(ps --ppid $1 ho pid); do echo $child; findChilds $child; done } && findChilds %d", pid);
 
-  char *pids[256];
-  fp = popen(command, "r");
-  while(fgets(buf, sizeof(buf), fp) != 0)
-  {
-    Node *newNode = malloc(sizeof(Node));
-    newNode->str = malloc(strlen(buf)+1);
-    strcpy(newNode->str, buf);
-    newNode->next = NULL;
+  int count;
 
-    if(tail == NULL)
-    {
-      tail = newNode;
-      head = newNode;
-    }
-    else
+  do
+  {
+    count = 0;
+    char *pids[256];
+    fp = popen(command, "r");
+    while(fgets(buf, sizeof(buf), fp) != 0)
     {
-      if(head->next == NULL)
+      Node *newNode = malloc(sizeof(Node));
+      newNode->str = malloc(strlen(buf)+1);
+      strcpy(newNode->str, buf);
+      newNode->next = NULL;
+
+      if(tail == NULL)
       {
-        head->next = newNode;
+        tail = newNode;
+        head = newNode;
       }
-      tail->next = newNode;
-      tail = newNode;
+      else
+      {
+        if(head->next == NULL)
+        {
+          head->next = newNode;
+        }
+        tail->next = newNode;
+        tail = newNode;
+      }
+      count ++;
     }
-  }
 
-  Node *curr = head;
-  while(curr != NULL)
-  {
-    kill(atoi(curr->str), SIGKILL);
-    curr = curr->next;
+    Node *curr = head;
+    while(curr != NULL)
+    {
+      kill(atoi(curr->str), SIGKILL);
+      waitpid(atoi(curr->str), NULL, 0);
+      curr = curr->next;
+    }
   }
+  // This may seem magical, but that command from above always results in two
+  // additionally PIDs: one for `ps` and one for `sh`. Therefore, all of the
+  // lineage of this TFBReaper have been successfully killed once there are
+  // only two PIDs counted in the loop.
+  // This loop is necessary for edge cases where there is a master->slave 
+  // lineage and TFBReaper kills a slave first, which is observed and fixed
+  // by the master by spawning a NEW slave in the original's place, and then
+  // killing the master (thus orphaning the newly spawned slave, but that PID
+  // is not in our master list).
+  while(count > 2);
 
   exit(0);
 }