Ver código fonte

Avoid race condition between domain unload and threadpool. (#3491)

* Avoid race condition between domain unload and threadpool.

Previous logic only ensured that domain->threadpool_jobs was 0.
There was a race condition as follows:
* worker jobs are queued into ThreadPoolDomain outstanding_request's
* domain unload starts
* domain->threadpool_jobs is 0 so unload proceeds
* queued jobs are processed from the ThreadPoolDomain outstanding_request's
* the domain can be set/entered by worker threads since
  mono_domain_set only fails for domains with state of MONO_APPDOMAIN_UNLOADED,
  while our domain is in the MONO_APPDOMAIN_UNLOADING state
* jobs are now being unexpectedly run in domain that is being shutdown

* Fix a hang when unloading the domain.

This change applies the diff from ludovic-henry to correct the hang.

https://gist.github.com/ludovic-henry/6c8a4bc8951091c0ef36df2e52d5e32e
Jonathan Chambers 9 anos atrás
pai
commit
12d1d63b8a
1 arquivos alterados com 43 adições e 32 exclusões
  1. 43 32
      mono/metadata/threadpool-ms.c

+ 43 - 32
mono/metadata/threadpool-ms.c

@@ -430,6 +430,9 @@ domain_get (MonoDomain *domain, gboolean create)
 	}
 	}
 
 
 	if (create) {
 	if (create) {
+		g_assert(!domain->cleanup_semaphore);
+		domain->cleanup_semaphore = CreateSemaphore(NULL, 0, 1, NULL);
+
 		tpdomain = g_new0 (ThreadPoolDomain, 1);
 		tpdomain = g_new0 (ThreadPoolDomain, 1);
 		tpdomain->domain = domain;
 		tpdomain->domain = domain;
 		domain_add (tpdomain);
 		domain_add (tpdomain);
@@ -681,10 +684,13 @@ worker_thread (gpointer data)
 		g_assert (tpdomain->domain->threadpool_jobs >= 0);
 		g_assert (tpdomain->domain->threadpool_jobs >= 0);
 
 
 		if (tpdomain->domain->threadpool_jobs == 0 && mono_domain_is_unloading (tpdomain->domain)) {
 		if (tpdomain->domain->threadpool_jobs == 0 && mono_domain_is_unloading (tpdomain->domain)) {
-			gboolean removed = domain_remove (tpdomain);
+			gboolean removed;
+
+			removed = domain_remove(tpdomain);
 			g_assert (removed);
 			g_assert (removed);
-			if (tpdomain->domain->cleanup_semaphore)
-				ReleaseSemaphore (tpdomain->domain->cleanup_semaphore, 1, NULL);
+
+			g_assert(tpdomain->domain->cleanup_semaphore);
+			ReleaseSemaphore (tpdomain->domain->cleanup_semaphore, 1, NULL);
 			domain_free (tpdomain);
 			domain_free (tpdomain);
 			tpdomain = NULL;
 			tpdomain = NULL;
 		}
 		}
@@ -1427,9 +1433,9 @@ mono_threadpool_ms_end_invoke (MonoAsyncResult *ares, MonoArray **out_args, Mono
 gboolean
 gboolean
 mono_threadpool_ms_remove_domain_jobs (MonoDomain *domain, int timeout)
 mono_threadpool_ms_remove_domain_jobs (MonoDomain *domain, int timeout)
 {
 {
-	gboolean res = TRUE;
-	gint64 end;
-	gpointer sem;
+	guint32 res;
+	gint64 now, end;
+	ThreadPoolDomain *tpdomain;
 
 
 	g_assert (domain);
 	g_assert (domain);
 	g_assert (timeout >= -1);
 	g_assert (timeout >= -1);
@@ -1448,38 +1454,43 @@ mono_threadpool_ms_remove_domain_jobs (MonoDomain *domain, int timeout)
 #endif
 #endif
 
 
 	/*
 	/*
-	 * There might be some threads out that could be about to execute stuff from the given domain.
-	 * We avoid that by setting up a semaphore to be pulsed by the thread that reaches zero.
-	 */
-	sem = domain->cleanup_semaphore = CreateSemaphore (NULL, 0, 1, NULL);
+	* There might be some threads out that could be about to execute stuff from the given domain.
+	* We avoid that by waiting on a semaphore to be pulsed by the thread that reaches zero.
+	* The semaphore is only created for domains which queued threadpool jobs.
+	* We always wait on the semaphore rather than ensuring domain->threadpool_jobs is 0.
+	* There may be pending outstanding requests which will create new jobs.
+	* The semaphore is signaled the threadpool domain has been removed from list
+	* and we know no more jobs for the domain will be processed.
+	*/
+
+	mono_lazy_initialize(&status, initialize);
+	mono_coop_mutex_lock(&threadpool->domains_lock);
+
+	tpdomain = domain_get(domain, FALSE);
+	if (!tpdomain || tpdomain->outstanding_request == 0) {
+		mono_coop_mutex_unlock(&threadpool->domains_lock);
+		return TRUE;
+	}
+	g_assert(domain->cleanup_semaphore);
 
 
-	/*
-	 * The memory barrier here is required to have global ordering between assigning to cleanup_semaphone
-	 * and reading threadpool_jobs. Otherwise this thread could read a stale version of threadpool_jobs
-	 * and wait forever.
-	 */
-	mono_memory_write_barrier ();
-
-	while (domain->threadpool_jobs) {
-		gint64 now;
-
-		if (timeout != -1) {
-			now = mono_msec_ticks ();
-			if (now > end) {
-				res = FALSE;
-				break;
-			}
+	if (timeout != -1) {
+		now = mono_msec_ticks();
+		if (now > end) {
+			mono_coop_mutex_unlock(&threadpool->domains_lock);
+			return FALSE;
 		}
 		}
-
-		MONO_ENTER_GC_SAFE;
-		WaitForSingleObjectEx (sem, timeout != -1 ? end - now : timeout, FALSE);
-		MONO_EXIT_GC_SAFE;
 	}
 	}
 
 
+	MONO_ENTER_GC_SAFE;
+	res = WaitForSingleObjectEx(domain->cleanup_semaphore, timeout != -1 ? end - now : timeout, FALSE);
+	MONO_EXIT_GC_SAFE;
+
+	CloseHandle(domain->cleanup_semaphore);
 	domain->cleanup_semaphore = NULL;
 	domain->cleanup_semaphore = NULL;
-	CloseHandle (sem);
 
 
-	return res;
+	mono_coop_mutex_unlock(&threadpool->domains_lock);
+
+	return res == WAIT_OBJECT_0;
 }
 }
 
 
 void
 void