
passing tests

Alec Jacobson · 1 month ago · commit e6a7331381

2 changed files with 70 additions and 56 deletions:
  1. include/igl/parallel_for.h (+65 −53)
  2. include/igl/predicates/find_intersections.cpp (+5 −3)

include/igl/parallel_for.h (+65 −53)

@@ -250,6 +250,7 @@ inline bool igl::parallel_for(
   return parallel_for(loop_size, no_op, wrapper, no_op, min_parallel);
 }
 
+
 template<
   typename Index,
   typename PreFunctionType,
@@ -262,14 +263,15 @@ inline bool igl::parallel_for(
   const AccumFunctionType & accum_func,
   const size_t min_parallel)
 {
-  assert(loop_size>=0);
-  if(loop_size==0) return false;
+  assert(loop_size >= 0);
+  if (loop_size == 0) return false;
 
-  // If we're already inside a ThreadPool worker, run serial to avoid deadlock
-  if(igl::internal::is_worker_thread())
+  // If we're already inside a ThreadPool worker, run serial to avoid nested
+  // deadlock with the global pool.
+  if (igl::internal::is_worker_thread())
   {
     prep_func(1);
-    for(Index i = 0; i < loop_size; ++i)
+    for (Index i = 0; i < loop_size; ++i)
     {
       func(i, 0);
     }
@@ -283,76 +285,86 @@ inline bool igl::parallel_for(
   const size_t configured_threads = igl::default_num_threads();
 #endif
 
-  if(loop_size < static_cast<Index>(min_parallel) || configured_threads <= 1)
+  if (loop_size < static_cast<Index>(min_parallel) || configured_threads <= 1)
   {
-    // serial
+    // Serial fallback
     prep_func(1);
-    for(Index i = 0; i < loop_size; ++i)
+    for (Index i = 0; i < loop_size; ++i)
     {
       func(i, 0);
     }
     accum_func(0);
     return false;
   }
-  else
-  {
-    // Use shared thread pool
-    auto & pool = igl::internal::ThreadPool::instance(configured_threads);
-    const size_t pool_threads = std::max<size_t>(1, pool.size());
 
-    // Keep semantics: prep called with number of potential threads
-    prep_func(pool_threads);
+  // --- Parallel branch using shared thread pool ---
 
-    // Number of logical jobs (chunks)
-    const size_t jobs = static_cast<size_t>(
-      std::min<Index>(loop_size, static_cast<Index>(pool_threads)));
+  auto & pool = igl::internal::ThreadPool::instance(configured_threads);
+  const size_t pool_threads = std::max<size_t>(1, pool.size());
 
-    struct SharedCounter
-    {
-      std::atomic<size_t> remaining;
-    };
+  // Match old semantics: prep called with number of *potential* threads.
+  prep_func(pool_threads);
 
-    auto counter = std::make_shared<SharedCounter>();
-    counter->remaining.store(jobs, std::memory_order_relaxed);
+  // Number of "logical jobs" (chunks of the index range).
+  const size_t jobs = static_cast<size_t>(
+    std::min<Index>(loop_size, static_cast<Index>(pool_threads)));
 
-    const Index total = loop_size;
-    const Index base  = total / static_cast<Index>(jobs);
-    const Index rem   = total % static_cast<Index>(jobs);
+  struct Group
+  {
+    std::mutex mutex;
+    std::condition_variable cv;
+    std::atomic<size_t> remaining;
+  };
 
-    for(size_t t = 0; t < jobs; ++t)
-    {
-      const Index start =
-        static_cast<Index>(t) * base
-        + std::min<Index>(static_cast<Index>(t), rem);
-      const Index end = start + base + (t < static_cast<size_t>(rem) ? 1 : 0);
+  auto group = std::make_shared<Group>();
+  group->remaining.store(jobs, std::memory_order_relaxed);
 
-      pool.enqueue([counter, &func, start, end, t]()
-      {
-        for(Index k = start; k < end; ++k)
-        {
-          func(k, t);
-        }
-        counter->remaining.fetch_sub(1, std::memory_order_acq_rel);
-      });
-    }
+  const Index total = loop_size;
+  const Index base  = total / static_cast<Index>(jobs);
+  const Index rem   = total % static_cast<Index>(jobs);
+
+  for (size_t t = 0; t < jobs; ++t)
+  {
+    const Index start =
+      static_cast<Index>(t) * base
+      + std::min<Index>(static_cast<Index>(t), rem);
 
-    // Wait until all jobs for this parallel_for are finished.
-    // Busy-wait with yield to avoid hammering a core.
-    while(counter->remaining.load(std::memory_order_acquire) != 0)
+    const Index end = start + base + (t < static_cast<size_t>(rem) ? 1 : 0);
+
+    pool.enqueue([group, &func, start, end, t]()
     {
-      std::this_thread::yield();
-    }
+      // Each job processes its contiguous slice [start, end)
+      for (Index k = start; k < end; ++k)
+      {
+        func(k, t);
+      }
 
-    // Accumulate across all potential threads (like original impl)
-    for(size_t t = 0; t < pool_threads; ++t)
+      // Signal completion of this job.
+      if (group->remaining.fetch_sub(1, std::memory_order_acq_rel) == 1)
+      {
+        std::unique_lock<std::mutex> lock(group->mutex);
+        group->cv.notify_one();
+      }
+    });
+  }
+
+  // Wait for all jobs for this parallel_for call to finish.
+  {
+    std::unique_lock<std::mutex> lock(group->mutex);
+    group->cv.wait(lock, [&group]()
     {
-      accum_func(t);
-    }
+      return group->remaining.load(std::memory_order_acquire) == 0;
+    });
+  }
 
-    return true;
+  // Accumulate across all potential threads (same as original implementation).
+  for (size_t t = 0; t < pool_threads; ++t)
+  {
+    accum_func(t);
   }
-}
 
+  return true;
+}
 
 #endif
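
Aside (not part of the diff): the overload rewritten above is libigl's four-callback parallel_for(loop_size, prep_func, func, accum_func, min_parallel), where prep_func receives the number of potential threads, func receives (index, thread id), and accum_func is called serially per thread id. A minimal caller sketch under that assumption; the summing task and the names sum_of_squares and partial are illustrative, not from this commit:

#include <igl/parallel_for.h>
#include <cstddef>
#include <vector>

double sum_of_squares(const std::vector<double> & X)
{
  std::vector<double> partial;
  double total = 0;
  igl::parallel_for(
    X.size(),
    // prep_func: size one accumulator slot per potential thread
    [&](const size_t nt){ partial.assign(nt, 0.0); },
    // func: thread t owns partial[t], so no locking is needed here
    [&](const size_t i, const size_t t){ partial[t] += X[i] * X[i]; },
    // accum_func: runs serially after the loop, once per potential thread
    [&](const size_t t){ total += partial[t]; },
    /*min_parallel=*/1000);
  return total;
}

Per-thread partial results plus a serial accumulation pass is exactly why prep_func must be told the thread count up front, which is the semantics the rewritten branch preserves via prep_func(pool_threads).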
 
 

include/igl/predicates/find_intersections.cpp (+5 −3)

@@ -52,11 +52,11 @@ IGL_INLINE bool igl::predicates::find_intersections(
   const bool self_test = (&V1 == &V2) && (&F1 == &F2);
   if(stinker){ printf("%s\n",self_test?"🍎&(V1,F1) == 🍎&(V2,F2)":"🍎≠🍊"); }
 
+  std::atomic<bool> found_any(false);
   int num_if = 0;
-  // mutex
   std::mutex append_mutex;
-  const auto append_intersection = 
-    [&IF,&CP,&num_if,&append_mutex]( const int f1, const int f2, const bool coplanar = false)
+  const auto append_intersection =
+  [&IF,&CP,&num_if,&append_mutex,&found_any](const int f1, const int f2, const bool coplanar = false)
   {
     std::lock_guard<std::mutex> lock(append_mutex);
     if(num_if >= IF.rows())
@@ -67,8 +67,10 @@ IGL_INLINE bool igl::predicates::find_intersections(
     CP(num_if) = coplanar;
     IF.row(num_if) << f1,f2;
     num_if++;
+    found_any.store(true, std::memory_order_release);
   };
 
+
   // Returns corner in ith face opposite of shared edge; -1 otherwise
   const auto shared_edge = [&F1](const int f, const int g)->int
   {
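
Aside (not part of the diff): the change above pairs a mutex-guarded append with an atomic found_any flag published via a release store, so other threads can cheaply check whether any intersection has been recorded without taking the lock. A standalone sketch of that pattern; the container and the names pairs and append_pair are illustrative, not libigl API:

#include <atomic>
#include <mutex>
#include <utility>
#include <vector>

std::atomic<bool> found_any(false);
std::mutex append_mutex;
std::vector<std::pair<int,int>> pairs;

void append_pair(const int f1, const int f2)
{
  // The mutex serializes all writers of the shared output buffer.
  std::lock_guard<std::mutex> lock(append_mutex);
  pairs.emplace_back(f1, f2);
  // Release store: a thread that later observes found_any == true via an
  // acquire load is guaranteed to also see the row appended above.
  found_any.store(true, std::memory_order_release);
}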
   {