Browse Source

pipeline: Improve performance of Thread::get_current_thread() substantially

Speedup is realised by using thread-local variables.  Note that on Windows we can't inline get_current_thread, but it's still faster this way than calling TlsGetValue.

In theory the cache line alignment should help avoid false sharing but I have not profiled that extensively.
rdb 3 years ago
parent
commit
46a1ad3544

+ 2 - 1
panda/src/pipeline/thread.h

@@ -42,7 +42,8 @@ class AsyncTask;
  * object will automatically be destructed if no other pointers are
  * referencing it.
  */
-class EXPCL_PANDA_PIPELINE Thread : public TypedReferenceCount, public Namable {
+// Due to a GCC bug, we can't use alignas() together with an attribute.
+class ALIGN_64BYTE EXPCL_PANDA_PIPELINE Thread : public TypedReferenceCount, public Namable {
 protected:
   Thread(const std::string &name, const std::string &sync_name);
   Thread(const Thread &copy) = delete;

+ 2 - 20
panda/src/pipeline/threadPosixImpl.I

@@ -46,26 +46,8 @@ prepare_for_exit() {
 INLINE Thread *ThreadPosixImpl::
 get_current_thread() {
   TAU_PROFILE("Thread *ThreadPosixImpl::get_current_thread()", " ", TAU_USER);
-  if (!_got_pt_ptr_index) {
-    init_pt_ptr_index();
-  }
-  return (Thread *)pthread_getspecific(_pt_ptr_index);
-}
-
-/**
- * Associates the indicated Thread object with the currently-executing thread.
- * You should not call this directly; use Thread::bind_thread() instead.
- */
-INLINE void ThreadPosixImpl::
-bind_thread(Thread *thread) {
-  if (!_got_pt_ptr_index) {
-    init_pt_ptr_index();
-  }
-  int result = pthread_setspecific(_pt_ptr_index, thread);
-  nassertv(result == 0);
-#ifdef ANDROID
-  bind_java_thread();
-#endif
+  Thread *thread = _current_thread;
+  return (thread != nullptr) ? thread : init_current_thread();
 }
 
 /**

+ 28 - 27
panda/src/pipeline/threadPosixImpl.cxx

@@ -28,8 +28,8 @@
 static JavaVM *java_vm = nullptr;
 #endif
 
-pthread_key_t ThreadPosixImpl::_pt_ptr_index = 0;
-bool ThreadPosixImpl::_got_pt_ptr_index = false;
+__thread Thread *ThreadPosixImpl::_current_thread = nullptr;
+static patomic_flag _main_thread_known = ATOMIC_FLAG_INIT;
 
 /**
  *
@@ -80,10 +80,6 @@ start(ThreadPriority priority, bool joinable) {
   _status = S_start_called;
   _detached = false;
 
-  if (!_got_pt_ptr_index) {
-    init_pt_ptr_index();
-  }
-
   pthread_attr_t attr;
   pthread_attr_init(&attr);
 
@@ -186,6 +182,21 @@ get_unique_id() const {
   return strm.str();
 }
 
+/**
+ * Associates the indicated Thread object with the currently-executing thread.
+ * You should not call this directly; use Thread::bind_thread() instead.
+ */
+void ThreadPosixImpl::
+bind_thread(Thread *thread) {
+  if (_current_thread == nullptr && thread == Thread::get_main_thread()) {
+    _main_thread_known.test_and_set(std::memory_order_relaxed);
+  }
+  _current_thread = thread;
+#ifdef ANDROID
+  bind_java_thread();
+#endif
+}
+
 #ifdef ANDROID
 /**
  * Attaches the thread to the Java virtual machine.  If this returns true, a
@@ -247,8 +258,7 @@ root_func(void *data) {
     // TAU_PROFILE("void ThreadPosixImpl::root_func()", " ", TAU_USER);
 
     ThreadPosixImpl *self = (ThreadPosixImpl *)data;
-    int result = pthread_setspecific(_pt_ptr_index, self->_parent_obj);
-    nassertr(result == 0, nullptr);
+    _current_thread = self->_parent_obj;
 
     {
       self->_mutex.lock();
@@ -302,27 +312,18 @@ root_func(void *data) {
 }
 
 /**
- * Allocate a new index to store the Thread parent pointer as a piece of per-
- * thread private data.
+ * Called by get_current_thread() if the current therad pointer is null; checks
+ * whether it might be the main thread.
  */
-void ThreadPosixImpl::
-init_pt_ptr_index() {
-  nassertv(!_got_pt_ptr_index);
-
-  int result = pthread_key_create(&_pt_ptr_index, nullptr);
-  if (result != 0) {
-    thread_cat->error()
-      << "Unable to associate Thread pointers with threads.\n";
-    return;
+Thread *ThreadPosixImpl::
+init_current_thread() {
+  Thread *thread = _current_thread;
+  if (!_main_thread_known.test_and_set(std::memory_order_relaxed)) {
+    thread = Thread::get_main_thread();
+    _current_thread = thread;
   }
-
-  _got_pt_ptr_index = true;
-
-  // Assume that we must be in the main thread, since this method must be
-  // called before the first thread is spawned.
-  Thread *main_thread_obj = Thread::get_main_thread();
-  result = pthread_setspecific(_pt_ptr_index, main_thread_obj);
-  nassertv(result == 0);
+  nassertr(thread != nullptr, nullptr);
+  return thread;
 }
 
 #ifdef ANDROID

+ 3 - 4
panda/src/pipeline/threadPosixImpl.h

@@ -49,7 +49,7 @@ public:
   INLINE static void prepare_for_exit();
 
   INLINE static Thread *get_current_thread();
-  INLINE static void bind_thread(Thread *thread);
+  static void bind_thread(Thread *thread);
   INLINE static bool is_threading_supported();
   INLINE static bool is_true_threads();
   INLINE static bool is_simple_threads();
@@ -65,7 +65,7 @@ public:
 
 private:
   static void *root_func(void *data);
-  static void init_pt_ptr_index();
+  static Thread *init_current_thread();
 
   // There appears to be a name collision with the word "Status".
   enum PStatus {
@@ -86,8 +86,7 @@ private:
   JNIEnv *_jni_env;
 #endif
 
-  static pthread_key_t _pt_ptr_index;
-  static bool _got_pt_ptr_index;
+  static __thread Thread *_current_thread;
 };
 
 #include "threadPosixImpl.I"

+ 0 - 24
panda/src/pipeline/threadWin32Impl.I

@@ -38,30 +38,6 @@ INLINE void ThreadWin32Impl::
 prepare_for_exit() {
 }
 
-/**
- *
- */
-INLINE Thread *ThreadWin32Impl::
-get_current_thread() {
-  if (!_got_pt_ptr_index) {
-    init_pt_ptr_index();
-  }
-  return (Thread *)TlsGetValue(_pt_ptr_index);
-}
-
-/**
- * Associates the indicated Thread object with the currently-executing thread.
- * You should not call this directly; use Thread::bind_thread() instead.
- */
-INLINE void ThreadWin32Impl::
-bind_thread(Thread *thread) {
-  if (!_got_pt_ptr_index) {
-    init_pt_ptr_index();
-  }
-  BOOL result = TlsSetValue(_pt_ptr_index, thread);
-  nassertv(result);
-}
-
 /**
  *
  */

+ 44 - 32
panda/src/pipeline/threadWin32Impl.cxx

@@ -20,8 +20,28 @@
 #include "pointerTo.h"
 #include "config_pipeline.h"
 
-DWORD ThreadWin32Impl::_pt_ptr_index = 0;
-bool ThreadWin32Impl::_got_pt_ptr_index = false;
+static thread_local Thread *_current_thread = nullptr;
+static patomic_flag _main_thread_known = ATOMIC_FLAG_INIT;
+
+/**
+ * Called by get_current_thread() if the current thread pointer is null; checks
+ * whether it might be the main thread.
+ * Note that adding noinline speeds up this call *significantly*, don't remove!
+ */
+static __declspec(noinline) Thread *
+init_current_thread() {
+  Thread *thread = _current_thread;
+  if (!_main_thread_known.test_and_set(std::memory_order_relaxed)) {
+    // Assume that we must be in the main thread, since this method must be
+    // called before the first thread is spawned.
+    thread = Thread::get_main_thread();
+    _current_thread = thread;
+  }
+  // If this assertion triggers, you are making Panda calls from a thread
+  // that has not first been registered using Thread::bind_thread().
+  nassertr(thread != nullptr, nullptr);
+  return thread;
+}
 
 /**
  *
@@ -62,10 +82,6 @@ start(ThreadPriority priority, bool joinable) {
   _joinable = joinable;
   _status = S_start_called;
 
-  if (!_got_pt_ptr_index) {
-    init_pt_ptr_index();
-  }
-
   // Increment the parent object's reference count first.  The thread will
   // eventually decrement it when it terminates.
   _parent_obj->ref();
@@ -133,6 +149,27 @@ get_unique_id() const {
   return strm.str();
 }
 
+/**
+ *
+ */
+Thread *ThreadWin32Impl::
+get_current_thread() {
+  Thread *thread = _current_thread;
+  return (thread != nullptr) ? thread : init_current_thread();
+}
+
+/**
+ * Associates the indicated Thread object with the currently-executing thread.
+ * You should not call this directly; use Thread::bind_thread() instead.
+ */
+void ThreadWin32Impl::
+bind_thread(Thread *thread) {
+  if (_current_thread == nullptr && thread == Thread::get_main_thread()) {
+    _main_thread_known.test_and_set(std::memory_order_relaxed);
+  }
+  _current_thread = thread;
+}
+
 /**
  * The entry point of each thread.
  */
@@ -143,8 +180,7 @@ root_func(LPVOID data) {
     // TAU_PROFILE("void ThreadWin32Impl::root_func()", " ", TAU_USER);
 
     ThreadWin32Impl *self = (ThreadWin32Impl *)data;
-    BOOL result = TlsSetValue(_pt_ptr_index, self->_parent_obj);
-    nassertr(result, 1);
+    _current_thread = self->_parent_obj;
 
     {
       self->_mutex.lock();
@@ -185,28 +221,4 @@ root_func(LPVOID data) {
   return 0;
 }
 
-/**
- * Allocate a new index to store the Thread parent pointer as a piece of per-
- * thread private data.
- */
-void ThreadWin32Impl::
-init_pt_ptr_index() {
-  nassertv(!_got_pt_ptr_index);
-
-  _pt_ptr_index = TlsAlloc();
-  if (_pt_ptr_index == TLS_OUT_OF_INDEXES) {
-    thread_cat->error()
-      << "Unable to associate Thread pointers with threads.\n";
-    return;
-  }
-
-  _got_pt_ptr_index = true;
-
-  // Assume that we must be in the main thread, since this method must be
-  // called before the first thread is spawned.
-  Thread *main_thread_obj = Thread::get_main_thread();
-  BOOL result = TlsSetValue(_pt_ptr_index, main_thread_obj);
-  nassertv(result);
-}
-
 #endif  // THREAD_WIN32_IMPL

+ 2 - 6
panda/src/pipeline/threadWin32Impl.h

@@ -43,8 +43,8 @@ public:
 
   INLINE static void prepare_for_exit();
 
-  INLINE static Thread *get_current_thread();
-  INLINE static void bind_thread(Thread *thread);
+  static Thread *get_current_thread();
+  static void bind_thread(Thread *thread);
   INLINE static bool is_threading_supported();
   INLINE static bool is_true_threads();
   INLINE static bool is_simple_threads();
@@ -54,7 +54,6 @@ public:
 
 private:
   static DWORD WINAPI root_func(LPVOID data);
-  static void init_pt_ptr_index();
 
   enum Status {
     S_new,
@@ -70,9 +69,6 @@ private:
   DWORD _thread_id;
   bool _joinable;
   Status _status;
-
-  static DWORD _pt_ptr_index;
-  static bool _got_pt_ptr_index;
 };
 
 #include "threadWin32Impl.I"