Browse Source

dtoolbase: Use mimalloc on Windows, disable USE_DELETED_CHAIN

Windows' malloc has awful performance.  mimalloc is orders of magnitude faster, even faster than DeletedBufferChain.  Therefore, only enable USE_DELETED_CHAIN on Windows when building without mimalloc.

On Linux, mimalloc doesn't appear to be measurably faster than glibc's own allocator.  Both are marginally faster than DeletedBufferChain, though, and substantially faster in the multi-threaded case, so USE_DELETED_CHAIN is disabled there in all cases.
rdb 3 years ago
parent
commit
07545bc9e3

+ 42 - 5
dtool/Config.cmake

@@ -288,6 +288,26 @@ mark_as_advanced(SIMULATE_NETWORK_DELAY DO_MEMORY_USAGE DO_DCAST)
 # The following options have to do with the memory allocation system.
 # The following options have to do with the memory allocation system.
 #
 #
 
 
+find_package(MIMALLOC 1.0 QUIET)
+
+package_option(MIMALLOC
+  "The mimalloc allocator.  See also USE_MEMORY_MIMALLOC, which
+you will need to use to activate it by default.  If you do not set
+USE_MEMORY_MIMALLOC, Panda will decide whether to use it."
+  IMPORTED_AS mimalloc-static)
+
+if (WIN32 AND HAVE_MIMALLOC)
+  set(_prefer_mimalloc ON)
+else()
+  set(_prefer_mimalloc OFF)
+endif()
+
+option(USE_MEMORY_MIMALLOC
+  "This is an optional memory allocator with good multi-threading
+support.  It is recommended on Windows, where it gives much better
+performance than the built-in malloc.  However, it does not appear
+to be significantly faster on glibc-based systems." ${_prefer_mimalloc})
+
 option(USE_MEMORY_DLMALLOC
 option(USE_MEMORY_DLMALLOC
   "This is an optional alternative memory-allocation scheme
   "This is an optional alternative memory-allocation scheme
 available within Panda.  You can experiment with it to see
 available within Panda.  You can experiment with it to see
@@ -307,16 +327,33 @@ if 16-byte alignment must be performed on top of it, wasting up to
 is required and not provided by the system malloc library, then an
 is required and not provided by the system malloc library, then an
 alternative malloc system (above) will be used instead." OFF)
 alternative malloc system (above) will be used instead." OFF)
 
 
-option(USE_DELETED_CHAIN
-  "Define this true to use the DELETED_CHAIN macros, which support
+if (WIN32 AND NOT HAVE_MIMALLOC)
+  option(USE_DELETED_CHAIN
+    "Define this true to use the DELETED_CHAIN macros, which support
 fast re-use of existing allocated blocks, minimizing the low-level
 fast re-use of existing allocated blocks, minimizing the low-level
 calls to malloc() and free() for frequently-created and -deleted
 calls to malloc() and free() for frequently-created and -deleted
-objects.  There's usually no reason to set this false, unless you
-suspect a bug in Panda's memory management code." ON)
+objects.  This is significantly better than built-in malloc on Windows
+but suffers with multiple threads, where mimalloc performs better, so
+it is preferred to get mimalloc instead and turn this OFF." ON)
+else()
+  option(USE_DELETED_CHAIN
+    "Define this true to use the DELETED_CHAIN macros, which support
+fast re-use of existing allocated blocks, minimizing the low-level
+calls to malloc() and free() for frequently-created and -deleted
+objects.  However, modern memory allocators generally perform as good,
+especially with threading, so best leave this OFF." OFF)
+endif()
 
 
 mark_as_advanced(USE_MEMORY_DLMALLOC USE_MEMORY_PTMALLOC2
 mark_as_advanced(USE_MEMORY_DLMALLOC USE_MEMORY_PTMALLOC2
-  MEMORY_HOOK_DO_ALIGN USE_DELETED_CHAIN)
+  USE_MEMORY_MIMALLOC MEMORY_HOOK_DO_ALIGN USE_DELETED_CHAIN)
+
+if(USE_MEMORY_MIMALLOC)
+  package_status(MIMALLOC "mimalloc memory allocator")
+else()
+  package_status(MIMALLOC "mimalloc memory allocator (not used)")
+endif()
 
 
+unset(_prefer_mimalloc)
 
 
 #
 #
 # This section relates to mobile-device/phone support and options
 # This section relates to mobile-device/phone support and options

+ 5 - 1
dtool/dtool_config.h.in

@@ -130,8 +130,12 @@
 /* Define if we want to support fixed-function OpenGL rendering. */
 /* Define if we want to support fixed-function OpenGL rendering. */
 #cmakedefine SUPPORT_FIXED_FUNCTION
 #cmakedefine SUPPORT_FIXED_FUNCTION
 
 
-/* Define for either of the alternative malloc schemes. */
+/* Define if we have mimalloc available. */
+#cmakedefine HAVE_MIMALLOC
+
+/* Define for one of the alternative malloc schemes. */
 #cmakedefine USE_MEMORY_DLMALLOC
 #cmakedefine USE_MEMORY_DLMALLOC
+#cmakedefine USE_MEMORY_MIMALLOC
 #cmakedefine USE_MEMORY_PTMALLOC2
 #cmakedefine USE_MEMORY_PTMALLOC2
 
 
 /* Define if we want to compile in support for pipelining.  */
 /* Define if we want to compile in support for pipelining.  */

+ 1 - 1
dtool/src/dtoolbase/CMakeLists.txt

@@ -92,7 +92,7 @@ add_component_library(p3dtoolbase NOINIT SYMBOL BUILDING_DTOOL_DTOOLBASE
 target_include_directories(p3dtoolbase PUBLIC
 target_include_directories(p3dtoolbase PUBLIC
   $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
   $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
   $<BUILD_INTERFACE:${PANDA_OUTPUT_DIR}/include>)
   $<BUILD_INTERFACE:${PANDA_OUTPUT_DIR}/include>)
-target_link_libraries(p3dtoolbase PKG::EIGEN PKG::THREADS)
+target_link_libraries(p3dtoolbase PKG::EIGEN PKG::THREADS PKG::MIMALLOC)
 target_interrogate(p3dtoolbase ${P3DTOOLBASE_SOURCES} EXTENSIONS ${P3DTOOLBASE_IGATEEXT})
 target_interrogate(p3dtoolbase ${P3DTOOLBASE_SOURCES} EXTENSIONS ${P3DTOOLBASE_IGATEEXT})
 
 
 if(NOT BUILD_METALIBS)
 if(NOT BUILD_METALIBS)

+ 11 - 1
dtool/src/dtoolbase/dtoolbase.h

@@ -375,6 +375,10 @@ typedef struct _object PyObject;
 // This specialized malloc implementation can perform the required alignment.
 // This specialized malloc implementation can perform the required alignment.
 #undef MEMORY_HOOK_DO_ALIGN
 #undef MEMORY_HOOK_DO_ALIGN
 
 
+#elif defined(USE_MEMORY_MIMALLOC)
+// This one does, too.
+#undef MEMORY_HOOK_DO_ALIGN
+
 #elif defined(USE_MEMORY_PTMALLOC2)
 #elif defined(USE_MEMORY_PTMALLOC2)
 // But not this one.  For some reason it crashes when we try to build it with
 // But not this one.  For some reason it crashes when we try to build it with
 // alignment 16.  So if we're using ptmalloc2, we need to enforce alignment
 // alignment 16.  So if we're using ptmalloc2, we need to enforce alignment
@@ -385,6 +389,12 @@ typedef struct _object PyObject;
 // The OS-provided malloc implementation will do the required alignment.
 // The OS-provided malloc implementation will do the required alignment.
 #undef MEMORY_HOOK_DO_ALIGN
 #undef MEMORY_HOOK_DO_ALIGN
 
 
+#elif defined(HAVE_MIMALLOC) && defined(_WIN32)
+// Prefer mimalloc on Windows, if we have it.  It is significantly faster than
+// standard malloc, supports multi-threading well and does the alignment too.
+#undef MEMORY_HOOK_DO_ALIGN
+#define USE_MEMORY_MIMALLOC 1
+
 #elif defined(MEMORY_HOOK_DO_ALIGN)
 #elif defined(MEMORY_HOOK_DO_ALIGN)
 // We need memory alignment, and we're willing to provide it ourselves.
 // We need memory alignment, and we're willing to provide it ourselves.
 
 
@@ -426,7 +436,7 @@ typedef struct _object PyObject;
 #endif
 #endif
 
 
 /* Determine our memory-allocation requirements. */
 /* Determine our memory-allocation requirements. */
-#if defined(USE_MEMORY_PTMALLOC2) || defined(USE_MEMORY_DLMALLOC) || defined(DO_MEMORY_USAGE) || defined(MEMORY_HOOK_DO_ALIGN)
+#if defined(USE_MEMORY_MIMALLOC) || defined(USE_MEMORY_PTMALLOC2) || defined(USE_MEMORY_DLMALLOC) || defined(DO_MEMORY_USAGE) || defined(MEMORY_HOOK_DO_ALIGN)
 /* In this case we have some custom memory management requirements. */
 /* In this case we have some custom memory management requirements. */
 #else
 #else
 /* Otherwise, if we have no custom memory management needs at all, we
 /* Otherwise, if we have no custom memory management needs at all, we

+ 12 - 0
dtool/src/dtoolbase/memoryHook.cxx

@@ -51,6 +51,18 @@ static_assert((MEMORY_HOOK_ALIGNMENT & (MEMORY_HOOK_ALIGNMENT - 1)) == 0,
 
 
 #if defined(CPPPARSER)
 #if defined(CPPPARSER)
 
 
+#elif defined(USE_MEMORY_MIMALLOC)
+
+// mimalloc is a modern memory manager by Microsoft that is very fast as well
+// as thread-safe.
+
+#include "mimalloc.h"
+
+#define call_malloc mi_malloc
+#define call_realloc mi_realloc
+#define call_free mi_free
+#undef MEMORY_HOOK_MALLOC_LOCK
+
 #elif defined(USE_MEMORY_DLMALLOC)
 #elif defined(USE_MEMORY_DLMALLOC)
 
 
 // Memory manager: DLMALLOC This is Doug Lea's memory manager.  It is very
 // Memory manager: DLMALLOC This is Doug Lea's memory manager.  It is very

+ 21 - 2
makepanda/makepanda.py

@@ -103,6 +103,7 @@ PkgListSet(["PYTHON", "DIRECT",                        # Python support
   "PANDAPARTICLESYSTEM",                               # Built in particle system
   "PANDAPARTICLESYSTEM",                               # Built in particle system
   "CONTRIB",                                           # Experimental
   "CONTRIB",                                           # Experimental
   "SSE2", "NEON",                                      # Compiler features
   "SSE2", "NEON",                                      # Compiler features
+  "MIMALLOC",                                          # Memory allocators
 ])
 ])
 
 
 CheckPandaSourceTree()
 CheckPandaSourceTree()
@@ -633,6 +634,7 @@ if (COMPILER == "MSVC"):
     if (PkgSkip("DIRECTCAM")==0): LibName("DIRECTCAM", "quartz.lib")
     if (PkgSkip("DIRECTCAM")==0): LibName("DIRECTCAM", "quartz.lib")
     if (PkgSkip("DIRECTCAM")==0): LibName("DIRECTCAM", "odbc32.lib")
     if (PkgSkip("DIRECTCAM")==0): LibName("DIRECTCAM", "odbc32.lib")
     if (PkgSkip("DIRECTCAM")==0): LibName("DIRECTCAM", "odbccp32.lib")
     if (PkgSkip("DIRECTCAM")==0): LibName("DIRECTCAM", "odbccp32.lib")
+    if (PkgSkip("MIMALLOC")==0): LibName("MIMALLOC", GetThirdpartyDir() + "mimalloc/lib/mimalloc-static.lib")
     if (PkgSkip("OPENSSL")==0):
     if (PkgSkip("OPENSSL")==0):
         if os.path.isfile(GetThirdpartyDir() + "openssl/lib/libpandassl.lib"):
         if os.path.isfile(GetThirdpartyDir() + "openssl/lib/libpandassl.lib"):
             LibName("OPENSSL", GetThirdpartyDir() + "openssl/lib/libpandassl.lib")
             LibName("OPENSSL", GetThirdpartyDir() + "openssl/lib/libpandassl.lib")
@@ -778,6 +780,8 @@ if (COMPILER == "MSVC"):
         LibName("BULLET", GetThirdpartyDir() + "bullet/lib/BulletSoftBody" + suffix)
         LibName("BULLET", GetThirdpartyDir() + "bullet/lib/BulletSoftBody" + suffix)
 
 
 if (COMPILER=="GCC"):
 if (COMPILER=="GCC"):
+    PkgDisable("MIMALLOC") # no discernable benefit over glibc
+
     if GetTarget() != "darwin":
     if GetTarget() != "darwin":
         PkgDisable("COCOA")
         PkgDisable("COCOA")
 
 
@@ -2288,6 +2292,7 @@ DTOOL_CONFIG=[
     ("REPORT_OPENSSL_ERRORS",          '1',                      '1'),
     ("REPORT_OPENSSL_ERRORS",          '1',                      '1'),
     ("USE_PANDAFILESTREAM",            '1',                      '1'),
     ("USE_PANDAFILESTREAM",            '1',                      '1'),
     ("USE_DELETED_CHAIN",              '1',                      '1'),
     ("USE_DELETED_CHAIN",              '1',                      '1'),
+    ("HAVE_MIMALLOC",                  'UNDEF',                  'UNDEF'),
     ("HAVE_WGL",                       '1',                      'UNDEF'),
     ("HAVE_WGL",                       '1',                      'UNDEF'),
     ("HAVE_DX9",                       'UNDEF',                  'UNDEF'),
     ("HAVE_DX9",                       'UNDEF',                  'UNDEF'),
     ("HAVE_THREADS",                   '1',                      '1'),
     ("HAVE_THREADS",                   '1',                      '1'),
@@ -2433,6 +2438,20 @@ def WriteConfigSettings():
 
 
     dtool_config["HAVE_NET"] = '1'
     dtool_config["HAVE_NET"] = '1'
 
 
+    if GetTarget() == 'windows':
+        if not PkgSkip("MIMALLOC"):
+            # This is faster than both DeletedBufferChain and malloc,
+            # especially in the multi-threaded case.
+            dtool_config["USE_MEMORY_MIMALLOC"] = '1'
+            dtool_config["USE_DELETED_CHAIN"] = 'UNDEF'
+        else:
+            # If we don't have mimalloc, use DeletedBufferChain as fallback,
+            # which is still more efficient than malloc.
+            dtool_config["USE_DELETED_CHAIN"] = '1'
+    else:
+        # On other systems, the default malloc seems to be fine.
+        dtool_config["USE_DELETED_CHAIN"] = 'UNDEF'
+
     if (PkgSkip("NVIDIACG")==0):
     if (PkgSkip("NVIDIACG")==0):
         dtool_config["HAVE_CG"] = '1'
         dtool_config["HAVE_CG"] = '1'
         dtool_config["HAVE_CGGL"] = '1'
         dtool_config["HAVE_CGGL"] = '1'
@@ -3340,7 +3359,7 @@ if GetTarget() == 'windows':
 # DIRECTORY: dtool/src/dtoolbase/
 # DIRECTORY: dtool/src/dtoolbase/
 #
 #
 
 
-OPTS=['DIR:dtool/src/dtoolbase', 'BUILDING:DTOOL']
+OPTS=['DIR:dtool/src/dtoolbase', 'BUILDING:DTOOL', 'MIMALLOC']
 TargetAdd('p3dtoolbase_composite1.obj', opts=OPTS, input='p3dtoolbase_composite1.cxx')
 TargetAdd('p3dtoolbase_composite1.obj', opts=OPTS, input='p3dtoolbase_composite1.cxx')
 TargetAdd('p3dtoolbase_composite2.obj', opts=OPTS, input='p3dtoolbase_composite2.cxx')
 TargetAdd('p3dtoolbase_composite2.obj', opts=OPTS, input='p3dtoolbase_composite2.cxx')
 TargetAdd('p3dtoolbase_lookup3.obj',    opts=OPTS, input='lookup3.c')
 TargetAdd('p3dtoolbase_lookup3.obj',    opts=OPTS, input='lookup3.c')
@@ -3371,7 +3390,7 @@ TargetAdd('libp3dtool.dll', input='p3dtoolbase_composite1.obj')
 TargetAdd('libp3dtool.dll', input='p3dtoolbase_composite2.obj')
 TargetAdd('libp3dtool.dll', input='p3dtoolbase_composite2.obj')
 TargetAdd('libp3dtool.dll', input='p3dtoolbase_indent.obj')
 TargetAdd('libp3dtool.dll', input='p3dtoolbase_indent.obj')
 TargetAdd('libp3dtool.dll', input='p3dtoolbase_lookup3.obj')
 TargetAdd('libp3dtool.dll', input='p3dtoolbase_lookup3.obj')
-TargetAdd('libp3dtool.dll', opts=['ADVAPI','WINSHELL','WINKERNEL'])
+TargetAdd('libp3dtool.dll', opts=['ADVAPI','WINSHELL','WINKERNEL','MIMALLOC'])
 
 
 #
 #
 # DIRECTORY: dtool/src/cppparser/
 # DIRECTORY: dtool/src/cppparser/