
Added TCMalloc and JEMalloc projects

Brian Fiete 3 years ago
parent
commit
652142e189
100 changed files with 23,853 additions and 0 deletions
  1. BeefRT/JEMalloc/.appveyor.yml (+41, -0)
  2. BeefRT/JEMalloc/.autom4te.cfg (+3, -0)
  3. BeefRT/JEMalloc/COPYING (+27, -0)
  4. BeefRT/JEMalloc/INSTALL.md (+424, -0)
  5. BeefRT/JEMalloc/Makefile.in (+762, -0)
  6. BeefRT/JEMalloc/README (+20, -0)
  7. BeefRT/JEMalloc/VERSION (+1, -0)
  8. BeefRT/JEMalloc/autogen.sh (+17, -0)
  9. BeefRT/JEMalloc/bin/jemalloc-config.in (+83, -0)
  10. BeefRT/JEMalloc/bin/jemalloc.sh.in (+9, -0)
  11. BeefRT/JEMalloc/bin/jeprof.in (+5723, -0)
  12. BeefRT/JEMalloc/build-aux/install-sh (+250, -0)
  13. BeefRT/JEMalloc/configure.ac (+2669, -0)
  14. BeefRT/JEMalloc/include/jemalloc/internal/activity_callback.h (+23, -0)
  15. BeefRT/JEMalloc/include/jemalloc/internal/arena_externs.h (+121, -0)
  16. BeefRT/JEMalloc/include/jemalloc/internal/arena_inlines_a.h (+24, -0)
  17. BeefRT/JEMalloc/include/jemalloc/internal/arena_inlines_b.h (+550, -0)
  18. BeefRT/JEMalloc/include/jemalloc/internal/arena_stats.h (+114, -0)
  19. BeefRT/JEMalloc/include/jemalloc/internal/arena_structs.h (+101, -0)
  20. BeefRT/JEMalloc/include/jemalloc/internal/arena_types.h (+58, -0)
  21. BeefRT/JEMalloc/include/jemalloc/internal/assert.h (+56, -0)
  22. BeefRT/JEMalloc/include/jemalloc/internal/atomic.h (+107, -0)
  23. BeefRT/JEMalloc/include/jemalloc/internal/atomic_c11.h (+97, -0)
  24. BeefRT/JEMalloc/include/jemalloc/internal/atomic_gcc_atomic.h (+129, -0)
  25. BeefRT/JEMalloc/include/jemalloc/internal/atomic_gcc_sync.h (+195, -0)
  26. BeefRT/JEMalloc/include/jemalloc/internal/atomic_msvc.h (+158, -0)
  27. BeefRT/JEMalloc/include/jemalloc/internal/background_thread_externs.h (+33, -0)
  28. BeefRT/JEMalloc/include/jemalloc/internal/background_thread_inlines.h (+48, -0)
  29. BeefRT/JEMalloc/include/jemalloc/internal/background_thread_structs.h (+66, -0)
  30. BeefRT/JEMalloc/include/jemalloc/internal/base.h (+110, -0)
  31. BeefRT/JEMalloc/include/jemalloc/internal/bin.h (+82, -0)
  32. BeefRT/JEMalloc/include/jemalloc/internal/bin_info.h (+50, -0)
  33. BeefRT/JEMalloc/include/jemalloc/internal/bin_stats.h (+57, -0)
  34. BeefRT/JEMalloc/include/jemalloc/internal/bin_types.h (+17, -0)
  35. BeefRT/JEMalloc/include/jemalloc/internal/bit_util.h (+422, -0)
  36. BeefRT/JEMalloc/include/jemalloc/internal/bitmap.h (+368, -0)
  37. BeefRT/JEMalloc/include/jemalloc/internal/buf_writer.h (+32, -0)
  38. BeefRT/JEMalloc/include/jemalloc/internal/cache_bin.h (+670, -0)
  39. BeefRT/JEMalloc/include/jemalloc/internal/ckh.h (+101, -0)
  40. BeefRT/JEMalloc/include/jemalloc/internal/counter.h (+34, -0)
  41. BeefRT/JEMalloc/include/jemalloc/internal/ctl.h (+159, -0)
  42. BeefRT/JEMalloc/include/jemalloc/internal/decay.h (+186, -0)
  43. BeefRT/JEMalloc/include/jemalloc/internal/div.h (+41, -0)
  44. BeefRT/JEMalloc/include/jemalloc/internal/ecache.h (+55, -0)
  45. BeefRT/JEMalloc/include/jemalloc/internal/edata.h (+698, -0)
  46. BeefRT/JEMalloc/include/jemalloc/internal/edata_cache.h (+49, -0)
  47. BeefRT/JEMalloc/include/jemalloc/internal/ehooks.h (+412, -0)
  48. BeefRT/JEMalloc/include/jemalloc/internal/emap.h (+357, -0)
  49. BeefRT/JEMalloc/include/jemalloc/internal/emitter.h (+510, -0)
  50. BeefRT/JEMalloc/include/jemalloc/internal/eset.h (+77, -0)
  51. BeefRT/JEMalloc/include/jemalloc/internal/exp_grow.h (+50, -0)
  52. BeefRT/JEMalloc/include/jemalloc/internal/extent.h (+137, -0)
  53. BeefRT/JEMalloc/include/jemalloc/internal/extent_dss.h (+26, -0)
  54. BeefRT/JEMalloc/include/jemalloc/internal/extent_mmap.h (+10, -0)
  55. BeefRT/JEMalloc/include/jemalloc/internal/fb.h (+373, -0)
  56. BeefRT/JEMalloc/include/jemalloc/internal/fxp.h (+126, -0)
  57. BeefRT/JEMalloc/include/jemalloc/internal/hash.h (+320, -0)
  58. BeefRT/JEMalloc/include/jemalloc/internal/hook.h (+163, -0)
  59. BeefRT/JEMalloc/include/jemalloc/internal/hpa.h (+182, -0)
  60. BeefRT/JEMalloc/include/jemalloc/internal/hpa_hooks.h (+17, -0)
  61. BeefRT/JEMalloc/include/jemalloc/internal/hpa_opts.h (+74, -0)
  62. BeefRT/JEMalloc/include/jemalloc/internal/hpdata.h (+413, -0)
  63. BeefRT/JEMalloc/include/jemalloc/internal/inspect.h (+40, -0)
  64. BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_decls.h (+108, -0)
  65. BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_defs.h (+428, -0)
  66. BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_defs.h.in (+427, -0)
  67. BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_externs.h (+75, -0)
  68. BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_includes.h (+84, -0)
  69. BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_a.h (+122, -0)
  70. BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_b.h (+103, -0)
  71. BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h (+340, -0)
  72. BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_macros.h (+111, -0)
  73. BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_types.h (+130, -0)
  74. BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_preamble.h (+263, -0)
  75. BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_preamble.h.in (+263, -0)
  76. BeefRT/JEMalloc/include/jemalloc/internal/large_externs.h (+24, -0)
  77. BeefRT/JEMalloc/include/jemalloc/internal/lockedint.h (+204, -0)
  78. BeefRT/JEMalloc/include/jemalloc/internal/log.h (+115, -0)
  79. BeefRT/JEMalloc/include/jemalloc/internal/malloc_io.h (+105, -0)
  80. BeefRT/JEMalloc/include/jemalloc/internal/mpsc_queue.h (+134, -0)
  81. BeefRT/JEMalloc/include/jemalloc/internal/mutex.h (+319, -0)
  82. BeefRT/JEMalloc/include/jemalloc/internal/mutex_prof.h (+117, -0)
  83. BeefRT/JEMalloc/include/jemalloc/internal/nstime.h (+73, -0)
  84. BeefRT/JEMalloc/include/jemalloc/internal/pa.h (+243, -0)
  85. BeefRT/JEMalloc/include/jemalloc/internal/pac.h (+179, -0)
  86. BeefRT/JEMalloc/include/jemalloc/internal/pages.h (+119, -0)
  87. BeefRT/JEMalloc/include/jemalloc/internal/pai.h (+95, -0)
  88. BeefRT/JEMalloc/include/jemalloc/internal/peak.h (+37, -0)
  89. BeefRT/JEMalloc/include/jemalloc/internal/peak_event.h (+24, -0)
  90. BeefRT/JEMalloc/include/jemalloc/internal/ph.h (+520, -0)
  91. BeefRT/JEMalloc/include/jemalloc/internal/private_namespace.sh (+5, -0)
  92. BeefRT/JEMalloc/include/jemalloc/internal/private_symbols.awk (+52, -0)
  93. BeefRT/JEMalloc/include/jemalloc/internal/private_symbols.sh (+51, -0)
  94. BeefRT/JEMalloc/include/jemalloc/internal/private_symbols_jet.awk (+52, -0)
  95. BeefRT/JEMalloc/include/jemalloc/internal/prng.h (+168, -0)
  96. BeefRT/JEMalloc/include/jemalloc/internal/prof_data.h (+37, -0)
  97. BeefRT/JEMalloc/include/jemalloc/internal/prof_externs.h (+95, -0)
  98. BeefRT/JEMalloc/include/jemalloc/internal/prof_hook.h (+21, -0)
  99. BeefRT/JEMalloc/include/jemalloc/internal/prof_inlines.h (+261, -0)
  100. BeefRT/JEMalloc/include/jemalloc/internal/prof_log.h (+22, -0)

+ 41 - 0
BeefRT/JEMalloc/.appveyor.yml

@@ -0,0 +1,41 @@
+version: '{build}'
+
+environment:
+  matrix:
+  - MSYSTEM: MINGW64
+    CPU: x86_64
+    MSVC: amd64
+    CONFIG_FLAGS: --enable-debug
+  - MSYSTEM: MINGW64
+    CPU: x86_64
+    CONFIG_FLAGS: --enable-debug
+  - MSYSTEM: MINGW32
+    CPU: i686
+    MSVC: x86
+    CONFIG_FLAGS: --enable-debug
+  - MSYSTEM: MINGW32
+    CPU: i686
+    CONFIG_FLAGS: --enable-debug
+  - MSYSTEM: MINGW64
+    CPU: x86_64
+    MSVC: amd64
+  - MSYSTEM: MINGW64
+    CPU: x86_64
+  - MSYSTEM: MINGW32
+    CPU: i686
+    MSVC: x86
+  - MSYSTEM: MINGW32
+    CPU: i686
+
+install:
+  - set PATH=c:\msys64\%MSYSTEM%\bin;c:\msys64\usr\bin;%PATH%
+  - if defined MSVC call "c:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" %MSVC%
+  - if defined MSVC pacman --noconfirm -Rsc mingw-w64-%CPU%-gcc gcc
+
+build_script:
+  - bash -c "autoconf"
+  - bash -c "./configure $CONFIG_FLAGS"
+  - mingw32-make
+  - file lib/jemalloc.dll
+  - mingw32-make tests
+  - mingw32-make -k check
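
The .appveyor.yml above drives an MSYS2/MinGW build matrix. For reference, a minimal sketch of reproducing one debug matrix entry locally, assuming an MSYS2 MinGW 64-bit shell with autoconf and gcc already installed (the shell setup itself is not part of this commit):

    # Roughly what build_script above does for a MINGW64 --enable-debug entry.
    cd BeefRT/JEMalloc
    autoconf                     # regenerate ./configure from configure.ac
    ./configure --enable-debug   # the CONFIG_FLAGS used by the debug matrix rows
    mingw32-make                 # produces lib/jemalloc.dll
    mingw32-make tests           # build the test binaries
    mingw32-make -k check        # run the checks, continuing past failures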

+ 3 - 0
BeefRT/JEMalloc/.autom4te.cfg

@@ -0,0 +1,3 @@
+begin-language: "Autoconf-without-aclocal-m4"
+args: --no-cache
+end-language: "Autoconf-without-aclocal-m4"

+ 27 - 0
BeefRT/JEMalloc/COPYING

@@ -0,0 +1,27 @@
+Unless otherwise specified, files in the jemalloc source distribution are
+subject to the following license:
+--------------------------------------------------------------------------------
+Copyright (C) 2002-present Jason Evans <[email protected]>.
+All rights reserved.
+Copyright (C) 2007-2012 Mozilla Foundation.  All rights reserved.
+Copyright (C) 2009-present Facebook, Inc.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice(s),
+   this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice(s),
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS
+OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
+EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+--------------------------------------------------------------------------------

+ 424 - 0
BeefRT/JEMalloc/INSTALL.md

@@ -0,0 +1,424 @@
+Building and installing a packaged release of jemalloc can be as simple as
+typing the following while in the root directory of the source tree:
+
+    ./configure
+    make
+    make install
+
+If building from unpackaged developer sources, the simplest command sequence
+that might work is:
+
+    ./autogen.sh
+    make
+    make install
+
+You can uninstall the installed build artifacts like this:
+
+    make uninstall
+
+Notes:
+ - "autoconf" needs to be installed
+ - Documentation is built by the default target only when xsltproc is
+available.  Build will warn but not stop if the dependency is missing.
+
+
+## Advanced configuration
+
+The 'configure' script supports numerous options that allow control of which
+functionality is enabled, where jemalloc is installed, etc.  Optionally, pass
+any of the following arguments (not a definitive list) to 'configure':
+
+* `--help`
+
+    Print a definitive list of options.
+
+* `--prefix=<install-root-dir>`
+
+    Set the base directory in which to install.  For example:
+
+        ./configure --prefix=/usr/local
+
+    will cause files to be installed into /usr/local/include, /usr/local/lib,
+    and /usr/local/man.
+
+* `--with-version=(<major>.<minor>.<bugfix>-<nrev>-g<gid>|VERSION)`
+
+    The VERSION file is mandatory for successful configuration, and the
+    following steps are taken to assure its presence:
+    1) If --with-version=<major>.<minor>.<bugfix>-<nrev>-g<gid> is specified,
+       generate VERSION using the specified value.
+    2) If --with-version is not specified in either form and the source
+       directory is inside a git repository, try to generate VERSION via 'git
+       describe' invocations that pattern-match release tags.
+    3) If VERSION is missing, generate it with a bogus version:
+       0.0.0-0-g0000000000000000000000000000000000000000
+
+    Note that --with-version=VERSION bypasses (1) and (2), which simplifies
+    VERSION configuration when embedding a jemalloc release into another
+    project's git repository.
+
+* `--with-rpath=<colon-separated-rpath>`
+
+    Embed one or more library paths, so that libjemalloc can find the libraries
+    it is linked to.  This works only on ELF-based systems.
+
+* `--with-mangling=<map>`
+
+    Mangle public symbols specified in <map> which is a comma-separated list of
+    name:mangled pairs.
+
+    For example, to use ld's --wrap option as an alternative method for
+    overriding libc's malloc implementation, specify something like:
+
+      --with-mangling=malloc:__wrap_malloc,free:__wrap_free[...]
+
+    Note that mangling happens prior to application of the prefix specified by
+    --with-jemalloc-prefix, and mangled symbols are then ignored when applying
+    the prefix.
+
+* `--with-jemalloc-prefix=<prefix>`
+
+    Prefix all public APIs with <prefix>.  For example, if <prefix> is
+    "prefix_", API changes like the following occur:
+
+      malloc()         --> prefix_malloc()
+      malloc_conf      --> prefix_malloc_conf
+      /etc/malloc.conf --> /etc/prefix_malloc.conf
+      MALLOC_CONF      --> PREFIX_MALLOC_CONF
+
+    This makes it possible to use jemalloc at the same time as the system
+    allocator, or even to use multiple copies of jemalloc simultaneously.
+
+    By default, the prefix is "", except on OS X, where it is "je_".  On OS X,
+    jemalloc overlays the default malloc zone, but makes no attempt to actually
+    replace the "malloc", "calloc", etc. symbols.
+
+* `--without-export`
+
+    Don't export public APIs.  This can be useful when building jemalloc as a
+    static library, or to avoid exporting public APIs when using the zone
+    allocator on OSX.
+
+* `--with-private-namespace=<prefix>`
+
+    Prefix all library-private APIs with <prefix>je_.  For shared libraries,
+    symbol visibility mechanisms prevent these symbols from being exported, but
+    for static libraries, naming collisions are a real possibility.  By
+    default, <prefix> is empty, which results in a symbol prefix of je_ .
+
+* `--with-install-suffix=<suffix>`
+
+    Append <suffix> to the base name of all installed files, such that multiple
+    versions of jemalloc can coexist in the same installation directory.  For
+    example, libjemalloc.so.0 becomes libjemalloc<suffix>.so.0.
+
+* `--with-malloc-conf=<malloc_conf>`
+
+    Embed `<malloc_conf>` as a run-time options string that is processed prior to
+    the malloc_conf global variable, the /etc/malloc.conf symlink, and the
+    MALLOC_CONF environment variable.  For example, to change the default decay
+    time to 30 seconds:
+
+      --with-malloc-conf=decay_ms:30000
+
+* `--enable-debug`
+
+    Enable assertions and validation code.  This incurs a substantial
+    performance hit, but is very useful during application development.
+
+* `--disable-stats`
+
+    Disable statistics gathering functionality.  See the "opt.stats_print"
+    option documentation for usage details.
+
+* `--enable-prof`
+
+    Enable heap profiling and leak detection functionality.  See the "opt.prof"
+    option documentation for usage details.  When enabled, there are several
+    approaches to backtracing, and the configure script chooses the first one
+    in the following list that appears to function correctly:
+
+    + libunwind      (requires --enable-prof-libunwind)
+    + libgcc         (unless --disable-prof-libgcc)
+    + gcc intrinsics (unless --disable-prof-gcc)
+
+* `--enable-prof-libunwind`
+
+    Use the libunwind library (http://www.nongnu.org/libunwind/) for stack
+    backtracing.
+
+* `--disable-prof-libgcc`
+
+    Disable the use of libgcc's backtracing functionality.
+
+* `--disable-prof-gcc`
+
+    Disable the use of gcc intrinsics for backtracing.
+
+* `--with-static-libunwind=<libunwind.a>`
+
+    Statically link against the specified libunwind.a rather than dynamically
+    linking with -lunwind.
+
+* `--disable-fill`
+
+    Disable support for junk/zero filling of memory.  See the "opt.junk" and
+    "opt.zero" option documentation for usage details.
+
+* `--disable-zone-allocator`
+
+    Disable zone allocator for Darwin.  This means jemalloc won't be hooked as
+    the default allocator on OSX/iOS.
+
+* `--enable-utrace`
+
+    Enable utrace(2)-based allocation tracing.  This feature is not broadly
+    portable (FreeBSD has it, but Linux and OS X do not).
+
+* `--enable-xmalloc`
+
+    Enable support for optional immediate termination due to out-of-memory
+    errors, as is commonly implemented by "xmalloc" wrapper function for malloc.
+    See the "opt.xmalloc" option documentation for usage details.
+
+* `--enable-lazy-lock`
+
+    Enable code that wraps pthread_create() to detect when an application
+    switches from single-threaded to multi-threaded mode, so that it can avoid
+    mutex locking/unlocking operations while in single-threaded mode.  In
+    practice, this feature usually has little impact on performance unless
+    thread-specific caching is disabled.
+
+* `--disable-cache-oblivious`
+
+    Disable cache-oblivious large allocation alignment by default, for large
+    allocation requests with no alignment constraints.  If this feature is
+    disabled, all large allocations are page-aligned as an implementation
+    artifact, which can severely harm CPU cache utilization.  However, the
+    cache-oblivious layout comes at the cost of one extra page per large
+    allocation, which in the most extreme case increases physical memory usage
+    for the 16 KiB size class to 20 KiB.
+
+* `--disable-syscall`
+
+    Disable use of syscall(2) rather than {open,read,write,close}(2).  This is
+    intended as a workaround for systems that place security limitations on
+    syscall(2).
+
+* `--disable-cxx`
+
+    Disable C++ integration.  This will cause new and delete operator
+    implementations to be omitted.
+
+* `--with-xslroot=<path>`
+
+    Specify where to find DocBook XSL stylesheets when building the
+    documentation.
+
+* `--with-lg-page=<lg-page>`
+
+    Specify the base 2 log of the allocator page size, which must in turn be at
+    least as large as the system page size.  By default the configure script
+    determines the host's page size and sets the allocator page size equal to
+    the system page size, so this option need not be specified unless the
+    system page size may change between configuration and execution, e.g. when
+    cross compiling.
+
+* `--with-lg-hugepage=<lg-hugepage>`
+
+    Specify the base 2 log of the system huge page size.  This option is useful
+    when cross compiling, or when overriding the default for systems that do
+    not explicitly support huge pages.
+
+* `--with-lg-quantum=<lg-quantum>`
+
+    Specify the base 2 log of the minimum allocation alignment.  jemalloc needs
+    to know the minimum alignment that meets the following C standard
+    requirement (quoted from the April 12, 2011 draft of the C11 standard):
+
+    >  The pointer returned if the allocation succeeds is suitably aligned so
+      that it may be assigned to a pointer to any type of object with a
+      fundamental alignment requirement and then used to access such an object
+      or an array of such objects in the space allocated [...]
+
+    This setting is architecture-specific, and although jemalloc includes known
+    safe values for the most commonly used modern architectures, there is a
+    wrinkle related to GNU libc (glibc) that may impact your choice of
+    <lg-quantum>.  On most modern architectures, this mandates 16-byte
+    alignment (<lg-quantum>=4), but the glibc developers chose not to meet this
+    requirement for performance reasons.  An old discussion can be found at
+    <https://sourceware.org/bugzilla/show_bug.cgi?id=206> .  Unlike glibc,
+    jemalloc does follow the C standard by default (caveat: jemalloc
+    technically cheats for size classes smaller than the quantum), but the fact
+    that Linux systems already work around this allocator noncompliance means
+    that it is generally safe in practice to let jemalloc's minimum alignment
+    follow glibc's lead.  If you specify `--with-lg-quantum=3` during
+    configuration, jemalloc will provide additional size classes that are not
+    16-byte-aligned (24, 40, and 56).
+
+* `--with-lg-vaddr=<lg-vaddr>`
+
+    Specify the number of significant virtual address bits.  By default, the
+    configure script attempts to detect virtual address size on those platforms
+    where it knows how, and picks a default otherwise.  This option may be
+    useful when cross-compiling.
+
+* `--disable-initial-exec-tls`
+
+    Disable the initial-exec TLS model for jemalloc's internal thread-local
+    storage (on those platforms that support explicit settings).  This can allow
+    jemalloc to be dynamically loaded after program startup (e.g. using dlopen).
+    Note that in this case, there will be two malloc implementations operating
+    in the same process, which will almost certainly result in confusing runtime
+    crashes if pointers leak from one implementation to the other.
+
+* `--disable-libdl`
+
+    Disable the usage of libdl, namely dlsym(3) which is required by the lazy
+    lock option.  This can allow building static binaries.
+
+The following environment variables (not a definitive list) impact configure's
+behavior:
+
+* `CFLAGS="?"`
+* `CXXFLAGS="?"`
+
+    Pass these flags to the C/C++ compiler.  Any flags set by the configure
+    script are prepended, which means explicitly set flags generally take
+    precedence.  Take care when specifying flags such as -Werror, because
+    configure tests may be affected in undesirable ways.
+
+* `EXTRA_CFLAGS="?"`
+* `EXTRA_CXXFLAGS="?"`
+
+    Append these flags to CFLAGS/CXXFLAGS, without passing them to the
+    compiler(s) during configuration.  This makes it possible to add flags such
+    as -Werror, while allowing the configure script to determine what other
+    flags are appropriate for the specified configuration.
+
+* `CPPFLAGS="?"`
+
+    Pass these flags to the C preprocessor.  Note that CFLAGS is not passed to
+    'cpp' when 'configure' is looking for include files, so you must use
+    CPPFLAGS instead if you need to help 'configure' find header files.
+
+* `LD_LIBRARY_PATH="?"`
+
+    'ld' uses this colon-separated list to find libraries.
+
+* `LDFLAGS="?"`
+
+    Pass these flags when linking.
+
+* `PATH="?"`
+
+    'configure' uses this to find programs.
+
+In some cases it may be necessary to work around configuration results that do
+not match reality.  For example, Linux 4.5 added support for the MADV_FREE flag
+to madvise(2), which can cause problems if building on a host with MADV_FREE
+support and deploying to a target without.  To work around this, use a cache
+file to override the relevant configuration variable defined in configure.ac,
+e.g.:
+
+    echo "je_cv_madv_free=no" > config.cache && ./configure -C
+
+
+## Advanced compilation
+
+To build only parts of jemalloc, use the following targets:
+
+    build_lib_shared
+    build_lib_static
+    build_lib
+    build_doc_html
+    build_doc_man
+    build_doc
+
+To install only parts of jemalloc, use the following targets:
+
+    install_bin
+    install_include
+    install_lib_shared
+    install_lib_static
+    install_lib_pc
+    install_lib
+    install_doc_html
+    install_doc_man
+    install_doc
+
+To clean up build results to varying degrees, use the following make targets:
+
+    clean
+    distclean
+    relclean
+
+
+## Advanced installation
+
+Optionally, define make variables when invoking make, including (not
+exclusively):
+
+* `INCLUDEDIR="?"`
+
+    Use this as the installation prefix for header files.
+
+* `LIBDIR="?"`
+
+    Use this as the installation prefix for libraries.
+
+* `MANDIR="?"`
+
+    Use this as the installation prefix for man pages.
+
+* `DESTDIR="?"`
+
+    Prepend DESTDIR to INCLUDEDIR, LIBDIR, DATADIR, and MANDIR.  This is useful
+    when installing to a different path than was specified via --prefix.
+
+* `CC="?"`
+
+    Use this to invoke the C compiler.
+
+* `CFLAGS="?"`
+
+    Pass these flags to the compiler.
+
+* `CPPFLAGS="?"`
+
+    Pass these flags to the C preprocessor.
+
+* `LDFLAGS="?"`
+
+    Pass these flags when linking.
+
+* `PATH="?"`
+
+    Use this to search for programs used during configuration and building.
+
+
+## Development
+
+If you intend to make non-trivial changes to jemalloc, use the 'autogen.sh'
+script rather than 'configure'.  This re-generates 'configure', enables
+configuration dependency rules, and enables re-generation of automatically
+generated source files.
+
+The build system supports using an object directory separate from the source
+tree.  For example, you can create an 'obj' directory, and from within that
+directory, issue configuration and build commands:
+
+    autoconf
+    mkdir obj
+    cd obj
+    ../configure --enable-autogen
+    make
+
+
+## Documentation
+
+The manual page is generated in both html and roff formats.  Any web browser
+can be used to view the html manual.  The roff manual page can be formatted
+prior to installation via the following command:
+
+    nroff -man -t doc/jemalloc.3
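
INSTALL.md above documents the configure switches individually. As a hedged illustration of how several of them combine, the sketch below configures a prefixed, privately namespaced build; the specific flag values are assumptions chosen for the example, not the configuration this commit actually uses:

    # Illustrative only: combining options documented above.  The install prefix,
    # symbol prefixes, and malloc_conf string are assumptions for the example.
    ./configure \
        --prefix=/opt/jemalloc \
        --with-jemalloc-prefix=je_ \
        --with-private-namespace=beef_ \
        --with-malloc-conf=decay_ms:30000
    make
    make install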

+ 762 - 0
BeefRT/JEMalloc/Makefile.in

@@ -0,0 +1,762 @@
+# Clear out all vpaths, then set just one (default vpath) for the main build
+# directory.
+vpath
+vpath % .
+
+# Clear the default suffixes, so that built-in rules are not used.
+.SUFFIXES :
+
+SHELL := /bin/sh
+
+CC := @CC@
+CXX := @CXX@
+
+# Configuration parameters.
+DESTDIR =
+BINDIR := $(DESTDIR)@BINDIR@
+INCLUDEDIR := $(DESTDIR)@INCLUDEDIR@
+LIBDIR := $(DESTDIR)@LIBDIR@
+DATADIR := $(DESTDIR)@DATADIR@
+MANDIR := $(DESTDIR)@MANDIR@
+srcroot := @srcroot@
+objroot := @objroot@
+abs_srcroot := @abs_srcroot@
+abs_objroot := @abs_objroot@
+
+# Build parameters.
+CPPFLAGS := @CPPFLAGS@ -I$(objroot)include -I$(srcroot)include
+CONFIGURE_CFLAGS := @CONFIGURE_CFLAGS@
+SPECIFIED_CFLAGS := @SPECIFIED_CFLAGS@
+EXTRA_CFLAGS := @EXTRA_CFLAGS@
+CFLAGS := $(strip $(CONFIGURE_CFLAGS) $(SPECIFIED_CFLAGS) $(EXTRA_CFLAGS))
+CONFIGURE_CXXFLAGS := @CONFIGURE_CXXFLAGS@
+SPECIFIED_CXXFLAGS := @SPECIFIED_CXXFLAGS@
+EXTRA_CXXFLAGS := @EXTRA_CXXFLAGS@
+CXXFLAGS := $(strip $(CONFIGURE_CXXFLAGS) $(SPECIFIED_CXXFLAGS) $(EXTRA_CXXFLAGS))
+LDFLAGS := @LDFLAGS@
+EXTRA_LDFLAGS := @EXTRA_LDFLAGS@
+LIBS := @LIBS@
+RPATH_EXTRA := @RPATH_EXTRA@
+SO := @so@
+IMPORTLIB := @importlib@
+O := @o@
+A := @a@
+EXE := @exe@
+LIBPREFIX := @libprefix@
+REV := @rev@
+install_suffix := @install_suffix@
+ABI := @abi@
+XSLTPROC := @XSLTPROC@
+XSLROOT := @XSLROOT@
+AUTOCONF := @AUTOCONF@
+_RPATH = @RPATH@
+RPATH = $(if $(1),$(call _RPATH,$(1)))
+cfghdrs_in := $(addprefix $(srcroot),@cfghdrs_in@)
+cfghdrs_out := @cfghdrs_out@
+cfgoutputs_in := $(addprefix $(srcroot),@cfgoutputs_in@)
+cfgoutputs_out := @cfgoutputs_out@
+enable_autogen := @enable_autogen@
+enable_doc := @enable_doc@
+enable_shared := @enable_shared@
+enable_static := @enable_static@
+enable_prof := @enable_prof@
+enable_zone_allocator := @enable_zone_allocator@
+enable_experimental_smallocx := @enable_experimental_smallocx@
+MALLOC_CONF := @JEMALLOC_CPREFIX@MALLOC_CONF
+link_whole_archive := @link_whole_archive@
+DSO_LDFLAGS = @DSO_LDFLAGS@
+SOREV = @SOREV@
+PIC_CFLAGS = @PIC_CFLAGS@
+CTARGET = @CTARGET@
+LDTARGET = @LDTARGET@
+TEST_LD_MODE = @TEST_LD_MODE@
+MKLIB = @MKLIB@
+AR = @AR@
+ARFLAGS = @ARFLAGS@
+DUMP_SYMS = @DUMP_SYMS@
+AWK := @AWK@
+CC_MM = @CC_MM@
+LM := @LM@
+INSTALL = @INSTALL@
+
+ifeq (macho, $(ABI))
+TEST_LIBRARY_PATH := DYLD_FALLBACK_LIBRARY_PATH="$(objroot)lib"
+else
+ifeq (pecoff, $(ABI))
+TEST_LIBRARY_PATH := PATH="$(PATH):$(objroot)lib"
+else
+TEST_LIBRARY_PATH :=
+endif
+endif
+
+LIBJEMALLOC := $(LIBPREFIX)jemalloc$(install_suffix)
+
+# Lists of files.
+BINS := $(objroot)bin/jemalloc-config $(objroot)bin/jemalloc.sh $(objroot)bin/jeprof
+C_HDRS := $(objroot)include/jemalloc/jemalloc$(install_suffix).h
+C_SRCS := $(srcroot)src/jemalloc.c \
+	$(srcroot)src/arena.c \
+	$(srcroot)src/background_thread.c \
+	$(srcroot)src/base.c \
+	$(srcroot)src/bin.c \
+	$(srcroot)src/bin_info.c \
+	$(srcroot)src/bitmap.c \
+	$(srcroot)src/buf_writer.c \
+	$(srcroot)src/cache_bin.c \
+	$(srcroot)src/ckh.c \
+	$(srcroot)src/counter.c \
+	$(srcroot)src/ctl.c \
+	$(srcroot)src/decay.c \
+	$(srcroot)src/div.c \
+	$(srcroot)src/ecache.c \
+	$(srcroot)src/edata.c \
+	$(srcroot)src/edata_cache.c \
+	$(srcroot)src/ehooks.c \
+	$(srcroot)src/emap.c \
+	$(srcroot)src/eset.c \
+	$(srcroot)src/exp_grow.c \
+	$(srcroot)src/extent.c \
+	$(srcroot)src/extent_dss.c \
+	$(srcroot)src/extent_mmap.c \
+	$(srcroot)src/fxp.c \
+	$(srcroot)src/san.c \
+	$(srcroot)src/san_bump.c \
+	$(srcroot)src/hook.c \
+	$(srcroot)src/hpa.c \
+	$(srcroot)src/hpa_hooks.c \
+	$(srcroot)src/hpdata.c \
+	$(srcroot)src/inspect.c \
+	$(srcroot)src/large.c \
+	$(srcroot)src/log.c \
+	$(srcroot)src/malloc_io.c \
+	$(srcroot)src/mutex.c \
+	$(srcroot)src/nstime.c \
+	$(srcroot)src/pa.c \
+	$(srcroot)src/pa_extra.c \
+	$(srcroot)src/pai.c \
+	$(srcroot)src/pac.c \
+	$(srcroot)src/pages.c \
+	$(srcroot)src/peak_event.c \
+	$(srcroot)src/prof.c \
+	$(srcroot)src/prof_data.c \
+	$(srcroot)src/prof_log.c \
+	$(srcroot)src/prof_recent.c \
+	$(srcroot)src/prof_stats.c \
+	$(srcroot)src/prof_sys.c \
+	$(srcroot)src/psset.c \
+	$(srcroot)src/rtree.c \
+	$(srcroot)src/safety_check.c \
+	$(srcroot)src/sc.c \
+	$(srcroot)src/sec.c \
+	$(srcroot)src/stats.c \
+	$(srcroot)src/sz.c \
+	$(srcroot)src/tcache.c \
+	$(srcroot)src/test_hooks.c \
+	$(srcroot)src/thread_event.c \
+	$(srcroot)src/ticker.c \
+	$(srcroot)src/tsd.c \
+	$(srcroot)src/witness.c
+ifeq ($(enable_zone_allocator), 1)
+C_SRCS += $(srcroot)src/zone.c
+endif
+ifeq ($(IMPORTLIB),$(SO))
+STATIC_LIBS := $(objroot)lib/$(LIBJEMALLOC).$(A)
+endif
+ifdef PIC_CFLAGS
+STATIC_LIBS += $(objroot)lib/$(LIBJEMALLOC)_pic.$(A)
+else
+STATIC_LIBS += $(objroot)lib/$(LIBJEMALLOC)_s.$(A)
+endif
+DSOS := $(objroot)lib/$(LIBJEMALLOC).$(SOREV)
+ifneq ($(SOREV),$(SO))
+DSOS += $(objroot)lib/$(LIBJEMALLOC).$(SO)
+endif
+ifeq (1, $(link_whole_archive))
+LJEMALLOC := -Wl,--whole-archive -L$(objroot)lib -l$(LIBJEMALLOC) -Wl,--no-whole-archive
+else
+LJEMALLOC := $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB)
+endif
+PC := $(objroot)jemalloc.pc
+DOCS_XML := $(objroot)doc/jemalloc$(install_suffix).xml
+DOCS_HTML := $(DOCS_XML:$(objroot)%.xml=$(objroot)%.html)
+DOCS_MAN3 := $(DOCS_XML:$(objroot)%.xml=$(objroot)%.3)
+DOCS := $(DOCS_HTML) $(DOCS_MAN3)
+C_TESTLIB_SRCS := $(srcroot)test/src/btalloc.c $(srcroot)test/src/btalloc_0.c \
+	$(srcroot)test/src/btalloc_1.c $(srcroot)test/src/math.c \
+	$(srcroot)test/src/mtx.c $(srcroot)test/src/sleep.c \
+	$(srcroot)test/src/SFMT.c $(srcroot)test/src/test.c \
+	$(srcroot)test/src/thd.c $(srcroot)test/src/timer.c
+ifeq (1, $(link_whole_archive))
+C_UTIL_INTEGRATION_SRCS :=
+C_UTIL_CPP_SRCS :=
+else
+C_UTIL_INTEGRATION_SRCS := $(srcroot)src/nstime.c $(srcroot)src/malloc_io.c \
+	$(srcroot)src/ticker.c
+C_UTIL_CPP_SRCS := $(srcroot)src/nstime.c $(srcroot)src/malloc_io.c
+endif
+TESTS_UNIT := \
+	$(srcroot)test/unit/a0.c \
+	$(srcroot)test/unit/arena_decay.c \
+	$(srcroot)test/unit/arena_reset.c \
+	$(srcroot)test/unit/atomic.c \
+	$(srcroot)test/unit/background_thread.c \
+	$(srcroot)test/unit/background_thread_enable.c \
+	$(srcroot)test/unit/base.c \
+	$(srcroot)test/unit/batch_alloc.c \
+	$(srcroot)test/unit/binshard.c \
+	$(srcroot)test/unit/bitmap.c \
+	$(srcroot)test/unit/bit_util.c \
+	$(srcroot)test/unit/buf_writer.c \
+	$(srcroot)test/unit/cache_bin.c \
+	$(srcroot)test/unit/ckh.c \
+	$(srcroot)test/unit/counter.c \
+	$(srcroot)test/unit/decay.c \
+	$(srcroot)test/unit/div.c \
+	$(srcroot)test/unit/double_free.c \
+	$(srcroot)test/unit/edata_cache.c \
+	$(srcroot)test/unit/emitter.c \
+	$(srcroot)test/unit/extent_quantize.c \
+	${srcroot}test/unit/fb.c \
+	$(srcroot)test/unit/fork.c \
+	${srcroot}test/unit/fxp.c \
+	${srcroot}test/unit/san.c \
+	${srcroot}test/unit/san_bump.c \
+	$(srcroot)test/unit/hash.c \
+	$(srcroot)test/unit/hook.c \
+	$(srcroot)test/unit/hpa.c \
+	$(srcroot)test/unit/hpa_background_thread.c \
+	$(srcroot)test/unit/hpdata.c \
+	$(srcroot)test/unit/huge.c \
+	$(srcroot)test/unit/inspect.c \
+	$(srcroot)test/unit/junk.c \
+	$(srcroot)test/unit/junk_alloc.c \
+	$(srcroot)test/unit/junk_free.c \
+	$(srcroot)test/unit/log.c \
+	$(srcroot)test/unit/mallctl.c \
+	$(srcroot)test/unit/malloc_conf_2.c \
+	$(srcroot)test/unit/malloc_io.c \
+	$(srcroot)test/unit/math.c \
+	$(srcroot)test/unit/mpsc_queue.c \
+	$(srcroot)test/unit/mq.c \
+	$(srcroot)test/unit/mtx.c \
+	$(srcroot)test/unit/nstime.c \
+	$(srcroot)test/unit/oversize_threshold.c \
+	$(srcroot)test/unit/pa.c \
+	$(srcroot)test/unit/pack.c \
+	$(srcroot)test/unit/pages.c \
+	$(srcroot)test/unit/peak.c \
+	$(srcroot)test/unit/ph.c \
+	$(srcroot)test/unit/prng.c \
+	$(srcroot)test/unit/prof_accum.c \
+	$(srcroot)test/unit/prof_active.c \
+	$(srcroot)test/unit/prof_gdump.c \
+	$(srcroot)test/unit/prof_hook.c \
+	$(srcroot)test/unit/prof_idump.c \
+	$(srcroot)test/unit/prof_log.c \
+	$(srcroot)test/unit/prof_mdump.c \
+	$(srcroot)test/unit/prof_recent.c \
+	$(srcroot)test/unit/prof_reset.c \
+	$(srcroot)test/unit/prof_stats.c \
+	$(srcroot)test/unit/prof_tctx.c \
+	$(srcroot)test/unit/prof_thread_name.c \
+	$(srcroot)test/unit/prof_sys_thread_name.c \
+	$(srcroot)test/unit/psset.c \
+	$(srcroot)test/unit/ql.c \
+	$(srcroot)test/unit/qr.c \
+	$(srcroot)test/unit/rb.c \
+	$(srcroot)test/unit/retained.c \
+	$(srcroot)test/unit/rtree.c \
+	$(srcroot)test/unit/safety_check.c \
+	$(srcroot)test/unit/sc.c \
+	$(srcroot)test/unit/sec.c \
+	$(srcroot)test/unit/seq.c \
+	$(srcroot)test/unit/SFMT.c \
+	$(srcroot)test/unit/size_check.c \
+	$(srcroot)test/unit/size_classes.c \
+	$(srcroot)test/unit/slab.c \
+	$(srcroot)test/unit/smoothstep.c \
+	$(srcroot)test/unit/spin.c \
+	$(srcroot)test/unit/stats.c \
+	$(srcroot)test/unit/stats_print.c \
+	$(srcroot)test/unit/sz.c \
+	$(srcroot)test/unit/tcache_max.c \
+	$(srcroot)test/unit/test_hooks.c \
+	$(srcroot)test/unit/thread_event.c \
+	$(srcroot)test/unit/ticker.c \
+	$(srcroot)test/unit/tsd.c \
+	$(srcroot)test/unit/uaf.c \
+	$(srcroot)test/unit/witness.c \
+	$(srcroot)test/unit/zero.c \
+	$(srcroot)test/unit/zero_realloc_abort.c \
+	$(srcroot)test/unit/zero_realloc_free.c \
+	$(srcroot)test/unit/zero_realloc_alloc.c \
+	$(srcroot)test/unit/zero_reallocs.c
+ifeq (@enable_prof@, 1)
+TESTS_UNIT += \
+	$(srcroot)test/unit/arena_reset_prof.c \
+	$(srcroot)test/unit/batch_alloc_prof.c
+endif
+TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \
+	$(srcroot)test/integration/allocated.c \
+	$(srcroot)test/integration/extent.c \
+	$(srcroot)test/integration/malloc.c \
+	$(srcroot)test/integration/mallocx.c \
+	$(srcroot)test/integration/MALLOCX_ARENA.c \
+	$(srcroot)test/integration/overflow.c \
+	$(srcroot)test/integration/posix_memalign.c \
+	$(srcroot)test/integration/rallocx.c \
+	$(srcroot)test/integration/sdallocx.c \
+	$(srcroot)test/integration/slab_sizes.c \
+	$(srcroot)test/integration/thread_arena.c \
+	$(srcroot)test/integration/thread_tcache_enabled.c \
+	$(srcroot)test/integration/xallocx.c
+ifeq (@enable_experimental_smallocx@, 1)
+TESTS_INTEGRATION += \
+  $(srcroot)test/integration/smallocx.c
+endif
+ifeq (@enable_cxx@, 1)
+CPP_SRCS := $(srcroot)src/jemalloc_cpp.cpp
+TESTS_INTEGRATION_CPP := $(srcroot)test/integration/cpp/basic.cpp \
+	$(srcroot)test/integration/cpp/infallible_new_true.cpp \
+	$(srcroot)test/integration/cpp/infallible_new_false.cpp
+else
+CPP_SRCS :=
+TESTS_INTEGRATION_CPP :=
+endif
+TESTS_ANALYZE := $(srcroot)test/analyze/prof_bias.c \
+	$(srcroot)test/analyze/rand.c \
+	$(srcroot)test/analyze/sizes.c
+TESTS_STRESS := $(srcroot)test/stress/batch_alloc.c \
+	$(srcroot)test/stress/fill_flush.c \
+	$(srcroot)test/stress/hookbench.c \
+	$(srcroot)test/stress/large_microbench.c \
+	$(srcroot)test/stress/mallctl.c \
+	$(srcroot)test/stress/microbench.c
+
+
+TESTS := $(TESTS_UNIT) $(TESTS_INTEGRATION) $(TESTS_INTEGRATION_CPP) \
+	$(TESTS_ANALYZE) $(TESTS_STRESS)
+
+PRIVATE_NAMESPACE_HDRS := $(objroot)include/jemalloc/internal/private_namespace.h $(objroot)include/jemalloc/internal/private_namespace_jet.h
+PRIVATE_NAMESPACE_GEN_HDRS := $(PRIVATE_NAMESPACE_HDRS:%.h=%.gen.h)
+C_SYM_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.sym.$(O))
+C_SYMS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.sym)
+C_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.$(O))
+CPP_OBJS := $(CPP_SRCS:$(srcroot)%.cpp=$(objroot)%.$(O))
+C_PIC_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.pic.$(O))
+CPP_PIC_OBJS := $(CPP_SRCS:$(srcroot)%.cpp=$(objroot)%.pic.$(O))
+C_JET_SYM_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.jet.sym.$(O))
+C_JET_SYMS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.jet.sym)
+C_JET_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.jet.$(O))
+C_TESTLIB_UNIT_OBJS := $(C_TESTLIB_SRCS:$(srcroot)%.c=$(objroot)%.unit.$(O))
+C_TESTLIB_INTEGRATION_OBJS := $(C_TESTLIB_SRCS:$(srcroot)%.c=$(objroot)%.integration.$(O))
+C_UTIL_INTEGRATION_OBJS := $(C_UTIL_INTEGRATION_SRCS:$(srcroot)%.c=$(objroot)%.integration.$(O))
+C_TESTLIB_ANALYZE_OBJS := $(C_TESTLIB_SRCS:$(srcroot)%.c=$(objroot)%.analyze.$(O))
+C_TESTLIB_STRESS_OBJS := $(C_TESTLIB_SRCS:$(srcroot)%.c=$(objroot)%.stress.$(O))
+C_TESTLIB_OBJS := $(C_TESTLIB_UNIT_OBJS) $(C_TESTLIB_INTEGRATION_OBJS) \
+	$(C_UTIL_INTEGRATION_OBJS) $(C_TESTLIB_ANALYZE_OBJS) \
+	$(C_TESTLIB_STRESS_OBJS)
+
+TESTS_UNIT_OBJS := $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%.$(O))
+TESTS_INTEGRATION_OBJS := $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%.$(O))
+TESTS_INTEGRATION_CPP_OBJS := $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%.$(O))
+TESTS_ANALYZE_OBJS := $(TESTS_ANALYZE:$(srcroot)%.c=$(objroot)%.$(O))
+TESTS_STRESS_OBJS := $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%.$(O))
+TESTS_OBJS := $(TESTS_UNIT_OBJS) $(TESTS_INTEGRATION_OBJS) $(TESTS_ANALYZE_OBJS) \
+	$(TESTS_STRESS_OBJS)
+TESTS_CPP_OBJS := $(TESTS_INTEGRATION_CPP_OBJS)
+
+.PHONY: all dist build_doc_html build_doc_man build_doc
+.PHONY: install_bin install_include install_lib
+.PHONY: install_doc_html install_doc_man install_doc install
+.PHONY: tests check clean distclean relclean
+
+.SECONDARY : $(PRIVATE_NAMESPACE_GEN_HDRS) $(TESTS_OBJS) $(TESTS_CPP_OBJS)
+
+# Default target.
+all: build_lib
+
+dist: build_doc
+
+$(objroot)doc/%$(install_suffix).html : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/html.xsl
+ifneq ($(XSLROOT),)
+	$(XSLTPROC) -o $@ $(objroot)doc/html.xsl $<
+else
+ifeq ($(wildcard $(DOCS_HTML)),)
+	@echo "<p>Missing xsltproc.  Doc not built.</p>" > $@
+endif
+	@echo "Missing xsltproc.  "$@" not (re)built."
+endif
+
+$(objroot)doc/%$(install_suffix).3 : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/manpages.xsl
+ifneq ($(XSLROOT),)
+	$(XSLTPROC) -o $@ $(objroot)doc/manpages.xsl $<
+# The -o option (output filename) of xsltproc may not work (it uses the
+# <refname> in the .xml file).  Manually add the suffix if so.
+  ifneq ($(install_suffix),)
+	@if [ -f $(objroot)doc/jemalloc.3 ]; then \
+		mv $(objroot)doc/jemalloc.3 $(objroot)doc/jemalloc$(install_suffix).3 ; \
+	fi
+  endif
+else
+ifeq ($(wildcard $(DOCS_MAN3)),)
+	@echo "Missing xsltproc.  Doc not built." > $@
+endif
+	@echo "Missing xsltproc.  "$@" not (re)built."
+endif
+
+build_doc_html: $(DOCS_HTML)
+build_doc_man: $(DOCS_MAN3)
+build_doc: $(DOCS)
+
+#
+# Include generated dependency files.
+#
+ifdef CC_MM
+-include $(C_SYM_OBJS:%.$(O)=%.d)
+-include $(C_OBJS:%.$(O)=%.d)
+-include $(CPP_OBJS:%.$(O)=%.d)
+-include $(C_PIC_OBJS:%.$(O)=%.d)
+-include $(CPP_PIC_OBJS:%.$(O)=%.d)
+-include $(C_JET_SYM_OBJS:%.$(O)=%.d)
+-include $(C_JET_OBJS:%.$(O)=%.d)
+-include $(C_TESTLIB_OBJS:%.$(O)=%.d)
+-include $(TESTS_OBJS:%.$(O)=%.d)
+-include $(TESTS_CPP_OBJS:%.$(O)=%.d)
+endif
+
+$(C_SYM_OBJS): $(objroot)src/%.sym.$(O): $(srcroot)src/%.c
+$(C_SYM_OBJS): CPPFLAGS += -DJEMALLOC_NO_PRIVATE_NAMESPACE
+$(C_SYMS): $(objroot)src/%.sym: $(objroot)src/%.sym.$(O)
+$(C_OBJS): $(objroot)src/%.$(O): $(srcroot)src/%.c
+$(CPP_OBJS): $(objroot)src/%.$(O): $(srcroot)src/%.cpp
+$(C_PIC_OBJS): $(objroot)src/%.pic.$(O): $(srcroot)src/%.c
+$(C_PIC_OBJS): CFLAGS += $(PIC_CFLAGS)
+$(CPP_PIC_OBJS): $(objroot)src/%.pic.$(O): $(srcroot)src/%.cpp
+$(CPP_PIC_OBJS): CXXFLAGS += $(PIC_CFLAGS)
+$(C_JET_SYM_OBJS): $(objroot)src/%.jet.sym.$(O): $(srcroot)src/%.c
+$(C_JET_SYM_OBJS): CPPFLAGS += -DJEMALLOC_JET -DJEMALLOC_NO_PRIVATE_NAMESPACE
+$(C_JET_SYMS): $(objroot)src/%.jet.sym: $(objroot)src/%.jet.sym.$(O)
+$(C_JET_OBJS): $(objroot)src/%.jet.$(O): $(srcroot)src/%.c
+$(C_JET_OBJS): CPPFLAGS += -DJEMALLOC_JET
+$(C_TESTLIB_UNIT_OBJS): $(objroot)test/src/%.unit.$(O): $(srcroot)test/src/%.c
+$(C_TESTLIB_UNIT_OBJS): CPPFLAGS += -DJEMALLOC_UNIT_TEST
+$(C_TESTLIB_INTEGRATION_OBJS): $(objroot)test/src/%.integration.$(O): $(srcroot)test/src/%.c
+$(C_TESTLIB_INTEGRATION_OBJS): CPPFLAGS += -DJEMALLOC_INTEGRATION_TEST
+$(C_UTIL_INTEGRATION_OBJS): $(objroot)src/%.integration.$(O): $(srcroot)src/%.c
+$(C_TESTLIB_ANALYZE_OBJS): $(objroot)test/src/%.analyze.$(O): $(srcroot)test/src/%.c
+$(C_TESTLIB_ANALYZE_OBJS): CPPFLAGS += -DJEMALLOC_ANALYZE_TEST
+$(C_TESTLIB_STRESS_OBJS): $(objroot)test/src/%.stress.$(O): $(srcroot)test/src/%.c
+$(C_TESTLIB_STRESS_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_TEST -DJEMALLOC_STRESS_TESTLIB
+$(C_TESTLIB_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include
+$(TESTS_UNIT_OBJS): CPPFLAGS += -DJEMALLOC_UNIT_TEST
+$(TESTS_INTEGRATION_OBJS): CPPFLAGS += -DJEMALLOC_INTEGRATION_TEST
+$(TESTS_INTEGRATION_CPP_OBJS): CPPFLAGS += -DJEMALLOC_INTEGRATION_CPP_TEST
+$(TESTS_ANALYZE_OBJS): CPPFLAGS += -DJEMALLOC_ANALYZE_TEST
+$(TESTS_STRESS_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_TEST
+$(TESTS_OBJS): $(objroot)test/%.$(O): $(srcroot)test/%.c
+$(TESTS_CPP_OBJS): $(objroot)test/%.$(O): $(srcroot)test/%.cpp
+$(TESTS_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include
+$(TESTS_CPP_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include
+ifneq ($(IMPORTLIB),$(SO))
+$(CPP_OBJS) $(C_SYM_OBJS) $(C_OBJS) $(C_JET_SYM_OBJS) $(C_JET_OBJS): CPPFLAGS += -DDLLEXPORT
+endif
+
+# Dependencies.
+ifndef CC_MM
+HEADER_DIRS = $(srcroot)include/jemalloc/internal \
+	$(objroot)include/jemalloc $(objroot)include/jemalloc/internal
+HEADERS = $(filter-out $(PRIVATE_NAMESPACE_HDRS),$(wildcard $(foreach dir,$(HEADER_DIRS),$(dir)/*.h)))
+$(C_SYM_OBJS) $(C_OBJS) $(CPP_OBJS) $(C_PIC_OBJS) $(CPP_PIC_OBJS) $(C_JET_SYM_OBJS) $(C_JET_OBJS) $(C_TESTLIB_OBJS) $(TESTS_OBJS) $(TESTS_CPP_OBJS): $(HEADERS)
+$(TESTS_OBJS) $(TESTS_CPP_OBJS): $(objroot)test/include/test/jemalloc_test.h
+endif
+
+$(C_OBJS) $(CPP_OBJS) $(C_PIC_OBJS) $(CPP_PIC_OBJS) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(TESTS_INTEGRATION_OBJS) $(TESTS_INTEGRATION_CPP_OBJS): $(objroot)include/jemalloc/internal/private_namespace.h
+$(C_JET_OBJS) $(C_TESTLIB_UNIT_OBJS) $(C_TESTLIB_ANALYZE_OBJS) $(C_TESTLIB_STRESS_OBJS) $(TESTS_UNIT_OBJS) $(TESTS_ANALYZE_OBJS) $(TESTS_STRESS_OBJS): $(objroot)include/jemalloc/internal/private_namespace_jet.h
+
+$(C_SYM_OBJS) $(C_OBJS) $(C_PIC_OBJS) $(C_JET_SYM_OBJS) $(C_JET_OBJS) $(C_TESTLIB_OBJS) $(TESTS_OBJS): %.$(O):
+	@mkdir -p $(@D)
+	$(CC) $(CFLAGS) -c $(CPPFLAGS) $(CTARGET) $<
+ifdef CC_MM
+	@$(CC) -MM $(CPPFLAGS) -MT $@ -o $(@:%.$(O)=%.d) $<
+endif
+
+$(C_SYMS): %.sym:
+	@mkdir -p $(@D)
+	$(DUMP_SYMS) $< | $(AWK) -f $(objroot)include/jemalloc/internal/private_symbols.awk > $@
+
+$(C_JET_SYMS): %.sym:
+	@mkdir -p $(@D)
+	$(DUMP_SYMS) $< | $(AWK) -f $(objroot)include/jemalloc/internal/private_symbols_jet.awk > $@
+
+$(objroot)include/jemalloc/internal/private_namespace.gen.h: $(C_SYMS)
+	$(SHELL) $(srcroot)include/jemalloc/internal/private_namespace.sh $^ > $@
+
+$(objroot)include/jemalloc/internal/private_namespace_jet.gen.h: $(C_JET_SYMS)
+	$(SHELL) $(srcroot)include/jemalloc/internal/private_namespace.sh $^ > $@
+
+%.h: %.gen.h
+	@if ! `cmp -s $< $@` ; then echo "cp $< $@"; cp $< $@ ; fi
+
+$(CPP_OBJS) $(CPP_PIC_OBJS) $(TESTS_CPP_OBJS): %.$(O):
+	@mkdir -p $(@D)
+	$(CXX) $(CXXFLAGS) -c $(CPPFLAGS) $(CTARGET) $<
+ifdef CC_MM
+	@$(CXX) -MM $(CPPFLAGS) -MT $@ -o $(@:%.$(O)=%.d) $<
+endif
+
+ifneq ($(SOREV),$(SO))
+%.$(SO) : %.$(SOREV)
+	@mkdir -p $(@D)
+	ln -sf $(<F) $@
+endif
+
+$(objroot)lib/$(LIBJEMALLOC).$(SOREV) : $(if $(PIC_CFLAGS),$(C_PIC_OBJS),$(C_OBJS)) $(if $(PIC_CFLAGS),$(CPP_PIC_OBJS),$(CPP_OBJS))
+	@mkdir -p $(@D)
+	$(CC) $(DSO_LDFLAGS) $(call RPATH,$(RPATH_EXTRA)) $(LDTARGET) $+ $(LDFLAGS) $(LIBS) $(EXTRA_LDFLAGS)
+
+$(objroot)lib/$(LIBJEMALLOC)_pic.$(A) : $(C_PIC_OBJS) $(CPP_PIC_OBJS)
+$(objroot)lib/$(LIBJEMALLOC).$(A) : $(C_OBJS) $(CPP_OBJS)
+$(objroot)lib/$(LIBJEMALLOC)_s.$(A) : $(C_OBJS) $(CPP_OBJS)
+
+$(STATIC_LIBS):
+	@mkdir -p $(@D)
+	$(AR) $(ARFLAGS)@AROUT@ $+
+
+$(objroot)test/unit/%$(EXE): $(objroot)test/unit/%.$(O) $(C_JET_OBJS) $(C_TESTLIB_UNIT_OBJS)
+	@mkdir -p $(@D)
+	$(CC) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LDFLAGS) $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)
+
+$(objroot)test/integration/%$(EXE): $(objroot)test/integration/%.$(O) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB)
+	@mkdir -p $(@D)
+	$(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LJEMALLOC) $(LDFLAGS) $(filter-out -lm,$(filter -lrt -pthread -lstdc++,$(LIBS))) $(LM) $(EXTRA_LDFLAGS)
+
+$(objroot)test/integration/cpp/%$(EXE): $(objroot)test/integration/cpp/%.$(O) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB)
+	@mkdir -p $(@D)
+	$(CXX) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) $(LDFLAGS) $(filter-out -lm,$(LIBS)) -lm $(EXTRA_LDFLAGS)
+
+$(objroot)test/analyze/%$(EXE): $(objroot)test/analyze/%.$(O) $(C_JET_OBJS) $(C_TESTLIB_ANALYZE_OBJS)
+	@mkdir -p $(@D)
+	$(CC) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LDFLAGS) $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)
+
+$(objroot)test/stress/%$(EXE): $(objroot)test/stress/%.$(O) $(C_JET_OBJS) $(C_TESTLIB_STRESS_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB)
+	@mkdir -p $(@D)
+	$(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) $(LDFLAGS) $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)
+
+build_lib_shared: $(DSOS)
+build_lib_static: $(STATIC_LIBS)
+ifeq ($(enable_shared), 1)
+build_lib: build_lib_shared
+endif
+ifeq ($(enable_static), 1)
+build_lib: build_lib_static
+endif
+
+install_bin:
+	$(INSTALL) -d $(BINDIR)
+	@for b in $(BINS); do \
+	$(INSTALL) -v -m 755 $$b $(BINDIR); \
+done
+
+install_include:
+	$(INSTALL) -d $(INCLUDEDIR)/jemalloc
+	@for h in $(C_HDRS); do \
+	$(INSTALL) -v -m 644 $$h $(INCLUDEDIR)/jemalloc; \
+done
+
+install_lib_shared: $(DSOS)
+	$(INSTALL) -d $(LIBDIR)
+	$(INSTALL) -v -m 755 $(objroot)lib/$(LIBJEMALLOC).$(SOREV) $(LIBDIR)
+ifneq ($(SOREV),$(SO))
+	ln -sf $(LIBJEMALLOC).$(SOREV) $(LIBDIR)/$(LIBJEMALLOC).$(SO)
+endif
+
+install_lib_static: $(STATIC_LIBS)
+	$(INSTALL) -d $(LIBDIR)
+	@for l in $(STATIC_LIBS); do \
+	$(INSTALL) -v -m 755 $$l $(LIBDIR); \
+done
+
+install_lib_pc: $(PC)
+	$(INSTALL) -d $(LIBDIR)/pkgconfig
+	@for l in $(PC); do \
+	$(INSTALL) -v -m 644 $$l $(LIBDIR)/pkgconfig; \
+done
+
+ifeq ($(enable_shared), 1)
+install_lib: install_lib_shared
+endif
+ifeq ($(enable_static), 1)
+install_lib: install_lib_static
+endif
+install_lib: install_lib_pc
+
+install_doc_html: build_doc_html
+	$(INSTALL) -d $(DATADIR)/doc/jemalloc$(install_suffix)
+	@for d in $(DOCS_HTML); do \
+	$(INSTALL) -v -m 644 $$d $(DATADIR)/doc/jemalloc$(install_suffix); \
+done
+
+install_doc_man: build_doc_man
+	$(INSTALL) -d $(MANDIR)/man3
+	@for d in $(DOCS_MAN3); do \
+	$(INSTALL) -v -m 644 $$d $(MANDIR)/man3; \
+done
+
+install_doc: install_doc_html install_doc_man
+
+install: install_bin install_include install_lib
+
+ifeq ($(enable_doc), 1)
+install: install_doc
+endif
+
+uninstall_bin:
+	$(RM) -v $(foreach b,$(notdir $(BINS)),$(BINDIR)/$(b))
+
+uninstall_include:
+	$(RM) -v $(foreach h,$(notdir $(C_HDRS)),$(INCLUDEDIR)/jemalloc/$(h))
+	rmdir -v $(INCLUDEDIR)/jemalloc
+
+uninstall_lib_shared:
+	$(RM) -v $(LIBDIR)/$(LIBJEMALLOC).$(SOREV)
+ifneq ($(SOREV),$(SO))
+	$(RM) -v $(LIBDIR)/$(LIBJEMALLOC).$(SO)
+endif
+
+uninstall_lib_static:
+	$(RM) -v $(foreach l,$(notdir $(STATIC_LIBS)),$(LIBDIR)/$(l))
+
+uninstall_lib_pc:
+	$(RM) -v $(foreach p,$(notdir $(PC)),$(LIBDIR)/pkgconfig/$(p))
+
+ifeq ($(enable_shared), 1)
+uninstall_lib: uninstall_lib_shared
+endif
+ifeq ($(enable_static), 1)
+uninstall_lib: uninstall_lib_static
+endif
+uninstall_lib: uninstall_lib_pc
+
+uninstall_doc_html:
+	$(RM) -v $(foreach d,$(notdir $(DOCS_HTML)),$(DATADIR)/doc/jemalloc$(install_suffix)/$(d))
+	rmdir -v $(DATADIR)/doc/jemalloc$(install_suffix)
+
+uninstall_doc_man:
+	$(RM) -v $(foreach d,$(notdir $(DOCS_MAN3)),$(MANDIR)/man3/$(d))
+
+uninstall_doc: uninstall_doc_html uninstall_doc_man
+
+uninstall: uninstall_bin uninstall_include uninstall_lib
+
+ifeq ($(enable_doc), 1)
+uninstall: uninstall_doc
+endif
+
+tests_unit: $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%$(EXE))
+tests_integration: $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%$(EXE)) $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%$(EXE))
+tests_analyze: $(TESTS_ANALYZE:$(srcroot)%.c=$(objroot)%$(EXE))
+tests_stress: $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%$(EXE))
+tests: tests_unit tests_integration tests_analyze tests_stress
+
+check_unit_dir:
+	@mkdir -p $(objroot)test/unit
+check_integration_dir:
+	@mkdir -p $(objroot)test/integration
+analyze_dir:
+	@mkdir -p $(objroot)test/analyze
+stress_dir:
+	@mkdir -p $(objroot)test/stress
+check_dir: check_unit_dir check_integration_dir
+
+check_unit: tests_unit check_unit_dir
+	$(SHELL) $(objroot)test/test.sh $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%)
+check_integration_prof: tests_integration check_integration_dir
+ifeq ($(enable_prof), 1)
+	$(MALLOC_CONF)="prof:true" $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%) $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%)
+	$(MALLOC_CONF)="prof:true,prof_active:false" $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%) $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%)
+endif
+check_integration_decay: tests_integration check_integration_dir
+	$(MALLOC_CONF)="dirty_decay_ms:-1,muzzy_decay_ms:-1" $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%) $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%)
+	$(MALLOC_CONF)="dirty_decay_ms:0,muzzy_decay_ms:0" $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%) $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%)
+check_integration: tests_integration check_integration_dir
+	$(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%) $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%)
+analyze: tests_analyze analyze_dir
+ifeq ($(enable_prof), 1)
+	$(MALLOC_CONF)="prof:true" $(SHELL) $(objroot)test/test.sh $(TESTS_ANALYZE:$(srcroot)%.c=$(objroot)%)
+else
+	$(SHELL) $(objroot)test/test.sh $(TESTS_ANALYZE:$(srcroot)%.c=$(objroot)%)
+endif
+stress: tests_stress stress_dir
+	$(SHELL) $(objroot)test/test.sh $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%)
+check: check_unit check_integration check_integration_decay check_integration_prof
+
+clean:
+	rm -f $(PRIVATE_NAMESPACE_HDRS)
+	rm -f $(PRIVATE_NAMESPACE_GEN_HDRS)
+	rm -f $(C_SYM_OBJS)
+	rm -f $(C_SYMS)
+	rm -f $(C_OBJS)
+	rm -f $(CPP_OBJS)
+	rm -f $(C_PIC_OBJS)
+	rm -f $(CPP_PIC_OBJS)
+	rm -f $(C_JET_SYM_OBJS)
+	rm -f $(C_JET_SYMS)
+	rm -f $(C_JET_OBJS)
+	rm -f $(C_TESTLIB_OBJS)
+	rm -f $(C_SYM_OBJS:%.$(O)=%.d)
+	rm -f $(C_OBJS:%.$(O)=%.d)
+	rm -f $(CPP_OBJS:%.$(O)=%.d)
+	rm -f $(C_PIC_OBJS:%.$(O)=%.d)
+	rm -f $(CPP_PIC_OBJS:%.$(O)=%.d)
+	rm -f $(C_JET_SYM_OBJS:%.$(O)=%.d)
+	rm -f $(C_JET_OBJS:%.$(O)=%.d)
+	rm -f $(C_TESTLIB_OBJS:%.$(O)=%.d)
+	rm -f $(TESTS_OBJS:%.$(O)=%$(EXE))
+	rm -f $(TESTS_OBJS)
+	rm -f $(TESTS_OBJS:%.$(O)=%.d)
+	rm -f $(TESTS_OBJS:%.$(O)=%.out)
+	rm -f $(TESTS_CPP_OBJS:%.$(O)=%$(EXE))
+	rm -f $(TESTS_CPP_OBJS)
+	rm -f $(TESTS_CPP_OBJS:%.$(O)=%.d)
+	rm -f $(TESTS_CPP_OBJS:%.$(O)=%.out)
+	rm -f $(DSOS) $(STATIC_LIBS)
+
+distclean: clean
+	rm -f $(objroot)bin/jemalloc-config
+	rm -f $(objroot)bin/jemalloc.sh
+	rm -f $(objroot)bin/jeprof
+	rm -f $(objroot)config.log
+	rm -f $(objroot)config.status
+	rm -f $(objroot)config.stamp
+	rm -f $(cfghdrs_out)
+	rm -f $(cfgoutputs_out)
+
+relclean: distclean
+	rm -f $(objroot)configure
+	rm -f $(objroot)VERSION
+	rm -f $(DOCS_HTML)
+	rm -f $(DOCS_MAN3)
+
+#===============================================================================
+# Re-configuration rules.
+
+ifeq ($(enable_autogen), 1)
+$(srcroot)configure : $(srcroot)configure.ac
+	cd ./$(srcroot) && $(AUTOCONF)
+
+$(objroot)config.status : $(srcroot)configure
+	./$(objroot)config.status --recheck
+
+$(srcroot)config.stamp.in : $(srcroot)configure.ac
+	echo stamp > $(srcroot)config.stamp.in
+
+$(objroot)config.stamp : $(cfgoutputs_in) $(cfghdrs_in) $(srcroot)configure
+	./$(objroot)config.status
+	@touch $@
+
+# There must be some action in order for make to re-read Makefile when it is
+# out of date.
+$(cfgoutputs_out) $(cfghdrs_out) : $(objroot)config.stamp
+	@true
+endif
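
The Makefile.in above defines the partial-build and test targets referenced in INSTALL.md. A short sketch of how they are typically invoked from a configured build directory:

    # Partial builds and checks using targets defined in the Makefile above.
    make build_lib_shared   # only the shared library (DSOS)
    make build_lib_static   # only the static libraries (STATIC_LIBS)
    make tests              # compile the unit, integration, analyze, and stress tests
    make -k check           # run the unit/integration checks, continuing past failures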

+ 20 - 0
BeefRT/JEMalloc/README

@@ -0,0 +1,20 @@
+jemalloc is a general purpose malloc(3) implementation that emphasizes
+fragmentation avoidance and scalable concurrency support.  jemalloc first came
+into use as the FreeBSD libc allocator in 2005, and since then it has found its
+way into numerous applications that rely on its predictable behavior.  In 2010
+jemalloc development efforts broadened to include developer support features
+such as heap profiling and extensive monitoring/tuning hooks.  Modern jemalloc
+releases continue to be integrated back into FreeBSD, and therefore versatility
+remains critical.  Ongoing development efforts trend toward making jemalloc
+among the best allocators for a broad range of demanding applications, and
+eliminating/mitigating weaknesses that have practical repercussions for real
+world applications.
+
+The COPYING file contains copyright and licensing information.
+
+The INSTALL file contains information on how to configure, build, and install
+jemalloc.
+
+The ChangeLog file contains a brief summary of changes for each release.
+
+URL: http://jemalloc.net/

+ 1 - 0
BeefRT/JEMalloc/VERSION

@@ -0,0 +1 @@
+5.3.0-0-g54eaed1d8b56b1aa528be3bdd1877e59c56fa90c

+ 17 - 0
BeefRT/JEMalloc/autogen.sh

@@ -0,0 +1,17 @@
+#!/bin/sh
+
+for i in autoconf; do
+    echo "$i"
+    $i
+    if [ $? -ne 0 ]; then
+	echo "Error $? in $i"
+	exit 1
+    fi
+done
+
+echo "./configure --enable-autogen $@"
+./configure --enable-autogen $@
+if [ $? -ne 0 ]; then
+    echo "Error $? in ./configure"
+    exit 1
+fi

+ 83 - 0
BeefRT/JEMalloc/bin/jemalloc-config.in

@@ -0,0 +1,83 @@
+#!/bin/sh
+
+usage() {
+	cat <<EOF
+Usage:
+  @BINDIR@/jemalloc-config <option>
+Options:
+  --help | -h  : Print usage.
+  --version    : Print jemalloc version.
+  --revision   : Print shared library revision number.
+  --config     : Print configure options used to build jemalloc.
+  --prefix     : Print installation directory prefix.
+  --bindir     : Print binary installation directory.
+  --datadir    : Print data installation directory.
+  --includedir : Print include installation directory.
+  --libdir     : Print library installation directory.
+  --mandir     : Print manual page installation directory.
+  --cc         : Print compiler used to build jemalloc.
+  --cflags     : Print compiler flags used to build jemalloc.
+  --cppflags   : Print preprocessor flags used to build jemalloc.
+  --cxxflags   : Print C++ compiler flags used to build jemalloc.
+  --ldflags    : Print library flags used to build jemalloc.
+  --libs       : Print libraries jemalloc was linked against.
+EOF
+}
+
+prefix="@prefix@"
+exec_prefix="@exec_prefix@"
+
+case "$1" in
+--help | -h)
+	usage
+	exit 0
+	;;
+--version)
+	echo "@jemalloc_version@"
+	;;
+--revision)
+	echo "@rev@"
+	;;
+--config)
+	echo "@CONFIG@"
+	;;
+--prefix)
+	echo "@PREFIX@"
+	;;
+--bindir)
+	echo "@BINDIR@"
+	;;
+--datadir)
+	echo "@DATADIR@"
+	;;
+--includedir)
+	echo "@INCLUDEDIR@"
+	;;
+--libdir)
+	echo "@LIBDIR@"
+	;;
+--mandir)
+	echo "@MANDIR@"
+	;;
+--cc)
+	echo "@CC@"
+	;;
+--cflags)
+	echo "@CFLAGS@"
+	;;
+--cppflags)
+	echo "@CPPFLAGS@"
+	;;
+--cxxflags)
+	echo "@CXXFLAGS@"
+	;;
+--ldflags)
+	echo "@LDFLAGS@ @EXTRA_LDFLAGS@"
+	;;
+--libs)
+	echo "@LIBS@"
+	;;
+*)
+	usage
+	exit 1
+esac

+ 9 - 0
BeefRT/JEMalloc/bin/jemalloc.sh.in

@@ -0,0 +1,9 @@
+#!/bin/sh
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+
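+# Usage sketch: `jemalloc.sh <command> [args...]` runs <command> with
+# libjemalloc preloaded via the platform's LD_PRELOAD-style variable below.
+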
+@LD_PRELOAD_VAR@=${libdir}/libjemalloc.@SOREV@
+export @LD_PRELOAD_VAR@
+exec "$@"

+ 5723 - 0
BeefRT/JEMalloc/bin/jeprof.in

@@ -0,0 +1,5723 @@
+#! /usr/bin/env perl
+
+# Copyright (c) 1998-2007, Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#     * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#     * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# ---
+# Program for printing the profile generated by common/profiler.cc,
+# or by the heap profiler (common/debugallocation.cc)
+#
+# The profile contains a sequence of entries of the form:
+#       <count> <stack trace>
+# This program parses the profile, and generates user-readable
+# output.
+#
+# Examples:
+#
+# % tools/jeprof "program" "profile"
+#   Enters "interactive" mode
+#
+# % tools/jeprof --text "program" "profile"
+#   Generates one line per procedure
+#
+# % tools/jeprof --gv "program" "profile"
+#   Generates annotated call-graph and displays via "gv"
+#
+# % tools/jeprof --gv --focus=Mutex "program" "profile"
+#   Restrict to code paths that involve an entry that matches "Mutex"
+#
+# % tools/jeprof --gv --focus=Mutex --ignore=string "program" "profile"
+#   Restrict to code paths that involve an entry that matches "Mutex"
+#   and does not match "string"
+#
+# % tools/jeprof --list=IBF_CheckDocid "program" "profile"
+#   Generates disassembly listing of all routines with at least one
+#   sample that match the --list=<regexp> pattern.  The listing is
+#   annotated with the flat and cumulative sample counts at each line.
+#
+# % tools/jeprof --disasm=IBF_CheckDocid "program" "profile"
+#   Generates disassembly listing of all routines with at least one
+#   sample that match the --disasm=<regexp> pattern.  The listing is
+#   annotated with the flat and cumulative sample counts at each PC value.
+#
+# TODO: Use color to indicate files?
+
+use strict;
+use warnings;
+use Getopt::Long;
+use Cwd;
+
+my $JEPROF_VERSION = "@jemalloc_version@";
+my $PPROF_VERSION = "2.0";
+
+# These are the object tools we use, which can come from a
+# user-specified location via --tools, from the JEPROF_TOOLS
+# environment variable, or, failing that, from $PATH.
+my %obj_tool_map = (
+  "objdump" => "objdump",
+  "nm" => "nm",
+  "addr2line" => "addr2line",
+  "c++filt" => "c++filt",
+  ## ConfigureObjTools may add architecture-specific entries:
+  #"nm_pdb" => "nm-pdb",       # for reading windows (PDB-format) executables
+  #"addr2line_pdb" => "addr2line-pdb",                                # ditto
+  #"otool" => "otool",         # equivalent of objdump on OS X
+);
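+# For example (hypothetical prefix): with JEPROF_TOOLS=/opt/cross/bin/arm-linux-,
+# "nm" above resolves to /opt/cross/bin/arm-linux-nm.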
+# NOTE: these are lists, so you can put in commandline flags if you want.
+my @DOT = ("dot");          # leave non-absolute, since it may be in /usr/local
+my @GV = ("gv");
+my @EVINCE = ("evince");    # could also be xpdf or perhaps acroread
+my @KCACHEGRIND = ("kcachegrind");
+my @PS2PDF = ("ps2pdf");
+# These are used for dynamic profiles
+my @URL_FETCHER = ("curl", "-s", "--fail");
+
+# These are the web pages that servers need to support for dynamic profiles
+my $HEAP_PAGE = "/pprof/heap";
+my $PROFILE_PAGE = "/pprof/profile";   # must support cgi-param "?seconds=#"
+my $PMUPROFILE_PAGE = "/pprof/pmuprofile(?:\\?.*)?"; # must support cgi-param
+                                                # ?seconds=#&event=x&period=n
+my $GROWTH_PAGE = "/pprof/growth";
+my $CONTENTION_PAGE = "/pprof/contention";
+my $WALL_PAGE = "/pprof/wall(?:\\?.*)?";  # accepts options like namefilter
+my $FILTEREDPROFILE_PAGE = "/pprof/filteredprofile(?:\\?.*)?";
+my $CENSUSPROFILE_PAGE = "/pprof/censusprofile(?:\\?.*)?"; # must support cgi-param
+                                                       # "?seconds=#",
+                                                       # "?tags_regexp=#" and
+                                                       # "?type=#".
+my $SYMBOL_PAGE = "/pprof/symbol";     # must support symbol lookup via POST
+my $PROGRAM_NAME_PAGE = "/pprof/cmdline";
+
+# These are the web pages that can be named on the command line.
+# All the alternatives must begin with /.
+my $PROFILES = "($HEAP_PAGE|$PROFILE_PAGE|$PMUPROFILE_PAGE|" .
+               "$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|" .
+               "$FILTEREDPROFILE_PAGE|$CENSUSPROFILE_PAGE)";
+
+# default binary name
+my $UNKNOWN_BINARY = "(unknown)";
+
+# There is a pervasive dependency on the length (in hex characters,
+# i.e., nibbles) of an address, distinguishing between 32-bit and
+# 64-bit profiles.  To err on the safe side, default to 64-bit here:
+my $address_length = 16;
+
+my $dev_null = "/dev/null";
+if (! -e $dev_null && $^O =~ /MSWin/) {    # $^O is the OS perl was built for
+  $dev_null = "nul";
+}
+
+# A list of paths to search for shared object files
+my @prefix_list = ();
+
+# Special routine name that should not have any symbols.
+# Used as separator to parse "addr2line -i" output.
+my $sep_symbol = '_fini';
+my $sep_address = undef;
+
+##### Argument parsing #####
+
+sub usage_string {
+  return <<EOF;
+Usage:
+jeprof [options] <program> <profiles>
+   <profiles> is a space separated list of profile names.
+jeprof [options] <symbolized-profiles>
+   <symbolized-profiles> is a list of profile files where each file contains
+   the necessary symbol mappings  as well as profile data (likely generated
+   with --raw).
+jeprof [options] <profile>
+   <profile> is a remote form.  Symbols are obtained from host:port$SYMBOL_PAGE
+
+   Each name can be:
+   /path/to/profile        - a path to a profile file
+   host:port[/<service>]   - a location of a service to get profile from
+
+   The /<service> can be $HEAP_PAGE, $PROFILE_PAGE, /pprof/pmuprofile,
+                         $GROWTH_PAGE, $CONTENTION_PAGE, /pprof/wall,
+                         $CENSUSPROFILE_PAGE, or /pprof/filteredprofile.
+   For instance:
+     jeprof http://myserver.com:80$HEAP_PAGE
+   If /<service> is omitted, the service defaults to $PROFILE_PAGE (cpu profiling).
+jeprof --symbols <program>
+   Maps addresses to symbol names.  In this mode, stdin should be a
+   list of library mappings, in the same format as is found in the heap-
+   and cpu-profile files (this loosely matches that of /proc/self/maps
+   on linux), followed by a list of hex addresses to map, one per line.
+
+   For more help with querying remote servers, including how to add the
+   necessary server-side support code, see this filename (or one like it):
+
+   /usr/doc/gperftools-$PPROF_VERSION/pprof_remote_servers.html
+
+Options:
+   --cum               Sort by cumulative data
+   --base=<base>       Subtract <base> from <profile> before display
+   --interactive       Run in interactive mode (interactive "help" gives help) [default]
+   --seconds=<n>       Length of time for dynamic profiles [default=30 secs]
+   --add_lib=<file>    Read additional symbols and line info from the given library
+   --lib_prefix=<dir>  Comma separated list of library path prefixes
+
+Reporting Granularity:
+   --addresses         Report at address level
+   --lines             Report at source line level
+   --functions         Report at function level [default]
+   --files             Report at source file level
+
+Output type:
+   --text              Generate text report
+   --callgrind         Generate callgrind format to stdout
+   --gv                Generate Postscript and display
+   --evince            Generate PDF and display
+   --web               Generate SVG and display
+   --list=<regexp>     Generate source listing of matching routines
+   --disasm=<regexp>   Generate disassembly of matching routines
+   --symbols           Print demangled symbol names found at given addresses
+   --dot               Generate DOT file to stdout
+   --ps                Generate Postscript to stdout
+   --pdf               Generate PDF to stdout
+   --svg               Generate SVG to stdout
+   --gif               Generate GIF to stdout
+   --raw               Generate symbolized jeprof data (useful with remote fetch)
+   --collapsed         Generate collapsed stacks for building flame graphs
+                       (see http://www.brendangregg.com/flamegraphs.html)
+
+Heap-Profile Options:
+   --inuse_space       Display in-use (mega)bytes [default]
+   --inuse_objects     Display in-use objects
+   --alloc_space       Display allocated (mega)bytes
+   --alloc_objects     Display allocated objects
+   --show_bytes        Display space in bytes
+   --drop_negative     Ignore negative differences
+
+Contention-profile options:
+   --total_delay       Display total delay at each region [default]
+   --contentions       Display number of delays at each region
+   --mean_delay        Display mean delay at each region
+
+Call-graph Options:
+   --nodecount=<n>     Show at most so many nodes [default=80]
+   --nodefraction=<f>  Hide nodes below <f>*total [default=.005]
+   --edgefraction=<f>  Hide edges below <f>*total [default=.001]
+   --maxdegree=<n>     Max incoming/outgoing edges per node [default=8]
+   --focus=<regexp>    Focus on backtraces with nodes matching <regexp>
+   --thread=<n>        Show profile for thread <n>
+   --ignore=<regexp>   Ignore backtraces with nodes matching <regexp>
+   --scale=<n>         Set GV scaling [default=0]
+   --heapcheck         Make nodes with non-0 object counts
+                       (i.e. direct leak generators) more visible
+   --retain=<regexp>   Retain only nodes that match <regexp>
+   --exclude=<regexp>  Exclude all nodes that match <regexp>
+
+Miscellaneous:
+   --tools=<prefix or binary:fullpath>[,...]   \$PATH for object tool pathnames
+   --test              Run unit tests
+   --help              This message
+   --version           Version information
+   --debug-syms-by-id  (Linux only) Find debug symbol files by build ID as well as by name
+
+Environment Variables:
+   JEPROF_TMPDIR        Profiles directory. Defaults to \$HOME/jeprof
+   JEPROF_TOOLS         Prefix for object tools pathnames
+
+Examples:
+
+jeprof /bin/ls ls.prof
+                       Enters "interactive" mode
+jeprof --text /bin/ls ls.prof
+                       Outputs one line per procedure
+jeprof --web /bin/ls ls.prof
+                       Displays annotated call-graph in web browser
+jeprof --gv /bin/ls ls.prof
+                       Displays annotated call-graph via 'gv'
+jeprof --gv --focus=Mutex /bin/ls ls.prof
+                       Restricts to code paths including a .*Mutex.* entry
+jeprof --gv --focus=Mutex --ignore=string /bin/ls ls.prof
+                       Code paths including Mutex but not string
+jeprof --list=getdir /bin/ls ls.prof
+                       (Per-line) annotated source listing for getdir()
+jeprof --disasm=getdir /bin/ls ls.prof
+                       (Per-PC) annotated disassembly for getdir()
+
+jeprof http://localhost:1234/
+                       Enters "interactive" mode
+jeprof --text localhost:1234
+                       Outputs one line per procedure for localhost:1234
+jeprof --raw localhost:1234 > ./local.raw
+jeprof --text ./local.raw
+                       Fetches a remote profile for later analysis and then
+                       analyzes it in text mode.
+EOF
+}
+
+sub version_string {
+  return <<EOF
+jeprof (part of jemalloc $JEPROF_VERSION)
+based on pprof (part of gperftools $PPROF_VERSION)
+
+Copyright 1998-2007 Google Inc.
+
+This is BSD licensed software; see the source for copying conditions
+and license information.
+There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.
+EOF
+}
+
+sub usage {
+  my $msg = shift;
+  print STDERR "$msg\n\n";
+  print STDERR usage_string();
+  print STDERR "\nFATAL ERROR: $msg\n";    # just as a reminder
+  exit(1);
+}
+
+sub Init() {
+  # Set up the tmp-file name and a handler to clean it up.
+  # We do this at the very beginning so that we can use the
+  # error() and cleanup() functions anytime hereafter.
+  $main::tmpfile_sym = "/tmp/jeprof$$.sym";
+  $main::tmpfile_ps = "/tmp/jeprof$$";
+  $main::next_tmpfile = 0;
+  $SIG{'INT'} = \&sighandler;
+
+  # Cache from filename/linenumber to source code
+  $main::source_cache = ();
+
+  $main::opt_help = 0;
+  $main::opt_version = 0;
+
+  $main::opt_cum = 0;
+  $main::opt_base = '';
+  $main::opt_addresses = 0;
+  $main::opt_lines = 0;
+  $main::opt_functions = 0;
+  $main::opt_files = 0;
+  $main::opt_lib_prefix = "";
+
+  $main::opt_text = 0;
+  $main::opt_callgrind = 0;
+  $main::opt_list = "";
+  $main::opt_disasm = "";
+  $main::opt_symbols = 0;
+  $main::opt_gv = 0;
+  $main::opt_evince = 0;
+  $main::opt_web = 0;
+  $main::opt_dot = 0;
+  $main::opt_ps = 0;
+  $main::opt_pdf = 0;
+  $main::opt_gif = 0;
+  $main::opt_svg = 0;
+  $main::opt_raw = 0;
+  $main::opt_collapsed = 0;
+
+  $main::opt_nodecount = 80;
+  $main::opt_nodefraction = 0.005;
+  $main::opt_edgefraction = 0.001;
+  $main::opt_maxdegree = 8;
+  $main::opt_focus = '';
+  $main::opt_thread = undef;
+  $main::opt_ignore = '';
+  $main::opt_scale = 0;
+  $main::opt_heapcheck = 0;
+  $main::opt_retain = '';
+  $main::opt_exclude = '';
+  $main::opt_seconds = 30;
+  $main::opt_lib = "";
+
+  $main::opt_inuse_space   = 0;
+  $main::opt_inuse_objects = 0;
+  $main::opt_alloc_space   = 0;
+  $main::opt_alloc_objects = 0;
+  $main::opt_show_bytes    = 0;
+  $main::opt_drop_negative = 0;
+  $main::opt_interactive   = 0;
+
+  $main::opt_total_delay = 0;
+  $main::opt_contentions = 0;
+  $main::opt_mean_delay = 0;
+
+  $main::opt_tools   = "";
+  $main::opt_debug   = 0;
+  $main::opt_test    = 0;
+  $main::opt_debug_syms_by_id = 0;
+
+  # These are undocumented flags used only by unittests.
+  $main::opt_test_stride = 0;
+
+  # Are we using $SYMBOL_PAGE?
+  $main::use_symbol_page = 0;
+
+  # Files returned by TempName.
+  %main::tempnames = ();
+
+  # Type of profile we are dealing with
+  # Supported types:
+  #     cpu
+  #     heap
+  #     growth
+  #     contention
+  $main::profile_type = '';     # Empty type means "unknown"
+
+  GetOptions("help!"          => \$main::opt_help,
+             "version!"       => \$main::opt_version,
+             "cum!"           => \$main::opt_cum,
+             "base=s"         => \$main::opt_base,
+             "seconds=i"      => \$main::opt_seconds,
+             "add_lib=s"      => \$main::opt_lib,
+             "lib_prefix=s"   => \$main::opt_lib_prefix,
+             "functions!"     => \$main::opt_functions,
+             "lines!"         => \$main::opt_lines,
+             "addresses!"     => \$main::opt_addresses,
+             "files!"         => \$main::opt_files,
+             "text!"          => \$main::opt_text,
+             "callgrind!"     => \$main::opt_callgrind,
+             "list=s"         => \$main::opt_list,
+             "disasm=s"       => \$main::opt_disasm,
+             "symbols!"       => \$main::opt_symbols,
+             "gv!"            => \$main::opt_gv,
+             "evince!"        => \$main::opt_evince,
+             "web!"           => \$main::opt_web,
+             "dot!"           => \$main::opt_dot,
+             "ps!"            => \$main::opt_ps,
+             "pdf!"           => \$main::opt_pdf,
+             "svg!"           => \$main::opt_svg,
+             "gif!"           => \$main::opt_gif,
+             "raw!"           => \$main::opt_raw,
+             "collapsed!"     => \$main::opt_collapsed,
+             "interactive!"   => \$main::opt_interactive,
+             "nodecount=i"    => \$main::opt_nodecount,
+             "nodefraction=f" => \$main::opt_nodefraction,
+             "edgefraction=f" => \$main::opt_edgefraction,
+             "maxdegree=i"    => \$main::opt_maxdegree,
+             "focus=s"        => \$main::opt_focus,
+             "thread=s"       => \$main::opt_thread,
+             "ignore=s"       => \$main::opt_ignore,
+             "scale=i"        => \$main::opt_scale,
+             "heapcheck"      => \$main::opt_heapcheck,
+             "retain=s"       => \$main::opt_retain,
+             "exclude=s"      => \$main::opt_exclude,
+             "inuse_space!"   => \$main::opt_inuse_space,
+             "inuse_objects!" => \$main::opt_inuse_objects,
+             "alloc_space!"   => \$main::opt_alloc_space,
+             "alloc_objects!" => \$main::opt_alloc_objects,
+             "show_bytes!"    => \$main::opt_show_bytes,
+             "drop_negative!" => \$main::opt_drop_negative,
+             "total_delay!"   => \$main::opt_total_delay,
+             "contentions!"   => \$main::opt_contentions,
+             "mean_delay!"    => \$main::opt_mean_delay,
+             "tools=s"        => \$main::opt_tools,
+             "test!"          => \$main::opt_test,
+             "debug!"         => \$main::opt_debug,
+             "debug-syms-by-id!" => \$main::opt_debug_syms_by_id,
+             # Undocumented flags used only by unittests:
+             "test_stride=i"  => \$main::opt_test_stride,
+      ) || usage("Invalid option(s)");
+
+  # Deal with the standard --help and --version
+  if ($main::opt_help) {
+    print usage_string();
+    exit(0);
+  }
+
+  if ($main::opt_version) {
+    print version_string();
+    exit(0);
+  }
+
+  # Disassembly/listing/symbols mode requires address-level info
+  if ($main::opt_disasm || $main::opt_list || $main::opt_symbols) {
+    $main::opt_functions = 0;
+    $main::opt_lines = 0;
+    $main::opt_addresses = 1;
+    $main::opt_files = 0;
+  }
+
+  # Check heap-profiling flags
+  if ($main::opt_inuse_space +
+      $main::opt_inuse_objects +
+      $main::opt_alloc_space +
+      $main::opt_alloc_objects > 1) {
+    usage("Specify at most on of --inuse/--alloc options");
+  }
+
+  # Check output granularities
+  my $grains =
+      $main::opt_functions +
+      $main::opt_lines +
+      $main::opt_addresses +
+      $main::opt_files +
+      0;
+  if ($grains > 1) {
+    usage("Only specify one output granularity option");
+  }
+  if ($grains == 0) {
+    $main::opt_functions = 1;
+  }
+
+  # Check output modes
+  my $modes =
+      $main::opt_text +
+      $main::opt_callgrind +
+      ($main::opt_list eq '' ? 0 : 1) +
+      ($main::opt_disasm eq '' ? 0 : 1) +
+      ($main::opt_symbols == 0 ? 0 : 1) +
+      $main::opt_gv +
+      $main::opt_evince +
+      $main::opt_web +
+      $main::opt_dot +
+      $main::opt_ps +
+      $main::opt_pdf +
+      $main::opt_svg +
+      $main::opt_gif +
+      $main::opt_raw +
+      $main::opt_collapsed +
+      $main::opt_interactive +
+      0;
+  if ($modes > 1) {
+    usage("Only specify one output mode");
+  }
+  if ($modes == 0) {
+    if (-t STDOUT) {  # If STDOUT is a tty, activate interactive mode
+      $main::opt_interactive = 1;
+    } else {
+      $main::opt_text = 1;
+    }
+  }
+
+  if ($main::opt_test) {
+    RunUnitTests();
+    # Should not return
+    exit(1);
+  }
+
+  # Binary name and profile arguments list
+  $main::prog = "";
+  @main::pfile_args = ();
+
+  # Remote profiling without a binary (using $SYMBOL_PAGE instead)
+  if (@ARGV > 0) {
+    if (IsProfileURL($ARGV[0])) {
+      $main::use_symbol_page = 1;
+    } elsif (IsSymbolizedProfileFile($ARGV[0])) {
+      $main::use_symbolized_profile = 1;
+      $main::prog = $UNKNOWN_BINARY;  # will be set later from the profile file
+    }
+  }
+
+  if ($main::use_symbol_page || $main::use_symbolized_profile) {
+    # We don't need a binary!
+    my %disabled = ('--lines' => $main::opt_lines,
+                    '--disasm' => $main::opt_disasm);
+    for my $option (keys %disabled) {
+      usage("$option cannot be used without a binary") if $disabled{$option};
+    }
+    # Set $main::prog later...
+    scalar(@ARGV) || usage("Did not specify profile file");
+  } elsif ($main::opt_symbols) {
+    # --symbols needs a binary-name (to run nm on, etc) but not profiles
+    $main::prog = shift(@ARGV) || usage("Did not specify program");
+  } else {
+    $main::prog = shift(@ARGV) || usage("Did not specify program");
+    scalar(@ARGV) || usage("Did not specify profile file");
+  }
+
+  # Parse profile file/location arguments
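+  # (An argument of the form "host@N[/path]" is expanded into N profile
+  #  sources, "0.host[/path]" through "(N-1).host[/path]".)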
+  foreach my $farg (@ARGV) {
+    if ($farg =~ m/(.*)\@([0-9]+)(|\/.*)$/ ) {
+      my $machine = $1;
+      my $num_machines = $2;
+      my $path = $3;
+      for (my $i = 0; $i < $num_machines; $i++) {
+        unshift(@main::pfile_args, "$i.$machine$path");
+      }
+    } else {
+      unshift(@main::pfile_args, $farg);
+    }
+  }
+
+  if ($main::use_symbol_page) {
+    unless (IsProfileURL($main::pfile_args[0])) {
+      error("The first profile should be a remote form to use $SYMBOL_PAGE\n");
+    }
+    CheckSymbolPage();
+    $main::prog = FetchProgramName();
+  } elsif (!$main::use_symbolized_profile) {  # may not need objtools!
+    ConfigureObjTools($main::prog)
+  }
+
+  # Break the opt_lib_prefix into the prefix_list array
+  @prefix_list = split (',', $main::opt_lib_prefix);
+
+  # Remove trailing / from the prefixes in the list, to prevent
+  # searching paths like /my/path//lib/mylib.so
+  foreach (@prefix_list) {
+    s|/+$||;
+  }
+
+  # Flag to prevent us from trying over and over to use
+  #  elfutils if it's not installed (used only with
+  #  --debug-syms-by-id option).
+  $main::gave_up_on_elfutils = 0;
+}
+
+sub FilterAndPrint {
+  my ($profile, $symbols, $libs, $thread) = @_;
+
+  # Get total data in profile
+  my $total = TotalProfile($profile);
+
+  # Remove uninteresting stack items
+  $profile = RemoveUninterestingFrames($symbols, $profile);
+
+  # Focus?
+  if ($main::opt_focus ne '') {
+    $profile = FocusProfile($symbols, $profile, $main::opt_focus);
+  }
+
+  # Ignore?
+  if ($main::opt_ignore ne '') {
+    $profile = IgnoreProfile($symbols, $profile, $main::opt_ignore);
+  }
+
+  my $calls = ExtractCalls($symbols, $profile);
+
+  # Reduce profiles to required output granularity, and also clean
+  # each stack trace so a given entry exists at most once.
+  my $reduced = ReduceProfile($symbols, $profile);
+
+  # Get derived profiles
+  my $flat = FlatProfile($reduced);
+  my $cumulative = CumulativeProfile($reduced);
+
+  # Print
+  if (!$main::opt_interactive) {
+    if ($main::opt_disasm) {
+      PrintDisassembly($libs, $flat, $cumulative, $main::opt_disasm);
+    } elsif ($main::opt_list) {
+      PrintListing($total, $libs, $flat, $cumulative, $main::opt_list, 0);
+    } elsif ($main::opt_text) {
+      # Make sure the output is empty when we have nothing to report
+      # (this only matters when --heapcheck is given, but we must stay
+      # compatible with old branches that did not always pass --heapcheck):
+      if ($total != 0) {
+        printf("Total%s: %s %s\n",
+               (defined($thread) ? " (t$thread)" : ""),
+               Unparse($total), Units());
+      }
+      PrintText($symbols, $flat, $cumulative, -1);
+    } elsif ($main::opt_raw) {
+      PrintSymbolizedProfile($symbols, $profile, $main::prog);
+    } elsif ($main::opt_collapsed) {
+      PrintCollapsedStacks($symbols, $profile);
+    } elsif ($main::opt_callgrind) {
+      PrintCallgrind($calls);
+    } else {
+      if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
+        if ($main::opt_gv) {
+          RunGV(TempName($main::next_tmpfile, "ps"), "");
+        } elsif ($main::opt_evince) {
+          RunEvince(TempName($main::next_tmpfile, "pdf"), "");
+        } elsif ($main::opt_web) {
+          my $tmp = TempName($main::next_tmpfile, "svg");
+          RunWeb($tmp);
+          # The command we run might hand the file name off
+          # to an already running browser instance and then exit.
+          # Normally we'd remove $tmp on exit (right now), but instead
+          # we fork a child to remove $tmp a little later, so that the
+          # browser has time to load it first.
+          delete $main::tempnames{$tmp};
+          if (fork() == 0) {
+            sleep 5;
+            unlink($tmp);
+            exit(0);
+          }
+        }
+      } else {
+        cleanup();
+        exit(1);
+      }
+    }
+  } else {
+    InteractiveMode($profile, $symbols, $libs, $total);
+  }
+}
+
+sub Main() {
+  Init();
+  $main::collected_profile = undef;
+  @main::profile_files = ();
+  $main::op_time = time();
+
+  # Printing symbols is special and requires a lot less info than most modes.
+  if ($main::opt_symbols) {
+    PrintSymbols(*STDIN);   # Get /proc/maps and symbols output from stdin
+    return;
+  }
+
+  # Fetch all profile data
+  FetchDynamicProfiles();
+
+  # this will hold symbols that we read from the profile files
+  my $symbol_map = {};
+
+  # Read one profile, pick the last item on the list
+  my $data = ReadProfile($main::prog, pop(@main::profile_files));
+  my $profile = $data->{profile};
+  my $pcs = $data->{pcs};
+  my $libs = $data->{libs};   # Info about main program and shared libraries
+  $symbol_map = MergeSymbols($symbol_map, $data->{symbols});
+
+  # Add additional profiles, if available.
+  if (scalar(@main::profile_files) > 0) {
+    foreach my $pname (@main::profile_files) {
+      my $data2 = ReadProfile($main::prog, $pname);
+      $profile = AddProfile($profile, $data2->{profile});
+      $pcs = AddPcs($pcs, $data2->{pcs});
+      $symbol_map = MergeSymbols($symbol_map, $data2->{symbols});
+    }
+  }
+
+  # Subtract base from profile, if specified
+  if ($main::opt_base ne '') {
+    my $base = ReadProfile($main::prog, $main::opt_base);
+    $profile = SubtractProfile($profile, $base->{profile});
+    $pcs = AddPcs($pcs, $base->{pcs});
+    $symbol_map = MergeSymbols($symbol_map, $base->{symbols});
+  }
+
+  # Collect symbols
+  my $symbols;
+  if ($main::use_symbolized_profile) {
+    $symbols = FetchSymbols($pcs, $symbol_map);
+  } elsif ($main::use_symbol_page) {
+    $symbols = FetchSymbols($pcs);
+  } else {
+    # TODO(csilvers): $libs uses the /proc/self/maps data from profile1,
+    # which may differ from the data from subsequent profiles, especially
+    # if they were run on different machines.  Use appropriate libs for
+    # each pc somehow.
+    $symbols = ExtractSymbols($libs, $pcs);
+  }
+
+  if (!defined($main::opt_thread)) {
+    FilterAndPrint($profile, $symbols, $libs);
+  }
+  if (defined($data->{threads})) {
+    foreach my $thread (sort { $a <=> $b } keys(%{$data->{threads}})) {
+      if (defined($main::opt_thread) &&
+          ($main::opt_thread eq '*' || $main::opt_thread == $thread)) {
+        my $thread_profile = $data->{threads}{$thread};
+        FilterAndPrint($thread_profile, $symbols, $libs, $thread);
+      }
+    }
+  }
+
+  cleanup();
+  exit(0);
+}
+
+##### Entry Point #####
+
+Main();
+
+# Temporary code to detect if we're running on a Goobuntu system.
+# These systems don't have the right stuff installed for the special
+# Readline libraries to work, so as a temporary workaround, we default
+# to using the normal stdio code rather than the fancier readline-based
+# code.
+sub ReadlineMightFail {
+  if (-e '/lib/libtermcap.so.2') {
+    return 0;  # libtermcap exists, so readline should be okay
+  } else {
+    return 1;
+  }
+}
+
+sub RunGV {
+  my $fname = shift;
+  my $bg = shift;       # "" or " &" if we should run in background
+  if (!system(ShellEscape(@GV, "--version") . " >$dev_null 2>&1")) {
+    # Options using double dash are supported by this gv version.
+    # Also, turn on noantialias to better handle bug in gv for
+    # postscript files with large dimensions.
+    # TODO: Maybe we should not pass the --noantialias flag
+    # if the gv version is known to work properly without the flag.
+    system(ShellEscape(@GV, "--scale=$main::opt_scale", "--noantialias", $fname)
+           . $bg);
+  } else {
+    # Old gv version - only supports options that use single dash.
+    print STDERR ShellEscape(@GV, "-scale", $main::opt_scale) . "\n";
+    system(ShellEscape(@GV, "-scale", "$main::opt_scale", $fname) . $bg);
+  }
+}
+
+sub RunEvince {
+  my $fname = shift;
+  my $bg = shift;       # "" or " &" if we should run in background
+  system(ShellEscape(@EVINCE, $fname) . $bg);
+}
+
+sub RunWeb {
+  my $fname = shift;
+  print STDERR "Loading web page file:///$fname\n";
+
+  if (`uname` =~ /Darwin/) {
+    # OS X: open will use standard preference for SVG files.
+    system("/usr/bin/open", $fname);
+    return;
+  }
+
+  # Some kind of Unix; try generic symlinks, then specific browsers.
+  # (Stop once we find one.)
+  # Works best if the browser is already running.
+  my @alt = (
+    "/etc/alternatives/gnome-www-browser",
+    "/etc/alternatives/x-www-browser",
+    "google-chrome",
+    "firefox",
+  );
+  foreach my $b (@alt) {
+    if (system($b, $fname) == 0) {
+      return;
+    }
+  }
+
+  print STDERR "Could not load web browser.\n";
+}
+
+sub RunKcachegrind {
+  my $fname = shift;
+  my $bg = shift;       # "" or " &" if we should run in background
+  print STDERR "Starting '@KCACHEGRIND " . $fname . $bg . "'\n";
+  system(ShellEscape(@KCACHEGRIND, $fname) . $bg);
+}
+
+
+##### Interactive helper routines #####
+
+sub InteractiveMode {
+  $| = 1;  # Make output unbuffered for interactive mode
+  my ($orig_profile, $symbols, $libs, $total) = @_;
+
+  print STDERR "Welcome to jeprof!  For help, type 'help'.\n";
+
+  # Use ReadLine if it's installed and input comes from a console.
+  if ( -t STDIN &&
+       !ReadlineMightFail() &&
+       defined(eval {require Term::ReadLine}) ) {
+    my $term = new Term::ReadLine 'jeprof';
+    while ( defined ($_ = $term->readline('(jeprof) '))) {
+      $term->addhistory($_) if /\S/;
+      if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) {
+        last;    # exit when we get an interactive command to quit
+      }
+    }
+  } else {       # don't have readline
+    while (1) {
+      print STDERR "(jeprof) ";
+      $_ = <STDIN>;
+      last if ! defined $_ ;
+      s/\r//g;         # turn windows-looking lines into unix-looking lines
+
+      # Save some flags that might be reset by InteractiveCommand()
+      my $save_opt_lines = $main::opt_lines;
+
+      if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) {
+        last;    # exit when we get an interactive command to quit
+      }
+
+      # Restore flags
+      $main::opt_lines = $save_opt_lines;
+    }
+  }
+}
+
+# Takes two args: orig profile, and command to run.
+# Returns 1 if we should keep going, or 0 if we were asked to quit
+sub InteractiveCommand {
+  my($orig_profile, $symbols, $libs, $total, $command) = @_;
+  $_ = $command;                # just to make future m//'s easier
+  if (!defined($_)) {
+    print STDERR "\n";
+    return 0;
+  }
+  if (m/^\s*quit/) {
+    return 0;
+  }
+  if (m/^\s*help/) {
+    InteractiveHelpMessage();
+    return 1;
+  }
+  # Clear all the mode options -- mode is controlled by "$command"
+  $main::opt_text = 0;
+  $main::opt_callgrind = 0;
+  $main::opt_disasm = 0;
+  $main::opt_list = 0;
+  $main::opt_gv = 0;
+  $main::opt_evince = 0;
+  $main::opt_cum = 0;
+
+  if (m/^\s*(text|top)(\d*)\s*(.*)/) {
+    $main::opt_text = 1;
+
+    my $line_limit = ($2 ne "") ? int($2) : 10;
+
+    my $routine;
+    my $ignore;
+    ($routine, $ignore) = ParseInteractiveArgs($3);
+
+    my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore);
+    my $reduced = ReduceProfile($symbols, $profile);
+
+    # Get derived profiles
+    my $flat = FlatProfile($reduced);
+    my $cumulative = CumulativeProfile($reduced);
+
+    PrintText($symbols, $flat, $cumulative, $line_limit);
+    return 1;
+  }
+  if (m/^\s*callgrind\s*([^ \n]*)/) {
+    $main::opt_callgrind = 1;
+
+    # Get derived profiles
+    my $calls = ExtractCalls($symbols, $orig_profile);
+    my $filename = $1;
+    if ( $1 eq '' ) {
+      $filename = TempName($main::next_tmpfile, "callgrind");
+    }
+    PrintCallgrind($calls, $filename);
+    if ( $1 eq '' ) {
+      RunKcachegrind($filename, " & ");
+      $main::next_tmpfile++;
+    }
+
+    return 1;
+  }
+  if (m/^\s*(web)?list\s*(.+)/) {
+    my $html = (defined($1) && ($1 eq "web"));
+    $main::opt_list = 1;
+
+    my $routine;
+    my $ignore;
+    ($routine, $ignore) = ParseInteractiveArgs($2);
+
+    my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore);
+    my $reduced = ReduceProfile($symbols, $profile);
+
+    # Get derived profiles
+    my $flat = FlatProfile($reduced);
+    my $cumulative = CumulativeProfile($reduced);
+
+    PrintListing($total, $libs, $flat, $cumulative, $routine, $html);
+    return 1;
+  }
+  if (m/^\s*disasm\s*(.+)/) {
+    $main::opt_disasm = 1;
+
+    my $routine;
+    my $ignore;
+    ($routine, $ignore) = ParseInteractiveArgs($1);
+
+    # Process current profile to account for various settings
+    my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore);
+    my $reduced = ReduceProfile($symbols, $profile);
+
+    # Get derived profiles
+    my $flat = FlatProfile($reduced);
+    my $cumulative = CumulativeProfile($reduced);
+
+    PrintDisassembly($libs, $flat, $cumulative, $routine);
+    return 1;
+  }
+  if (m/^\s*(gv|web|evince)\s*(.*)/) {
+    $main::opt_gv = 0;
+    $main::opt_evince = 0;
+    $main::opt_web = 0;
+    if ($1 eq "gv") {
+      $main::opt_gv = 1;
+    } elsif ($1 eq "evince") {
+      $main::opt_evince = 1;
+    } elsif ($1 eq "web") {
+      $main::opt_web = 1;
+    }
+
+    my $focus;
+    my $ignore;
+    ($focus, $ignore) = ParseInteractiveArgs($2);
+
+    # Process current profile to account for various settings
+    my $profile = ProcessProfile($total, $orig_profile, $symbols,
+                                 $focus, $ignore);
+    my $reduced = ReduceProfile($symbols, $profile);
+
+    # Get derived profiles
+    my $flat = FlatProfile($reduced);
+    my $cumulative = CumulativeProfile($reduced);
+
+    if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) {
+      if ($main::opt_gv) {
+        RunGV(TempName($main::next_tmpfile, "ps"), " &");
+      } elsif ($main::opt_evince) {
+        RunEvince(TempName($main::next_tmpfile, "pdf"), " &");
+      } elsif ($main::opt_web) {
+        RunWeb(TempName($main::next_tmpfile, "svg"));
+      }
+      $main::next_tmpfile++;
+    }
+    return 1;
+  }
+  if (m/^\s*$/) {
+    return 1;
+  }
+  print STDERR "Unknown command: try 'help'.\n";
+  return 1;
+}
+
+
+sub ProcessProfile {
+  my $total_count = shift;
+  my $orig_profile = shift;
+  my $symbols = shift;
+  my $focus = shift;
+  my $ignore = shift;
+
+  # Process current profile to account for various settings
+  my $profile = $orig_profile;
+  printf("Total: %s %s\n", Unparse($total_count), Units());
+  if ($focus ne '') {
+    $profile = FocusProfile($symbols, $profile, $focus);
+    my $focus_count = TotalProfile($profile);
+    printf("After focusing on '%s': %s %s of %s (%0.1f%%)\n",
+           $focus,
+           Unparse($focus_count), Units(),
+           Unparse($total_count), ($focus_count*100.0) / $total_count);
+  }
+  if ($ignore ne '') {
+    $profile = IgnoreProfile($symbols, $profile, $ignore);
+    my $ignore_count = TotalProfile($profile);
+    printf("After ignoring '%s': %s %s of %s (%0.1f%%)\n",
+           $ignore,
+           Unparse($ignore_count), Units(),
+           Unparse($total_count),
+           ($ignore_count*100.0) / $total_count);
+  }
+
+  return $profile;
+}
+
+sub InteractiveHelpMessage {
+  print STDERR <<ENDOFHELP;
+Interactive jeprof mode
+
+Commands:
+  gv
+  gv [focus] [-ignore1] [-ignore2]
+      Show graphical hierarchical display of current profile.  Without
+      any arguments, shows all samples in the profile.  With the optional
+      "focus" argument, restricts the samples shown to just those where
+      the "focus" regular expression matches a routine name on the stack
+      trace.
+
+  web
+  web [focus] [-ignore1] [-ignore2]
+      Like GV, but displays profile in your web browser instead of using
+      Ghostview. Works best if your web browser is already running.
+      To change the browser that gets used:
+      On Linux, set the /etc/alternatives/gnome-www-browser symlink.
+      On OS X, change the Finder association for SVG files.
+
+  list [routine_regexp] [-ignore1] [-ignore2]
+      Show source listing of routines whose names match "routine_regexp"
+
+  weblist [routine_regexp] [-ignore1] [-ignore2]
+     Displays a source listing of routines whose names match "routine_regexp"
+     in a web browser.  You can click on source lines to view the
+     corresponding disassembly.
+
+  top [--cum] [-ignore1] [-ignore2]
+  top20 [--cum] [-ignore1] [-ignore2]
+  top37 [--cum] [-ignore1] [-ignore2]
+      Show top lines ordered by flat profile count, or cumulative count
+      if --cum is specified.  If a number is present after 'top', the
+      top K routines will be shown (defaults to showing the top 10)
+
+  disasm [routine_regexp] [-ignore1] [-ignore2]
+      Show disassembly of routines whose names match "routine_regexp",
+      annotated with sample counts.
+
+  callgrind
+  callgrind [filename]
+      Generates callgrind file. If no filename is given, kcachegrind is called.
+
+  help - This listing
+  quit or ^D - End jeprof
+
+For commands that accept optional -ignore tags, samples where any routine in
+the stack trace matches the regular expression in any of the -ignore
+parameters will be ignored.
+
+Further pprof details are available at this location (or one similar):
+
+ /usr/doc/gperftools-$PPROF_VERSION/cpu_profiler.html
+ /usr/doc/gperftools-$PPROF_VERSION/heap_profiler.html
+
+ENDOFHELP
+}
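+
+# Split interactive arguments into focus and ignore regexps.  E.g.
+# (illustrative) "Mutex -string --cum" focuses on Mutex, ignores string,
+# and turns on cumulative sorting for this command.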
+sub ParseInteractiveArgs {
+  my $args = shift;
+  my $focus = "";
+  my $ignore = "";
+  my @x = split(/ +/, $args);
+  foreach $a (@x) {
+    if ($a =~ m/^(--|-)lines$/) {
+      $main::opt_lines = 1;
+    } elsif ($a =~ m/^(--|-)cum$/) {
+      $main::opt_cum = 1;
+    } elsif ($a =~ m/^-(.*)/) {
+      $ignore .= (($ignore ne "") ? "|" : "" ) . $1;
+    } else {
+      $focus .= (($focus ne "") ? "|" : "" ) . $a;
+    }
+  }
+  if ($ignore ne "") {
+    print STDERR "Ignoring samples in call stacks that match '$ignore'\n";
+  }
+  return ($focus, $ignore);
+}
+
+##### Output code #####
+
+sub TempName {
+  my $fnum = shift;
+  my $ext = shift;
+  my $file = "$main::tmpfile_ps.$fnum.$ext";
+  $main::tempnames{$file} = 1;
+  return $file;
+}
+
+# Print profile data in packed binary format (64-bit) to standard out
+sub PrintProfileData {
+  my $profile = shift;
+
+  # print header (64-bit style)
+  # (zero) (header-size) (version) (sample-period) (zero)
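+  # Each 64-bit word below is emitted as two native-endian 32-bit halves
+  # via pack('L*').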
+  print pack('L*', 0, 0, 3, 0, 0, 0, 1, 0, 0, 0);
+
+  foreach my $k (keys(%{$profile})) {
+    my $count = $profile->{$k};
+    my @addrs = split(/\n/, $k);
+    if ($#addrs >= 0) {
+      my $depth = $#addrs + 1;
+      # int(foo / 2**32) is the only reliable way to get rid of bottom
+      # 32 bits on both 32- and 64-bit systems.
+      print pack('L*', $count & 0xFFFFFFFF, int($count / 2**32));
+      print pack('L*', $depth & 0xFFFFFFFF, int($depth / 2**32));
+
+      foreach my $full_addr (@addrs) {
+        my $addr = $full_addr;
+        $addr =~ s/0x0*//;  # strip off leading 0x, zeroes
+        if (length($addr) > 16) {
+          print STDERR "Invalid address in profile: $full_addr\n";
+          next;
+        }
+        my $low_addr = substr($addr, -8);       # get last 8 hex chars
+        my $high_addr = substr($addr, -16, 8);  # get up to 8 more hex chars
+        print pack('L*', hex('0x' . $low_addr), hex('0x' . $high_addr));
+      }
+    }
+  }
+}
+
+# Print symbols and profile data
+sub PrintSymbolizedProfile {
+  my $symbols = shift;
+  my $profile = shift;
+  my $prog = shift;
+
+  $SYMBOL_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $symbol_marker = $&;
+
+  print '--- ', $symbol_marker, "\n";
+  if (defined($prog)) {
+    print 'binary=', $prog, "\n";
+  }
+  while (my ($pc, $name) = each(%{$symbols})) {
+    my $sep = ' ';
+    print '0x', $pc;
+    # We have a list of function names, which include the inlined
+    # calls.  They are separated (and terminated) by --, which is
+    # illegal in function names.
+    for (my $j = 2; $j <= $#{$name}; $j += 3) {
+      print $sep, $name->[$j];
+      $sep = '--';
+    }
+    print "\n";
+  }
+  print '---', "\n";
+
+  my $profile_marker;
+  if ($main::profile_type eq 'heap') {
+    $HEAP_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+    $profile_marker = $&;
+  } elsif ($main::profile_type eq 'growth') {
+    $GROWTH_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+    $profile_marker = $&;
+  } elsif ($main::profile_type eq 'contention') {
+    $CONTENTION_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+    $profile_marker = $&;
+  } else { # elsif ($main::profile_type eq 'cpu')
+    $PROFILE_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+    $profile_marker = $&;
+  }
+
+  print '--- ', $profile_marker, "\n";
+  if (defined($main::collected_profile)) {
+    # if used with remote fetch, simply dump the collected profile to output.
+    open(SRC, "<$main::collected_profile");
+    while (<SRC>) {
+      print $_;
+    }
+    close(SRC);
+  } else {
+    # --raw/http: For everything to work correctly for non-remote profiles, we
+    # would need to extend PrintProfileData() to handle all possible profile
+    # types, re-enable the code that is currently disabled in ReadCPUProfile()
+    # and FixCallerAddresses(), and remove the remote profile dumping code in
+    # the block above.
+    die "--raw/http: jeprof can only dump remote profiles for --raw\n";
+    # dump a cpu-format profile to standard out
+    PrintProfileData($profile);
+  }
+}
+
+# Print text output
+sub PrintText {
+  my $symbols = shift;
+  my $flat = shift;
+  my $cumulative = shift;
+  my $line_limit = shift;
+
+  my $total = TotalProfile($flat);
+
+  # Which profile to sort by?
+  my $s = $main::opt_cum ? $cumulative : $flat;
+
+  my $running_sum = 0;
+  my $lines = 0;
+  foreach my $k (sort { GetEntry($s, $b) <=> GetEntry($s, $a) || $a cmp $b }
+                 keys(%{$cumulative})) {
+    my $f = GetEntry($flat, $k);
+    my $c = GetEntry($cumulative, $k);
+    $running_sum += $f;
+
+    my $sym = $k;
+    if (exists($symbols->{$k})) {
+      $sym = $symbols->{$k}->[0] . " " . $symbols->{$k}->[1];
+      if ($main::opt_addresses) {
+        $sym = $k . " " . $sym;
+      }
+    }
+
+    if ($f != 0 || $c != 0) {
+      printf("%8s %6s %6s %8s %6s %s\n",
+             Unparse($f),
+             Percent($f, $total),
+             Percent($running_sum, $total),
+             Unparse($c),
+             Percent($c, $total),
+             $sym);
+    }
+    $lines++;
+    last if ($line_limit >= 0 && $lines >= $line_limit);
+  }
+}
+
+# Callgrind format has a compression for repeated function and file
+# names.  You show the name the first time, and just use its number
+# subsequently.  This can cut down the file to about a third or a
+# quarter of its uncompressed size.  $key and $val are the key/value
+# pair that would normally be printed by callgrind; $map is a map from
+# value to number.
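+# E.g. the first time "/path/foo.cc" (an illustrative name) is seen, this
+# emits "fl=(1) /path/foo.cc"; later references to the same file emit just
+# "fl=(1)".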
+sub CompressedCGName {
+  my($key, $val, $map) = @_;
+  my $idx = $map->{$val};
+  # For very short values, providing an index hurts rather than helps.
+  if (length($val) <= 3) {
+    return "$key=$val\n";
+  } elsif (defined($idx)) {
+    return "$key=($idx)\n";
+  } else {
+    # scalar(keys(%{$map})) gives the number of items in the map.
+    $idx = scalar(keys(%{$map})) + 1;
+    $map->{$val} = $idx;
+    return "$key=($idx) $val\n";
+  }
+}
+
+# Print the call graph in a way that's suitable for callgrind.
+sub PrintCallgrind {
+  my $calls = shift;
+  my $filename;
+  my %filename_to_index_map;
+  my %fnname_to_index_map;
+
+  if ($main::opt_interactive) {
+    $filename = shift;
+    print STDERR "Writing callgrind file to '$filename'.\n"
+  } else {
+    $filename = "&STDOUT";
+  }
+  open(CG, ">$filename");
+  printf CG ("events: Hits\n\n");
+  foreach my $call ( map { $_->[0] }
+                     sort { $a->[1] cmp $b ->[1] ||
+                            $a->[2] <=> $b->[2] }
+                     map { /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/;
+                           [$_, $1, $2] }
+                     keys %$calls ) {
+    my $count = int($calls->{$call});
+    $call =~ /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/;
+    my ( $caller_file, $caller_line, $caller_function,
+         $callee_file, $callee_line, $callee_function ) =
+       ( $1, $2, $3, $5, $6, $7 );
+
+    # TODO(csilvers): for better compression, collect all the
+    # caller/callee_files and functions first, before printing
+    # anything, and only compress those referenced more than once.
+    printf CG CompressedCGName("fl", $caller_file, \%filename_to_index_map);
+    printf CG CompressedCGName("fn", $caller_function, \%fnname_to_index_map);
+    if (defined $6) {
+      printf CG CompressedCGName("cfl", $callee_file, \%filename_to_index_map);
+      printf CG CompressedCGName("cfn", $callee_function, \%fnname_to_index_map);
+      printf CG ("calls=$count $callee_line\n");
+    }
+    printf CG ("$caller_line $count\n\n");
+  }
+}
+
+# Print disassembly for all routines that match $main::opt_disasm
+sub PrintDisassembly {
+  my $libs = shift;
+  my $flat = shift;
+  my $cumulative = shift;
+  my $disasm_opts = shift;
+
+  my $total = TotalProfile($flat);
+
+  foreach my $lib (@{$libs}) {
+    my $symbol_table = GetProcedureBoundaries($lib->[0], $disasm_opts);
+    my $offset = AddressSub($lib->[1], $lib->[3]);
+    foreach my $routine (sort ByName keys(%{$symbol_table})) {
+      my $start_addr = $symbol_table->{$routine}->[0];
+      my $end_addr = $symbol_table->{$routine}->[1];
+      # See if there are any samples in this routine
+      my $length = hex(AddressSub($end_addr, $start_addr));
+      my $addr = AddressAdd($start_addr, $offset);
+      for (my $i = 0; $i < $length; $i++) {
+        if (defined($cumulative->{$addr})) {
+          PrintDisassembledFunction($lib->[0], $offset,
+                                    $routine, $flat, $cumulative,
+                                    $start_addr, $end_addr, $total);
+          last;
+        }
+        $addr = AddressInc($addr);
+      }
+    }
+  }
+}
+
+# Return reference to array of tuples of the form:
+#       [start_address, filename, linenumber, instruction, limit_address]
+# E.g.,
+#       ["0x806c43d", "/foo/bar.cc", 131, "ret", "0x806c440"]
+sub Disassemble {
+  my $prog = shift;
+  my $offset = shift;
+  my $start_addr = shift;
+  my $end_addr = shift;
+
+  my $objdump = $obj_tool_map{"objdump"};
+  my $cmd = ShellEscape($objdump, "-C", "-d", "-l", "--no-show-raw-insn",
+                        "--start-address=0x$start_addr",
+                        "--stop-address=0x$end_addr", $prog);
+  open(OBJDUMP, "$cmd |") || error("$cmd: $!\n");
+  my @result = ();
+  my $filename = "";
+  my $linenumber = -1;
+  my $last = ["", "", "", ""];
+  while (<OBJDUMP>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    chop;
+    if (m|\s*([^:\s]+):(\d+)\s*$|) {
+      # Location line of the form:
+      #   <filename>:<linenumber>
+      $filename = $1;
+      $linenumber = $2;
+    } elsif (m/^ +([0-9a-f]+):\s*(.*)/) {
+      # Disassembly line -- zero-extend address to full length
+      my $addr = HexExtend($1);
+      my $k = AddressAdd($addr, $offset);
+      $last->[4] = $k;   # Store ending address for previous instruction
+      $last = [$k, $filename, $linenumber, $2, $end_addr];
+      push(@result, $last);
+    }
+  }
+  close(OBJDUMP);
+  return @result;
+}
+
+# The input file should contain lines that are either /proc/maps-like
+# output (the same format as expected from the profiles) or that look
+# like hex addresses (like "0xDEADBEEF").  We will parse all
+# /proc/maps output, and for all the hex addresses, we will output
+# "short" symbol names, one per line, in the same order as the input.
+sub PrintSymbols {
+  my $maps_and_symbols_file = shift;
+
+  # ParseLibraries expects pcs to be in a set.  Fine by us...
+  my @pclist = ();   # pcs in sorted order
+  my $pcs = {};
+  my $map = "";
+  foreach my $line (<$maps_and_symbols_file>) {
+    $line =~ s/\r//g;    # turn windows-looking lines into unix-looking lines
+    if ($line =~ /\b(0x[0-9a-f]+)\b/i) {
+      push(@pclist, HexExtend($1));
+      $pcs->{$pclist[-1]} = 1;
+    } else {
+      $map .= $line;
+    }
+  }
+
+  my $libs = ParseLibraries($main::prog, $map, $pcs);
+  my $symbols = ExtractSymbols($libs, $pcs);
+
+  foreach my $pc (@pclist) {
+    # ->[0] is the shortname, ->[2] is the full name
+    print(($symbols->{$pc}->[0] || "??") . "\n");
+  }
+}
+
+
+# For sorting functions by name
+sub ByName {
+  return ShortFunctionName($a) cmp ShortFunctionName($b);
+}
+
+# Print source-listing for all routines that match $list_opts
+sub PrintListing {
+  my $total = shift;
+  my $libs = shift;
+  my $flat = shift;
+  my $cumulative = shift;
+  my $list_opts = shift;
+  my $html = shift;
+
+  my $output = \*STDOUT;
+  my $fname = "";
+
+  if ($html) {
+    # Arrange to write the output to a temporary file
+    $fname = TempName($main::next_tmpfile, "html");
+    $main::next_tmpfile++;
+    if (!open(TEMP, ">$fname")) {
+      print STDERR "$fname: $!\n";
+      return;
+    }
+    $output = \*TEMP;
+    print $output HtmlListingHeader();
+    printf $output ("<div class=\"legend\">%s<br>Total: %s %s</div>\n",
+                    $main::prog, Unparse($total), Units());
+  }
+
+  my $listed = 0;
+  foreach my $lib (@{$libs}) {
+    my $symbol_table = GetProcedureBoundaries($lib->[0], $list_opts);
+    my $offset = AddressSub($lib->[1], $lib->[3]);
+    foreach my $routine (sort ByName keys(%{$symbol_table})) {
+      # Print if there are any samples in this routine
+      my $start_addr = $symbol_table->{$routine}->[0];
+      my $end_addr = $symbol_table->{$routine}->[1];
+      my $length = hex(AddressSub($end_addr, $start_addr));
+      my $addr = AddressAdd($start_addr, $offset);
+      for (my $i = 0; $i < $length; $i++) {
+        if (defined($cumulative->{$addr})) {
+          $listed += PrintSource(
+            $lib->[0], $offset,
+            $routine, $flat, $cumulative,
+            $start_addr, $end_addr,
+            $html,
+            $output);
+          last;
+        }
+        $addr = AddressInc($addr);
+      }
+    }
+  }
+
+  if ($html) {
+    if ($listed > 0) {
+      print $output HtmlListingFooter();
+      close($output);
+      RunWeb($fname);
+    } else {
+      close($output);
+      unlink($fname);
+    }
+  }
+}
+
+sub HtmlListingHeader {
+  return <<'EOF';
+<!DOCTYPE html>
+<html>
+<head>
+<title>Pprof listing</title>
+<style type="text/css">
+body {
+  font-family: sans-serif;
+}
+h1 {
+  font-size: 1.5em;
+  margin-bottom: 4px;
+}
+.legend {
+  font-size: 1.25em;
+}
+.line {
+  color: #aaaaaa;
+}
+.nop {
+  color: #aaaaaa;
+}
+.unimportant {
+  color: #cccccc;
+}
+.disasmloc {
+  color: #000000;
+}
+.deadsrc {
+  cursor: pointer;
+}
+.deadsrc:hover {
+  background-color: #eeeeee;
+}
+.livesrc {
+  color: #0000ff;
+  cursor: pointer;
+}
+.livesrc:hover {
+  background-color: #eeeeee;
+}
+.asm {
+  color: #008800;
+  display: none;
+}
+</style>
+<script type="text/javascript">
+function jeprof_toggle_asm(e) {
+  var target;
+  if (!e) e = window.event;
+  if (e.target) target = e.target;
+  else if (e.srcElement) target = e.srcElement;
+
+  if (target) {
+    var asm = target.nextSibling;
+    if (asm && asm.className == "asm") {
+      asm.style.display = (asm.style.display == "block" ? "" : "block");
+      e.preventDefault();
+      return false;
+    }
+  }
+}
+</script>
+</head>
+<body>
+EOF
+}
+
+sub HtmlListingFooter {
+  return <<'EOF';
+</body>
+</html>
+EOF
+}
+
+sub HtmlEscape {
+  my $text = shift;
+  $text =~ s/&/&amp;/g;
+  $text =~ s/</&lt;/g;
+  $text =~ s/>/&gt;/g;
+  return $text;
+}
+
+# Returns the indentation of the line, if it has any non-whitespace
+# characters.  Otherwise, returns -1.
+sub Indentation {
+  my $line = shift;
+  if ($line =~ m/^(\s*)\S/) {
+    return length($1);
+  } else {
+    return -1;
+  }
+}
+
+# If the symbol table contains inlining info, Disassemble() may tag an
+# instruction with a location inside an inlined function.  But for
+# source listings, we prefer to use the location in the function we
+# are listing.  So use MapToSymbols() to fetch full location
+# information for each instruction and then pick out the first
+# location from a location list (location list contains callers before
+# callees in case of inlining).
+#
+# After this routine has run, each entry in $instructions contains:
+#   [0] start address
+#   [1] filename for function we are listing
+#   [2] line number for function we are listing
+#   [3] disassembly
+#   [4] limit address
+#   [5] most specific filename (may be different from [1] due to inlining)
+#   [6] most specific line number (may be different from [2] due to inlining)
+sub GetTopLevelLineNumbers {
+  my ($lib, $offset, $instructions) = @_;
+  my $pcs = [];
+  for (my $i = 0; $i <= $#{$instructions}; $i++) {
+    push(@{$pcs}, $instructions->[$i]->[0]);
+  }
+  my $symbols = {};
+  MapToSymbols($lib, $offset, $pcs, $symbols);
+  for (my $i = 0; $i <= $#{$instructions}; $i++) {
+    my $e = $instructions->[$i];
+    push(@{$e}, $e->[1]);
+    push(@{$e}, $e->[2]);
+    my $addr = $e->[0];
+    my $sym = $symbols->{$addr};
+    if (defined($sym)) {
+      if ($#{$sym} >= 2 && $sym->[1] =~ m/^(.*):(\d+)$/) {
+        $e->[1] = $1;  # File name
+        $e->[2] = $2;  # Line number
+      }
+    }
+  }
+}
+
+# Print source-listing for one routine
+sub PrintSource {
+  my $prog = shift;
+  my $offset = shift;
+  my $routine = shift;
+  my $flat = shift;
+  my $cumulative = shift;
+  my $start_addr = shift;
+  my $end_addr = shift;
+  my $html = shift;
+  my $output = shift;
+
+  # Disassemble all instructions (just to get line numbers)
+  my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr);
+  GetTopLevelLineNumbers($prog, $offset, \@instructions);
+
+  # Hack 1: assume that the first source file encountered in the
+  # disassembly contains the routine
+  my $filename = undef;
+  for (my $i = 0; $i <= $#instructions; $i++) {
+    if ($instructions[$i]->[2] >= 0) {
+      $filename = $instructions[$i]->[1];
+      last;
+    }
+  }
+  if (!defined($filename)) {
+    print STDERR "no filename found in $routine\n";
+    return 0;
+  }
+
+  # Hack 2: assume that the largest line number from $filename is the
+  # end of the procedure.  This is typically safe since if P1 contains
+  # an inlined call to P2, then P2 usually occurs earlier in the
+  # source file.  If this does not work, we might have to compute a
+  # density profile or just print all regions we find.
+  my $lastline = 0;
+  for (my $i = 0; $i <= $#instructions; $i++) {
+    my $f = $instructions[$i]->[1];
+    my $l = $instructions[$i]->[2];
+    if (($f eq $filename) && ($l > $lastline)) {
+      $lastline = $l;
+    }
+  }
+
+  # Hack 3: assume the first source location from "filename" is the start of
+  # the source code.
+  my $firstline = 1;
+  for (my $i = 0; $i <= $#instructions; $i++) {
+    if ($instructions[$i]->[1] eq $filename) {
+      $firstline = $instructions[$i]->[2];
+      last;
+    }
+  }
+
+  # Hack 4: Extend last line forward until its indentation is less than
+  # the indentation we saw on $firstline
+  my $oldlastline = $lastline;
+  {
+    if (!open(FILE, "<$filename")) {
+      print STDERR "$filename: $!\n";
+      return 0;
+    }
+    my $l = 0;
+    my $first_indentation = -1;
+    while (<FILE>) {
+      s/\r//g;         # turn windows-looking lines into unix-looking lines
+      $l++;
+      my $indent = Indentation($_);
+      if ($l >= $firstline) {
+        if ($first_indentation < 0 && $indent >= 0) {
+          $first_indentation = $indent;
+          last if ($first_indentation == 0);
+        }
+      }
+      if ($l >= $lastline && $indent >= 0) {
+        if ($indent >= $first_indentation) {
+          $lastline = $l+1;
+        } else {
+          last;
+        }
+      }
+    }
+    close(FILE);
+  }
+
+  # Assign all samples to the range $firstline,$lastline.
+  # Hack 5: If an instruction does not occur in the range, its samples
+  # are moved to the next instruction that occurs in the range.
+  my $samples1 = {};        # Map from line number to flat count
+  my $samples2 = {};        # Map from line number to cumulative count
+  my $running1 = 0;         # Unassigned flat counts
+  my $running2 = 0;         # Unassigned cumulative counts
+  my $total1 = 0;           # Total flat counts
+  my $total2 = 0;           # Total cumulative counts
+  my %disasm = ();          # Map from line number to disassembly
+  my $running_disasm = "";  # Unassigned disassembly
+  my $skip_marker = "---\n";
+  if ($html) {
+    $skip_marker = "";
+    for (my $l = $firstline; $l <= $lastline; $l++) {
+      $disasm{$l} = "";
+    }
+  }
+  my $last_dis_filename = '';
+  my $last_dis_linenum = -1;
+  my $last_touched_line = -1;  # To detect gaps in disassembly for a line
+  foreach my $e (@instructions) {
+    # Add up counts for all addresses that fall inside this instruction
+    my $c1 = 0;
+    my $c2 = 0;
+    for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) {
+      $c1 += GetEntry($flat, $a);
+      $c2 += GetEntry($cumulative, $a);
+    }
+
+    if ($html) {
+      my $dis = sprintf("      %6s %6s \t\t%8s: %s ",
+                        HtmlPrintNumber($c1),
+                        HtmlPrintNumber($c2),
+                        UnparseAddress($offset, $e->[0]),
+                        CleanDisassembly($e->[3]));
+
+      # Append the most specific source line associated with this instruction
+      if (length($dis) < 80) { $dis .= (' ' x (80 - length($dis))) };
+      $dis = HtmlEscape($dis);
+      my $f = $e->[5];
+      my $l = $e->[6];
+      if ($f ne $last_dis_filename) {
+        $dis .= sprintf("<span class=disasmloc>%s:%d</span>",
+                        HtmlEscape(CleanFileName($f)), $l);
+      } elsif ($l ne $last_dis_linenum) {
+        # De-emphasize the unchanged file name portion
+        $dis .= sprintf("<span class=unimportant>%s</span>" .
+                        "<span class=disasmloc>:%d</span>",
+                        HtmlEscape(CleanFileName($f)), $l);
+      } else {
+        # De-emphasize the entire location
+        $dis .= sprintf("<span class=unimportant>%s:%d</span>",
+                        HtmlEscape(CleanFileName($f)), $l);
+      }
+      $last_dis_filename = $f;
+      $last_dis_linenum = $l;
+      $running_disasm .= $dis;
+      $running_disasm .= "\n";
+    }
+
+    $running1 += $c1;
+    $running2 += $c2;
+    $total1 += $c1;
+    $total2 += $c2;
+    my $file = $e->[1];
+    my $line = $e->[2];
+    if (($file eq $filename) &&
+        ($line >= $firstline) &&
+        ($line <= $lastline)) {
+      # Assign all accumulated samples to this line
+      AddEntry($samples1, $line, $running1);
+      AddEntry($samples2, $line, $running2);
+      $running1 = 0;
+      $running2 = 0;
+      if ($html) {
+        if ($line != $last_touched_line && $disasm{$line} ne '') {
+          $disasm{$line} .= "\n";
+        }
+        $disasm{$line} .= $running_disasm;
+        $running_disasm = '';
+        $last_touched_line = $line;
+      }
+    }
+  }
+
+  # Assign any leftover samples to $lastline
+  AddEntry($samples1, $lastline, $running1);
+  AddEntry($samples2, $lastline, $running2);
+  if ($html) {
+    if ($lastline != $last_touched_line && $disasm{$lastline} ne '') {
+      $disasm{$lastline} .= "\n";
+    }
+    $disasm{$lastline} .= $running_disasm;
+  }
+
+  if ($html) {
+    printf $output (
+      "<h1>%s</h1>%s\n<pre onClick=\"jeprof_toggle_asm()\">\n" .
+      "Total:%6s %6s (flat / cumulative %s)\n",
+      HtmlEscape(ShortFunctionName($routine)),
+      HtmlEscape(CleanFileName($filename)),
+      Unparse($total1),
+      Unparse($total2),
+      Units());
+  } else {
+    printf $output (
+      "ROUTINE ====================== %s in %s\n" .
+      "%6s %6s Total %s (flat / cumulative)\n",
+      ShortFunctionName($routine),
+      CleanFileName($filename),
+      Unparse($total1),
+      Unparse($total2),
+      Units());
+  }
+  if (!open(FILE, "<$filename")) {
+    print STDERR "$filename: $!\n";
+    return 0;
+  }
+  my $l = 0;
+  while (<FILE>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    $l++;
+    if ($l >= $firstline - 5 &&
+        (($l <= $oldlastline + 5) || ($l <= $lastline))) {
+      chop;
+      my $text = $_;
+      if ($l == $firstline) { print $output $skip_marker; }
+      my $n1 = GetEntry($samples1, $l);
+      my $n2 = GetEntry($samples2, $l);
+      if ($html) {
+        # Emit a span that has one of the following classes:
+        #    livesrc -- has samples
+        #    deadsrc -- has disassembly, but with no samples
+        #    nop     -- has no matching disassembly
+        # Also emit an optional span containing disassembly.
+        my $dis = $disasm{$l};
+        my $asm = "";
+        if (defined($dis) && $dis ne '') {
+          $asm = "<span class=\"asm\">" . $dis . "</span>";
+        }
+        my $source_class = (($n1 + $n2 > 0)
+                            ? "livesrc"
+                            : (($asm ne "") ? "deadsrc" : "nop"));
+        printf $output (
+          "<span class=\"line\">%5d</span> " .
+          "<span class=\"%s\">%6s %6s %s</span>%s\n",
+          $l, $source_class,
+          HtmlPrintNumber($n1),
+          HtmlPrintNumber($n2),
+          HtmlEscape($text),
+          $asm);
+      } else {
+        printf $output(
+          "%6s %6s %4d: %s\n",
+          UnparseAlt($n1),
+          UnparseAlt($n2),
+          $l,
+          $text);
+      }
+      if ($l == $lastline)  { print $output $skip_marker; }
+    };
+  }
+  close(FILE);
+  if ($html) {
+    print $output "</pre>\n";
+  }
+  return 1;
+}
+
+# Return the source line for the specified file/linenumber.
+# Returns undef if not found.
+sub SourceLine {
+  my $file = shift;
+  my $line = shift;
+
+  # Look in cache
+  if (!defined($main::source_cache{$file})) {
+    if (100 < scalar keys(%main::source_cache)) {
+      # Clear the cache when it gets too big
+      %main::source_cache = ();
+    }
+
+    # Read all lines from the file
+    if (!open(FILE, "<$file")) {
+      print STDERR "$file: $!\n";
+      $main::source_cache{$file} = [];  # Cache the negative result
+      return undef;
+    }
+    my $lines = [];
+    push(@{$lines}, "");        # So we can use 1-based line numbers as indices
+    while (<FILE>) {
+      push(@{$lines}, $_);
+    }
+    close(FILE);
+
+    # Save the lines in the cache
+    $main::source_cache{$file} = $lines;
+  }
+
+  my $lines = $main::source_cache{$file};
+  if (($line < 0) || ($line > $#{$lines})) {
+    return undef;
+  } else {
+    return $lines->[$line];
+  }
+}
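+
+# Example usage (hypothetical file name):
+#   my $text = SourceLine("foo.cc", 42);   # line 42 of foo.cc, or undef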
+
+# Print disassembly for one routine with interspersed source if available
+sub PrintDisassembledFunction {
+  my $prog = shift;
+  my $offset = shift;
+  my $routine = shift;
+  my $flat = shift;
+  my $cumulative = shift;
+  my $start_addr = shift;
+  my $end_addr = shift;
+  my $total = shift;
+
+  # Disassemble all instructions
+  my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr);
+
+  # Make array of counts per instruction
+  my @flat_count = ();
+  my @cum_count = ();
+  my $flat_total = 0;
+  my $cum_total = 0;
+  foreach my $e (@instructions) {
+    # Add up counts for all addresses that fall inside this instruction
+    my $c1 = 0;
+    my $c2 = 0;
+    for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) {
+      $c1 += GetEntry($flat, $a);
+      $c2 += GetEntry($cumulative, $a);
+    }
+    push(@flat_count, $c1);
+    push(@cum_count, $c2);
+    $flat_total += $c1;
+    $cum_total += $c2;
+  }
+
+  # Print header with total counts
+  printf("ROUTINE ====================== %s\n" .
+         "%6s %6s %s (flat, cumulative) %.1f%% of total\n",
+         ShortFunctionName($routine),
+         Unparse($flat_total),
+         Unparse($cum_total),
+         Units(),
+         ($cum_total * 100.0) / $total);
+
+  # Process instructions in order
+  my $current_file = "";
+  for (my $i = 0; $i <= $#instructions; ) {
+    my $e = $instructions[$i];
+
+    # Print the new file name whenever we switch files
+    if ($e->[1] ne $current_file) {
+      $current_file = $e->[1];
+      my $fname = $current_file;
+      $fname =~ s|^\./||;   # Trim leading "./"
+
+      # Shorten long file names
+      if (length($fname) >= 58) {
+        $fname = "..." . substr($fname, -55);
+      }
+      printf("-------------------- %s\n", $fname);
+    }
+
+    # TODO: Compute range of lines to print together to deal with
+    # small reorderings.
+    my $first_line = $e->[2];
+    my $last_line = $first_line;
+    my %flat_sum = ();
+    my %cum_sum = ();
+    for (my $l = $first_line; $l <= $last_line; $l++) {
+      $flat_sum{$l} = 0;
+      $cum_sum{$l} = 0;
+    }
+
+    # Find run of instructions for this range of source lines
+    my $first_inst = $i;
+    while (($i <= $#instructions) &&
+           ($instructions[$i]->[2] >= $first_line) &&
+           ($instructions[$i]->[2] <= $last_line)) {
+      $e = $instructions[$i];
+      $flat_sum{$e->[2]} += $flat_count[$i];
+      $cum_sum{$e->[2]} += $cum_count[$i];
+      $i++;
+    }
+    my $last_inst = $i - 1;
+
+    # Print source lines
+    for (my $l = $first_line; $l <= $last_line; $l++) {
+      my $line = SourceLine($current_file, $l);
+      if (!defined($line)) {
+        next;   # no source text available for this line
+      } else {
+        $line =~ s/^\s+//;
+      }
+      printf("%6s %6s %5d: %s",
+             UnparseAlt($flat_sum{$l}),
+             UnparseAlt($cum_sum{$l}),
+             $l,
+             $line);
+    }
+
+    # Print disassembly
+    for (my $x = $first_inst; $x <= $last_inst; $x++) {
+      my $e = $instructions[$x];
+      printf("%6s %6s    %8s: %6s\n",
+             UnparseAlt($flat_count[$x]),
+             UnparseAlt($cum_count[$x]),
+             UnparseAddress($offset, $e->[0]),
+             CleanDisassembly($e->[3]));
+    }
+  }
+}
+
+# Print DOT graph
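+#
+# For reference (values below are illustrative, not actual output), each
+# kept symbol becomes a DOT node "N<id>" and each caller/callee pair an
+# edge; a node line looks roughly like:
+#   N1 [label="main\n100 (50.0%)\r",shape=box,fontsize=32.0];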
+sub PrintDot {
+  my $prog = shift;
+  my $symbols = shift;
+  my $raw = shift;
+  my $flat = shift;
+  my $cumulative = shift;
+  my $overall_total = shift;
+
+  # Get total
+  my $local_total = TotalProfile($flat);
+  my $nodelimit = int($main::opt_nodefraction * $local_total);
+  my $edgelimit = int($main::opt_edgefraction * $local_total);
+  my $nodecount = $main::opt_nodecount;
+
+  # Find nodes to include
+  my @list = (sort { abs(GetEntry($cumulative, $b)) <=>
+                     abs(GetEntry($cumulative, $a))
+                     || $a cmp $b }
+              keys(%{$cumulative}));
+  my $last = $nodecount - 1;
+  if ($last > $#list) {
+    $last = $#list;
+  }
+  while (($last >= 0) &&
+         (abs(GetEntry($cumulative, $list[$last])) <= $nodelimit)) {
+    $last--;
+  }
+  if ($last < 0) {
+    print STDERR "No nodes to print\n";
+    return 0;
+  }
+
+  if ($nodelimit > 0 || $edgelimit > 0) {
+    printf STDERR ("Dropping nodes with <= %s %s; edges with <= %s abs(%s)\n",
+                   Unparse($nodelimit), Units(),
+                   Unparse($edgelimit), Units());
+  }
+
+  # Open DOT output file
+  my $output;
+  my $escaped_dot = ShellEscape(@DOT);
+  my $escaped_ps2pdf = ShellEscape(@PS2PDF);
+  if ($main::opt_gv) {
+    my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "ps"));
+    $output = "| $escaped_dot -Tps2 >$escaped_outfile";
+  } elsif ($main::opt_evince) {
+    my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "pdf"));
+    $output = "| $escaped_dot -Tps2 | $escaped_ps2pdf - $escaped_outfile";
+  } elsif ($main::opt_ps) {
+    $output = "| $escaped_dot -Tps2";
+  } elsif ($main::opt_pdf) {
+    $output = "| $escaped_dot -Tps2 | $escaped_ps2pdf - -";
+  } elsif ($main::opt_web || $main::opt_svg) {
+    # We need to post-process the SVG, so write to a temporary file always.
+    my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "svg"));
+    $output = "| $escaped_dot -Tsvg >$escaped_outfile";
+  } elsif ($main::opt_gif) {
+    $output = "| $escaped_dot -Tgif";
+  } else {
+    $output = ">&STDOUT";
+  }
+  open(DOT, $output) || error("$output: $!\n");
+
+  # Title
+  printf DOT ("digraph \"%s; %s %s\" {\n",
+              $prog,
+              Unparse($overall_total),
+              Units());
+  if ($main::opt_pdf) {
+    # The output is more printable if we set the page size for dot.
+    printf DOT ("size=\"8,11\"\n");
+  }
+  printf DOT ("node [width=0.375,height=0.25];\n");
+
+  # Print legend
+  printf DOT ("Legend [shape=box,fontsize=24,shape=plaintext," .
+              "label=\"%s\\l%s\\l%s\\l%s\\l%s\\l\"];\n",
+              $prog,
+              sprintf("Total %s: %s", Units(), Unparse($overall_total)),
+              sprintf("Focusing on: %s", Unparse($local_total)),
+              sprintf("Dropped nodes with <= %s abs(%s)",
+                      Unparse($nodelimit), Units()),
+              sprintf("Dropped edges with <= %s %s",
+                      Unparse($edgelimit), Units())
+              );
+
+  # Print nodes
+  my %node = ();
+  my $nextnode = 1;
+  foreach my $a (@list[0..$last]) {
+    # Pick font size
+    my $f = GetEntry($flat, $a);
+    my $c = GetEntry($cumulative, $a);
+
+    my $fs = 8;
+    if ($local_total > 0) {
+      $fs = 8 + (50.0 * sqrt(abs($f * 1.0 / $local_total)));
+    }
+
+    $node{$a} = $nextnode++;
+    my $sym = $a;
+    $sym =~ s/\s+/\\n/g;
+    $sym =~ s/::/\\n/g;
+
+    # Extra cumulative info to print for non-leaves
+    my $extra = "";
+    if ($f != $c) {
+      $extra = sprintf("\\rof %s (%s)",
+                       Unparse($c),
+                       Percent($c, $local_total));
+    }
+    my $style = "";
+    if ($main::opt_heapcheck) {
+      if ($f > 0) {
+        # make leak-causing nodes more visible (add a background)
+        $style = ",style=filled,fillcolor=gray"
+      } elsif ($f < 0) {
+        # make anti-leak-causing nodes (which almost never occur)
+        # stand out as well (triple border)
+        $style = ",peripheries=3"
+      }
+    }
+
+    printf DOT ("N%d [label=\"%s\\n%s (%s)%s\\r" .
+                "\",shape=box,fontsize=%.1f%s];\n",
+                $node{$a},
+                $sym,
+                Unparse($f),
+                Percent($f, $local_total),
+                $extra,
+                $fs,
+                $style,
+               );
+  }
+
+  # Get edges and counts per edge
+  my %edge = ();
+  my $n;
+  my $fullname_to_shortname_map = {};
+  FillFullnameToShortnameMap($symbols, $fullname_to_shortname_map);
+  foreach my $k (keys(%{$raw})) {
+    # TODO: omit low-percentage edges
+    $n = $raw->{$k};
+    my @translated = TranslateStack($symbols, $fullname_to_shortname_map, $k);
+    for (my $i = 1; $i <= $#translated; $i++) {
+      my $src = $translated[$i];
+      my $dst = $translated[$i-1];
+      #next if ($src eq $dst);  # Avoid self-edges?
+      if (exists($node{$src}) && exists($node{$dst})) {
+        my $edge_label = "$src\001$dst";
+        if (!exists($edge{$edge_label})) {
+          $edge{$edge_label} = 0;
+        }
+        $edge{$edge_label} += $n;
+      }
+    }
+  }
+
+  # Print edges (process in order of decreasing counts)
+  my %indegree = ();   # Number of incoming edges added per node so far
+  my %outdegree = ();  # Number of outgoing edges added per node so far
+  foreach my $e (sort { $edge{$b} <=> $edge{$a} } keys(%edge)) {
+    my @x = split(/\001/, $e);
+    $n = $edge{$e};
+
+    # Initialize degree of kept incoming and outgoing edges if necessary
+    my $src = $x[0];
+    my $dst = $x[1];
+    if (!exists($outdegree{$src})) { $outdegree{$src} = 0; }
+    if (!exists($indegree{$dst})) { $indegree{$dst} = 0; }
+
+    my $keep;
+    if ($indegree{$dst} == 0) {
+      # Keep edge if needed for reachability
+      $keep = 1;
+    } elsif (abs($n) <= $edgelimit) {
+      # Drop if we are below --edgefraction
+      $keep = 0;
+    } elsif ($outdegree{$src} >= $main::opt_maxdegree ||
+             $indegree{$dst} >= $main::opt_maxdegree) {
+      # Keep limited number of in/out edges per node
+      $keep = 0;
+    } else {
+      $keep = 1;
+    }
+
+    if ($keep) {
+      $outdegree{$src}++;
+      $indegree{$dst}++;
+
+      # Compute line width based on edge count
+      my $fraction = abs($local_total ? (3 * ($n / $local_total)) : 0);
+      if ($fraction > 1) { $fraction = 1; }
+      my $w = $fraction * 2;
+      if ($w < 1 && ($main::opt_web || $main::opt_svg)) {
+        # SVG output treats line widths < 1 poorly.
+        $w = 1;
+      }
+
+      # Dot sometimes segfaults if given edge weights that are too large, so
+      # we cap the weights at a large value
+      my $edgeweight = abs($n) ** 0.7;
+      if ($edgeweight > 100000) { $edgeweight = 100000; }
+      $edgeweight = int($edgeweight);
+
+      my $style = sprintf("setlinewidth(%f)", $w);
+      if ($x[1] =~ m/\(inline\)/) {
+        $style .= ",dashed";
+      }
+
+      # Use a slightly squashed function of the edge count as the weight
+      printf DOT ("N%s -> N%s [label=%s, weight=%d, style=\"%s\"];\n",
+                  $node{$x[0]},
+                  $node{$x[1]},
+                  Unparse($n),
+                  $edgeweight,
+                  $style);
+    }
+  }
+
+  print DOT ("}\n");
+  close(DOT);
+
+  if ($main::opt_web || $main::opt_svg) {
+    # Rewrite SVG to be more usable inside web browser.
+    RewriteSvg(TempName($main::next_tmpfile, "svg"));
+  }
+
+  return 1;
+}
+
+sub RewriteSvg {
+  my $svgfile = shift;
+
+  open(SVG, $svgfile) || die "open temp svg: $!";
+  my @svg = <SVG>;
+  close(SVG);
+  unlink $svgfile;
+  my $svg = join('', @svg);
+
+  # Dot's SVG output is
+  #
+  #    <svg width="___" height="___"
+  #     viewBox="___" xmlns=...>
+  #    <g id="graph0" transform="...">
+  #    ...
+  #    </g>
+  #    </svg>
+  #
+  # Change it to
+  #
+  #    <svg width="100%" height="100%"
+  #     xmlns=...>
+  #    $svg_javascript
+  #    <g id="viewport" transform="translate(0,0)">
+  #    <g id="graph0" transform="...">
+  #    ...
+  #    </g>
+  #    </g>
+  #    </svg>
+
+  # Fix width, height; drop viewBox.
+  $svg =~ s/(?s)<svg width="[^"]+" height="[^"]+"(.*?)viewBox="[^"]+"/<svg width="100%" height="100%"$1/;
+
+  # Insert script, viewport <g> above first <g>
+  my $svg_javascript = SvgJavascript();
+  my $viewport = "<g id=\"viewport\" transform=\"translate(0,0)\">\n";
+  $svg =~ s/<g id="graph\d"/$svg_javascript$viewport$&/;
+
+  # Insert final </g> above </svg>.
+  $svg =~ s/(.*)(<\/svg>)/$1<\/g>$2/;
+  $svg =~ s/<g id="graph\d"(.*?)/<g id="viewport"$1/;
+
+  if ($main::opt_svg) {
+    # --svg: write to standard output.
+    print $svg;
+  } else {
+    # Write back to temporary file.
+    open(SVG, ">$svgfile") || die "open $svgfile: $!";
+    print SVG $svg;
+    close(SVG);
+  }
+}
+
+sub SvgJavascript {
+  return <<'EOF';
+<script type="text/ecmascript"><![CDATA[
+// SVGPan
+// http://www.cyberz.org/blog/2009/12/08/svgpan-a-javascript-svg-panzoomdrag-library/
+// Local modification: if(true || ...) below to force panning, never moving.
+
+/**
+ *  SVGPan library 1.2
+ * ====================
+ *
+ * Given a unique existing element with id "viewport", including the
+ * library into any SVG adds the following capabilities:
+ *
+ *  - Mouse panning
+ *  - Mouse zooming (using the wheel)
+ *  - Object dragging
+ *
+ * Known issues:
+ *
+ *  - Zooming (while panning) on Safari has still some issues
+ *
+ * Releases:
+ *
+ * 1.2, Sat Mar 20 08:42:50 GMT 2010, Zeng Xiaohui
+ *	Fixed a bug with browser mouse handler interaction
+ *
+ * 1.1, Wed Feb  3 17:39:33 GMT 2010, Zeng Xiaohui
+ *	Updated the zoom code to support the mouse wheel on Safari/Chrome
+ *
+ * 1.0, Andrea Leofreddi
+ *	First release
+ *
+ * This code is licensed under the following BSD license:
+ *
+ * Copyright 2009-2010 Andrea Leofreddi <[email protected]>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ *    1. Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *
+ *    2. Redistributions in binary form must reproduce the above copyright notice, this list
+ *       of conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Andrea Leofreddi ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Andrea Leofreddi OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * The views and conclusions contained in the software and documentation are those of the
+ * authors and should not be interpreted as representing official policies, either expressed
+ * or implied, of Andrea Leofreddi.
+ */
+
+var root = document.documentElement;
+
+var state = 'none', stateTarget, stateOrigin, stateTf;
+
+setupHandlers(root);
+
+/**
+ * Register handlers
+ */
+function setupHandlers(root){
+	setAttributes(root, {
+		"onmousedown" : "handleMouseDown(evt)",
+		"onmousemove" : "handleMouseMove(evt)",
+		"onmouseup" : "handleMouseUp(evt)",
+		//"onmouseout" : "handleMouseUp(evt)", // Uncomment this to stop the pan functionality when dragging out of the SVG element
+	});
+
+	if(navigator.userAgent.toLowerCase().indexOf('webkit') >= 0)
+		window.addEventListener('mousewheel', handleMouseWheel, false); // Chrome/Safari
+	else
+		window.addEventListener('DOMMouseScroll', handleMouseWheel, false); // Others
+
+	var g = svgDoc.getElementById("svg");
+	g.width = "100%";
+	g.height = "100%";
+}
+
+/**
+ * Instantiate an SVGPoint object with given event coordinates.
+ */
+function getEventPoint(evt) {
+	var p = root.createSVGPoint();
+
+	p.x = evt.clientX;
+	p.y = evt.clientY;
+
+	return p;
+}
+
+/**
+ * Sets the current transform matrix of an element.
+ */
+function setCTM(element, matrix) {
+	var s = "matrix(" + matrix.a + "," + matrix.b + "," + matrix.c + "," + matrix.d + "," + matrix.e + "," + matrix.f + ")";
+
+	element.setAttribute("transform", s);
+}
+
+/**
+ * Dumps a matrix to a string (useful for debug).
+ */
+function dumpMatrix(matrix) {
+	var s = "[ " + matrix.a + ", " + matrix.c + ", " + matrix.e + "\n  " + matrix.b + ", " + matrix.d + ", " + matrix.f + "\n  0, 0, 1 ]";
+
+	return s;
+}
+
+/**
+ * Sets attributes of an element.
+ */
+function setAttributes(element, attributes){
+	for (i in attributes)
+		element.setAttributeNS(null, i, attributes[i]);
+}
+
+/**
+ * Handle mouse wheel event.
+ */
+function handleMouseWheel(evt) {
+	if(evt.preventDefault)
+		evt.preventDefault();
+
+	evt.returnValue = false;
+
+	var svgDoc = evt.target.ownerDocument;
+
+	var delta;
+
+	if(evt.wheelDelta)
+		delta = evt.wheelDelta / 3600; // Chrome/Safari
+	else
+		delta = evt.detail / -90; // Mozilla
+
+	var z = 1 + delta; // Zoom factor: 0.9/1.1
+
+	var g = svgDoc.getElementById("viewport");
+
+	var p = getEventPoint(evt);
+
+	p = p.matrixTransform(g.getCTM().inverse());
+
+	// Compute new scale matrix in current mouse position
+	var k = root.createSVGMatrix().translate(p.x, p.y).scale(z).translate(-p.x, -p.y);
+
+	setCTM(g, g.getCTM().multiply(k));
+
+	stateTf = stateTf.multiply(k.inverse());
+}
+
+/**
+ * Handle mouse move event.
+ */
+function handleMouseMove(evt) {
+	if(evt.preventDefault)
+		evt.preventDefault();
+
+	evt.returnValue = false;
+
+	var svgDoc = evt.target.ownerDocument;
+
+	var g = svgDoc.getElementById("viewport");
+
+	if(state == 'pan') {
+		// Pan mode
+		var p = getEventPoint(evt).matrixTransform(stateTf);
+
+		setCTM(g, stateTf.inverse().translate(p.x - stateOrigin.x, p.y - stateOrigin.y));
+	} else if(state == 'move') {
+		// Move mode
+		var p = getEventPoint(evt).matrixTransform(g.getCTM().inverse());
+
+		setCTM(stateTarget, root.createSVGMatrix().translate(p.x - stateOrigin.x, p.y - stateOrigin.y).multiply(g.getCTM().inverse()).multiply(stateTarget.getCTM()));
+
+		stateOrigin = p;
+	}
+}
+
+/**
+ * Handle mouse button press event.
+ */
+function handleMouseDown(evt) {
+	if(evt.preventDefault)
+		evt.preventDefault();
+
+	evt.returnValue = false;
+
+	var svgDoc = evt.target.ownerDocument;
+
+	var g = svgDoc.getElementById("viewport");
+
+	if(true || evt.target.tagName == "svg") {
+		// Pan mode
+		state = 'pan';
+
+		stateTf = g.getCTM().inverse();
+
+		stateOrigin = getEventPoint(evt).matrixTransform(stateTf);
+	} else {
+		// Move mode
+		state = 'move';
+
+		stateTarget = evt.target;
+
+		stateTf = g.getCTM().inverse();
+
+		stateOrigin = getEventPoint(evt).matrixTransform(stateTf);
+	}
+}
+
+/**
+ * Handle mouse button release event.
+ */
+function handleMouseUp(evt) {
+	if(evt.preventDefault)
+		evt.preventDefault();
+
+	evt.returnValue = false;
+
+	var svgDoc = evt.target.ownerDocument;
+
+	if(state == 'pan' || state == 'move') {
+		// Quit pan mode
+		state = '';
+	}
+}
+
+]]></script>
+EOF
+}
+
+# Provides a map from fullname to shortname for cases where the
+# shortname is ambiguous.  The symlist has both the fullname and
+# shortname for all symbols, which is usually fine, but sometimes --
+# such as overloaded functions -- two different fullnames can map to
+# the same shortname.  In that case, we use the address of the
+# function to disambiguate the two.  This function fills in a map that
+# maps fullnames to modified shortnames in such cases.  If a fullname
+# is not present in the map, the 'normal' shortname provided by the
+# symlist is the appropriate one to use.
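+#
+# Illustrative example (made-up names and addresses): if
+# "Foo::bar(int) <000000004006f0>" and "Foo::bar(double) <000000004007a0>"
+# both shorten to "Foo::bar", the map gets entries like
+#   "Foo::bar(int) <000000004006f0>" => "Foo::bar@4006f0"
+# so the two overloads remain distinguishable in the output.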
+sub FillFullnameToShortnameMap {
+  my $symbols = shift;
+  my $fullname_to_shortname_map = shift;
+  my $shortnames_seen_once = {};
+  my $shortnames_seen_more_than_once = {};
+
+  foreach my $symlist (values(%{$symbols})) {
+    # TODO(csilvers): deal with inlined symbols too.
+    my $shortname = $symlist->[0];
+    my $fullname = $symlist->[2];
+    if ($fullname !~ /<[0-9a-fA-F]+>$/) {  # fullname doesn't end in an address
+      next;       # the only collisions we care about are when addresses differ
+    }
+    if (defined($shortnames_seen_once->{$shortname}) &&
+        $shortnames_seen_once->{$shortname} ne $fullname) {
+      $shortnames_seen_more_than_once->{$shortname} = 1;
+    } else {
+      $shortnames_seen_once->{$shortname} = $fullname;
+    }
+  }
+
+  foreach my $symlist (values(%{$symbols})) {
+    my $shortname = $symlist->[0];
+    my $fullname = $symlist->[2];
+    # TODO(csilvers): take in a list of addresses we care about, and only
+    # store in the map if $symlist->[1] is in that list.  Saves space.
+    next if defined($fullname_to_shortname_map->{$fullname});
+    if (defined($shortnames_seen_more_than_once->{$shortname})) {
+      if ($fullname =~ /<0*([^>]*)>$/) {   # fullname has address at end of it
+        $fullname_to_shortname_map->{$fullname} = "$shortname\@$1";
+      }
+    }
+  }
+}
+
+# Return a small number that identifies the argument.
+# Multiple calls with the same argument will return the same number.
+# Calls with different arguments will return different numbers.
+sub ShortIdFor {
+  my $key = shift;
+  my $id = $main::uniqueid{$key};
+  if (!defined($id)) {
+    $id = keys(%main::uniqueid) + 1;
+    $main::uniqueid{$key} = $id;
+  }
+  return $id;
+}
+
+# Translate a stack of addresses into a stack of symbols
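+# The key $k is a newline-joined list of hex addresses; the output form
+# depends on the granularity options ($main::opt_addresses, $main::opt_lines,
+# $main::opt_functions, $main::opt_files).  For example (made-up values),
+# at function granularity a two-frame stack might translate to
+# ("DoWork", "main").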
+sub TranslateStack {
+  my $symbols = shift;
+  my $fullname_to_shortname_map = shift;
+  my $k = shift;
+
+  my @addrs = split(/\n/, $k);
+  my @result = ();
+  for (my $i = 0; $i <= $#addrs; $i++) {
+    my $a = $addrs[$i];
+
+    # Skip large addresses since they sometimes show up as fake entries on RH9
+    if (length($a) > 8 && $a gt "7fffffffffffffff") {
+      next;
+    }
+
+    if ($main::opt_disasm || $main::opt_list) {
+      # We want just the address for the key
+      push(@result, $a);
+      next;
+    }
+
+    my $symlist = $symbols->{$a};
+    if (!defined($symlist)) {
+      $symlist = [$a, "", $a];
+    }
+
+    # We can have a sequence of symbols for a particular entry
+    # (more than one symbol in the case of inlining).  Callers
+    # come before callees in symlist, so walk backwards since
+    # the translated stack should contain callees before callers.
+    for (my $j = $#{$symlist}; $j >= 2; $j -= 3) {
+      my $func = $symlist->[$j-2];
+      my $fileline = $symlist->[$j-1];
+      my $fullfunc = $symlist->[$j];
+      if (defined($fullname_to_shortname_map->{$fullfunc})) {
+        $func = $fullname_to_shortname_map->{$fullfunc};
+      }
+      if ($j > 2) {
+        $func = "$func (inline)";
+      }
+
+      # Do not merge nodes corresponding to Callback::Run since that
+      # causes confusing cycles in dot display.  Instead, we synthesize
+      # a unique name for this frame per caller.
+      if ($func =~ m/Callback.*::Run$/) {
+        my $caller = ($i > 0) ? $addrs[$i-1] : 0;
+        $func = "Run#" . ShortIdFor($caller);
+      }
+
+      if ($main::opt_addresses) {
+        push(@result, "$a $func $fileline");
+      } elsif ($main::opt_lines) {
+        if ($func eq '??' && $fileline eq '??:0') {
+          push(@result, "$a");
+        } else {
+          push(@result, "$func $fileline");
+        }
+      } elsif ($main::opt_functions) {
+        if ($func eq '??') {
+          push(@result, "$a");
+        } else {
+          push(@result, $func);
+        }
+      } elsif ($main::opt_files) {
+        if ($fileline eq '??:0' || $fileline eq '') {
+          push(@result, "$a");
+        } else {
+          my $f = $fileline;
+          $f =~ s/:\d+$//;
+          push(@result, $f);
+        }
+      } else {
+        push(@result, $a);
+        last;  # Do not print inlined info
+      }
+    }
+  }
+
+  # print join(",", @addrs), " => ", join(",", @result), "\n";
+  return @result;
+}
+
+# Generate percent string for a number and a total
+sub Percent {
+  my $num = shift;
+  my $tot = shift;
+  if ($tot != 0) {
+    return sprintf("%.1f%%", $num * 100.0 / $tot);
+  } else {
+    return ($num == 0) ? "nan" : (($num > 0) ? "+inf" : "-inf");
+  }
+}
+
+# Generate pretty-printed form of number
+sub Unparse {
+  my $num = shift;
+  if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') {
+    if ($main::opt_inuse_objects || $main::opt_alloc_objects) {
+      return sprintf("%d", $num);
+    } else {
+      if ($main::opt_show_bytes) {
+        return sprintf("%d", $num);
+      } else {
+        return sprintf("%.1f", $num / 1048576.0);
+      }
+    }
+  } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) {
+    return sprintf("%.3f", $num / 1e9); # Convert nanoseconds to seconds
+  } else {
+    return sprintf("%d", $num);
+  }
+}
+
+# Alternate pretty-printed form: 0 maps to "."
+sub UnparseAlt {
+  my $num = shift;
+  if ($num == 0) {
+    return ".";
+  } else {
+    return Unparse($num);
+  }
+}
+
+# Alternate pretty-printed form: 0 maps to ""
+sub HtmlPrintNumber {
+  my $num = shift;
+  if ($num == 0) {
+    return "";
+  } else {
+    return Unparse($num);
+  }
+}
+
+# Return output units
+sub Units {
+  if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') {
+    if ($main::opt_inuse_objects || $main::opt_alloc_objects) {
+      return "objects";
+    } else {
+      if ($main::opt_show_bytes) {
+        return "B";
+      } else {
+        return "MB";
+      }
+    }
+  } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) {
+    return "seconds";
+  } else {
+    return "samples";
+  }
+}
+
+##### Profile manipulation code #####
+
+# Generate flattened profile:
+# If count is charged to stack [a,b,c,d], in generated profile,
+# it will be charged to [a]
+sub FlatProfile {
+  my $profile = shift;
+  my $result = {};
+  foreach my $k (keys(%{$profile})) {
+    my $count = $profile->{$k};
+    my @addrs = split(/\n/, $k);
+    if ($#addrs >= 0) {
+      AddEntry($result, $addrs[0], $count);
+    }
+  }
+  return $result;
+}
+
+# Generate cumulative profile:
+# If count is charged to stack [a,b,c,d], in generated profile,
+# it will be charged to [a], [b], [c], [d]
+sub CumulativeProfile {
+  my $profile = shift;
+  my $result = {};
+  foreach my $k (keys(%{$profile})) {
+    my $count = $profile->{$k};
+    my @addrs = split(/\n/, $k);
+    foreach my $a (@addrs) {
+      AddEntry($result, $a, $count);
+    }
+  }
+  return $result;
+}
+
+# If the second-youngest PC on the stack is always the same, returns
+# that pc.  Otherwise, returns undef.
+sub IsSecondPcAlwaysTheSame {
+  my $profile = shift;
+
+  my $second_pc = undef;
+  foreach my $k (keys(%{$profile})) {
+    my @addrs = split(/\n/, $k);
+    if ($#addrs < 1) {
+      return undef;
+    }
+    if (not defined $second_pc) {
+      $second_pc = $addrs[1];
+    } else {
+      if ($second_pc ne $addrs[1]) {
+        return undef;
+      }
+    }
+  }
+  return $second_pc;
+}
+
+sub ExtractSymbolNameInlineStack {
+  my $symbols = shift;
+  my $address = shift;
+
+  my @stack = ();
+
+  if (exists $symbols->{$address}) {
+    my @localinlinestack = @{$symbols->{$address}};
+    for (my $i = $#localinlinestack; $i > 0; $i-=3) {
+      my $file = $localinlinestack[$i-1];
+      my $fn = $localinlinestack[$i-0];
+
+      if ($file eq "?" || $file eq ":0") {
+        $file = "??:0";
+      }
+      if ($fn eq '??') {
+        # If we can't get the symbol name, at least use the file information.
+        $fn = $file;
+      }
+      my $suffix = "[inline]";
+      if ($i == 2) {
+        $suffix = "";
+      }
+      push (@stack, $fn.$suffix);
+    }
+  }
+  else {
+    # If we can't get a symbol name, at least fill in the address.
+    push (@stack, $address);
+  }
+
+  return @stack;
+}
+
+sub ExtractSymbolLocation {
+  my $symbols = shift;
+  my $address = shift;
+  # 'addr2line' outputs "??:0" for unknown locations; we do the
+  # same to be consistent.
+  my $location = "??:0:unknown";
+  if (exists $symbols->{$address}) {
+    my $file = $symbols->{$address}->[1];
+    if ($file eq "?") {
+      $file = "??:0"
+    }
+    $location = $file . ":" . $symbols->{$address}->[0];
+  }
+  return $location;
+}
+
+# Extracts a graph of calls.
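+# Each key in the result is either a "file:line:function" location or a
+# "source -> destination" edge between two such locations, mapped to its
+# sample count, e.g. (illustrative):
+#   "foo.cc:12:main -> bar.cc:34:DoWork" => 7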
+sub ExtractCalls {
+  my $symbols = shift;
+  my $profile = shift;
+
+  my $calls = {};
+  while( my ($stack_trace, $count) = each %$profile ) {
+    my @address = split(/\n/, $stack_trace);
+    my $destination = ExtractSymbolLocation($symbols, $address[0]);
+    AddEntry($calls, $destination, $count);
+    for (my $i = 1; $i <= $#address; $i++) {
+      my $source = ExtractSymbolLocation($symbols, $address[$i]);
+      my $call = "$source -> $destination";
+      AddEntry($calls, $call, $count);
+      $destination = $source;
+    }
+  }
+
+  return $calls;
+}
+
+sub FilterFrames {
+  my $symbols = shift;
+  my $profile = shift;
+
+  if ($main::opt_retain eq '' && $main::opt_exclude eq '') {
+    return $profile;
+  }
+
+  my $result = {};
+  foreach my $k (keys(%{$profile})) {
+    my $count = $profile->{$k};
+    my @addrs = split(/\n/, $k);
+    my @path = ();
+    foreach my $a (@addrs) {
+      my $sym;
+      if (exists($symbols->{$a})) {
+        $sym = $symbols->{$a}->[0];
+      } else {
+        $sym = $a;
+      }
+      if ($main::opt_retain ne '' && $sym !~ m/$main::opt_retain/) {
+        next;
+      }
+      if ($main::opt_exclude ne '' && $sym =~ m/$main::opt_exclude/) {
+        next;
+      }
+      push(@path, $a);
+    }
+    if (scalar(@path) > 0) {
+      my $reduced_path = join("\n", @path);
+      AddEntry($result, $reduced_path, $count);
+    }
+  }
+
+  return $result;
+}
+
+sub PrintCollapsedStacks {
+  my $symbols = shift;
+  my $profile = shift;
+
+  while (my ($stack_trace, $count) = each %$profile) {
+    my @address = split(/\n/, $stack_trace);
+    my @names = reverse ( map { ExtractSymbolNameInlineStack($symbols, $_) } @address );
+    printf("%s %d\n", join(";", @names), $count);
+  }
+}
+
+sub RemoveUninterestingFrames {
+  my $symbols = shift;
+  my $profile = shift;
+
+  # List of function names to skip
+  my %skip = ();
+  my $skip_regexp = 'NOMATCH';
+  if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') {
+    foreach my $name ('@JEMALLOC_PREFIX@calloc',
+                      'cfree',
+                      '@JEMALLOC_PREFIX@malloc',
+                      'newImpl',
+                      'void* newImpl',
+                      '@JEMALLOC_PREFIX@free',
+                      '@JEMALLOC_PREFIX@memalign',
+                      '@JEMALLOC_PREFIX@posix_memalign',
+                      '@JEMALLOC_PREFIX@aligned_alloc',
+                      'pvalloc',
+                      '@JEMALLOC_PREFIX@valloc',
+                      '@JEMALLOC_PREFIX@realloc',
+                      '@JEMALLOC_PREFIX@mallocx',
+                      '@JEMALLOC_PREFIX@rallocx',
+                      '@JEMALLOC_PREFIX@xallocx',
+                      '@JEMALLOC_PREFIX@dallocx',
+                      '@JEMALLOC_PREFIX@sdallocx',
+                      '@JEMALLOC_PREFIX@sdallocx_noflags',
+                      'tc_calloc',
+                      'tc_cfree',
+                      'tc_malloc',
+                      'tc_free',
+                      'tc_memalign',
+                      'tc_posix_memalign',
+                      'tc_pvalloc',
+                      'tc_valloc',
+                      'tc_realloc',
+                      'tc_new',
+                      'tc_delete',
+                      'tc_newarray',
+                      'tc_deletearray',
+                      'tc_new_nothrow',
+                      'tc_newarray_nothrow',
+                      'do_malloc',
+                      '::do_malloc',   # new name -- got moved to an unnamed ns
+                      '::do_malloc_or_cpp_alloc',
+                      'DoSampledAllocation',
+                      'simple_alloc::allocate',
+                      '__malloc_alloc_template::allocate',
+                      '__builtin_delete',
+                      '__builtin_new',
+                      '__builtin_vec_delete',
+                      '__builtin_vec_new',
+                      'operator new',
+                      'operator new[]',
+                      # The entry to our memory-allocation routines on OS X
+                      'malloc_zone_malloc',
+                      'malloc_zone_calloc',
+                      'malloc_zone_valloc',
+                      'malloc_zone_realloc',
+                      'malloc_zone_memalign',
+                      'malloc_zone_free',
+                      # These mark the beginning/end of our custom sections
+                      '__start_google_malloc',
+                      '__stop_google_malloc',
+                      '__start_malloc_hook',
+                      '__stop_malloc_hook') {
+      $skip{$name} = 1;
+      $skip{"_" . $name} = 1;   # Mach (OS X) adds a _ prefix to everything
+    }
+    # TODO: Remove TCMalloc once everything has been
+    # moved into the tcmalloc:: namespace and we have flushed
+    # old code out of the system.
+    $skip_regexp = "TCMalloc|^tcmalloc::";
+  } elsif ($main::profile_type eq 'contention') {
+    foreach my $vname ('base::RecordLockProfileData',
+                       'base::SubmitMutexProfileData',
+                       'base::SubmitSpinLockProfileData',
+                       'Mutex::Unlock',
+                       'Mutex::UnlockSlow',
+                       'Mutex::ReaderUnlock',
+                       'MutexLock::~MutexLock',
+                       'SpinLock::Unlock',
+                       'SpinLock::SlowUnlock',
+                       'SpinLockHolder::~SpinLockHolder') {
+      $skip{$vname} = 1;
+    }
+  } elsif ($main::profile_type eq 'cpu') {
+    # Drop signal handlers used for CPU profile collection
+    # TODO(dpeng): this should not be necessary; it's taken
+    # care of by the general 2nd-pc mechanism below.
+    foreach my $name ('ProfileData::Add',           # historical
+                      'ProfileData::prof_handler',  # historical
+                      'CpuProfiler::prof_handler',
+                      '__FRAME_END__',
+                      '__pthread_sighandler',
+                      '__restore') {
+      $skip{$name} = 1;
+    }
+  } else {
+    # Nothing skipped for unknown types
+  }
+
+  if ($main::profile_type eq 'cpu') {
+    # If all the second-youngest program counters are the same,
+    # this STRONGLY suggests that it is an artifact of measurement,
+    # i.e., stack frames pushed by the CPU profiler signal handler.
+    # Hence, we delete them.
+    # (The topmost PC is read from the signal structure, not from
+    # the stack, so it does not get involved.)
+    while (my $second_pc = IsSecondPcAlwaysTheSame($profile)) {
+      my $result = {};
+      my $func = '';
+      if (exists($symbols->{$second_pc})) {
+        $second_pc = $symbols->{$second_pc}->[0];
+      }
+      print STDERR "Removing $second_pc from all stack traces.\n";
+      foreach my $k (keys(%{$profile})) {
+        my $count = $profile->{$k};
+        my @addrs = split(/\n/, $k);
+        splice @addrs, 1, 1;
+        my $reduced_path = join("\n", @addrs);
+        AddEntry($result, $reduced_path, $count);
+      }
+      $profile = $result;
+    }
+  }
+
+  my $result = {};
+  foreach my $k (keys(%{$profile})) {
+    my $count = $profile->{$k};
+    my @addrs = split(/\n/, $k);
+    my @path = ();
+    foreach my $a (@addrs) {
+      if (exists($symbols->{$a})) {
+        my $func = $symbols->{$a}->[0];
+        if ($skip{$func} || ($func =~ m/$skip_regexp/)) {
+          # Throw away the portion of the backtrace seen so far, under the
+          # assumption that previous frames were for functions internal to the
+          # allocator.
+          @path = ();
+          next;
+        }
+      }
+      push(@path, $a);
+    }
+    my $reduced_path = join("\n", @path);
+    AddEntry($result, $reduced_path, $count);
+  }
+
+  $result = FilterFrames($symbols, $result);
+
+  return $result;
+}
+
+# Reduce profile to granularity given by user
+sub ReduceProfile {
+  my $symbols = shift;
+  my $profile = shift;
+  my $result = {};
+  my $fullname_to_shortname_map = {};
+  FillFullnameToShortnameMap($symbols, $fullname_to_shortname_map);
+  foreach my $k (keys(%{$profile})) {
+    my $count = $profile->{$k};
+    my @translated = TranslateStack($symbols, $fullname_to_shortname_map, $k);
+    my @path = ();
+    my %seen = ();
+    $seen{''} = 1;      # So that empty keys are skipped
+    foreach my $e (@translated) {
+      # To avoid double-counting due to recursion, skip a stack-trace
+      # entry if it has already been seen
+      if (!$seen{$e}) {
+        $seen{$e} = 1;
+        push(@path, $e);
+      }
+    }
+    my $reduced_path = join("\n", @path);
+    AddEntry($result, $reduced_path, $count);
+  }
+  return $result;
+}
+
+# Does the specified symbol array match the regexp?
+sub SymbolMatches {
+  my $sym = shift;
+  my $re = shift;
+  if (defined($sym)) {
+    for (my $i = 0; $i < $#{$sym}; $i += 3) {
+      if ($sym->[$i] =~ m/$re/ || $sym->[$i+1] =~ m/$re/) {
+        return 1;
+      }
+    }
+  }
+  return 0;
+}
+
+# Focus only on paths involving specified regexps
+sub FocusProfile {
+  my $symbols = shift;
+  my $profile = shift;
+  my $focus = shift;
+  my $result = {};
+  foreach my $k (keys(%{$profile})) {
+    my $count = $profile->{$k};
+    my @addrs = split(/\n/, $k);
+    foreach my $a (@addrs) {
+      # Keep the whole stack if this frame matches the address, shortname, or fileline
+      if (($a =~ m/$focus/) || SymbolMatches($symbols->{$a}, $focus)) {
+        AddEntry($result, $k, $count);
+        last;
+      }
+    }
+  }
+  return $result;
+}
+
+# Focus only on paths not involving specified regexps
+sub IgnoreProfile {
+  my $symbols = shift;
+  my $profile = shift;
+  my $ignore = shift;
+  my $result = {};
+  foreach my $k (keys(%{$profile})) {
+    my $count = $profile->{$k};
+    my @addrs = split(/\n/, $k);
+    my $matched = 0;
+    foreach my $a (@addrs) {
+      # Mark the stack as matched if this frame matches the address, shortname, or fileline
+      if (($a =~ m/$ignore/) || SymbolMatches($symbols->{$a}, $ignore)) {
+        $matched = 1;
+        last;
+      }
+    }
+    if (!$matched) {
+      AddEntry($result, $k, $count);
+    }
+  }
+  return $result;
+}
+
+# Get total count in profile
+sub TotalProfile {
+  my $profile = shift;
+  my $result = 0;
+  foreach my $k (keys(%{$profile})) {
+    $result += $profile->{$k};
+  }
+  return $result;
+}
+
+# Add A to B
+sub AddProfile {
+  my $A = shift;
+  my $B = shift;
+
+  my $R = {};
+  # add all keys in A
+  foreach my $k (keys(%{$A})) {
+    my $v = $A->{$k};
+    AddEntry($R, $k, $v);
+  }
+  # add all keys in B
+  foreach my $k (keys(%{$B})) {
+    my $v = $B->{$k};
+    AddEntry($R, $k, $v);
+  }
+  return $R;
+}
+
+# Merges symbol maps
+sub MergeSymbols {
+  my $A = shift;
+  my $B = shift;
+
+  my $R = {};
+  foreach my $k (keys(%{$A})) {
+    $R->{$k} = $A->{$k};
+  }
+  if (defined($B)) {
+    foreach my $k (keys(%{$B})) {
+      $R->{$k} = $B->{$k};
+    }
+  }
+  return $R;
+}
+
+
+# Add A to B
+sub AddPcs {
+  my $A = shift;
+  my $B = shift;
+
+  my $R = {};
+  # add all keys in A
+  foreach my $k (keys(%{$A})) {
+    $R->{$k} = 1
+  }
+  # add all keys in B
+  foreach my $k (keys(%{$B})) {
+    $R->{$k} = 1
+  }
+  return $R;
+}
+
+# Subtract B from A
+sub SubtractProfile {
+  my $A = shift;
+  my $B = shift;
+
+  my $R = {};
+  foreach my $k (keys(%{$A})) {
+    my $v = $A->{$k} - GetEntry($B, $k);
+    if ($v < 0 && $main::opt_drop_negative) {
+      $v = 0;
+    }
+    AddEntry($R, $k, $v);
+  }
+  if (!$main::opt_drop_negative) {
+    # Take care of when subtracted profile has more entries
+    foreach my $k (keys(%{$B})) {
+      if (!exists($A->{$k})) {
+        AddEntry($R, $k, 0 - $B->{$k});
+      }
+    }
+  }
+  return $R;
+}
+
+# Get entry from profile; zero if not present
+sub GetEntry {
+  my $profile = shift;
+  my $k = shift;
+  if (exists($profile->{$k})) {
+    return $profile->{$k};
+  } else {
+    return 0;
+  }
+}
+
+# Add entry to specified profile
+sub AddEntry {
+  my $profile = shift;
+  my $k = shift;
+  my $n = shift;
+  if (!exists($profile->{$k})) {
+    $profile->{$k} = 0;
+  }
+  $profile->{$k} += $n;
+}
+
+# Add a stack of entries to specified profile, and add them to the $pcs
+# list.
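+# $stack is a whitespace-separated string of hex PCs, so a call like
+# (made-up addresses)
+#   AddEntries($profile, $pcs, "4006f0 4007a2", 5);
+# charges 5 to that two-frame stack and marks both PCs in $pcs.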
+sub AddEntries {
+  my $profile = shift;
+  my $pcs = shift;
+  my $stack = shift;
+  my $count = shift;
+  my @k = ();
+
+  foreach my $e (split(/\s+/, $stack)) {
+    my $pc = HexExtend($e);
+    $pcs->{$pc} = 1;
+    push @k, $pc;
+  }
+  AddEntry($profile, (join "\n", @k), $count);
+}
+
+##### Code to profile a server dynamically #####
+
+sub CheckSymbolPage {
+  my $url = SymbolPageURL();
+  my $command = ShellEscape(@URL_FETCHER, $url);
+  open(SYMBOL, "$command |") or error($command);
+  my $line = <SYMBOL>;
+  $line =~ s/\r//g;         # turn windows-looking lines into unix-looking lines
+  close(SYMBOL);
+  unless (defined($line)) {
+    error("$url doesn't exist\n");
+  }
+
+  if ($line =~ /^num_symbols:\s+(\d+)$/) {
+    if ($1 == 0) {
+      error("Stripped binary. No symbols available.\n");
+    }
+  } else {
+    error("Failed to get the number of symbols from $url\n");
+  }
+}
+
+sub IsProfileURL {
+  my $profile_name = shift;
+  if (-f $profile_name) {
+    printf STDERR "Using local file $profile_name.\n";
+    return 0;
+  }
+  return 1;
+}
+
+sub ParseProfileURL {
+  my $profile_name = shift;
+
+  if (!defined($profile_name) || $profile_name eq "") {
+    return ();
+  }
+
+  # Split profile URL - matches all non-empty strings, so no test.
+  $profile_name =~ m,^(https?://)?([^/]+)(.*?)(/|$PROFILES)?$,;
+
+  my $proto = $1 || "http://";
+  my $hostport = $2;
+  my $prefix = $3;
+  my $profile = $4 || "/";
+
+  my $host = $hostport;
+  $host =~ s/:.*//;
+
+  my $baseurl = "$proto$hostport$prefix";
+  return ($host, $baseurl, $profile);
+}
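+
+# For example (hypothetical host), "http://myhost:8080/pprof/heap" parses
+# into ("myhost", "http://myhost:8080", "/pprof/heap"), assuming the
+# standard /pprof/ paths defined earlier in this script; a bare
+# "myhost:8080" yields the default profile path "/".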
+
+# We fetch symbols from the first profile argument.
+sub SymbolPageURL {
+  my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]);
+  return "$baseURL$SYMBOL_PAGE";
+}
+
+sub FetchProgramName() {
+  my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]);
+  my $url = "$baseURL$PROGRAM_NAME_PAGE";
+  my $command_line = ShellEscape(@URL_FETCHER, $url);
+  open(CMDLINE, "$command_line |") or error($command_line);
+  my $cmdline = <CMDLINE>;
+  $cmdline =~ s/\r//g;   # turn windows-looking lines into unix-looking lines
+  close(CMDLINE);
+  error("Failed to get program name from $url\n") unless defined($cmdline);
+  $cmdline =~ s/\x00.+//;  # Remove argv[1] and later arguments.
+  $cmdline =~ s!\n!!g;  # Remove LFs.
+  return $cmdline;
+}
+
+# curl's -L (--location) option isn't reliable, at least as of its
+# 7.12.3 version: curl forgets to post data if there is a
+# redirection.  This function works around that by resolving the
+# redirection ourselves.  Redirection happens on borg hosts.
+sub ResolveRedirectionForCurl {
+  my $url = shift;
+  my $command_line = ShellEscape(@URL_FETCHER, "--head", $url);
+  open(CMDLINE, "$command_line |") or error($command_line);
+  while (<CMDLINE>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    if (/^Location: (.*)/) {
+      $url = $1;
+    }
+  }
+  close(CMDLINE);
+  return $url;
+}
+
+# Add a timeout flag to URL_FETCHER.  Returns a new list.
+sub AddFetchTimeout {
+  my $timeout = shift;
+  my @fetcher = @_;
+  if (defined($timeout)) {
+    if (join(" ", @fetcher) =~ m/\bcurl -s/) {
+      push(@fetcher, "--max-time", sprintf("%d", $timeout));
+    } elsif (join(" ", @fetcher) =~ m/\brpcget\b/) {
+      push(@fetcher, sprintf("--deadline=%d", $timeout));
+    }
+  }
+  return @fetcher;
+}
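+
+# For example (hypothetical timeout), AddFetchTimeout(30, @URL_FETCHER)
+# appends "--max-time 30" when the fetcher is curl, or "--deadline=30"
+# when it is rpcget; otherwise the fetcher is returned unchanged.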
+
+# Reads a symbol map from the file handle name given as $1, returning
+# the resulting symbol map.  Also processes variables relating to symbols.
+# Currently, the only variable processed is 'binary=<value>' which updates
+# $main::prog to have the correct program name.
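+#
+# The expected input looks like (illustrative values):
+#   binary=/usr/local/bin/myprog
+#   0x00000000004006f0 main
+#   ---
+# Leading zeroes are stripped from the addresses used as map keys.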
+sub ReadSymbols {
+  my $in = shift;
+  my $map = {};
+  while (<$in>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    # Removes all the leading zeroes from the symbols, see comment below.
+    if (m/^0x0*([0-9a-f]+)\s+(.+)/) {
+      $map->{$1} = $2;
+    } elsif (m/^---/) {
+      last;
+    } elsif (m/^([a-z][^=]*)=(.*)$/ ) {
+      my ($variable, $value) = ($1, $2);
+      for ($variable, $value) {
+        s/^\s+//;
+        s/\s+$//;
+      }
+      if ($variable eq "binary") {
+        if ($main::prog ne $UNKNOWN_BINARY && $main::prog ne $value) {
+          printf STDERR ("Warning: Mismatched binary name '%s', using '%s'.\n",
+                         $main::prog, $value);
+        }
+        $main::prog = $value;
+      } else {
+        printf STDERR ("Ignoring unknown variable in symbols list: " .
+            "'%s' = '%s'\n", $variable, $value);
+      }
+    }
+  }
+  return $map;
+}
+
+sub URLEncode {
+  my $str = shift;
+  $str =~ s/([^A-Za-z0-9\-_.!~*'()])/ sprintf "%%%02x", ord $1 /eg;
+  return $str;
+}
+
+sub AppendSymbolFilterParams {
+  my $url = shift;
+  my @params = ();
+  if ($main::opt_retain ne '') {
+    push(@params, sprintf("retain=%s", URLEncode($main::opt_retain)));
+  }
+  if ($main::opt_exclude ne '') {
+    push(@params, sprintf("exclude=%s", URLEncode($main::opt_exclude)));
+  }
+  if (scalar @params > 0) {
+    $url = sprintf("%s?%s", $url, join("&", @params));
+  }
+  return $url;
+}
+
+# Fetches and processes symbols to prepare them for use in the profile output
+# code.  If the optional 'symbol_map' arg is not given, fetches symbols from
+# $SYMBOL_PAGE for all PC values found in profile.  Otherwise, the raw symbols
+# are assumed to have already been fetched into 'symbol_map' and are simply
+# extracted and processed.
+sub FetchSymbols {
+  my $pcset = shift;
+  my $symbol_map = shift;
+
+  my %seen = ();
+  my @pcs = grep { !$seen{$_}++ } keys(%$pcset);  # uniq
+
+  if (!defined($symbol_map)) {
+    my $post_data = join("+", sort((map {"0x" . "$_"} @pcs)));
+
+    open(POSTFILE, ">$main::tmpfile_sym");
+    print POSTFILE $post_data;
+    close(POSTFILE);
+
+    my $url = SymbolPageURL();
+
+    my $command_line;
+    if (join(" ", @URL_FETCHER) =~ m/\bcurl -s/) {
+      $url = ResolveRedirectionForCurl($url);
+      $url = AppendSymbolFilterParams($url);
+      $command_line = ShellEscape(@URL_FETCHER, "-d", "\@$main::tmpfile_sym",
+                                  $url);
+    } else {
+      $url = AppendSymbolFilterParams($url);
+      $command_line = (ShellEscape(@URL_FETCHER, "--post", $url)
+                       . " < " . ShellEscape($main::tmpfile_sym));
+    }
+    # We use c++filt in case $SYMBOL_PAGE gives us mangled symbols.
+    my $escaped_cppfilt = ShellEscape($obj_tool_map{"c++filt"});
+    open(SYMBOL, "$command_line | $escaped_cppfilt |") or error($command_line);
+    $symbol_map = ReadSymbols(*SYMBOL{IO});
+    close(SYMBOL);
+  }
+
+  my $symbols = {};
+  foreach my $pc (@pcs) {
+    my $fullname;
+    # For 64 bits binaries, symbols are extracted with 8 leading zeroes.
+    # Then /symbol reads the long symbols in as uint64, and outputs
+    # the result with a "0x%08llx" format which get rid of the zeroes.
+    # By removing all the leading zeroes in both $pc and the symbols from
+    # /symbol, the symbols match and are retrievable from the map.
+    my $shortpc = $pc;
+    $shortpc =~ s/^0*//;
+    # Each line may have a list of names, which includes the function
+    # and also other functions it has inlined.  They are separated (in
+    # PrintSymbolizedProfile), by --, which is illegal in function names.
+    my $fullnames;
+    if (defined($symbol_map->{$shortpc})) {
+      $fullnames = $symbol_map->{$shortpc};
+    } else {
+      $fullnames = "0x" . $pc;  # Just use addresses
+    }
+    my $sym = [];
+    $symbols->{$pc} = $sym;
+    foreach my $fullname (split("--", $fullnames)) {
+      my $name = ShortFunctionName($fullname);
+      push(@{$sym}, $name, "?", $fullname);
+    }
+  }
+  return $symbols;
+}
+
+sub BaseName {
+  my $file_name = shift;
+  $file_name =~ s!^.*/!!;  # Remove directory name
+  return $file_name;
+}
+
+sub MakeProfileBaseName {
+  my ($binary_name, $profile_name) = @_;
+  my ($host, $baseURL, $path) = ParseProfileURL($profile_name);
+  my $binary_shortname = BaseName($binary_name);
+  return sprintf("%s.%s.%s",
+                 $binary_shortname, $main::op_time, $host);
+}
+
+sub FetchDynamicProfile {
+  my $binary_name = shift;
+  my $profile_name = shift;
+  my $fetch_name_only = shift;
+  my $encourage_patience = shift;
+
+  if (!IsProfileURL($profile_name)) {
+    return $profile_name;
+  } else {
+    my ($host, $baseURL, $path) = ParseProfileURL($profile_name);
+    if ($path eq "" || $path eq "/") {
+      # Missing type specifier defaults to cpu-profile
+      $path = $PROFILE_PAGE;
+    }
+
+    my $profile_file = MakeProfileBaseName($binary_name, $profile_name);
+
+    my $url = "$baseURL$path";
+    my $fetch_timeout = undef;
+    if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/) {
+      if ($path =~ m/[?]/) {
+        $url .= "&";
+      } else {
+        $url .= "?";
+      }
+      $url .= sprintf("seconds=%d", $main::opt_seconds);
+      $fetch_timeout = $main::opt_seconds * 1.01 + 60;
+      # Set $profile_type for consumption by PrintSymbolizedProfile.
+      $main::profile_type = 'cpu';
+    } else {
+      # For non-CPU profiles, we add a type-extension to
+      # the target profile file name.
+      my $suffix = $path;
+      $suffix =~ s,/,.,g;
+      $profile_file .= $suffix;
+      # Set $profile_type for consumption by PrintSymbolizedProfile.
+      if ($path =~ m/$HEAP_PAGE/) {
+        $main::profile_type = 'heap';
+      } elsif ($path =~ m/$GROWTH_PAGE/) {
+        $main::profile_type = 'growth';
+      } elsif ($path =~ m/$CONTENTION_PAGE/) {
+        $main::profile_type = 'contention';
+      }
+    }
+
+    my $profile_dir = $ENV{"JEPROF_TMPDIR"} || ($ENV{HOME} . "/jeprof");
+    if (! -d $profile_dir) {
+      mkdir($profile_dir)
+          || die("Unable to create profile directory $profile_dir: $!\n");
+    }
+    my $tmp_profile = "$profile_dir/.tmp.$profile_file";
+    my $real_profile = "$profile_dir/$profile_file";
+
+    if ($fetch_name_only > 0) {
+      return $real_profile;
+    }
+
+    my @fetcher = AddFetchTimeout($fetch_timeout, @URL_FETCHER);
+    my $cmd = ShellEscape(@fetcher, $url) . " > " . ShellEscape($tmp_profile);
+    if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE|$CENSUSPROFILE_PAGE/){
+      print STDERR "Gathering CPU profile from $url for $main::opt_seconds seconds to\n  ${real_profile}\n";
+      if ($encourage_patience) {
+        print STDERR "Be patient...\n";
+      }
+    } else {
+      print STDERR "Fetching $path profile from $url to\n  ${real_profile}\n";
+    }
+
+    (system($cmd) == 0) || error("Failed to get profile: $cmd: $!\n");
+    (system("mv", $tmp_profile, $real_profile) == 0) || error("Unable to rename profile\n");
+    print STDERR "Wrote profile to $real_profile\n";
+    $main::collected_profile = $real_profile;
+    return $main::collected_profile;
+  }
+}
+
+# Collect profiles in parallel
+sub FetchDynamicProfiles {
+  my $items = scalar(@main::pfile_args);
+  my $levels = log($items) / log(2);
+
+  if ($items == 1) {
+    $main::profile_files[0] = FetchDynamicProfile($main::prog, $main::pfile_args[0], 0, 1);
+  } else {
+    # Guard against floating-point rounding leaving 2**$levels below $items.
+    if ((2 ** $levels) < $items) {
+      $levels++;
+    }
+    my $count = scalar(@main::pfile_args);
+    for (my $i = 0; $i < $count; $i++) {
+      $main::profile_files[$i] = FetchDynamicProfile($main::prog, $main::pfile_args[$i], 1, 0);
+    }
+    print STDERR "Fetching $count profiles, be patient...\n";
+    FetchDynamicProfilesRecurse($levels, 0, 0);
+    $main::collected_profile = join(" \\\n    ", @main::profile_files);
+  }
+}
+
+# Recursively fork a process to get enough processes
+# collecting profiles
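+#
+# Illustrative sketch (editorial note, not part of the original script): with
+# four profile URLs, FetchDynamicProfiles() computes $levels = log(4)/log(2)
+# = 2.  Each recursion level forks once and shifts one bit into $position, so
+# the leaves end up holding positions 0..3 and each leaf process fetches
+# $main::pfile_args[$position] on its own while the parents wait.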
+sub FetchDynamicProfilesRecurse {
+  my $maxlevel = shift;
+  my $level = shift;
+  my $position = shift;
+
+  if (my $pid = fork()) {
+    $position = 0 | ($position << 1);
+    TryCollectProfile($maxlevel, $level, $position);
+    wait;
+  } else {
+    $position = 1 | ($position << 1);
+    TryCollectProfile($maxlevel, $level, $position);
+    cleanup();
+    exit(0);
+  }
+}
+
+# Collect a single profile
+sub TryCollectProfile {
+  my $maxlevel = shift;
+  my $level = shift;
+  my $position = shift;
+
+  if ($level >= ($maxlevel - 1)) {
+    if ($position < scalar(@main::pfile_args)) {
+      FetchDynamicProfile($main::prog, $main::pfile_args[$position], 0, 0);
+    }
+  } else {
+    FetchDynamicProfilesRecurse($maxlevel, $level+1, $position);
+  }
+}
+
+##### Parsing code #####
+
+# Provide a small streaming-read module to handle very large
+# cpu-profile files.  Stream in chunks along a sliding window.
+# Provides an interface to get one 'slot', correctly handling
+# endian-ness differences.  A slot is one 32-bit or 64-bit word
+# (depending on the input profile).  We tell endianness and bit-size
+# for the profile by looking at the first 8 bytes: in cpu profiles,
+# the second slot is always 3 (we'll accept anything that's not 0).
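+#
+# Worked illustration (editorial note): the binary header starts with the
+# slot pair (0, 3).  In a 32-bit profile those two 4-byte words cannot both
+# be zero, so the first 8 bytes contain a non-zero byte and $address_length
+# stays 8.  In a 64-bit profile the first slot (the value 0) occupies all
+# 8 bytes by itself, so reading 8 zero bytes is what switches
+# $address_length to 16; whether the upper or lower half of the next 8-byte
+# word is zero then determines the endianness.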
+BEGIN {
+  package CpuProfileStream;
+
+  sub new {
+    my ($class, $file, $fname) = @_;
+    my $self = { file        => $file,
+                 base        => 0,
+                 stride      => 512 * 1024,   # must be a multiple of bitsize/8
+                 slots       => [],
+                 unpack_code => "",           # N for big-endian, V for little
+                 perl_is_64bit => 1,          # matters if profile is 64-bit
+    };
+    bless $self, $class;
+    # Let unittests adjust the stride
+    if ($main::opt_test_stride > 0) {
+      $self->{stride} = $main::opt_test_stride;
+    }
+    # Read the first two slots to figure out bitsize and endianness.
+    my $slots = $self->{slots};
+    my $str;
+    read($self->{file}, $str, 8);
+    # Set the global $address_length based on what we see here.
+    # 8 is 32-bit (8 hexadecimal chars); 16 is 64-bit (16 hexadecimal chars).
+    $address_length = ($str eq (chr(0)x8)) ? 16 : 8;
+    if ($address_length == 8) {
+      if (substr($str, 6, 2) eq chr(0)x2) {
+        $self->{unpack_code} = 'V';  # Little-endian.
+      } elsif (substr($str, 4, 2) eq chr(0)x2) {
+        $self->{unpack_code} = 'N';  # Big-endian
+      } else {
+        ::error("$fname: header size >= 2**16\n");
+      }
+      @$slots = unpack($self->{unpack_code} . "*", $str);
+    } else {
+      # If we're a 64-bit profile, check if we're a 64-bit-capable
+      # perl.  Otherwise, each slot will be represented as a float
+      # instead of an int64, losing precision and making all the
+      # 64-bit addresses wrong.  We won't complain yet, but will
+      # later if we ever see a value that doesn't fit in 32 bits.
+      my $has_q = 0;
+      # pack('Q', ...) dies on perls without 64-bit support, so the eval
+      # leaves $has_q at 0 in that case.
+      eval { $has_q = pack("Q", "1") ? 1 : 1; };
+      if (!$has_q) {
+        $self->{perl_is_64bit} = 0;
+      }
+      read($self->{file}, $str, 8);
+      if (substr($str, 4, 4) eq chr(0)x4) {
+        # We'd love to use 'Q', but it's a) not universal, b) not endian-proof.
+        $self->{unpack_code} = 'V';  # Little-endian.
+      } elsif (substr($str, 0, 4) eq chr(0)x4) {
+        $self->{unpack_code} = 'N';  # Big-endian
+      } else {
+        ::error("$fname: header size >= 2**32\n");
+      }
+      my @pair = unpack($self->{unpack_code} . "*", $str);
+      # Since we know one of the pair is 0, it's fine to just add them.
+      @$slots = (0, $pair[0] + $pair[1]);
+    }
+    return $self;
+  }
+
+  # Load more data when we access slots->get(X) which is not yet in memory.
+  sub overflow {
+    my ($self) = @_;
+    my $slots = $self->{slots};
+    $self->{base} += $#$slots + 1;   # skip over data we're replacing
+    my $str;
+    read($self->{file}, $str, $self->{stride});
+    if ($address_length == 8) {      # the 32-bit case
+      # This is the easy case: unpack provides 32-bit unpacking primitives.
+      @$slots = unpack($self->{unpack_code} . "*", $str);
+    } else {
+      # We need to unpack 32 bits at a time and combine.
+      my @b32_values = unpack($self->{unpack_code} . "*", $str);
+      my @b64_values = ();
+      for (my $i = 0; $i < $#b32_values; $i += 2) {
+        # TODO(csilvers): if this is a 32-bit perl, the math below
+        #    could end up in a too-large int, which perl will promote
+        #    to a double, losing necessary precision.  Deal with that.
+        #    Right now, we just die.
+        my ($lo, $hi) = ($b32_values[$i], $b32_values[$i+1]);
+        if ($self->{unpack_code} eq 'N') {    # big-endian
+          ($lo, $hi) = ($hi, $lo);
+        }
+        my $value = $lo + $hi * (2**32);
+        if (!$self->{perl_is_64bit} &&   # check value is exactly represented
+            (($value % (2**32)) != $lo || int($value / (2**32)) != $hi)) {
+          ::error("Need a 64-bit perl to process this 64-bit profile.\n");
+        }
+        push(@b64_values, $value);
+      }
+      @$slots = @b64_values;
+    }
+  }
+
+  # Access the i-th long in the file (logically), or -1 at EOF.
+  sub get {
+    my ($self, $idx) = @_;
+    my $slots = $self->{slots};
+    while ($#$slots >= 0) {
+      if ($idx < $self->{base}) {
+        # The only time we expect a reference to $slots[$i - something]
+        # after referencing $slots[$i] is reading the very first header.
+        # Since $stride > |header|, that shouldn't cause any lookback
+        # errors.  And everything after the header is sequential.
+        print STDERR "Unexpected look-back reading CPU profile";
+        return -1;   # shrug, don't know what better to return
+      } elsif ($idx > $self->{base} + $#$slots) {
+        $self->overflow();
+      } else {
+        return $slots->[$idx - $self->{base}];
+      }
+    }
+    # If we get here, $slots is [], which means we've reached EOF
+    return -1;  # unique since slots is supposed to hold unsigned numbers
+  }
+}
+
+# Reads the top, 'header' section of a profile, and returns the last
+# line of the header, commonly called a 'header line'.  The header
+# section of a profile consists of zero or more 'command' lines that
+# are instructions to jeprof, which jeprof executes when reading the
+# header.  All 'command' lines start with a %.  After the command
+# lines is the 'header line', which is a profile-specific line that
+# indicates what type of profile it is, and perhaps other global
+# information about the profile.  For instance, here's a header line
+# for a heap profile:
+#   heap profile:     53:    38236 [  5525:  1284029] @ heapprofile
+# For historical reasons, the CPU profile does not contain a text-
+# readable header line.  If the profile looks like a CPU profile,
+# this function returns "".  If no header line could be found, this
+# function returns undef.
+#
+# The following commands are recognized:
+#   %warn -- emit the rest of this line to stderr, prefixed by 'WARNING:'
+#
+# The input file should be in binmode.
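+#
+# Editorial example of the expected layout (values illustrative only):
+#   %warn profile collected while the server was overloaded
+#   heap profile:     53:    38236 [  5525:  1284029] @ heapprofile
+# The %warn line is echoed to stderr prefixed with "WARNING:", and the
+# following line is returned as the header line.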
+sub ReadProfileHeader {
+  local *PROFILE = shift;
+  my $firstchar = "";
+  my $line = "";
+  read(PROFILE, $firstchar, 1);
+  seek(PROFILE, -1, 1);                    # unread the firstchar
+  if ($firstchar !~ /[[:print:]]/) {       # is not a text character
+    return "";
+  }
+  while (defined($line = <PROFILE>)) {
+    $line =~ s/\r//g;   # turn windows-looking lines into unix-looking lines
+    if ($line =~ /^%warn\s+(.*)/) {        # 'warn' command
+      # Note this matches both '%warn blah\n' and '%warn\n'.
+      print STDERR "WARNING: $1\n";        # print the rest of the line
+    } elsif ($line =~ /^%/) {
+      print STDERR "Ignoring unknown command from profile header: $line";
+    } else {
+      # End of commands, must be the header line.
+      return $line;
+    }
+  }
+  return undef;     # got to EOF without seeing a header line
+}
+
+sub IsSymbolizedProfileFile {
+  my $file_name = shift;
+  if (!(-e $file_name) || !(-r $file_name)) {
+    return 0;
+  }
+  # Check if the file contains a symbol-section marker.
+  open(TFILE, "<$file_name");
+  binmode TFILE;
+  my $firstline = ReadProfileHeader(*TFILE);
+  close(TFILE);
+  if (!$firstline) {
+    return 0;
+  }
+  $SYMBOL_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $symbol_marker = $&;
+  return $firstline =~ /^--- *$symbol_marker/;
+}
+
+# Parse profile generated by common/profiler.cc and return a reference
+# to a map:
+#      $result->{version}     Version number of profile file
+#      $result->{period}      Sampling period (in microseconds)
+#      $result->{profile}     Profile object
+#      $result->{threads}     Map of thread IDs to profile objects
+#      $result->{map}         Memory map info from profile
+#      $result->{pcs}         Hash of all PC values seen, key is hex address
+sub ReadProfile {
+  my $prog = shift;
+  my $fname = shift;
+  my $result;            # return value
+
+  $CONTENTION_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $contention_marker = $&;
+  $GROWTH_PAGE  =~ m,[^/]+$,;    # matches everything after the last slash
+  my $growth_marker = $&;
+  $SYMBOL_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $symbol_marker = $&;
+  $PROFILE_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $profile_marker = $&;
+  $HEAP_PAGE =~ m,[^/]+$,;    # matches everything after the last slash
+  my $heap_marker = $&;
+
+  # Look at first line to see if it is a heap or a CPU profile.
+  # CPU profile may start with no header at all, and just binary data
+  # (starting with \0\0\0\0) -- in that case, don't try to read the
+  # whole firstline, since it may be gigabytes(!) of data.
+  open(PROFILE, "<$fname") || error("$fname: $!\n");
+  binmode PROFILE;      # New perls do UTF-8 processing
+  my $header = ReadProfileHeader(*PROFILE);
+  if (!defined($header)) {   # means "at EOF"
+    error("Profile is empty.\n");
+  }
+
+  my $symbols;
+  if ($header =~ m/^--- *$symbol_marker/o) {
+    # Verify that the user asked for a symbolized profile
+    if (!$main::use_symbolized_profile) {
+      # we have both a binary and symbolized profiles, abort
+      error("FATAL ERROR: Symbolized profile\n   $fname\ncannot be used with " .
+            "a binary arg. Try again without passing\n   $prog\n");
+    }
+    # Read the symbol section of the symbolized profile file.
+    $symbols = ReadSymbols(*PROFILE{IO});
+    # Read the next line to get the header for the remaining profile.
+    $header = ReadProfileHeader(*PROFILE) || "";
+  }
+
+  if ($header =~ m/^--- *($heap_marker|$growth_marker)/o) {
+    # Skip "--- ..." line for profile types that have their own headers.
+    $header = ReadProfileHeader(*PROFILE) || "";
+  }
+
+  $main::profile_type = '';
+
+  if ($header =~ m/^heap profile:.*$growth_marker/o) {
+    $main::profile_type = 'growth';
+    $result =  ReadHeapProfile($prog, *PROFILE, $header);
+  } elsif ($header =~ m/^heap profile:/) {
+    $main::profile_type = 'heap';
+    $result =  ReadHeapProfile($prog, *PROFILE, $header);
+  } elsif ($header =~ m/^heap/) {
+    $main::profile_type = 'heap';
+    $result = ReadThreadedHeapProfile($prog, $fname, $header);
+  } elsif ($header =~ m/^--- *$contention_marker/o) {
+    $main::profile_type = 'contention';
+    $result = ReadSynchProfile($prog, *PROFILE);
+  } elsif ($header =~ m/^--- *Stacks:/) {
+    print STDERR
+      "Old format contention profile: mistakenly reports " .
+      "condition variable signals as lock contentions.\n";
+    $main::profile_type = 'contention';
+    $result = ReadSynchProfile($prog, *PROFILE);
+  } elsif ($header =~ m/^--- *$profile_marker/) {
+    # the binary cpu profile data starts immediately after this line
+    $main::profile_type = 'cpu';
+    $result = ReadCPUProfile($prog, $fname, *PROFILE);
+  } else {
+    if (defined($symbols)) {
+      # a symbolized profile contains a format we don't recognize, bail out
+      error("$fname: Cannot recognize profile section after symbols.\n");
+    }
+    # no ascii header present -- must be a CPU profile
+    $main::profile_type = 'cpu';
+    $result = ReadCPUProfile($prog, $fname, *PROFILE);
+  }
+
+  close(PROFILE);
+
+  # if we got symbols along with the profile, return those as well
+  if (defined($symbols)) {
+    $result->{symbols} = $symbols;
+  }
+
+  return $result;
+}
+
+# Subtract one from each caller pc so we map back to the call instruction.
+# The adjustment is applied unconditionally here, because
+# PrintSymbolizedProfile() dumps unadjusted profiles (see the inline comment
+# below).  All readers share this logic, though ReadCPUProfile performs the
+# same adjustment in its own implementation.
+sub FixCallerAddresses {
+  my $stack = shift;
+  # --raw/http: Always subtract one from pc's, because PrintSymbolizedProfile()
+  # dumps unadjusted profiles.
+  {
+    $stack =~ /(\s)/;
+    my $delimiter = $1;
+    my @addrs = split(' ', $stack);
+    my @fixedaddrs;
+    $#fixedaddrs = $#addrs;
+    if ($#addrs >= 0) {
+      $fixedaddrs[0] = $addrs[0];
+    }
+    for (my $i = 1; $i <= $#addrs; $i++) {
+      $fixedaddrs[$i] = AddressSub($addrs[$i], "0x1");
+    }
+    return join $delimiter, @fixedaddrs;
+  }
+}
+
+# CPU profile reader
+sub ReadCPUProfile {
+  my $prog = shift;
+  my $fname = shift;       # just used for logging
+  local *PROFILE = shift;
+  my $version;
+  my $period;
+  my $i;
+  my $profile = {};
+  my $pcs = {};
+
+  # Parse string into array of slots.
+  my $slots = CpuProfileStream->new(*PROFILE, $fname);
+
+  # Read header.  The current header version is a 5-element structure
+  # containing:
+  #   0: header count (always 0)
+  #   1: header "words" (after this one: 3)
+  #   2: format version (0)
+  #   3: sampling period (usec)
+  #   4: unused padding (always 0)
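+  # Editorial example: for a profile written with a 10000-usec sampling
+  # period (an illustrative value), the first five slots are
+  #   0, 3, 0, 10000, 0
+  # so the checks below pass, $version is 0, $period is 10000, and the
+  # sample records start at slot $i = 2 + 3 = 5.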
+  if ($slots->get(0) != 0 ) {
+    error("$fname: not a profile file, or old format profile file\n");
+  }
+  $i = 2 + $slots->get(1);
+  $version = $slots->get(2);
+  $period = $slots->get(3);
+  # Do some sanity checking on these header values.
+  if ($version > (2**32) || $period > (2**32) || $i > (2**32) || $i < 5) {
+    error("$fname: not a profile file, or corrupted profile file\n");
+  }
+
+  # Parse profile
+  while ($slots->get($i) != -1) {
+    my $n = $slots->get($i++);
+    my $d = $slots->get($i++);
+    if ($d > (2**16)) {  # TODO(csilvers): what's a reasonable max-stack-depth?
+      my $addr = sprintf("0%o", $i * ($address_length == 8 ? 4 : 8));
+      print STDERR "At index $i (address $addr):\n";
+      error("$fname: stack trace depth > 2**16\n");
+    }
+    if ($slots->get($i) == 0) {
+      # End of profile data marker
+      $i += $d;
+      last;
+    }
+
+    # Make key out of the stack entries
+    my @k = ();
+    for (my $j = 0; $j < $d; $j++) {
+      my $pc = $slots->get($i+$j);
+      # Subtract one from caller pc so we map back to call instr.
+      $pc--;
+      $pc = sprintf("%0*x", $address_length, $pc);
+      $pcs->{$pc} = 1;
+      push @k, $pc;
+    }
+
+    AddEntry($profile, (join "\n", @k), $n);
+    $i += $d;
+  }
+
+  # Parse map
+  my $map = '';
+  seek(PROFILE, $i * 4, 0);
+  read(PROFILE, $map, (stat PROFILE)[7]);
+
+  my $r = {};
+  $r->{version} = $version;
+  $r->{period} = $period;
+  $r->{profile} = $profile;
+  $r->{libs} = ParseLibraries($prog, $map, $pcs);
+  $r->{pcs} = $pcs;
+
+  return $r;
+}
+
+sub HeapProfileIndex {
+  my $index = 1;
+  if ($main::opt_inuse_space) {
+    $index = 1;
+  } elsif ($main::opt_inuse_objects) {
+    $index = 0;
+  } elsif ($main::opt_alloc_space) {
+    $index = 3;
+  } elsif ($main::opt_alloc_objects) {
+    $index = 2;
+  }
+  return $index;
+}
+
+sub ReadMappedLibraries {
+  my $fh = shift;
+  my $map = "";
+  # Read the /proc/self/maps data
+  while (<$fh>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    $map .= $_;
+  }
+  return $map;
+}
+
+sub ReadMemoryMap {
+  my $fh = shift;
+  my $map = "";
+  # Read /proc/self/maps data as formatted by DumpAddressMap()
+  my $buildvar = "";
+  while (<PROFILE>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    # Parse "build=<dir>" specification if supplied
+    if (m/^\s*build=(.*)\n/) {
+      $buildvar = $1;
+    }
+
+    # Expand "$build" variable if available
+    $_ =~ s/\$build\b/$buildvar/g;
+
+    $map .= $_;
+  }
+  return $map;
+}
+
+sub AdjustSamples {
+  my ($sample_adjustment, $sampling_algorithm, $n1, $s1, $n2, $s2) = @_;
+  if ($sample_adjustment) {
+    if ($sampling_algorithm == 2) {
+      # Remote-heap version 2
+      # The sampling frequency is the rate of a Poisson process.
+      # This means that the probability of sampling an allocation of
+      # size X with sampling rate Y is 1 - exp(-X/Y)
+      if ($n1 != 0) {
+        my $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
+        my $scale_factor = 1/(1 - exp(-$ratio));
+        $n1 *= $scale_factor;
+        $s1 *= $scale_factor;
+      }
+      if ($n2 != 0) {
+        my $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
+        my $scale_factor = 1/(1 - exp(-$ratio));
+        $n2 *= $scale_factor;
+        $s2 *= $scale_factor;
+      }
+    } else {
+      # Remote-heap version 1
+      my $ratio;
+      $ratio = (($s1*1.0)/$n1)/($sample_adjustment);
+      if ($ratio < 1) {
+        $n1 /= $ratio;
+        $s1 /= $ratio;
+      }
+      $ratio = (($s2*1.0)/$n2)/($sample_adjustment);
+      if ($ratio < 1) {
+        $n2 /= $ratio;
+        $s2 /= $ratio;
+      }
+    }
+  }
+  return ($n1, $s1, $n2, $s2);
+}
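+
+# Editorial worked example of the version-2 (Poisson) adjustment above, using
+# an illustrative sampling rate of 524288 bytes: one sampled allocation of
+# 524288 bytes gives $ratio = 1, so the scale factor is 1/(1 - exp(-1)),
+# roughly 1.58, and the adjusted stats become about 1.58 objects and
+# 829,000 bytes instead of the raw 1 object and 524,288 bytes.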
+
+sub ReadHeapProfile {
+  my $prog = shift;
+  local *PROFILE = shift;
+  my $header = shift;
+
+  my $index = HeapProfileIndex();
+
+  # Find the type of this profile.  The header line looks like:
+  #    heap profile:   1246:  8800744 [  1246:  8800744] @ <heap-url>/266053
+  # There are two pairs <count: size>, the first inuse objects/space, and the
+  # second allocated objects/space.  This is followed optionally by a profile
+  # type, and if that is present, optionally by a sampling frequency.
+  # For remote heap profiles (v1):
+  # The interpretation of the sampling frequency is that the profiler, for
+  # each sample, calculates a uniformly distributed random integer less than
+  # the given value, and records the next sample after that many bytes have
+  # been allocated.  Therefore, the expected sample interval is half of the
+  # given frequency.  By default, if not specified, the expected sample
+  # interval is 128KB.  Only remote-heap-page profiles are adjusted for
+  # sample size.
+  # For remote heap profiles (v2):
+  # The sampling frequency is the rate of a Poisson process. This means that
+  # the probability of sampling an allocation of size X with sampling rate Y
+  # is 1 - exp(-X/Y)
+  # For version 2, a typical header line might look like this:
+  # heap profile:   1922: 127792360 [  1922: 127792360] @ <heap-url>_v2/524288
+  # the trailing number (524288) is the sampling rate. (Version 1 showed
+  # double the 'rate' here)
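+  # Editorial note on the regex below: for the version-2 header shown above,
+  # the "@ heap_v2/524288" suffix is captured as $6 = "heap_v2" and
+  # $8 = "524288", selecting sampling algorithm 2 with a 524288-byte sample
+  # adjustment.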
+  my $sampling_algorithm = 0;
+  my $sample_adjustment = 0;
+  chomp($header);
+  my $type = "unknown";
+  if ($header =~ m"^heap profile:\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\](\s*@\s*([^/]*)(/(\d+))?)?") {
+    if (defined($6) && ($6 ne '')) {
+      $type = $6;
+      my $sample_period = $8;
+      # $type is "heapprofile" for profiles generated by the
+      # heap-profiler, and either "heap" or "heap_v2" for profiles
+      # generated by sampling directly within tcmalloc.  It can also
+      # be "growth" for heap-growth profiles.  The first is typically
+      # found for profiles generated locally, and the others for
+      # remote profiles.
+      if (($type eq "heapprofile") || ($type !~ /heap/) ) {
+        # No need to adjust for the sampling rate with heap-profiler-derived data
+        $sampling_algorithm = 0;
+      } elsif ($type =~ /_v2/) {
+        $sampling_algorithm = 2;     # version 2 sampling
+        if (defined($sample_period) && ($sample_period ne '')) {
+          $sample_adjustment = int($sample_period);
+        }
+      } else {
+        $sampling_algorithm = 1;     # version 1 sampling
+        if (defined($sample_period) && ($sample_period ne '')) {
+          $sample_adjustment = int($sample_period)/2;
+        }
+      }
+    } else {
+      # We detect whether or not this is a remote-heap profile by checking
+      # that the total-allocated stats ($n2,$s2) are exactly the
+      # same as the in-use stats ($n1,$s1).  It is remotely conceivable
+      # that a non-remote-heap profile may pass this check, but it is hard
+      # to imagine how that could happen.
+      # In this case it's so old it's guaranteed to be remote-heap version 1.
+      my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
+      if (($n1 == $n2) && ($s1 == $s2)) {
+        # This is likely to be a remote-heap based sample profile
+        $sampling_algorithm = 1;
+      }
+    }
+  }
+
+  if ($sampling_algorithm > 0) {
+    # For remote-heap generated profiles, adjust the counts and sizes to
+    # account for the sample rate (we sample once every 128KB by default).
+    if ($sample_adjustment == 0) {
+      # Turn on profile adjustment.
+      $sample_adjustment = 128*1024;
+      print STDERR "Adjusting heap profiles for 1-in-128KB sampling rate\n";
+    } else {
+      printf STDERR ("Adjusting heap profiles for 1-in-%d sampling rate\n",
+                     $sample_adjustment);
+    }
+    if ($sampling_algorithm > 1) {
+      # We don't bother printing anything for the original version (version 1)
+      print STDERR "Heap version $sampling_algorithm\n";
+    }
+  }
+
+  my $profile = {};
+  my $pcs = {};
+  my $map = "";
+
+  while (<PROFILE>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    if (/^MAPPED_LIBRARIES:/) {
+      $map .= ReadMappedLibraries(*PROFILE);
+      last;
+    }
+
+    if (/^--- Memory map:/) {
+      $map .= ReadMemoryMap(*PROFILE);
+      last;
+    }
+
+    # Read entry of the form:
+    #  <count1>: <bytes1> [<count2>: <bytes2>] @ a1 a2 a3 ... an
+    s/^\s*//;
+    s/\s*$//;
+    if (m/^\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]\s+@\s+(.*)$/) {
+      my $stack = $5;
+      my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4);
+      my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm,
+                                 $n1, $s1, $n2, $s2);
+      AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]);
+    }
+  }
+
+  my $r = {};
+  $r->{version} = "heap";
+  $r->{period} = 1;
+  $r->{profile} = $profile;
+  $r->{libs} = ParseLibraries($prog, $map, $pcs);
+  $r->{pcs} = $pcs;
+  return $r;
+}
+
+sub ReadThreadedHeapProfile {
+  my ($prog, $fname, $header) = @_;
+
+  my $index = HeapProfileIndex();
+  my $sampling_algorithm = 0;
+  my $sample_adjustment = 0;
+  chomp($header);
+  my $type = "unknown";
+  # Assuming a very specific type of header for now.
+  if ($header =~ m"^heap_v2/(\d+)") {
+    $type = "_v2";
+    $sampling_algorithm = 2;
+    $sample_adjustment = int($1);
+  }
+  if ($type ne "_v2" || !defined($sample_adjustment)) {
+    die "Threaded heap profiles require v2 sampling with a sample rate\n";
+  }
+
+  my $profile = {};
+  my $thread_profiles = {};
+  my $pcs = {};
+  my $map = "";
+  my $stack = "";
+
+  while (<PROFILE>) {
+    s/\r//g;
+    if (/^MAPPED_LIBRARIES:/) {
+      $map .= ReadMappedLibraries(*PROFILE);
+      last;
+    }
+
+    if (/^--- Memory map:/) {
+      $map .= ReadMemoryMap(*PROFILE);
+      last;
+    }
+
+    # Read entry of the form:
+    # @ a1 a2 ... an
+    #   t*: <count1>: <bytes1> [<count2>: <bytes2>]
+    #   t1: <count1>: <bytes1> [<count2>: <bytes2>]
+    #     ...
+    #   tn: <count1>: <bytes1> [<count2>: <bytes2>]
+    s/^\s*//;
+    s/\s*$//;
+    if (m/^@\s+(.*)$/) {
+      $stack = $1;
+    } elsif (m/^\s*(t(\*|\d+)):\s+(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]$/) {
+      if ($stack eq "") {
+        # Still in the header, so this is just a per-thread summary.
+        next;
+      }
+      my $thread = $2;
+      my ($n1, $s1, $n2, $s2) = ($3, $4, $5, $6);
+      my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm,
+                                 $n1, $s1, $n2, $s2);
+      if ($thread eq "*") {
+        AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]);
+      } else {
+        if (!exists($thread_profiles->{$thread})) {
+          $thread_profiles->{$thread} = {};
+        }
+        AddEntries($thread_profiles->{$thread}, $pcs,
+                   FixCallerAddresses($stack), $counts[$index]);
+      }
+    }
+  }
+
+  my $r = {};
+  $r->{version} = "heap";
+  $r->{period} = 1;
+  $r->{profile} = $profile;
+  $r->{threads} = $thread_profiles;
+  $r->{libs} = ParseLibraries($prog, $map, $pcs);
+  $r->{pcs} = $pcs;
+  return $r;
+}
+
+sub ReadSynchProfile {
+  my $prog = shift;
+  local *PROFILE = shift;
+  my $header = shift;
+
+  my $map = '';
+  my $profile = {};
+  my $pcs = {};
+  my $sampling_period = 1;
+  my $cyclespernanosec = 2.8;   # Default assumption for old binaries
+  my $seen_clockrate = 0;
+  my $line;
+
+  my $index = 0;
+  if ($main::opt_total_delay) {
+    $index = 0;
+  } elsif ($main::opt_contentions) {
+    $index = 1;
+  } elsif ($main::opt_mean_delay) {
+    $index = 2;
+  }
+
+  while ( $line = <PROFILE> ) {
+    $line =~ s/\r//g;      # turn windows-looking lines into unix-looking lines
+    if ( $line =~ /^\s*(\d+)\s+(\d+) \@\s*(.*?)\s*$/ ) {
+      my ($cycles, $count, $stack) = ($1, $2, $3);
+
+      # Convert cycles to nanoseconds
+      $cycles /= $cyclespernanosec;
+
+      # Adjust for sampling done by application
+      $cycles *= $sampling_period;
+      $count *= $sampling_period;
+
+      my @values = ($cycles, $count, $cycles / $count);
+      AddEntries($profile, $pcs, FixCallerAddresses($stack), $values[$index]);
+
+    } elsif ( $line =~ /^(slow release).*thread \d+  \@\s*(.*?)\s*$/ ||
+              $line =~ /^\s*(\d+) \@\s*(.*?)\s*$/ ) {
+      my ($cycles, $stack) = ($1, $2);
+      if ($cycles !~ /^\d+$/) {
+        next;
+      }
+
+      # Convert cycles to nanoseconds
+      $cycles /= $cyclespernanosec;
+
+      # Adjust for sampling done by application
+      $cycles *= $sampling_period;
+
+      AddEntries($profile, $pcs, FixCallerAddresses($stack), $cycles);
+
+    } elsif ( $line =~ m/^([a-z][^=]*)=(.*)$/ ) {
+      my ($variable, $value) = ($1,$2);
+      for ($variable, $value) {
+        s/^\s+//;
+        s/\s+$//;
+      }
+      if ($variable eq "cycles/second") {
+        $cyclespernanosec = $value / 1e9;
+        $seen_clockrate = 1;
+      } elsif ($variable eq "sampling period") {
+        $sampling_period = $value;
+      } elsif ($variable eq "ms since reset") {
+        # Currently nothing is done with this value in jeprof
+        # So we just silently ignore it for now
+      } elsif ($variable eq "discarded samples") {
+        # Currently nothing is done with this value in jeprof
+        # So we just silently ignore it for now
+      } else {
+        printf STDERR ("Ignoring unknown variable in /contention output: " .
+                       "'%s' = '%s'\n", $variable, $value);
+      }
+    } else {
+      # Memory map entry
+      $map .= $line;
+    }
+  }
+
+  if (!$seen_clockrate) {
+    printf STDERR ("No cycles/second entry in profile; Guessing %.1f GHz\n",
+                   $cyclespernanosec);
+  }
+
+  my $r = {};
+  $r->{version} = 0;
+  $r->{period} = $sampling_period;
+  $r->{profile} = $profile;
+  $r->{libs} = ParseLibraries($prog, $map, $pcs);
+  $r->{pcs} = $pcs;
+  return $r;
+}
+
+# Given a hex value in the form "0x1abcd" or "1abcd", return either
+# "0001abcd" or "000000000001abcd", depending on the current (global)
+# address length.
+sub HexExtend {
+  my $addr = shift;
+
+  $addr =~ s/^(0x)?0*//;
+  my $zeros_needed = $address_length - length($addr);
+  if ($zeros_needed < 0) {
+    printf STDERR "Warning: address $addr is longer than address length $address_length\n";
+    return $addr;
+  }
+  return ("0" x $zeros_needed) . $addr;
+}
+
+##### Symbol extraction #####
+
+# Aggressively search the lib_prefix values for the given library
+# If all else fails, just return the name of the library unmodified.
+# If the lib_prefix is "/my/path,/other/path" and $file is "/lib/dir/mylib.so"
+# it will search the following locations in this order, until it finds a file:
+#   /my/path/lib/dir/mylib.so
+#   /other/path/lib/dir/mylib.so
+#   /my/path/dir/mylib.so
+#   /other/path/dir/mylib.so
+#   /my/path/mylib.so
+#   /other/path/mylib.so
+#   /lib/dir/mylib.so              (returned as last resort)
+sub FindLibrary {
+  my $file = shift;
+  my $suffix = $file;
+
+  # Search for the library as described above
+  do {
+    foreach my $prefix (@prefix_list) {
+      my $fullpath = $prefix . $suffix;
+      if (-e $fullpath) {
+        return $fullpath;
+      }
+    }
+  } while ($suffix =~ s|^/[^/]+/|/|);
+  return $file;
+}
+
+# Return path to library with debugging symbols.
+# For libc libraries, the copy in /usr/lib/debug contains debugging symbols
+sub DebuggingLibrary {
+  my $file = shift;
+
+  if ($file !~ m|^/|) {
+    return undef;
+  }
+
+  # Find debug symbol file if it's named after the library's name.
+
+  if (-f "/usr/lib/debug$file") {
+    if($main::opt_debug) { print STDERR "found debug info for $file in /usr/lib/debug$file\n"; }
+    return "/usr/lib/debug$file";
+  } elsif (-f "/usr/lib/debug$file.debug") {
+    if($main::opt_debug) { print STDERR "found debug info for $file in /usr/lib/debug$file.debug\n"; }
+    return "/usr/lib/debug$file.debug";
+  }
+
+  if(!$main::opt_debug_syms_by_id) {
+    if($main::opt_debug) { print STDERR "no debug symbols found for $file\n" };
+    return undef;
+  }
+
+  # Find debug file if it's named after the library's build ID.
+
+  my $readelf = '';
+  if (!$main::gave_up_on_elfutils) {
+    $readelf = qx/eu-readelf -n ${file}/;
+    if ($?) {
+      print STDERR "Cannot run eu-readelf. To use --debug-syms-by-id you must be on Linux, with elfutils installed.\n";
+      $main::gave_up_on_elfutils = 1;
+      return undef;
+    }
+    my $buildID = $1 if $readelf =~ /Build ID: ([A-Fa-f0-9]+)/s;
+    if (defined $buildID && length $buildID > 0) {
+      my $symbolFile = '/usr/lib/debug/.build-id/' . substr($buildID, 0, 2) . '/' . substr($buildID, 2) . '.debug';
+      if (-e $symbolFile) {
+        if($main::opt_debug) { print STDERR "found debug symbol file $symbolFile for $file\n" };
+        return $symbolFile;
+      } else {
+        if($main::opt_debug) { print STDERR "no debug symbol file found for $file, build ID: $buildID\n" };
+        return undef;
+      }
+    }
+  }
+
+  if($main::opt_debug) { print STDERR "no debug symbols found for $file, build ID unknown\n" };
+  return undef;
+}
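+
+# Editorial example of the build-ID lookup above (the ID is made up): for a
+# library whose ELF notes report "Build ID: b1c2d3...", the probed symbol
+# file is /usr/lib/debug/.build-id/b1/c2d3....debug, i.e. the first two hex
+# digits become a directory and the remainder the file name.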
+
+
+# Parse text section header of a library using objdump
+sub ParseTextSectionHeaderFromObjdump {
+  my $lib = shift;
+
+  my $size = undef;
+  my $vma;
+  my $file_offset;
+  # Get objdump output from the library file to figure out how to
+  # map between mapped addresses and addresses in the library.
+  my $cmd = ShellEscape($obj_tool_map{"objdump"}, "-h", $lib);
+  open(OBJDUMP, "$cmd |") || error("$cmd: $!\n");
+  while (<OBJDUMP>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    # Idx Name          Size      VMA       LMA       File off  Algn
+    #  10 .text         00104b2c  420156f0  420156f0  000156f0  2**4
+    # For 64-bit objects, VMA and LMA will be 16 hex digits, size and file
+    # offset may still be 8.  But AddressSub below will still handle that.
+    my @x = split;
+    if (($#x >= 6) && ($x[1] eq '.text')) {
+      $size = $x[2];
+      $vma = $x[3];
+      $file_offset = $x[5];
+      last;
+    }
+  }
+  close(OBJDUMP);
+
+  if (!defined($size)) {
+    return undef;
+  }
+
+  my $r = {};
+  $r->{size} = $size;
+  $r->{vma} = $vma;
+  $r->{file_offset} = $file_offset;
+
+  return $r;
+}
+
+# Parse text section header of a library using otool (on OS X)
+sub ParseTextSectionHeaderFromOtool {
+  my $lib = shift;
+
+  my $size = undef;
+  my $vma = undef;
+  my $file_offset = undef;
+  # Get otool output from the library file to figure out how to
+  # map between mapped addresses and addresses in the library.
+  my $command = ShellEscape($obj_tool_map{"otool"}, "-l", $lib);
+  open(OTOOL, "$command |") || error("$command: $!\n");
+  my $cmd = "";
+  my $sectname = "";
+  my $segname = "";
+  foreach my $line (<OTOOL>) {
+    $line =~ s/\r//g;      # turn windows-looking lines into unix-looking lines
+    # Load command <#>
+    #       cmd LC_SEGMENT
+    # [...]
+    # Section
+    #   sectname __text
+    #    segname __TEXT
+    #       addr 0x000009f8
+    #       size 0x00018b9e
+    #     offset 2552
+    #      align 2^2 (4)
+    # We will need to strip off the leading 0x from the hex addresses,
+    # and convert the offset into hex.
+    if ($line =~ /Load command/) {
+      $cmd = "";
+      $sectname = "";
+      $segname = "";
+    } elsif ($line =~ /Section/) {
+      $sectname = "";
+      $segname = "";
+    } elsif ($line =~ /cmd (\w+)/) {
+      $cmd = $1;
+    } elsif ($line =~ /sectname (\w+)/) {
+      $sectname = $1;
+    } elsif ($line =~ /segname (\w+)/) {
+      $segname = $1;
+    } elsif (!(($cmd eq "LC_SEGMENT" || $cmd eq "LC_SEGMENT_64") &&
+               $sectname eq "__text" &&
+               $segname eq "__TEXT")) {
+      next;
+    } elsif ($line =~ /\baddr 0x([0-9a-fA-F]+)/) {
+      $vma = $1;
+    } elsif ($line =~ /\bsize 0x([0-9a-fA-F]+)/) {
+      $size = $1;
+    } elsif ($line =~ /\boffset ([0-9]+)/) {
+      $file_offset = sprintf("%016x", $1);
+    }
+    if (defined($vma) && defined($size) && defined($file_offset)) {
+      last;
+    }
+  }
+  close(OTOOL);
+
+  if (!defined($vma) || !defined($size) || !defined($file_offset)) {
+     return undef;
+  }
+
+  my $r = {};
+  $r->{size} = $size;
+  $r->{vma} = $vma;
+  $r->{file_offset} = $file_offset;
+
+  return $r;
+}
+
+sub ParseTextSectionHeader {
+  # obj_tool_map("otool") is only defined if we're in a Mach-O environment
+  if (defined($obj_tool_map{"otool"})) {
+    my $r = ParseTextSectionHeaderFromOtool(@_);
+    if (defined($r)){
+      return $r;
+    }
+  }
+  # If otool doesn't work, or we don't have it, fall back to objdump
+  return ParseTextSectionHeaderFromObjdump(@_);
+}
+
+# Split /proc/pid/maps dump into a list of libraries
+sub ParseLibraries {
+  return if $main::use_symbol_page;  # We don't need libraries info.
+  my $prog = Cwd::abs_path(shift);
+  my $map = shift;
+  my $pcs = shift;
+
+  my $result = [];
+  my $h = "[a-f0-9]+";
+  my $zero_offset = HexExtend("0");
+
+  my $buildvar = "";
+  foreach my $l (split("\n", $map)) {
+    if ($l =~ m/^\s*build=(.*)$/) {
+      $buildvar = $1;
+    }
+
+    my $start;
+    my $finish;
+    my $offset;
+    my $lib;
+    if ($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+\.(so|dll|dylib|bundle)((\.\d+)+\w*(\.\d+){0,3})?)$/i) {
+      # Full line from /proc/self/maps.  Example:
+      #   40000000-40015000 r-xp 00000000 03:01 12845071   /lib/ld-2.3.2.so
+      $start = HexExtend($1);
+      $finish = HexExtend($2);
+      $offset = HexExtend($3);
+      $lib = $4;
+      $lib =~ s|\\|/|g;     # turn windows-style paths into unix-style paths
+    } elsif ($l =~ /^\s*($h)-($h):\s*(\S+\.so(\.\d+)*)/) {
+      # Cooked line from DumpAddressMap.  Example:
+      #   40000000-40015000: /lib/ld-2.3.2.so
+      $start = HexExtend($1);
+      $finish = HexExtend($2);
+      $offset = $zero_offset;
+      $lib = $3;
+    } elsif (($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+)$/i) && ($4 eq $prog)) {
+      # PIEs and address space randomization do not play well with our
+      # default assumption that main executable is at lowest
+      # addresses. So we're detecting main executable in
+      # /proc/self/maps as well.
+      $start = HexExtend($1);
+      $finish = HexExtend($2);
+      $offset = HexExtend($3);
+      $lib = $4;
+      $lib =~ s|\\|/|g;     # turn windows-style paths into unix-style paths
+    }
+    # FreeBSD 10.0 virtual memory map /proc/curproc/map as defined in
+    # function procfs_doprocmap (sys/fs/procfs/procfs_map.c)
+    #
+    # Example:
+    #   0x800600000 0x80061a000 26 0 0xfffff800035a0000 r-x 75 33 0x1004 COW NC
+    #   vnode /libexec/ld-elf.so.1 NCH -1
+    elsif ($l =~ /^(0x$h)\s(0x$h)\s\d+\s\d+\s0x$h\sr-x\s\d+\s\d+\s0x\d+\s(COW|NCO)\s(NC|NNC)\svnode\s(\S+\.so(\.\d+)*)/) {
+      $start = HexExtend($1);
+      $finish = HexExtend($2);
+      $offset = $zero_offset;
+      $lib = FindLibrary($5);
+
+    } else {
+      next;
+    }
+
+    # Expand "$build" variable if available
+    $lib =~ s/\$build\b/$buildvar/g;
+
+    $lib = FindLibrary($lib);
+
+    # Check for pre-relocated libraries, which use pre-relocated symbol tables
+    # and thus require adjusting the offset that we'll use to translate
+    # VM addresses into symbol table addresses.
+    # Only do this if we're not going to fetch the symbol table from a
+    # debugging copy of the library.
+    if (!DebuggingLibrary($lib)) {
+      my $text = ParseTextSectionHeader($lib);
+      if (defined($text)) {
+         my $vma_offset = AddressSub($text->{vma}, $text->{file_offset});
+         $offset = AddressAdd($offset, $vma_offset);
+      }
+    }
+
+    if($main::opt_debug) { printf STDERR "$start:$finish ($offset) $lib\n"; }
+    push(@{$result}, [$lib, $start, $finish, $offset]);
+  }
+
+  # Append special entry for additional library (not relocated)
+  if ($main::opt_lib ne "") {
+    my $text = ParseTextSectionHeader($main::opt_lib);
+    if (defined($text)) {
+       my $start = $text->{vma};
+       my $finish = AddressAdd($start, $text->{size});
+
+       push(@{$result}, [$main::opt_lib, $start, $finish, $start]);
+    }
+  }
+
+  # Append special entry for the main program.  This covers
+  # 0..max_pc_value_seen, so that we assume pc values not found in one
+  # of the library ranges will be treated as coming from the main
+  # program binary.
+  my $min_pc = HexExtend("0");
+  my $max_pc = $min_pc;          # find the maximal PC value in any sample
+  foreach my $pc (keys(%{$pcs})) {
+    if (HexExtend($pc) gt $max_pc) { $max_pc = HexExtend($pc); }
+  }
+  push(@{$result}, [$prog, $min_pc, $max_pc, $zero_offset]);
+
+  return $result;
+}
+
+# Add two hex addresses of length $address_length.
+# Run jeprof --test for unit test if this is changed.
+sub AddressAdd {
+  my $addr1 = shift;
+  my $addr2 = shift;
+  my $sum;
+
+  if ($address_length == 8) {
+    # Perl doesn't cope with wraparound arithmetic, so do it explicitly:
+    $sum = (hex($addr1)+hex($addr2)) % (0x10000000 * 16);
+    return sprintf("%08x", $sum);
+
+  } else {
+    # Do the addition in 7-nibble chunks to trivialize carry handling.
+
+    if ($main::opt_debug and $main::opt_test) {
+      print STDERR "AddressAdd $addr1 + $addr2 = ";
+    }
+
+    my $a1 = substr($addr1,-7);
+    $addr1 = substr($addr1,0,-7);
+    my $a2 = substr($addr2,-7);
+    $addr2 = substr($addr2,0,-7);
+    $sum = hex($a1) + hex($a2);
+    my $c = 0;
+    if ($sum > 0xfffffff) {
+      $c = 1;
+      $sum -= 0x10000000;
+    }
+    my $r = sprintf("%07x", $sum);
+
+    $a1 = substr($addr1,-7);
+    $addr1 = substr($addr1,0,-7);
+    $a2 = substr($addr2,-7);
+    $addr2 = substr($addr2,0,-7);
+    $sum = hex($a1) + hex($a2) + $c;
+    $c = 0;
+    if ($sum > 0xfffffff) {
+      $c = 1;
+      $sum -= 0x10000000;
+    }
+    $r = sprintf("%07x", $sum) . $r;
+
+    $sum = hex($addr1) + hex($addr2) + $c;
+    if ($sum > 0xff) { $sum -= 0x100; }
+    $r = sprintf("%02x", $sum) . $r;
+
+    if ($main::opt_debug and $main::opt_test) { print STDERR "$r\n"; }
+
+    return $r;
+  }
+}
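+
+# Editorial worked example of the 7-nibble chunking above (assuming
+# $address_length == 16):
+#   AddressAdd("00000000ffffffff", "0000000000000001") eq "0000000100000000"
+# The low chunk ("fffffff" + "0000001") overflows, so a carry of 1 is folded
+# into the middle chunk and the result stays 16 hex digits wide.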
+
+
+# Subtract two hex addresses of length $address_length.
+# Run jeprof --test for unit test if this is changed.
+sub AddressSub {
+  my $addr1 = shift;
+  my $addr2 = shift;
+  my $diff;
+
+  if ($address_length == 8) {
+    # Perl doesn't cope with wraparound arithmetic, so do it explicitly:
+    $diff = (hex($addr1)-hex($addr2)) % (0x10000000 * 16);
+    return sprintf("%08x", $diff);
+
+  } else {
+    # Do the addition in 7-nibble chunks to trivialize borrow handling.
+    # if ($main::opt_debug) { print STDERR "AddressSub $addr1 - $addr2 = "; }
+
+    my $a1 = hex(substr($addr1,-7));
+    $addr1 = substr($addr1,0,-7);
+    my $a2 = hex(substr($addr2,-7));
+    $addr2 = substr($addr2,0,-7);
+    my $b = 0;
+    if ($a2 > $a1) {
+      $b = 1;
+      $a1 += 0x10000000;
+    }
+    $diff = $a1 - $a2;
+    my $r = sprintf("%07x", $diff);
+
+    $a1 = hex(substr($addr1,-7));
+    $addr1 = substr($addr1,0,-7);
+    $a2 = hex(substr($addr2,-7)) + $b;
+    $addr2 = substr($addr2,0,-7);
+    $b = 0;
+    if ($a2 > $a1) {
+      $b = 1;
+      $a1 += 0x10000000;
+    }
+    $diff = $a1 - $a2;
+    $r = sprintf("%07x", $diff) . $r;
+
+    $a1 = hex($addr1);
+    $a2 = hex($addr2) + $b;
+    if ($a2 > $a1) { $a1 += 0x100; }
+    $diff = $a1 - $a2;
+    $r = sprintf("%02x", $diff) . $r;
+
+    # if ($main::opt_debug) { print STDERR "$r\n"; }
+
+    return $r;
+  }
+}
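+
+# Editorial counterpart for subtraction (again assuming $address_length == 16):
+#   AddressSub("0000000100000000", "0000000000000001") eq "00000000ffffffff"
+# Here the low chunk borrows from the middle chunk, mirroring the carry
+# handling in AddressAdd above.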
+
+# Increment a hex addresses of length $address_length.
+# Run jeprof --test for unit test if this is changed.
+sub AddressInc {
+  my $addr = shift;
+  my $sum;
+
+  if ($address_length == 8) {
+    # Perl doesn't cope with wraparound arithmetic, so do it explicitly:
+    $sum = (hex($addr)+1) % (0x10000000 * 16);
+    return sprintf("%08x", $sum);
+
+  } else {
+    # Do the addition in 7-nibble chunks to trivialize carry handling.
+    # We are always doing this to step through the addresses in a function,
+    # and will almost never overflow the first chunk, so we check for this
+    # case and exit early.
+
+    # if ($main::opt_debug) { print STDERR "AddressInc $addr1 = "; }
+
+    my $a1 = substr($addr,-7);
+    $addr = substr($addr,0,-7);
+    $sum = hex($a1) + 1;
+    my $r = sprintf("%07x", $sum);
+    if ($sum <= 0xfffffff) {
+      $r = $addr . $r;
+      # if ($main::opt_debug) { print STDERR "$r\n"; }
+      return HexExtend($r);
+    } else {
+      $r = "0000000";
+    }
+
+    $a1 = substr($addr,-7);
+    $addr = substr($addr,0,-7);
+    $sum = hex($a1) + 1;
+    $r = sprintf("%07x", $sum) . $r;
+    if ($sum <= 0xfffffff) {
+      $r = $addr . $r;
+      # if ($main::opt_debug) { print STDERR "$r\n"; }
+      return HexExtend($r);
+    } else {
+      $r = "00000000000000";
+    }
+
+    $sum = hex($addr) + 1;
+    if ($sum > 0xff) { $sum -= 0x100; }
+    $r = sprintf("%02x", $sum) . $r;
+
+    # if ($main::opt_debug) { print STDERR "$r\n"; }
+    return $r;
+  }
+}
+
+# Extract symbols for all PC values found in profile
+sub ExtractSymbols {
+  my $libs = shift;
+  my $pcset = shift;
+
+  my $symbols = {};
+
+  # Map each PC value to the containing library.  To make this faster,
+  # we sort libraries by their starting pc value (highest first), and
+  # advance through the libraries as we advance the pc.  Sometimes the
+  # addresses of libraries may overlap with the addresses of the main
+  # binary, so to make sure the libraries 'win', we iterate over the
+  # libraries in reverse order (which assumes the binary doesn't start
+  # in the middle of a library, which seems a fair assumption).
+  my @pcs = (sort { $a cmp $b } keys(%{$pcset}));  # pcset is 0-extended strings
+  foreach my $lib (sort {$b->[1] cmp $a->[1]} @{$libs}) {
+    my $libname = $lib->[0];
+    my $start = $lib->[1];
+    my $finish = $lib->[2];
+    my $offset = $lib->[3];
+
+    # Use debug library if it exists
+    my $debug_libname = DebuggingLibrary($libname);
+    if ($debug_libname) {
+        $libname = $debug_libname;
+    }
+
+    # Get list of pcs that belong in this library.
+    my $contained = [];
+    my ($start_pc_index, $finish_pc_index);
+    # Find smallest finish_pc_index such that $finish < $pc[$finish_pc_index].
+    for ($finish_pc_index = $#pcs + 1; $finish_pc_index > 0;
+         $finish_pc_index--) {
+      last if $pcs[$finish_pc_index - 1] le $finish;
+    }
+    # Find smallest start_pc_index such that $start <= $pc[$start_pc_index].
+    for ($start_pc_index = $finish_pc_index; $start_pc_index > 0;
+         $start_pc_index--) {
+      last if $pcs[$start_pc_index - 1] lt $start;
+    }
+    # This keeps PC values higher than $pc[$finish_pc_index] in @pcs,
+    # in case there are overlaps in libraries and the main binary.
+    @{$contained} = splice(@pcs, $start_pc_index,
+                           $finish_pc_index - $start_pc_index);
+    # Map to symbols
+    MapToSymbols($libname, AddressSub($start, $offset), $contained, $symbols);
+  }
+
+  return $symbols;
+}
+
+# Map list of PC values to symbols for a given image
+sub MapToSymbols {
+  my $image = shift;
+  my $offset = shift;
+  my $pclist = shift;
+  my $symbols = shift;
+
+  my $debug = 0;
+
+  # Ignore empty binaries
+  if ($#{$pclist} < 0) { return; }
+
+  # Figure out the addr2line command to use
+  my $addr2line = $obj_tool_map{"addr2line"};
+  my $cmd = ShellEscape($addr2line, "-f", "-C", "-e", $image);
+  if (exists $obj_tool_map{"addr2line_pdb"}) {
+    $addr2line = $obj_tool_map{"addr2line_pdb"};
+    $cmd = ShellEscape($addr2line, "--demangle", "-f", "-C", "-e", $image);
+  }
+
+  # If "addr2line" isn't installed on the system at all, just use
+  # nm to get what info we can (function names, but not line numbers).
+  if (system(ShellEscape($addr2line, "--help") . " >$dev_null 2>&1") != 0) {
+    MapSymbolsWithNM($image, $offset, $pclist, $symbols);
+    return;
+  }
+
+  # "addr2line -i" can produce a variable number of lines per input
+  # address, with no separator that allows us to tell when data for
+  # the next address starts.  So we find the address for a special
+  # symbol (_fini) and interleave this address between all real
+  # addresses passed to addr2line.  The name of this special symbol
+  # can then be used as a separator.
+  $sep_address = undef;  # May be filled in by MapSymbolsWithNM()
+  my $nm_symbols = {};
+  MapSymbolsWithNM($image, $offset, $pclist, $nm_symbols);
+  if (defined($sep_address)) {
+    # Only add " -i" to addr2line if the binary supports it.
+    # addr2line --help returns 0, but not if it sees an unknown flag first.
+    if (system("$cmd -i --help >$dev_null 2>&1") == 0) {
+      $cmd .= " -i";
+    } else {
+      $sep_address = undef;   # no need for sep_address if we don't support -i
+    }
+  }
+
+  # Make file with all PC values with intervening 'sep_address' so
+  # that we can reliably detect the end of inlined function list
+  open(ADDRESSES, ">$main::tmpfile_sym") || error("$main::tmpfile_sym: $!\n");
+  if ($debug) { print("---- $image ---\n"); }
+  for (my $i = 0; $i <= $#{$pclist}; $i++) {
+    # addr2line always reads hex addresses, and does not need '0x' prefix.
+    if ($debug) { printf STDERR ("%s\n", $pclist->[$i]); }
+    printf ADDRESSES ("%s\n", AddressSub($pclist->[$i], $offset));
+    if (defined($sep_address)) {
+      printf ADDRESSES ("%s\n", $sep_address);
+    }
+  }
+  close(ADDRESSES);
+  if ($debug) {
+    print("----\n");
+    system("cat", $main::tmpfile_sym);
+    print("----\n");
+    system("$cmd < " . ShellEscape($main::tmpfile_sym));
+    print("----\n");
+  }
+
+  open(SYMBOLS, "$cmd <" . ShellEscape($main::tmpfile_sym) . " |")
+      || error("$cmd: $!\n");
+  my $count = 0;   # Index in pclist
+  while (<SYMBOLS>) {
+    # Read fullfunction and filelineinfo from next pair of lines
+    s/\r?\n$//g;
+    my $fullfunction = $_;
+    $_ = <SYMBOLS>;
+    s/\r?\n$//g;
+    my $filelinenum = $_;
+
+    if (defined($sep_address) && $fullfunction eq $sep_symbol) {
+      # Terminating marker for data for this address
+      $count++;
+      next;
+    }
+
+    $filelinenum =~ s|\\|/|g; # turn windows-style paths into unix-style paths
+
+    my $pcstr = $pclist->[$count];
+    my $function = ShortFunctionName($fullfunction);
+    my $nms = $nm_symbols->{$pcstr};
+    if (defined($nms)) {
+      if ($fullfunction eq '??') {
+        # nm found a symbol for us.
+        $function = $nms->[0];
+        $fullfunction = $nms->[2];
+      } else {
+        # MapSymbolsWithNM tags each routine with its starting address,
+        # useful in case the image has multiple occurrences of this
+        # routine.  (It uses a syntax that resembles template parameters,
+        # that are automatically stripped out by ShortFunctionName().)
+        # addr2line does not provide the same information.  So we check
+        # if nm disambiguated our symbol, and if so take the annotated
+        # (nm) version of the routine-name.  TODO(csilvers): this won't
+        # catch overloaded, inlined symbols, which nm doesn't see.
+        # Better would be to do a check similar to nm's, in this fn.
+        if ($nms->[2] =~ m/^\Q$function\E/) {  # sanity check it's the right fn
+          $function = $nms->[0];
+          $fullfunction = $nms->[2];
+        }
+      }
+    }
+
+    # Prepend to accumulated symbols for pcstr
+    # (so that caller comes before callee)
+    my $sym = $symbols->{$pcstr};
+    if (!defined($sym)) {
+      $sym = [];
+      $symbols->{$pcstr} = $sym;
+    }
+    unshift(@{$sym}, $function, $filelinenum, $fullfunction);
+    if ($debug) { printf STDERR ("%s => [%s]\n", $pcstr, join(" ", @{$sym})); }
+    if (!defined($sep_address)) {
+      # Inlining is off, so this entry ends immediately
+      $count++;
+    }
+  }
+  close(SYMBOLS);
+}
+
+# Use nm to map the list of referenced PCs to symbols.  Return true iff we
+# are able to read procedure information via nm.
+sub MapSymbolsWithNM {
+  my $image = shift;
+  my $offset = shift;
+  my $pclist = shift;
+  my $symbols = shift;
+
+  # Get nm output sorted by increasing address
+  my $symbol_table = GetProcedureBoundaries($image, ".");
+  if (!%{$symbol_table}) {
+    return 0;
+  }
+  # Start addresses are already the right length (8 or 16 hex digits).
+  my @names = sort { $symbol_table->{$a}->[0] cmp $symbol_table->{$b}->[0] }
+    keys(%{$symbol_table});
+
+  if ($#names < 0) {
+    # No symbols: just use addresses
+    foreach my $pc (@{$pclist}) {
+      my $pcstr = "0x" . $pc;
+      $symbols->{$pc} = [$pcstr, "?", $pcstr];
+    }
+    return 0;
+  }
+
+  # Sort addresses so we can do a join against nm output
+  my $index = 0;
+  my $fullname = $names[0];
+  my $name = ShortFunctionName($fullname);
+  foreach my $pc (sort { $a cmp $b } @{$pclist}) {
+    # Adjust for mapped offset
+    my $mpc = AddressSub($pc, $offset);
+    while (($index < $#names) && ($mpc ge $symbol_table->{$fullname}->[1])){
+      $index++;
+      $fullname = $names[$index];
+      $name = ShortFunctionName($fullname);
+    }
+    if ($mpc lt $symbol_table->{$fullname}->[1]) {
+      $symbols->{$pc} = [$name, "?", $fullname];
+    } else {
+      my $pcstr = "0x" . $pc;
+      $symbols->{$pc} = [$pcstr, "?", $pcstr];
+    }
+  }
+  return 1;
+}
+
+sub ShortFunctionName {
+  my $function = shift;
+  while ($function =~ s/\([^()]*\)(\s*const)?//g) { }   # Argument types
+  while ($function =~ s/<[^<>]*>//g)  { }    # Remove template arguments
+  $function =~ s/^.*\s+(\w+::)/$1/;          # Remove leading type
+  return $function;
+}
+
+# Trim overly long symbols found in disassembler output
+sub CleanDisassembly {
+  my $d = shift;
+  while ($d =~ s/\([^()%]*\)(\s*const)?//g) { } # Argument types, not (%rax)
+  while ($d =~ s/(\w+)<[^<>]*>/$1/g)  { }       # Remove template arguments
+  return $d;
+}
+
+# Clean file name for display
+sub CleanFileName {
+  my ($f) = @_;
+  $f =~ s|^/proc/self/cwd/||;
+  $f =~ s|^\./||;
+  return $f;
+}
+
+# Make address relative to section and clean up for display
+sub UnparseAddress {
+  my ($offset, $address) = @_;
+  $address = AddressSub($address, $offset);
+  $address =~ s/^0x//;
+  $address =~ s/^0*//;
+  return $address;
+}
+
+##### Miscellaneous #####
+
+# Find the right versions of the above object tools to use.  The
+# argument is the program file being analyzed, and should be an ELF
+# 32-bit or ELF 64-bit executable file.  The location of the tools
+# is determined by considering the following options in this order:
+#   1) --tools option, if set
+#   2) JEPROF_TOOLS environment variable, if set
+#   3) the environment
+sub ConfigureObjTools {
+  my $prog_file = shift;
+
+  # Check for the existence of $prog_file because /usr/bin/file does not
+  # predictably return error status in prod.
+  (-e $prog_file)  || error("$prog_file does not exist.\n");
+
+  my $file_type = undef;
+  if (-e "/usr/bin/file") {
+    # Follow symlinks (at least for systems where "file" supports that).
+    my $escaped_prog_file = ShellEscape($prog_file);
+    $file_type = `/usr/bin/file -L $escaped_prog_file 2>$dev_null ||
+                  /usr/bin/file $escaped_prog_file`;
+  } elsif ($^O eq "MSWin32") {
+    $file_type = "MS Windows";
+  } else {
+    print STDERR "WARNING: Can't determine the file type of $prog_file\n";
+  }
+
+  if ($file_type =~ /64-bit/) {
+    # Change $address_length to 16 if the program file is ELF 64-bit.
+    # We can't detect this from many (most?) heap or lock contention
+    # profiles, since the actual addresses referenced are generally in low
+    # memory even for 64-bit programs.
+    $address_length = 16;
+  }
+
+  if ($file_type =~ /MS Windows/) {
+    # For windows, we provide a version of nm and addr2line as part of
+    # the opensource release, which is capable of parsing
+    # Windows-style PDB executables.  It should live in the path, or
+    # in the same directory as jeprof.
+    $obj_tool_map{"nm_pdb"} = "nm-pdb";
+    $obj_tool_map{"addr2line_pdb"} = "addr2line-pdb";
+  }
+
+  if ($file_type =~ /Mach-O/) {
+    # OS X uses otool to examine Mach-O files, rather than objdump.
+    $obj_tool_map{"otool"} = "otool";
+    $obj_tool_map{"addr2line"} = "false";  # no addr2line
+    $obj_tool_map{"objdump"} = "false";  # no objdump
+  }
+
+  # Go fill in %obj_tool_map with the pathnames to use:
+  foreach my $tool (keys %obj_tool_map) {
+    $obj_tool_map{$tool} = ConfigureTool($obj_tool_map{$tool});
+  }
+}
+
+# Returns the path of a caller-specified object tool.  If --tools or
+# JEPROF_TOOLS are specified, then returns the full path to the tool
+# with that prefix.  Otherwise, returns the path unmodified (which
+# means we will look for it on PATH).
+sub ConfigureTool {
+  my $tool = shift;
+  my $path;
+
+  # --tools (or $JEPROF_TOOLS) is a comma separated list, where each
+  # item is either a) a pathname prefix, or b) a map of the form
+  # <tool>:<path>.  First we look for an entry of type (b) for our
+  # tool.  If one is found, we use it.  Otherwise, we consider all the
+  # pathname prefixes in turn, until one yields an existing file.  If
+  # none does, we use a default path.
+  my $tools = $main::opt_tools || $ENV{"JEPROF_TOOLS"} || "";
+  if ($tools =~ m/(,|^)\Q$tool\E:([^,]*)/) {
+    $path = $2;
+    # TODO(csilvers): sanity-check that $path exists?  Hard if it's relative.
+  } elsif ($tools ne '') {
+    foreach my $prefix (split(',', $tools)) {
+      next if ($prefix =~ /:/);    # ignore "tool:fullpath" entries in the list
+      if (-x $prefix . $tool) {
+        $path = $prefix . $tool;
+        last;
+      }
+    }
+    if (!$path) {
+      error("No '$tool' found with prefix specified by " .
+            "--tools (or \$JEPROF_TOOLS) '$tools'\n");
+    }
+  } else {
+    # ... otherwise use the version that exists in the same directory as
+    # jeprof.  If there's nothing there, use $PATH.
+    $0 =~ m,[^/]*$,;     # this is everything after the last slash
+    my $dirname = $`;    # this is everything up to and including the last slash
+    if (-x "$dirname$tool") {
+      $path = "$dirname$tool";
+    } else {
+      $path = $tool;
+    }
+  }
+  if ($main::opt_debug) { print STDERR "Using '$path' for '$tool'.\n"; }
+  return $path;
+}
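+# Example (hypothetical paths): with --tools=/opt/cross/bin/,nm:/usr/local/bin/gnm
+# the "nm" tool resolves to /usr/local/bin/gnm via its explicit <tool>:<path>
+# entry, while "objdump" resolves to /opt/cross/bin/objdump if that file is
+# executable; if no prefix yields an executable tool, jeprof errors out.
+# Only when neither --tools nor JEPROF_TOOLS is set does the lookup fall back
+# to the directory containing jeprof and then $PATH.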
+
+sub ShellEscape {
+  my @escaped_words = ();
+  foreach my $word (@_) {
+    my $escaped_word = $word;
+    if ($word =~ m![^a-zA-Z0-9/.,_=-]!) {  # check for anything not in whitelist
+      $escaped_word =~ s/'/'\\''/g;   # escape every embedded single quote
+      $escaped_word = "'$escaped_word'";
+    }
+    push(@escaped_words, $escaped_word);
+  }
+  return join(" ", @escaped_words);
+}
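+# Example (illustrative): ShellEscape("nm", "-D", "/tmp/my lib.so") returns
+# the string  nm -D '/tmp/my lib.so'  -- only the word containing a character
+# outside the whitelist (here, the space) is wrapped in single quotes.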
+
+sub cleanup {
+  unlink($main::tmpfile_sym);
+  unlink(keys %main::tempnames);
+
+  # We leave any collected profiles in $HOME/jeprof in case the user wants
+  # to look at them later.  We print a message informing them of this.
+  if ((scalar(@main::profile_files) > 0) &&
+      defined($main::collected_profile)) {
+    if (scalar(@main::profile_files) == 1) {
+      print STDERR "Dynamically gathered profile is in $main::collected_profile\n";
+    }
+    print STDERR "If you want to investigate this profile further, you can do:\n";
+    print STDERR "\n";
+    print STDERR "  jeprof \\\n";
+    print STDERR "    $main::prog \\\n";
+    print STDERR "    $main::collected_profile\n";
+    print STDERR "\n";
+  }
+}
+
+sub sighandler {
+  cleanup();
+  exit(1);
+}
+
+sub error {
+  my $msg = shift;
+  print STDERR $msg;
+  cleanup();
+  exit(1);
+}
+
+
+# Runs $nm_command and collects all the resulting procedure boundaries whose
+# names match "$regexp", returning them in a hashtable mapping from
+# procedure name to a two-element vector of [start address, end address].
+sub GetProcedureBoundariesViaNm {
+  my $escaped_nm_command = shift;    # shell-escaped
+  my $regexp = shift;
+
+  my $symbol_table = {};
+  open(NM, "$escaped_nm_command |") || error("$escaped_nm_command: $!\n");
+  my $last_start = "0";
+  my $routine = "";
+  while (<NM>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    if (m/^\s*([0-9a-f]+) (.) (..*)/) {
+      my $start_val = $1;
+      my $type = $2;
+      my $this_routine = $3;
+
+      # It's possible for two symbols to share the same address, if
+      # one is a zero-length variable (like __start_google_malloc) or
+      # one symbol is a weak alias to another (like __libc_malloc).
+      # In such cases, we want to ignore all values except for the
+      # actual symbol, which in nm-speak has type "T".  The logic
+      # below does this, though it's a bit tricky: when we see a series of
+      # lines with the same address, the first one gets queued up to be
+      # processed.  However, it won't
+      # *actually* be processed until later, when we read a line with
+      # a different address.  That means that as long as we're reading
+      # lines with the same address, we have a chance to replace that
+      # item in the queue, which we do whenever we see a 'T' entry --
+      # that is, a line with type 'T'.  If we never see a 'T' entry,
+      # we'll just go ahead and process the first entry (which never
+      # got touched in the queue), and ignore the others.
+      if ($start_val eq $last_start && $type =~ /t/i) {
+        # We are the 'T' symbol at this address, replace previous symbol.
+        $routine = $this_routine;
+        next;
+      } elsif ($start_val eq $last_start) {
+        # We're not the 'T' symbol at this address, so ignore us.
+        next;
+      }
+
+      if ($this_routine eq $sep_symbol) {
+        $sep_address = HexExtend($start_val);
+      }
+
+      # Tag this routine with the starting address in case the image
+      # has multiple occurrences of this routine.  We use a syntax
+      # that resembles template parameters that are automatically
+      # stripped out by ShortFunctionName()
+      $this_routine .= "<$start_val>";
+
+      if (defined($routine) && $routine =~ m/$regexp/) {
+        $symbol_table->{$routine} = [HexExtend($last_start),
+                                     HexExtend($start_val)];
+      }
+      $last_start = $start_val;
+      $routine = $this_routine;
+    } elsif (m/^Loaded image name: (.+)/) {
+      # The win32 nm workalike emits information about the binary it is using.
+      if ($main::opt_debug) { print STDERR "Using Image $1\n"; }
+    } elsif (m/^PDB file name: (.+)/) {
+      # The win32 nm workalike emits information about the pdb it is using.
+      if ($main::opt_debug) { print STDERR "Using PDB $1\n"; }
+    }
+  }
+  close(NM);
+  # Handle the last line in the nm output.  Unfortunately, we don't know
+  # how big this last symbol is, because we don't know how big the file
+  # is.  For now, we just give it a size of 0.
+  # TODO(csilvers): do better here.
+  if (defined($routine) && $routine =~ m/$regexp/) {
+    $symbol_table->{$routine} = [HexExtend($last_start),
+                                 HexExtend($last_start)];
+  }
+  return $symbol_table;
+}
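+# Example (illustrative): given "nm -n" output such as
+#   0000000000401000 T main
+#   0000000000401080 T helper
+# and a matching $regexp, the table gains an entry keyed "main<0000000000401000>"
+# (the address tag keeps duplicate names distinct) whose range runs from main's
+# own address up to helper's address; helper itself gets the zero-length range
+# described above.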
+
+# Gets the procedure boundaries for all routines in "$image" whose names
+# match "$regexp" and returns them in a hashtable mapping from procedure
+# name to a two-element vector of [start address, end address].
+# Will return an empty map if nm is not installed or not working properly.
+sub GetProcedureBoundaries {
+  my $image = shift;
+  my $regexp = shift;
+
+  # If $image doesn't start with /, then put ./ in front of it.  This works
+  # around an obnoxious bug in our probing of nm -f behavior.
+  # "nm -f $image" is supposed to fail on GNU nm, but if:
+  #
+  # a. $image starts with [BbSsPp] (for example, bin/foo/bar), AND
+  # b. you have a.out in your current directory (a not uncommon occurrence)
+  #
+  # then "nm -f $image" succeeds because -f only looks at the first letter of
+  # the argument, which looks valid because it's [BbSsPp], and then since
+  # there's no image provided, it looks for a.out and finds it.
+  #
+  # This regex makes sure that $image starts with . or /, forcing the -f
+  # parsing to fail since . and / are not valid formats.
+  $image =~ s#^[^/]#./$&#;
+
+  # For libc libraries, the copy in /usr/lib/debug contains debugging symbols
+  my $debugging = DebuggingLibrary($image);
+  if ($debugging) {
+    $image = $debugging;
+  }
+
+  my $nm = $obj_tool_map{"nm"};
+  my $cppfilt = $obj_tool_map{"c++filt"};
+
+  # nm can fail for two reasons: 1) $image isn't a debug library; 2) nm
+  # binary doesn't support --demangle.  In addition, for OS X we need
+  # to use the -f flag to get 'flat' nm output (otherwise we don't sort
+  # properly and get incorrect results).  Unfortunately, GNU nm uses -f
+  # in an incompatible way.  So first we test whether our nm supports
+  # --demangle and -f.
+  my $demangle_flag = "";
+  my $cppfilt_flag = "";
+  my $to_devnull = ">$dev_null 2>&1";
+  if (system(ShellEscape($nm, "--demangle", $image) . $to_devnull) == 0) {
+    # In this mode, we do "nm --demangle <foo>"
+    $demangle_flag = "--demangle";
+    $cppfilt_flag = "";
+  } elsif (system(ShellEscape($cppfilt, $image) . $to_devnull) == 0) {
+    # In this mode, we do "nm <foo> | c++filt"
+    $cppfilt_flag = " | " . ShellEscape($cppfilt);
+  };
+  my $flatten_flag = "";
+  if (system(ShellEscape($nm, "-f", $image) . $to_devnull) == 0) {
+    $flatten_flag = "-f";
+  }
+
+  # Finally, in case $image isn't a debug library, we try again with
+  # -D to at least get *exported* symbols.  If we can't use --demangle,
+  # we use c++filt instead, if it exists on this system.
+  my @nm_commands = (ShellEscape($nm, "-n", $flatten_flag, $demangle_flag,
+                                 $image) . " 2>$dev_null $cppfilt_flag",
+                     ShellEscape($nm, "-D", "-n", $flatten_flag, $demangle_flag,
+                                 $image) . " 2>$dev_null $cppfilt_flag",
+                     # 6nm is for Go binaries
+                     ShellEscape("6nm", "$image") . " 2>$dev_null | sort",
+                     );
+
+  # If the executable is an MS Windows PDB-format executable, we'll
+  # have set up obj_tool_map("nm_pdb").  In this case, we actually
+  # want to use both unix nm and windows-specific nm_pdb, since
+  # PDB-format executables can apparently include dwarf .o files.
+  if (exists $obj_tool_map{"nm_pdb"}) {
+    push(@nm_commands,
+         ShellEscape($obj_tool_map{"nm_pdb"}, "--demangle", $image)
+         . " 2>$dev_null");
+  }
+
+  foreach my $nm_command (@nm_commands) {
+    my $symbol_table = GetProcedureBoundariesViaNm($nm_command, $regexp);
+    return $symbol_table if (%{$symbol_table});
+  }
+  my $symbol_table = {};
+  return $symbol_table;
+}
+
+
+# The test vectors for AddressAdd/Sub/Inc are 8-16-nibble hex strings.
+# To make them more readable, we add underscores at interesting places.
+# This routine removes the underscores, producing the canonical representation
+# used by jeprof to represent addresses, particularly in the tested routines.
+sub CanonicalHex {
+  my $arg = shift;
+  return join '', (split '_',$arg);
+}
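+# Example (illustrative): CanonicalHex("00_000000f_afafafa") yields
+# "00000000fafafafa"; the underscores in the test vectors below mark the
+# 7-nibble chunk boundaries used by the address arithmetic.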
+
+
+# Unit test for AddressAdd:
+sub AddressAddUnitTest {
+  my $test_data_8 = shift;
+  my $test_data_16 = shift;
+  my $error_count = 0;
+  my $fail_count = 0;
+  my $pass_count = 0;
+  # print STDERR "AddressAddUnitTest: ", 1+$#{$test_data_8}, " tests\n";
+
+  # First a few 8-nibble addresses.  Note that this implementation uses
+  # plain old arithmetic, so we do a quick sanity check and also verify
+  # what happens on overflow (we want it to wrap):
+  $address_length = 8;
+  foreach my $row (@{$test_data_8}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressAdd ($row->[0], $row->[1]);
+    if ($sum ne $row->[2]) {
+      printf STDERR "ERROR: %s != %s + %s = %s\n", $sum,
+             $row->[0], $row->[1], $row->[2];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressAdd 32-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count = $fail_count;
+  $fail_count = 0;
+  $pass_count = 0;
+
+  # Now 16-nibble addresses.
+  $address_length = 16;
+  foreach my $row (@{$test_data_16}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressAdd (CanonicalHex($row->[0]), CanonicalHex($row->[1]));
+    my $expected = CanonicalHex($row->[2]);
+    if ($sum ne $expected) {
+      printf STDERR "ERROR: %s != %s + %s = %s\n", $sum,
+             $row->[0], $row->[1], $row->[2];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressAdd 64-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count += $fail_count;
+
+  return $error_count;
+}
+
+
+# Unit test for AddressSub:
+sub AddressSubUnitTest {
+  my $test_data_8 = shift;
+  my $test_data_16 = shift;
+  my $error_count = 0;
+  my $fail_count = 0;
+  my $pass_count = 0;
+  # print STDERR "AddressSubUnitTest: ", 1+$#{$test_data_8}, " tests\n";
+
+  # First a few 8-nibble addresses.  Note that this implementation uses
+  # plain old arithmetic, so we do a quick sanity check and also verify
+  # what happens on overflow (we want it to wrap):
+  $address_length = 8;
+  foreach my $row (@{$test_data_8}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressSub ($row->[0], $row->[1]);
+    if ($sum ne $row->[3]) {
+      printf STDERR "ERROR: %s != %s - %s = %s\n", $sum,
+             $row->[0], $row->[1], $row->[3];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressSub 32-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count = $fail_count;
+  $fail_count = 0;
+  $pass_count = 0;
+
+  # Now 16-nibble addresses.
+  $address_length = 16;
+  foreach my $row (@{$test_data_16}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressSub (CanonicalHex($row->[0]), CanonicalHex($row->[1]));
+    if ($sum ne CanonicalHex($row->[3])) {
+      printf STDERR "ERROR: %s != %s - %s = %s\n", $sum,
+             $row->[0], $row->[1], $row->[3];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressSub 64-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count += $fail_count;
+
+  return $error_count;
+}
+
+
+# Unit test for AddressInc:
+sub AddressIncUnitTest {
+  my $test_data_8 = shift;
+  my $test_data_16 = shift;
+  my $error_count = 0;
+  my $fail_count = 0;
+  my $pass_count = 0;
+  # print STDERR "AddressIncUnitTest: ", 1+$#{$test_data_8}, " tests\n";
+
+  # First a few 8-nibble addresses.  Note that this implementation uses
+  # plain old arithmetic, so we do a quick sanity check and also verify
+  # what happens on overflow (we want it to wrap):
+  $address_length = 8;
+  foreach my $row (@{$test_data_8}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressInc ($row->[0]);
+    if ($sum ne $row->[4]) {
+      printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum,
+             $row->[0], $row->[4];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressInc 32-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count = $fail_count;
+  $fail_count = 0;
+  $pass_count = 0;
+
+  # Now 16-nibble addresses.
+  $address_length = 16;
+  foreach my $row (@{$test_data_16}) {
+    if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; }
+    my $sum = AddressInc (CanonicalHex($row->[0]));
+    if ($sum ne CanonicalHex($row->[4])) {
+      printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum,
+             $row->[0], $row->[4];
+      ++$fail_count;
+    } else {
+      ++$pass_count;
+    }
+  }
+  printf STDERR "AddressInc 64-bit tests: %d passes, %d failures\n",
+         $pass_count, $fail_count;
+  $error_count += $fail_count;
+
+  return $error_count;
+}
+
+
+# Driver for unit tests.
+# Currently just the address add/subtract/increment routines for 64-bit.
+sub RunUnitTests {
+  my $error_count = 0;
+
+  # This is a list of tuples [a, b, a+b, a-b, a+1]
+  my $unit_test_data_8 = [
+    [qw(aaaaaaaa 50505050 fafafafa 5a5a5a5a aaaaaaab)],
+    [qw(50505050 aaaaaaaa fafafafa a5a5a5a6 50505051)],
+    [qw(ffffffff aaaaaaaa aaaaaaa9 55555555 00000000)],
+    [qw(00000001 ffffffff 00000000 00000002 00000002)],
+    [qw(00000001 fffffff0 fffffff1 00000011 00000002)],
+  ];
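+  # For example (reading one row above): ffffffff + aaaaaaaa wraps to
+  # aaaaaaa9, ffffffff - aaaaaaaa = 55555555, and ffffffff + 1 wraps to
+  # 00000000 at 32 bits.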
+  my $unit_test_data_16 = [
+    # The implementation handles data in 7-nibble chunks, so those are the
+    # interesting boundaries.
+    [qw(aaaaaaaa 50505050
+        00_000000f_afafafa 00_0000005_a5a5a5a 00_000000a_aaaaaab)],
+    [qw(50505050 aaaaaaaa
+        00_000000f_afafafa ff_ffffffa_5a5a5a6 00_0000005_0505051)],
+    [qw(ffffffff aaaaaaaa
+        00_000001a_aaaaaa9 00_0000005_5555555 00_0000010_0000000)],
+    [qw(00000001 ffffffff
+        00_0000010_0000000 ff_ffffff0_0000002 00_0000000_0000002)],
+    [qw(00000001 fffffff0
+        00_000000f_ffffff1 ff_ffffff0_0000011 00_0000000_0000002)],
+
+    [qw(00_a00000a_aaaaaaa 50505050
+        00_a00000f_afafafa 00_a000005_a5a5a5a 00_a00000a_aaaaaab)],
+    [qw(0f_fff0005_0505050 aaaaaaaa
+        0f_fff000f_afafafa 0f_ffefffa_5a5a5a6 0f_fff0005_0505051)],
+    [qw(00_000000f_fffffff 01_800000a_aaaaaaa
+        01_800001a_aaaaaa9 fe_8000005_5555555 00_0000010_0000000)],
+    [qw(00_0000000_0000001 ff_fffffff_fffffff
+        00_0000000_0000000 00_0000000_0000002 00_0000000_0000002)],
+    [qw(00_0000000_0000001 ff_fffffff_ffffff0
+        ff_fffffff_ffffff1 00_0000000_0000011 00_0000000_0000002)],
+  ];
+
+  $error_count += AddressAddUnitTest($unit_test_data_8, $unit_test_data_16);
+  $error_count += AddressSubUnitTest($unit_test_data_8, $unit_test_data_16);
+  $error_count += AddressIncUnitTest($unit_test_data_8, $unit_test_data_16);
+  if ($error_count > 0) {
+    print STDERR $error_count, " errors: FAILED\n";
+  } else {
+    print STDERR "PASS\n";
+  }
+  exit ($error_count);
+}

+ 250 - 0
BeefRT/JEMalloc/build-aux/install-sh

@@ -0,0 +1,250 @@
+#! /bin/sh
+#
+# install - install a program, script, or datafile
+# This comes from X11R5 (mit/util/scripts/install.sh).
+#
+# Copyright 1991 by the Massachusetts Institute of Technology
+#
+# Permission to use, copy, modify, distribute, and sell this software and its
+# documentation for any purpose is hereby granted without fee, provided that
+# the above copyright notice appear in all copies and that both that
+# copyright notice and this permission notice appear in supporting
+# documentation, and that the name of M.I.T. not be used in advertising or
+# publicity pertaining to distribution of the software without specific,
+# written prior permission.  M.I.T. makes no representations about the
+# suitability of this software for any purpose.  It is provided "as is"
+# without express or implied warranty.
+#
+# Calling this script install-sh is preferred over install.sh, to prevent
+# `make' implicit rules from creating a file called install from it
+# when there is no Makefile.
+#
+# This script is compatible with the BSD install script, but was written
+# from scratch.  It can only install one file at a time, a restriction
+# shared with many OS's install programs.
+
+
+# set DOITPROG to echo to test this script
+
+# Don't use :- since 4.3BSD and earlier shells don't like it.
+doit="${DOITPROG-}"
+
+
+# put in absolute paths if you don't have them in your path; or use env. vars.
+
+mvprog="${MVPROG-mv}"
+cpprog="${CPPROG-cp}"
+chmodprog="${CHMODPROG-chmod}"
+chownprog="${CHOWNPROG-chown}"
+chgrpprog="${CHGRPPROG-chgrp}"
+stripprog="${STRIPPROG-strip}"
+rmprog="${RMPROG-rm}"
+mkdirprog="${MKDIRPROG-mkdir}"
+
+transformbasename=""
+transformarg=""
+instcmd="$mvprog"
+chmodcmd="$chmodprog 0755"
+chowncmd=""
+chgrpcmd=""
+stripcmd=""
+rmcmd="$rmprog -f"
+mvcmd="$mvprog"
+src=""
+dst=""
+dir_arg=""
+
+while [ x"$1" != x ]; do
+    case $1 in
+	-c) instcmd="$cpprog"
+	    shift
+	    continue;;
+
+	-d) dir_arg=true
+	    shift
+	    continue;;
+
+	-m) chmodcmd="$chmodprog $2"
+	    shift
+	    shift
+	    continue;;
+
+	-o) chowncmd="$chownprog $2"
+	    shift
+	    shift
+	    continue;;
+
+	-g) chgrpcmd="$chgrpprog $2"
+	    shift
+	    shift
+	    continue;;
+
+	-s) stripcmd="$stripprog"
+	    shift
+	    continue;;
+
+	-t=*) transformarg=`echo $1 | sed 's/-t=//'`
+	    shift
+	    continue;;
+
+	-b=*) transformbasename=`echo $1 | sed 's/-b=//'`
+	    shift
+	    continue;;
+
+	*)  if [ x"$src" = x ]
+	    then
+		src=$1
+	    else
+		# this colon is to work around a 386BSD /bin/sh bug
+		:
+		dst=$1
+	    fi
+	    shift
+	    continue;;
+    esac
+done
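+# Example invocation (illustrative): "install-sh -c -m 644 jemalloc.pc
+# /usr/local/lib/pkgconfig" copies rather than moves the file (because of -c)
+# and chmods the installed copy to 0644.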
+
+if [ x"$src" = x ]
+then
+	echo "install:	no input file specified"
+	exit 1
+else
+	true
+fi
+
+if [ x"$dir_arg" != x ]; then
+	dst=$src
+	src=""
+	
+	if [ -d $dst ]; then
+		instcmd=:
+	else
+		instcmd=mkdir
+	fi
+else
+
+# Waiting for this to be detected by the "$instcmd $src $dsttmp" command
+# might cause directories to be created, which would be especially bad 
+# if $src (and thus $dsttmp) contains '*'.
+
+	if [ -f $src -o -d $src ]
+	then
+		true
+	else
+		echo "install:  $src does not exist"
+		exit 1
+	fi
+	
+	if [ x"$dst" = x ]
+	then
+		echo "install:	no destination specified"
+		exit 1
+	else
+		true
+	fi
+
+# If destination is a directory, append the input filename; if your system
+# does not like double slashes in filenames, you may need to add some logic
+
+	if [ -d $dst ]
+	then
+		dst="$dst"/`basename $src`
+	else
+		true
+	fi
+fi
+
+## this sed command emulates the dirname command
+dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'`
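+# For example (illustrative): dst=/usr/local/lib/libjemalloc.so gives
+# dstdir=/usr/local/lib, while a bare dst=foo gives dstdir=. so the
+# directory-creation loop below still has a usable path.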
+
+# Make sure that the destination directory exists.
+#  this part is taken from Noah Friedman's mkinstalldirs script
+
+# Skip lots of stat calls in the usual case.
+if [ ! -d "$dstdir" ]; then
+defaultIFS='	
+'
+IFS="${IFS-${defaultIFS}}"
+
+oIFS="${IFS}"
+# Some sh's can't handle IFS=/ for some reason.
+IFS='%'
+set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'`
+IFS="${oIFS}"
+
+pathcomp=''
+
+while [ $# -ne 0 ] ; do
+	pathcomp="${pathcomp}${1}"
+	shift
+
+	if [ ! -d "${pathcomp}" ] ;
+        then
+		$mkdirprog "${pathcomp}"
+	else
+		true
+	fi
+
+	pathcomp="${pathcomp}/"
+done
+fi
+
+if [ x"$dir_arg" != x ]
+then
+	$doit $instcmd $dst &&
+
+	if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi &&
+	if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi &&
+	if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi &&
+	if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi
+else
+
+# If we're going to rename the final executable, determine the name now.
+
+	if [ x"$transformarg" = x ] 
+	then
+		dstfile=`basename $dst`
+	else
+		dstfile=`basename $dst $transformbasename | 
+			sed $transformarg`$transformbasename
+	fi
+
+# don't allow the sed command to completely eliminate the filename
+
+	if [ x"$dstfile" = x ] 
+	then
+		dstfile=`basename $dst`
+	else
+		true
+	fi
+
+# Make a temp file name in the proper directory.
+
+	dsttmp=$dstdir/#inst.$$#
+
+# Move or copy the file name to the temp name
+
+	$doit $instcmd $src $dsttmp &&
+
+	trap "rm -f ${dsttmp}" 0 &&
+
+# and set any options; do chmod last to preserve setuid bits
+
+# If any of these fail, we abort the whole thing.  If we want to
+# ignore errors from any of these, just make sure not to ignore
+# errors from the above "$doit $instcmd $src $dsttmp" command.
+
+	if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi &&
+	if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi &&
+	if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi &&
+	if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi &&
+
+# Now rename the file to the real destination.
+
+	$doit $rmcmd -f $dstdir/$dstfile &&
+	$doit $mvcmd $dsttmp $dstdir/$dstfile 
+
+fi &&
+
+
+exit 0

+ 2669 - 0
BeefRT/JEMalloc/configure.ac

@@ -0,0 +1,2669 @@
+dnl Process this file with autoconf to produce a configure script.
+AC_PREREQ(2.68)
+AC_INIT([Makefile.in])
+
+AC_CONFIG_AUX_DIR([build-aux])
+
+dnl ============================================================================
+dnl Custom macro definitions.
+
+dnl JE_CONCAT_VVV(r, a, b)
+dnl
+dnl Set $r to the concatenation of $a and $b, with a space separating them iff
+dnl both $a and $b are non-empty.
+AC_DEFUN([JE_CONCAT_VVV],
+if test "x[$]{$2}" = "x" -o "x[$]{$3}" = "x" ; then
+  $1="[$]{$2}[$]{$3}"
+else
+  $1="[$]{$2} [$]{$3}"
+fi
+)
+
+dnl JE_APPEND_VS(a, b)
+dnl
+dnl Set $a to the concatenation of $a and b, with a space separating them iff
+dnl both $a and b are non-empty.
+AC_DEFUN([JE_APPEND_VS],
+  T_APPEND_V=$2
+  JE_CONCAT_VVV($1, $1, T_APPEND_V)
+)
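+dnl For example (illustrative), JE_APPEND_VS(CPPFLAGS, -D_GNU_SOURCE) appends
+dnl "-D_GNU_SOURCE" to CPPFLAGS, inserting a separating space only when
+dnl CPPFLAGS was already non-empty.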
+
+CONFIGURE_CFLAGS=
+SPECIFIED_CFLAGS="${CFLAGS}"
+dnl JE_CFLAGS_ADD(cflag)
+dnl
+dnl CFLAGS is the concatenation of CONFIGURE_CFLAGS and SPECIFIED_CFLAGS
+dnl (ignoring EXTRA_CFLAGS, which does not impact configure tests).  This macro
+dnl appends to CONFIGURE_CFLAGS and regenerates CFLAGS.
+AC_DEFUN([JE_CFLAGS_ADD],
+[
+AC_MSG_CHECKING([whether compiler supports $1])
+T_CONFIGURE_CFLAGS="${CONFIGURE_CFLAGS}"
+JE_APPEND_VS(CONFIGURE_CFLAGS, $1)
+JE_CONCAT_VVV(CFLAGS, CONFIGURE_CFLAGS, SPECIFIED_CFLAGS)
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+[[
+]], [[
+    return 0;
+]])],
+              [je_cv_cflags_added=$1]
+              AC_MSG_RESULT([yes]),
+              [je_cv_cflags_added=]
+              AC_MSG_RESULT([no])
+              [CONFIGURE_CFLAGS="${T_CONFIGURE_CFLAGS}"]
+)
+JE_CONCAT_VVV(CFLAGS, CONFIGURE_CFLAGS, SPECIFIED_CFLAGS)
+])
+
+dnl JE_CFLAGS_SAVE()
+dnl JE_CFLAGS_RESTORE()
+dnl
+dnl Save/restore CFLAGS.  Nesting is not supported.
+AC_DEFUN([JE_CFLAGS_SAVE],
+SAVED_CONFIGURE_CFLAGS="${CONFIGURE_CFLAGS}"
+)
+AC_DEFUN([JE_CFLAGS_RESTORE],
+CONFIGURE_CFLAGS="${SAVED_CONFIGURE_CFLAGS}"
+JE_CONCAT_VVV(CFLAGS, CONFIGURE_CFLAGS, SPECIFIED_CFLAGS)
+)
+
+CONFIGURE_CXXFLAGS=
+SPECIFIED_CXXFLAGS="${CXXFLAGS}"
+dnl JE_CXXFLAGS_ADD(cxxflag)
+AC_DEFUN([JE_CXXFLAGS_ADD],
+[
+AC_MSG_CHECKING([whether compiler supports $1])
+T_CONFIGURE_CXXFLAGS="${CONFIGURE_CXXFLAGS}"
+JE_APPEND_VS(CONFIGURE_CXXFLAGS, $1)
+JE_CONCAT_VVV(CXXFLAGS, CONFIGURE_CXXFLAGS, SPECIFIED_CXXFLAGS)
+AC_LANG_PUSH([C++])
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+[[
+]], [[
+    return 0;
+]])],
+              [je_cv_cxxflags_added=$1]
+              AC_MSG_RESULT([yes]),
+              [je_cv_cxxflags_added=]
+              AC_MSG_RESULT([no])
+              [CONFIGURE_CXXFLAGS="${T_CONFIGURE_CXXFLAGS}"]
+)
+AC_LANG_POP([C++])
+JE_CONCAT_VVV(CXXFLAGS, CONFIGURE_CXXFLAGS, SPECIFIED_CXXFLAGS)
+])
+
+dnl JE_COMPILABLE(label, hcode, mcode, rvar)
+dnl
+dnl Use AC_LINK_IFELSE() rather than AC_COMPILE_IFELSE() so that linker errors
+dnl cause failure.
+AC_DEFUN([JE_COMPILABLE],
+[
+AC_CACHE_CHECK([whether $1 is compilable],
+               [$4],
+               [AC_LINK_IFELSE([AC_LANG_PROGRAM([$2],
+                                                [$3])],
+                               [$4=yes],
+                               [$4=no])])
+])
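+dnl Example usage (illustrative; mirrors the checks below):
+dnl   JE_COMPILABLE([pause instruction], [],
+dnl                 [[__asm__ volatile("pause"); return 0;]], [je_cv_pause])
+dnl sets je_cv_pause to "yes" only if the snippet both compiles and links,
+dnl and caches the result.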
+
+dnl ============================================================================
+
+CONFIG=`echo ${ac_configure_args} | sed -e 's#'"'"'\([^ ]*\)'"'"'#\1#g'`
+AC_SUBST([CONFIG])
+
+dnl Library revision.
+rev=2
+AC_SUBST([rev])
+
+srcroot=$srcdir
+if test "x${srcroot}" = "x." ; then
+  srcroot=""
+else
+  srcroot="${srcroot}/"
+fi
+AC_SUBST([srcroot])
+abs_srcroot="`cd \"${srcdir}\"; pwd`/"
+AC_SUBST([abs_srcroot])
+
+objroot=""
+AC_SUBST([objroot])
+abs_objroot="`pwd`/"
+AC_SUBST([abs_objroot])
+
+dnl Munge install path variables.
+case "$prefix" in
+   *\ * ) AC_MSG_ERROR([Prefix should not contain spaces]) ;;
+   "NONE" ) prefix="/usr/local" ;;
+esac
+case "$exec_prefix" in
+   *\ * ) AC_MSG_ERROR([Exec prefix should not contain spaces]) ;;
+   "NONE" ) exec_prefix=$prefix ;;
+esac
+PREFIX=$prefix
+AC_SUBST([PREFIX])
+BINDIR=`eval echo $bindir`
+BINDIR=`eval echo $BINDIR`
+AC_SUBST([BINDIR])
+INCLUDEDIR=`eval echo $includedir`
+INCLUDEDIR=`eval echo $INCLUDEDIR`
+AC_SUBST([INCLUDEDIR])
+LIBDIR=`eval echo $libdir`
+LIBDIR=`eval echo $LIBDIR`
+AC_SUBST([LIBDIR])
+DATADIR=`eval echo $datadir`
+DATADIR=`eval echo $DATADIR`
+AC_SUBST([DATADIR])
+MANDIR=`eval echo $mandir`
+MANDIR=`eval echo $MANDIR`
+AC_SUBST([MANDIR])
+
+dnl Support for building documentation.
+AC_PATH_PROG([XSLTPROC], [xsltproc], [false], [$PATH])
+if test -d "/usr/share/xml/docbook/stylesheet/docbook-xsl" ; then
+  DEFAULT_XSLROOT="/usr/share/xml/docbook/stylesheet/docbook-xsl"
+elif test -d "/usr/share/sgml/docbook/xsl-stylesheets" ; then
+  DEFAULT_XSLROOT="/usr/share/sgml/docbook/xsl-stylesheets"
+else
+  dnl Documentation building will fail if this default gets used.
+  DEFAULT_XSLROOT=""
+fi
+AC_ARG_WITH([xslroot],
+  [AS_HELP_STRING([--with-xslroot=<path>], [XSL stylesheet root path])], [
+if test "x$with_xslroot" = "xno" ; then
+  XSLROOT="${DEFAULT_XSLROOT}"
+else
+  XSLROOT="${with_xslroot}"
+fi
+],
+  XSLROOT="${DEFAULT_XSLROOT}"
+)
+if test "x$XSLTPROC" = "xfalse" ; then
+  XSLROOT=""
+fi
+AC_SUBST([XSLROOT])
+
+dnl If CFLAGS isn't defined, set CFLAGS to something reasonable.  Otherwise,
+dnl just prevent autoconf from molesting CFLAGS.
+CFLAGS=$CFLAGS
+AC_PROG_CC
+
+if test "x$GCC" != "xyes" ; then
+  AC_CACHE_CHECK([whether compiler is MSVC],
+                 [je_cv_msvc],
+                 [AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],
+                                                     [
+#ifndef _MSC_VER
+  int fail[-1];
+#endif
+])],
+                               [je_cv_msvc=yes],
+                               [je_cv_msvc=no])])
+fi
+
+dnl check if a cray prgenv wrapper compiler is being used
+je_cv_cray_prgenv_wrapper=""
+if test "x${PE_ENV}" != "x" ; then
+  case "${CC}" in
+    CC|cc)
+	je_cv_cray_prgenv_wrapper="yes"
+	;;
+    *)
+       ;;
+  esac
+fi
+
+AC_CACHE_CHECK([whether compiler is cray],
+              [je_cv_cray],
+              [AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],
+                                                  [
+#ifndef _CRAYC
+  int fail[-1];
+#endif
+])],
+                            [je_cv_cray=yes],
+                            [je_cv_cray=no])])
+
+if test "x${je_cv_cray}" = "xyes" ; then
+  AC_CACHE_CHECK([whether cray compiler version is 8.4],
+                [je_cv_cray_84],
+                [AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],
+                                                      [
+#if !(_RELEASE_MAJOR == 8 && _RELEASE_MINOR == 4)
+  int fail[-1];
+#endif
+])],
+                              [je_cv_cray_84=yes],
+                              [je_cv_cray_84=no])])
+fi
+
+if test "x$GCC" = "xyes" ; then
+  JE_CFLAGS_ADD([-std=gnu11])
+  if test "x$je_cv_cflags_added" = "x-std=gnu11" ; then
+    AC_DEFINE_UNQUOTED([JEMALLOC_HAS_RESTRICT], [ ], [ ])
+  else
+    JE_CFLAGS_ADD([-std=gnu99])
+    if test "x$je_cv_cflags_added" = "x-std=gnu99" ; then
+      AC_DEFINE_UNQUOTED([JEMALLOC_HAS_RESTRICT], [ ], [ ])
+    fi
+  fi
+  JE_CFLAGS_ADD([-Werror=unknown-warning-option])
+  JE_CFLAGS_ADD([-Wall])
+  JE_CFLAGS_ADD([-Wextra])
+  JE_CFLAGS_ADD([-Wshorten-64-to-32])
+  JE_CFLAGS_ADD([-Wsign-compare])
+  JE_CFLAGS_ADD([-Wundef])
+  JE_CFLAGS_ADD([-Wno-format-zero-length])
+  JE_CFLAGS_ADD([-Wpointer-arith])
+  dnl This warning triggers on the use of the universal zero initializer, which
+  dnl is a very handy idiom for things like the tcache static initializer (which
+  dnl has lots of nested structs).  See the discussion at:
+  dnl https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53119
+  JE_CFLAGS_ADD([-Wno-missing-braces])
+  dnl This one too.
+  JE_CFLAGS_ADD([-Wno-missing-field-initializers])
+  JE_CFLAGS_ADD([-Wno-missing-attributes])
+  JE_CFLAGS_ADD([-pipe])
+  JE_CFLAGS_ADD([-g3])
+elif test "x$je_cv_msvc" = "xyes" ; then
+  CC="$CC -nologo"
+  JE_CFLAGS_ADD([-Zi])
+  JE_CFLAGS_ADD([-MT])
+  JE_CFLAGS_ADD([-W3])
+  JE_CFLAGS_ADD([-FS])
+  JE_APPEND_VS(CPPFLAGS, -I${srcdir}/include/msvc_compat)
+fi
+if test "x$je_cv_cray" = "xyes" ; then
+  dnl cray compiler 8.4 has an inlining bug
+  if test "x$je_cv_cray_84" = "xyes" ; then
+    JE_CFLAGS_ADD([-hipa2])
+    JE_CFLAGS_ADD([-hnognu])
+  fi
+  dnl ignore unreachable code warning
+  JE_CFLAGS_ADD([-hnomessage=128])
+  dnl ignore redefinition of "malloc", "free", etc warning
+  JE_CFLAGS_ADD([-hnomessage=1357])
+fi
+AC_SUBST([CONFIGURE_CFLAGS])
+AC_SUBST([SPECIFIED_CFLAGS])
+AC_SUBST([EXTRA_CFLAGS])
+AC_PROG_CPP
+
+AC_ARG_ENABLE([cxx],
+  [AS_HELP_STRING([--disable-cxx], [Disable C++ integration])],
+if test "x$enable_cxx" = "xno" ; then
+  enable_cxx="0"
+else
+  enable_cxx="1"
+fi
+,
+enable_cxx="1"
+)
+if test "x$enable_cxx" = "x1" ; then
+  dnl Require at least c++14, which is the first version to support sized
+  dnl deallocation.  C++ support is not compiled otherwise.
+  m4_include([m4/ax_cxx_compile_stdcxx.m4])
+  AX_CXX_COMPILE_STDCXX([17], [noext], [optional])
+  if test "x${HAVE_CXX17}" != "x1"; then
+    AX_CXX_COMPILE_STDCXX([14], [noext], [optional])
+  fi
+  if test "x${HAVE_CXX14}" = "x1" -o "x${HAVE_CXX17}" = "x1"; then
+    JE_CXXFLAGS_ADD([-Wall])
+    JE_CXXFLAGS_ADD([-Wextra])
+    JE_CXXFLAGS_ADD([-g3])
+
+    SAVED_LIBS="${LIBS}"
+    JE_APPEND_VS(LIBS, -lstdc++)
+    JE_COMPILABLE([libstdc++ linkage], [
+#include <stdlib.h>
+], [[
+	int *arr = (int *)malloc(sizeof(int) * 42);
+	if (arr == NULL)
+		return 1;
+]], [je_cv_libstdcxx])
+    if test "x${je_cv_libstdcxx}" = "xno" ; then
+      LIBS="${SAVED_LIBS}"
+    fi
+  else
+    enable_cxx="0"
+  fi
+fi
+if test "x$enable_cxx" = "x1"; then
+  AC_DEFINE([JEMALLOC_ENABLE_CXX], [ ], [ ])
+fi
+AC_SUBST([enable_cxx])
+AC_SUBST([CONFIGURE_CXXFLAGS])
+AC_SUBST([SPECIFIED_CXXFLAGS])
+AC_SUBST([EXTRA_CXXFLAGS])
+
+AC_C_BIGENDIAN([ac_cv_big_endian=1], [ac_cv_big_endian=0])
+if test "x${ac_cv_big_endian}" = "x1" ; then
+  AC_DEFINE_UNQUOTED([JEMALLOC_BIG_ENDIAN], [ ], [ ])
+fi
+
+if test "x${je_cv_msvc}" = "xyes" -a "x${ac_cv_header_inttypes_h}" = "xno"; then
+  JE_APPEND_VS(CPPFLAGS, -I${srcdir}/include/msvc_compat/C99)
+fi
+
+if test "x${je_cv_msvc}" = "xyes" ; then
+  LG_SIZEOF_PTR=LG_SIZEOF_PTR_WIN
+  AC_MSG_RESULT([Using a predefined value for sizeof(void *): 4 for 32-bit, 8 for 64-bit])
+else
+  AC_CHECK_SIZEOF([void *])
+  if test "x${ac_cv_sizeof_void_p}" = "x8" ; then
+    LG_SIZEOF_PTR=3
+  elif test "x${ac_cv_sizeof_void_p}" = "x4" ; then
+    LG_SIZEOF_PTR=2
+  else
+    AC_MSG_ERROR([Unsupported pointer size: ${ac_cv_sizeof_void_p}])
+  fi
+fi
+AC_DEFINE_UNQUOTED([LG_SIZEOF_PTR], [$LG_SIZEOF_PTR], [ ])
+
+AC_CHECK_SIZEOF([int])
+if test "x${ac_cv_sizeof_int}" = "x8" ; then
+  LG_SIZEOF_INT=3
+elif test "x${ac_cv_sizeof_int}" = "x4" ; then
+  LG_SIZEOF_INT=2
+else
+  AC_MSG_ERROR([Unsupported int size: ${ac_cv_sizeof_int}])
+fi
+AC_DEFINE_UNQUOTED([LG_SIZEOF_INT], [$LG_SIZEOF_INT], [ ])
+
+AC_CHECK_SIZEOF([long])
+if test "x${ac_cv_sizeof_long}" = "x8" ; then
+  LG_SIZEOF_LONG=3
+elif test "x${ac_cv_sizeof_long}" = "x4" ; then
+  LG_SIZEOF_LONG=2
+else
+  AC_MSG_ERROR([Unsupported long size: ${ac_cv_sizeof_long}])
+fi
+AC_DEFINE_UNQUOTED([LG_SIZEOF_LONG], [$LG_SIZEOF_LONG], [ ])
+
+AC_CHECK_SIZEOF([long long])
+if test "x${ac_cv_sizeof_long_long}" = "x8" ; then
+  LG_SIZEOF_LONG_LONG=3
+elif test "x${ac_cv_sizeof_long_long}" = "x4" ; then
+  LG_SIZEOF_LONG_LONG=2
+else
+  AC_MSG_ERROR([Unsupported long long size: ${ac_cv_sizeof_long_long}])
+fi
+AC_DEFINE_UNQUOTED([LG_SIZEOF_LONG_LONG], [$LG_SIZEOF_LONG_LONG], [ ])
+
+AC_CHECK_SIZEOF([intmax_t])
+if test "x${ac_cv_sizeof_intmax_t}" = "x16" ; then
+  LG_SIZEOF_INTMAX_T=4
+elif test "x${ac_cv_sizeof_intmax_t}" = "x8" ; then
+  LG_SIZEOF_INTMAX_T=3
+elif test "x${ac_cv_sizeof_intmax_t}" = "x4" ; then
+  LG_SIZEOF_INTMAX_T=2
+else
+  AC_MSG_ERROR([Unsupported intmax_t size: ${ac_cv_sizeof_intmax_t}])
+fi
+AC_DEFINE_UNQUOTED([LG_SIZEOF_INTMAX_T], [$LG_SIZEOF_INTMAX_T], [ ])
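+dnl Note (explanatory): each LG_SIZEOF_* value is the base-2 logarithm of the
+dnl type's size in bytes, so an 8-byte type yields 3 (2^3 = 8) and a 4-byte
+dnl type yields 2.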
+
+AC_CANONICAL_HOST
+dnl CPU-specific settings.
+CPU_SPINWAIT=""
+case "${host_cpu}" in
+  i686|x86_64)
+	HAVE_CPU_SPINWAIT=1
+	if test "x${je_cv_msvc}" = "xyes" ; then
+	    AC_CACHE_VAL([je_cv_pause_msvc],
+	      [JE_COMPILABLE([pause instruction MSVC], [],
+					[[_mm_pause(); return 0;]],
+					[je_cv_pause_msvc])])
+	    if test "x${je_cv_pause_msvc}" = "xyes" ; then
+		CPU_SPINWAIT='_mm_pause()'
+	    fi
+	else
+	    AC_CACHE_VAL([je_cv_pause],
+	      [JE_COMPILABLE([pause instruction], [],
+					[[__asm__ volatile("pause"); return 0;]],
+					[je_cv_pause])])
+	    if test "x${je_cv_pause}" = "xyes" ; then
+		CPU_SPINWAIT='__asm__ volatile("pause")'
+	    fi
+	fi
+	;;
+  aarch64|arm*)
+	HAVE_CPU_SPINWAIT=1
+	dnl isb is a better equivalent to the pause instruction on x86.
+	AC_CACHE_VAL([je_cv_isb],
+	  [JE_COMPILABLE([isb instruction], [],
+			[[__asm__ volatile("isb"); return 0;]],
+			[je_cv_isb])])
+	if test "x${je_cv_isb}" = "xyes" ; then
+	    CPU_SPINWAIT='__asm__ volatile("isb")'
+	fi
+	;;
+  *)
+	HAVE_CPU_SPINWAIT=0
+	;;
+esac
+AC_DEFINE_UNQUOTED([HAVE_CPU_SPINWAIT], [$HAVE_CPU_SPINWAIT], [ ])
+AC_DEFINE_UNQUOTED([CPU_SPINWAIT], [$CPU_SPINWAIT], [ ])
+
+AC_ARG_WITH([lg_vaddr],
+  [AS_HELP_STRING([--with-lg-vaddr=<lg-vaddr>], [Number of significant virtual address bits])],
+  [LG_VADDR="$with_lg_vaddr"], [LG_VADDR="detect"])
+
+case "${host_cpu}" in
+  aarch64)
+    if test "x$LG_VADDR" = "xdetect"; then
+      AC_MSG_CHECKING([number of significant virtual address bits])
+      if test "x${LG_SIZEOF_PTR}" = "x2" ; then
+        #aarch64 ILP32
+        LG_VADDR=32
+      else
+        #aarch64 LP64
+        LG_VADDR=48
+      fi
+      AC_MSG_RESULT([$LG_VADDR])
+    fi
+    ;;
+  x86_64)
+    if test "x$LG_VADDR" = "xdetect"; then
+      AC_CACHE_CHECK([number of significant virtual address bits],
+                     [je_cv_lg_vaddr],
+                     AC_RUN_IFELSE([AC_LANG_PROGRAM(
+[[
+#include <stdio.h>
+#ifdef _WIN32
+#include <limits.h>
+#include <intrin.h>
+typedef unsigned __int32 uint32_t;
+#else
+#include <stdint.h>
+#endif
+]], [[
+	uint32_t r[[4]];
+	uint32_t eax_in = 0x80000008U;
+#ifdef _WIN32
+	__cpuid((int *)r, (int)eax_in);
+#else
+	asm volatile ("cpuid"
+	    : "=a" (r[[0]]), "=b" (r[[1]]), "=c" (r[[2]]), "=d" (r[[3]])
+	    : "a" (eax_in), "c" (0)
+	);
+#endif
+	uint32_t eax_out = r[[0]];
+	uint32_t vaddr = ((eax_out & 0x0000ff00U) >> 8);
+	FILE *f = fopen("conftest.out", "w");
+	if (f == NULL) {
+		return 1;
+	}
+	if (vaddr > (sizeof(void *) << 3)) {
+		vaddr = sizeof(void *) << 3;
+	}
+	fprintf(f, "%u", vaddr);
+	fclose(f);
+	return 0;
+]])],
+                   [je_cv_lg_vaddr=`cat conftest.out`],
+                   [je_cv_lg_vaddr=error],
+                   [je_cv_lg_vaddr=57]))
+      if test "x${je_cv_lg_vaddr}" != "x" ; then
+        LG_VADDR="${je_cv_lg_vaddr}"
+      fi
+      if test "x${LG_VADDR}" != "xerror" ; then
+        AC_DEFINE_UNQUOTED([LG_VADDR], [$LG_VADDR], [ ])
+      else
+        AC_MSG_ERROR([cannot determine number of significant virtual address bits])
+      fi
+    fi
+    ;;
+  *)
+    if test "x$LG_VADDR" = "xdetect"; then
+      AC_MSG_CHECKING([number of significant virtual address bits])
+      if test "x${LG_SIZEOF_PTR}" = "x3" ; then
+        LG_VADDR=64
+      elif test "x${LG_SIZEOF_PTR}" = "x2" ; then
+        LG_VADDR=32
+      elif test "x${LG_SIZEOF_PTR}" = "xLG_SIZEOF_PTR_WIN" ; then
+        LG_VADDR="(1U << (LG_SIZEOF_PTR_WIN+3))"
+      else
+        AC_MSG_ERROR([Unsupported lg(pointer size): ${LG_SIZEOF_PTR}])
+      fi
+      AC_MSG_RESULT([$LG_VADDR])
+    fi
+    ;;
+esac
+AC_DEFINE_UNQUOTED([LG_VADDR], [$LG_VADDR], [ ])
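+dnl Example (illustrative): a typical x86_64 Linux host reports 48 significant
+dnl virtual address bits via CPUID leaf 0x80000008, so LG_VADDR becomes 48;
+dnl the AC_RUN_IFELSE fallback above assumes 57 when cross-compiling.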
+
+LD_PRELOAD_VAR="LD_PRELOAD"
+so="so"
+importlib="${so}"
+o="$ac_objext"
+a="a"
+exe="$ac_exeext"
+libprefix="lib"
+link_whole_archive="0"
+DSO_LDFLAGS='-shared -Wl,-soname,$(@F)'
+RPATH='-Wl,-rpath,$(1)'
+SOREV="${so}.${rev}"
+PIC_CFLAGS='-fPIC -DPIC'
+CTARGET='-o $@'
+LDTARGET='-o $@'
+TEST_LD_MODE=
+EXTRA_LDFLAGS=
+ARFLAGS='crus'
+AROUT=' $@'
+CC_MM=1
+
+if test "x$je_cv_cray_prgenv_wrapper" = "xyes" ; then
+  TEST_LD_MODE='-dynamic'
+fi
+
+if test "x${je_cv_cray}" = "xyes" ; then
+  CC_MM=
+fi
+
+AN_MAKEVAR([AR], [AC_PROG_AR])
+AN_PROGRAM([ar], [AC_PROG_AR])
+AC_DEFUN([AC_PROG_AR], [AC_CHECK_TOOL(AR, ar, :)])
+AC_PROG_AR
+
+AN_MAKEVAR([NM], [AC_PROG_NM])
+AN_PROGRAM([nm], [AC_PROG_NM])
+AC_DEFUN([AC_PROG_NM], [AC_CHECK_TOOL(NM, nm, :)])
+AC_PROG_NM
+
+AC_PROG_AWK
+
+dnl ============================================================================
+dnl jemalloc version.
+dnl
+
+AC_ARG_WITH([version],
+  [AS_HELP_STRING([--with-version=<major>.<minor>.<bugfix>-<nrev>-g<gid>],
+   [Version string])],
+  [
+    echo "${with_version}" | grep ['^[0-9]\+\.[0-9]\+\.[0-9]\+-[0-9]\+-g[0-9a-f]\+$'] 2>&1 1>/dev/null
+    if test $? -eq 0 ; then
+      echo "$with_version" > "${objroot}VERSION"
+    else
+      echo "${with_version}" | grep ['^VERSION$'] 2>&1 1>/dev/null
+      if test $? -ne 0 ; then
+        AC_MSG_ERROR([${with_version} does not match <major>.<minor>.<bugfix>-<nrev>-g<gid> or VERSION])
+      fi
+    fi
+  ], [
+    dnl Set VERSION if source directory is inside a git repository.
+    if test "x`test ! \"${srcroot}\" && cd \"${srcroot}\"; git rev-parse --is-inside-work-tree 2>/dev/null`" = "xtrue" ; then
+      dnl Pattern globs aren't powerful enough to match both single- and
+      dnl double-digit version numbers, so iterate over patterns to support up
+      dnl to version 99.99.99 without any accidental matches.
+      for pattern in ['[0-9].[0-9].[0-9]' '[0-9].[0-9].[0-9][0-9]' \
+                     '[0-9].[0-9][0-9].[0-9]' '[0-9].[0-9][0-9].[0-9][0-9]' \
+                     '[0-9][0-9].[0-9].[0-9]' '[0-9][0-9].[0-9].[0-9][0-9]' \
+                     '[0-9][0-9].[0-9][0-9].[0-9]' \
+                     '[0-9][0-9].[0-9][0-9].[0-9][0-9]']; do
+        (test ! "${srcroot}" && cd "${srcroot}"; git describe --long --abbrev=40 --match="${pattern}") > "${objroot}VERSION.tmp" 2>/dev/null
+        if test $? -eq 0 ; then
+          mv "${objroot}VERSION.tmp" "${objroot}VERSION"
+          break
+        fi
+      done
+    fi
+    rm -f "${objroot}VERSION.tmp"
+  ])
+
+if test ! -e "${objroot}VERSION" ; then
+  if test ! -e "${srcroot}VERSION" ; then
+    AC_MSG_RESULT(
+      [Missing VERSION file, and unable to generate it; creating bogus VERSION])
+    echo "0.0.0-0-g000000missing_version_try_git_fetch_tags" > "${objroot}VERSION"
+  else
+    cp ${srcroot}VERSION ${objroot}VERSION
+  fi
+fi
+jemalloc_version=`cat "${objroot}VERSION"`
+jemalloc_version_major=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]1}'`
+jemalloc_version_minor=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]2}'`
+jemalloc_version_bugfix=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]3}'`
+jemalloc_version_nrev=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]4}'`
+jemalloc_version_gid=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]5}'`
+AC_SUBST([jemalloc_version])
+AC_SUBST([jemalloc_version_major])
+AC_SUBST([jemalloc_version_minor])
+AC_SUBST([jemalloc_version_bugfix])
+AC_SUBST([jemalloc_version_nrev])
+AC_SUBST([jemalloc_version_gid])
+
+dnl Platform-specific settings.  abi and RPATH can probably be determined
+dnl programmatically, but doing so is error-prone, which makes it generally
+dnl not worth the trouble.
+dnl
+dnl Define cpp macros in CPPFLAGS, rather than doing AC_DEFINE(macro), since the
+dnl definitions need to be seen before any headers are included, which is a pain
+dnl to make happen otherwise.
+default_retain="0"
+zero_realloc_default_free="0"
+maps_coalesce="1"
+DUMP_SYMS="${NM} -a"
+SYM_PREFIX=""
+case "${host}" in
+  *-*-darwin* | *-*-ios*)
+	abi="macho"
+	RPATH=""
+	LD_PRELOAD_VAR="DYLD_INSERT_LIBRARIES"
+	so="dylib"
+	importlib="${so}"
+	force_tls="0"
+	DSO_LDFLAGS='-shared -Wl,-install_name,$(LIBDIR)/$(@F)'
+	SOREV="${rev}.${so}"
+	sbrk_deprecated="1"
+	SYM_PREFIX="_"
+	;;
+  *-*-freebsd*)
+	JE_APPEND_VS(CPPFLAGS, -D_BSD_SOURCE)
+	abi="elf"
+	AC_DEFINE([JEMALLOC_SYSCTL_VM_OVERCOMMIT], [ ], [ ])
+	force_lazy_lock="1"
+	;;
+  *-*-dragonfly*)
+	abi="elf"
+	;;
+  *-*-openbsd*)
+	abi="elf"
+	force_tls="0"
+	;;
+  *-*-bitrig*)
+	abi="elf"
+	;;
+  *-*-linux-android*)
+	dnl syscall(2) and secure_getenv(3) are exposed by _GNU_SOURCE.
+	JE_APPEND_VS(CPPFLAGS, -D_GNU_SOURCE)
+	abi="elf"
+	glibc="0"
+	AC_DEFINE([JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS], [ ], [ ])
+	AC_DEFINE([JEMALLOC_HAS_ALLOCA_H], [ ], [ ])
+	AC_DEFINE([JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY], [ ], [ ])
+	AC_DEFINE([JEMALLOC_THREADED_INIT], [ ], [ ])
+	AC_DEFINE([JEMALLOC_C11_ATOMICS], [ ], [ ])
+	force_tls="0"
+	if test "${LG_SIZEOF_PTR}" = "3"; then
+	  default_retain="1"
+	fi
+	zero_realloc_default_free="1"
+	;;
+  *-*-linux*)
+	dnl syscall(2) and secure_getenv(3) are exposed by _GNU_SOURCE.
+	JE_APPEND_VS(CPPFLAGS, -D_GNU_SOURCE)
+	abi="elf"
+	glibc="1"
+	AC_DEFINE([JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS], [ ], [ ])
+	AC_DEFINE([JEMALLOC_HAS_ALLOCA_H], [ ], [ ])
+	AC_DEFINE([JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY], [ ], [ ])
+	AC_DEFINE([JEMALLOC_THREADED_INIT], [ ], [ ])
+	AC_DEFINE([JEMALLOC_USE_CXX_THROW], [ ], [ ])
+	if test "${LG_SIZEOF_PTR}" = "3"; then
+	  default_retain="1"
+	fi
+	zero_realloc_default_free="1"
+	;;
+  *-*-kfreebsd*)
+	dnl syscall(2) and secure_getenv(3) are exposed by _GNU_SOURCE.
+	JE_APPEND_VS(CPPFLAGS, -D_GNU_SOURCE)
+	abi="elf"
+	AC_DEFINE([JEMALLOC_HAS_ALLOCA_H], [ ], [ ])
+	AC_DEFINE([JEMALLOC_SYSCTL_VM_OVERCOMMIT], [ ], [ ])
+	AC_DEFINE([JEMALLOC_THREADED_INIT], [ ], [ ])
+	AC_DEFINE([JEMALLOC_USE_CXX_THROW], [ ], [ ])
+	;;
+  *-*-netbsd*)
+	AC_MSG_CHECKING([ABI])
+        AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+[[#ifdef __ELF__
+/* ELF */
+#else
+#error aout
+#endif
+]])],
+                          [abi="elf"],
+                          [abi="aout"])
+	AC_MSG_RESULT([$abi])
+	;;
+  *-*-solaris2*)
+	abi="elf"
+	RPATH='-Wl,-R,$(1)'
+	dnl Solaris needs this for sigwait().
+	JE_APPEND_VS(CPPFLAGS, -D_POSIX_PTHREAD_SEMANTICS)
+	JE_APPEND_VS(LIBS, -lposix4 -lsocket -lnsl)
+	;;
+  *-ibm-aix*)
+	if test "${LG_SIZEOF_PTR}" = "3"; then
+	  dnl 64bit AIX
+	  LD_PRELOAD_VAR="LDR_PRELOAD64"
+	else
+	  dnl 32bit AIX
+	  LD_PRELOAD_VAR="LDR_PRELOAD"
+	fi
+	abi="xcoff"
+	;;
+  *-*-mingw* | *-*-cygwin*)
+	abi="pecoff"
+	force_tls="0"
+	maps_coalesce="0"
+	RPATH=""
+	so="dll"
+	if test "x$je_cv_msvc" = "xyes" ; then
+	  importlib="lib"
+	  DSO_LDFLAGS="-LD"
+	  EXTRA_LDFLAGS="-link -DEBUG"
+	  CTARGET='-Fo$@'
+	  LDTARGET='-Fe$@'
+	  AR='lib'
+	  ARFLAGS='-nologo -out:'
+	  AROUT='$@'
+	  CC_MM=
+        else
+	  importlib="${so}"
+	  DSO_LDFLAGS="-shared"
+	  link_whole_archive="1"
+	fi
+	case "${host}" in
+	  *-*-cygwin*)
+	    DUMP_SYMS="dumpbin /SYMBOLS"
+	    ;;
+	  *)
+	    ;;
+	esac
+	a="lib"
+	libprefix=""
+	SOREV="${so}"
+	PIC_CFLAGS=""
+	if test "${LG_SIZEOF_PTR}" = "3"; then
+	  default_retain="1"
+	fi
+	zero_realloc_default_free="1"
+	;;
+  *-*-nto-qnx)
+	abi="elf"
+	force_tls="0"
+	AC_DEFINE([JEMALLOC_HAS_ALLOCA_H], [ ], [ ])
+	;;
+  *)
+	AC_MSG_RESULT([Unsupported operating system: ${host}])
+	abi="elf"
+	;;
+esac
+
+JEMALLOC_USABLE_SIZE_CONST=const
+AC_CHECK_HEADERS([malloc.h], [
+  AC_MSG_CHECKING([whether malloc_usable_size definition can use const argument])
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+    [#include <malloc.h>
+     #include <stddef.h>
+    size_t malloc_usable_size(const void *ptr);
+    ],
+    [])],[
+                AC_MSG_RESULT([yes])
+         ],[
+                JEMALLOC_USABLE_SIZE_CONST=
+                AC_MSG_RESULT([no])
+         ])
+])
+AC_DEFINE_UNQUOTED([JEMALLOC_USABLE_SIZE_CONST], [$JEMALLOC_USABLE_SIZE_CONST], [ ])
+AC_SUBST([abi])
+AC_SUBST([RPATH])
+AC_SUBST([LD_PRELOAD_VAR])
+AC_SUBST([so])
+AC_SUBST([importlib])
+AC_SUBST([o])
+AC_SUBST([a])
+AC_SUBST([exe])
+AC_SUBST([libprefix])
+AC_SUBST([link_whole_archive])
+AC_SUBST([DSO_LDFLAGS])
+AC_SUBST([EXTRA_LDFLAGS])
+AC_SUBST([SOREV])
+AC_SUBST([PIC_CFLAGS])
+AC_SUBST([CTARGET])
+AC_SUBST([LDTARGET])
+AC_SUBST([TEST_LD_MODE])
+AC_SUBST([MKLIB])
+AC_SUBST([ARFLAGS])
+AC_SUBST([AROUT])
+AC_SUBST([DUMP_SYMS])
+AC_SUBST([CC_MM])
+
+dnl Determine whether libm must be linked to use e.g. log(3).
+AC_SEARCH_LIBS([log], [m], , [AC_MSG_ERROR([Missing math functions])])
+if test "x$ac_cv_search_log" != "xnone required" ; then
+  LM="$ac_cv_search_log"
+else
+  LM=
+fi
+AC_SUBST(LM)
+
+JE_COMPILABLE([__attribute__ syntax],
+              [static __attribute__((unused)) void foo(void){}],
+              [],
+              [je_cv_attribute])
+if test "x${je_cv_attribute}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_ATTR], [ ], [ ])
+  if test "x${GCC}" = "xyes" -a "x${abi}" = "xelf"; then
+    JE_CFLAGS_ADD([-fvisibility=hidden])
+    JE_CXXFLAGS_ADD([-fvisibility=hidden])
+  fi
+fi
+dnl Check for tls_model attribute support (clang 3.0 still lacks support).
+JE_CFLAGS_SAVE()
+JE_CFLAGS_ADD([-Werror])
+JE_CFLAGS_ADD([-herror_on_warning])
+JE_COMPILABLE([tls_model attribute], [],
+              [static __thread int
+               __attribute__((tls_model("initial-exec"), unused)) foo;
+               foo = 0;],
+              [je_cv_tls_model])
+JE_CFLAGS_RESTORE()
+dnl (Setting of JEMALLOC_TLS_MODEL is done later, after we've checked for
+dnl --disable-initial-exec-tls)
+
+dnl Check for alloc_size attribute support.
+JE_CFLAGS_SAVE()
+JE_CFLAGS_ADD([-Werror])
+JE_CFLAGS_ADD([-herror_on_warning])
+JE_COMPILABLE([alloc_size attribute], [#include <stdlib.h>],
+              [void *foo(size_t size) __attribute__((alloc_size(1)));],
+              [je_cv_alloc_size])
+JE_CFLAGS_RESTORE()
+if test "x${je_cv_alloc_size}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_ATTR_ALLOC_SIZE], [ ], [ ])
+fi
+dnl Check for format(gnu_printf, ...) attribute support.
+JE_CFLAGS_SAVE()
+JE_CFLAGS_ADD([-Werror])
+JE_CFLAGS_ADD([-herror_on_warning])
+JE_COMPILABLE([format(gnu_printf, ...) attribute], [#include <stdlib.h>],
+              [void *foo(const char *format, ...) __attribute__((format(gnu_printf, 1, 2)));],
+              [je_cv_format_gnu_printf])
+JE_CFLAGS_RESTORE()
+if test "x${je_cv_format_gnu_printf}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF], [ ], [ ])
+fi
+dnl Check for format(printf, ...) attribute support.
+JE_CFLAGS_SAVE()
+JE_CFLAGS_ADD([-Werror])
+JE_CFLAGS_ADD([-herror_on_warning])
+JE_COMPILABLE([format(printf, ...) attribute], [#include <stdlib.h>],
+              [void *foo(const char *format, ...) __attribute__((format(printf, 1, 2)));],
+              [je_cv_format_printf])
+JE_CFLAGS_RESTORE()
+if test "x${je_cv_format_printf}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_ATTR_FORMAT_PRINTF], [ ], [ ])
+fi
+
+dnl Check for format_arg(...) attribute support.
+JE_CFLAGS_SAVE()
+JE_CFLAGS_ADD([-Werror])
+JE_CFLAGS_ADD([-herror_on_warning])
+JE_COMPILABLE([format(printf, ...) attribute], [#include <stdlib.h>],
+              [const char * __attribute__((__format_arg__(1))) foo(const char *format);],
+              [je_cv_format_arg])
+JE_CFLAGS_RESTORE()
+if test "x${je_cv_format_arg}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_ATTR_FORMAT_ARG], [ ], [ ])
+fi
+
+dnl Check for fallthrough attribute support.
+JE_CFLAGS_SAVE()
+JE_CFLAGS_ADD([-Wimplicit-fallthrough])
+JE_COMPILABLE([fallthrough attribute],
+              [#if !__has_attribute(fallthrough)
+               #error "foo"
+               #endif],
+              [int x = 0;
+               switch (x) {
+               case 0: __attribute__((__fallthrough__));
+               case 1: return 1;
+               }],
+              [je_cv_fallthrough])
+JE_CFLAGS_RESTORE()
+if test "x${je_cv_fallthrough}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_ATTR_FALLTHROUGH], [ ], [ ])
+  JE_CFLAGS_ADD([-Wimplicit-fallthrough])
+  JE_CXXFLAGS_ADD([-Wimplicit-fallthrough])
+fi
+
+dnl Check for cold attribute support.
+JE_CFLAGS_SAVE()
+JE_CFLAGS_ADD([-Werror])
+JE_CFLAGS_ADD([-herror_on_warning])
+JE_COMPILABLE([cold attribute], [],
+              [__attribute__((__cold__)) void foo();],
+              [je_cv_cold])
+JE_CFLAGS_RESTORE()
+if test "x${je_cv_cold}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_ATTR_COLD], [ ], [ ])
+fi
+
+dnl Check for VM_MAKE_TAG for mmap support.
+JE_COMPILABLE([vm_make_tag],
+	      [#include <sys/mman.h>
+	       #include <mach/vm_statistics.h>],
+	      [void *p;
+	       p = mmap(0, 16, PROT_READ, MAP_ANON|MAP_PRIVATE, VM_MAKE_TAG(1), 0);
+	       munmap(p, 16);],
+	      [je_cv_vm_make_tag])
+if test "x${je_cv_vm_make_tag}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_VM_MAKE_TAG], [ ], [ ])
+fi
+
+dnl Support optional additions to rpath.
+AC_ARG_WITH([rpath],
+  [AS_HELP_STRING([--with-rpath=<rpath>], [Colon-separated rpath (ELF systems only)])],
+if test "x$with_rpath" = "xno" ; then
+  RPATH_EXTRA=
+else
+  RPATH_EXTRA="`echo $with_rpath | tr \":\" \" \"`"
+fi,
+  RPATH_EXTRA=
+)
+AC_SUBST([RPATH_EXTRA])
+
+dnl Disable rules that do automatic regeneration of configure output by default.
+AC_ARG_ENABLE([autogen],
+  [AS_HELP_STRING([--enable-autogen], [Automatically regenerate configure output])],
+if test "x$enable_autogen" = "xno" ; then
+  enable_autogen="0"
+else
+  enable_autogen="1"
+fi
+,
+enable_autogen="0"
+)
+AC_SUBST([enable_autogen])
+
+AC_PROG_INSTALL
+AC_PROG_RANLIB
+AC_PATH_PROG([LD], [ld], [false], [$PATH])
+AC_PATH_PROG([AUTOCONF], [autoconf], [false], [$PATH])
+
+dnl Enable documentation
+AC_ARG_ENABLE([doc],
+	      [AS_HELP_STRING([--enable-doc], [Build documentation])],
+if test "x$enable_doc" = "xno" ; then
+  enable_doc="0"
+else
+  enable_doc="1"
+fi
+,
+enable_doc="1"
+)
+AC_SUBST([enable_doc])
+
+dnl Enable shared libs
+AC_ARG_ENABLE([shared],
+  [AS_HELP_STRING([--enable-shared], [Build shared libraries])],
+if test "x$enable_shared" = "xno" ; then
+  enable_shared="0"
+else
+  enable_shared="1"
+fi
+,
+enable_shared="1"
+)
+AC_SUBST([enable_shared])
+
+dnl Enable static libs
+AC_ARG_ENABLE([static],
+  [AS_HELP_STRING([--enable-static], [Build static libraries])],
+if test "x$enable_static" = "xno" ; then
+  enable_static="0"
+else
+  enable_static="1"
+fi
+,
+enable_static="1"
+)
+AC_SUBST([enable_static])
+
+if test "$enable_shared$enable_static" = "00" ; then
+  AC_MSG_ERROR([Please enable one of shared or static builds])
+fi
+
+dnl Perform no name mangling by default.
+AC_ARG_WITH([mangling],
+  [AS_HELP_STRING([--with-mangling=<map>], [Mangle symbols in <map>])],
+  [mangling_map="$with_mangling"], [mangling_map=""])
+
+dnl Do not prefix public APIs by default.
+AC_ARG_WITH([jemalloc_prefix],
+  [AS_HELP_STRING([--with-jemalloc-prefix=<prefix>], [Prefix to prepend to all public APIs])],
+  [JEMALLOC_PREFIX="$with_jemalloc_prefix"],
+  [if test "x$abi" != "xmacho" -a "x$abi" != "xpecoff"; then
+  JEMALLOC_PREFIX=""
+else
+  JEMALLOC_PREFIX="je_"
+fi]
+)
+if test "x$JEMALLOC_PREFIX" = "x" ; then
+  AC_DEFINE([JEMALLOC_IS_MALLOC], [ ], [ ])
+else
+  JEMALLOC_CPREFIX=`echo ${JEMALLOC_PREFIX} | tr "a-z" "A-Z"`
+  AC_DEFINE_UNQUOTED([JEMALLOC_PREFIX], ["$JEMALLOC_PREFIX"], [ ])
+  AC_DEFINE_UNQUOTED([JEMALLOC_CPREFIX], ["$JEMALLOC_CPREFIX"], [ ])
+fi
+AC_SUBST([JEMALLOC_PREFIX])
+AC_SUBST([JEMALLOC_CPREFIX])
+
+AC_ARG_WITH([export],
+  [AS_HELP_STRING([--without-export], [disable exporting jemalloc public APIs])],
+  [if test "x$with_export" = "xno"; then
+  AC_DEFINE([JEMALLOC_EXPORT],[], [ ])
+fi]
+)
+
+public_syms="aligned_alloc calloc dallocx free mallctl mallctlbymib mallctlnametomib malloc malloc_conf malloc_conf_2_conf_harder malloc_message malloc_stats_print malloc_usable_size mallocx smallocx_${jemalloc_version_gid} nallocx posix_memalign rallocx realloc sallocx sdallocx xallocx"
+dnl Check for additional platform-specific public API functions.
+AC_CHECK_FUNC([memalign],
+	      [AC_DEFINE([JEMALLOC_OVERRIDE_MEMALIGN], [ ], [ ])
+	       public_syms="${public_syms} memalign"])
+AC_CHECK_FUNC([valloc],
+	      [AC_DEFINE([JEMALLOC_OVERRIDE_VALLOC], [ ], [ ])
+	       public_syms="${public_syms} valloc"])
+AC_CHECK_FUNC([malloc_size],
+	      [AC_DEFINE([JEMALLOC_HAVE_MALLOC_SIZE], [ ], [ ])
+	       public_syms="${public_syms} malloc_size"])
+
+dnl Check for allocator-related functions that should be wrapped.
+wrap_syms=
+if test "x${JEMALLOC_PREFIX}" = "x" ; then
+  AC_CHECK_FUNC([__libc_calloc],
+		[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_CALLOC], [ ], [ ])
+		 wrap_syms="${wrap_syms} __libc_calloc"])
+  AC_CHECK_FUNC([__libc_free],
+		[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_FREE], [ ], [ ])
+		 wrap_syms="${wrap_syms} __libc_free"])
+  AC_CHECK_FUNC([__libc_malloc],
+		[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_MALLOC], [ ], [ ])
+		 wrap_syms="${wrap_syms} __libc_malloc"])
+  AC_CHECK_FUNC([__libc_memalign],
+		[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_MEMALIGN], [ ], [ ])
+		 wrap_syms="${wrap_syms} __libc_memalign"])
+  AC_CHECK_FUNC([__libc_realloc],
+		[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_REALLOC], [ ], [ ])
+		 wrap_syms="${wrap_syms} __libc_realloc"])
+  AC_CHECK_FUNC([__libc_valloc],
+		[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_VALLOC], [ ], [ ])
+		 wrap_syms="${wrap_syms} __libc_valloc"])
+  AC_CHECK_FUNC([__posix_memalign],
+		[AC_DEFINE([JEMALLOC_OVERRIDE___POSIX_MEMALIGN], [ ], [ ])
+		 wrap_syms="${wrap_syms} __posix_memalign"])
+fi
+
+case "${host}" in
+  *-*-mingw* | *-*-cygwin*)
+    wrap_syms="${wrap_syms} tls_callback"
+    ;;
+  *)
+    ;;
+esac
+
+dnl Mangle library-private APIs.
+AC_ARG_WITH([private_namespace],
+  [AS_HELP_STRING([--with-private-namespace=<prefix>], [Prefix to prepend to all library-private APIs])],
+  [JEMALLOC_PRIVATE_NAMESPACE="${with_private_namespace}je_"],
+  [JEMALLOC_PRIVATE_NAMESPACE="je_"]
+)
+AC_DEFINE_UNQUOTED([JEMALLOC_PRIVATE_NAMESPACE], [$JEMALLOC_PRIVATE_NAMESPACE], [ ])
+private_namespace="$JEMALLOC_PRIVATE_NAMESPACE"
+AC_SUBST([private_namespace])
+
+dnl Do not add suffix to installed files by default.
+AC_ARG_WITH([install_suffix],
+  [AS_HELP_STRING([--with-install-suffix=<suffix>], [Suffix to append to all installed files])],
+  [case "$with_install_suffix" in
+   *\ * ) AC_MSG_ERROR([Install suffix should not contain spaces]) ;;
+   * ) INSTALL_SUFFIX="$with_install_suffix" ;;
+esac],
+  [INSTALL_SUFFIX=]
+)
+install_suffix="$INSTALL_SUFFIX"
+AC_SUBST([install_suffix])
+
+dnl Specify default malloc_conf.
+AC_ARG_WITH([malloc_conf],
+  [AS_HELP_STRING([--with-malloc-conf=<malloc_conf>], [config.malloc_conf options string])],
+  [JEMALLOC_CONFIG_MALLOC_CONF="$with_malloc_conf"],
+  [JEMALLOC_CONFIG_MALLOC_CONF=""]
+)
+config_malloc_conf="$JEMALLOC_CONFIG_MALLOC_CONF"
+AC_DEFINE_UNQUOTED([JEMALLOC_CONFIG_MALLOC_CONF], ["$config_malloc_conf"], [ ])
+
+dnl Substitute @je_@ in jemalloc_protos.h.in, primarily to make generation of
+dnl jemalloc_protos_jet.h easy.
+je_="je_"
+AC_SUBST([je_])
+
+cfgoutputs_in="Makefile.in"
+cfgoutputs_in="${cfgoutputs_in} jemalloc.pc.in"
+cfgoutputs_in="${cfgoutputs_in} doc/html.xsl.in"
+cfgoutputs_in="${cfgoutputs_in} doc/manpages.xsl.in"
+cfgoutputs_in="${cfgoutputs_in} doc/jemalloc.xml.in"
+cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_macros.h.in"
+cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_protos.h.in"
+cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_typedefs.h.in"
+cfgoutputs_in="${cfgoutputs_in} include/jemalloc/internal/jemalloc_preamble.h.in"
+cfgoutputs_in="${cfgoutputs_in} test/test.sh.in"
+cfgoutputs_in="${cfgoutputs_in} test/include/test/jemalloc_test.h.in"
+
+cfgoutputs_out="Makefile"
+cfgoutputs_out="${cfgoutputs_out} jemalloc.pc"
+cfgoutputs_out="${cfgoutputs_out} doc/html.xsl"
+cfgoutputs_out="${cfgoutputs_out} doc/manpages.xsl"
+cfgoutputs_out="${cfgoutputs_out} doc/jemalloc.xml"
+cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_macros.h"
+cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_protos.h"
+cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_typedefs.h"
+cfgoutputs_out="${cfgoutputs_out} include/jemalloc/internal/jemalloc_preamble.h"
+cfgoutputs_out="${cfgoutputs_out} test/test.sh"
+cfgoutputs_out="${cfgoutputs_out} test/include/test/jemalloc_test.h"
+
+cfgoutputs_tup="Makefile"
+cfgoutputs_tup="${cfgoutputs_tup} jemalloc.pc:jemalloc.pc.in"
+cfgoutputs_tup="${cfgoutputs_tup} doc/html.xsl:doc/html.xsl.in"
+cfgoutputs_tup="${cfgoutputs_tup} doc/manpages.xsl:doc/manpages.xsl.in"
+cfgoutputs_tup="${cfgoutputs_tup} doc/jemalloc.xml:doc/jemalloc.xml.in"
+cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_macros.h:include/jemalloc/jemalloc_macros.h.in"
+cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_protos.h:include/jemalloc/jemalloc_protos.h.in"
+cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_typedefs.h:include/jemalloc/jemalloc_typedefs.h.in"
+cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/internal/jemalloc_preamble.h"
+cfgoutputs_tup="${cfgoutputs_tup} test/test.sh:test/test.sh.in"
+cfgoutputs_tup="${cfgoutputs_tup} test/include/test/jemalloc_test.h:test/include/test/jemalloc_test.h.in"
+
+cfghdrs_in="include/jemalloc/jemalloc_defs.h.in"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/jemalloc_internal_defs.h.in"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_symbols.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_namespace.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/public_namespace.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/public_unnamespace.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc_rename.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc_mangle.sh"
+cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc.sh"
+cfghdrs_in="${cfghdrs_in} test/include/test/jemalloc_test_defs.h.in"
+
+cfghdrs_out="include/jemalloc/jemalloc_defs.h"
+cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc${install_suffix}.h"
+cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/private_symbols.awk"
+cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/private_symbols_jet.awk"
+cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_symbols.txt"
+cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_namespace.h"
+cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_unnamespace.h"
+cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_protos_jet.h"
+cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_rename.h"
+cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_mangle.h"
+cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_mangle_jet.h"
+cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/jemalloc_internal_defs.h"
+cfghdrs_out="${cfghdrs_out} test/include/test/jemalloc_test_defs.h"
+
+cfghdrs_tup="include/jemalloc/jemalloc_defs.h:include/jemalloc/jemalloc_defs.h.in"
+cfghdrs_tup="${cfghdrs_tup} include/jemalloc/internal/jemalloc_internal_defs.h:include/jemalloc/internal/jemalloc_internal_defs.h.in"
+cfghdrs_tup="${cfghdrs_tup} test/include/test/jemalloc_test_defs.h:test/include/test/jemalloc_test_defs.h.in"
+
+dnl ============================================================================
+dnl jemalloc build options.
+dnl
+
+dnl Do not compile with debugging by default.
+AC_ARG_ENABLE([debug],
+  [AS_HELP_STRING([--enable-debug],
+                  [Build debugging code])],
+[if test "x$enable_debug" = "xno" ; then
+  enable_debug="0"
+else
+  enable_debug="1"
+fi
+],
+[enable_debug="0"]
+)
+if test "x$enable_debug" = "x1" ; then
+  AC_DEFINE([JEMALLOC_DEBUG], [ ], [ ])
+fi
+AC_SUBST([enable_debug])
+
+dnl Only optimize if not debugging.
+if test "x$enable_debug" = "x0" ; then
+  if test "x$GCC" = "xyes" ; then
+    JE_CFLAGS_ADD([-O3])
+    JE_CXXFLAGS_ADD([-O3])
+    JE_CFLAGS_ADD([-funroll-loops])
+  elif test "x$je_cv_msvc" = "xyes" ; then
+    JE_CFLAGS_ADD([-O2])
+    JE_CXXFLAGS_ADD([-O2])
+  else
+    JE_CFLAGS_ADD([-O])
+    JE_CXXFLAGS_ADD([-O])
+  fi
+fi
+
+dnl Enable statistics calculation by default.
+AC_ARG_ENABLE([stats],
+  [AS_HELP_STRING([--disable-stats],
+                  [Disable statistics calculation/reporting])],
+[if test "x$enable_stats" = "xno" ; then
+  enable_stats="0"
+else
+  enable_stats="1"
+fi
+],
+[enable_stats="1"]
+)
+if test "x$enable_stats" = "x1" ; then
+  AC_DEFINE([JEMALLOC_STATS], [ ], [ ])
+fi
+AC_SUBST([enable_stats])
+
+dnl Do not enable smallocx by default.
+AC_ARG_ENABLE([experimental_smallocx],
+  [AS_HELP_STRING([--enable-experimental-smallocx], [Enable experimental smallocx API])],
+[if test "x$enable_experimental_smallocx" = "xno" ; then
+enable_experimental_smallocx="0"
+else
+enable_experimental_smallocx="1"
+fi
+],
+[enable_experimental_smallocx="0"]
+)
+if test "x$enable_experimental_smallocx" = "x1" ; then
+  AC_DEFINE([JEMALLOC_EXPERIMENTAL_SMALLOCX_API], [ ], [ ])
+fi
+AC_SUBST([enable_experimental_smallocx])
+
+dnl Do not enable profiling by default.
+AC_ARG_ENABLE([prof],
+  [AS_HELP_STRING([--enable-prof], [Enable allocation profiling])],
+[if test "x$enable_prof" = "xno" ; then
+  enable_prof="0"
+else
+  enable_prof="1"
+fi
+],
+[enable_prof="0"]
+)
+if test "x$enable_prof" = "x1" ; then
+  backtrace_method=""
+else
+  backtrace_method="N/A"
+fi
+
+AC_ARG_ENABLE([prof-libunwind],
+  [AS_HELP_STRING([--enable-prof-libunwind], [Use libunwind for backtracing])],
+[if test "x$enable_prof_libunwind" = "xno" ; then
+  enable_prof_libunwind="0"
+else
+  enable_prof_libunwind="1"
+  if test "x$enable_prof" = "x0" ; then
+    AC_MSG_ERROR([--enable-prof-libunwind should only be used with --enable-prof])
+  fi
+fi
+],
+[enable_prof_libunwind="0"]
+)
+AC_ARG_WITH([static_libunwind],
+  [AS_HELP_STRING([--with-static-libunwind=<libunwind.a>],
+  [Path to static libunwind library; use rather than dynamically linking])],
+if test "x$with_static_libunwind" = "xno" ; then
+  LUNWIND="-lunwind"
+else
+  if test ! -f "$with_static_libunwind" ; then
+    AC_MSG_ERROR([Static libunwind not found: $with_static_libunwind])
+  fi
+  LUNWIND="$with_static_libunwind"
+fi,
+  LUNWIND="-lunwind"
+)
+if test "x$backtrace_method" = "x" -a "x$enable_prof_libunwind" = "x1" ; then
+  AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"])
+  if test "x$LUNWIND" = "x-lunwind" ; then
+    AC_CHECK_LIB([unwind], [unw_backtrace], [JE_APPEND_VS(LIBS, $LUNWIND)],
+                 [enable_prof_libunwind="0"])
+  else
+    JE_APPEND_VS(LIBS, $LUNWIND)
+  fi
+  if test "x${enable_prof_libunwind}" = "x1" ; then
+    backtrace_method="libunwind"
+    AC_DEFINE([JEMALLOC_PROF_LIBUNWIND], [ ], [ ])
+  fi
+fi
+
+AC_ARG_ENABLE([prof-libgcc],
+  [AS_HELP_STRING([--disable-prof-libgcc],
+  [Do not use libgcc for backtracing])],
+[if test "x$enable_prof_libgcc" = "xno" ; then
+  enable_prof_libgcc="0"
+else
+  enable_prof_libgcc="1"
+fi
+],
+[enable_prof_libgcc="1"]
+)
+if test "x$backtrace_method" = "x" -a "x$enable_prof_libgcc" = "x1" \
+     -a "x$GCC" = "xyes" ; then
+  AC_CHECK_HEADERS([unwind.h], , [enable_prof_libgcc="0"])
+  if test "x${enable_prof_libgcc}" = "x1" ; then
+    AC_CHECK_LIB([gcc], [_Unwind_Backtrace], [JE_APPEND_VS(LIBS, -lgcc)], [enable_prof_libgcc="0"])
+  fi
+  if test "x${enable_prof_libgcc}" = "x1" ; then
+    backtrace_method="libgcc"
+    AC_DEFINE([JEMALLOC_PROF_LIBGCC], [ ], [ ])
+  fi
+else
+  enable_prof_libgcc="0"
+fi
+
+AC_ARG_ENABLE([prof-gcc],
+  [AS_HELP_STRING([--disable-prof-gcc],
+  [Do not use gcc intrinsics for backtracing])],
+[if test "x$enable_prof_gcc" = "xno" ; then
+  enable_prof_gcc="0"
+else
+  enable_prof_gcc="1"
+fi
+],
+[enable_prof_gcc="1"]
+)
+if test "x$backtrace_method" = "x" -a "x$enable_prof_gcc" = "x1" \
+     -a "x$GCC" = "xyes" ; then
+  JE_CFLAGS_ADD([-fno-omit-frame-pointer])
+  backtrace_method="gcc intrinsics"
+  AC_DEFINE([JEMALLOC_PROF_GCC], [ ], [ ])
+else
+  enable_prof_gcc="0"
+fi
+
+if test "x$backtrace_method" = "x" ; then
+  backtrace_method="none (disabling profiling)"
+  enable_prof="0"
+fi
+AC_MSG_CHECKING([configured backtracing method])
+AC_MSG_RESULT([$backtrace_method])
+if test "x$enable_prof" = "x1" ; then
+  dnl Heap profiling uses the log(3) function.
+  JE_APPEND_VS(LIBS, $LM)
+
+  AC_DEFINE([JEMALLOC_PROF], [ ], [ ])
+fi
+AC_SUBST([enable_prof])
+
+dnl Indicate whether adjacent virtual memory mappings automatically coalesce
+dnl (and fragment on demand).
+if test "x${maps_coalesce}" = "x1" ; then
+  AC_DEFINE([JEMALLOC_MAPS_COALESCE], [ ], [ ])
+fi
+
+dnl Indicate whether to retain memory (rather than using munmap()) by default.
+if test "x$default_retain" = "x1" ; then
+  AC_DEFINE([JEMALLOC_RETAIN], [ ], [ ])
+fi
+
+dnl Indicate whether realloc(ptr, 0) defaults to the "free" behavior.
+if test "x$zero_realloc_default_free" = "x1" ; then
+  AC_DEFINE([JEMALLOC_ZERO_REALLOC_DEFAULT_FREE], [ ], [ ])
+fi
+
+dnl Enable allocation from DSS if supported by the OS.
+have_dss="1"
+dnl Check whether the BSD/SUSv1 sbrk() exists.  If not, disable DSS support.
+AC_CHECK_FUNC([sbrk], [have_sbrk="1"], [have_sbrk="0"])
+if test "x$have_sbrk" = "x1" ; then
+  if test "x$sbrk_deprecated" = "x1" ; then
+    AC_MSG_RESULT([Disabling dss allocation because sbrk is deprecated])
+    have_dss="0"
+  fi
+else
+  have_dss="0"
+fi
+
+if test "x$have_dss" = "x1" ; then
+  AC_DEFINE([JEMALLOC_DSS], [ ], [ ])
+fi
+
+dnl Support the junk/zero filling option by default.
+AC_ARG_ENABLE([fill],
+  [AS_HELP_STRING([--disable-fill], [Disable support for junk/zero filling])],
+[if test "x$enable_fill" = "xno" ; then
+  enable_fill="0"
+else
+  enable_fill="1"
+fi
+],
+[enable_fill="1"]
+)
+if test "x$enable_fill" = "x1" ; then
+  AC_DEFINE([JEMALLOC_FILL], [ ], [ ])
+fi
+AC_SUBST([enable_fill])
+
+dnl Disable utrace(2)-based tracing by default.
+AC_ARG_ENABLE([utrace],
+  [AS_HELP_STRING([--enable-utrace], [Enable utrace(2)-based tracing])],
+[if test "x$enable_utrace" = "xno" ; then
+  enable_utrace="0"
+else
+  enable_utrace="1"
+fi
+],
+[enable_utrace="0"]
+)
+JE_COMPILABLE([utrace(2)], [
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+], [
+	utrace((void *)0, 0);
+], [je_cv_utrace])
+if test "x${je_cv_utrace}" = "xno" ; then
+  JE_COMPILABLE([utrace(2) with label], [
+  #include <sys/types.h>
+  #include <sys/param.h>
+  #include <sys/time.h>
+  #include <sys/uio.h>
+  #include <sys/ktrace.h>
+  ], [
+	  utrace((void *)0, (void *)0, 0);
+  ], [je_cv_utrace_label])
+  if test "x${je_cv_utrace_label}" = "xno"; then
+    enable_utrace="0"
+  fi
+  if test "x$enable_utrace" = "x1" ; then
+    AC_DEFINE([JEMALLOC_UTRACE_LABEL], [ ], [ ])
+  fi
+else
+  if test "x$enable_utrace" = "x1" ; then
+    AC_DEFINE([JEMALLOC_UTRACE], [ ], [ ])
+  fi
+fi
+AC_SUBST([enable_utrace])
+
+dnl Do not support the xmalloc option by default.
+AC_ARG_ENABLE([xmalloc],
+  [AS_HELP_STRING([--enable-xmalloc], [Support xmalloc option])],
+[if test "x$enable_xmalloc" = "xno" ; then
+  enable_xmalloc="0"
+else
+  enable_xmalloc="1"
+fi
+],
+[enable_xmalloc="0"]
+)
+if test "x$enable_xmalloc" = "x1" ; then
+  AC_DEFINE([JEMALLOC_XMALLOC], [ ], [ ])
+fi
+AC_SUBST([enable_xmalloc])
+
+dnl Support cache-oblivious allocation alignment by default.
+AC_ARG_ENABLE([cache-oblivious],
+  [AS_HELP_STRING([--disable-cache-oblivious],
+                  [Disable support for cache-oblivious allocation alignment])],
+[if test "x$enable_cache_oblivious" = "xno" ; then
+  enable_cache_oblivious="0"
+else
+  enable_cache_oblivious="1"
+fi
+],
+[enable_cache_oblivious="1"]
+)
+if test "x$enable_cache_oblivious" = "x1" ; then
+  AC_DEFINE([JEMALLOC_CACHE_OBLIVIOUS], [ ], [ ])
+fi
+AC_SUBST([enable_cache_oblivious])
+
+dnl Do not log by default.
+AC_ARG_ENABLE([log],
+  [AS_HELP_STRING([--enable-log], [Support debug logging])],
+[if test "x$enable_log" = "xno" ; then
+  enable_log="0"
+else
+  enable_log="1"
+fi
+],
+[enable_log="0"]
+)
+if test "x$enable_log" = "x1" ; then
+  AC_DEFINE([JEMALLOC_LOG], [ ], [ ])
+fi
+AC_SUBST([enable_log])
+
+dnl Do not use readlinkat by default
+AC_ARG_ENABLE([readlinkat],
+  [AS_HELP_STRING([--enable-readlinkat], [Use readlinkat over readlink])],
+[if test "x$enable_readlinkat" = "xno" ; then
+  enable_readlinkat="0"
+else
+  enable_readlinkat="1"
+fi
+],
+[enable_readlinkat="0"]
+)
+if test "x$enable_readlinkat" = "x1" ; then
+  AC_DEFINE([JEMALLOC_READLINKAT], [ ], [ ])
+fi
+AC_SUBST([enable_readlinkat])
+
+dnl Avoid extra safety checks by default
+AC_ARG_ENABLE([opt-safety-checks],
+  [AS_HELP_STRING([--enable-opt-safety-checks],
+  [Perform certain low-overhead checks, even in opt mode])],
+[if test "x$enable_opt_safety_checks" = "xno" ; then
+  enable_opt_safety_checks="0"
+else
+  enable_opt_safety_checks="1"
+fi
+],
+[enable_opt_safety_checks="0"]
+)
+if test "x$enable_opt_safety_checks" = "x1" ; then
+  AC_DEFINE([JEMALLOC_OPT_SAFETY_CHECKS], [ ], [ ])
+fi
+AC_SUBST([enable_opt_safety_checks])
+
+dnl Look for sized-deallocation bugs while otherwise being in opt mode.
+AC_ARG_ENABLE([opt-size-checks],
+  [AS_HELP_STRING([--enable-opt-size-checks],
+  [Perform sized-deallocation argument checks, even in opt mode])],
+[if test "x$enable_opt_size_checks" = "xno" ; then
+  enable_opt_size_checks="0"
+else
+  enable_opt_size_checks="1"
+fi
+],
+[enable_opt_size_checks="0"]
+)
+if test "x$enable_opt_size_checks" = "x1" ; then
+  AC_DEFINE([JEMALLOC_OPT_SIZE_CHECKS], [ ], [ ])
+fi
+AC_SUBST([enable_opt_size_checks])
+
+dnl Do not check for use-after-free by default.
+AC_ARG_ENABLE([uaf-detection],
+  [AS_HELP_STRING([--enable-uaf-detection],
+  [Allow sampled junk-filling on deallocation to detect use-after-free])],
+[if test "x$enable_uaf_detection" = "xno" ; then
+  enable_uaf_detection="0"
+else
+  enable_uaf_detection="1"
+fi
+],
+[enable_uaf_detection="0"]
+)
+if test "x$enable_uaf_detection" = "x1" ; then
+  AC_DEFINE([JEMALLOC_UAF_DETECTION], [ ])
+fi
+AC_SUBST([enable_uaf_detection])
+
+JE_COMPILABLE([a program using __builtin_unreachable], [
+void foo (void) {
+  __builtin_unreachable();
+}
+], [
+	{
+		foo();
+	}
+], [je_cv_gcc_builtin_unreachable])
+if test "x${je_cv_gcc_builtin_unreachable}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_INTERNAL_UNREACHABLE], [__builtin_unreachable], [ ])
+else
+  AC_DEFINE([JEMALLOC_INTERNAL_UNREACHABLE], [abort], [ ])
+fi
+
+dnl ============================================================================
+dnl Check for __builtin_ffsl(), then ffsl(3), and fail if neither is found.
+dnl One of those two functions should (theoretically) exist on all platforms
+dnl that jemalloc currently has a chance of functioning on without modification.
+dnl We additionally assume ffs[ll]() or __builtin_ffs[ll]() are defined if
+dnl ffsl() or __builtin_ffsl() are defined, respectively.
+JE_COMPILABLE([a program using __builtin_ffsl], [
+#include <stdio.h>
+#include <strings.h>
+#include <string.h>
+], [
+	{
+		int rv = __builtin_ffsl(0x08);
+		printf("%d\n", rv);
+	}
+], [je_cv_gcc_builtin_ffsl])
+if test "x${je_cv_gcc_builtin_ffsl}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_INTERNAL_FFSLL], [__builtin_ffsll], [ ])
+  AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [__builtin_ffsl], [ ])
+  AC_DEFINE([JEMALLOC_INTERNAL_FFS], [__builtin_ffs], [ ])
+else
+  JE_COMPILABLE([a program using ffsl], [
+  #include <stdio.h>
+  #include <strings.h>
+  #include <string.h>
+  ], [
+	{
+		int rv = ffsl(0x08);
+		printf("%d\n", rv);
+	}
+  ], [je_cv_function_ffsl])
+  if test "x${je_cv_function_ffsl}" = "xyes" ; then
+    AC_DEFINE([JEMALLOC_INTERNAL_FFSLL], [ffsll], [ ])
+    AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [ffsl], [ ])
+    AC_DEFINE([JEMALLOC_INTERNAL_FFS], [ffs], [ ])
+  else
+    AC_MSG_ERROR([Cannot build without ffsl(3) or __builtin_ffsl()])
+  fi
+fi
+
+JE_COMPILABLE([a program using __builtin_popcountl], [
+#include <stdio.h>
+#include <strings.h>
+#include <string.h>
+], [
+	{
+		int rv = __builtin_popcountl(0x08);
+		printf("%d\n", rv);
+	}
+], [je_cv_gcc_builtin_popcountl])
+if test "x${je_cv_gcc_builtin_popcountl}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_INTERNAL_POPCOUNT], [__builtin_popcount], [ ])
+  AC_DEFINE([JEMALLOC_INTERNAL_POPCOUNTL], [__builtin_popcountl], [ ])
+  AC_DEFINE([JEMALLOC_INTERNAL_POPCOUNTLL], [__builtin_popcountll], [ ])
+fi
+
+AC_ARG_WITH([lg_quantum],
+  [AS_HELP_STRING([--with-lg-quantum=<lg-quantum>],
+   [Base 2 log of minimum allocation alignment])])
+if test "x$with_lg_quantum" != "x" ; then
+  AC_DEFINE_UNQUOTED([LG_QUANTUM], [$with_lg_quantum], [ ])
+fi
+
+AC_ARG_WITH([lg_slab_maxregs],
+  [AS_HELP_STRING([--with-lg-slab-maxregs=<lg-slab-maxregs>],
+   [Base 2 log of maximum number of regions in a slab (used with malloc_conf slab_sizes)])],
+  [CONFIG_LG_SLAB_MAXREGS="$with_lg_slab_maxregs"],
+  [CONFIG_LG_SLAB_MAXREGS=""])
+if test "x$with_lg_slab_maxregs" != "x" ; then
+  AC_DEFINE_UNQUOTED([CONFIG_LG_SLAB_MAXREGS], [$with_lg_slab_maxregs], [ ])
+fi
+
+AC_ARG_WITH([lg_page],
+  [AS_HELP_STRING([--with-lg-page=<lg-page>], [Base 2 log of system page size])],
+  [LG_PAGE="$with_lg_page"], [LG_PAGE="detect"])
+case "${host}" in
+  aarch64-apple-darwin*)
+      dnl When cross-compiling for Apple M1 and no page size is specified, use
+      dnl the default and skip page size detection (which would likely be wrong).
+      if test "x${host}" != "x${build}" -a "x$LG_PAGE" = "xdetect"; then
+        LG_PAGE=14
+      fi
+      ;;
+esac
+if test "x$LG_PAGE" = "xdetect"; then
+  AC_CACHE_CHECK([LG_PAGE],
+               [je_cv_lg_page],
+               AC_RUN_IFELSE([AC_LANG_PROGRAM(
+[[
+#include <strings.h>
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <unistd.h>
+#endif
+#include <stdio.h>
+]],
+[[
+    int result;
+    FILE *f;
+
+#ifdef _WIN32
+    SYSTEM_INFO si;
+    GetSystemInfo(&si);
+    result = si.dwPageSize;
+#else
+    result = sysconf(_SC_PAGESIZE);
+#endif
+    if (result == -1) {
+	return 1;
+    }
+    result = JEMALLOC_INTERNAL_FFSL(result) - 1;
+
+    f = fopen("conftest.out", "w");
+    if (f == NULL) {
+	return 1;
+    }
+    fprintf(f, "%d", result);
+    fclose(f);
+
+    return 0;
+]])],
+                             [je_cv_lg_page=`cat conftest.out`],
+                             [je_cv_lg_page=undefined],
+                             [je_cv_lg_page=12]))
+fi
+if test "x${je_cv_lg_page}" != "x" ; then
+  LG_PAGE="${je_cv_lg_page}"
+fi
+if test "x${LG_PAGE}" != "xundefined" ; then
+   AC_DEFINE_UNQUOTED([LG_PAGE], [$LG_PAGE], [ ])
+else
+   AC_MSG_ERROR([cannot determine value for LG_PAGE])
+fi
+
+AC_ARG_WITH([lg_hugepage],
+  [AS_HELP_STRING([--with-lg-hugepage=<lg-hugepage>],
+   [Base 2 log of system huge page size])],
+  [je_cv_lg_hugepage="${with_lg_hugepage}"],
+  [je_cv_lg_hugepage=""])
+if test "x${je_cv_lg_hugepage}" = "x" ; then
+  dnl Look in /proc/meminfo (Linux-specific) for information on the default huge
+  dnl page size, if any.  The relevant line looks like:
+  dnl
+  dnl   Hugepagesize:       2048 kB
+  if test -e "/proc/meminfo" ; then
+    hpsk=[`cat /proc/meminfo 2>/dev/null | \
+          grep -e '^Hugepagesize:[[:space:]]\+[0-9]\+[[:space:]]kB$' | \
+          awk '{print $2}'`]
+    if test "x${hpsk}" != "x" ; then
+      je_cv_lg_hugepage=10
+      while test "${hpsk}" -gt 1 ; do
+        hpsk="$((hpsk / 2))"
+        je_cv_lg_hugepage="$((je_cv_lg_hugepage + 1))"
+      done
+    fi
+  fi
+
+  dnl Set default if unable to automatically configure.
+  if test "x${je_cv_lg_hugepage}" = "x" ; then
+    je_cv_lg_hugepage=21
+  fi
+fi
+if test "x${LG_PAGE}" != "xundefined" -a \
+        "${je_cv_lg_hugepage}" -lt "${LG_PAGE}" ; then
+  AC_MSG_ERROR([Huge page size (2^${je_cv_lg_hugepage}) must be at least page size (2^${LG_PAGE})])
+fi
+AC_DEFINE_UNQUOTED([LG_HUGEPAGE], [${je_cv_lg_hugepage}], [ ])
+
+dnl ============================================================================
+dnl Enable libdl by default.
+AC_ARG_ENABLE([libdl],
+  [AS_HELP_STRING([--disable-libdl],
+  [Do not use libdl])],
+[if test "x$enable_libdl" = "xno" ; then
+  enable_libdl="0"
+else
+  enable_libdl="1"
+fi
+],
+[enable_libdl="1"]
+)
+AC_SUBST([libdl])
+
+dnl ============================================================================
+dnl Configure pthreads.
+
+if test "x$abi" != "xpecoff" ; then
+  AC_DEFINE([JEMALLOC_HAVE_PTHREAD], [ ], [ ])
+  AC_CHECK_HEADERS([pthread.h], , [AC_MSG_ERROR([pthread.h is missing])])
+  dnl Some systems may embed pthreads functionality in libc; check for libpthread
+  dnl first, but try libc too before failing.
+  AC_CHECK_LIB([pthread], [pthread_create], [JE_APPEND_VS(LIBS, -pthread)],
+               [AC_SEARCH_LIBS([pthread_create], , ,
+                               AC_MSG_ERROR([libpthread is missing]))])
+  wrap_syms="${wrap_syms} pthread_create"
+  have_pthread="1"
+
+dnl Check if we have dlsym support.
+  if test "x$enable_libdl" = "x1" ; then
+    have_dlsym="1"
+    AC_CHECK_HEADERS([dlfcn.h],
+      AC_CHECK_FUNC([dlsym], [],
+        [AC_CHECK_LIB([dl], [dlsym], [LIBS="$LIBS -ldl"], [have_dlsym="0"])]),
+      [have_dlsym="0"])
+    if test "x$have_dlsym" = "x1" ; then
+      AC_DEFINE([JEMALLOC_HAVE_DLSYM], [ ], [ ])
+    fi
+  else
+    have_dlsym="0"
+  fi
+
+  JE_COMPILABLE([pthread_atfork(3)], [
+#include <pthread.h>
+], [
+  pthread_atfork((void *)0, (void *)0, (void *)0);
+], [je_cv_pthread_atfork])
+  if test "x${je_cv_pthread_atfork}" = "xyes" ; then
+    AC_DEFINE([JEMALLOC_HAVE_PTHREAD_ATFORK], [ ], [ ])
+  fi
+  dnl Check if pthread_setname_np is available with the expected API.
+  JE_COMPILABLE([pthread_setname_np(3)], [
+#include <pthread.h>
+], [
+  pthread_setname_np(pthread_self(), "setname_test");
+], [je_cv_pthread_setname_np])
+  if test "x${je_cv_pthread_setname_np}" = "xyes" ; then
+    AC_DEFINE([JEMALLOC_HAVE_PTHREAD_SETNAME_NP], [ ], [ ])
+  fi
+  dnl pthread_getname_np is not necessarily present even when its
+  dnl pthread_setname_np counterpart is, so check for it separately.
+  JE_COMPILABLE([pthread_getname_np(3)], [
+#include <pthread.h>
+#include <stdlib.h>
+], [
+  {
+  	char *name = malloc(16);
+  	pthread_getname_np(pthread_self(), name, 16);
+	free(name);
+  }
+], [je_cv_pthread_getname_np])
+  if test "x${je_cv_pthread_getname_np}" = "xyes" ; then
+    AC_DEFINE([JEMALLOC_HAVE_PTHREAD_GETNAME_NP], [ ], [ ])
+  fi
+  dnl pthread_get_name_np is not necessarily present even when its
+  dnl pthread_set_name_np counterpart is, so check for it separately.
+  JE_COMPILABLE([pthread_get_name_np(3)], [
+#include <pthread.h>
+#include <pthread_np.h>
+#include <stdlib.h>
+], [
+  {
+  	char *name = malloc(16);
+  	pthread_get_name_np(pthread_self(), name, 16);
+	free(name);
+  }
+], [je_cv_pthread_get_name_np])
+  if test "x${je_cv_pthread_get_name_np}" = "xyes" ; then
+    AC_DEFINE([JEMALLOC_HAVE_PTHREAD_GET_NAME_NP], [ ], [ ])
+  fi
+fi
+
+JE_APPEND_VS(CPPFLAGS, -D_REENTRANT)
+
+dnl Check whether clock_gettime(2) is in libc or librt.
+AC_SEARCH_LIBS([clock_gettime], [rt])
+
+dnl Cray wrapper compiler often adds `-lrt` when using `-static`. Check with
+dnl `-dynamic` as well in case a user tries to dynamically link in jemalloc
+if test "x$je_cv_cray_prgenv_wrapper" = "xyes" ; then
+  if test "$ac_cv_search_clock_gettime" != "-lrt"; then
+    JE_CFLAGS_SAVE()
+
+    unset ac_cv_search_clock_gettime
+    JE_CFLAGS_ADD([-dynamic])
+    AC_SEARCH_LIBS([clock_gettime], [rt])
+
+    JE_CFLAGS_RESTORE()
+  fi
+fi
+
+dnl check for CLOCK_MONOTONIC_COARSE (Linux-specific).
+JE_COMPILABLE([clock_gettime(CLOCK_MONOTONIC_COARSE, ...)], [
+#include <time.h>
+], [
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC_COARSE, &ts);
+], [je_cv_clock_monotonic_coarse])
+if test "x${je_cv_clock_monotonic_coarse}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE], [ ], [ ])
+fi
+
+dnl check for CLOCK_MONOTONIC.
+JE_COMPILABLE([clock_gettime(CLOCK_MONOTONIC, ...)], [
+#include <unistd.h>
+#include <time.h>
+], [
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+#if !defined(_POSIX_MONOTONIC_CLOCK) || _POSIX_MONOTONIC_CLOCK < 0
+#  error _POSIX_MONOTONIC_CLOCK missing/invalid
+#endif
+], [je_cv_clock_monotonic])
+if test "x${je_cv_clock_monotonic}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_CLOCK_MONOTONIC], [ ], [ ])
+fi
+
+dnl Check for mach_absolute_time().
+JE_COMPILABLE([mach_absolute_time()], [
+#include <mach/mach_time.h>
+], [
+	mach_absolute_time();
+], [je_cv_mach_absolute_time])
+if test "x${je_cv_mach_absolute_time}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_MACH_ABSOLUTE_TIME], [ ], [ ])
+fi
+
+dnl check for CLOCK_REALTIME (always should be available on Linux)
+JE_COMPILABLE([clock_gettime(CLOCK_REALTIME, ...)], [
+#include <time.h>
+], [
+	struct timespec ts;
+
+	clock_gettime(CLOCK_REALTIME, &ts);
+], [je_cv_clock_realtime])
+if test "x${je_cv_clock_realtime}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_CLOCK_REALTIME], [ ], [ ])
+fi
+
+dnl Use syscall(2) (if available) by default.
+AC_ARG_ENABLE([syscall],
+  [AS_HELP_STRING([--disable-syscall], [Disable use of syscall(2)])],
+[if test "x$enable_syscall" = "xno" ; then
+  enable_syscall="0"
+else
+  enable_syscall="1"
+fi
+],
+[enable_syscall="1"]
+)
+if test "x$enable_syscall" = "x1" ; then
+  dnl Check if syscall(2) is usable.  Treat warnings as errors, so that e.g. OS
+  dnl X 10.12's deprecation warning prevents use.
+  JE_CFLAGS_SAVE()
+  JE_CFLAGS_ADD([-Werror])
+  JE_COMPILABLE([syscall(2)], [
+#include <sys/syscall.h>
+#include <unistd.h>
+], [
+	syscall(SYS_write, 2, "hello", 5);
+],
+                [je_cv_syscall])
+  JE_CFLAGS_RESTORE()
+  if test "x$je_cv_syscall" = "xyes" ; then
+    AC_DEFINE([JEMALLOC_USE_SYSCALL], [ ], [ ])
+  fi
+fi
+
+dnl Check if the GNU-specific secure_getenv function exists.
+AC_CHECK_FUNC([secure_getenv],
+              [have_secure_getenv="1"],
+              [have_secure_getenv="0"]
+             )
+if test "x$have_secure_getenv" = "x1" ; then
+  AC_DEFINE([JEMALLOC_HAVE_SECURE_GETENV], [ ], [ ])
+fi
+
+dnl Check if the GNU-specific sched_getcpu function exists.
+AC_CHECK_FUNC([sched_getcpu],
+              [have_sched_getcpu="1"],
+              [have_sched_getcpu="0"]
+             )
+if test "x$have_sched_getcpu" = "x1" ; then
+  AC_DEFINE([JEMALLOC_HAVE_SCHED_GETCPU], [ ], [ ])
+fi
+
+dnl Check if the GNU-specific sched_setaffinity function exists.
+AC_CHECK_FUNC([sched_setaffinity],
+              [have_sched_setaffinity="1"],
+              [have_sched_setaffinity="0"]
+             )
+if test "x$have_sched_setaffinity" = "x1" ; then
+  AC_DEFINE([JEMALLOC_HAVE_SCHED_SETAFFINITY], [ ], [ ])
+fi
+
+dnl Check if the Solaris/BSD issetugid function exists.
+AC_CHECK_FUNC([issetugid],
+              [have_issetugid="1"],
+              [have_issetugid="0"]
+             )
+if test "x$have_issetugid" = "x1" ; then
+  AC_DEFINE([JEMALLOC_HAVE_ISSETUGID], [ ], [ ])
+fi
+
+dnl Check whether the BSD-specific _malloc_thread_cleanup() exists.  If so, use
+dnl it rather than pthreads TSD cleanup functions to support cleanup during
+dnl thread exit, in order to avoid pthreads library recursion during
+dnl bootstrapping.
+AC_CHECK_FUNC([_malloc_thread_cleanup],
+              [have__malloc_thread_cleanup="1"],
+              [have__malloc_thread_cleanup="0"]
+             )
+if test "x$have__malloc_thread_cleanup" = "x1" ; then
+  AC_DEFINE([JEMALLOC_MALLOC_THREAD_CLEANUP], [ ], [ ])
+  wrap_syms="${wrap_syms} _malloc_thread_cleanup _malloc_tsd_cleanup_register"
+  force_tls="1"
+fi
+
+dnl Check whether the BSD-specific _pthread_mutex_init_calloc_cb() exists.  If
+dnl so, mutex initialization causes allocation, and we need to implement this
+dnl callback function in order to prevent recursive allocation.
+AC_CHECK_FUNC([_pthread_mutex_init_calloc_cb],
+              [have__pthread_mutex_init_calloc_cb="1"],
+              [have__pthread_mutex_init_calloc_cb="0"]
+             )
+if test "x$have__pthread_mutex_init_calloc_cb" = "x1" ; then
+  AC_DEFINE([JEMALLOC_MUTEX_INIT_CB], [ ], [ ])
+  wrap_syms="${wrap_syms} _malloc_prefork _malloc_postfork"
+fi
+
+AC_CHECK_FUNC([memcntl],
+	      [have_memcntl="1"],
+	      [have_memcntl="0"],
+	      )
+if test "x$have_memcntl" = "x1" ; then
+  AC_DEFINE([JEMALLOC_HAVE_MEMCNTL], [ ], [ ])
+fi
+
+dnl Disable lazy locking by default.
+AC_ARG_ENABLE([lazy_lock],
+  [AS_HELP_STRING([--enable-lazy-lock],
+  [Enable lazy locking (only lock when multi-threaded)])],
+[if test "x$enable_lazy_lock" = "xno" ; then
+  enable_lazy_lock="0"
+else
+  enable_lazy_lock="1"
+fi
+],
+[enable_lazy_lock=""]
+)
+if test "x${enable_lazy_lock}" = "x" ; then
+  if test "x${force_lazy_lock}" = "x1" ; then
+    AC_MSG_RESULT([Forcing lazy-lock to avoid allocator/threading bootstrap issues])
+    enable_lazy_lock="1"
+  else
+    enable_lazy_lock="0"
+  fi
+fi
+if test "x${enable_lazy_lock}" = "x1" -a "x${abi}" = "xpecoff" ; then
+  AC_MSG_RESULT([Forcing no lazy-lock because thread creation monitoring is unimplemented])
+  enable_lazy_lock="0"
+fi
+if test "x$enable_lazy_lock" = "x1" ; then
+  if test "x$have_dlsym" = "x1" ; then
+    AC_DEFINE([JEMALLOC_LAZY_LOCK], [ ], [ ])
+  else
+    AC_MSG_ERROR([Missing dlsym support: lazy-lock cannot be enabled.])
+  fi
+fi
+AC_SUBST([enable_lazy_lock])
+
+dnl Automatically configure TLS.
+if test "x${force_tls}" = "x1" ; then
+  enable_tls="1"
+elif test "x${force_tls}" = "x0" ; then
+  enable_tls="0"
+else
+  enable_tls="1"
+fi
+if test "x${enable_tls}" = "x1" ; then
+AC_MSG_CHECKING([for TLS])
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+[[
+    __thread int x;
+]], [[
+    x = 42;
+
+    return 0;
+]])],
+              AC_MSG_RESULT([yes]),
+              AC_MSG_RESULT([no])
+              enable_tls="0")
+else
+  enable_tls="0"
+fi
+AC_SUBST([enable_tls])
+if test "x${enable_tls}" = "x1" ; then
+  AC_DEFINE_UNQUOTED([JEMALLOC_TLS], [ ], [ ])
+fi
+
+dnl ============================================================================
+dnl Check for C11 atomics.
+
+JE_COMPILABLE([C11 atomics], [
+#include <stdint.h>
+#if (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
+#include <stdatomic.h>
+#else
+#error Atomics not available
+#endif
+], [
+    uint64_t *p = (uint64_t *)0;
+    uint64_t x = 1;
+    volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
+    uint64_t r = atomic_fetch_add(a, x) + x;
+    return r == 0;
+], [je_cv_c11_atomics])
+if test "x${je_cv_c11_atomics}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_C11_ATOMICS], [ ], [ ])
+fi
+
+dnl ============================================================================
+dnl Check for GCC-style __atomic atomics.
+
+JE_COMPILABLE([GCC __atomic atomics], [
+], [
+    int x = 0;
+    int val = 1;
+    int y = __atomic_fetch_add(&x, val, __ATOMIC_RELAXED);
+    int after_add = x;
+    return after_add == 1;
+], [je_cv_gcc_atomic_atomics])
+if test "x${je_cv_gcc_atomic_atomics}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_GCC_ATOMIC_ATOMICS], [ ], [ ])
+
+  dnl check for 8-bit atomic support
+  JE_COMPILABLE([GCC 8-bit __atomic atomics], [
+  ], [
+      unsigned char x = 0;
+      int val = 1;
+      int y = __atomic_fetch_add(&x, val, __ATOMIC_RELAXED);
+      int after_add = (int)x;
+      return after_add == 1;
+  ], [je_cv_gcc_u8_atomic_atomics])
+  if test "x${je_cv_gcc_u8_atomic_atomics}" = "xyes" ; then
+    AC_DEFINE([JEMALLOC_GCC_U8_ATOMIC_ATOMICS], [ ], [ ])
+  fi
+fi
+
+dnl ============================================================================
+dnl Check for GCC-style __sync atomics.
+
+JE_COMPILABLE([GCC __sync atomics], [
+], [
+    int x = 0;
+    int before_add = __sync_fetch_and_add(&x, 1);
+    int after_add = x;
+    return (before_add == 0) && (after_add == 1);
+], [je_cv_gcc_sync_atomics])
+if test "x${je_cv_gcc_sync_atomics}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_GCC_SYNC_ATOMICS], [ ], [ ])
+
+  dnl check for 8-bit atomic support
+  JE_COMPILABLE([GCC 8-bit __sync atomics], [
+  ], [
+      unsigned char x = 0;
+      int before_add = __sync_fetch_and_add(&x, 1);
+      int after_add = (int)x;
+      return (before_add == 0) && (after_add == 1);
+  ], [je_cv_gcc_u8_sync_atomics])
+  if test "x${je_cv_gcc_u8_sync_atomics}" = "xyes" ; then
+    AC_DEFINE([JEMALLOC_GCC_U8_SYNC_ATOMICS], [ ], [ ])
+  fi
+fi
+
+dnl ============================================================================
+dnl Check for atomic(3) operations as provided on Darwin.
+dnl We need this not for the atomic operations (which are provided above), but
+dnl rather for the OS_unfair_lock type it exposes.
+
+JE_COMPILABLE([Darwin OSAtomic*()], [
+#include <libkern/OSAtomic.h>
+#include <inttypes.h>
+], [
+	{
+		int32_t x32 = 0;
+		volatile int32_t *x32p = &x32;
+		OSAtomicAdd32(1, x32p);
+	}
+	{
+		int64_t x64 = 0;
+		volatile int64_t *x64p = &x64;
+		OSAtomicAdd64(1, x64p);
+	}
+], [je_cv_osatomic])
+if test "x${je_cv_osatomic}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_OSATOMIC], [ ], [ ])
+fi
+
+dnl ============================================================================
+dnl Check for madvise(2).
+
+JE_COMPILABLE([madvise(2)], [
+#include <sys/mman.h>
+], [
+	madvise((void *)0, 0, 0);
+], [je_cv_madvise])
+if test "x${je_cv_madvise}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_MADVISE], [ ], [ ])
+
+  dnl Check for madvise(..., MADV_FREE).
+  JE_COMPILABLE([madvise(..., MADV_FREE)], [
+#include <sys/mman.h>
+], [
+	madvise((void *)0, 0, MADV_FREE);
+], [je_cv_madv_free])
+  if test "x${je_cv_madv_free}" = "xyes" ; then
+    AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ], [ ])
+  elif test "x${je_cv_madvise}" = "xyes" ; then
+    case "${host_cpu}" in i686|x86_64)
+        case "${host}" in *-*-linux*)
+            AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ], [ ])
+            AC_DEFINE([JEMALLOC_DEFINE_MADVISE_FREE], [ ], [ ])
+	    ;;
+        esac
+        ;;
+    esac
+  fi
+
+  dnl Check for madvise(..., MADV_DONTNEED).
+  JE_COMPILABLE([madvise(..., MADV_DONTNEED)], [
+#include <sys/mman.h>
+], [
+	madvise((void *)0, 0, MADV_DONTNEED);
+], [je_cv_madv_dontneed])
+  if test "x${je_cv_madv_dontneed}" = "xyes" ; then
+    AC_DEFINE([JEMALLOC_PURGE_MADVISE_DONTNEED], [ ], [ ])
+  fi
+
+  dnl Check for madvise(..., MADV_DO[NT]DUMP).
+  JE_COMPILABLE([madvise(..., MADV_DO[[NT]]DUMP)], [
+#include <sys/mman.h>
+], [
+	madvise((void *)0, 0, MADV_DONTDUMP);
+	madvise((void *)0, 0, MADV_DODUMP);
+], [je_cv_madv_dontdump])
+  if test "x${je_cv_madv_dontdump}" = "xyes" ; then
+    AC_DEFINE([JEMALLOC_MADVISE_DONTDUMP], [ ], [ ])
+  fi
+
+  dnl Check for madvise(..., MADV_[NO]HUGEPAGE).
+  JE_COMPILABLE([madvise(..., MADV_[[NO]]HUGEPAGE)], [
+#include <sys/mman.h>
+], [
+	madvise((void *)0, 0, MADV_HUGEPAGE);
+	madvise((void *)0, 0, MADV_NOHUGEPAGE);
+], [je_cv_thp])
+  dnl Check for madvise(..., MADV_[NO]CORE).
+  JE_COMPILABLE([madvise(..., MADV_[[NO]]CORE)], [
+#include <sys/mman.h>
+], [
+	madvise((void *)0, 0, MADV_NOCORE);
+	madvise((void *)0, 0, MADV_CORE);
+], [je_cv_madv_nocore])
+  if test "x${je_cv_madv_nocore}" = "xyes" ; then
+    AC_DEFINE([JEMALLOC_MADVISE_NOCORE], [ ], [ ])
+  fi
+case "${host_cpu}" in
+  arm*)
+    ;;
+  *)
+  if test "x${je_cv_thp}" = "xyes" ; then
+    AC_DEFINE([JEMALLOC_HAVE_MADVISE_HUGE], [ ], [ ])
+  fi
+  ;;
+esac
+else
+  dnl Check for posix_madvise.
+  JE_COMPILABLE([posix_madvise], [
+  #include <sys/mman.h>
+  ], [
+    posix_madvise((void *)0, 0, 0);
+  ], [je_cv_posix_madvise])
+  if test "x${je_cv_posix_madvise}" = "xyes" ; then
+    AC_DEFINE([JEMALLOC_HAVE_POSIX_MADVISE], [ ], [ ])
+
+    dnl Check for posix_madvise(..., POSIX_MADV_DONTNEED).
+    JE_COMPILABLE([posix_madvise(..., POSIX_MADV_DONTNEED)], [
+  #include <sys/mman.h>
+  ], [
+    posix_madvise((void *)0, 0, POSIX_MADV_DONTNEED);
+  ], [je_cv_posix_madv_dontneed])
+    if test "x${je_cv_posix_madv_dontneed}" = "xyes" ; then
+      AC_DEFINE([JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED], [ ], [ ])
+    fi
+  fi
+fi
+
+dnl ============================================================================
+dnl Check for mprotect(2).
+
+JE_COMPILABLE([mprotect(2)], [
+#include <sys/mman.h>
+], [
+	mprotect((void *)0, 0, PROT_NONE);
+], [je_cv_mprotect])
+if test "x${je_cv_mprotect}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_MPROTECT], [ ], [ ])
+fi
+
+dnl ============================================================================
+dnl Check for __builtin_clz(), __builtin_clzl(), and __builtin_clzll().
+
+AC_CACHE_CHECK([for __builtin_clz],
+               [je_cv_builtin_clz],
+               [AC_LINK_IFELSE([AC_LANG_PROGRAM([],
+                                                [
+                                                {
+                                                        unsigned x = 0;
+                                                        int y = __builtin_clz(x);
+                                                }
+                                                {
+                                                        unsigned long x = 0;
+                                                        int y = __builtin_clzl(x);
+                                                }
+                                                {
+                                                        unsigned long long x = 0;
+                                                        int y = __builtin_clzll(x);
+                                                }
+                                                ])],
+                               [je_cv_builtin_clz=yes],
+                               [je_cv_builtin_clz=no])])
+
+if test "x${je_cv_builtin_clz}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_BUILTIN_CLZ], [ ], [ ])
+fi
+
+dnl ============================================================================
+dnl Check for os_unfair_lock operations as provided on Darwin.
+
+JE_COMPILABLE([Darwin os_unfair_lock_*()], [
+#include <os/lock.h>
+#include <AvailabilityMacros.h>
+], [
+	#if MAC_OS_X_VERSION_MIN_REQUIRED < 101200
+	#error "os_unfair_lock is not supported"
+	#else
+	os_unfair_lock lock = OS_UNFAIR_LOCK_INIT;
+	os_unfair_lock_lock(&lock);
+	os_unfair_lock_unlock(&lock);
+	#endif
+], [je_cv_os_unfair_lock])
+if test "x${je_cv_os_unfair_lock}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_OS_UNFAIR_LOCK], [ ], [ ])
+fi
+
+dnl ============================================================================
+dnl Darwin-related configuration.
+
+AC_ARG_ENABLE([zone-allocator],
+  [AS_HELP_STRING([--disable-zone-allocator],
+                  [Disable zone allocator for Darwin])],
+[if test "x$enable_zone_allocator" = "xno" ; then
+  enable_zone_allocator="0"
+else
+  enable_zone_allocator="1"
+fi
+],
+[if test "x${abi}" = "xmacho"; then
+  enable_zone_allocator="1"
+fi
+]
+)
+AC_SUBST([enable_zone_allocator])
+
+if test "x${enable_zone_allocator}" = "x1" ; then
+  if test "x${abi}" != "xmacho"; then
+    AC_MSG_ERROR([--enable-zone-allocator is only supported on Darwin])
+  fi
+  AC_DEFINE([JEMALLOC_ZONE], [ ], [ ])
+fi
+
+dnl ============================================================================
+dnl Use initial-exec TLS by default.
+AC_ARG_ENABLE([initial-exec-tls],
+  [AS_HELP_STRING([--disable-initial-exec-tls],
+                  [Disable the initial-exec tls model])],
+[if test "x$enable_initial_exec_tls" = "xno" ; then
+  enable_initial_exec_tls="0"
+else
+  enable_initial_exec_tls="1"
+fi
+],
+[enable_initial_exec_tls="1"]
+)
+AC_SUBST([enable_initial_exec_tls])
+
+if test "x${je_cv_tls_model}" = "xyes" -a \
+       "x${enable_initial_exec_tls}" = "x1" ; then
+  AC_DEFINE([JEMALLOC_TLS_MODEL],
+            [__attribute__((tls_model("initial-exec")))], 
+            [ ])
+else
+  AC_DEFINE([JEMALLOC_TLS_MODEL], [ ], [ ])
+fi
+
+dnl ============================================================================
+dnl Enable background threads if possible.
+
+if test "x${have_pthread}" = "x1" -a "x${je_cv_os_unfair_lock}" != "xyes" -a \
+       "x${abi}" != "xmacho" ; then
+  AC_DEFINE([JEMALLOC_BACKGROUND_THREAD], [ ], [ ])
+fi
+
+dnl ============================================================================
+dnl Check for glibc malloc hooks
+
+if test "x$glibc" = "x1" ; then
+  JE_COMPILABLE([glibc malloc hook], [
+  #include <stddef.h>
+
+  extern void (* __free_hook)(void *ptr);
+  extern void *(* __malloc_hook)(size_t size);
+  extern void *(* __realloc_hook)(void *ptr, size_t size);
+], [
+    void *ptr = 0L;
+    if (__malloc_hook) ptr = __malloc_hook(1);
+    if (__realloc_hook) ptr = __realloc_hook(ptr, 2);
+    if (__free_hook && ptr) __free_hook(ptr);
+], [je_cv_glibc_malloc_hook])
+  if test "x${je_cv_glibc_malloc_hook}" = "xyes" ; then
+    if test "x${JEMALLOC_PREFIX}" = "x" ; then
+      AC_DEFINE([JEMALLOC_GLIBC_MALLOC_HOOK], [ ], [ ])
+      wrap_syms="${wrap_syms} __free_hook __malloc_hook __realloc_hook"
+    fi
+  fi
+
+  JE_COMPILABLE([glibc memalign hook], [
+  #include <stddef.h>
+
+  extern void *(* __memalign_hook)(size_t alignment, size_t size);
+], [
+    void *ptr = 0L;
+    if (__memalign_hook) ptr = __memalign_hook(16, 7);
+], [je_cv_glibc_memalign_hook])
+  if test "x${je_cv_glibc_memalign_hook}" = "xyes" ; then
+    if test "x${JEMALLOC_PREFIX}" = "x" ; then
+      AC_DEFINE([JEMALLOC_GLIBC_MEMALIGN_HOOK], [ ], [ ])
+      wrap_syms="${wrap_syms} __memalign_hook"
+    fi
+  fi
+fi
+
+JE_COMPILABLE([pthreads adaptive mutexes], [
+#include <pthread.h>
+], [
+  pthread_mutexattr_t attr;
+  pthread_mutexattr_init(&attr);
+  pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
+  pthread_mutexattr_destroy(&attr);
+], [je_cv_pthread_mutex_adaptive_np])
+if test "x${je_cv_pthread_mutex_adaptive_np}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP], [ ], [ ])
+fi
+
+JE_CFLAGS_SAVE()
+JE_CFLAGS_ADD([-D_GNU_SOURCE])
+JE_CFLAGS_ADD([-Werror])
+JE_CFLAGS_ADD([-herror_on_warning])
+JE_COMPILABLE([strerror_r returns char with gnu source], [
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+], [
+  char *buffer = (char *) malloc(100);
+  char *error = strerror_r(EINVAL, buffer, 100);
+  printf("%s\n", error);
+], [je_cv_strerror_r_returns_char_with_gnu_source])
+JE_CFLAGS_RESTORE()
+if test "x${je_cv_strerror_r_returns_char_with_gnu_source}" = "xyes" ; then
+  AC_DEFINE([JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE], [ ], [ ])
+fi
+
+dnl ============================================================================
+dnl Check for typedefs, structures, and compiler characteristics.
+AC_HEADER_STDBOOL
+
+dnl ============================================================================
+dnl Define commands that generate output files.
+
+AC_CONFIG_COMMANDS([include/jemalloc/internal/public_symbols.txt], [
+  f="${objroot}include/jemalloc/internal/public_symbols.txt"
+  mkdir -p "${objroot}include/jemalloc/internal"
+  cp /dev/null "${f}"
+  for nm in `echo ${mangling_map} |tr ',' ' '` ; do
+    n=`echo ${nm} |tr ':' ' ' |awk '{print $[]1}'`
+    m=`echo ${nm} |tr ':' ' ' |awk '{print $[]2}'`
+    echo "${n}:${m}" >> "${f}"
+    dnl Remove name from public_syms so that it isn't redefined later.
+    public_syms=`for sym in ${public_syms}; do echo "${sym}"; done |grep -v "^${n}\$" |tr '\n' ' '`
+  done
+  for sym in ${public_syms} ; do
+    n="${sym}"
+    m="${JEMALLOC_PREFIX}${sym}"
+    echo "${n}:${m}" >> "${f}"
+  done
+], [
+  srcdir="${srcdir}"
+  objroot="${objroot}"
+  mangling_map="${mangling_map}"
+  public_syms="${public_syms}"
+  JEMALLOC_PREFIX="${JEMALLOC_PREFIX}"
+])
+AC_CONFIG_COMMANDS([include/jemalloc/internal/private_symbols.awk], [
+  f="${objroot}include/jemalloc/internal/private_symbols.awk"
+  mkdir -p "${objroot}include/jemalloc/internal"
+  export_syms=`for sym in ${public_syms}; do echo "${JEMALLOC_PREFIX}${sym}"; done; for sym in ${wrap_syms}; do echo "${sym}"; done;`
+  "${srcdir}/include/jemalloc/internal/private_symbols.sh" "${SYM_PREFIX}" ${export_syms} > "${objroot}include/jemalloc/internal/private_symbols.awk"
+], [
+  srcdir="${srcdir}"
+  objroot="${objroot}"
+  public_syms="${public_syms}"
+  wrap_syms="${wrap_syms}"
+  SYM_PREFIX="${SYM_PREFIX}"
+  JEMALLOC_PREFIX="${JEMALLOC_PREFIX}"
+])
+AC_CONFIG_COMMANDS([include/jemalloc/internal/private_symbols_jet.awk], [
+  f="${objroot}include/jemalloc/internal/private_symbols_jet.awk"
+  mkdir -p "${objroot}include/jemalloc/internal"
+  export_syms=`for sym in ${public_syms}; do echo "jet_${sym}"; done; for sym in ${wrap_syms}; do echo "${sym}"; done;`
+  "${srcdir}/include/jemalloc/internal/private_symbols.sh" "${SYM_PREFIX}" ${export_syms} > "${objroot}include/jemalloc/internal/private_symbols_jet.awk"
+], [
+  srcdir="${srcdir}"
+  objroot="${objroot}"
+  public_syms="${public_syms}"
+  wrap_syms="${wrap_syms}"
+  SYM_PREFIX="${SYM_PREFIX}"
+])
+AC_CONFIG_COMMANDS([include/jemalloc/internal/public_namespace.h], [
+  mkdir -p "${objroot}include/jemalloc/internal"
+  "${srcdir}/include/jemalloc/internal/public_namespace.sh" "${objroot}include/jemalloc/internal/public_symbols.txt" > "${objroot}include/jemalloc/internal/public_namespace.h"
+], [
+  srcdir="${srcdir}"
+  objroot="${objroot}"
+])
+AC_CONFIG_COMMANDS([include/jemalloc/internal/public_unnamespace.h], [
+  mkdir -p "${objroot}include/jemalloc/internal"
+  "${srcdir}/include/jemalloc/internal/public_unnamespace.sh" "${objroot}include/jemalloc/internal/public_symbols.txt" > "${objroot}include/jemalloc/internal/public_unnamespace.h"
+], [
+  srcdir="${srcdir}"
+  objroot="${objroot}"
+])
+AC_CONFIG_COMMANDS([include/jemalloc/jemalloc_protos_jet.h], [
+  mkdir -p "${objroot}include/jemalloc"
+  cat "${srcdir}/include/jemalloc/jemalloc_protos.h.in" | sed -e 's/@je_@/jet_/g' > "${objroot}include/jemalloc/jemalloc_protos_jet.h"
+], [
+  srcdir="${srcdir}"
+  objroot="${objroot}"
+])
+AC_CONFIG_COMMANDS([include/jemalloc/jemalloc_rename.h], [
+  mkdir -p "${objroot}include/jemalloc"
+  "${srcdir}/include/jemalloc/jemalloc_rename.sh" "${objroot}include/jemalloc/internal/public_symbols.txt" > "${objroot}include/jemalloc/jemalloc_rename.h"
+], [
+  srcdir="${srcdir}"
+  objroot="${objroot}"
+])
+AC_CONFIG_COMMANDS([include/jemalloc/jemalloc_mangle.h], [
+  mkdir -p "${objroot}include/jemalloc"
+  "${srcdir}/include/jemalloc/jemalloc_mangle.sh" "${objroot}include/jemalloc/internal/public_symbols.txt" je_ > "${objroot}include/jemalloc/jemalloc_mangle.h"
+], [
+  srcdir="${srcdir}"
+  objroot="${objroot}"
+])
+AC_CONFIG_COMMANDS([include/jemalloc/jemalloc_mangle_jet.h], [
+  mkdir -p "${objroot}include/jemalloc"
+  "${srcdir}/include/jemalloc/jemalloc_mangle.sh" "${objroot}include/jemalloc/internal/public_symbols.txt" jet_ > "${objroot}include/jemalloc/jemalloc_mangle_jet.h"
+], [
+  srcdir="${srcdir}"
+  objroot="${objroot}"
+])
+AC_CONFIG_COMMANDS([include/jemalloc/jemalloc.h], [
+  mkdir -p "${objroot}include/jemalloc"
+  "${srcdir}/include/jemalloc/jemalloc.sh" "${objroot}" > "${objroot}include/jemalloc/jemalloc${install_suffix}.h"
+], [
+  srcdir="${srcdir}"
+  objroot="${objroot}"
+  install_suffix="${install_suffix}"
+])
+
+dnl Process .in files.
+AC_SUBST([cfghdrs_in])
+AC_SUBST([cfghdrs_out])
+AC_CONFIG_HEADERS([$cfghdrs_tup])
+
+dnl ============================================================================
+dnl Generate outputs.
+
+AC_CONFIG_FILES([$cfgoutputs_tup config.stamp bin/jemalloc-config bin/jemalloc.sh bin/jeprof])
+AC_SUBST([cfgoutputs_in])
+AC_SUBST([cfgoutputs_out])
+AC_OUTPUT
+
+dnl ============================================================================
+dnl Print out the results of configuration.
+AC_MSG_RESULT([===============================================================================])
+AC_MSG_RESULT([jemalloc version   : ${jemalloc_version}])
+AC_MSG_RESULT([library revision   : ${rev}])
+AC_MSG_RESULT([])
+AC_MSG_RESULT([CONFIG             : ${CONFIG}])
+AC_MSG_RESULT([CC                 : ${CC}])
+AC_MSG_RESULT([CONFIGURE_CFLAGS   : ${CONFIGURE_CFLAGS}])
+AC_MSG_RESULT([SPECIFIED_CFLAGS   : ${SPECIFIED_CFLAGS}])
+AC_MSG_RESULT([EXTRA_CFLAGS       : ${EXTRA_CFLAGS}])
+AC_MSG_RESULT([CPPFLAGS           : ${CPPFLAGS}])
+AC_MSG_RESULT([CXX                : ${CXX}])
+AC_MSG_RESULT([CONFIGURE_CXXFLAGS : ${CONFIGURE_CXXFLAGS}])
+AC_MSG_RESULT([SPECIFIED_CXXFLAGS : ${SPECIFIED_CXXFLAGS}])
+AC_MSG_RESULT([EXTRA_CXXFLAGS     : ${EXTRA_CXXFLAGS}])
+AC_MSG_RESULT([LDFLAGS            : ${LDFLAGS}])
+AC_MSG_RESULT([EXTRA_LDFLAGS      : ${EXTRA_LDFLAGS}])
+AC_MSG_RESULT([DSO_LDFLAGS        : ${DSO_LDFLAGS}])
+AC_MSG_RESULT([LIBS               : ${LIBS}])
+AC_MSG_RESULT([RPATH_EXTRA        : ${RPATH_EXTRA}])
+AC_MSG_RESULT([])
+AC_MSG_RESULT([XSLTPROC           : ${XSLTPROC}])
+AC_MSG_RESULT([XSLROOT            : ${XSLROOT}])
+AC_MSG_RESULT([])
+AC_MSG_RESULT([PREFIX             : ${PREFIX}])
+AC_MSG_RESULT([BINDIR             : ${BINDIR}])
+AC_MSG_RESULT([DATADIR            : ${DATADIR}])
+AC_MSG_RESULT([INCLUDEDIR         : ${INCLUDEDIR}])
+AC_MSG_RESULT([LIBDIR             : ${LIBDIR}])
+AC_MSG_RESULT([MANDIR             : ${MANDIR}])
+AC_MSG_RESULT([])
+AC_MSG_RESULT([srcroot            : ${srcroot}])
+AC_MSG_RESULT([abs_srcroot        : ${abs_srcroot}])
+AC_MSG_RESULT([objroot            : ${objroot}])
+AC_MSG_RESULT([abs_objroot        : ${abs_objroot}])
+AC_MSG_RESULT([])
+AC_MSG_RESULT([JEMALLOC_PREFIX    : ${JEMALLOC_PREFIX}])
+AC_MSG_RESULT([JEMALLOC_PRIVATE_NAMESPACE])
+AC_MSG_RESULT([                   : ${JEMALLOC_PRIVATE_NAMESPACE}])
+AC_MSG_RESULT([install_suffix     : ${install_suffix}])
+AC_MSG_RESULT([malloc_conf        : ${config_malloc_conf}])
+AC_MSG_RESULT([documentation      : ${enable_doc}])
+AC_MSG_RESULT([shared libs        : ${enable_shared}])
+AC_MSG_RESULT([static libs        : ${enable_static}])
+AC_MSG_RESULT([autogen            : ${enable_autogen}])
+AC_MSG_RESULT([debug              : ${enable_debug}])
+AC_MSG_RESULT([stats              : ${enable_stats}])
+AC_MSG_RESULT([experimental_smallocx : ${enable_experimental_smallocx}])
+AC_MSG_RESULT([prof               : ${enable_prof}])
+AC_MSG_RESULT([prof-libunwind     : ${enable_prof_libunwind}])
+AC_MSG_RESULT([prof-libgcc        : ${enable_prof_libgcc}])
+AC_MSG_RESULT([prof-gcc           : ${enable_prof_gcc}])
+AC_MSG_RESULT([fill               : ${enable_fill}])
+AC_MSG_RESULT([utrace             : ${enable_utrace}])
+AC_MSG_RESULT([xmalloc            : ${enable_xmalloc}])
+AC_MSG_RESULT([log                : ${enable_log}])
+AC_MSG_RESULT([lazy_lock          : ${enable_lazy_lock}])
+AC_MSG_RESULT([cache-oblivious    : ${enable_cache_oblivious}])
+AC_MSG_RESULT([cxx                : ${enable_cxx}])
+AC_MSG_RESULT([===============================================================================])

+ 23 - 0
BeefRT/JEMalloc/include/jemalloc/internal/activity_callback.h

@@ -0,0 +1,23 @@
+#ifndef JEMALLOC_INTERNAL_ACTIVITY_CALLBACK_H
+#define JEMALLOC_INTERNAL_ACTIVITY_CALLBACK_H
+
+/*
+ * The callback to be executed "periodically", in response to some amount of
+ * allocator activity.
+ *
+ * This callback need not be computing any sort of peak (although that's the
+ * intended first use case), but we drive it from the peak counter, so it
+ * keeps things tidy to keep it here.
+ *
+ * The calls to this thunk get driven by the peak_event module.
+ */
+#define ACTIVITY_CALLBACK_THUNK_INITIALIZER {NULL, NULL}
+typedef void (*activity_callback_t)(void *uctx, uint64_t allocated,
+    uint64_t deallocated);
+typedef struct activity_callback_thunk_s activity_callback_thunk_t;
+struct activity_callback_thunk_s {
+	activity_callback_t callback;
+	void *uctx;
+};
+
+#endif /* JEMALLOC_INTERNAL_ACTIVITY_CALLBACK_H */
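A minimal usage sketch of the types declared above, assuming the header's declarations are in scope (inside jemalloc, internal headers are normally reached via the internal preamble and include path). The log_activity name and the stderr logging are invented for the example; how the thunk actually gets handed to the peak_event module is outside this header and not shown.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#include "jemalloc/internal/activity_callback.h" /* the header added above */

/* A callback matching the activity_callback_t signature declared above.
 * The name and the logging destination are purely illustrative. */
static void
log_activity(void *uctx, uint64_t allocated, uint64_t deallocated) {
	(void)uctx; /* No per-callback state is needed for this sketch. */
	fprintf(stderr, "allocated=%" PRIu64 " deallocated=%" PRIu64 "\n",
	    allocated, deallocated);
}

/* Package the callback and its context the way activity_callback_thunk_s
 * expects; ACTIVITY_CALLBACK_THUNK_INITIALIZER would leave both fields NULL. */
static activity_callback_thunk_t activity_thunk = {
	log_activity, /* callback */
	NULL          /* uctx */
};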

+ 121 - 0
BeefRT/JEMalloc/include/jemalloc/internal/arena_externs.h

@@ -0,0 +1,121 @@
+#ifndef JEMALLOC_INTERNAL_ARENA_EXTERNS_H
+#define JEMALLOC_INTERNAL_ARENA_EXTERNS_H
+
+#include "jemalloc/internal/bin.h"
+#include "jemalloc/internal/div.h"
+#include "jemalloc/internal/extent_dss.h"
+#include "jemalloc/internal/hook.h"
+#include "jemalloc/internal/pages.h"
+#include "jemalloc/internal/stats.h"
+
+/*
+ * When the number of pages to be purged exceeds this threshold, a deferred
+ * purge should happen.
+ */
+#define ARENA_DEFERRED_PURGE_NPAGES_THRESHOLD UINT64_C(1024)
+
+extern ssize_t opt_dirty_decay_ms;
+extern ssize_t opt_muzzy_decay_ms;
+
+extern percpu_arena_mode_t opt_percpu_arena;
+extern const char *percpu_arena_mode_names[];
+
+extern div_info_t arena_binind_div_info[SC_NBINS];
+
+extern malloc_mutex_t arenas_lock;
+extern emap_t arena_emap_global;
+
+extern size_t opt_oversize_threshold;
+extern size_t oversize_threshold;
+
+/*
+ * arena_bin_offsets[binind] is the offset of the first bin shard for size class
+ * binind.
+ */
+extern uint32_t arena_bin_offsets[SC_NBINS];
+
+void arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena,
+    unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms,
+    ssize_t *muzzy_decay_ms, size_t *nactive, size_t *ndirty, size_t *nmuzzy);
+void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
+    const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
+    size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
+    bin_stats_data_t *bstats, arena_stats_large_t *lstats,
+    pac_estats_t *estats, hpa_shard_stats_t *hpastats, sec_stats_t *secstats);
+void arena_handle_deferred_work(tsdn_t *tsdn, arena_t *arena);
+edata_t *arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena,
+    size_t usize, size_t alignment, bool zero);
+void arena_extent_dalloc_large_prep(tsdn_t *tsdn, arena_t *arena,
+    edata_t *edata);
+void arena_extent_ralloc_large_shrink(tsdn_t *tsdn, arena_t *arena,
+    edata_t *edata, size_t oldsize);
+void arena_extent_ralloc_large_expand(tsdn_t *tsdn, arena_t *arena,
+    edata_t *edata, size_t oldsize);
+bool arena_decay_ms_set(tsdn_t *tsdn, arena_t *arena, extent_state_t state,
+    ssize_t decay_ms);
+ssize_t arena_decay_ms_get(arena_t *arena, extent_state_t state);
+void arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread,
+    bool all);
+uint64_t arena_time_until_deferred(tsdn_t *tsdn, arena_t *arena);
+void arena_do_deferred_work(tsdn_t *tsdn, arena_t *arena);
+void arena_reset(tsd_t *tsd, arena_t *arena);
+void arena_destroy(tsd_t *tsd, arena_t *arena);
+void arena_cache_bin_fill_small(tsdn_t *tsdn, arena_t *arena,
+    cache_bin_t *cache_bin, cache_bin_info_t *cache_bin_info, szind_t binind,
+    const unsigned nfill);
+
+void *arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size,
+    szind_t ind, bool zero);
+void *arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize,
+    size_t alignment, bool zero, tcache_t *tcache);
+void arena_prof_promote(tsdn_t *tsdn, void *ptr, size_t usize);
+void arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
+    bool slow_path);
+void arena_slab_dalloc(tsdn_t *tsdn, arena_t *arena, edata_t *slab);
+
+void arena_dalloc_bin_locked_handle_newly_empty(tsdn_t *tsdn, arena_t *arena,
+    edata_t *slab, bin_t *bin);
+void arena_dalloc_bin_locked_handle_newly_nonempty(tsdn_t *tsdn, arena_t *arena,
+    edata_t *slab, bin_t *bin);
+void arena_dalloc_small(tsdn_t *tsdn, void *ptr);
+bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
+    size_t extra, bool zero, size_t *newsize);
+void *arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize,
+    size_t size, size_t alignment, bool zero, tcache_t *tcache,
+    hook_ralloc_args_t *hook_args);
+dss_prec_t arena_dss_prec_get(arena_t *arena);
+ehooks_t *arena_get_ehooks(arena_t *arena);
+extent_hooks_t *arena_set_extent_hooks(tsd_t *tsd, arena_t *arena,
+    extent_hooks_t *extent_hooks);
+bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec);
+ssize_t arena_dirty_decay_ms_default_get(void);
+bool arena_dirty_decay_ms_default_set(ssize_t decay_ms);
+ssize_t arena_muzzy_decay_ms_default_get(void);
+bool arena_muzzy_decay_ms_default_set(ssize_t decay_ms);
+bool arena_retain_grow_limit_get_set(tsd_t *tsd, arena_t *arena,
+    size_t *old_limit, size_t *new_limit);
+unsigned arena_nthreads_get(arena_t *arena, bool internal);
+void arena_nthreads_inc(arena_t *arena, bool internal);
+void arena_nthreads_dec(arena_t *arena, bool internal);
+arena_t *arena_new(tsdn_t *tsdn, unsigned ind, const arena_config_t *config);
+bool arena_init_huge(void);
+bool arena_is_huge(unsigned arena_ind);
+arena_t *arena_choose_huge(tsd_t *tsd);
+bin_t *arena_bin_choose(tsdn_t *tsdn, arena_t *arena, szind_t binind,
+    unsigned *binshard);
+size_t arena_fill_small_fresh(tsdn_t *tsdn, arena_t *arena, szind_t binind,
+    void **ptrs, size_t nfill, bool zero);
+bool arena_boot(sc_data_t *sc_data, base_t *base, bool hpa);
+void arena_prefork0(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork1(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork2(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork3(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork4(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork5(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork6(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork7(tsdn_t *tsdn, arena_t *arena);
+void arena_prefork8(tsdn_t *tsdn, arena_t *arena);
+void arena_postfork_parent(tsdn_t *tsdn, arena_t *arena);
+void arena_postfork_child(tsdn_t *tsdn, arena_t *arena);
+
+#endif /* JEMALLOC_INTERNAL_ARENA_EXTERNS_H */

+ 24 - 0
BeefRT/JEMalloc/include/jemalloc/internal/arena_inlines_a.h

@@ -0,0 +1,24 @@
+#ifndef JEMALLOC_INTERNAL_ARENA_INLINES_A_H
+#define JEMALLOC_INTERNAL_ARENA_INLINES_A_H
+
+static inline unsigned
+arena_ind_get(const arena_t *arena) {
+	return arena->ind;
+}
+
+static inline void
+arena_internal_add(arena_t *arena, size_t size) {
+	atomic_fetch_add_zu(&arena->stats.internal, size, ATOMIC_RELAXED);
+}
+
+static inline void
+arena_internal_sub(arena_t *arena, size_t size) {
+	atomic_fetch_sub_zu(&arena->stats.internal, size, ATOMIC_RELAXED);
+}
+
+static inline size_t
+arena_internal_get(arena_t *arena) {
+	return atomic_load_zu(&arena->stats.internal, ATOMIC_RELAXED);
+}
+
+#endif /* JEMALLOC_INTERNAL_ARENA_INLINES_A_H */

+ 550 - 0
BeefRT/JEMalloc/include/jemalloc/internal/arena_inlines_b.h

@@ -0,0 +1,550 @@
+#ifndef JEMALLOC_INTERNAL_ARENA_INLINES_B_H
+#define JEMALLOC_INTERNAL_ARENA_INLINES_B_H
+
+#include "jemalloc/internal/div.h"
+#include "jemalloc/internal/emap.h"
+#include "jemalloc/internal/jemalloc_internal_types.h"
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/rtree.h"
+#include "jemalloc/internal/safety_check.h"
+#include "jemalloc/internal/sc.h"
+#include "jemalloc/internal/sz.h"
+#include "jemalloc/internal/ticker.h"
+
+static inline arena_t *
+arena_get_from_edata(edata_t *edata) {
+	return (arena_t *)atomic_load_p(&arenas[edata_arena_ind_get(edata)],
+	    ATOMIC_RELAXED);
+}
+
+JEMALLOC_ALWAYS_INLINE arena_t *
+arena_choose_maybe_huge(tsd_t *tsd, arena_t *arena, size_t size) {
+	if (arena != NULL) {
+		return arena;
+	}
+
+	/*
+	 * For huge allocations, use the dedicated huge arena if both are true:
+	 * 1) auto arena selection is in use (i.e. arena == NULL), and 2) the
+	 * thread is not assigned to a manual arena.
+	 */
+	if (unlikely(size >= oversize_threshold)) {
+		arena_t *tsd_arena = tsd_arena_get(tsd);
+		if (tsd_arena == NULL || arena_is_auto(tsd_arena)) {
+			return arena_choose_huge(tsd);
+		}
+	}
+
+	return arena_choose(tsd, NULL);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_prof_info_get(tsd_t *tsd, const void *ptr, emap_alloc_ctx_t *alloc_ctx,
+    prof_info_t *prof_info, bool reset_recent) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+	assert(prof_info != NULL);
+
+	edata_t *edata = NULL;
+	bool is_slab;
+
+	/* Static check. */
+	if (alloc_ctx == NULL) {
+		edata = emap_edata_lookup(tsd_tsdn(tsd), &arena_emap_global,
+		    ptr);
+		is_slab = edata_slab_get(edata);
+	} else if (unlikely(!(is_slab = alloc_ctx->slab))) {
+		edata = emap_edata_lookup(tsd_tsdn(tsd), &arena_emap_global,
+		    ptr);
+	}
+
+	if (unlikely(!is_slab)) {
+		/* edata must have been initialized at this point. */
+		assert(edata != NULL);
+		large_prof_info_get(tsd, edata, prof_info, reset_recent);
+	} else {
+		prof_info->alloc_tctx = (prof_tctx_t *)(uintptr_t)1U;
+		/*
+		 * No need to set other fields in prof_info; they will never be
+		 * accessed if (uintptr_t)alloc_tctx == (uintptr_t)1U.
+		 */
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_prof_tctx_reset(tsd_t *tsd, const void *ptr,
+    emap_alloc_ctx_t *alloc_ctx) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+
+	/* Static check. */
+	if (alloc_ctx == NULL) {
+		edata_t *edata = emap_edata_lookup(tsd_tsdn(tsd),
+		    &arena_emap_global, ptr);
+		if (unlikely(!edata_slab_get(edata))) {
+			large_prof_tctx_reset(edata);
+		}
+	} else {
+		if (unlikely(!alloc_ctx->slab)) {
+			edata_t *edata = emap_edata_lookup(tsd_tsdn(tsd),
+			    &arena_emap_global, ptr);
+			large_prof_tctx_reset(edata);
+		}
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_prof_tctx_reset_sampled(tsd_t *tsd, const void *ptr) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+
+	edata_t *edata = emap_edata_lookup(tsd_tsdn(tsd), &arena_emap_global,
+	    ptr);
+	assert(!edata_slab_get(edata));
+
+	large_prof_tctx_reset(edata);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_prof_info_set(tsd_t *tsd, edata_t *edata, prof_tctx_t *tctx,
+    size_t size) {
+	cassert(config_prof);
+
+	assert(!edata_slab_get(edata));
+	large_prof_info_set(edata, tctx, size);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_decay_ticks(tsdn_t *tsdn, arena_t *arena, unsigned nticks) {
+	if (unlikely(tsdn_null(tsdn))) {
+		return;
+	}
+	tsd_t *tsd = tsdn_tsd(tsdn);
+	/*
+	 * We use the ticker_geom_t to avoid having per-arena state in the tsd.
+	 * Instead of having a countdown-until-decay timer running for every
+	 * arena in every thread, we flip a coin once per tick, whose
+	 * probability of coming up heads is 1/nticks; this is effectively the
+	 * operation of the ticker_geom_t.  Each arena has the same chance of a
+	 * coinflip coming up heads (1/ARENA_DECAY_NTICKS_PER_UPDATE), so we can
+	 * use a single ticker for all of them.
+	 */
+	ticker_geom_t *decay_ticker = tsd_arena_decay_tickerp_get(tsd);
+	uint64_t *prng_state = tsd_prng_statep_get(tsd);
+	if (unlikely(ticker_geom_ticks(decay_ticker, prng_state, nticks))) {
+		arena_decay(tsdn, arena, false, false);
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_decay_tick(tsdn_t *tsdn, arena_t *arena) {
+	arena_decay_ticks(tsdn, arena, 1);
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+arena_malloc(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, bool zero,
+    tcache_t *tcache, bool slow_path) {
+	assert(!tsdn_null(tsdn) || tcache == NULL);
+
+	if (likely(tcache != NULL)) {
+		if (likely(size <= SC_SMALL_MAXCLASS)) {
+			return tcache_alloc_small(tsdn_tsd(tsdn), arena,
+			    tcache, size, ind, zero, slow_path);
+		}
+		if (likely(size <= tcache_maxclass)) {
+			return tcache_alloc_large(tsdn_tsd(tsdn), arena,
+			    tcache, size, ind, zero, slow_path);
+		}
+		/* (size > tcache_maxclass) case falls through. */
+		assert(size > tcache_maxclass);
+	}
+
+	return arena_malloc_hard(tsdn, arena, size, ind, zero);
+}
+
+JEMALLOC_ALWAYS_INLINE arena_t *
+arena_aalloc(tsdn_t *tsdn, const void *ptr) {
+	edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr);
+	unsigned arena_ind = edata_arena_ind_get(edata);
+	return (arena_t *)atomic_load_p(&arenas[arena_ind], ATOMIC_RELAXED);
+}
+
+JEMALLOC_ALWAYS_INLINE size_t
+arena_salloc(tsdn_t *tsdn, const void *ptr) {
+	assert(ptr != NULL);
+	emap_alloc_ctx_t alloc_ctx;
+	emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, &alloc_ctx);
+	assert(alloc_ctx.szind != SC_NSIZES);
+
+	return sz_index2size(alloc_ctx.szind);
+}
+
+JEMALLOC_ALWAYS_INLINE size_t
+arena_vsalloc(tsdn_t *tsdn, const void *ptr) {
+	/*
+	 * Return 0 if ptr is not within an extent managed by jemalloc.  This
+	 * function has two extra costs relative to isalloc():
+	 * - The rtree calls cannot claim to be dependent lookups, which induces
+	 *   rtree lookup load dependencies.
+	 * - The lookup may fail, so there is an extra branch to check for
+	 *   failure.
+	 */
+
+	emap_full_alloc_ctx_t full_alloc_ctx;
+	bool missing = emap_full_alloc_ctx_try_lookup(tsdn, &arena_emap_global,
+	    ptr, &full_alloc_ctx);
+	if (missing) {
+		return 0;
+	}
+
+	if (full_alloc_ctx.edata == NULL) {
+		return 0;
+	}
+	assert(edata_state_get(full_alloc_ctx.edata) == extent_state_active);
+	/* Only slab members should be looked up via interior pointers. */
+	assert(edata_addr_get(full_alloc_ctx.edata) == ptr
+	    || edata_slab_get(full_alloc_ctx.edata));
+
+	assert(full_alloc_ctx.szind != SC_NSIZES);
+
+	return sz_index2size(full_alloc_ctx.szind);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+large_dalloc_safety_checks(edata_t *edata, void *ptr, szind_t szind) {
+	if (!config_opt_safety_checks) {
+		return false;
+	}
+
+	/*
+	 * Eagerly detect double free and sized dealloc bugs for large sizes.
+	 * The cost is low enough (as edata will be accessed anyway) to be
+	 * enabled all the time.
+	 */
+	if (unlikely(edata == NULL ||
+	    edata_state_get(edata) != extent_state_active)) {
+		safety_check_fail("Invalid deallocation detected: "
+		    "pages being freed (%p) not currently active, "
+		    "possibly caused by double free bugs.",
+		    (uintptr_t)edata_addr_get(edata));
+		return true;
+	}
+	size_t input_size = sz_index2size(szind);
+	if (unlikely(input_size != edata_usize_get(edata))) {
+		safety_check_fail_sized_dealloc(/* current_dealloc */ true, ptr,
+		    /* true_size */ edata_usize_get(edata), input_size);
+		return true;
+	}
+
+	return false;
+}
+
+static inline void
+arena_dalloc_large_no_tcache(tsdn_t *tsdn, void *ptr, szind_t szind) {
+	if (config_prof && unlikely(szind < SC_NBINS)) {
+		arena_dalloc_promoted(tsdn, ptr, NULL, true);
+	} else {
+		edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global,
+		    ptr);
+		if (large_dalloc_safety_checks(edata, ptr, szind)) {
+			/* See the comment in isfree. */
+			return;
+		}
+		large_dalloc(tsdn, edata);
+	}
+}
+
+static inline void
+arena_dalloc_no_tcache(tsdn_t *tsdn, void *ptr) {
+	assert(ptr != NULL);
+
+	emap_alloc_ctx_t alloc_ctx;
+	emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, &alloc_ctx);
+
+	if (config_debug) {
+		edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global,
+		    ptr);
+		assert(alloc_ctx.szind == edata_szind_get(edata));
+		assert(alloc_ctx.szind < SC_NSIZES);
+		assert(alloc_ctx.slab == edata_slab_get(edata));
+	}
+
+	if (likely(alloc_ctx.slab)) {
+		/* Small allocation. */
+		arena_dalloc_small(tsdn, ptr);
+	} else {
+		arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind);
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_dalloc_large(tsdn_t *tsdn, void *ptr, tcache_t *tcache, szind_t szind,
+    bool slow_path) {
+	if (szind < nhbins) {
+		if (config_prof && unlikely(szind < SC_NBINS)) {
+			arena_dalloc_promoted(tsdn, ptr, tcache, slow_path);
+		} else {
+			tcache_dalloc_large(tsdn_tsd(tsdn), tcache, ptr, szind,
+			    slow_path);
+		}
+	} else {
+		edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global,
+		    ptr);
+		if (large_dalloc_safety_checks(edata, ptr, szind)) {
+			/* See the comment in isfree. */
+			return;
+		}
+		large_dalloc(tsdn, edata);
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
+    emap_alloc_ctx_t *caller_alloc_ctx, bool slow_path) {
+	assert(!tsdn_null(tsdn) || tcache == NULL);
+	assert(ptr != NULL);
+
+	if (unlikely(tcache == NULL)) {
+		arena_dalloc_no_tcache(tsdn, ptr);
+		return;
+	}
+
+	emap_alloc_ctx_t alloc_ctx;
+	if (caller_alloc_ctx != NULL) {
+		alloc_ctx = *caller_alloc_ctx;
+	} else {
+		util_assume(!tsdn_null(tsdn));
+		emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr,
+		    &alloc_ctx);
+	}
+
+	if (config_debug) {
+		edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global,
+		    ptr);
+		assert(alloc_ctx.szind == edata_szind_get(edata));
+		assert(alloc_ctx.szind < SC_NSIZES);
+		assert(alloc_ctx.slab == edata_slab_get(edata));
+	}
+
+	if (likely(alloc_ctx.slab)) {
+		/* Small allocation. */
+		tcache_dalloc_small(tsdn_tsd(tsdn), tcache, ptr,
+		    alloc_ctx.szind, slow_path);
+	} else {
+		arena_dalloc_large(tsdn, ptr, tcache, alloc_ctx.szind,
+		    slow_path);
+	}
+}
+
+static inline void
+arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) {
+	assert(ptr != NULL);
+	assert(size <= SC_LARGE_MAXCLASS);
+
+	emap_alloc_ctx_t alloc_ctx;
+	if (!config_prof || !opt_prof) {
+		/*
+		 * There is no risk of being confused by a promoted sampled
+		 * object, so base szind and slab on the given size.
+		 */
+		alloc_ctx.szind = sz_size2index(size);
+		alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS);
+	}
+
+	if ((config_prof && opt_prof) || config_debug) {
+		emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr,
+		    &alloc_ctx);
+
+		assert(alloc_ctx.szind == sz_size2index(size));
+		assert((config_prof && opt_prof)
+		    || alloc_ctx.slab == (alloc_ctx.szind < SC_NBINS));
+
+		if (config_debug) {
+			edata_t *edata = emap_edata_lookup(tsdn,
+			    &arena_emap_global, ptr);
+			assert(alloc_ctx.szind == edata_szind_get(edata));
+			assert(alloc_ctx.slab == edata_slab_get(edata));
+		}
+	}
+
+	if (likely(alloc_ctx.slab)) {
+		/* Small allocation. */
+		arena_dalloc_small(tsdn, ptr);
+	} else {
+		arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind);
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
+    emap_alloc_ctx_t *caller_alloc_ctx, bool slow_path) {
+	assert(!tsdn_null(tsdn) || tcache == NULL);
+	assert(ptr != NULL);
+	assert(size <= SC_LARGE_MAXCLASS);
+
+	if (unlikely(tcache == NULL)) {
+		arena_sdalloc_no_tcache(tsdn, ptr, size);
+		return;
+	}
+
+	emap_alloc_ctx_t alloc_ctx;
+	if (config_prof && opt_prof) {
+		if (caller_alloc_ctx == NULL) {
+			/* Uncommon case and should be a static check. */
+			emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr,
+			    &alloc_ctx);
+			assert(alloc_ctx.szind == sz_size2index(size));
+		} else {
+			alloc_ctx = *caller_alloc_ctx;
+		}
+	} else {
+		/*
+		 * There is no risk of being confused by a promoted sampled
+		 * object, so base szind and slab on the given size.
+		 */
+		alloc_ctx.szind = sz_size2index(size);
+		alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS);
+	}
+
+	if (config_debug) {
+		edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global,
+		    ptr);
+		assert(alloc_ctx.szind == edata_szind_get(edata));
+		assert(alloc_ctx.slab == edata_slab_get(edata));
+	}
+
+	if (likely(alloc_ctx.slab)) {
+		/* Small allocation. */
+		tcache_dalloc_small(tsdn_tsd(tsdn), tcache, ptr,
+		    alloc_ctx.szind, slow_path);
+	} else {
+		arena_dalloc_large(tsdn, ptr, tcache, alloc_ctx.szind,
+		    slow_path);
+	}
+}
+
+static inline void
+arena_cache_oblivious_randomize(tsdn_t *tsdn, arena_t *arena, edata_t *edata,
+    size_t alignment) {
+	assert(edata_base_get(edata) == edata_addr_get(edata));
+
+	if (alignment < PAGE) {
+		unsigned lg_range = LG_PAGE -
+		    lg_floor(CACHELINE_CEILING(alignment));
+		size_t r;
+		if (!tsdn_null(tsdn)) {
+			tsd_t *tsd = tsdn_tsd(tsdn);
+			r = (size_t)prng_lg_range_u64(
+			    tsd_prng_statep_get(tsd), lg_range);
+		} else {
+			uint64_t stack_value = (uint64_t)(uintptr_t)&r;
+			r = (size_t)prng_lg_range_u64(&stack_value, lg_range);
+		}
+		uintptr_t random_offset = ((uintptr_t)r) << (LG_PAGE -
+		    lg_range);
+		edata->e_addr = (void *)((uintptr_t)edata->e_addr +
+		    random_offset);
+		assert(ALIGNMENT_ADDR2BASE(edata->e_addr, alignment) ==
+		    edata->e_addr);
+	}
+}
+
+/*
+ * The dalloc bin info contains just the information that the common paths need
+ * during tcache flushes.  By force-inlining these paths, and using local copies
+ * of data (so that the compiler knows it's constant), we avoid a whole bunch of
+ * redundant loads and stores by leaving this information in registers.
+ */
+typedef struct arena_dalloc_bin_locked_info_s arena_dalloc_bin_locked_info_t;
+struct arena_dalloc_bin_locked_info_s {
+	div_info_t div_info;
+	uint32_t nregs;
+	uint64_t ndalloc;
+};
+
+JEMALLOC_ALWAYS_INLINE size_t
+arena_slab_regind(arena_dalloc_bin_locked_info_t *info, szind_t binind,
+    edata_t *slab, const void *ptr) {
+	size_t diff, regind;
+
+	/* Freeing a pointer outside the slab can cause assertion failure. */
+	assert((uintptr_t)ptr >= (uintptr_t)edata_addr_get(slab));
+	assert((uintptr_t)ptr < (uintptr_t)edata_past_get(slab));
+	/* Freeing an interior pointer can cause assertion failure. */
+	assert(((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab)) %
+	    (uintptr_t)bin_infos[binind].reg_size == 0);
+
+	diff = (size_t)((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab));
+
+	/* Avoid doing division with a variable divisor. */
+	regind = div_compute(&info->div_info, diff);
+
+	assert(regind < bin_infos[binind].nregs);
+
+	return regind;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_dalloc_bin_locked_begin(arena_dalloc_bin_locked_info_t *info,
+    szind_t binind) {
+	info->div_info = arena_binind_div_info[binind];
+	info->nregs = bin_infos[binind].nregs;
+	info->ndalloc = 0;
+}
+
+/*
+ * Does the deallocation work associated with freeing a single pointer (a
+ * "step") in between an arena_dalloc_bin_locked begin and end call.
+ *
+ * Returns true if arena_slab_dalloc must be called on slab.  Doesn't do
+ * stats updates, which happen during finish (this lets running counts get left
+ * in a register).
+ */
+JEMALLOC_ALWAYS_INLINE bool
+arena_dalloc_bin_locked_step(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
+    arena_dalloc_bin_locked_info_t *info, szind_t binind, edata_t *slab,
+    void *ptr) {
+	const bin_info_t *bin_info = &bin_infos[binind];
+	size_t regind = arena_slab_regind(info, binind, slab, ptr);
+	slab_data_t *slab_data = edata_slab_data_get(slab);
+
+	assert(edata_nfree_get(slab) < bin_info->nregs);
+	/* Freeing an unallocated pointer can cause assertion failure. */
+	assert(bitmap_get(slab_data->bitmap, &bin_info->bitmap_info, regind));
+
+	bitmap_unset(slab_data->bitmap, &bin_info->bitmap_info, regind);
+	edata_nfree_inc(slab);
+
+	if (config_stats) {
+		info->ndalloc++;
+	}
+
+	unsigned nfree = edata_nfree_get(slab);
+	if (nfree == bin_info->nregs) {
+		arena_dalloc_bin_locked_handle_newly_empty(tsdn, arena, slab,
+		    bin);
+		return true;
+	} else if (nfree == 1 && slab != bin->slabcur) {
+		arena_dalloc_bin_locked_handle_newly_nonempty(tsdn, arena, slab,
+		    bin);
+	}
+	return false;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+arena_dalloc_bin_locked_finish(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
+    arena_dalloc_bin_locked_info_t *info) {
+	if (config_stats) {
+		bin->stats.ndalloc += info->ndalloc;
+		assert(bin->stats.curregs >= (size_t)info->ndalloc);
+		bin->stats.curregs -= (size_t)info->ndalloc;
+	}
+}
+
+static inline bin_t *
+arena_get_bin(arena_t *arena, szind_t binind, unsigned binshard) {
+	bin_t *shard0 = (bin_t *)((uintptr_t)arena + arena_bin_offsets[binind]);
+	return shard0 + binshard;
+}
+
+#endif /* JEMALLOC_INTERNAL_ARENA_INLINES_B_H */
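
A sketch of the begin/step/finish protocol described above, assuming the caller already holds the bin lock and has resolved each flushed pointer's slab. The flush_edatas/flush_ptrs arrays and this helper are hypothetical; only the arena_dalloc_bin_locked_* calls come from the header.

static void
flush_bin_locked_sketch(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
    szind_t binind, edata_t **flush_edatas, void **flush_ptrs, size_t nflush) {
	arena_dalloc_bin_locked_info_t info;
	arena_dalloc_bin_locked_begin(&info, binind);
	for (size_t i = 0; i < nflush; i++) {
		bool slab_now_empty = arena_dalloc_bin_locked_step(tsdn, arena,
		    bin, &info, binind, flush_edatas[i], flush_ptrs[i]);
		if (slab_now_empty) {
			/* Defer arena_slab_dalloc() until the lock is dropped. */
		}
	}
	/* Merge the running dalloc count into bin->stats exactly once. */
	arena_dalloc_bin_locked_finish(tsdn, arena, bin, &info);
}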

+ 114 - 0
BeefRT/JEMalloc/include/jemalloc/internal/arena_stats.h

@@ -0,0 +1,114 @@
+#ifndef JEMALLOC_INTERNAL_ARENA_STATS_H
+#define JEMALLOC_INTERNAL_ARENA_STATS_H
+
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/lockedint.h"
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/mutex_prof.h"
+#include "jemalloc/internal/pa.h"
+#include "jemalloc/internal/sc.h"
+
+JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+
+typedef struct arena_stats_large_s arena_stats_large_t;
+struct arena_stats_large_s {
+	/*
+	 * Total number of allocation/deallocation requests served directly by
+	 * the arena.
+	 */
+	locked_u64_t	nmalloc;
+	locked_u64_t	ndalloc;
+
+	/*
+	 * Number of allocation requests that correspond to this size class.
+	 * This includes requests served by tcache, though tcache only
+	 * periodically merges into this counter.
+	 */
+	locked_u64_t	nrequests; /* Partially derived. */
+	/*
+	 * Number of tcache fills / flushes for large (similarly, periodically
+	 * merged).  Note that there is no large tcache batch-fill currently
+	 * (i.e. only fill 1 at a time); however flush may be batched.
+	 */
+	locked_u64_t	nfills; /* Partially derived. */
+	locked_u64_t	nflushes; /* Partially derived. */
+
+	/* Current number of allocations of this size class. */
+	size_t		curlextents; /* Derived. */
+};
+
+/*
+ * Arena stats.  Note that fields marked "derived" are not directly maintained
+ * within the arena code; rather their values are derived during stats merge
+ * requests.
+ */
+typedef struct arena_stats_s arena_stats_t;
+struct arena_stats_s {
+	LOCKEDINT_MTX_DECLARE(mtx)
+
+	/*
+	 * resident includes the base stats -- that's why it lives here and not
+	 * in pa_shard_stats_t.
+	 */
+	size_t			base; /* Derived. */
+	size_t			resident; /* Derived. */
+	size_t			metadata_thp; /* Derived. */
+	size_t			mapped; /* Derived. */
+
+	atomic_zu_t		internal;
+
+	size_t			allocated_large; /* Derived. */
+	uint64_t		nmalloc_large; /* Derived. */
+	uint64_t		ndalloc_large; /* Derived. */
+	uint64_t		nfills_large; /* Derived. */
+	uint64_t		nflushes_large; /* Derived. */
+	uint64_t		nrequests_large; /* Derived. */
+
+	/*
+	 * The stats logically owned by the pa_shard in the same arena.  This
+	 * lives here only because it's convenient for the purposes of the ctl
+	 * module -- it only knows about the single arena_stats.
+	 */
+	pa_shard_stats_t	pa_shard_stats;
+
+	/* Number of bytes cached in tcache associated with this arena. */
+	size_t			tcache_bytes; /* Derived. */
+	size_t			tcache_stashed_bytes; /* Derived. */
+
+	mutex_prof_data_t mutex_prof_data[mutex_prof_num_arena_mutexes];
+
+	/* One element for each large size class. */
+	arena_stats_large_t	lstats[SC_NSIZES - SC_NBINS];
+
+	/* Arena uptime. */
+	nstime_t		uptime;
+};
+
+static inline bool
+arena_stats_init(tsdn_t *tsdn, arena_stats_t *arena_stats) {
+	if (config_debug) {
+		for (size_t i = 0; i < sizeof(arena_stats_t); i++) {
+			assert(((char *)arena_stats)[i] == 0);
+		}
+	}
+	if (LOCKEDINT_MTX_INIT(arena_stats->mtx, "arena_stats",
+	    WITNESS_RANK_ARENA_STATS, malloc_mutex_rank_exclusive)) {
+		return true;
+	}
+	/* Memory is zeroed, so there is no need to clear stats. */
+	return false;
+}
+
+static inline void
+arena_stats_large_flush_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats,
+    szind_t szind, uint64_t nrequests) {
+	LOCKEDINT_MTX_LOCK(tsdn, arena_stats->mtx);
+	arena_stats_large_t *lstats = &arena_stats->lstats[szind - SC_NBINS];
+	locked_inc_u64(tsdn, LOCKEDINT_MTX(arena_stats->mtx),
+	    &lstats->nrequests, nrequests);
+	locked_inc_u64(tsdn, LOCKEDINT_MTX(arena_stats->mtx),
+	    &lstats->nflushes, 1);
+	LOCKEDINT_MTX_UNLOCK(tsdn, arena_stats->mtx);
+}
+
+#endif /* JEMALLOC_INTERNAL_ARENA_STATS_H */
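
The locked-counter pattern used by arena_stats_large_flush_nrequests_add() generalizes to the other per-size-class counters. A hypothetical sketch that bumps nmalloc for one large size class, using only the macros and helpers shown above:

static void
record_large_malloc_sketch(tsdn_t *tsdn, arena_stats_t *arena_stats,
    szind_t szind) {
	/* Large size classes start at SC_NBINS, hence the index shift. */
	assert(szind >= SC_NBINS);
	LOCKEDINT_MTX_LOCK(tsdn, arena_stats->mtx);
	locked_inc_u64(tsdn, LOCKEDINT_MTX(arena_stats->mtx),
	    &arena_stats->lstats[szind - SC_NBINS].nmalloc, 1);
	LOCKEDINT_MTX_UNLOCK(tsdn, arena_stats->mtx);
}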

+ 101 - 0
BeefRT/JEMalloc/include/jemalloc/internal/arena_structs.h

@@ -0,0 +1,101 @@
+#ifndef JEMALLOC_INTERNAL_ARENA_STRUCTS_H
+#define JEMALLOC_INTERNAL_ARENA_STRUCTS_H
+
+#include "jemalloc/internal/arena_stats.h"
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/bin.h"
+#include "jemalloc/internal/bitmap.h"
+#include "jemalloc/internal/counter.h"
+#include "jemalloc/internal/ecache.h"
+#include "jemalloc/internal/edata_cache.h"
+#include "jemalloc/internal/extent_dss.h"
+#include "jemalloc/internal/jemalloc_internal_types.h"
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/nstime.h"
+#include "jemalloc/internal/pa.h"
+#include "jemalloc/internal/ql.h"
+#include "jemalloc/internal/sc.h"
+#include "jemalloc/internal/ticker.h"
+
+struct arena_s {
+	/*
+	 * Number of threads currently assigned to this arena.  Each thread has
+	 * two distinct assignments, one for application-serving allocation, and
+	 * the other for internal metadata allocation.  Internal metadata must
+	 * not be allocated from arenas explicitly created via the arenas.create
+	 * mallctl, because the arena.<i>.reset mallctl indiscriminately
+	 * discards all allocations for the affected arena.
+	 *
+	 *   0: Application allocation.
+	 *   1: Internal metadata allocation.
+	 *
+	 * Synchronization: atomic.
+	 */
+	atomic_u_t		nthreads[2];
+
+	/* Next bin shard for binding new threads. Synchronization: atomic. */
+	atomic_u_t		binshard_next;
+
+	/*
+	 * When percpu_arena is enabled, to amortize the cost of reading /
+	 * updating the current CPU id, track the most recent thread accessing
+	 * this arena, and only read CPU if there is a mismatch.
+	 */
+	tsdn_t		*last_thd;
+
+	/* Synchronization: internal. */
+	arena_stats_t		stats;
+
+	/*
+	 * Lists of tcaches and cache_bin_array_descriptors for extant threads
+	 * associated with this arena.  Stats from these are merged
+	 * incrementally, and at exit if opt_stats_print is enabled.
+	 *
+	 * Synchronization: tcache_ql_mtx.
+	 */
+	ql_head(tcache_slow_t)			tcache_ql;
+	ql_head(cache_bin_array_descriptor_t)	cache_bin_array_descriptor_ql;
+	malloc_mutex_t				tcache_ql_mtx;
+
+	/*
+	 * Represents a dss_prec_t, but atomically.
+	 *
+	 * Synchronization: atomic.
+	 */
+	atomic_u_t		dss_prec;
+
+	/*
+	 * Extant large allocations.
+	 *
+	 * Synchronization: large_mtx.
+	 */
+	edata_list_active_t	large;
+	/* Synchronizes all large allocation/update/deallocation. */
+	malloc_mutex_t		large_mtx;
+
+	/* The page-level allocator shard this arena uses. */
+	pa_shard_t		pa_shard;
+
+	/*
+	 * A cached copy of base->ind.  This can get accessed on hot paths;
+	 * looking it up in base requires an extra pointer hop / cache miss.
+	 */
+	unsigned ind;
+
+	/*
+	 * Base allocator, from which arena metadata are allocated.
+	 *
+	 * Synchronization: internal.
+	 */
+	base_t			*base;
+	/* Used to determine uptime.  Read-only after initialization. */
+	nstime_t		create_time;
+
+	/*
+	 * The arena is allocated alongside its bins; really this is a
+	 * dynamically sized array determined by the binshard settings.
+	 */
+	bin_t			bins[0];
+};
+
+#endif /* JEMALLOC_INTERNAL_ARENA_STRUCTS_H */
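
Because bins[] is a trailing flexible array, an arena and all of its bin shards come from a single base allocation. A sketch of the sizing arithmetic; nbins_total (the sum of shard counts across all small size classes) is named here for illustration only:

static size_t
arena_alloc_size_sketch(unsigned nbins_total) {
	/* Header followed by a flat array of every bin shard. */
	return sizeof(arena_t) + (size_t)nbins_total * sizeof(bin_t);
}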

+ 58 - 0
BeefRT/JEMalloc/include/jemalloc/internal/arena_types.h

@@ -0,0 +1,58 @@
+#ifndef JEMALLOC_INTERNAL_ARENA_TYPES_H
+#define JEMALLOC_INTERNAL_ARENA_TYPES_H
+
+#include "jemalloc/internal/sc.h"
+
+/* Default decay times in milliseconds. */
+#define DIRTY_DECAY_MS_DEFAULT	ZD(10 * 1000)
+#define MUZZY_DECAY_MS_DEFAULT	(0)
+/* Number of event ticks between time checks. */
+#define ARENA_DECAY_NTICKS_PER_UPDATE	1000
+
+typedef struct arena_decay_s arena_decay_t;
+typedef struct arena_s arena_t;
+
+typedef enum {
+	percpu_arena_mode_names_base   = 0, /* Used for options processing. */
+
+	/*
+	 * *_uninit are used only during bootstrapping, and must correspond
+	 * to the initialized variant plus percpu_arena_mode_enabled_base.
+	 */
+	percpu_arena_uninit            = 0,
+	per_phycpu_arena_uninit        = 1,
+
+	/* All non-disabled modes must come after percpu_arena_disabled. */
+	percpu_arena_disabled          = 2,
+
+	percpu_arena_mode_names_limit  = 3, /* Used for options processing. */
+	percpu_arena_mode_enabled_base = 3,
+
+	percpu_arena                   = 3,
+	per_phycpu_arena               = 4  /* Hyper threads share arena. */
+} percpu_arena_mode_t;
+
+#define PERCPU_ARENA_ENABLED(m)	((m) >= percpu_arena_mode_enabled_base)
+#define PERCPU_ARENA_DEFAULT	percpu_arena_disabled
+
+/*
+ * When allocation_size >= oversize_threshold, use the dedicated huge arena
+ * (unless an arena index was explicitly specified).  0 disables the feature.
+ */
+#define OVERSIZE_THRESHOLD_DEFAULT (8 << 20)
+
+struct arena_config_s {
+	/* extent hooks to be used for the arena */
+	extent_hooks_t *extent_hooks;
+
+	/*
+	 * Use extent hooks for metadata (base) allocations when true.
+	 */
+	bool metadata_use_hooks;
+};
+
+typedef struct arena_config_s arena_config_t;
+
+extern const arena_config_t arena_config_default;
+
+#endif /* JEMALLOC_INTERNAL_ARENA_TYPES_H */
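
The comments above encode a couple of invariants that are easy to check by hand; a sketch using plain asserts for illustration:

static void
percpu_arena_mode_invariants_sketch(void) {
	/* Each *_uninit value plus the enabled base yields its initialized mode. */
	assert(percpu_arena_uninit + percpu_arena_mode_enabled_base ==
	    percpu_arena);
	assert(per_phycpu_arena_uninit + percpu_arena_mode_enabled_base ==
	    per_phycpu_arena);
	/* Only modes at or above the enabled base count as enabled. */
	assert(!PERCPU_ARENA_ENABLED(percpu_arena_disabled));
	assert(PERCPU_ARENA_ENABLED(per_phycpu_arena));
}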

+ 56 - 0
BeefRT/JEMalloc/include/jemalloc/internal/assert.h

@@ -0,0 +1,56 @@
+#include "jemalloc/internal/malloc_io.h"
+#include "jemalloc/internal/util.h"
+
+/*
+ * Define a custom assert() in order to reduce the chances of deadlock during
+ * assertion failure.
+ */
+#ifndef assert
+#define assert(e) do {							\
+	if (unlikely(config_debug && !(e))) {				\
+		malloc_printf(						\
+		    "<jemalloc>: %s:%d: Failed assertion: \"%s\"\n",	\
+		    __FILE__, __LINE__, #e);				\
+		abort();						\
+	}								\
+} while (0)
+#endif
+
+#ifndef not_reached
+#define not_reached() do {						\
+	if (config_debug) {						\
+		malloc_printf(						\
+		    "<jemalloc>: %s:%d: Unreachable code reached\n",	\
+		    __FILE__, __LINE__);				\
+		abort();						\
+	}								\
+	unreachable();							\
+} while (0)
+#endif
+
+#ifndef not_implemented
+#define not_implemented() do {						\
+	if (config_debug) {						\
+		malloc_printf("<jemalloc>: %s:%d: Not implemented\n",	\
+		    __FILE__, __LINE__);				\
+		abort();						\
+	}								\
+} while (0)
+#endif
+
+#ifndef assert_not_implemented
+#define assert_not_implemented(e) do {					\
+	if (unlikely(config_debug && !(e))) {				\
+		not_implemented();					\
+	}								\
+} while (0)
+#endif
+
+/* Use to assert a particular configuration, e.g., cassert(config_debug). */
+#ifndef cassert
+#define cassert(c) do {							\
+	if (unlikely(!(c))) {						\
+		not_reached();						\
+	}								\
+} while (0)
+#endif

+ 107 - 0
BeefRT/JEMalloc/include/jemalloc/internal/atomic.h

@@ -0,0 +1,107 @@
+#ifndef JEMALLOC_INTERNAL_ATOMIC_H
+#define JEMALLOC_INTERNAL_ATOMIC_H
+
+#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE
+
+#define JEMALLOC_U8_ATOMICS
+#if defined(JEMALLOC_GCC_ATOMIC_ATOMICS)
+#  include "jemalloc/internal/atomic_gcc_atomic.h"
+#  if !defined(JEMALLOC_GCC_U8_ATOMIC_ATOMICS)
+#    undef JEMALLOC_U8_ATOMICS
+#  endif
+#elif defined(JEMALLOC_GCC_SYNC_ATOMICS)
+#  include "jemalloc/internal/atomic_gcc_sync.h"
+#  if !defined(JEMALLOC_GCC_U8_SYNC_ATOMICS)
+#    undef JEMALLOC_U8_ATOMICS
+#  endif
+#elif defined(_MSC_VER)
+#  include "jemalloc/internal/atomic_msvc.h"
+#elif defined(JEMALLOC_C11_ATOMICS)
+#  include "jemalloc/internal/atomic_c11.h"
+#else
+#  error "Don't have atomics implemented on this platform."
+#endif
+
+/*
+ * This header gives more or less a backport of C11 atomics. The user can write
+ * JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_sizeof_type); to generate
+ * counterparts of the C11 atomic functions for type, as so:
+ *   JEMALLOC_GENERATE_ATOMICS(int *, pi, 3);
+ * and then write things like:
+ *   int *some_ptr;
+ *   atomic_pi_t atomic_ptr_to_int;
+ *   atomic_store_pi(&atomic_ptr_to_int, some_ptr, ATOMIC_RELAXED);
+ *   int *prev_value = atomic_exchange_pi(&atomic_ptr_to_int, NULL,
+ *       ATOMIC_ACQ_REL);
+ *   assert(some_ptr == prev_value);
+ * and expect things to work in the obvious way.
+ *
+ * Also included (with naming differences to avoid conflicts with the standard
+ * library):
+ *   atomic_fence(atomic_memory_order_t) (mimics C11's atomic_thread_fence).
+ *   ATOMIC_INIT (mimics C11's ATOMIC_VAR_INIT).
+ */
+
+/*
+ * Pure convenience, so that we don't have to type "atomic_memory_order_"
+ * quite so often.
+ */
+#define ATOMIC_RELAXED atomic_memory_order_relaxed
+#define ATOMIC_ACQUIRE atomic_memory_order_acquire
+#define ATOMIC_RELEASE atomic_memory_order_release
+#define ATOMIC_ACQ_REL atomic_memory_order_acq_rel
+#define ATOMIC_SEQ_CST atomic_memory_order_seq_cst
+
+/*
+ * Another convenience -- simple atomic helper functions.
+ */
+#define JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(type, short_type,	\
+    lg_size)								\
+    JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, lg_size)		\
+    ATOMIC_INLINE void							\
+    atomic_load_add_store_##short_type(atomic_##short_type##_t *a,	\
+	type inc) {							\
+	    type oldval = atomic_load_##short_type(a, ATOMIC_RELAXED);	\
+	    type newval = oldval + inc;					\
+	    atomic_store_##short_type(a, newval, ATOMIC_RELAXED);	\
+	}								\
+    ATOMIC_INLINE void							\
+    atomic_load_sub_store_##short_type(atomic_##short_type##_t *a,	\
+	type inc) {							\
+	    type oldval = atomic_load_##short_type(a, ATOMIC_RELAXED);	\
+	    type newval = oldval - inc;					\
+	    atomic_store_##short_type(a, newval, ATOMIC_RELAXED);	\
+	}
+
+/*
+ * Not all platforms have 64-bit atomics.  If we do, this #define exposes that
+ * fact.
+ */
+#if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
+#  define JEMALLOC_ATOMIC_U64
+#endif
+
+JEMALLOC_GENERATE_ATOMICS(void *, p, LG_SIZEOF_PTR)
+
+/*
+ * There's no actual guarantee that sizeof(bool) == 1, but it's true on the only
+ * platform that actually needs to know the size, MSVC.
+ */
+JEMALLOC_GENERATE_ATOMICS(bool, b, 0)
+
+JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(unsigned, u, LG_SIZEOF_INT)
+
+JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(size_t, zu, LG_SIZEOF_PTR)
+
+JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(ssize_t, zd, LG_SIZEOF_PTR)
+
+JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(uint8_t, u8, 0)
+
+JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(uint32_t, u32, 2)
+
+#ifdef JEMALLOC_ATOMIC_U64
+JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(uint64_t, u64, 3)
+#endif
+
+#undef ATOMIC_INLINE
+
+#endif /* JEMALLOC_INTERNAL_ATOMIC_H */
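
The size_t ("zu") helpers generated above can be used much like C11 atomics. A small sketch; the counter and both functions are hypothetical:

static atomic_zu_t counter = ATOMIC_INIT(0);

static size_t
bump(size_t n) {
	/* Returns the value held before the addition. */
	return atomic_fetch_add_zu(&counter, n, ATOMIC_RELAXED);
}

static size_t
read_counter(void) {
	return atomic_load_zu(&counter, ATOMIC_RELAXED);
}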

+ 97 - 0
BeefRT/JEMalloc/include/jemalloc/internal/atomic_c11.h

@@ -0,0 +1,97 @@
+#ifndef JEMALLOC_INTERNAL_ATOMIC_C11_H
+#define JEMALLOC_INTERNAL_ATOMIC_C11_H
+
+#include <stdatomic.h>
+
+#define ATOMIC_INIT(...) ATOMIC_VAR_INIT(__VA_ARGS__)
+
+#define atomic_memory_order_t memory_order
+#define atomic_memory_order_relaxed memory_order_relaxed
+#define atomic_memory_order_acquire memory_order_acquire
+#define atomic_memory_order_release memory_order_release
+#define atomic_memory_order_acq_rel memory_order_acq_rel
+#define atomic_memory_order_seq_cst memory_order_seq_cst
+
+#define atomic_fence atomic_thread_fence
+
+#define JEMALLOC_GENERATE_ATOMICS(type, short_type,			\
+    /* unused */ lg_size)						\
+typedef _Atomic(type) atomic_##short_type##_t;				\
+									\
+ATOMIC_INLINE type							\
+atomic_load_##short_type(const atomic_##short_type##_t *a,		\
+    atomic_memory_order_t mo) {						\
+	/*								\
+	 * A strict interpretation of the C standard prevents		\
+	 * atomic_load from taking a const argument, but it's		\
+	 * convenient for our purposes. This cast is a workaround.	\
+	 */								\
+	atomic_##short_type##_t* a_nonconst =				\
+	    (atomic_##short_type##_t*)a;				\
+	return atomic_load_explicit(a_nonconst, mo);			\
+}									\
+									\
+ATOMIC_INLINE void							\
+atomic_store_##short_type(atomic_##short_type##_t *a,			\
+    type val, atomic_memory_order_t mo) {				\
+	atomic_store_explicit(a, val, mo);				\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_exchange_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return atomic_exchange_explicit(a, val, mo);			\
+}									\
+									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired, atomic_memory_order_t success_mo,	\
+    atomic_memory_order_t failure_mo) {					\
+	return atomic_compare_exchange_weak_explicit(a, expected,	\
+	    desired, success_mo, failure_mo);				\
+}									\
+									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired, atomic_memory_order_t success_mo,	\
+    atomic_memory_order_t failure_mo) {					\
+	return atomic_compare_exchange_strong_explicit(a, expected,	\
+	    desired, success_mo, failure_mo);				\
+}
+
+/*
+ * Integral types have some special operations available that non-integral ones
+ * lack.
+ */
+#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, 		\
+    /* unused */ lg_size)						\
+JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size)	\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_add_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return atomic_fetch_add_explicit(a, val, mo);			\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_sub_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return atomic_fetch_sub_explicit(a, val, mo);			\
+}									\
+ATOMIC_INLINE type							\
+atomic_fetch_and_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return atomic_fetch_and_explicit(a, val, mo);			\
+}									\
+ATOMIC_INLINE type							\
+atomic_fetch_or_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return atomic_fetch_or_explicit(a, val, mo);			\
+}									\
+ATOMIC_INLINE type							\
+atomic_fetch_xor_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return atomic_fetch_xor_explicit(a, val, mo);			\
+}
+
+#endif /* JEMALLOC_INTERNAL_ATOMIC_C11_H */

+ 129 - 0
BeefRT/JEMalloc/include/jemalloc/internal/atomic_gcc_atomic.h

@@ -0,0 +1,129 @@
+#ifndef JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H
+#define JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H
+
+#include "jemalloc/internal/assert.h"
+
+#define ATOMIC_INIT(...) {__VA_ARGS__}
+
+typedef enum {
+	atomic_memory_order_relaxed,
+	atomic_memory_order_acquire,
+	atomic_memory_order_release,
+	atomic_memory_order_acq_rel,
+	atomic_memory_order_seq_cst
+} atomic_memory_order_t;
+
+ATOMIC_INLINE int
+atomic_enum_to_builtin(atomic_memory_order_t mo) {
+	switch (mo) {
+	case atomic_memory_order_relaxed:
+		return __ATOMIC_RELAXED;
+	case atomic_memory_order_acquire:
+		return __ATOMIC_ACQUIRE;
+	case atomic_memory_order_release:
+		return __ATOMIC_RELEASE;
+	case atomic_memory_order_acq_rel:
+		return __ATOMIC_ACQ_REL;
+	case atomic_memory_order_seq_cst:
+		return __ATOMIC_SEQ_CST;
+	}
+	/* Can't happen; the switch is exhaustive. */
+	not_reached();
+}
+
+ATOMIC_INLINE void
+atomic_fence(atomic_memory_order_t mo) {
+	__atomic_thread_fence(atomic_enum_to_builtin(mo));
+}
+
+#define JEMALLOC_GENERATE_ATOMICS(type, short_type,			\
+    /* unused */ lg_size)						\
+typedef struct {							\
+	type repr;							\
+} atomic_##short_type##_t;						\
+									\
+ATOMIC_INLINE type							\
+atomic_load_##short_type(const atomic_##short_type##_t *a,		\
+    atomic_memory_order_t mo) {						\
+	type result;							\
+	__atomic_load(&a->repr, &result, atomic_enum_to_builtin(mo));	\
+	return result;							\
+}									\
+									\
+ATOMIC_INLINE void							\
+atomic_store_##short_type(atomic_##short_type##_t *a, type val,		\
+    atomic_memory_order_t mo) {						\
+	__atomic_store(&a->repr, &val, atomic_enum_to_builtin(mo));	\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_exchange_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	type result;							\
+	__atomic_exchange(&a->repr, &val, &result,			\
+	    atomic_enum_to_builtin(mo));				\
+	return result;							\
+}									\
+									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a,	\
+    UNUSED type *expected, type desired,				\
+    atomic_memory_order_t success_mo,					\
+    atomic_memory_order_t failure_mo) {					\
+	return __atomic_compare_exchange(&a->repr, expected, &desired,	\
+	    true, atomic_enum_to_builtin(success_mo),			\
+	    atomic_enum_to_builtin(failure_mo));			\
+}									\
+									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a,	\
+    UNUSED type *expected, type desired,				\
+    atomic_memory_order_t success_mo,					\
+    atomic_memory_order_t failure_mo) {					\
+	return __atomic_compare_exchange(&a->repr, expected, &desired,	\
+	    false,							\
+	    atomic_enum_to_builtin(success_mo),				\
+	    atomic_enum_to_builtin(failure_mo));			\
+}
+
+
+#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type,			\
+    /* unused */ lg_size)						\
+JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size)	\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_add_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __atomic_fetch_add(&a->repr, val,			\
+	    atomic_enum_to_builtin(mo));				\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __atomic_fetch_sub(&a->repr, val,			\
+	    atomic_enum_to_builtin(mo));				\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_and_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __atomic_fetch_and(&a->repr, val,			\
+	    atomic_enum_to_builtin(mo));				\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_or_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __atomic_fetch_or(&a->repr, val,				\
+	    atomic_enum_to_builtin(mo));				\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __atomic_fetch_xor(&a->repr, val,			\
+	    atomic_enum_to_builtin(mo));				\
+}
+
+#endif /* JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H */

+ 195 - 0
BeefRT/JEMalloc/include/jemalloc/internal/atomic_gcc_sync.h

@@ -0,0 +1,195 @@
+#ifndef JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H
+#define JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H
+
+#define ATOMIC_INIT(...) {__VA_ARGS__}
+
+typedef enum {
+	atomic_memory_order_relaxed,
+	atomic_memory_order_acquire,
+	atomic_memory_order_release,
+	atomic_memory_order_acq_rel,
+	atomic_memory_order_seq_cst
+} atomic_memory_order_t;
+
+ATOMIC_INLINE void
+atomic_fence(atomic_memory_order_t mo) {
+	/* Easy cases first: no barrier, and full barrier. */
+	if (mo == atomic_memory_order_relaxed) {
+		asm volatile("" ::: "memory");
+		return;
+	}
+	if (mo == atomic_memory_order_seq_cst) {
+		asm volatile("" ::: "memory");
+		__sync_synchronize();
+		asm volatile("" ::: "memory");
+		return;
+	}
+	asm volatile("" ::: "memory");
+#  if defined(__i386__) || defined(__x86_64__)
+	/* This is implicit on x86. */
+#  elif defined(__ppc64__)
+	asm volatile("lwsync");
+#  elif defined(__ppc__)
+	asm volatile("sync");
+#  elif defined(__sparc__) && defined(__arch64__)
+	if (mo == atomic_memory_order_acquire) {
+		asm volatile("membar #LoadLoad | #LoadStore");
+	} else if (mo == atomic_memory_order_release) {
+		asm volatile("membar #LoadStore | #StoreStore");
+	} else {
+		asm volatile("membar #LoadLoad | #LoadStore | #StoreStore");
+	}
+#  else
+	__sync_synchronize();
+#  endif
+	asm volatile("" ::: "memory");
+}
+
+/*
+ * A correct implementation of seq_cst loads and stores on weakly ordered
+ * architectures could do either of the following:
+ *   1. store() is weak-fence -> store -> strong fence, load() is load ->
+ *      strong-fence.
+ *   2. store() is strong-fence -> store, load() is strong-fence -> load ->
+ *      weak-fence.
+ * The tricky thing is, load() and store() above can be the load or store
+ * portions of a gcc __sync builtin, so we have to follow GCC's lead, which
+ * means going with strategy 2.
+ * On strongly ordered architectures, the natural strategy is to stick a strong
+ * fence after seq_cst stores, and have naked loads.  So we want the strong
+ * fences in different places on different architectures.
+ * atomic_pre_sc_load_fence and atomic_post_sc_store_fence allow us to
+ * accomplish this.
+ */
+
+ATOMIC_INLINE void
+atomic_pre_sc_load_fence() {
+#  if defined(__i386__) || defined(__x86_64__) ||			\
+    (defined(__sparc__) && defined(__arch64__))
+	atomic_fence(atomic_memory_order_relaxed);
+#  else
+	atomic_fence(atomic_memory_order_seq_cst);
+#  endif
+}
+
+ATOMIC_INLINE void
+atomic_post_sc_store_fence() {
+#  if defined(__i386__) || defined(__x86_64__) ||			\
+    (defined(__sparc__) && defined(__arch64__))
+	atomic_fence(atomic_memory_order_seq_cst);
+#  else
+	atomic_fence(atomic_memory_order_relaxed);
+#  endif
+
+}
+
+#define JEMALLOC_GENERATE_ATOMICS(type, short_type,			\
+    /* unused */ lg_size)						\
+typedef struct {							\
+	type volatile repr;						\
+} atomic_##short_type##_t;						\
+									\
+ATOMIC_INLINE type							\
+atomic_load_##short_type(const atomic_##short_type##_t *a,		\
+    atomic_memory_order_t mo) {						\
+	if (mo == atomic_memory_order_seq_cst) {			\
+		atomic_pre_sc_load_fence();				\
+	}								\
+	type result = a->repr;						\
+	if (mo != atomic_memory_order_relaxed) {			\
+		atomic_fence(atomic_memory_order_acquire);		\
+	}								\
+	return result;							\
+}									\
+									\
+ATOMIC_INLINE void							\
+atomic_store_##short_type(atomic_##short_type##_t *a,			\
+    type val, atomic_memory_order_t mo) {				\
+	if (mo != atomic_memory_order_relaxed) {			\
+		atomic_fence(atomic_memory_order_release);		\
+	}								\
+	a->repr = val;							\
+	if (mo == atomic_memory_order_seq_cst) {			\
+		atomic_post_sc_store_fence();				\
+	}								\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_exchange_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	/*								\
+	 * Because of FreeBSD, we care about gcc 4.2, which doesn't have\
+	 * an atomic exchange builtin.  We fake it with a CAS loop.	\
+	 */								\
+	while (true) {							\
+		type old = a->repr;					\
+		if (__sync_bool_compare_and_swap(&a->repr, old, val)) {	\
+			return old;					\
+		}							\
+	}								\
+}									\
+									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired,					\
+    atomic_memory_order_t success_mo,					\
+    atomic_memory_order_t failure_mo) {				\
+	type prev = __sync_val_compare_and_swap(&a->repr, *expected,	\
+	    desired);							\
+	if (prev == *expected) {					\
+		return true;						\
+	} else {							\
+		*expected = prev;					\
+		return false;						\
+	}								\
+}									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired,					\
+    atomic_memory_order_t success_mo,					\
+    atomic_memory_order_t failure_mo) {				\
+	type prev = __sync_val_compare_and_swap(&a->repr, *expected,	\
+	    desired);							\
+	if (prev == *expected) {					\
+		return true;						\
+	} else {							\
+		*expected = prev;					\
+		return false;						\
+	}								\
+}
+
+#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type,			\
+    /* unused */ lg_size)						\
+JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size)	\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_add_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __sync_fetch_and_add(&a->repr, val);			\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __sync_fetch_and_sub(&a->repr, val);			\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_and_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __sync_fetch_and_and(&a->repr, val);			\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_or_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __sync_fetch_and_or(&a->repr, val);			\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return __sync_fetch_and_xor(&a->repr, val);			\
+}
+
+#endif /* JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H */

+ 158 - 0
BeefRT/JEMalloc/include/jemalloc/internal/atomic_msvc.h

@@ -0,0 +1,158 @@
+#ifndef JEMALLOC_INTERNAL_ATOMIC_MSVC_H
+#define JEMALLOC_INTERNAL_ATOMIC_MSVC_H
+
+#define ATOMIC_INIT(...) {__VA_ARGS__}
+
+typedef enum {
+	atomic_memory_order_relaxed,
+	atomic_memory_order_acquire,
+	atomic_memory_order_release,
+	atomic_memory_order_acq_rel,
+	atomic_memory_order_seq_cst
+} atomic_memory_order_t;
+
+typedef char atomic_repr_0_t;
+typedef short atomic_repr_1_t;
+typedef long atomic_repr_2_t;
+typedef __int64 atomic_repr_3_t;
+
+ATOMIC_INLINE void
+atomic_fence(atomic_memory_order_t mo) {
+	_ReadWriteBarrier();
+#  if defined(_M_ARM) || defined(_M_ARM64)
+	/* ARM needs a barrier for everything but relaxed. */
+	if (mo != atomic_memory_order_relaxed) {
+		MemoryBarrier();
+	}
+#  elif defined(_M_IX86) || defined (_M_X64)
+	/* x86 needs a barrier only for seq_cst. */
+	if (mo == atomic_memory_order_seq_cst) {
+		MemoryBarrier();
+	}
+#  else
+#  error "Don't know how to create atomics for this platform for MSVC."
+#  endif
+	_ReadWriteBarrier();
+}
+
+#define ATOMIC_INTERLOCKED_REPR(lg_size) atomic_repr_ ## lg_size ## _t
+
+#define ATOMIC_CONCAT(a, b) ATOMIC_RAW_CONCAT(a, b)
+#define ATOMIC_RAW_CONCAT(a, b) a ## b
+
+#define ATOMIC_INTERLOCKED_NAME(base_name, lg_size) ATOMIC_CONCAT(	\
+    base_name, ATOMIC_INTERLOCKED_SUFFIX(lg_size))
+
+#define ATOMIC_INTERLOCKED_SUFFIX(lg_size)				\
+    ATOMIC_CONCAT(ATOMIC_INTERLOCKED_SUFFIX_, lg_size)
+
+#define ATOMIC_INTERLOCKED_SUFFIX_0 8
+#define ATOMIC_INTERLOCKED_SUFFIX_1 16
+#define ATOMIC_INTERLOCKED_SUFFIX_2
+#define ATOMIC_INTERLOCKED_SUFFIX_3 64
+
+#define JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size)		\
+typedef struct {							\
+	ATOMIC_INTERLOCKED_REPR(lg_size) repr;				\
+} atomic_##short_type##_t;						\
+									\
+ATOMIC_INLINE type							\
+atomic_load_##short_type(const atomic_##short_type##_t *a,		\
+    atomic_memory_order_t mo) {						\
+	ATOMIC_INTERLOCKED_REPR(lg_size) ret = a->repr;			\
+	if (mo != atomic_memory_order_relaxed) {			\
+		atomic_fence(atomic_memory_order_acquire);		\
+	}								\
+	return (type) ret;						\
+}									\
+									\
+ATOMIC_INLINE void							\
+atomic_store_##short_type(atomic_##short_type##_t *a,			\
+    type val, atomic_memory_order_t mo) {				\
+	if (mo != atomic_memory_order_relaxed) {			\
+		atomic_fence(atomic_memory_order_release);		\
+	}								\
+	a->repr = (ATOMIC_INTERLOCKED_REPR(lg_size)) val;		\
+	if (mo == atomic_memory_order_seq_cst) {			\
+		atomic_fence(atomic_memory_order_seq_cst);		\
+	}								\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_exchange_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedExchange,	\
+	    lg_size)(&a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);	\
+}									\
+									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired, atomic_memory_order_t success_mo,	\
+    atomic_memory_order_t failure_mo) {					\
+	ATOMIC_INTERLOCKED_REPR(lg_size) e =				\
+	    (ATOMIC_INTERLOCKED_REPR(lg_size))*expected;		\
+	ATOMIC_INTERLOCKED_REPR(lg_size) d =				\
+	    (ATOMIC_INTERLOCKED_REPR(lg_size))desired;			\
+	ATOMIC_INTERLOCKED_REPR(lg_size) old =				\
+	    ATOMIC_INTERLOCKED_NAME(_InterlockedCompareExchange, 	\
+		lg_size)(&a->repr, d, e);				\
+	if (old == e) {							\
+		return true;						\
+	} else {							\
+		*expected = (type)old;					\
+		return false;						\
+	}								\
+}									\
+									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired, atomic_memory_order_t success_mo,	\
+    atomic_memory_order_t failure_mo) {					\
+	/* We implement the weak version with strong semantics. */	\
+	return atomic_compare_exchange_weak_##short_type(a, expected,	\
+	    desired, success_mo, failure_mo);				\
+}
+
+
+#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, lg_size)	\
+JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size)			\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_add_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedExchangeAdd,	\
+	    lg_size)(&a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);	\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_sub_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	/*								\
+	 * MSVC warns on negation of unsigned operands, but for us it	\
+	 * gives exactly the right semantics (MAX_TYPE + 1 - operand).	\
+	 */								\
+	__pragma(warning(push))						\
+	__pragma(warning(disable: 4146))				\
+	return atomic_fetch_add_##short_type(a, -val, mo);		\
+	__pragma(warning(pop))						\
+}									\
+ATOMIC_INLINE type							\
+atomic_fetch_and_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedAnd, lg_size)(	\
+	    &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);		\
+}									\
+ATOMIC_INLINE type							\
+atomic_fetch_or_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedOr, lg_size)(	\
+	    &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);		\
+}									\
+ATOMIC_INLINE type							\
+atomic_fetch_xor_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedXor, lg_size)(	\
+	    &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);		\
+}
+
+#endif /* JEMALLOC_INTERNAL_ATOMIC_MSVC_H */
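
How the token-pasting above selects an MSVC intrinsic, shown as a comment rather than compiled code:

/*
 * ATOMIC_INTERLOCKED_NAME(_InterlockedExchange, 0) -> _InterlockedExchange8
 * ATOMIC_INTERLOCKED_NAME(_InterlockedExchange, 1) -> _InterlockedExchange16
 * ATOMIC_INTERLOCKED_NAME(_InterlockedExchange, 2) -> _InterlockedExchange
 * ATOMIC_INTERLOCKED_NAME(_InterlockedExchange, 3) -> _InterlockedExchange64
 *
 * i.e. lg_size 2 (a 32-bit long) maps to the unsuffixed intrinsic, because
 * ATOMIC_INTERLOCKED_SUFFIX_2 expands to nothing.
 */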

+ 33 - 0
BeefRT/JEMalloc/include/jemalloc/internal/background_thread_externs.h

@@ -0,0 +1,33 @@
+#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H
+#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H
+
+extern bool opt_background_thread;
+extern size_t opt_max_background_threads;
+extern malloc_mutex_t background_thread_lock;
+extern atomic_b_t background_thread_enabled_state;
+extern size_t n_background_threads;
+extern size_t max_background_threads;
+extern background_thread_info_t *background_thread_info;
+
+bool background_thread_create(tsd_t *tsd, unsigned arena_ind);
+bool background_threads_enable(tsd_t *tsd);
+bool background_threads_disable(tsd_t *tsd);
+bool background_thread_is_started(background_thread_info_t* info);
+void background_thread_wakeup_early(background_thread_info_t *info,
+    nstime_t *remaining_sleep);
+void background_thread_prefork0(tsdn_t *tsdn);
+void background_thread_prefork1(tsdn_t *tsdn);
+void background_thread_postfork_parent(tsdn_t *tsdn);
+void background_thread_postfork_child(tsdn_t *tsdn);
+bool background_thread_stats_read(tsdn_t *tsdn,
+    background_thread_stats_t *stats);
+void background_thread_ctl_init(tsdn_t *tsdn);
+
+#ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER
+extern int pthread_create_wrapper(pthread_t *__restrict, const pthread_attr_t *,
+    void *(*)(void *), void *__restrict);
+#endif
+bool background_thread_boot0(void);
+bool background_thread_boot1(tsdn_t *tsdn, base_t *base);
+
+#endif /* JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H */

+ 48 - 0
BeefRT/JEMalloc/include/jemalloc/internal/background_thread_inlines.h

@@ -0,0 +1,48 @@
+#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H
+#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H
+
+JEMALLOC_ALWAYS_INLINE bool
+background_thread_enabled(void) {
+	return atomic_load_b(&background_thread_enabled_state, ATOMIC_RELAXED);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+background_thread_enabled_set(tsdn_t *tsdn, bool state) {
+	malloc_mutex_assert_owner(tsdn, &background_thread_lock);
+	atomic_store_b(&background_thread_enabled_state, state, ATOMIC_RELAXED);
+}
+
+JEMALLOC_ALWAYS_INLINE background_thread_info_t *
+arena_background_thread_info_get(arena_t *arena) {
+	unsigned arena_ind = arena_ind_get(arena);
+	return &background_thread_info[arena_ind % max_background_threads];
+}
+
+JEMALLOC_ALWAYS_INLINE background_thread_info_t *
+background_thread_info_get(size_t ind) {
+	return &background_thread_info[ind % max_background_threads];
+}
+
+JEMALLOC_ALWAYS_INLINE uint64_t
+background_thread_wakeup_time_get(background_thread_info_t *info) {
+	uint64_t next_wakeup = nstime_ns(&info->next_wakeup);
+	assert(atomic_load_b(&info->indefinite_sleep, ATOMIC_ACQUIRE) ==
+	    (next_wakeup == BACKGROUND_THREAD_INDEFINITE_SLEEP));
+	return next_wakeup;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+background_thread_wakeup_time_set(tsdn_t *tsdn, background_thread_info_t *info,
+    uint64_t wakeup_time) {
+	malloc_mutex_assert_owner(tsdn, &info->mtx);
+	atomic_store_b(&info->indefinite_sleep,
+	    wakeup_time == BACKGROUND_THREAD_INDEFINITE_SLEEP, ATOMIC_RELEASE);
+	nstime_init(&info->next_wakeup, wakeup_time);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+background_thread_indefinite_sleep(background_thread_info_t *info) {
+	return atomic_load_b(&info->indefinite_sleep, ATOMIC_ACQUIRE);
+}
+
+#endif /* JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H */

+ 66 - 0
BeefRT/JEMalloc/include/jemalloc/internal/background_thread_structs.h

@@ -0,0 +1,66 @@
+#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_STRUCTS_H
+#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_STRUCTS_H
+
+/* This file really combines "structs" and "types", but only transitionally. */
+
+#if defined(JEMALLOC_BACKGROUND_THREAD) || defined(JEMALLOC_LAZY_LOCK)
+#  define JEMALLOC_PTHREAD_CREATE_WRAPPER
+#endif
+
+#define BACKGROUND_THREAD_INDEFINITE_SLEEP UINT64_MAX
+#define MAX_BACKGROUND_THREAD_LIMIT MALLOCX_ARENA_LIMIT
+#define DEFAULT_NUM_BACKGROUND_THREAD 4
+
+/*
+ * These exist only as a transitional state.  Eventually, deferral should be
+ * part of the PAI, and each implementation can indicate wait times with more
+ * specificity.
+ */
+#define BACKGROUND_THREAD_HPA_INTERVAL_MAX_UNINITIALIZED (-2)
+#define BACKGROUND_THREAD_HPA_INTERVAL_MAX_DEFAULT_WHEN_ENABLED 5000
+
+#define BACKGROUND_THREAD_DEFERRED_MIN UINT64_C(0)
+#define BACKGROUND_THREAD_DEFERRED_MAX UINT64_MAX
+
+typedef enum {
+	background_thread_stopped,
+	background_thread_started,
+	/* Thread waits on the global lock when paused (for arena_reset). */
+	background_thread_paused,
+} background_thread_state_t;
+
+struct background_thread_info_s {
+#ifdef JEMALLOC_BACKGROUND_THREAD
+	/* Background thread is pthread specific. */
+	pthread_t		thread;
+	pthread_cond_t		cond;
+#endif
+	malloc_mutex_t		mtx;
+	background_thread_state_t	state;
+	/* When true, it means no wakeup scheduled. */
+	atomic_b_t		indefinite_sleep;
+	/* Next scheduled wakeup time (absolute time in ns). */
+	nstime_t		next_wakeup;
+	/*
+	 * Number of pages newly added since the last background thread run
+	 * that will need purging by the next wakeup.  This is adjusted on
+	 * epoch advance, and is used to decide whether to signal the
+	 * background thread to wake up earlier.
+	 */
+	size_t			npages_to_purge_new;
+	/* Stats: total number of runs since started. */
+	uint64_t		tot_n_runs;
+	/* Stats: total sleep time since started. */
+	nstime_t		tot_sleep_time;
+};
+typedef struct background_thread_info_s background_thread_info_t;
+
+struct background_thread_stats_s {
+	size_t num_threads;
+	uint64_t num_runs;
+	nstime_t run_interval;
+	mutex_prof_data_t max_counter_per_bg_thd;
+};
+typedef struct background_thread_stats_s background_thread_stats_t;
+
+#endif /* JEMALLOC_INTERNAL_BACKGROUND_THREAD_STRUCTS_H */

+ 110 - 0
BeefRT/JEMalloc/include/jemalloc/internal/base.h

@@ -0,0 +1,110 @@
+#ifndef JEMALLOC_INTERNAL_BASE_H
+#define JEMALLOC_INTERNAL_BASE_H
+
+#include "jemalloc/internal/edata.h"
+#include "jemalloc/internal/ehooks.h"
+#include "jemalloc/internal/mutex.h"
+
+enum metadata_thp_mode_e {
+	metadata_thp_disabled   = 0,
+	/*
+	 * Lazily enable hugepages for metadata.  To avoid high RSS caused by
+	 * THP plus a low-usage arena (i.e. THP becoming a significant fraction
+	 * of resident memory), the "auto" option only starts using THP after a
+	 * base allocator has used up its first THP region.  Starting from the
+	 * second hugepage (in a single arena), "auto" behaves the same as
+	 * "always", i.e. madvise hugepage right away.
+	 */
+	metadata_thp_auto       = 1,
+	metadata_thp_always     = 2,
+	metadata_thp_mode_limit = 3
+};
+typedef enum metadata_thp_mode_e metadata_thp_mode_t;
+
+#define METADATA_THP_DEFAULT metadata_thp_disabled
+extern metadata_thp_mode_t opt_metadata_thp;
+extern const char *metadata_thp_mode_names[];
+
+
+/* Embedded at the beginning of every block of base-managed virtual memory. */
+typedef struct base_block_s base_block_t;
+struct base_block_s {
+	/* Total size of block's virtual memory mapping. */
+	size_t size;
+
+	/* Next block in list of base's blocks. */
+	base_block_t *next;
+
+	/* Tracks unused trailing space. */
+	edata_t edata;
+};
+
+typedef struct base_s base_t;
+struct base_s {
+	/*
+	 * User-configurable extent hook functions.
+	 */
+	ehooks_t ehooks;
+
+	/*
+	 * User-configurable extent hook functions for metadata allocations.
+	 */
+	ehooks_t ehooks_base;
+
+	/* Protects base_alloc() and base_stats_get() operations. */
+	malloc_mutex_t mtx;
+
+	/* Using THP when true (metadata_thp auto mode). */
+	bool auto_thp_switched;
+	/*
+	 * Most recent size class in the series of increasingly large base
+	 * extents.  Logarithmic spacing between subsequent allocations ensures
+	 * that the total number of distinct mappings remains small.
+	 */
+	pszind_t pind_last;
+
+	/* Serial number generation state. */
+	size_t extent_sn_next;
+
+	/* Chain of all blocks associated with base. */
+	base_block_t *blocks;
+
+	/* Heap of extents that track unused trailing space within blocks. */
+	edata_heap_t avail[SC_NSIZES];
+
+	/* Stats, only maintained if config_stats. */
+	size_t allocated;
+	size_t resident;
+	size_t mapped;
+	/* Number of THP regions touched. */
+	size_t n_thp;
+};
+
+static inline unsigned
+base_ind_get(const base_t *base) {
+	return ehooks_ind_get(&base->ehooks);
+}
+
+static inline bool
+metadata_thp_enabled(void) {
+	return (opt_metadata_thp != metadata_thp_disabled);
+}
+
+base_t *b0get(void);
+base_t *base_new(tsdn_t *tsdn, unsigned ind,
+    const extent_hooks_t *extent_hooks, bool metadata_use_hooks);
+void base_delete(tsdn_t *tsdn, base_t *base);
+ehooks_t *base_ehooks_get(base_t *base);
+ehooks_t *base_ehooks_get_for_metadata(base_t *base);
+extent_hooks_t *base_extent_hooks_set(base_t *base,
+    extent_hooks_t *extent_hooks);
+void *base_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment);
+edata_t *base_alloc_edata(tsdn_t *tsdn, base_t *base);
+void base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated,
+    size_t *resident, size_t *mapped, size_t *n_thp);
+void base_prefork(tsdn_t *tsdn, base_t *base);
+void base_postfork_parent(tsdn_t *tsdn, base_t *base);
+void base_postfork_child(tsdn_t *tsdn, base_t *base);
+bool base_boot(tsdn_t *tsdn);
+
+#endif /* JEMALLOC_INTERNAL_BASE_H */
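A minimal sketch of the metadata_thp "auto" policy described in the enum comment above, with a plain bool standing in for base_t's auto_thp_switched flag; this is illustrative only, not the actual base.c logic:

#include <assert.h>
#include <stdbool.h>

/* Mirrors metadata_thp_mode_e; names here are illustrative. */
enum thp_mode { thp_disabled, thp_auto, thp_always };

static bool
metadata_should_use_thp(enum thp_mode mode, bool auto_switched) {
	switch (mode) {
	case thp_always:
		return true;
	case thp_auto:
		/* "auto" turns THP on only once the first region is used up. */
		return auto_switched;
	default:
		return false;
	}
}

int main(void) {
	assert(!metadata_should_use_thp(thp_auto, false));
	assert(metadata_should_use_thp(thp_auto, true));
	assert(metadata_should_use_thp(thp_always, false));
	return 0;
}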

+ 82 - 0
BeefRT/JEMalloc/include/jemalloc/internal/bin.h

@@ -0,0 +1,82 @@
+#ifndef JEMALLOC_INTERNAL_BIN_H
+#define JEMALLOC_INTERNAL_BIN_H
+
+#include "jemalloc/internal/bin_stats.h"
+#include "jemalloc/internal/bin_types.h"
+#include "jemalloc/internal/edata.h"
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/sc.h"
+
+/*
+ * A bin contains a set of extents that are currently being used for slab
+ * allocations.
+ */
+typedef struct bin_s bin_t;
+struct bin_s {
+	/* All operations on bin_t fields require lock ownership. */
+	malloc_mutex_t		lock;
+
+	/*
+	 * Bin statistics.  These get touched every time the lock is acquired,
+	 * so put them close by in the hopes of getting some cache locality.
+	 */
+	bin_stats_t	stats;
+
+	/*
+	 * Current slab being used to service allocations of this bin's size
+	 * class.  slabcur is independent of slabs_{nonfull,full}; whenever
+	 * slabcur is reassigned, the previous slab must be deallocated or
+	 * inserted into slabs_{nonfull,full}.
+	 */
+	edata_t			*slabcur;
+
+	/*
+	 * Heap of non-full slabs.  This heap is used to ensure that new
+	 * allocations come from the non-full slab that is oldest/lowest in
+	 * memory.
+	 */
+	edata_heap_t		slabs_nonfull;
+
+	/* List used to track full slabs. */
+	edata_list_active_t	slabs_full;
+};
+
+/* A set of sharded bins of the same size class. */
+typedef struct bins_s bins_t;
+struct bins_s {
+	/* Sharded bins.  Dynamically sized. */
+	bin_t *bin_shards;
+};
+
+void bin_shard_sizes_boot(unsigned bin_shards[SC_NBINS]);
+bool bin_update_shard_size(unsigned bin_shards[SC_NBINS], size_t start_size,
+    size_t end_size, size_t nshards);
+
+/* Initializes a bin to empty.  Returns true on error. */
+bool bin_init(bin_t *bin);
+
+/* Forking. */
+void bin_prefork(tsdn_t *tsdn, bin_t *bin);
+void bin_postfork_parent(tsdn_t *tsdn, bin_t *bin);
+void bin_postfork_child(tsdn_t *tsdn, bin_t *bin);
+
+/* Stats. */
+static inline void
+bin_stats_merge(tsdn_t *tsdn, bin_stats_data_t *dst_bin_stats, bin_t *bin) {
+	malloc_mutex_lock(tsdn, &bin->lock);
+	malloc_mutex_prof_accum(tsdn, &dst_bin_stats->mutex_data, &bin->lock);
+	bin_stats_t *stats = &dst_bin_stats->stats_data;
+	stats->nmalloc += bin->stats.nmalloc;
+	stats->ndalloc += bin->stats.ndalloc;
+	stats->nrequests += bin->stats.nrequests;
+	stats->curregs += bin->stats.curregs;
+	stats->nfills += bin->stats.nfills;
+	stats->nflushes += bin->stats.nflushes;
+	stats->nslabs += bin->stats.nslabs;
+	stats->reslabs += bin->stats.reslabs;
+	stats->curslabs += bin->stats.curslabs;
+	stats->nonfull_slabs += bin->stats.nonfull_slabs;
+	malloc_mutex_unlock(tsdn, &bin->lock);
+}
+
+#endif /* JEMALLOC_INTERNAL_BIN_H */

+ 50 - 0
BeefRT/JEMalloc/include/jemalloc/internal/bin_info.h

@@ -0,0 +1,50 @@
+#ifndef JEMALLOC_INTERNAL_BIN_INFO_H
+#define JEMALLOC_INTERNAL_BIN_INFO_H
+
+#include "jemalloc/internal/bitmap.h"
+
+/*
+ * Read-only information associated with each element of arena_t's bins array
+ * is stored separately, partly to reduce memory usage (only one copy, rather
+ * than one per arena), but mainly to avoid false cacheline sharing.
+ *
+ * Each slab has the following layout:
+ *
+ *   /--------------------\
+ *   | region 0           |
+ *   |--------------------|
+ *   | region 1           |
+ *   |--------------------|
+ *   | ...                |
+ *   | ...                |
+ *   | ...                |
+ *   |--------------------|
+ *   | region nregs-1     |
+ *   \--------------------/
+ */
+typedef struct bin_info_s bin_info_t;
+struct bin_info_s {
+	/* Size of regions in a slab for this bin's size class. */
+	size_t			reg_size;
+
+	/* Total size of a slab for this bin's size class. */
+	size_t			slab_size;
+
+	/* Total number of regions in a slab for this bin's size class. */
+	uint32_t		nregs;
+
+	/* Number of sharded bins in each arena for this size class. */
+	uint32_t		n_shards;
+
+	/*
+	 * Metadata used to manipulate bitmaps for slabs associated with this
+	 * bin.
+	 */
+	bitmap_info_t		bitmap_info;
+};
+
+extern bin_info_t bin_infos[SC_NBINS];
+
+void bin_info_boot(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]);
+
+#endif /* JEMALLOC_INTERNAL_BIN_INFO_H */
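As a worked example of how these fields relate (hypothetical numbers, not an actual jemalloc size class): a slab of four 4 KiB pages serving 160-byte regions holds 102 regions, leaving a 64-byte unusable tail, and the slab bitmap needs one bit per region.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

int main(void) {
	size_t reg_size = 160;		/* hypothetical bin_info_t.reg_size */
	size_t slab_size = 4 * 4096;	/* hypothetical bin_info_t.slab_size */
	uint32_t nregs = (uint32_t)(slab_size / reg_size);	/* nregs */
	assert(nregs == 102);
	/* 102 * 160 = 16320, so 64 bytes of the slab are unusable tail. */
	assert(slab_size - (size_t)nregs * reg_size == 64);
	return 0;
}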

+ 57 - 0
BeefRT/JEMalloc/include/jemalloc/internal/bin_stats.h

@@ -0,0 +1,57 @@
+#ifndef JEMALLOC_INTERNAL_BIN_STATS_H
+#define JEMALLOC_INTERNAL_BIN_STATS_H
+
+#include "jemalloc/internal/mutex_prof.h"
+
+typedef struct bin_stats_s bin_stats_t;
+struct bin_stats_s {
+	/*
+	 * Total number of allocation/deallocation requests served directly by
+	 * the bin.  Note that tcache may allocate an object, then recycle it
+	 * many times, resulting in many increments to nrequests, but only one
+	 * each to nmalloc and ndalloc.
+	 */
+	uint64_t	nmalloc;
+	uint64_t	ndalloc;
+
+	/*
+	 * Number of allocation requests that correspond to the size of this
+	 * bin.  This includes requests served by tcache, though tcache only
+	 * periodically merges into this counter.
+	 */
+	uint64_t	nrequests;
+
+	/*
+	 * Current number of regions of this size class, including regions
+	 * currently cached by tcache.
+	 */
+	size_t		curregs;
+
+	/* Number of tcache fills from this bin. */
+	uint64_t	nfills;
+
+	/* Number of tcache flushes to this bin. */
+	uint64_t	nflushes;
+
+	/* Total number of slabs created for this bin's size class. */
+	uint64_t	nslabs;
+
+	/*
+	 * Total number of slabs reused by extracting them from the slabs heap
+	 * for this bin's size class.
+	 */
+	uint64_t	reslabs;
+
+	/* Current number of slabs in this bin. */
+	size_t		curslabs;
+
+	/* Current size of nonfull slabs heap in this bin. */
+	size_t		nonfull_slabs;
+};
+
+typedef struct bin_stats_data_s bin_stats_data_t;
+struct bin_stats_data_s {
+	bin_stats_t stats_data;
+	mutex_prof_data_t mutex_data;
+};
+#endif /* JEMALLOC_INTERNAL_BIN_STATS_H */

+ 17 - 0
BeefRT/JEMalloc/include/jemalloc/internal/bin_types.h

@@ -0,0 +1,17 @@
+#ifndef JEMALLOC_INTERNAL_BIN_TYPES_H
+#define JEMALLOC_INTERNAL_BIN_TYPES_H
+
+#include "jemalloc/internal/sc.h"
+
+#define BIN_SHARDS_MAX (1 << EDATA_BITS_BINSHARD_WIDTH)
+#define N_BIN_SHARDS_DEFAULT 1
+
+/* Used in TSD static initializer only. Real init in arena_bind(). */
+#define TSD_BINSHARDS_ZERO_INITIALIZER {{UINT8_MAX}}
+
+typedef struct tsd_binshards_s tsd_binshards_t;
+struct tsd_binshards_s {
+	uint8_t binshard[SC_NBINS];
+};
+
+#endif /* JEMALLOC_INTERNAL_BIN_TYPES_H */

+ 422 - 0
BeefRT/JEMalloc/include/jemalloc/internal/bit_util.h

@@ -0,0 +1,422 @@
+#ifndef JEMALLOC_INTERNAL_BIT_UTIL_H
+#define JEMALLOC_INTERNAL_BIT_UTIL_H
+
+#include "jemalloc/internal/assert.h"
+
+/* Sanity check. */
+#if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \
+    || !defined(JEMALLOC_INTERNAL_FFS)
+#  error JEMALLOC_INTERNAL_FFS{,L,LL} should have been defined by configure
+#endif
+
+/*
+ * Unlike the builtins and posix ffs functions, our ffs requires a non-zero
+ * input, and returns the position of the lowest bit set (as opposed to the
+ * posix versions, which return 1 larger than that position and use a return
+ * value of zero as a sentinel).  This tends to simplify logic in callers, and
+ * allows for consistency with the builtins we build fls on top of.
+ */
+static inline unsigned
+ffs_llu(unsigned long long x) {
+	util_assume(x != 0);
+	return JEMALLOC_INTERNAL_FFSLL(x) - 1;
+}
+
+static inline unsigned
+ffs_lu(unsigned long x) {
+	util_assume(x != 0);
+	return JEMALLOC_INTERNAL_FFSL(x) - 1;
+}
+
+static inline unsigned
+ffs_u(unsigned x) {
+	util_assume(x != 0);
+	return JEMALLOC_INTERNAL_FFS(x) - 1;
+}
+
+#define DO_FLS_SLOW(x, suffix) do {					\
+	util_assume(x != 0);						\
+	x |= (x >> 1);							\
+	x |= (x >> 2);							\
+	x |= (x >> 4);							\
+	x |= (x >> 8);							\
+	x |= (x >> 16);							\
+	if (sizeof(x) > 4) {						\
+		/*							\
+		 * If sizeof(x) is 4, then the expression "x >> 32"	\
+		 * will generate compiler warnings even if the code	\
+		 * never executes.  This circumvents the warning, and	\
+		 * gets compiled out in optimized builds.		\
+		 */							\
+		int constant_32 = sizeof(x) * 4;			\
+		x |= (x >> constant_32);				\
+	}								\
+	x++;								\
+	if (x == 0) {							\
+		return 8 * sizeof(x) - 1;				\
+	}								\
+	return ffs_##suffix(x) - 1;					\
+} while(0)
+
+static inline unsigned
+fls_llu_slow(unsigned long long x) {
+	DO_FLS_SLOW(x, llu);
+}
+
+static inline unsigned
+fls_lu_slow(unsigned long x) {
+	DO_FLS_SLOW(x, lu);
+}
+
+static inline unsigned
+fls_u_slow(unsigned x) {
+	DO_FLS_SLOW(x, u);
+}
+
+#undef DO_FLS_SLOW
+
+#ifdef JEMALLOC_HAVE_BUILTIN_CLZ
+static inline unsigned
+fls_llu(unsigned long long x) {
+	util_assume(x != 0);
+	/*
+	 * Note that the xor here is more naturally written as subtraction; the
+	 * last bit set is the number of bits in the type minus the number of
+	 * leading zero bits.  But GCC implements that as:
+	 *    bsr     edi, edi
+	 *    mov     eax, 31
+	 *    xor     edi, 31
+	 *    sub     eax, edi
+	 * If we write it as xor instead, then we get
+	 *    bsr     eax, edi
+	 * as desired.
+	 */
+	return (8 * sizeof(x) - 1) ^ __builtin_clzll(x);
+}
+
+static inline unsigned
+fls_lu(unsigned long x) {
+	util_assume(x != 0);
+	return (8 * sizeof(x) - 1) ^ __builtin_clzl(x);
+}
+
+static inline unsigned
+fls_u(unsigned x) {
+	util_assume(x != 0);
+	return (8 * sizeof(x) - 1) ^ __builtin_clz(x);
+}
+#elif defined(_MSC_VER)
+
+#if LG_SIZEOF_PTR == 3
+#define DO_BSR64(bit, x) _BitScanReverse64(&bit, x)
+#else
+/*
+ * This never actually runs; we're just dodging a compiler error for the
+ * never-taken branch where sizeof(void *) == 8.
+ */
+#define DO_BSR64(bit, x) bit = 0; unreachable()
+#endif
+
+#define DO_FLS(x) do {							\
+	if (x == 0) {							\
+		return 8 * sizeof(x);					\
+	}								\
+	unsigned long bit;						\
+	if (sizeof(x) == 4) {						\
+		_BitScanReverse(&bit, (unsigned)x);			\
+		return (unsigned)bit;					\
+	}								\
+	if (sizeof(x) == 8 && sizeof(void *) == 8) {			\
+		DO_BSR64(bit, x);					\
+		return (unsigned)bit;					\
+	}								\
+	if (sizeof(x) == 8 && sizeof(void *) == 4) {			\
+		/* Dodge a compiler warning, as above. */		\
+		int constant_32 = sizeof(x) * 4;			\
+		if (_BitScanReverse(&bit,				\
+		    (unsigned)(x >> constant_32))) {			\
+			return 32 + (unsigned)bit;			\
+		} else {						\
+			_BitScanReverse(&bit, (unsigned)x);		\
+			return (unsigned)bit;				\
+		}							\
+	}								\
+	unreachable();							\
+} while (0)
+
+static inline unsigned
+fls_llu(unsigned long long x) {
+	DO_FLS(x);
+}
+
+static inline unsigned
+fls_lu(unsigned long x) {
+	DO_FLS(x);
+}
+
+static inline unsigned
+fls_u(unsigned x) {
+	DO_FLS(x);
+}
+
+#undef DO_FLS
+#undef DO_BSR64
+#else
+
+static inline unsigned
+fls_llu(unsigned long long x) {
+	return fls_llu_slow(x);
+}
+
+static inline unsigned
+fls_lu(unsigned long x) {
+	return fls_lu_slow(x);
+}
+
+static inline unsigned
+fls_u(unsigned x) {
+	return fls_u_slow(x);
+}
+#endif
+
+#if LG_SIZEOF_LONG_LONG > 3
+#  error "Haven't implemented popcount for 16-byte ints."
+#endif
+
+#define DO_POPCOUNT(x, type) do {					\
+	/*								\
+	 * Algorithm from an old AMD optimization reference manual.	\
+	 * We're putting a little bit more work than you might expect	\
+	 * into the no-intrinsic case, since we only support the	\
+	 * GCC intrinsics spelling of popcount (for now).  Detecting	\
+	 * whether or not the popcount builtin is actually usable in	\
+	 * MSVC is nontrivial.						\
+	 */								\
+									\
+	type bmul = (type)0x0101010101010101ULL;			\
+									\
+	/*								\
+	 * Replace each 2 bits with the sideways sum of the original	\
+	 * values.  0x5 = 0b0101.					\
+	 *								\
+	 * You might expect this to be:					\
+	 *   x = (x & 0x55...) + ((x >> 1) & 0x55...).			\
+	 * That costs an extra mask relative to this, though.		\
+	 */								\
+	x = x - ((x >> 1) & (0x55U * bmul));				\
+	/* Replace each 4 bits with their sideways sum.  0x3 = 0b0011. */\
+	x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U));		\
+	/*								\
+	 * Replace each 8 bits with their sideways sum.  Note that we	\
+	 * can't overflow within each 4-bit sum here, so we can skip	\
+	 * the initial mask.						\
+	 */								\
+	x = (x + (x >> 4)) & (bmul * 0x0FU);				\
+	/*								\
+	 * None of the partial sums in this multiplication (viewed in	\
+	 * base-256) can overflow into the next digit.  So the least	\
+	 * significant byte of the product will be the least		\
+	 * significant byte of the original value, the second least	\
+	 * significant byte will be the sum of the two least		\
+	 * significant bytes of the original value, and so on.		\
+	 * Importantly, the high byte will be the byte-wise sum of all	\
+	 * the bytes of the original value.				\
+	 */								\
+	x = x * bmul;							\
+	x >>= ((sizeof(x) - 1) * 8);					\
+	return (unsigned)x;						\
+} while(0)
+
+static inline unsigned
+popcount_u_slow(unsigned bitmap) {
+	DO_POPCOUNT(bitmap, unsigned);
+}
+
+static inline unsigned
+popcount_lu_slow(unsigned long bitmap) {
+	DO_POPCOUNT(bitmap, unsigned long);
+}
+
+static inline unsigned
+popcount_llu_slow(unsigned long long bitmap) {
+	DO_POPCOUNT(bitmap, unsigned long long);
+}
+
+#undef DO_POPCOUNT
+
+static inline unsigned
+popcount_u(unsigned bitmap) {
+#ifdef JEMALLOC_INTERNAL_POPCOUNT
+	return JEMALLOC_INTERNAL_POPCOUNT(bitmap);
+#else
+	return popcount_u_slow(bitmap);
+#endif
+}
+
+static inline unsigned
+popcount_lu(unsigned long bitmap) {
+#ifdef JEMALLOC_INTERNAL_POPCOUNTL
+	return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
+#else
+	return popcount_lu_slow(bitmap);
+#endif
+}
+
+static inline unsigned
+popcount_llu(unsigned long long bitmap) {
+#ifdef JEMALLOC_INTERNAL_POPCOUNTLL
+	return JEMALLOC_INTERNAL_POPCOUNTLL(bitmap);
+#else
+	return popcount_llu_slow(bitmap);
+#endif
+}
+
+/*
+ * Clears the lowest set bit in *bitmap, and returns the position of that
+ * bit.  *bitmap must not be 0.
+ */
+
+static inline size_t
+cfs_lu(unsigned long* bitmap) {
+	util_assume(*bitmap != 0);
+	size_t bit = ffs_lu(*bitmap);
+	*bitmap ^= ZU(1) << bit;
+	return bit;
+}
+
+static inline unsigned
+ffs_zu(size_t x) {
+#if LG_SIZEOF_PTR == LG_SIZEOF_INT
+	return ffs_u(x);
+#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
+	return ffs_lu(x);
+#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
+	return ffs_llu(x);
+#else
+#error No implementation for size_t ffs()
+#endif
+}
+
+static inline unsigned
+fls_zu(size_t x) {
+#if LG_SIZEOF_PTR == LG_SIZEOF_INT
+	return fls_u(x);
+#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
+	return fls_lu(x);
+#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
+	return fls_llu(x);
+#else
+#error No implementation for size_t fls()
+#endif
+}
+
+
+static inline unsigned
+ffs_u64(uint64_t x) {
+#if LG_SIZEOF_LONG == 3
+	return ffs_lu(x);
+#elif LG_SIZEOF_LONG_LONG == 3
+	return ffs_llu(x);
+#else
+#error No implementation for 64-bit ffs()
+#endif
+}
+
+static inline unsigned
+fls_u64(uint64_t x) {
+#if LG_SIZEOF_LONG == 3
+	return fls_lu(x);
+#elif LG_SIZEOF_LONG_LONG == 3
+	return fls_llu(x);
+#else
+#error No implementation for 64-bit fls()
+#endif
+}
+
+static inline unsigned
+ffs_u32(uint32_t x) {
+#if LG_SIZEOF_INT == 2
+	return ffs_u(x);
+#else
+#error No implementation for 32-bit ffs()
+#endif
+	return ffs_u(x);
+}
+
+static inline unsigned
+fls_u32(uint32_t x) {
+#if LG_SIZEOF_INT == 2
+	return fls_u(x);
+#else
+#error No implementation for 32-bit fls()
+#endif
+	return fls_u(x);
+}
+
+static inline uint64_t
+pow2_ceil_u64(uint64_t x) {
+	if (unlikely(x <= 1)) {
+		return x;
+	}
+	size_t msb_on_index = fls_u64(x - 1);
+	/*
+	 * Range-check; it's on the callers to ensure that the result of this
+	 * call won't overflow.
+	 */
+	assert(msb_on_index < 63);
+	return 1ULL << (msb_on_index + 1);
+}
+
+static inline uint32_t
+pow2_ceil_u32(uint32_t x) {
+	if (unlikely(x <= 1)) {
+	    return x;
+	}
+	size_t msb_on_index = fls_u32(x - 1);
+	/* As above. */
+	assert(msb_on_index < 31);
+	return 1U << (msb_on_index + 1);
+}
+
+/* Compute the smallest power of 2 that is >= x. */
+static inline size_t
+pow2_ceil_zu(size_t x) {
+#if (LG_SIZEOF_PTR == 3)
+	return pow2_ceil_u64(x);
+#else
+	return pow2_ceil_u32(x);
+#endif
+}
+
+static inline unsigned
+lg_floor(size_t x) {
+	util_assume(x != 0);
+#if (LG_SIZEOF_PTR == 3)
+	return fls_u64(x);
+#else
+	return fls_u32(x);
+#endif
+}
+
+static inline unsigned
+lg_ceil(size_t x) {
+	return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1);
+}
+
+/* A compile-time version of lg_floor and lg_ceil. */
+#define LG_FLOOR_1(x) 0
+#define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1))
+#define LG_FLOOR_4(x) (x < (1ULL << 2) ? LG_FLOOR_2(x) : 2 + LG_FLOOR_2(x >> 2))
+#define LG_FLOOR_8(x) (x < (1ULL << 4) ? LG_FLOOR_4(x) : 4 + LG_FLOOR_4(x >> 4))
+#define LG_FLOOR_16(x) (x < (1ULL << 8) ? LG_FLOOR_8(x) : 8 + LG_FLOOR_8(x >> 8))
+#define LG_FLOOR_32(x) (x < (1ULL << 16) ? LG_FLOOR_16(x) : 16 + LG_FLOOR_16(x >> 16))
+#define LG_FLOOR_64(x) (x < (1ULL << 32) ? LG_FLOOR_32(x) : 32 + LG_FLOOR_32(x >> 32))
+#if LG_SIZEOF_PTR == 2
+#  define LG_FLOOR(x) LG_FLOOR_32((x))
+#else
+#  define LG_FLOOR(x) LG_FLOOR_64((x))
+#endif
+
+#define LG_CEIL(x) (LG_FLOOR(x) + (((x) & ((x) - 1)) == 0 ? 0 : 1))
+
+#endif /* JEMALLOC_INTERNAL_BIT_UTIL_H */
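pow2_ceil_u64 above combines fls (the index of the highest set bit) with an fls(x - 1) + 1 step so that exact powers of two map to themselves. A portable stand-in with a loop-based fls in place of the builtin/BSR paths, to show the identity (sketch only):

#include <assert.h>
#include <stdint.h>

/* Loop-based stand-in for fls_u64: index of the highest set bit. */
static unsigned
fls64(uint64_t x) {
	unsigned r = 0;
	while (x >>= 1) {
		r++;
	}
	return r;
}

/* Same shape as pow2_ceil_u64: smallest power of 2 that is >= x. */
static uint64_t
pow2_ceil(uint64_t x) {
	if (x <= 1) {
		return x;
	}
	return UINT64_C(1) << (fls64(x - 1) + 1);
}

int main(void) {
	assert(pow2_ceil(1) == 1);
	assert(pow2_ceil(5) == 8);
	assert(pow2_ceil(8) == 8);	/* x - 1 keeps exact powers unchanged */
	assert(pow2_ceil(9) == 16);
	return 0;
}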

+ 368 - 0
BeefRT/JEMalloc/include/jemalloc/internal/bitmap.h

@@ -0,0 +1,368 @@
+#ifndef JEMALLOC_INTERNAL_BITMAP_H
+#define JEMALLOC_INTERNAL_BITMAP_H
+
+#include "jemalloc/internal/bit_util.h"
+#include "jemalloc/internal/sc.h"
+
+typedef unsigned long bitmap_t;
+#define LG_SIZEOF_BITMAP	LG_SIZEOF_LONG
+
+/* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */
+#if SC_LG_SLAB_MAXREGS > LG_CEIL(SC_NSIZES)
+/* Maximum bitmap bit count is determined by maximum regions per slab. */
+#  define LG_BITMAP_MAXBITS	SC_LG_SLAB_MAXREGS
+#else
+/* Maximum bitmap bit count is determined by number of extent size classes. */
+#  define LG_BITMAP_MAXBITS	LG_CEIL(SC_NSIZES)
+#endif
+#define BITMAP_MAXBITS		(ZU(1) << LG_BITMAP_MAXBITS)
+
+/* Number of bits per group. */
+#define LG_BITMAP_GROUP_NBITS		(LG_SIZEOF_BITMAP + 3)
+#define BITMAP_GROUP_NBITS		(1U << LG_BITMAP_GROUP_NBITS)
+#define BITMAP_GROUP_NBITS_MASK		(BITMAP_GROUP_NBITS-1)
+
+/*
+ * Do some analysis on how big the bitmap is before we use a tree.  For a brute
+ * force linear search, if we would have to call ffs_lu() more than 2^3 times,
+ * use a tree instead.
+ */
+#if LG_BITMAP_MAXBITS - LG_BITMAP_GROUP_NBITS > 3
+#  define BITMAP_USE_TREE
+#endif
+
+/* Number of groups required to store a given number of bits. */
+#define BITMAP_BITS2GROUPS(nbits)					\
+    (((nbits) + BITMAP_GROUP_NBITS_MASK) >> LG_BITMAP_GROUP_NBITS)
+
+/*
+ * Number of groups required at a particular level for a given number of bits.
+ */
+#define BITMAP_GROUPS_L0(nbits)						\
+    BITMAP_BITS2GROUPS(nbits)
+#define BITMAP_GROUPS_L1(nbits)						\
+    BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(nbits))
+#define BITMAP_GROUPS_L2(nbits)						\
+    BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits))))
+#define BITMAP_GROUPS_L3(nbits)						\
+    BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(		\
+	BITMAP_BITS2GROUPS((nbits)))))
+#define BITMAP_GROUPS_L4(nbits)						\
+    BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(		\
+	BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits))))))
+
+/*
+ * Assuming the number of levels, number of groups required for a given number
+ * of bits.
+ */
+#define BITMAP_GROUPS_1_LEVEL(nbits)					\
+    BITMAP_GROUPS_L0(nbits)
+#define BITMAP_GROUPS_2_LEVEL(nbits)					\
+    (BITMAP_GROUPS_1_LEVEL(nbits) + BITMAP_GROUPS_L1(nbits))
+#define BITMAP_GROUPS_3_LEVEL(nbits)					\
+    (BITMAP_GROUPS_2_LEVEL(nbits) + BITMAP_GROUPS_L2(nbits))
+#define BITMAP_GROUPS_4_LEVEL(nbits)					\
+    (BITMAP_GROUPS_3_LEVEL(nbits) + BITMAP_GROUPS_L3(nbits))
+#define BITMAP_GROUPS_5_LEVEL(nbits)					\
+    (BITMAP_GROUPS_4_LEVEL(nbits) + BITMAP_GROUPS_L4(nbits))
+
+/*
+ * Maximum number of groups required to support LG_BITMAP_MAXBITS.
+ */
+#ifdef BITMAP_USE_TREE
+
+#if LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS
+#  define BITMAP_GROUPS(nbits)	BITMAP_GROUPS_1_LEVEL(nbits)
+#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_1_LEVEL(BITMAP_MAXBITS)
+#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 2
+#  define BITMAP_GROUPS(nbits)	BITMAP_GROUPS_2_LEVEL(nbits)
+#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_2_LEVEL(BITMAP_MAXBITS)
+#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 3
+#  define BITMAP_GROUPS(nbits)	BITMAP_GROUPS_3_LEVEL(nbits)
+#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_3_LEVEL(BITMAP_MAXBITS)
+#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 4
+#  define BITMAP_GROUPS(nbits)	BITMAP_GROUPS_4_LEVEL(nbits)
+#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_4_LEVEL(BITMAP_MAXBITS)
+#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 5
+#  define BITMAP_GROUPS(nbits)	BITMAP_GROUPS_5_LEVEL(nbits)
+#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_5_LEVEL(BITMAP_MAXBITS)
+#else
+#  error "Unsupported bitmap size"
+#endif
+
+/*
+ * Maximum number of levels possible.  This could be statically computed based
+ * on LG_BITMAP_MAXBITS:
+ *
+ * #define BITMAP_MAX_LEVELS \
+ *     (LG_BITMAP_MAXBITS / LG_SIZEOF_BITMAP) \
+ *     + !!(LG_BITMAP_MAXBITS % LG_SIZEOF_BITMAP)
+ *
+ * However, that would not allow the generic BITMAP_INFO_INITIALIZER() macro, so
+ * instead hardcode BITMAP_MAX_LEVELS to the largest number supported by the
+ * various cascading macros.  The only additional cost this incurs is some
+ * unused trailing entries in bitmap_info_t structures; the bitmaps themselves
+ * are not impacted.
+ */
+#define BITMAP_MAX_LEVELS	5
+
+#define BITMAP_INFO_INITIALIZER(nbits) {				\
+	/* nbits. */							\
+	nbits,								\
+	/* nlevels. */							\
+	(BITMAP_GROUPS_L0(nbits) > BITMAP_GROUPS_L1(nbits)) +		\
+	    (BITMAP_GROUPS_L1(nbits) > BITMAP_GROUPS_L2(nbits)) +	\
+	    (BITMAP_GROUPS_L2(nbits) > BITMAP_GROUPS_L3(nbits)) +	\
+	    (BITMAP_GROUPS_L3(nbits) > BITMAP_GROUPS_L4(nbits)) + 1,	\
+	/* levels. */							\
+	{								\
+		{0},							\
+		{BITMAP_GROUPS_L0(nbits)},				\
+		{BITMAP_GROUPS_L1(nbits) + BITMAP_GROUPS_L0(nbits)},	\
+		{BITMAP_GROUPS_L2(nbits) + BITMAP_GROUPS_L1(nbits) +	\
+		    BITMAP_GROUPS_L0(nbits)},				\
+		{BITMAP_GROUPS_L3(nbits) + BITMAP_GROUPS_L2(nbits) +	\
+		    BITMAP_GROUPS_L1(nbits) + BITMAP_GROUPS_L0(nbits)},	\
+		{BITMAP_GROUPS_L4(nbits) + BITMAP_GROUPS_L3(nbits) +	\
+		     BITMAP_GROUPS_L2(nbits) + BITMAP_GROUPS_L1(nbits)	\
+		     + BITMAP_GROUPS_L0(nbits)}				\
+	}								\
+}
+
+#else /* BITMAP_USE_TREE */
+
+#define BITMAP_GROUPS(nbits)	BITMAP_BITS2GROUPS(nbits)
+#define BITMAP_GROUPS_MAX	BITMAP_BITS2GROUPS(BITMAP_MAXBITS)
+
+#define BITMAP_INFO_INITIALIZER(nbits) {				\
+	/* nbits. */							\
+	nbits,								\
+	/* ngroups. */							\
+	BITMAP_BITS2GROUPS(nbits)					\
+}
+
+#endif /* BITMAP_USE_TREE */
+
+typedef struct bitmap_level_s {
+	/* Offset of this level's groups within the array of groups. */
+	size_t group_offset;
+} bitmap_level_t;
+
+typedef struct bitmap_info_s {
+	/* Logical number of bits in bitmap (stored at bottom level). */
+	size_t nbits;
+
+#ifdef BITMAP_USE_TREE
+	/* Number of levels necessary for nbits. */
+	unsigned nlevels;
+
+	/*
+	 * Only the first (nlevels+1) elements are used, and levels are ordered
+	 * bottom to top (e.g. the bottom level is stored in levels[0]).
+	 */
+	bitmap_level_t levels[BITMAP_MAX_LEVELS+1];
+#else /* BITMAP_USE_TREE */
+	/* Number of groups necessary for nbits. */
+	size_t ngroups;
+#endif /* BITMAP_USE_TREE */
+} bitmap_info_t;
+
+void bitmap_info_init(bitmap_info_t *binfo, size_t nbits);
+void bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo, bool fill);
+size_t bitmap_size(const bitmap_info_t *binfo);
+
+static inline bool
+bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo) {
+#ifdef BITMAP_USE_TREE
+	size_t rgoff = binfo->levels[binfo->nlevels].group_offset - 1;
+	bitmap_t rg = bitmap[rgoff];
+	/* The bitmap is full iff the root group is 0. */
+	return (rg == 0);
+#else
+	size_t i;
+
+	for (i = 0; i < binfo->ngroups; i++) {
+		if (bitmap[i] != 0) {
+			return false;
+		}
+	}
+	return true;
+#endif
+}
+
+static inline bool
+bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) {
+	size_t goff;
+	bitmap_t g;
+
+	assert(bit < binfo->nbits);
+	goff = bit >> LG_BITMAP_GROUP_NBITS;
+	g = bitmap[goff];
+	return !(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK)));
+}
+
+static inline void
+bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) {
+	size_t goff;
+	bitmap_t *gp;
+	bitmap_t g;
+
+	assert(bit < binfo->nbits);
+	assert(!bitmap_get(bitmap, binfo, bit));
+	goff = bit >> LG_BITMAP_GROUP_NBITS;
+	gp = &bitmap[goff];
+	g = *gp;
+	assert(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK)));
+	g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK);
+	*gp = g;
+	assert(bitmap_get(bitmap, binfo, bit));
+#ifdef BITMAP_USE_TREE
+	/* Propagate group state transitions up the tree. */
+	if (g == 0) {
+		unsigned i;
+		for (i = 1; i < binfo->nlevels; i++) {
+			bit = goff;
+			goff = bit >> LG_BITMAP_GROUP_NBITS;
+			gp = &bitmap[binfo->levels[i].group_offset + goff];
+			g = *gp;
+			assert(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK)));
+			g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK);
+			*gp = g;
+			if (g != 0) {
+				break;
+			}
+		}
+	}
+#endif
+}
+
+/* ffu: find first unset >= bit. */
+static inline size_t
+bitmap_ffu(const bitmap_t *bitmap, const bitmap_info_t *binfo, size_t min_bit) {
+	assert(min_bit < binfo->nbits);
+
+#ifdef BITMAP_USE_TREE
+	size_t bit = 0;
+	for (unsigned level = binfo->nlevels; level--;) {
+		size_t lg_bits_per_group = (LG_BITMAP_GROUP_NBITS * (level +
+		    1));
+		bitmap_t group = bitmap[binfo->levels[level].group_offset + (bit
+		    >> lg_bits_per_group)];
+		unsigned group_nmask = (unsigned)(((min_bit > bit) ? (min_bit -
+		    bit) : 0) >> (lg_bits_per_group - LG_BITMAP_GROUP_NBITS));
+		assert(group_nmask <= BITMAP_GROUP_NBITS);
+		bitmap_t group_mask = ~((1LU << group_nmask) - 1);
+		bitmap_t group_masked = group & group_mask;
+		if (group_masked == 0LU) {
+			if (group == 0LU) {
+				return binfo->nbits;
+			}
+			/*
+			 * min_bit was preceded by one or more unset bits in
+			 * this group, but there are no other unset bits in this
+			 * group.  Try again starting at the first bit of the
+			 * next sibling.  This will recurse at most once per
+			 * non-root level.
+			 */
+			size_t sib_base = bit + (ZU(1) << lg_bits_per_group);
+			assert(sib_base > min_bit);
+			assert(sib_base > bit);
+			if (sib_base >= binfo->nbits) {
+				return binfo->nbits;
+			}
+			return bitmap_ffu(bitmap, binfo, sib_base);
+		}
+		bit += ((size_t)ffs_lu(group_masked)) <<
+		    (lg_bits_per_group - LG_BITMAP_GROUP_NBITS);
+	}
+	assert(bit >= min_bit);
+	assert(bit < binfo->nbits);
+	return bit;
+#else
+	size_t i = min_bit >> LG_BITMAP_GROUP_NBITS;
+	bitmap_t g = bitmap[i] & ~((1LU << (min_bit & BITMAP_GROUP_NBITS_MASK))
+	    - 1);
+	size_t bit;
+	do {
+		if (g != 0) {
+			bit = ffs_lu(g);
+			return (i << LG_BITMAP_GROUP_NBITS) + bit;
+		}
+		i++;
+		g = bitmap[i];
+	} while (i < binfo->ngroups);
+	return binfo->nbits;
+#endif
+}
+
+/* sfu: set first unset. */
+static inline size_t
+bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo) {
+	size_t bit;
+	bitmap_t g;
+	unsigned i;
+
+	assert(!bitmap_full(bitmap, binfo));
+
+#ifdef BITMAP_USE_TREE
+	i = binfo->nlevels - 1;
+	g = bitmap[binfo->levels[i].group_offset];
+	bit = ffs_lu(g);
+	while (i > 0) {
+		i--;
+		g = bitmap[binfo->levels[i].group_offset + bit];
+		bit = (bit << LG_BITMAP_GROUP_NBITS) + ffs_lu(g);
+	}
+#else
+	i = 0;
+	g = bitmap[0];
+	while (g == 0) {
+		i++;
+		g = bitmap[i];
+	}
+	bit = (i << LG_BITMAP_GROUP_NBITS) + ffs_lu(g);
+#endif
+	bitmap_set(bitmap, binfo, bit);
+	return bit;
+}
+
+static inline void
+bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) {
+	size_t goff;
+	bitmap_t *gp;
+	bitmap_t g;
+	UNUSED bool propagate;
+
+	assert(bit < binfo->nbits);
+	assert(bitmap_get(bitmap, binfo, bit));
+	goff = bit >> LG_BITMAP_GROUP_NBITS;
+	gp = &bitmap[goff];
+	g = *gp;
+	propagate = (g == 0);
+	assert((g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK))) == 0);
+	g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK);
+	*gp = g;
+	assert(!bitmap_get(bitmap, binfo, bit));
+#ifdef BITMAP_USE_TREE
+	/* Propagate group state transitions up the tree. */
+	if (propagate) {
+		unsigned i;
+		for (i = 1; i < binfo->nlevels; i++) {
+			bit = goff;
+			goff = bit >> LG_BITMAP_GROUP_NBITS;
+			gp = &bitmap[binfo->levels[i].group_offset + goff];
+			g = *gp;
+			propagate = (g == 0);
+			assert((g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK)))
+			    == 0);
+			g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK);
+			*gp = g;
+			if (!propagate) {
+				break;
+			}
+		}
+	}
+#endif /* BITMAP_USE_TREE */
+}
+
+#endif /* JEMALLOC_INTERNAL_BITMAP_H */
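A worked example of the group arithmetic, assuming 64-bit bitmap_t groups (i.e. LG_BITMAP_GROUP_NBITS == 6): a 512-bit bitmap needs 8 leaf groups, and in the tree representation a single summary group sits above them.

#include <assert.h>
#include <stddef.h>

/* Same computation as BITMAP_BITS2GROUPS, assuming 64-bit groups. */
#define GROUP_NBITS 64
#define BITS2GROUPS(nbits) (((nbits) + GROUP_NBITS - 1) / GROUP_NBITS)

int main(void) {
	size_t nbits = 512;		/* e.g. a slab with 512 regions */
	size_t l0 = BITS2GROUPS(nbits);	/* leaf groups holding the bits */
	size_t l1 = BITS2GROUPS(l0);	/* summary group above the leaves */
	assert(l0 == 8 && l1 == 1);
	assert(l0 + l1 == 9);		/* two levels, nine groups total */
	return 0;
}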

+ 32 - 0
BeefRT/JEMalloc/include/jemalloc/internal/buf_writer.h

@@ -0,0 +1,32 @@
+#ifndef JEMALLOC_INTERNAL_BUF_WRITER_H
+#define JEMALLOC_INTERNAL_BUF_WRITER_H
+
+/*
+ * Note: when using the buffered writer, cbopaque is passed to write_cb only
+ * when the buffer is flushed.  This would only matter if cbopaque points to
+ * something that changes between write_cb calls, or something that affects
+ * write_cb in a way that depends on the content of the output string.
+ * However, the most typical use case in practice is that cbopaque points to
+ * some "option-like" state for the write_cb, so it doesn't matter.
+ */
+
+typedef struct {
+	write_cb_t *write_cb;
+	void *cbopaque;
+	char *buf;
+	size_t buf_size;
+	size_t buf_end;
+	bool internal_buf;
+} buf_writer_t;
+
+bool buf_writer_init(tsdn_t *tsdn, buf_writer_t *buf_writer,
+    write_cb_t *write_cb, void *cbopaque, char *buf, size_t buf_len);
+void buf_writer_flush(buf_writer_t *buf_writer);
+write_cb_t buf_writer_cb;
+void buf_writer_terminate(tsdn_t *tsdn, buf_writer_t *buf_writer);
+
+typedef ssize_t (read_cb_t)(void *read_cbopaque, void *buf, size_t limit);
+void buf_writer_pipe(buf_writer_t *buf_writer, read_cb_t *read_cb,
+    void *read_cbopaque);
+
+#endif /* JEMALLOC_INTERNAL_BUF_WRITER_H */
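The note above is easiest to see with a toy buffered writer: the callback (and hence cbopaque) only runs when the buffer flushes, so only the state of cbopaque at flush time matters. This is a standalone illustration, not jemalloc's buf_writer implementation, and it assumes each write is shorter than the buffer:

#include <assert.h>
#include <string.h>

typedef void (write_fn)(void *cbopaque, const char *s);

struct toy_buf_writer {
	write_fn *write_cb;
	void *cbopaque;
	char buf[64];
	size_t end;
};

static void
toy_flush(struct toy_buf_writer *w) {
	w->buf[w->end] = '\0';
	w->write_cb(w->cbopaque, w->buf);	/* cbopaque handed over here */
	w->end = 0;
}

static void
toy_write(struct toy_buf_writer *w, const char *s) {
	size_t len = strlen(s);
	if (w->end + len >= sizeof(w->buf)) {
		toy_flush(w);
	}
	memcpy(w->buf + w->end, s, len);
	w->end += len;
}

static void
count_cb(void *cbopaque, const char *s) {
	(void)s;
	(*(int *)cbopaque)++;	/* counts flushes, not individual writes */
}

int main(void) {
	int flushes = 0;
	struct toy_buf_writer w = { count_cb, &flushes, {0}, 0 };
	toy_write(&w, "hello ");
	toy_write(&w, "world");
	assert(flushes == 0);	/* nothing has reached the callback yet */
	toy_flush(&w);
	assert(flushes == 1);	/* cbopaque observed only at flush time */
	return 0;
}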

+ 670 - 0
BeefRT/JEMalloc/include/jemalloc/internal/cache_bin.h

@@ -0,0 +1,670 @@
+#ifndef JEMALLOC_INTERNAL_CACHE_BIN_H
+#define JEMALLOC_INTERNAL_CACHE_BIN_H
+
+#include "jemalloc/internal/ql.h"
+#include "jemalloc/internal/sz.h"
+
+/*
+ * The cache_bins are the mechanism that the tcache and the arena use to
+ * communicate.  The tcache fills from and flushes to the arena by passing a
+ * cache_bin_t to fill/flush.  When the arena needs to pull stats from the
+ * tcaches associated with it, it does so by iterating over its
+ * cache_bin_array_descriptor_t objects and reading out the per-bin stats of
+ * the bins.  This makes it so that the arena need not know about the existence
+ * of the tcache at all.
+ */
+
+/*
+ * The size in bytes of each cache bin stack.  We also use this to indicate
+ * *counts* of individual objects.
+ */
+typedef uint16_t cache_bin_sz_t;
+
+/*
+ * Leave a noticeable mark pattern on the cache bin stack boundaries, in case a
+ * bug starts leaking those.  Make it look like the junk pattern but be distinct
+ * from it.
+ */
+static const uintptr_t cache_bin_preceding_junk =
+    (uintptr_t)0x7a7a7a7a7a7a7a7aULL;
+/* Note: a7 vs. 7a above -- this tells you which pointer leaked. */
+static const uintptr_t cache_bin_trailing_junk =
+    (uintptr_t)0xa7a7a7a7a7a7a7a7ULL;
+
+/*
+ * That implies the following value, for the maximum number of items in any
+ * individual bin.  The cache bins track their bounds looking just at the low
+ * bits of a pointer, compared against a cache_bin_sz_t.  So that's
+ *   1 << (sizeof(cache_bin_sz_t) * 8)
+ * bytes spread across pointer sized objects to get the maximum.
+ */
+#define CACHE_BIN_NCACHED_MAX (((size_t)1 << sizeof(cache_bin_sz_t) * 8) \
+    / sizeof(void *) - 1)
+
+/*
+ * This lives inside the cache_bin (for locality reasons), and is initialized
+ * alongside it, but is otherwise not modified by any cache bin operations.
+ * It's logically public and maintained by its callers.
+ */
+typedef struct cache_bin_stats_s cache_bin_stats_t;
+struct cache_bin_stats_s {
+	/*
+	 * Number of allocation requests that corresponded to the size of this
+	 * bin.
+	 */
+	uint64_t nrequests;
+};
+
+/*
+ * Read-only information associated with each element of tcache_t's tbins array
+ * is stored separately, mainly to reduce memory usage.
+ */
+typedef struct cache_bin_info_s cache_bin_info_t;
+struct cache_bin_info_s {
+	cache_bin_sz_t ncached_max;
+};
+
+/*
+ * Responsible for caching allocations associated with a single size.
+ *
+ * Several pointers are used to track the stack.  To save on metadata bytes,
+ * only the stack_head is a full sized pointer (which is dereferenced on the
+ * fastpath), while the others store only the low 16 bits -- this is correct
+ * because a single stack never takes more space than 2^16 bytes, and at the
+ * same time only equality checks are performed on the low bits.
+ *
+ * (low addr)                                                  (high addr)
+ * |------stashed------|------available------|------cached-----|
+ * ^                   ^                     ^                 ^
+ * low_bound(derived)  low_bits_full         stack_head        low_bits_empty
+ */
+typedef struct cache_bin_s cache_bin_t;
+struct cache_bin_s {
+	/*
+	 * The stack grows down.  Whenever the bin is nonempty, the head points
+	 * to an array entry containing a valid allocation.  When it is empty,
+	 * the head points to one element past the owned array.
+	 */
+	void **stack_head;
+	/*
+	 * stack_head and tstats are both modified frequently.  Keep them
+	 * close so that they have a higher chance of being on the same
+	 * cacheline, thus fewer write-backs.
+	 */
+	cache_bin_stats_t tstats;
+
+	/*
+	 * The low bits of the address of the first item in the stack that
+	 * hasn't been used since the last GC, to track the low water mark (min
+	 * # of cached items).
+	 *
+	 * Since the stack grows down, this is a higher address than
+	 * low_bits_full.
+	 */
+	uint16_t low_bits_low_water;
+
+	/*
+	 * The low bits of the value that stack_head will take on when the array
+	 * is full (of cached & stashed items).  But remember that stack_head
+	 * always points to a valid item when the array is nonempty -- this is
+	 * in the array.
+	 *
+	 * Recall that since the stack grows down, this is the lowest available
+	 * address in the array for caching.  Only adjusted when stashing items.
+	 */
+	uint16_t low_bits_full;
+
+	/*
+	 * The low bits of the value that stack_head will take on when the array
+	 * is empty.
+	 *
+	 * The stack grows down -- this is one past the highest address in the
+	 * array.  Immutable after initialization.
+	 */
+	uint16_t low_bits_empty;
+};
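To make the low-16-bit bookkeeping above concrete: the cached count is the uint16_t difference between low_bits_empty and the low bits of stack_head, divided by the pointer size, and unsigned wraparound is what keeps the subtraction correct if the stack happens to straddle a 64 KiB boundary. The values below are hypothetical, purely for illustration:

#include <assert.h>
#include <stdint.h>

int main(void) {
	/* Hypothetical low 16 bits of the empty position and of stack_head. */
	uint16_t low_bits_empty = 0x0010;
	uint16_t head_low_bits = 0xfff0;	/* just below a 64 KiB boundary */
	/* uint16_t subtraction wraps, so the byte difference is still 0x20. */
	uint16_t diff = (uint16_t)(low_bits_empty - head_low_bits);
	assert(diff == 0x20);
	assert(diff / 8 == 4);	/* four cached items with 8-byte pointers */
	return 0;
}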
+
+/*
+ * The cache_bins live inside the tcache, but the arena (by design) isn't
+ * supposed to know much about tcache internals.  To let the arena iterate over
+ * associated bins, we keep (with the tcache) a linked list of
+ * cache_bin_array_descriptor_ts that tell the arena how to find the bins.
+ */
+typedef struct cache_bin_array_descriptor_s cache_bin_array_descriptor_t;
+struct cache_bin_array_descriptor_s {
+	/*
+	 * The arena keeps a list of the cache bins associated with it, for
+	 * stats collection.
+	 */
+	ql_elm(cache_bin_array_descriptor_t) link;
+	/* Pointers to the tcache bins. */
+	cache_bin_t *bins;
+};
+
+static inline void
+cache_bin_array_descriptor_init(cache_bin_array_descriptor_t *descriptor,
+    cache_bin_t *bins) {
+	ql_elm_new(descriptor, link);
+	descriptor->bins = bins;
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+cache_bin_nonfast_aligned(const void *ptr) {
+	if (!config_uaf_detection) {
+		return false;
+	}
+	/*
+	 * Currently we use alignment to decide which pointer to junk & stash on
+	 * dealloc (for catching use-after-free).  In some common cases a
+	 * page-aligned check is needed already (sdalloc w/ config_prof), so we
+	 * are getting it more or less for free -- no added instructions on
+	 * free_fastpath.
+	 *
+	 * Another way of deciding which pointer to sample, is adding another
+	 * thread_event to pick one every N bytes.  That also adds no cost on
+	 * the fastpath, however it will tend to pick large allocations which is
+	 * not the desired behavior.
+	 */
+	return ((uintptr_t)ptr & san_cache_bin_nonfast_mask) == 0;
+}
+
+/* Returns ncached_max: Upper limit on ncached. */
+static inline cache_bin_sz_t
+cache_bin_info_ncached_max(cache_bin_info_t *info) {
+	return info->ncached_max;
+}
+
+/*
+ * Internal.
+ *
+ * Asserts that the pointer associated with earlier is <= the one associated
+ * with later.
+ */
+static inline void
+cache_bin_assert_earlier(cache_bin_t *bin, uint16_t earlier, uint16_t later) {
+	if (earlier > later) {
+		assert(bin->low_bits_full > bin->low_bits_empty);
+	}
+}
+
+/*
+ * Internal.
+ *
+ * Does difference calculations that handle wraparound correctly.  Earlier must
+ * be associated with the position earlier in memory.
+ */
+static inline uint16_t
+cache_bin_diff(cache_bin_t *bin, uint16_t earlier, uint16_t later, bool racy) {
+	/*
+	 * When it's racy, bin->low_bits_full can be modified concurrently. It
+	 * can cross the uint16_t max value and become less than
+	 * bin->low_bits_empty at the time of the check.
+	 */
+	if (!racy) {
+		cache_bin_assert_earlier(bin, earlier, later);
+	}
+	return later - earlier;
+}
+
+/*
+ * Number of items currently cached in the bin, without checking ncached_max.
+ * We require specifying whether or not the request is racy or not (i.e. whether
+ * or not concurrent modifications are possible).
+ */
+static inline cache_bin_sz_t
+cache_bin_ncached_get_internal(cache_bin_t *bin, bool racy) {
+	cache_bin_sz_t diff = cache_bin_diff(bin,
+	    (uint16_t)(uintptr_t)bin->stack_head, bin->low_bits_empty, racy);
+	cache_bin_sz_t n = diff / sizeof(void *);
+	/*
+	 * We have undefined behavior here; if this function is called from the
+	 * arena stats updating code, then stack_head could change from the
+	 * first line to the next one.  Morally, these loads should be atomic,
+	 * but compilers won't currently generate comparisons with in-memory
+	 * operands against atomics, and these variables get accessed on the
+	 * fast paths.  This should still be "safe" in the sense of generating
+	 * the correct assembly for the foreseeable future, though.
+	 */
+	assert(n == 0 || *(bin->stack_head) != NULL || racy);
+	return n;
+}
+
+/*
+ * Number of items currently cached in the bin, with checking ncached_max.  The
+ * caller must know that no concurrent modification of the cache_bin is
+ * possible.
+ */
+static inline cache_bin_sz_t
+cache_bin_ncached_get_local(cache_bin_t *bin, cache_bin_info_t *info) {
+	cache_bin_sz_t n = cache_bin_ncached_get_internal(bin,
+	    /* racy */ false);
+	assert(n <= cache_bin_info_ncached_max(info));
+	return n;
+}
+
+/*
+ * Internal.
+ *
+ * A pointer to the position one past the end of the backing array.
+ *
+ * Do not call if racy, because both 'bin->stack_head' and 'bin->low_bits_full'
+ * are subject to concurrent modifications.
+ */
+static inline void **
+cache_bin_empty_position_get(cache_bin_t *bin) {
+	cache_bin_sz_t diff = cache_bin_diff(bin,
+	    (uint16_t)(uintptr_t)bin->stack_head, bin->low_bits_empty,
+	    /* racy */ false);
+	uintptr_t empty_bits = (uintptr_t)bin->stack_head + diff;
+	void **ret = (void **)empty_bits;
+
+	assert(ret >= bin->stack_head);
+
+	return ret;
+}
+
+/*
+ * Internal.
+ *
+ * Calculates low bits of the lower bound of the usable cache bin's range (see
+ * cache_bin_t visual representation above).
+ *
+ * No values are concurrently modified, so should be safe to read in a
+ * multithreaded environment. Currently concurrent access happens only during
+ * arena statistics collection.
+ */
+static inline uint16_t
+cache_bin_low_bits_low_bound_get(cache_bin_t *bin, cache_bin_info_t *info) {
+	return (uint16_t)bin->low_bits_empty -
+	    info->ncached_max * sizeof(void *);
+}
+
+/*
+ * Internal.
+ *
+ * A pointer to the position with the lowest address of the backing array.
+ */
+static inline void **
+cache_bin_low_bound_get(cache_bin_t *bin, cache_bin_info_t *info) {
+	cache_bin_sz_t ncached_max = cache_bin_info_ncached_max(info);
+	void **ret = cache_bin_empty_position_get(bin) - ncached_max;
+	assert(ret <= bin->stack_head);
+
+	return ret;
+}
+
+/*
+ * As the name implies.  This is important since it's not correct to try to
+ * batch fill a nonempty cache bin.
+ */
+static inline void
+cache_bin_assert_empty(cache_bin_t *bin, cache_bin_info_t *info) {
+	assert(cache_bin_ncached_get_local(bin, info) == 0);
+	assert(cache_bin_empty_position_get(bin) == bin->stack_head);
+}
+
+/*
+ * Get low water, but without any of the correctness checking we do for the
+ * caller-usable version, for use when we are temporarily breaking invariants
+ * (like ncached >= low_water during flush).
+ */
+static inline cache_bin_sz_t
+cache_bin_low_water_get_internal(cache_bin_t *bin) {
+	return cache_bin_diff(bin, bin->low_bits_low_water,
+	    bin->low_bits_empty, /* racy */ false) / sizeof(void *);
+}
+
+/* Returns the numeric value of low water in [0, ncached]. */
+static inline cache_bin_sz_t
+cache_bin_low_water_get(cache_bin_t *bin, cache_bin_info_t *info) {
+	cache_bin_sz_t low_water = cache_bin_low_water_get_internal(bin);
+	assert(low_water <= cache_bin_info_ncached_max(info));
+	assert(low_water <= cache_bin_ncached_get_local(bin, info));
+
+	cache_bin_assert_earlier(bin, (uint16_t)(uintptr_t)bin->stack_head,
+	    bin->low_bits_low_water);
+
+	return low_water;
+}
+
+/*
+ * Indicates that the current cache bin position should be the low water mark
+ * going forward.
+ */
+static inline void
+cache_bin_low_water_set(cache_bin_t *bin) {
+	bin->low_bits_low_water = (uint16_t)(uintptr_t)bin->stack_head;
+}
+
+static inline void
+cache_bin_low_water_adjust(cache_bin_t *bin) {
+	if (cache_bin_ncached_get_internal(bin, /* racy */ false)
+	    < cache_bin_low_water_get_internal(bin)) {
+		cache_bin_low_water_set(bin);
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+cache_bin_alloc_impl(cache_bin_t *bin, bool *success, bool adjust_low_water) {
+	/*
+	 * success (instead of ret) should be checked upon the return of this
+	 * function.  We avoid checking (ret == NULL) because there is never a
+	 * null stored on the avail stack (which is unknown to the compiler),
+	 * and eagerly checking ret would cause a pipeline stall (waiting for the
+	 * cacheline).
+	 */
+
+	/*
+	 * This may read from the empty position; however the loaded value won't
+	 * be used.  It's safe because the stack has one more slot reserved.
+	 */
+	void *ret = *bin->stack_head;
+	uint16_t low_bits = (uint16_t)(uintptr_t)bin->stack_head;
+	void **new_head = bin->stack_head + 1;
+
+	/*
+	 * Note that the low water mark is at most empty; if we pass this check,
+	 * we know we're non-empty.
+	 */
+	if (likely(low_bits != bin->low_bits_low_water)) {
+		bin->stack_head = new_head;
+		*success = true;
+		return ret;
+	}
+	if (!adjust_low_water) {
+		*success = false;
+		return NULL;
+	}
+	/*
+	 * In the fast-path case where we call alloc_easy and then alloc, the
+	 * previous checking and computation is optimized away -- we didn't
+	 * actually commit any of our operations.
+	 */
+	if (likely(low_bits != bin->low_bits_empty)) {
+		bin->stack_head = new_head;
+		bin->low_bits_low_water = (uint16_t)(uintptr_t)new_head;
+		*success = true;
+		return ret;
+	}
+	*success = false;
+	return NULL;
+}
+
+/*
+ * Allocate an item out of the bin, failing if we're at the low-water mark.
+ */
+JEMALLOC_ALWAYS_INLINE void *
+cache_bin_alloc_easy(cache_bin_t *bin, bool *success) {
+	/* We don't look at info if we're not adjusting low-water. */
+	return cache_bin_alloc_impl(bin, success, false);
+}
+
+/*
+ * Allocate an item out of the bin, even if we're currently at the low-water
+ * mark (and failing only if the bin is empty).
+ */
+JEMALLOC_ALWAYS_INLINE void *
+cache_bin_alloc(cache_bin_t *bin, bool *success) {
+	return cache_bin_alloc_impl(bin, success, true);
+}
+
+JEMALLOC_ALWAYS_INLINE cache_bin_sz_t
+cache_bin_alloc_batch(cache_bin_t *bin, size_t num, void **out) {
+	cache_bin_sz_t n = cache_bin_ncached_get_internal(bin,
+	    /* racy */ false);
+	if (n > num) {
+		n = (cache_bin_sz_t)num;
+	}
+	memcpy(out, bin->stack_head, n * sizeof(void *));
+	bin->stack_head += n;
+	cache_bin_low_water_adjust(bin);
+
+	return n;
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+cache_bin_full(cache_bin_t *bin) {
+	return ((uint16_t)(uintptr_t)bin->stack_head == bin->low_bits_full);
+}
+
+/*
+ * Free an object into the given bin.  Fails only if the bin is full.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+cache_bin_dalloc_easy(cache_bin_t *bin, void *ptr) {
+	if (unlikely(cache_bin_full(bin))) {
+		return false;
+	}
+
+	bin->stack_head--;
+	*bin->stack_head = ptr;
+	cache_bin_assert_earlier(bin, bin->low_bits_full,
+	    (uint16_t)(uintptr_t)bin->stack_head);
+
+	return true;
+}
+
+/* Returns false if failed to stash (i.e. bin is full). */
+JEMALLOC_ALWAYS_INLINE bool
+cache_bin_stash(cache_bin_t *bin, void *ptr) {
+	if (cache_bin_full(bin)) {
+		return false;
+	}
+
+	/* Stash at the full position, in the [full, head) range. */
+	uint16_t low_bits_head = (uint16_t)(uintptr_t)bin->stack_head;
+	/* Wraparound handled as well. */
+	uint16_t diff = cache_bin_diff(bin, bin->low_bits_full, low_bits_head,
+	    /* racy */ false);
+	*(void **)((uintptr_t)bin->stack_head - diff) = ptr;
+
+	assert(!cache_bin_full(bin));
+	bin->low_bits_full += sizeof(void *);
+	cache_bin_assert_earlier(bin, bin->low_bits_full, low_bits_head);
+
+	return true;
+}
+
+/*
+ * Get the number of stashed pointers.
+ *
+ * When called from a thread not owning the TLS (i.e. racy = true), it's
+ * important to keep in mind that 'bin->stack_head' and 'bin->low_bits_full' can
+ * be modified concurrently and almost no assertions about their values can be
+ * made.
+ */
+JEMALLOC_ALWAYS_INLINE cache_bin_sz_t
+cache_bin_nstashed_get_internal(cache_bin_t *bin, cache_bin_info_t *info,
+    bool racy) {
+	cache_bin_sz_t ncached_max = cache_bin_info_ncached_max(info);
+	uint16_t low_bits_low_bound = cache_bin_low_bits_low_bound_get(bin,
+	    info);
+
+	cache_bin_sz_t n = cache_bin_diff(bin, low_bits_low_bound,
+	    bin->low_bits_full, racy) / sizeof(void *);
+	assert(n <= ncached_max);
+
+	if (!racy) {
+		/* Below are for assertions only. */
+		void **low_bound = cache_bin_low_bound_get(bin, info);
+
+		assert((uint16_t)(uintptr_t)low_bound == low_bits_low_bound);
+		void *stashed = *(low_bound + n - 1);
+		bool aligned = cache_bin_nonfast_aligned(stashed);
+#ifdef JEMALLOC_JET
+		/* Allow arbitrary pointers to be stashed in tests. */
+		aligned = true;
+#endif
+		assert(n == 0 || (stashed != NULL && aligned));
+	}
+
+	return n;
+}
+
+JEMALLOC_ALWAYS_INLINE cache_bin_sz_t
+cache_bin_nstashed_get_local(cache_bin_t *bin, cache_bin_info_t *info) {
+	cache_bin_sz_t n = cache_bin_nstashed_get_internal(bin, info,
+	    /* racy */ false);
+	assert(n <= cache_bin_info_ncached_max(info));
+	return n;
+}
+
+/*
+ * Obtain a racy view of the number of items currently in the cache bin, in the
+ * presence of possible concurrent modifications.
+ */
+static inline void
+cache_bin_nitems_get_remote(cache_bin_t *bin, cache_bin_info_t *info,
+    cache_bin_sz_t *ncached, cache_bin_sz_t *nstashed) {
+	cache_bin_sz_t n = cache_bin_ncached_get_internal(bin, /* racy */ true);
+	assert(n <= cache_bin_info_ncached_max(info));
+	*ncached = n;
+
+	n = cache_bin_nstashed_get_internal(bin, info, /* racy */ true);
+	assert(n <= cache_bin_info_ncached_max(info));
+	*nstashed = n;
+	/* Note that cannot assert ncached + nstashed <= ncached_max (racy). */
+}
+
+/*
+ * Filling and flushing are done in batch, on arrays of void *s.  For filling,
+ * the arrays go forward, and can be accessed with ordinary array arithmetic.
+ * For flushing, we work from the end backwards, and so need to use special
+ * accessors that invert the usual ordering.
+ *
+ * This is important for maintaining first-fit; the arena code fills with
+ * earliest objects first, and so those are the ones we should return first for
+ * cache_bin_alloc calls.  When flushing, we should flush the objects that we
+ * wish to return later; those at the end of the array.  This is better for the
+ * first-fit heuristic as well as for cache locality; the most recently freed
+ * objects are the ones most likely to still be in cache.
+ *
+ * This all sounds very hand-wavey and theoretical, but reverting the ordering
+ * on one or the other pathway leads to measurable slowdowns.
+ */
+
+typedef struct cache_bin_ptr_array_s cache_bin_ptr_array_t;
+struct cache_bin_ptr_array_s {
+	cache_bin_sz_t n;
+	void **ptr;
+};
+
+/*
+ * Declare a cache_bin_ptr_array_t sufficient for nval items.
+ *
+ * In the current implementation, this could be just part of a
+ * cache_bin_ptr_array_init_... call, since we reuse the cache bin stack memory.
+ * Indirecting behind a macro, though, means experimenting with linked-list
+ * representations is easy (since they'll require an alloca in the calling
+ * frame).
+ */
+#define CACHE_BIN_PTR_ARRAY_DECLARE(name, nval)				\
+    cache_bin_ptr_array_t name;						\
+    name.n = (nval)
+
+/*
+ * Start a fill.  The bin must be empty, and this must be followed by a
+ * finish_fill call before doing any alloc/dalloc operations on the bin.
+ */
+static inline void
+cache_bin_init_ptr_array_for_fill(cache_bin_t *bin, cache_bin_info_t *info,
+    cache_bin_ptr_array_t *arr, cache_bin_sz_t nfill) {
+	cache_bin_assert_empty(bin, info);
+	arr->ptr = cache_bin_empty_position_get(bin) - nfill;
+}
+
+/*
+ * While nfill in cache_bin_init_ptr_array_for_fill is the number we *intend* to
+ * fill, nfilled here is the number we actually filled (which may be less, in
+ * case of OOM).
+ */
+static inline void
+cache_bin_finish_fill(cache_bin_t *bin, cache_bin_info_t *info,
+    cache_bin_ptr_array_t *arr, cache_bin_sz_t nfilled) {
+	cache_bin_assert_empty(bin, info);
+	void **empty_position = cache_bin_empty_position_get(bin);
+	if (nfilled < arr->n) {
+		memmove(empty_position - nfilled, empty_position - arr->n,
+		    nfilled * sizeof(void *));
+	}
+	bin->stack_head = empty_position - nfilled;
+}
+
+/*
+ * Same deal, but with flush.  Unlike fill (which can fail), the user must flush
+ * everything we give them.
+ */
+static inline void
+cache_bin_init_ptr_array_for_flush(cache_bin_t *bin, cache_bin_info_t *info,
+    cache_bin_ptr_array_t *arr, cache_bin_sz_t nflush) {
+	arr->ptr = cache_bin_empty_position_get(bin) - nflush;
+	assert(cache_bin_ncached_get_local(bin, info) == 0
+	    || *arr->ptr != NULL);
+}
+
+static inline void
+cache_bin_finish_flush(cache_bin_t *bin, cache_bin_info_t *info,
+    cache_bin_ptr_array_t *arr, cache_bin_sz_t nflushed) {
+	unsigned rem = cache_bin_ncached_get_local(bin, info) - nflushed;
+	memmove(bin->stack_head + nflushed, bin->stack_head,
+	    rem * sizeof(void *));
+	bin->stack_head = bin->stack_head + nflushed;
+	cache_bin_low_water_adjust(bin);
+}
+
+static inline void
+cache_bin_init_ptr_array_for_stashed(cache_bin_t *bin, szind_t binind,
+    cache_bin_info_t *info, cache_bin_ptr_array_t *arr,
+    cache_bin_sz_t nstashed) {
+	assert(nstashed > 0);
+	assert(cache_bin_nstashed_get_local(bin, info) == nstashed);
+
+	void **low_bound = cache_bin_low_bound_get(bin, info);
+	arr->ptr = low_bound;
+	assert(*arr->ptr != NULL);
+}
+
+static inline void
+cache_bin_finish_flush_stashed(cache_bin_t *bin, cache_bin_info_t *info) {
+	void **low_bound = cache_bin_low_bound_get(bin, info);
+
+	/* Reset the bin local full position. */
+	bin->low_bits_full = (uint16_t)(uintptr_t)low_bound;
+	assert(cache_bin_nstashed_get_local(bin, info) == 0);
+}
+
+/*
+ * Initialize a cache_bin_info to represent up to the given number of items in
+ * the cache_bins it is associated with.
+ */
+void cache_bin_info_init(cache_bin_info_t *bin_info,
+    cache_bin_sz_t ncached_max);
+/*
+ * Given an array of initialized cache_bin_info_ts, determine how big an
+ * allocation is required to initialize a full set of cache_bin_ts.
+ */
+void cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos,
+    size_t *size, size_t *alignment);
+
+/*
+ * Actually initialize some cache bins.  Callers should allocate the backing
+ * memory indicated by a call to cache_bin_compute_alloc.  They should then
+ * preincrement, call init once for each bin and info, and then call
+ * cache_bin_postincrement.  *alloc_cur will then point immediately past the end
+ * of the allocation.
+ */
+void cache_bin_preincrement(cache_bin_info_t *infos, szind_t ninfos,
+    void *alloc, size_t *cur_offset);
+void cache_bin_postincrement(cache_bin_info_t *infos, szind_t ninfos,
+    void *alloc, size_t *cur_offset);
+void cache_bin_init(cache_bin_t *bin, cache_bin_info_t *info, void *alloc,
+    size_t *cur_offset);
+
+/*
+ * If a cache bin was zero initialized (either because it lives in static or
+ * thread-local storage, or was memset to 0), this function indicates whether or
+ * not cache_bin_init was called on it.
+ */
+bool cache_bin_still_zero_initialized(cache_bin_t *bin);
+
+#endif /* JEMALLOC_INTERNAL_CACHE_BIN_H */
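A minimal sketch of how a caller might drive the batch-fill API declared above; fetch_regions_from_arena() is a hypothetical helper standing in for the arena-side fill logic and is not part of this commit:

/* Hedged sketch; fetch_regions_from_arena() is illustrative only. */
static void
example_cache_bin_fill(cache_bin_t *bin, cache_bin_info_t *info,
    cache_bin_sz_t nfill) {
	CACHE_BIN_PTR_ARRAY_DECLARE(arr, nfill);
	cache_bin_init_ptr_array_for_fill(bin, info, &arr, nfill);
	/* Write up to nfill pointers into arr.ptr[0 .. nfill). */
	cache_bin_sz_t nfilled = fetch_regions_from_arena(arr.ptr, nfill);
	/* nfilled may be < nfill on OOM; finish_fill compacts the stack. */
	cache_bin_finish_fill(bin, info, &arr, nfilled);
}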

+ 101 - 0
BeefRT/JEMalloc/include/jemalloc/internal/ckh.h

@@ -0,0 +1,101 @@
+#ifndef JEMALLOC_INTERNAL_CKH_H
+#define JEMALLOC_INTERNAL_CKH_H
+
+#include "jemalloc/internal/tsd.h"
+
+/* Cuckoo hashing implementation.  Skip to the end for the interface. */
+
+/******************************************************************************/
+/* INTERNAL DEFINITIONS -- IGNORE */
+/******************************************************************************/
+
+/* Maintain counters used to get an idea of performance. */
+/* #define CKH_COUNT */
+/* Print counter values in ckh_delete() (requires CKH_COUNT). */
+/* #define CKH_VERBOSE */
+
+/*
+ * There are 2^LG_CKH_BUCKET_CELLS cells in each hash table bucket.  Try to fit
+ * one bucket per L1 cache line.
+ */
+#define LG_CKH_BUCKET_CELLS (LG_CACHELINE - LG_SIZEOF_PTR - 1)
+
+/* Typedefs to allow easy function pointer passing. */
+typedef void ckh_hash_t (const void *, size_t[2]);
+typedef bool ckh_keycomp_t (const void *, const void *);
+
+/* Hash table cell. */
+typedef struct {
+	const void *key;
+	const void *data;
+} ckhc_t;
+
+/* The hash table itself. */
+typedef struct {
+#ifdef CKH_COUNT
+	/* Counters used to get an idea of performance. */
+	uint64_t ngrows;
+	uint64_t nshrinks;
+	uint64_t nshrinkfails;
+	uint64_t ninserts;
+	uint64_t nrelocs;
+#endif
+
+	/* Used for pseudo-random number generation. */
+	uint64_t prng_state;
+
+	/* Total number of items. */
+	size_t count;
+
+	/*
+	 * Minimum and current number of hash table buckets.  There are
+	 * 2^LG_CKH_BUCKET_CELLS cells per bucket.
+	 */
+	unsigned lg_minbuckets;
+	unsigned lg_curbuckets;
+
+	/* Hash and comparison functions. */
+	ckh_hash_t *hash;
+	ckh_keycomp_t *keycomp;
+
+	/* Hash table with 2^lg_curbuckets buckets. */
+	ckhc_t *tab;
+} ckh_t;
+
+/******************************************************************************/
+/* BEGIN PUBLIC API */
+/******************************************************************************/
+
+/* Lifetime management.  Minitems is the initial capacity. */
+bool ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash,
+    ckh_keycomp_t *keycomp);
+void ckh_delete(tsd_t *tsd, ckh_t *ckh);
+
+/* Get the number of elements in the set. */
+size_t ckh_count(ckh_t *ckh);
+
+/*
+ * To iterate over the elements in the table, initialize *tabind to 0 and call
+ * this function until it returns true.  Each call that returns false will
+ * update *key and *data to the next element in the table, assuming the pointers
+ * are non-NULL.
+ */
+bool ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data);
+
+/*
+ * Basic hash table operations -- insert, removal, lookup.  For ckh_remove and
+ * ckh_search, key or data can be NULL.  The hash-table only stores pointers to
+ * the key and value, and doesn't do any lifetime management.
+ */
+bool ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data);
+bool ckh_remove(tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key,
+    void **data);
+bool ckh_search(ckh_t *ckh, const void *searchkey, void **key, void **data);
+
+/* Some useful hash and comparison functions for strings and pointers. */
+void ckh_string_hash(const void *key, size_t r_hash[2]);
+bool ckh_string_keycomp(const void *k1, const void *k2);
+void ckh_pointer_hash(const void *key, size_t r_hash[2]);
+bool ckh_pointer_keycomp(const void *k1, const void *k2);
+
+#endif /* JEMALLOC_INTERNAL_CKH_H */
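As a usage illustration (not part of the commit), a string-keyed table built on the public API above might look like the following; error handling is elided and the usual true-on-failure return convention is assumed:

static void
example_ckh_usage(tsd_t *tsd) {
	ckh_t ckh;
	if (ckh_new(tsd, &ckh, 8, ckh_string_hash, ckh_string_keycomp)) {
		return;	/* Allocation failure. */
	}
	ckh_insert(tsd, &ckh, "key", "value");
	void *k, *v;
	if (!ckh_search(&ckh, "key", &k, &v)) {
		/* Found; k/v point at the stored key and data. */
	}
	size_t tabind = 0;
	while (!ckh_iter(&ckh, &tabind, &k, &v)) {
		/* Visits every (key, data) pair exactly once. */
	}
	ckh_delete(tsd, &ckh);
}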

+ 34 - 0
BeefRT/JEMalloc/include/jemalloc/internal/counter.h

@@ -0,0 +1,34 @@
+#ifndef JEMALLOC_INTERNAL_COUNTER_H
+#define JEMALLOC_INTERNAL_COUNTER_H
+
+#include "jemalloc/internal/mutex.h"
+
+typedef struct counter_accum_s {
+	LOCKEDINT_MTX_DECLARE(mtx)
+	locked_u64_t accumbytes;
+	uint64_t interval;
+} counter_accum_t;
+
+JEMALLOC_ALWAYS_INLINE bool
+counter_accum(tsdn_t *tsdn, counter_accum_t *counter, uint64_t bytes) {
+	uint64_t interval = counter->interval;
+	assert(interval > 0);
+	LOCKEDINT_MTX_LOCK(tsdn, counter->mtx);
+	/*
+	 * If the event moves fast enough (and/or if the event handling is slow
+	 * enough), extreme overflow can cause counter trigger coalescing.
+	 * This is an intentional mechanism that avoids rate-limiting
+	 * allocation.
+	 */
+	bool overflow = locked_inc_mod_u64(tsdn, LOCKEDINT_MTX(counter->mtx),
+	    &counter->accumbytes, bytes, interval);
+	LOCKEDINT_MTX_UNLOCK(tsdn, counter->mtx);
+	return overflow;
+}
+
+bool counter_accum_init(counter_accum_t *counter, uint64_t interval);
+void counter_prefork(tsdn_t *tsdn, counter_accum_t *counter);
+void counter_postfork_parent(tsdn_t *tsdn, counter_accum_t *counter);
+void counter_postfork_child(tsdn_t *tsdn, counter_accum_t *counter);
+
+#endif /* JEMALLOC_INTERNAL_COUNTER_H */
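A small sketch of the intended pattern: initialize the counter once with an interval, call counter_accum on every event, and react when it reports that the accumulated bytes crossed the interval. handle_interval_event() is a hypothetical handler:

/* counter_accum_init(&example_counter, interval) must run during boot. */
static counter_accum_t example_counter;

static void
example_on_alloc(tsdn_t *tsdn, uint64_t usize) {
	if (counter_accum(tsdn, &example_counter, usize)) {
		/* Interval crossed (possibly several times, coalesced). */
		handle_interval_event(tsdn);	/* Hypothetical handler. */
	}
}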

+ 159 - 0
BeefRT/JEMalloc/include/jemalloc/internal/ctl.h

@@ -0,0 +1,159 @@
+#ifndef JEMALLOC_INTERNAL_CTL_H
+#define JEMALLOC_INTERNAL_CTL_H
+
+#include "jemalloc/internal/jemalloc_internal_types.h"
+#include "jemalloc/internal/malloc_io.h"
+#include "jemalloc/internal/mutex_prof.h"
+#include "jemalloc/internal/ql.h"
+#include "jemalloc/internal/sc.h"
+#include "jemalloc/internal/stats.h"
+
+/* Maximum ctl tree depth. */
+#define CTL_MAX_DEPTH	7
+
+typedef struct ctl_node_s {
+	bool named;
+} ctl_node_t;
+
+typedef struct ctl_named_node_s {
+	ctl_node_t node;
+	const char *name;
+	/* If (nchildren == 0), this is a terminal node. */
+	size_t nchildren;
+	const ctl_node_t *children;
+	int (*ctl)(tsd_t *, const size_t *, size_t, void *, size_t *, void *,
+	    size_t);
+} ctl_named_node_t;
+
+typedef struct ctl_indexed_node_s {
+	struct ctl_node_s node;
+	const ctl_named_node_t *(*index)(tsdn_t *, const size_t *, size_t,
+	    size_t);
+} ctl_indexed_node_t;
+
+typedef struct ctl_arena_stats_s {
+	arena_stats_t astats;
+
+	/* Aggregate stats for small size classes, based on bin stats. */
+	size_t allocated_small;
+	uint64_t nmalloc_small;
+	uint64_t ndalloc_small;
+	uint64_t nrequests_small;
+	uint64_t nfills_small;
+	uint64_t nflushes_small;
+
+	bin_stats_data_t bstats[SC_NBINS];
+	arena_stats_large_t lstats[SC_NSIZES - SC_NBINS];
+	pac_estats_t estats[SC_NPSIZES];
+	hpa_shard_stats_t hpastats;
+	sec_stats_t secstats;
+} ctl_arena_stats_t;
+
+typedef struct ctl_stats_s {
+	size_t allocated;
+	size_t active;
+	size_t metadata;
+	size_t metadata_thp;
+	size_t resident;
+	size_t mapped;
+	size_t retained;
+
+	background_thread_stats_t background_thread;
+	mutex_prof_data_t mutex_prof_data[mutex_prof_num_global_mutexes];
+} ctl_stats_t;
+
+typedef struct ctl_arena_s ctl_arena_t;
+struct ctl_arena_s {
+	unsigned arena_ind;
+	bool initialized;
+	ql_elm(ctl_arena_t) destroyed_link;
+
+	/* Basic stats, supported even if !config_stats. */
+	unsigned nthreads;
+	const char *dss;
+	ssize_t dirty_decay_ms;
+	ssize_t muzzy_decay_ms;
+	size_t pactive;
+	size_t pdirty;
+	size_t pmuzzy;
+
+	/* NULL if !config_stats. */
+	ctl_arena_stats_t *astats;
+};
+
+typedef struct ctl_arenas_s {
+	uint64_t epoch;
+	unsigned narenas;
+	ql_head(ctl_arena_t) destroyed;
+
+	/*
+	 * Element 0 corresponds to merged stats for extant arenas (accessed via
+	 * MALLCTL_ARENAS_ALL), element 1 corresponds to merged stats for
+	 * destroyed arenas (accessed via MALLCTL_ARENAS_DESTROYED), and the
+	 * remaining MALLOCX_ARENA_LIMIT elements correspond to arenas.
+	 */
+	ctl_arena_t *arenas[2 + MALLOCX_ARENA_LIMIT];
+} ctl_arenas_t;
+
+int ctl_byname(tsd_t *tsd, const char *name, void *oldp, size_t *oldlenp,
+    void *newp, size_t newlen);
+int ctl_nametomib(tsd_t *tsd, const char *name, size_t *mibp, size_t *miblenp);
+int ctl_bymib(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
+    size_t *oldlenp, void *newp, size_t newlen);
+int ctl_mibnametomib(tsd_t *tsd, size_t *mib, size_t miblen, const char *name,
+    size_t *miblenp);
+int ctl_bymibname(tsd_t *tsd, size_t *mib, size_t miblen, const char *name,
+    size_t *miblenp, void *oldp, size_t *oldlenp, void *newp, size_t newlen);
+bool ctl_boot(void);
+void ctl_prefork(tsdn_t *tsdn);
+void ctl_postfork_parent(tsdn_t *tsdn);
+void ctl_postfork_child(tsdn_t *tsdn);
+void ctl_mtx_assert_held(tsdn_t *tsdn);
+
+#define xmallctl(name, oldp, oldlenp, newp, newlen) do {		\
+	if (je_mallctl(name, oldp, oldlenp, newp, newlen)		\
+	    != 0) {							\
+		malloc_printf(						\
+		    "<jemalloc>: Failure in xmallctl(\"%s\", ...)\n",	\
+		    name);						\
+		abort();						\
+	}								\
+} while (0)
+
+#define xmallctlnametomib(name, mibp, miblenp) do {			\
+	if (je_mallctlnametomib(name, mibp, miblenp) != 0) {		\
+		malloc_printf("<jemalloc>: Failure in "			\
+		    "xmallctlnametomib(\"%s\", ...)\n", name);		\
+		abort();						\
+	}								\
+} while (0)
+
+#define xmallctlbymib(mib, miblen, oldp, oldlenp, newp, newlen) do {	\
+	if (je_mallctlbymib(mib, miblen, oldp, oldlenp, newp,		\
+	    newlen) != 0) {						\
+		malloc_write(						\
+		    "<jemalloc>: Failure in xmallctlbymib()\n");	\
+		abort();						\
+	}								\
+} while (0)
+
+#define xmallctlmibnametomib(mib, miblen, name, miblenp) do {		\
+	if (ctl_mibnametomib(tsd_fetch(), mib, miblen, name, miblenp)	\
+	    != 0) {							\
+		malloc_write(						\
+		    "<jemalloc>: Failure in ctl_mibnametomib()\n");	\
+		abort();						\
+	}								\
+} while (0)
+
+#define xmallctlbymibname(mib, miblen, name, miblenp, oldp, oldlenp,	\
+    newp, newlen) do {							\
+	if (ctl_bymibname(tsd_fetch(), mib, miblen, name, miblenp,	\
+	    oldp, oldlenp, newp, newlen) != 0) {			\
+		malloc_write(						\
+		    "<jemalloc>: Failure in ctl_bymibname()\n");	\
+		abort();						\
+	}								\
+} while (0)
+
+#endif /* JEMALLOC_INTERNAL_CTL_H */
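For orientation, the x-prefixed wrappers above are for must-succeed lookups; a typical stats read through jemalloc's documented "epoch" and "stats.allocated" mallctl names looks roughly like this sketch:

static size_t
example_stats_allocated(void) {
	uint64_t epoch = 1;
	size_t sz = sizeof(epoch);
	/* Advance the epoch so the stats snapshot is refreshed. */
	xmallctl("epoch", &epoch, &sz, &epoch, sizeof(epoch));

	size_t allocated;
	sz = sizeof(allocated);
	xmallctl("stats.allocated", &allocated, &sz, NULL, 0);
	return allocated;
}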

+ 186 - 0
BeefRT/JEMalloc/include/jemalloc/internal/decay.h

@@ -0,0 +1,186 @@
+#ifndef JEMALLOC_INTERNAL_DECAY_H
+#define JEMALLOC_INTERNAL_DECAY_H
+
+#include "jemalloc/internal/smoothstep.h"
+
+#define DECAY_UNBOUNDED_TIME_TO_PURGE ((uint64_t)-1)
+
+/*
+ * The decay_t computes the number of pages we should purge at any given time.
+ * Page allocators inform a decay object when pages enter a decay-able state
+ * (i.e. dirty or muzzy), and query it to determine how many pages should be
+ * purged at any given time.
+ *
+ * This is mostly a single-threaded data structure and doesn't care about
+ * synchronization at all; it's the caller's responsibility to provide its
+ * own synchronization.  There are two exceptions:
+ * 1) It's OK to racily call decay_ms_read (i.e. just the simplest state query).
+ * 2) The mtx and purging fields live (and are initialized) here, but are
+ *    logically owned by the page allocator.  This is just a convenience (since
+ *    those fields would be duplicated for both the dirty and muzzy states
+ *    otherwise).
+ */
+typedef struct decay_s decay_t;
+struct decay_s {
+	/* Synchronizes all non-atomic fields. */
+	malloc_mutex_t mtx;
+	/*
+	 * True if a thread is currently purging the extents associated with
+	 * this decay structure.
+	 */
+	bool purging;
+	/*
+	 * Approximate time in milliseconds from the creation of a set of unused
+	 * dirty pages until an equivalent set of unused dirty pages is purged
+	 * and/or reused.
+	 */
+	atomic_zd_t time_ms;
+	/* time / SMOOTHSTEP_NSTEPS. */
+	nstime_t interval;
+	/*
+	 * Time at which the current decay interval logically started.  We do
+	 * not actually advance to a new epoch until sometime after it starts
+	 * because of scheduling and computation delays, and it is even possible
+	 * to completely skip epochs.  In all cases, during epoch advancement we
+	 * merge all relevant activity into the most recently recorded epoch.
+	 */
+	nstime_t epoch;
+	/* Deadline randomness generator. */
+	uint64_t jitter_state;
+	/*
+	 * Deadline for current epoch.  This is the sum of interval and per
+	 * epoch jitter which is a uniform random variable in [0..interval).
+	 * Epochs always advance by precise multiples of interval, but we
+	 * randomize the deadline to reduce the likelihood of arenas purging in
+	 * lockstep.
+	 */
+	nstime_t deadline;
+	/*
+	 * The number of pages we cap ourselves at in the current epoch, per
+	 * decay policies.  Updated on an epoch change.  After an epoch change,
+	 * the caller should take steps to try to purge down to this amount.
+	 */
+	size_t npages_limit;
+	/*
+	 * Number of unpurged pages at beginning of current epoch.  During epoch
+	 * advancement we use the delta between arena->decay_*.nunpurged and
+	 * ecache_npages_get(&arena->ecache_*) to determine how many dirty pages,
+	 * if any, were generated.
+	 */
+	size_t nunpurged;
+	/*
+	 * Trailing log of how many unused dirty pages were generated during
+	 * each of the past SMOOTHSTEP_NSTEPS decay epochs, where the last
+	 * element is the most recent epoch.  Corresponding epoch times are
+	 * relative to epoch.
+	 *
+	 * Updated only on epoch advance, triggered by
+	 * decay_maybe_advance_epoch, below.
+	 */
+	size_t backlog[SMOOTHSTEP_NSTEPS];
+
+	/* Peak number of pages in associated extents.  Used for debug only. */
+	uint64_t ceil_npages;
+};
+
+/*
+ * The current decay time setting.  This is the only public access to a decay_t
+ * that's allowed without holding mtx.
+ */
+static inline ssize_t
+decay_ms_read(const decay_t *decay) {
+	return atomic_load_zd(&decay->time_ms, ATOMIC_RELAXED);
+}
+
+/*
+ * See the comment on the struct field -- the limit on pages we should allow in
+ * this decay state this epoch.
+ */
+static inline size_t
+decay_npages_limit_get(const decay_t *decay) {
+	return decay->npages_limit;
+}
+
+/* How many unused dirty pages were generated during the last epoch. */
+static inline size_t
+decay_epoch_npages_delta(const decay_t *decay) {
+	return decay->backlog[SMOOTHSTEP_NSTEPS - 1];
+}
+
+/*
+ * Current epoch duration, in nanoseconds.  Given that new epochs are started
+ * somewhat haphazardly, this is not necessarily exactly the time between any
+ * two calls to decay_maybe_advance_epoch; see the comments on fields in the
+ * decay_t.
+ */
+static inline uint64_t
+decay_epoch_duration_ns(const decay_t *decay) {
+	return nstime_ns(&decay->interval);
+}
+
+static inline bool
+decay_immediately(const decay_t *decay) {
+	ssize_t decay_ms = decay_ms_read(decay);
+	return decay_ms == 0;
+}
+
+static inline bool
+decay_disabled(const decay_t *decay) {
+	ssize_t decay_ms = decay_ms_read(decay);
+	return decay_ms < 0;
+}
+
+/* Returns true if decay is enabled and done gradually. */
+static inline bool
+decay_gradually(const decay_t *decay) {
+	ssize_t decay_ms = decay_ms_read(decay);
+	return decay_ms > 0;
+}
+
+/*
+ * Returns true if the passed in decay time setting is valid.
+ * < -1 : invalid
+ * -1   : never decay
+ *  0   : decay immediately
+ *  > 0 : some positive decay time, up to a maximum allowed value of
+ *  NSTIME_SEC_MAX * 1000, which corresponds to decaying somewhere in the early
+ *  27th century.  By that time, we expect to have implemented alternate purging
+ *  strategies.
+ */
+bool decay_ms_valid(ssize_t decay_ms);
+
+/*
+ * As a precondition, the decay_t must be zeroed out (as if with memset).
+ *
+ * Returns true on error.
+ */
+bool decay_init(decay_t *decay, nstime_t *cur_time, ssize_t decay_ms);
+
+/*
+ * Given an already-initialized decay_t, reinitialize it with the given decay
+ * time.  The decay_t must have previously been initialized (and should not then
+ * be zeroed).
+ */
+void decay_reinit(decay_t *decay, nstime_t *cur_time, ssize_t decay_ms);
+
+/*
+ * Compute how many of 'npages_new' pages we would need to purge in 'time'.
+ */
+uint64_t decay_npages_purge_in(decay_t *decay, nstime_t *time,
+    size_t npages_new);
+
+/* Returns true if the epoch advanced and there are pages to purge. */
+bool decay_maybe_advance_epoch(decay_t *decay, nstime_t *new_time,
+    size_t current_npages);
+
+/*
+ * Calculates wait time until a number of pages in the interval
+ * [0.5 * npages_threshold .. 1.5 * npages_threshold] should be purged.
+ *
+ * Returns number of nanoseconds or DECAY_UNBOUNDED_TIME_TO_PURGE in case of
+ * indefinite wait.
+ */
+uint64_t decay_ns_until_purge(decay_t *decay, size_t npages_current,
+    uint64_t npages_threshold);
+
+#endif /* JEMALLOC_INTERNAL_DECAY_H */
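A hedged sketch of the call pattern described above, as a page allocator might drive it; the caller is assumed to hold decay->mtx, and the purging itself lives elsewhere:

static void
example_decay_tick(decay_t *decay, nstime_t *now, size_t npages_current) {
	if (!decay_gradually(decay)) {
		/* Disabled (-1) and immediate (0) policies are handled elsewhere. */
		return;
	}
	if (decay_maybe_advance_epoch(decay, now, npages_current)) {
		/* Epoch advanced; purge down toward the new limit. */
		size_t npages_limit = decay_npages_limit_get(decay);
		(void)npages_limit;
	}
}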

+ 41 - 0
BeefRT/JEMalloc/include/jemalloc/internal/div.h

@@ -0,0 +1,41 @@
+#ifndef JEMALLOC_INTERNAL_DIV_H
+#define JEMALLOC_INTERNAL_DIV_H
+
+#include "jemalloc/internal/assert.h"
+
+/*
+ * This module does the division that computes the index of a region in a slab,
+ * given its offset relative to the base.
+ * That is, given a divisor d and an n = i * d (all integers), we'll return i.
+ * We do some pre-computation to do this more quickly than a CPU division
+ * instruction.
+ * We bound n < 2^32, and don't support dividing by one.
+ */
+
+typedef struct div_info_s div_info_t;
+struct div_info_s {
+	uint32_t magic;
+#ifdef JEMALLOC_DEBUG
+	size_t d;
+#endif
+};
+
+void div_init(div_info_t *div_info, size_t divisor);
+
+static inline size_t
+div_compute(div_info_t *div_info, size_t n) {
+	assert(n <= (uint32_t)-1);
+	/*
+	 * This generates, e.g. mov; imul; shr on x86-64. On a 32-bit machine,
+	 * the compilers I tried were all smart enough to turn this into the
+	 * appropriate "get the high 32 bits of the result of a multiply" (e.g.
+	 * mul; mov edx eax; on x86, umull on arm, etc.).
+	 */
+	size_t i = ((uint64_t)n * (uint64_t)div_info->magic) >> 32;
+#ifdef JEMALLOC_DEBUG
+	assert(i * div_info->d == n);
+#endif
+	return i;
+}
+
+#endif /* JEMALLOC_INTERNAL_DIV_H */
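As a worked example of the module comment (in practice the div_info_t would be precomputed once per size class rather than per call):

static size_t
example_region_index(size_t region_size, size_t offset_in_slab) {
	div_info_t div;
	div_init(&div, region_size);	/* region_size > 1, per the comment above. */
	/* offset_in_slab must be an exact multiple of region_size and < 2^32. */
	return div_compute(&div, offset_in_slab);
}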

+ 55 - 0
BeefRT/JEMalloc/include/jemalloc/internal/ecache.h

@@ -0,0 +1,55 @@
+#ifndef JEMALLOC_INTERNAL_ECACHE_H
+#define JEMALLOC_INTERNAL_ECACHE_H
+
+#include "jemalloc/internal/eset.h"
+#include "jemalloc/internal/san.h"
+#include "jemalloc/internal/mutex.h"
+
+typedef struct ecache_s ecache_t;
+struct ecache_s {
+	malloc_mutex_t mtx;
+	eset_t eset;
+	eset_t guarded_eset;
+	/* All stored extents must be in the same state. */
+	extent_state_t state;
+	/* The index of the ehooks the ecache is associated with. */
+	unsigned ind;
+	/*
+	 * If true, delay coalescing until eviction; otherwise coalesce during
+	 * deallocation.
+	 */
+	bool delay_coalesce;
+};
+
+static inline size_t
+ecache_npages_get(ecache_t *ecache) {
+	return eset_npages_get(&ecache->eset) +
+	    eset_npages_get(&ecache->guarded_eset);
+}
+
+/* Get the number of extents in the given page size index. */
+static inline size_t
+ecache_nextents_get(ecache_t *ecache, pszind_t ind) {
+	return eset_nextents_get(&ecache->eset, ind) +
+	    eset_nextents_get(&ecache->guarded_eset, ind);
+}
+
+/* Get the sum total bytes of the extents in the given page size index. */
+static inline size_t
+ecache_nbytes_get(ecache_t *ecache, pszind_t ind) {
+	return eset_nbytes_get(&ecache->eset, ind) +
+	    eset_nbytes_get(&ecache->guarded_eset, ind);
+}
+
+static inline unsigned
+ecache_ind_get(ecache_t *ecache) {
+	return ecache->ind;
+}
+
+bool ecache_init(tsdn_t *tsdn, ecache_t *ecache, extent_state_t state,
+    unsigned ind, bool delay_coalesce);
+void ecache_prefork(tsdn_t *tsdn, ecache_t *ecache);
+void ecache_postfork_parent(tsdn_t *tsdn, ecache_t *ecache);
+void ecache_postfork_child(tsdn_t *tsdn, ecache_t *ecache);
+
+#endif /* JEMALLOC_INTERNAL_ECACHE_H */

+ 698 - 0
BeefRT/JEMalloc/include/jemalloc/internal/edata.h

@@ -0,0 +1,698 @@
+#ifndef JEMALLOC_INTERNAL_EDATA_H
+#define JEMALLOC_INTERNAL_EDATA_H
+
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/bin_info.h"
+#include "jemalloc/internal/bit_util.h"
+#include "jemalloc/internal/hpdata.h"
+#include "jemalloc/internal/nstime.h"
+#include "jemalloc/internal/ph.h"
+#include "jemalloc/internal/ql.h"
+#include "jemalloc/internal/sc.h"
+#include "jemalloc/internal/slab_data.h"
+#include "jemalloc/internal/sz.h"
+#include "jemalloc/internal/typed_list.h"
+
+/*
+ * sizeof(edata_t) is 128 bytes on 64-bit architectures.  Ensure the alignment
+ * to free up the low bits in the rtree leaf.
+ */
+#define EDATA_ALIGNMENT 128
+
+enum extent_state_e {
+	extent_state_active   = 0,
+	extent_state_dirty    = 1,
+	extent_state_muzzy    = 2,
+	extent_state_retained = 3,
+	extent_state_transition = 4, /* States below are intermediate. */
+	extent_state_merging = 5,
+	extent_state_max = 5 /* Sanity checking only. */
+};
+typedef enum extent_state_e extent_state_t;
+
+enum extent_head_state_e {
+	EXTENT_NOT_HEAD,
+	EXTENT_IS_HEAD   /* See comments in ehooks_default_merge_impl(). */
+};
+typedef enum extent_head_state_e extent_head_state_t;
+
+/*
+ * Which implementation of the page allocator interface, (PAI, defined in
+ * pai.h) owns the given extent?
+ */
+enum extent_pai_e {
+	EXTENT_PAI_PAC = 0,
+	EXTENT_PAI_HPA = 1
+};
+typedef enum extent_pai_e extent_pai_t;
+
+struct e_prof_info_s {
+	/* Time when this was allocated. */
+	nstime_t	e_prof_alloc_time;
+	/* Allocation request size. */
+	size_t		e_prof_alloc_size;
+	/* Points to a prof_tctx_t. */
+	atomic_p_t	e_prof_tctx;
+	/*
+	 * Points to a prof_recent_t for the allocation; NULL
+	 * means the recent allocation record no longer exists.
+	 * Protected by prof_recent_alloc_mtx.
+	 */
+	atomic_p_t	e_prof_recent_alloc;
+};
+typedef struct e_prof_info_s e_prof_info_t;
+
+/*
+ * The information about a particular edata that lives in an emap.  Space is
+ * more precious there (the information, plus the edata pointer, has to live in
+ * a 64-bit word if we want to enable a packed representation).
+ *
+ * There are two things that are special about the information here:
+ * - It's quicker to access.  You have one fewer pointer hop, since finding the
+ *   edata_t associated with an item always requires accessing the rtree leaf in
+ *   which this data is stored.
+ * - It can be read unsynchronized, and without worrying about lifetime issues.
+ */
+typedef struct edata_map_info_s edata_map_info_t;
+struct edata_map_info_s {
+	bool slab;
+	szind_t szind;
+};
+
+typedef struct edata_cmp_summary_s edata_cmp_summary_t;
+struct edata_cmp_summary_s {
+	uint64_t sn;
+	uintptr_t addr;
+};
+
+/* Extent (span of pages).  Use accessor functions for e_* fields. */
+typedef struct edata_s edata_t;
+ph_structs(edata_avail, edata_t);
+ph_structs(edata_heap, edata_t);
+struct edata_s {
+	/*
+	 * Bitfield containing several fields:
+	 *
+	 * a: arena_ind
+	 * b: slab
+	 * c: committed
+	 * p: pai
+	 * z: zeroed
+	 * g: guarded
+	 * t: state
+	 * i: szind
+	 * f: nfree
+	 * s: bin_shard
+	 *
+	 * 00000000 ... 0000ssss ssffffff ffffiiii iiiitttg zpcbaaaa aaaaaaaa
+	 *
+	 * arena_ind: Arena from which this extent came, or all 1 bits if
+	 *            unassociated.
+	 *
+	 * slab: The slab flag indicates whether the extent is used for a slab
+	 *       of small regions.  This helps differentiate small size classes,
+	 *       and it indicates whether interior pointers can be looked up via
+	 *       iealloc().
+	 *
+	 * committed: The committed flag indicates whether physical memory is
+	 *            committed to the extent, whether explicitly or implicitly
+	 *            as on a system that overcommits and satisfies physical
+	 *            memory needs on demand via soft page faults.
+	 *
+	 * pai: The pai flag is an extent_pai_t.
+	 *
+	 * zeroed: The zeroed flag is used by extent recycling code to track
+	 *         whether memory is zero-filled.
+	 *
+	 * guarded: The guarded flag is use by the sanitizer to track whether
+	 *          the extent has page guards around it.
+	 *
+	 * state: The state flag is an extent_state_t.
+	 *
+	 * szind: The szind flag indicates usable size class index for
+	 *        allocations residing in this extent, regardless of whether the
+	 *        extent is a slab.  Extent size and usable size often differ
+	 *        even for non-slabs, either due to sz_large_pad or promotion of
+	 *        sampled small regions.
+	 *
+	 * nfree: Number of free regions in slab.
+	 *
+	 * bin_shard: the shard of the bin from which this extent came.
+	 */
+	uint64_t		e_bits;
+#define MASK(CURRENT_FIELD_WIDTH, CURRENT_FIELD_SHIFT) ((((((uint64_t)0x1U) << (CURRENT_FIELD_WIDTH)) - 1)) << (CURRENT_FIELD_SHIFT))
+
+#define EDATA_BITS_ARENA_WIDTH  MALLOCX_ARENA_BITS
+#define EDATA_BITS_ARENA_SHIFT  0
+#define EDATA_BITS_ARENA_MASK  MASK(EDATA_BITS_ARENA_WIDTH, EDATA_BITS_ARENA_SHIFT)
+
+#define EDATA_BITS_SLAB_WIDTH  1
+#define EDATA_BITS_SLAB_SHIFT  (EDATA_BITS_ARENA_WIDTH + EDATA_BITS_ARENA_SHIFT)
+#define EDATA_BITS_SLAB_MASK  MASK(EDATA_BITS_SLAB_WIDTH, EDATA_BITS_SLAB_SHIFT)
+
+#define EDATA_BITS_COMMITTED_WIDTH  1
+#define EDATA_BITS_COMMITTED_SHIFT  (EDATA_BITS_SLAB_WIDTH + EDATA_BITS_SLAB_SHIFT)
+#define EDATA_BITS_COMMITTED_MASK  MASK(EDATA_BITS_COMMITTED_WIDTH, EDATA_BITS_COMMITTED_SHIFT)
+
+#define EDATA_BITS_PAI_WIDTH  1
+#define EDATA_BITS_PAI_SHIFT  (EDATA_BITS_COMMITTED_WIDTH + EDATA_BITS_COMMITTED_SHIFT)
+#define EDATA_BITS_PAI_MASK  MASK(EDATA_BITS_PAI_WIDTH, EDATA_BITS_PAI_SHIFT)
+
+#define EDATA_BITS_ZEROED_WIDTH  1
+#define EDATA_BITS_ZEROED_SHIFT  (EDATA_BITS_PAI_WIDTH + EDATA_BITS_PAI_SHIFT)
+#define EDATA_BITS_ZEROED_MASK  MASK(EDATA_BITS_ZEROED_WIDTH, EDATA_BITS_ZEROED_SHIFT)
+
+#define EDATA_BITS_GUARDED_WIDTH  1
+#define EDATA_BITS_GUARDED_SHIFT  (EDATA_BITS_ZEROED_WIDTH + EDATA_BITS_ZEROED_SHIFT)
+#define EDATA_BITS_GUARDED_MASK  MASK(EDATA_BITS_GUARDED_WIDTH, EDATA_BITS_GUARDED_SHIFT)
+
+#define EDATA_BITS_STATE_WIDTH  3
+#define EDATA_BITS_STATE_SHIFT  (EDATA_BITS_GUARDED_WIDTH + EDATA_BITS_GUARDED_SHIFT)
+#define EDATA_BITS_STATE_MASK  MASK(EDATA_BITS_STATE_WIDTH, EDATA_BITS_STATE_SHIFT)
+
+#define EDATA_BITS_SZIND_WIDTH  LG_CEIL(SC_NSIZES)
+#define EDATA_BITS_SZIND_SHIFT  (EDATA_BITS_STATE_WIDTH + EDATA_BITS_STATE_SHIFT)
+#define EDATA_BITS_SZIND_MASK  MASK(EDATA_BITS_SZIND_WIDTH, EDATA_BITS_SZIND_SHIFT)
+
+#define EDATA_BITS_NFREE_WIDTH  (SC_LG_SLAB_MAXREGS + 1)
+#define EDATA_BITS_NFREE_SHIFT  (EDATA_BITS_SZIND_WIDTH + EDATA_BITS_SZIND_SHIFT)
+#define EDATA_BITS_NFREE_MASK  MASK(EDATA_BITS_NFREE_WIDTH, EDATA_BITS_NFREE_SHIFT)
+
+#define EDATA_BITS_BINSHARD_WIDTH  6
+#define EDATA_BITS_BINSHARD_SHIFT  (EDATA_BITS_NFREE_WIDTH + EDATA_BITS_NFREE_SHIFT)
+#define EDATA_BITS_BINSHARD_MASK  MASK(EDATA_BITS_BINSHARD_WIDTH, EDATA_BITS_BINSHARD_SHIFT)
+
+#define EDATA_BITS_IS_HEAD_WIDTH 1
+#define EDATA_BITS_IS_HEAD_SHIFT  (EDATA_BITS_BINSHARD_WIDTH + EDATA_BITS_BINSHARD_SHIFT)
+#define EDATA_BITS_IS_HEAD_MASK  MASK(EDATA_BITS_IS_HEAD_WIDTH, EDATA_BITS_IS_HEAD_SHIFT)
+
+	/* Pointer to the extent that this structure is responsible for. */
+	void			*e_addr;
+
+	union {
+		/*
+		 * Extent size and serial number associated with the extent
+		 * structure (different than the serial number for the extent at
+		 * e_addr).
+		 *
+		 * ssssssss [...] ssssssss ssssnnnn nnnnnnnn
+		 */
+		size_t			e_size_esn;
+	#define EDATA_SIZE_MASK	((size_t)~(PAGE-1))
+	#define EDATA_ESN_MASK		((size_t)PAGE-1)
+		/* Base extent size, which may not be a multiple of PAGE. */
+		size_t			e_bsize;
+	};
+
+	/*
+	 * If this edata is a user allocation from an HPA, it comes out of some
+	 * pageslab (we don't yet support hugepage allocations that don't fit
+	 * into pageslabs).  This tracks it.
+	 */
+	hpdata_t *e_ps;
+
+	/*
+	 * Serial number.  These are not necessarily unique; splitting an extent
+	 * results in two extents with the same serial number.
+	 */
+	uint64_t e_sn;
+
+	union {
+		/*
+		 * List linkage used when the edata_t is active; either in
+		 * arena's large allocations or bin_t's slabs_full.
+		 */
+		ql_elm(edata_t)	ql_link_active;
+		/*
+		 * Pairing heap linkage.  Used whenever the extent is inactive
+		 * (in the page allocators), or when it is active and in
+		 * slabs_nonfull, or when the edata_t is unassociated with an
+		 * extent and sitting in an edata_cache.
+		 */
+		union {
+			edata_heap_link_t heap_link;
+			edata_avail_link_t avail_link;
+		};
+	};
+
+	union {
+		/*
+		 * List linkage used when the extent is inactive:
+		 * - Stashed dirty extents
+		 * - Ecache LRU functionality.
+		 */
+		ql_elm(edata_t) ql_link_inactive;
+		/* Small region slab metadata. */
+		slab_data_t	e_slab_data;
+
+		/* Profiling data, used for large objects. */
+		e_prof_info_t	e_prof_info;
+	};
+};
+
+TYPED_LIST(edata_list_active, edata_t, ql_link_active)
+TYPED_LIST(edata_list_inactive, edata_t, ql_link_inactive)
+
+static inline unsigned
+edata_arena_ind_get(const edata_t *edata) {
+	unsigned arena_ind = (unsigned)((edata->e_bits &
+	    EDATA_BITS_ARENA_MASK) >> EDATA_BITS_ARENA_SHIFT);
+	assert(arena_ind < MALLOCX_ARENA_LIMIT);
+
+	return arena_ind;
+}
+
+static inline szind_t
+edata_szind_get_maybe_invalid(const edata_t *edata) {
+	szind_t szind = (szind_t)((edata->e_bits & EDATA_BITS_SZIND_MASK) >>
+	    EDATA_BITS_SZIND_SHIFT);
+	assert(szind <= SC_NSIZES);
+	return szind;
+}
+
+static inline szind_t
+edata_szind_get(const edata_t *edata) {
+	szind_t szind = edata_szind_get_maybe_invalid(edata);
+	assert(szind < SC_NSIZES); /* Never call when "invalid". */
+	return szind;
+}
+
+static inline size_t
+edata_usize_get(const edata_t *edata) {
+	return sz_index2size(edata_szind_get(edata));
+}
+
+static inline unsigned
+edata_binshard_get(const edata_t *edata) {
+	unsigned binshard = (unsigned)((edata->e_bits &
+	    EDATA_BITS_BINSHARD_MASK) >> EDATA_BITS_BINSHARD_SHIFT);
+	assert(binshard < bin_infos[edata_szind_get(edata)].n_shards);
+	return binshard;
+}
+
+static inline uint64_t
+edata_sn_get(const edata_t *edata) {
+	return edata->e_sn;
+}
+
+static inline extent_state_t
+edata_state_get(const edata_t *edata) {
+	return (extent_state_t)((edata->e_bits & EDATA_BITS_STATE_MASK) >>
+	    EDATA_BITS_STATE_SHIFT);
+}
+
+static inline bool
+edata_guarded_get(const edata_t *edata) {
+	return (bool)((edata->e_bits & EDATA_BITS_GUARDED_MASK) >>
+	    EDATA_BITS_GUARDED_SHIFT);
+}
+
+static inline bool
+edata_zeroed_get(const edata_t *edata) {
+	return (bool)((edata->e_bits & EDATA_BITS_ZEROED_MASK) >>
+	    EDATA_BITS_ZEROED_SHIFT);
+}
+
+static inline bool
+edata_committed_get(const edata_t *edata) {
+	return (bool)((edata->e_bits & EDATA_BITS_COMMITTED_MASK) >>
+	    EDATA_BITS_COMMITTED_SHIFT);
+}
+
+static inline extent_pai_t
+edata_pai_get(const edata_t *edata) {
+	return (extent_pai_t)((edata->e_bits & EDATA_BITS_PAI_MASK) >>
+	    EDATA_BITS_PAI_SHIFT);
+}
+
+static inline bool
+edata_slab_get(const edata_t *edata) {
+	return (bool)((edata->e_bits & EDATA_BITS_SLAB_MASK) >>
+	    EDATA_BITS_SLAB_SHIFT);
+}
+
+static inline unsigned
+edata_nfree_get(const edata_t *edata) {
+	assert(edata_slab_get(edata));
+	return (unsigned)((edata->e_bits & EDATA_BITS_NFREE_MASK) >>
+	    EDATA_BITS_NFREE_SHIFT);
+}
+
+static inline void *
+edata_base_get(const edata_t *edata) {
+	assert(edata->e_addr == PAGE_ADDR2BASE(edata->e_addr) ||
+	    !edata_slab_get(edata));
+	return PAGE_ADDR2BASE(edata->e_addr);
+}
+
+static inline void *
+edata_addr_get(const edata_t *edata) {
+	assert(edata->e_addr == PAGE_ADDR2BASE(edata->e_addr) ||
+	    !edata_slab_get(edata));
+	return edata->e_addr;
+}
+
+static inline size_t
+edata_size_get(const edata_t *edata) {
+	return (edata->e_size_esn & EDATA_SIZE_MASK);
+}
+
+static inline size_t
+edata_esn_get(const edata_t *edata) {
+	return (edata->e_size_esn & EDATA_ESN_MASK);
+}
+
+static inline size_t
+edata_bsize_get(const edata_t *edata) {
+	return edata->e_bsize;
+}
+
+static inline hpdata_t *
+edata_ps_get(const edata_t *edata) {
+	assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
+	return edata->e_ps;
+}
+
+static inline void *
+edata_before_get(const edata_t *edata) {
+	return (void *)((uintptr_t)edata_base_get(edata) - PAGE);
+}
+
+static inline void *
+edata_last_get(const edata_t *edata) {
+	return (void *)((uintptr_t)edata_base_get(edata) +
+	    edata_size_get(edata) - PAGE);
+}
+
+static inline void *
+edata_past_get(const edata_t *edata) {
+	return (void *)((uintptr_t)edata_base_get(edata) +
+	    edata_size_get(edata));
+}
+
+static inline slab_data_t *
+edata_slab_data_get(edata_t *edata) {
+	assert(edata_slab_get(edata));
+	return &edata->e_slab_data;
+}
+
+static inline const slab_data_t *
+edata_slab_data_get_const(const edata_t *edata) {
+	assert(edata_slab_get(edata));
+	return &edata->e_slab_data;
+}
+
+static inline prof_tctx_t *
+edata_prof_tctx_get(const edata_t *edata) {
+	return (prof_tctx_t *)atomic_load_p(&edata->e_prof_info.e_prof_tctx,
+	    ATOMIC_ACQUIRE);
+}
+
+static inline const nstime_t *
+edata_prof_alloc_time_get(const edata_t *edata) {
+	return &edata->e_prof_info.e_prof_alloc_time;
+}
+
+static inline size_t
+edata_prof_alloc_size_get(const edata_t *edata) {
+	return edata->e_prof_info.e_prof_alloc_size;
+}
+
+static inline prof_recent_t *
+edata_prof_recent_alloc_get_dont_call_directly(const edata_t *edata) {
+	return (prof_recent_t *)atomic_load_p(
+	    &edata->e_prof_info.e_prof_recent_alloc, ATOMIC_RELAXED);
+}
+
+static inline void
+edata_arena_ind_set(edata_t *edata, unsigned arena_ind) {
+	edata->e_bits = (edata->e_bits & ~EDATA_BITS_ARENA_MASK) |
+	    ((uint64_t)arena_ind << EDATA_BITS_ARENA_SHIFT);
+}
+
+static inline void
+edata_binshard_set(edata_t *edata, unsigned binshard) {
+	/* The assertion assumes szind is set already. */
+	assert(binshard < bin_infos[edata_szind_get(edata)].n_shards);
+	edata->e_bits = (edata->e_bits & ~EDATA_BITS_BINSHARD_MASK) |
+	    ((uint64_t)binshard << EDATA_BITS_BINSHARD_SHIFT);
+}
+
+static inline void
+edata_addr_set(edata_t *edata, void *addr) {
+	edata->e_addr = addr;
+}
+
+static inline void
+edata_size_set(edata_t *edata, size_t size) {
+	assert((size & ~EDATA_SIZE_MASK) == 0);
+	edata->e_size_esn = size | (edata->e_size_esn & ~EDATA_SIZE_MASK);
+}
+
+static inline void
+edata_esn_set(edata_t *edata, size_t esn) {
+	edata->e_size_esn = (edata->e_size_esn & ~EDATA_ESN_MASK) | (esn &
+	    EDATA_ESN_MASK);
+}
+
+static inline void
+edata_bsize_set(edata_t *edata, size_t bsize) {
+	edata->e_bsize = bsize;
+}
+
+static inline void
+edata_ps_set(edata_t *edata, hpdata_t *ps) {
+	assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
+	edata->e_ps = ps;
+}
+
+static inline void
+edata_szind_set(edata_t *edata, szind_t szind) {
+	assert(szind <= SC_NSIZES); /* SC_NSIZES means "invalid". */
+	edata->e_bits = (edata->e_bits & ~EDATA_BITS_SZIND_MASK) |
+	    ((uint64_t)szind << EDATA_BITS_SZIND_SHIFT);
+}
+
+static inline void
+edata_nfree_set(edata_t *edata, unsigned nfree) {
+	assert(edata_slab_get(edata));
+	edata->e_bits = (edata->e_bits & ~EDATA_BITS_NFREE_MASK) |
+	    ((uint64_t)nfree << EDATA_BITS_NFREE_SHIFT);
+}
+
+static inline void
+edata_nfree_binshard_set(edata_t *edata, unsigned nfree, unsigned binshard) {
+	/* The assertion assumes szind is set already. */
+	assert(binshard < bin_infos[edata_szind_get(edata)].n_shards);
+	edata->e_bits = (edata->e_bits &
+	    (~EDATA_BITS_NFREE_MASK & ~EDATA_BITS_BINSHARD_MASK)) |
+	    ((uint64_t)binshard << EDATA_BITS_BINSHARD_SHIFT) |
+	    ((uint64_t)nfree << EDATA_BITS_NFREE_SHIFT);
+}
+
+static inline void
+edata_nfree_inc(edata_t *edata) {
+	assert(edata_slab_get(edata));
+	edata->e_bits += ((uint64_t)1U << EDATA_BITS_NFREE_SHIFT);
+}
+
+static inline void
+edata_nfree_dec(edata_t *edata) {
+	assert(edata_slab_get(edata));
+	edata->e_bits -= ((uint64_t)1U << EDATA_BITS_NFREE_SHIFT);
+}
+
+static inline void
+edata_nfree_sub(edata_t *edata, uint64_t n) {
+	assert(edata_slab_get(edata));
+	edata->e_bits -= (n << EDATA_BITS_NFREE_SHIFT);
+}
+
+static inline void
+edata_sn_set(edata_t *edata, uint64_t sn) {
+	edata->e_sn = sn;
+}
+
+static inline void
+edata_state_set(edata_t *edata, extent_state_t state) {
+	edata->e_bits = (edata->e_bits & ~EDATA_BITS_STATE_MASK) |
+	    ((uint64_t)state << EDATA_BITS_STATE_SHIFT);
+}
+
+static inline void
+edata_guarded_set(edata_t *edata, bool guarded) {
+	edata->e_bits = (edata->e_bits & ~EDATA_BITS_GUARDED_MASK) |
+	    ((uint64_t)guarded << EDATA_BITS_GUARDED_SHIFT);
+}
+
+static inline void
+edata_zeroed_set(edata_t *edata, bool zeroed) {
+	edata->e_bits = (edata->e_bits & ~EDATA_BITS_ZEROED_MASK) |
+	    ((uint64_t)zeroed << EDATA_BITS_ZEROED_SHIFT);
+}
+
+static inline void
+edata_committed_set(edata_t *edata, bool committed) {
+	edata->e_bits = (edata->e_bits & ~EDATA_BITS_COMMITTED_MASK) |
+	    ((uint64_t)committed << EDATA_BITS_COMMITTED_SHIFT);
+}
+
+static inline void
+edata_pai_set(edata_t *edata, extent_pai_t pai) {
+	edata->e_bits = (edata->e_bits & ~EDATA_BITS_PAI_MASK) |
+	    ((uint64_t)pai << EDATA_BITS_PAI_SHIFT);
+}
+
+static inline void
+edata_slab_set(edata_t *edata, bool slab) {
+	edata->e_bits = (edata->e_bits & ~EDATA_BITS_SLAB_MASK) |
+	    ((uint64_t)slab << EDATA_BITS_SLAB_SHIFT);
+}
+
+static inline void
+edata_prof_tctx_set(edata_t *edata, prof_tctx_t *tctx) {
+	atomic_store_p(&edata->e_prof_info.e_prof_tctx, tctx, ATOMIC_RELEASE);
+}
+
+static inline void
+edata_prof_alloc_time_set(edata_t *edata, nstime_t *t) {
+	nstime_copy(&edata->e_prof_info.e_prof_alloc_time, t);
+}
+
+static inline void
+edata_prof_alloc_size_set(edata_t *edata, size_t size) {
+	edata->e_prof_info.e_prof_alloc_size = size;
+}
+
+static inline void
+edata_prof_recent_alloc_set_dont_call_directly(edata_t *edata,
+    prof_recent_t *recent_alloc) {
+	atomic_store_p(&edata->e_prof_info.e_prof_recent_alloc, recent_alloc,
+	    ATOMIC_RELAXED);
+}
+
+static inline bool
+edata_is_head_get(edata_t *edata) {
+	return (bool)((edata->e_bits & EDATA_BITS_IS_HEAD_MASK) >>
+	    EDATA_BITS_IS_HEAD_SHIFT);
+}
+
+static inline void
+edata_is_head_set(edata_t *edata, bool is_head) {
+	edata->e_bits = (edata->e_bits & ~EDATA_BITS_IS_HEAD_MASK) |
+	    ((uint64_t)is_head << EDATA_BITS_IS_HEAD_SHIFT);
+}
+
+static inline bool
+edata_state_in_transition(extent_state_t state) {
+	return state >= extent_state_transition;
+}
+
+/*
+ * Because this function is implemented as a sequence of bitfield modifications,
+ * even though each individual bit is properly initialized, we technically read
+ * uninitialized data within it.  This is mostly fine, since most callers get
+ * their edatas from zeroing sources, but callers who make stack edata_ts need
+ * to manually zero them.
+ */
+static inline void
+edata_init(edata_t *edata, unsigned arena_ind, void *addr, size_t size,
+    bool slab, szind_t szind, uint64_t sn, extent_state_t state, bool zeroed,
+    bool committed, extent_pai_t pai, extent_head_state_t is_head) {
+	assert(addr == PAGE_ADDR2BASE(addr) || !slab);
+
+	edata_arena_ind_set(edata, arena_ind);
+	edata_addr_set(edata, addr);
+	edata_size_set(edata, size);
+	edata_slab_set(edata, slab);
+	edata_szind_set(edata, szind);
+	edata_sn_set(edata, sn);
+	edata_state_set(edata, state);
+	edata_guarded_set(edata, false);
+	edata_zeroed_set(edata, zeroed);
+	edata_committed_set(edata, committed);
+	edata_pai_set(edata, pai);
+	edata_is_head_set(edata, is_head == EXTENT_IS_HEAD);
+	if (config_prof) {
+		edata_prof_tctx_set(edata, NULL);
+	}
+}
+
+static inline void
+edata_binit(edata_t *edata, void *addr, size_t bsize, uint64_t sn) {
+	edata_arena_ind_set(edata, (1U << MALLOCX_ARENA_BITS) - 1);
+	edata_addr_set(edata, addr);
+	edata_bsize_set(edata, bsize);
+	edata_slab_set(edata, false);
+	edata_szind_set(edata, SC_NSIZES);
+	edata_sn_set(edata, sn);
+	edata_state_set(edata, extent_state_active);
+	edata_guarded_set(edata, false);
+	edata_zeroed_set(edata, true);
+	edata_committed_set(edata, true);
+	/*
+	 * This isn't strictly true, but base allocated extents never get
+	 * deallocated and can't be looked up in the emap, but no sense in
+	 * wasting a state bit to encode this fact.
+	 */
+	edata_pai_set(edata, EXTENT_PAI_PAC);
+}
+
+static inline int
+edata_esn_comp(const edata_t *a, const edata_t *b) {
+	size_t a_esn = edata_esn_get(a);
+	size_t b_esn = edata_esn_get(b);
+
+	return (a_esn > b_esn) - (a_esn < b_esn);
+}
+
+static inline int
+edata_ead_comp(const edata_t *a, const edata_t *b) {
+	uintptr_t a_eaddr = (uintptr_t)a;
+	uintptr_t b_eaddr = (uintptr_t)b;
+
+	return (a_eaddr > b_eaddr) - (a_eaddr < b_eaddr);
+}
+
+static inline edata_cmp_summary_t
+edata_cmp_summary_get(const edata_t *edata) {
+	return (edata_cmp_summary_t){edata_sn_get(edata),
+		(uintptr_t)edata_addr_get(edata)};
+}
+
+static inline int
+edata_cmp_summary_comp(edata_cmp_summary_t a, edata_cmp_summary_t b) {
+	int ret;
+	ret = (a.sn > b.sn) - (a.sn < b.sn);
+	if (ret != 0) {
+		return ret;
+	}
+	ret = (a.addr > b.addr) - (a.addr < b.addr);
+	return ret;
+}
+
+static inline int
+edata_snad_comp(const edata_t *a, const edata_t *b) {
+	edata_cmp_summary_t a_cmp = edata_cmp_summary_get(a);
+	edata_cmp_summary_t b_cmp = edata_cmp_summary_get(b);
+
+	return edata_cmp_summary_comp(a_cmp, b_cmp);
+}
+
+static inline int
+edata_esnead_comp(const edata_t *a, const edata_t *b) {
+	int ret;
+
+	ret = edata_esn_comp(a, b);
+	if (ret != 0) {
+		return ret;
+	}
+
+	ret = edata_ead_comp(a, b);
+	return ret;
+}
+
+ph_proto(, edata_avail, edata_t)
+ph_proto(, edata_heap, edata_t)
+
+#endif /* JEMALLOC_INTERNAL_EDATA_H */
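Per the comment on edata_init, stack-allocated edata_ts must be zeroed by hand before initialization; a minimal sketch (the concrete argument values are illustrative only):

static void
example_stack_edata(unsigned arena_ind, void *addr, size_t size) {
	edata_t edata;
	memset(&edata, 0, sizeof(edata));	/* Required for stack edata_ts. */
	edata_init(&edata, arena_ind, addr, size, /* slab */ false,
	    SC_NSIZES /* "invalid" szind */, /* sn */ 0, extent_state_active,
	    /* zeroed */ false, /* committed */ true, EXTENT_PAI_PAC,
	    EXTENT_NOT_HEAD);
	assert(edata_size_get(&edata) == size);	/* size assumed page-aligned. */
}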

+ 49 - 0
BeefRT/JEMalloc/include/jemalloc/internal/edata_cache.h

@@ -0,0 +1,49 @@
+#ifndef JEMALLOC_INTERNAL_EDATA_CACHE_H
+#define JEMALLOC_INTERNAL_EDATA_CACHE_H
+
+#include "jemalloc/internal/base.h"
+
+/* For tests only. */
+#define EDATA_CACHE_FAST_FILL 4
+
+/*
+ * A cache of edata_t structures allocated via base_alloc_edata (as opposed to
+ * the underlying extents they describe).  The contents of returned edata_t
+ * objects are garbage and cannot be relied upon.
+ */
+
+typedef struct edata_cache_s edata_cache_t;
+struct edata_cache_s {
+	edata_avail_t avail;
+	atomic_zu_t count;
+	malloc_mutex_t mtx;
+	base_t *base;
+};
+
+bool edata_cache_init(edata_cache_t *edata_cache, base_t *base);
+edata_t *edata_cache_get(tsdn_t *tsdn, edata_cache_t *edata_cache);
+void edata_cache_put(tsdn_t *tsdn, edata_cache_t *edata_cache, edata_t *edata);
+
+void edata_cache_prefork(tsdn_t *tsdn, edata_cache_t *edata_cache);
+void edata_cache_postfork_parent(tsdn_t *tsdn, edata_cache_t *edata_cache);
+void edata_cache_postfork_child(tsdn_t *tsdn, edata_cache_t *edata_cache);
+
+/*
+ * An edata_cache_fast is like an edata_cache, but it relies on external
+ * synchronization and avoids first-fit strategies.
+ */
+
+typedef struct edata_cache_fast_s edata_cache_fast_t;
+struct edata_cache_fast_s {
+	edata_list_inactive_t list;
+	edata_cache_t *fallback;
+	bool disabled;
+};
+
+void edata_cache_fast_init(edata_cache_fast_t *ecs, edata_cache_t *fallback);
+edata_t *edata_cache_fast_get(tsdn_t *tsdn, edata_cache_fast_t *ecs);
+void edata_cache_fast_put(tsdn_t *tsdn, edata_cache_fast_t *ecs,
+    edata_t *edata);
+void edata_cache_fast_disable(tsdn_t *tsdn, edata_cache_fast_t *ecs);
+
+#endif /* JEMALLOC_INTERNAL_EDATA_CACHE_H */
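A small sketch of the layering described above: a per-consumer edata_cache_fast sitting in front of a shared, mutex-protected edata_cache. Return-value conventions here are assumptions, not guarantees from this header:

static void
example_edata_cache_fast(tsdn_t *tsdn, edata_cache_t *shared) {
	edata_cache_fast_t ecs;
	edata_cache_fast_init(&ecs, shared);	/* `shared` already edata_cache_init'ed. */
	edata_t *e = edata_cache_fast_get(tsdn, &ecs);	/* Presumably NULL on failure. */
	if (e != NULL) {
		/* ... use e as extent metadata ... */
		edata_cache_fast_put(tsdn, &ecs, e);
	}
}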

+ 412 - 0
BeefRT/JEMalloc/include/jemalloc/internal/ehooks.h

@@ -0,0 +1,412 @@
+#ifndef JEMALLOC_INTERNAL_EHOOKS_H
+#define JEMALLOC_INTERNAL_EHOOKS_H
+
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/extent_mmap.h"
+
+/*
+ * This module is the internal interface to the extent hooks (both
+ * user-specified and external).  Eventually, this will give us the flexibility
+ * to use multiple different versions of user-visible extent-hook APIs under a
+ * single user interface.
+ *
+ * Current API expansions (not available to anyone but the default hooks yet):
+ *   - Head state tracking.  Hooks can decide whether or not to merge two
+ *     extents based on whether or not one of them is the head (i.e. was
+ *     allocated on its own).  The later extent loses its "head" status.
+ */
+
+extern const extent_hooks_t ehooks_default_extent_hooks;
+
+typedef struct ehooks_s ehooks_t;
+struct ehooks_s {
+	/*
+	 * The user-visible id that goes with the ehooks (i.e. that of the base
+	 * they're a part of, the associated arena's index within the arenas
+	 * array).
+	 */
+	unsigned ind;
+	/* Logically an extent_hooks_t *. */
+	atomic_p_t ptr;
+};
+
+extern const extent_hooks_t ehooks_default_extent_hooks;
+
+/*
+ * These are not really part of the public API.  Each hook has a fast-path for
+ * the default-hooks case that can avoid various small inefficiencies:
+ *   - Forgetting tsd and then calling tsd_get within the hook.
+ *   - Getting more state than necessary out of the extent_t.
+ *   - Doing arena_ind -> arena -> arena_ind lookups.
+ * By making the calls to these functions visible to the compiler, it can move
+ * those extra bits of computation down below the fast-paths where they get ignored.
+ */
+void *ehooks_default_alloc_impl(tsdn_t *tsdn, void *new_addr, size_t size,
+    size_t alignment, bool *zero, bool *commit, unsigned arena_ind);
+bool ehooks_default_dalloc_impl(void *addr, size_t size);
+void ehooks_default_destroy_impl(void *addr, size_t size);
+bool ehooks_default_commit_impl(void *addr, size_t offset, size_t length);
+bool ehooks_default_decommit_impl(void *addr, size_t offset, size_t length);
+#ifdef PAGES_CAN_PURGE_LAZY
+bool ehooks_default_purge_lazy_impl(void *addr, size_t offset, size_t length);
+#endif
+#ifdef PAGES_CAN_PURGE_FORCED
+bool ehooks_default_purge_forced_impl(void *addr, size_t offset, size_t length);
+#endif
+bool ehooks_default_split_impl();
+/*
+ * Merge is the only default extent hook we declare -- see the comment in
+ * ehooks_merge.
+ */
+bool ehooks_default_merge(extent_hooks_t *extent_hooks, void *addr_a,
+    size_t size_a, void *addr_b, size_t size_b, bool committed,
+    unsigned arena_ind);
+bool ehooks_default_merge_impl(tsdn_t *tsdn, void *addr_a, void *addr_b);
+void ehooks_default_zero_impl(void *addr, size_t size);
+void ehooks_default_guard_impl(void *guard1, void *guard2);
+void ehooks_default_unguard_impl(void *guard1, void *guard2);
+
+/*
+ * We don't officially support reentrancy from within the extent hooks.  But
+ * various people who sit within throwing distance of the jemalloc team want
+ * that functionality in certain limited cases.  The default reentrancy guards
+ * assert that we're not reentrant from a0 (since it's the bootstrap arena,
+ * where reentrant allocations would be redirected), which we would incorrectly
+ * trigger in cases where a0 has extent hooks (those hooks themselves can't be
+ * reentrant, then, but there are reasonable uses for such functionality, like
+ * putting internal metadata on hugepages).  Therefore, we use the raw
+ * reentrancy guards.
+ *
+ * Eventually, we need to think more carefully about whether and where we
+ * support allocating from within extent hooks (and what that means for things
+ * like profiling, stats collection, etc.), and document what the guarantee is.
+ */
+static inline void
+ehooks_pre_reentrancy(tsdn_t *tsdn) {
+	tsd_t *tsd = tsdn_null(tsdn) ? tsd_fetch() : tsdn_tsd(tsdn);
+	tsd_pre_reentrancy_raw(tsd);
+}
+
+static inline void
+ehooks_post_reentrancy(tsdn_t *tsdn) {
+	tsd_t *tsd = tsdn_null(tsdn) ? tsd_fetch() : tsdn_tsd(tsdn);
+	tsd_post_reentrancy_raw(tsd);
+}
+
+/* Beginning of the public API. */
+void ehooks_init(ehooks_t *ehooks, extent_hooks_t *extent_hooks, unsigned ind);
+
+static inline unsigned
+ehooks_ind_get(const ehooks_t *ehooks) {
+	return ehooks->ind;
+}
+
+static inline void
+ehooks_set_extent_hooks_ptr(ehooks_t *ehooks, extent_hooks_t *extent_hooks) {
+	atomic_store_p(&ehooks->ptr, extent_hooks, ATOMIC_RELEASE);
+}
+
+static inline extent_hooks_t *
+ehooks_get_extent_hooks_ptr(ehooks_t *ehooks) {
+	return (extent_hooks_t *)atomic_load_p(&ehooks->ptr, ATOMIC_ACQUIRE);
+}
+
+static inline bool
+ehooks_are_default(ehooks_t *ehooks) {
+	return ehooks_get_extent_hooks_ptr(ehooks) ==
+	    &ehooks_default_extent_hooks;
+}
+
+/*
+ * In some cases, a caller needs to allocate resources before attempting to call
+ * a hook.  If that hook is doomed to fail, this is wasteful.  We therefore
+ * include some checks for such cases.
+ */
+static inline bool
+ehooks_dalloc_will_fail(ehooks_t *ehooks) {
+	if (ehooks_are_default(ehooks)) {
+		return opt_retain;
+	} else {
+		return ehooks_get_extent_hooks_ptr(ehooks)->dalloc == NULL;
+	}
+}
+
+static inline bool
+ehooks_split_will_fail(ehooks_t *ehooks) {
+	return ehooks_get_extent_hooks_ptr(ehooks)->split == NULL;
+}
+
+static inline bool
+ehooks_merge_will_fail(ehooks_t *ehooks) {
+	return ehooks_get_extent_hooks_ptr(ehooks)->merge == NULL;
+}
+
+static inline bool
+ehooks_guard_will_fail(ehooks_t *ehooks) {
+	/*
+	 * Before the guard hooks are officially introduced, limit the use to
+	 * the default hooks only.
+	 */
+	return !ehooks_are_default(ehooks);
+}
+
+/*
+ * Some hooks are required to return zeroed memory in certain situations.  In
+ * debug mode, we do some heuristic checks that they did what they were supposed
+ * to.
+ *
+ * This isn't really ehooks-specific (i.e. anyone can check for zeroed memory).
+ * But incorrect zero information indicates an ehook bug.
+ */
+static inline void
+ehooks_debug_zero_check(void *addr, size_t size) {
+	assert(((uintptr_t)addr & PAGE_MASK) == 0);
+	assert((size & PAGE_MASK) == 0);
+	assert(size > 0);
+	if (config_debug) {
+		/* Check the whole first page. */
+		size_t *p = (size_t *)addr;
+		for (size_t i = 0; i < PAGE / sizeof(size_t); i++) {
+			assert(p[i] == 0);
+		}
+		/*
+		 * And 4 spots within.  There's a tradeoff here; the larger
+		 * this number, the more likely it is that we'll catch a bug
+		 * where ehooks return a sparsely non-zero range.  But
+		 * increasing the number of checks also increases the number of
+		 * page faults in debug mode.  FreeBSD does much of their
+		 * day-to-day development work in debug mode, so we don't want
+		 * even the debug builds to be too slow.
+		 */
+		const size_t nchecks = 4;
+		assert(PAGE >= sizeof(size_t) * nchecks);
+		for (size_t i = 0; i < nchecks; ++i) {
+			assert(p[i * (size / sizeof(size_t) / nchecks)] == 0);
+		}
+	}
+}
+
+
+static inline void *
+ehooks_alloc(tsdn_t *tsdn, ehooks_t *ehooks, void *new_addr, size_t size,
+    size_t alignment, bool *zero, bool *commit) {
+	bool orig_zero = *zero;
+	void *ret;
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		ret = ehooks_default_alloc_impl(tsdn, new_addr, size,
+		    alignment, zero, commit, ehooks_ind_get(ehooks));
+	} else {
+		ehooks_pre_reentrancy(tsdn);
+		ret = extent_hooks->alloc(extent_hooks, new_addr, size,
+		    alignment, zero, commit, ehooks_ind_get(ehooks));
+		ehooks_post_reentrancy(tsdn);
+	}
+	assert(new_addr == NULL || ret == NULL || new_addr == ret);
+	assert(!orig_zero || *zero);
+	if (*zero && ret != NULL) {
+		ehooks_debug_zero_check(ret, size);
+	}
+	return ret;
+}
+
+static inline bool
+ehooks_dalloc(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size,
+    bool committed) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		return ehooks_default_dalloc_impl(addr, size);
+	} else if (extent_hooks->dalloc == NULL) {
+		return true;
+	} else {
+		ehooks_pre_reentrancy(tsdn);
+		bool err = extent_hooks->dalloc(extent_hooks, addr, size,
+		    committed, ehooks_ind_get(ehooks));
+		ehooks_post_reentrancy(tsdn);
+		return err;
+	}
+}
+
+static inline void
+ehooks_destroy(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size,
+    bool committed) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		ehooks_default_destroy_impl(addr, size);
+	} else if (extent_hooks->destroy == NULL) {
+		/* Do nothing. */
+	} else {
+		ehooks_pre_reentrancy(tsdn);
+		extent_hooks->destroy(extent_hooks, addr, size, committed,
+		    ehooks_ind_get(ehooks));
+		ehooks_post_reentrancy(tsdn);
+	}
+}
+
+static inline bool
+ehooks_commit(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size,
+    size_t offset, size_t length) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	bool err;
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		err = ehooks_default_commit_impl(addr, offset, length);
+	} else if (extent_hooks->commit == NULL) {
+		err = true;
+	} else {
+		ehooks_pre_reentrancy(tsdn);
+		err = extent_hooks->commit(extent_hooks, addr, size,
+		    offset, length, ehooks_ind_get(ehooks));
+		ehooks_post_reentrancy(tsdn);
+	}
+	if (!err) {
+		ehooks_debug_zero_check(addr, size);
+	}
+	return err;
+}
+
+static inline bool
+ehooks_decommit(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size,
+    size_t offset, size_t length) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		return ehooks_default_decommit_impl(addr, offset, length);
+	} else if (extent_hooks->decommit == NULL) {
+		return true;
+	} else {
+		ehooks_pre_reentrancy(tsdn);
+		bool err = extent_hooks->decommit(extent_hooks, addr, size,
+		    offset, length, ehooks_ind_get(ehooks));
+		ehooks_post_reentrancy(tsdn);
+		return err;
+	}
+}
+
+static inline bool
+ehooks_purge_lazy(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size,
+    size_t offset, size_t length) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+#ifdef PAGES_CAN_PURGE_LAZY
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		return ehooks_default_purge_lazy_impl(addr, offset, length);
+	}
+#endif
+	if (extent_hooks->purge_lazy == NULL) {
+		return true;
+	} else {
+		ehooks_pre_reentrancy(tsdn);
+		bool err = extent_hooks->purge_lazy(extent_hooks, addr, size,
+		    offset, length, ehooks_ind_get(ehooks));
+		ehooks_post_reentrancy(tsdn);
+		return err;
+	}
+}
+
+static inline bool
+ehooks_purge_forced(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size,
+    size_t offset, size_t length) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	/*
+	 * It would be correct to have an ehooks_debug_zero_check call at the end
+	 * of this function; purge_forced is required to zero.  But checking
+	 * would touch the page in question, which may have performance
+	 * consequences (imagine the hooks are using hugepages, with a global
+	 * zero page off).  Even in debug mode, it's usually a good idea to
+	 * avoid cases that can dramatically increase memory consumption.
+	 */
+#ifdef PAGES_CAN_PURGE_FORCED
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		return ehooks_default_purge_forced_impl(addr, offset, length);
+	}
+#endif
+	if (extent_hooks->purge_forced == NULL) {
+		return true;
+	} else {
+		ehooks_pre_reentrancy(tsdn);
+		bool err = extent_hooks->purge_forced(extent_hooks, addr, size,
+		    offset, length, ehooks_ind_get(ehooks));
+		ehooks_post_reentrancy(tsdn);
+		return err;
+	}
+}
+
+static inline bool
+ehooks_split(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size,
+    size_t size_a, size_t size_b, bool committed) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	if (ehooks_are_default(ehooks)) {
+		return ehooks_default_split_impl();
+	} else if (extent_hooks->split == NULL) {
+		return true;
+	} else {
+		ehooks_pre_reentrancy(tsdn);
+		bool err = extent_hooks->split(extent_hooks, addr, size, size_a,
+		    size_b, committed, ehooks_ind_get(ehooks));
+		ehooks_post_reentrancy(tsdn);
+		return err;
+	}
+}
+
+static inline bool
+ehooks_merge(tsdn_t *tsdn, ehooks_t *ehooks, void *addr_a, size_t size_a,
+    void *addr_b, size_t size_b, bool committed) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		return ehooks_default_merge_impl(tsdn, addr_a, addr_b);
+	} else if (extent_hooks->merge == NULL) {
+		return true;
+	} else {
+		ehooks_pre_reentrancy(tsdn);
+		bool err = extent_hooks->merge(extent_hooks, addr_a, size_a,
+		    addr_b, size_b, committed, ehooks_ind_get(ehooks));
+		ehooks_post_reentrancy(tsdn);
+		return err;
+	}
+}
+
+static inline void
+ehooks_zero(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		ehooks_default_zero_impl(addr, size);
+	} else {
+		/*
+		 * It would be correct to try using the user-provided purge
+		 * hooks (since they are required to have zeroed the extent if
+		 * they indicate success), but we don't necessarily know their
+		 * cost.  We'll be conservative and use memset.
+		 */
+		memset(addr, 0, size);
+	}
+}
+
+static inline bool
+ehooks_guard(tsdn_t *tsdn, ehooks_t *ehooks, void *guard1, void *guard2) {
+	bool err;
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		ehooks_default_guard_impl(guard1, guard2);
+		err = false;
+	} else {
+		err = true;
+	}
+
+	return err;
+}
+
+static inline bool
+ehooks_unguard(tsdn_t *tsdn, ehooks_t *ehooks, void *guard1, void *guard2) {
+	bool err;
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		ehooks_default_unguard_impl(guard1, guard2);
+		err = false;
+	} else {
+		err = true;
+	}
+
+	return err;
+}
+
+#endif /* JEMALLOC_INTERNAL_EHOOKS_H */
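A quick note on the dispatch pattern above: a user hook is only called when it is non-NULL, and a missing optional hook (dalloc, commit, split, merge, ...) simply makes the wrapper report failure (or do nothing, for destroy), so the extent is retained rather than handed back. The following sketch is illustrative only; it assumes the public extent_hooks_t layout and hook signatures documented in jemalloc.h, which are not part of this header.

#include <stdbool.h>
#include <stddef.h>
#include <sys/mman.h>
#include <jemalloc/jemalloc.h>

/* Custom alloc hook backed by plain mmap.  Alignment handling beyond page
 * alignment is elided to keep the sketch short. */
static void *
my_extent_alloc(extent_hooks_t *hooks, void *new_addr, size_t size,
    size_t alignment, bool *zero, bool *commit, unsigned arena_ind) {
	(void)hooks; (void)alignment; (void)arena_ind;
	if (new_addr != NULL) {
		return NULL;	/* This sketch cannot satisfy placement requests. */
	}
	void *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		return NULL;
	}
	*zero = true;	/* Fresh anonymous mappings are zero-filled. */
	*commit = true;
	return p;
}

/* Each NULL entry makes the corresponding ehooks_* wrapper above report
 * failure, so unneeded hooks can simply be omitted. */
static extent_hooks_t my_hooks = {
	.alloc = my_extent_alloc,
	.dalloc = NULL,
	.destroy = NULL,
	.commit = NULL,
	.decommit = NULL,
	.purge_lazy = NULL,
	.purge_forced = NULL,
	.split = NULL,
	.merge = NULL,
};

A table like this is normally installed through the arena.<i>.extent_hooks mallctl; that path is outside the scope of this header.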

+ 357 - 0
BeefRT/JEMalloc/include/jemalloc/internal/emap.h

@@ -0,0 +1,357 @@
+#ifndef JEMALLOC_INTERNAL_EMAP_H
+#define JEMALLOC_INTERNAL_EMAP_H
+
+#include "jemalloc/internal/base.h"
+#include "jemalloc/internal/rtree.h"
+
+/*
+ * Note: Ends without a semicolon, so that
+ *     EMAP_DECLARE_RTREE_CTX;
+ * in uses will avoid empty-statement warnings.
+ */
+#define EMAP_DECLARE_RTREE_CTX						\
+    rtree_ctx_t rtree_ctx_fallback;					\
+    rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback)
+
+typedef struct emap_s emap_t;
+struct emap_s {
+	rtree_t rtree;
+};
+
+/* Used to pass rtree lookup context down the path. */
+typedef struct emap_alloc_ctx_t emap_alloc_ctx_t;
+struct emap_alloc_ctx_t {
+	szind_t szind;
+	bool slab;
+};
+
+typedef struct emap_full_alloc_ctx_s emap_full_alloc_ctx_t;
+struct emap_full_alloc_ctx_s {
+	szind_t szind;
+	bool slab;
+	edata_t *edata;
+};
+
+bool emap_init(emap_t *emap, base_t *base, bool zeroed);
+
+void emap_remap(tsdn_t *tsdn, emap_t *emap, edata_t *edata, szind_t szind,
+    bool slab);
+
+void emap_update_edata_state(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
+    extent_state_t state);
+
+/*
+ * The two acquire functions below allow accessing neighbor edatas, if it's safe
+ * and valid to do so (i.e. from the same arena, of the same state, etc.).  This
+ * is necessary because the ecache locks are state based, and only protect
+ * edatas with the same state.  Therefore the neighbor edata's state needs to be
+ * verified first, before chasing the edata pointer.  The returned edata will be
+ * in an acquired state, meaning other threads will be prevented from accessing
+ * it, even if technically the edata can still be discovered from the rtree.
+ *
+ * This means, at any moment when holding pointers to edata, either one of the
+ * state based locks is held (and the edatas are all of the protected state), or
+ * the edatas are in an acquired state (e.g. in active or merging state).  The
+ * acquire operation itself (changing the edata to an acquired state) is done
+ * under the state locks.
+ */
+edata_t *emap_try_acquire_edata_neighbor(tsdn_t *tsdn, emap_t *emap,
+    edata_t *edata, extent_pai_t pai, extent_state_t expected_state,
+    bool forward);
+edata_t *emap_try_acquire_edata_neighbor_expand(tsdn_t *tsdn, emap_t *emap,
+    edata_t *edata, extent_pai_t pai, extent_state_t expected_state);
+void emap_release_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
+    extent_state_t new_state);
+
+/*
+ * Associate the given edata with its beginning and end address, setting the
+ * szind and slab info appropriately.
+ * Returns true on error (i.e. resource exhaustion).
+ */
+bool emap_register_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
+    szind_t szind, bool slab);
+
+/*
+ * Does the same thing, but with the interior of the range, for slab
+ * allocations.
+ *
+ * You might wonder why we don't just have a single emap_register function that
+ * does both depending on the value of 'slab'.  The answer is twofold:
+ * - As a practical matter, in places like the extract->split->commit pathway,
+ *   we defer the interior operation until we're sure that the commit won't fail
+ *   (but we have to register the split boundaries there).
+ * - In general, we're trying to move to a world where the page-specific
+ *   allocator doesn't know as much about how the pages it allocates will be
+ *   used, and passing a 'slab' parameter everywhere makes that more
+ *   complicated.
+ *
+ * Unlike the boundary version, this function can't fail; this is because slabs
+ * can't get big enough to touch a new page that neither of the boundaries
+ * touched, so no allocation is necessary to fill the interior once the boundary
+ * has been touched.
+ */
+void emap_register_interior(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
+    szind_t szind);
+
+void emap_deregister_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
+void emap_deregister_interior(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
+
+typedef struct emap_prepare_s emap_prepare_t;
+struct emap_prepare_s {
+	rtree_leaf_elm_t *lead_elm_a;
+	rtree_leaf_elm_t *lead_elm_b;
+	rtree_leaf_elm_t *trail_elm_a;
+	rtree_leaf_elm_t *trail_elm_b;
+};
+
+/**
+ * These functions do the emap metadata management for merging, splitting, and
+ * reusing extents.  In particular, they set the boundary mappings from
+ * addresses to edatas.  If the result is going to be used as a slab, you
+ * still need to call emap_register_interior on it, though.
+ *
+ * Remap simply changes the szind and slab status of an extent's boundary
+ * mappings.  If the extent is not a slab, it doesn't bother with updating the
+ * end mapping (since lookups only occur in the interior of an extent for
+ * slabs).  Since the szind and slab status only make sense for active extents,
+ * this should only be called while activating or deactivating an extent.
+ *
+ * Split and merge have a "prepare" and a "commit" portion.  The prepare portion
+ * does the operations that can be done without exclusive access to the extent
+ * in question, while the commit variant requires exclusive access to maintain
+ * the emap invariants.  The only function that can fail is emap_split_prepare,
+ * and it returns true on failure (at which point the caller shouldn't commit).
+ *
+ * In all cases, "lead" refers to the lower-addressed extent, and trail to the
+ * higher-addressed one.  It's the caller's responsibility to set the edata
+ * state appropriately.
+ */
+bool emap_split_prepare(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare,
+    edata_t *edata, size_t size_a, edata_t *trail, size_t size_b);
+void emap_split_commit(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare,
+    edata_t *lead, size_t size_a, edata_t *trail, size_t size_b);
+void emap_merge_prepare(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare,
+    edata_t *lead, edata_t *trail);
+void emap_merge_commit(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare,
+    edata_t *lead, edata_t *trail);
+
+/* Assert that the emap's view of the given edata matches the edata's view. */
+void emap_do_assert_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
+static inline void
+emap_assert_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
+	if (config_debug) {
+		emap_do_assert_mapped(tsdn, emap, edata);
+	}
+}
+
+/* Assert that the given edata isn't in the map. */
+void emap_do_assert_not_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
+static inline void
+emap_assert_not_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
+	if (config_debug) {
+		emap_do_assert_not_mapped(tsdn, emap, edata);
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+emap_edata_in_transition(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
+	assert(config_debug);
+	emap_assert_mapped(tsdn, emap, edata);
+
+	EMAP_DECLARE_RTREE_CTX;
+	rtree_contents_t contents = rtree_read(tsdn, &emap->rtree, rtree_ctx,
+	    (uintptr_t)edata_base_get(edata));
+
+	return edata_state_in_transition(contents.metadata.state);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+emap_edata_is_acquired(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
+	if (!config_debug) {
+		/* For assertions only. */
+		return false;
+	}
+
+	/*
+	 * The edata is considered acquired if no other threads will attempt to
+	 * read / write any fields from it.  This includes a few cases:
+	 *
+	 * 1) edata not hooked into emap yet -- This implies the edata just got
+	 * allocated or initialized.
+	 *
+	 * 2) in an active or transition state -- In both cases, the edata can
+	 * be discovered from the emap, however the state tracked in the rtree
+	 * will prevent other threads from accessing the actual edata.
+	 */
+	EMAP_DECLARE_RTREE_CTX;
+	rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, &emap->rtree,
+	    rtree_ctx, (uintptr_t)edata_base_get(edata), /* dependent */ true,
+	    /* init_missing */ false);
+	if (elm == NULL) {
+		return true;
+	}
+	rtree_contents_t contents = rtree_leaf_elm_read(tsdn, &emap->rtree, elm,
+	    /* dependent */ true);
+	if (contents.edata == NULL ||
+	    contents.metadata.state == extent_state_active ||
+	    edata_state_in_transition(contents.metadata.state)) {
+		return true;
+	}
+
+	return false;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+extent_assert_can_coalesce(const edata_t *inner, const edata_t *outer) {
+	assert(edata_arena_ind_get(inner) == edata_arena_ind_get(outer));
+	assert(edata_pai_get(inner) == edata_pai_get(outer));
+	assert(edata_committed_get(inner) == edata_committed_get(outer));
+	assert(edata_state_get(inner) == extent_state_active);
+	assert(edata_state_get(outer) == extent_state_merging);
+	assert(!edata_guarded_get(inner) && !edata_guarded_get(outer));
+	assert(edata_base_get(inner) == edata_past_get(outer) ||
+	    edata_base_get(outer) == edata_past_get(inner));
+}
+
+JEMALLOC_ALWAYS_INLINE void
+extent_assert_can_expand(const edata_t *original, const edata_t *expand) {
+	assert(edata_arena_ind_get(original) == edata_arena_ind_get(expand));
+	assert(edata_pai_get(original) == edata_pai_get(expand));
+	assert(edata_state_get(original) == extent_state_active);
+	assert(edata_state_get(expand) == extent_state_merging);
+	assert(edata_past_get(original) == edata_base_get(expand));
+}
+
+JEMALLOC_ALWAYS_INLINE edata_t *
+emap_edata_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr) {
+	EMAP_DECLARE_RTREE_CTX;
+
+	return rtree_read(tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr).edata;
+}
+
+/* Fills in alloc_ctx with the info in the map. */
+JEMALLOC_ALWAYS_INLINE void
+emap_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
+    emap_alloc_ctx_t *alloc_ctx) {
+	EMAP_DECLARE_RTREE_CTX;
+
+	rtree_metadata_t metadata = rtree_metadata_read(tsdn, &emap->rtree,
+	    rtree_ctx, (uintptr_t)ptr);
+	alloc_ctx->szind = metadata.szind;
+	alloc_ctx->slab = metadata.slab;
+}
+
+/* The pointer must be mapped. */
+JEMALLOC_ALWAYS_INLINE void
+emap_full_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
+    emap_full_alloc_ctx_t *full_alloc_ctx) {
+	EMAP_DECLARE_RTREE_CTX;
+
+	rtree_contents_t contents = rtree_read(tsdn, &emap->rtree, rtree_ctx,
+	    (uintptr_t)ptr);
+	full_alloc_ctx->edata = contents.edata;
+	full_alloc_ctx->szind = contents.metadata.szind;
+	full_alloc_ctx->slab = contents.metadata.slab;
+}
+
+/*
+ * The pointer is allowed to not be mapped.
+ *
+ * Returns true when the pointer is not present.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+emap_full_alloc_ctx_try_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
+    emap_full_alloc_ctx_t *full_alloc_ctx) {
+	EMAP_DECLARE_RTREE_CTX;
+
+	rtree_contents_t contents;
+	bool err = rtree_read_independent(tsdn, &emap->rtree, rtree_ctx,
+	    (uintptr_t)ptr, &contents);
+	if (err) {
+		return true;
+	}
+	full_alloc_ctx->edata = contents.edata;
+	full_alloc_ctx->szind = contents.metadata.szind;
+	full_alloc_ctx->slab = contents.metadata.slab;
+	return false;
+}
+
+/*
+ * Only used on the fast path of free.  Returns true when the lookup cannot be
+ * fulfilled by the fast path, e.g. when the metadata key is not cached.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+emap_alloc_ctx_try_lookup_fast(tsd_t *tsd, emap_t *emap, const void *ptr,
+    emap_alloc_ctx_t *alloc_ctx) {
+	/* Use the unsafe getter since this may get called during exit. */
+	rtree_ctx_t *rtree_ctx = tsd_rtree_ctxp_get_unsafe(tsd);
+
+	rtree_metadata_t metadata;
+	bool err = rtree_metadata_try_read_fast(tsd_tsdn(tsd), &emap->rtree,
+	    rtree_ctx, (uintptr_t)ptr, &metadata);
+	if (err) {
+		return true;
+	}
+	alloc_ctx->szind = metadata.szind;
+	alloc_ctx->slab = metadata.slab;
+	return false;
+}
+
+/*
+ * We want to do batch lookups out of the cache bins, which use
+ * cache_bin_ptr_array_get to access the i'th element of the bin (since they
+ * invert the usual ordering in deciding what to flush).  This lets the emap
+ * avoid caring about its caller's ordering.
+ */
+typedef const void *(*emap_ptr_getter)(void *ctx, size_t ind);
+/*
+ * This allows size-checking assertions, which we can only do while we're in the
+ * process of edata lookups.
+ */
+typedef void (*emap_metadata_visitor)(void *ctx, emap_full_alloc_ctx_t *alloc_ctx);
+
+typedef union emap_batch_lookup_result_u emap_batch_lookup_result_t;
+union emap_batch_lookup_result_u {
+	edata_t *edata;
+	rtree_leaf_elm_t *rtree_leaf;
+};
+
+JEMALLOC_ALWAYS_INLINE void
+emap_edata_lookup_batch(tsd_t *tsd, emap_t *emap, size_t nptrs,
+    emap_ptr_getter ptr_getter, void *ptr_getter_ctx,
+    emap_metadata_visitor metadata_visitor, void *metadata_visitor_ctx,
+    emap_batch_lookup_result_t *result) {
+	/* Avoids null-checking tsdn in the loop below. */
+	util_assume(tsd != NULL);
+	rtree_ctx_t *rtree_ctx = tsd_rtree_ctxp_get(tsd);
+
+	for (size_t i = 0; i < nptrs; i++) {
+		const void *ptr = ptr_getter(ptr_getter_ctx, i);
+		/*
+		 * Reuse the edatas array as a temp buffer, lying a little about
+		 * the types.
+		 */
+		result[i].rtree_leaf = rtree_leaf_elm_lookup(tsd_tsdn(tsd),
+		    &emap->rtree, rtree_ctx, (uintptr_t)ptr,
+		    /* dependent */ true, /* init_missing */ false);
+	}
+
+	for (size_t i = 0; i < nptrs; i++) {
+		rtree_leaf_elm_t *elm = result[i].rtree_leaf;
+		rtree_contents_t contents = rtree_leaf_elm_read(tsd_tsdn(tsd),
+		    &emap->rtree, elm, /* dependent */ true);
+		result[i].edata = contents.edata;
+		emap_full_alloc_ctx_t alloc_ctx;
+		/*
+		 * Not all these fields are read in practice by the metadata
+		 * visitor.  But the compiler can easily optimize away the ones
+		 * that aren't, so no sense in being incomplete.
+		 */
+		alloc_ctx.szind = contents.metadata.szind;
+		alloc_ctx.slab = contents.metadata.slab;
+		alloc_ctx.edata = contents.edata;
+		metadata_visitor(metadata_visitor_ctx, &alloc_ctx);
+	}
+}
+
+#endif /* JEMALLOC_INTERNAL_EMAP_H */
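To make the prepare/commit contract concrete, here is a minimal sketch of a caller splitting an extent's boundary mappings. The helper name is hypothetical; only the emap_* calls come from this header, and the surrounding edata bookkeeping (allocating the trail, setting states) is elided.

/* Sketch: split lead's mappings into a size_a lead and a size_b trail.
 * Returns true on failure, following the convention used above. */
static bool
emap_split_sketch(tsdn_t *tsdn, emap_t *emap, edata_t *lead, edata_t *trail,
    size_t size_a, size_t size_b) {
	emap_prepare_t prepare;
	/* Prepare is the only step that can fail (rtree leaf allocation). */
	if (emap_split_prepare(tsdn, emap, &prepare, lead, size_a, trail,
	    size_b)) {
		return true;	/* Caller must not commit. */
	}
	/* Commit cannot fail; it installs the new boundary mappings. */
	emap_split_commit(tsdn, emap, &prepare, lead, size_a, trail, size_b);
	/* If either piece will be used as a slab, the caller still has to call
	 * emap_register_interior on it. */
	return false;
}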

+ 510 - 0
BeefRT/JEMalloc/include/jemalloc/internal/emitter.h

@@ -0,0 +1,510 @@
+#ifndef JEMALLOC_INTERNAL_EMITTER_H
+#define JEMALLOC_INTERNAL_EMITTER_H
+
+#include "jemalloc/internal/ql.h"
+
+typedef enum emitter_output_e emitter_output_t;
+enum emitter_output_e {
+	emitter_output_json,
+	emitter_output_json_compact,
+	emitter_output_table
+};
+
+typedef enum emitter_justify_e emitter_justify_t;
+enum emitter_justify_e {
+	emitter_justify_left,
+	emitter_justify_right,
+	/* Not for users; just to pass to internal functions. */
+	emitter_justify_none
+};
+
+typedef enum emitter_type_e emitter_type_t;
+enum emitter_type_e {
+	emitter_type_bool,
+	emitter_type_int,
+	emitter_type_int64,
+	emitter_type_unsigned,
+	emitter_type_uint32,
+	emitter_type_uint64,
+	emitter_type_size,
+	emitter_type_ssize,
+	emitter_type_string,
+	/*
+	 * A title is a column title in a table; it's just a string, but it's
+	 * not quoted.
+	 */
+	emitter_type_title,
+};
+
+typedef struct emitter_col_s emitter_col_t;
+struct emitter_col_s {
+	/* Filled in by the user. */
+	emitter_justify_t justify;
+	int width;
+	emitter_type_t type;
+	union {
+		bool bool_val;
+		int int_val;
+		unsigned unsigned_val;
+		uint32_t uint32_val;
+		uint32_t uint32_t_val;
+		uint64_t uint64_val;
+		uint64_t uint64_t_val;
+		size_t size_val;
+		ssize_t ssize_val;
+		const char *str_val;
+	};
+
+	/* Filled in by initialization. */
+	ql_elm(emitter_col_t) link;
+};
+
+typedef struct emitter_row_s emitter_row_t;
+struct emitter_row_s {
+	ql_head(emitter_col_t) cols;
+};
+
+typedef struct emitter_s emitter_t;
+struct emitter_s {
+	emitter_output_t output;
+	/* The output information. */
+	write_cb_t *write_cb;
+	void *cbopaque;
+	int nesting_depth;
+	/* True if we've already emitted a value at the given depth. */
+	bool item_at_depth;
+	/* True if we emitted a key and will emit corresponding value next. */
+	bool emitted_key;
+};
+
+static inline bool
+emitter_outputs_json(emitter_t *emitter) {
+	return emitter->output == emitter_output_json ||
+	    emitter->output == emitter_output_json_compact;
+}
+
+/* Internal convenience function.  Write to the emitter the given string. */
+JEMALLOC_FORMAT_PRINTF(2, 3)
+static inline void
+emitter_printf(emitter_t *emitter, const char *format, ...) {
+	va_list ap;
+
+	va_start(ap, format);
+	malloc_vcprintf(emitter->write_cb, emitter->cbopaque, format, ap);
+	va_end(ap);
+}
+
+static inline const char * JEMALLOC_FORMAT_ARG(3)
+emitter_gen_fmt(char *out_fmt, size_t out_size, const char *fmt_specifier,
+    emitter_justify_t justify, int width) {
+	size_t written;
+	fmt_specifier++;
+	if (justify == emitter_justify_none) {
+		written = malloc_snprintf(out_fmt, out_size,
+		    "%%%s", fmt_specifier);
+	} else if (justify == emitter_justify_left) {
+		written = malloc_snprintf(out_fmt, out_size,
+		    "%%-%d%s", width, fmt_specifier);
+	} else {
+		written = malloc_snprintf(out_fmt, out_size,
+		    "%%%d%s", width, fmt_specifier);
+	}
+	/* Only happens in case of bad format string, which *we* choose. */
+	assert(written <  out_size);
+	return out_fmt;
+}
+
+/*
+ * Internal.  Emit the given value type in the relevant encoding (so that the
+ * bool true gets mapped to json "true", but the string "true" gets mapped to
+ * json "\"true\"", for instance).
+ *
+ * Width is ignored if justify is emitter_justify_none.
+ */
+static inline void
+emitter_print_value(emitter_t *emitter, emitter_justify_t justify, int width,
+    emitter_type_t value_type, const void *value) {
+	size_t str_written;
+#define BUF_SIZE 256
+#define FMT_SIZE 10
+	/*
+	 * We dynamically generate a format string to emit, to let us use the
+	 * snprintf machinery.  This is kinda hacky, but gets the job done
+	 * quickly without having to think about the various snprintf edge
+	 * cases.
+	 */
+	char fmt[FMT_SIZE];
+	char buf[BUF_SIZE];
+
+#define EMIT_SIMPLE(type, format)					\
+	emitter_printf(emitter,						\
+	    emitter_gen_fmt(fmt, FMT_SIZE, format, justify, width),	\
+	    *(const type *)value);
+
+	switch (value_type) {
+	case emitter_type_bool:
+		emitter_printf(emitter,
+		    emitter_gen_fmt(fmt, FMT_SIZE, "%s", justify, width),
+		    *(const bool *)value ?  "true" : "false");
+		break;
+	case emitter_type_int:
+		EMIT_SIMPLE(int, "%d")
+		break;
+	case emitter_type_int64:
+		EMIT_SIMPLE(int64_t, "%" FMTd64)
+		break;
+	case emitter_type_unsigned:
+		EMIT_SIMPLE(unsigned, "%u")
+		break;
+	case emitter_type_ssize:
+		EMIT_SIMPLE(ssize_t, "%zd")
+		break;
+	case emitter_type_size:
+		EMIT_SIMPLE(size_t, "%zu")
+		break;
+	case emitter_type_string:
+		str_written = malloc_snprintf(buf, BUF_SIZE, "\"%s\"",
+		    *(const char *const *)value);
+		/*
+		 * We control the strings we output; we shouldn't get anything
+		 * anywhere near the fmt size.
+		 */
+		assert(str_written < BUF_SIZE);
+		emitter_printf(emitter,
+		    emitter_gen_fmt(fmt, FMT_SIZE, "%s", justify, width), buf);
+		break;
+	case emitter_type_uint32:
+		EMIT_SIMPLE(uint32_t, "%" FMTu32)
+		break;
+	case emitter_type_uint64:
+		EMIT_SIMPLE(uint64_t, "%" FMTu64)
+		break;
+	case emitter_type_title:
+		EMIT_SIMPLE(char *const, "%s");
+		break;
+	default:
+		unreachable();
+	}
+#undef BUF_SIZE
+#undef FMT_SIZE
+}
+
+
+/* Internal functions.  In json mode, tracks nesting state. */
+static inline void
+emitter_nest_inc(emitter_t *emitter) {
+	emitter->nesting_depth++;
+	emitter->item_at_depth = false;
+}
+
+static inline void
+emitter_nest_dec(emitter_t *emitter) {
+	emitter->nesting_depth--;
+	emitter->item_at_depth = true;
+}
+
+static inline void
+emitter_indent(emitter_t *emitter) {
+	int amount = emitter->nesting_depth;
+	const char *indent_str;
+	assert(emitter->output != emitter_output_json_compact);
+	if (emitter->output == emitter_output_json) {
+		indent_str = "\t";
+	} else {
+		amount *= 2;
+		indent_str = " ";
+	}
+	for (int i = 0; i < amount; i++) {
+		emitter_printf(emitter, "%s", indent_str);
+	}
+}
+
+static inline void
+emitter_json_key_prefix(emitter_t *emitter) {
+	assert(emitter_outputs_json(emitter));
+	if (emitter->emitted_key) {
+		emitter->emitted_key = false;
+		return;
+	}
+	if (emitter->item_at_depth) {
+		emitter_printf(emitter, ",");
+	}
+	if (emitter->output != emitter_output_json_compact) {
+		emitter_printf(emitter, "\n");
+		emitter_indent(emitter);
+	}
+}
+
+/******************************************************************************/
+/* Public functions for emitter_t. */
+
+static inline void
+emitter_init(emitter_t *emitter, emitter_output_t emitter_output,
+    write_cb_t *write_cb, void *cbopaque) {
+	emitter->output = emitter_output;
+	emitter->write_cb = write_cb;
+	emitter->cbopaque = cbopaque;
+	emitter->item_at_depth = false;
+	emitter->emitted_key = false;
+	emitter->nesting_depth = 0;
+}
+
+/******************************************************************************/
+/* JSON public API. */
+
+/*
+ * Emits a key (e.g. as it appears in an object).  The next json entity emitted
+ * will be the corresponding value.
+ */
+static inline void
+emitter_json_key(emitter_t *emitter, const char *json_key) {
+	if (emitter_outputs_json(emitter)) {
+		emitter_json_key_prefix(emitter);
+		emitter_printf(emitter, "\"%s\":%s", json_key,
+		    emitter->output == emitter_output_json_compact ? "" : " ");
+		emitter->emitted_key = true;
+	}
+}
+
+static inline void
+emitter_json_value(emitter_t *emitter, emitter_type_t value_type,
+    const void *value) {
+	if (emitter_outputs_json(emitter)) {
+		emitter_json_key_prefix(emitter);
+		emitter_print_value(emitter, emitter_justify_none, -1,
+		    value_type, value);
+		emitter->item_at_depth = true;
+	}
+}
+
+/* Shorthand for calling emitter_json_key and then emitter_json_value. */
+static inline void
+emitter_json_kv(emitter_t *emitter, const char *json_key,
+    emitter_type_t value_type, const void *value) {
+	emitter_json_key(emitter, json_key);
+	emitter_json_value(emitter, value_type, value);
+}
+
+static inline void
+emitter_json_array_begin(emitter_t *emitter) {
+	if (emitter_outputs_json(emitter)) {
+		emitter_json_key_prefix(emitter);
+		emitter_printf(emitter, "[");
+		emitter_nest_inc(emitter);
+	}
+}
+
+/* Shorthand for calling emitter_json_key and then emitter_json_array_begin. */
+static inline void
+emitter_json_array_kv_begin(emitter_t *emitter, const char *json_key) {
+	emitter_json_key(emitter, json_key);
+	emitter_json_array_begin(emitter);
+}
+
+static inline void
+emitter_json_array_end(emitter_t *emitter) {
+	if (emitter_outputs_json(emitter)) {
+		assert(emitter->nesting_depth > 0);
+		emitter_nest_dec(emitter);
+		if (emitter->output != emitter_output_json_compact) {
+			emitter_printf(emitter, "\n");
+			emitter_indent(emitter);
+		}
+		emitter_printf(emitter, "]");
+	}
+}
+
+static inline void
+emitter_json_object_begin(emitter_t *emitter) {
+	if (emitter_outputs_json(emitter)) {
+		emitter_json_key_prefix(emitter);
+		emitter_printf(emitter, "{");
+		emitter_nest_inc(emitter);
+	}
+}
+
+/* Shorthand for calling emitter_json_key and then emitter_json_object_begin. */
+static inline void
+emitter_json_object_kv_begin(emitter_t *emitter, const char *json_key) {
+	emitter_json_key(emitter, json_key);
+	emitter_json_object_begin(emitter);
+}
+
+static inline void
+emitter_json_object_end(emitter_t *emitter) {
+	if (emitter_outputs_json(emitter)) {
+		assert(emitter->nesting_depth > 0);
+		emitter_nest_dec(emitter);
+		if (emitter->output != emitter_output_json_compact) {
+			emitter_printf(emitter, "\n");
+			emitter_indent(emitter);
+		}
+		emitter_printf(emitter, "}");
+	}
+}
+
+
+/******************************************************************************/
+/* Table public API. */
+
+static inline void
+emitter_table_dict_begin(emitter_t *emitter, const char *table_key) {
+	if (emitter->output == emitter_output_table) {
+		emitter_indent(emitter);
+		emitter_printf(emitter, "%s\n", table_key);
+		emitter_nest_inc(emitter);
+	}
+}
+
+static inline void
+emitter_table_dict_end(emitter_t *emitter) {
+	if (emitter->output == emitter_output_table) {
+		emitter_nest_dec(emitter);
+	}
+}
+
+static inline void
+emitter_table_kv_note(emitter_t *emitter, const char *table_key,
+    emitter_type_t value_type, const void *value,
+    const char *table_note_key, emitter_type_t table_note_value_type,
+    const void *table_note_value) {
+	if (emitter->output == emitter_output_table) {
+		emitter_indent(emitter);
+		emitter_printf(emitter, "%s: ", table_key);
+		emitter_print_value(emitter, emitter_justify_none, -1,
+		    value_type, value);
+		if (table_note_key != NULL) {
+			emitter_printf(emitter, " (%s: ", table_note_key);
+			emitter_print_value(emitter, emitter_justify_none, -1,
+			    table_note_value_type, table_note_value);
+			emitter_printf(emitter, ")");
+		}
+		emitter_printf(emitter, "\n");
+	}
+	emitter->item_at_depth = true;
+}
+
+static inline void
+emitter_table_kv(emitter_t *emitter, const char *table_key,
+    emitter_type_t value_type, const void *value) {
+	emitter_table_kv_note(emitter, table_key, value_type, value, NULL,
+	    emitter_type_bool, NULL);
+}
+
+
+/* Write to the emitter the given string, but only in table mode. */
+JEMALLOC_FORMAT_PRINTF(2, 3)
+static inline void
+emitter_table_printf(emitter_t *emitter, const char *format, ...) {
+	if (emitter->output == emitter_output_table) {
+		va_list ap;
+		va_start(ap, format);
+		malloc_vcprintf(emitter->write_cb, emitter->cbopaque, format, ap);
+		va_end(ap);
+	}
+}
+
+static inline void
+emitter_table_row(emitter_t *emitter, emitter_row_t *row) {
+	if (emitter->output != emitter_output_table) {
+		return;
+	}
+	emitter_col_t *col;
+	ql_foreach(col, &row->cols, link) {
+		emitter_print_value(emitter, col->justify, col->width,
+		    col->type, (const void *)&col->bool_val);
+	}
+	emitter_table_printf(emitter, "\n");
+}
+
+static inline void
+emitter_row_init(emitter_row_t *row) {
+	ql_new(&row->cols);
+}
+
+static inline void
+emitter_col_init(emitter_col_t *col, emitter_row_t *row) {
+	ql_elm_new(col, link);
+	ql_tail_insert(&row->cols, col, link);
+}
+
+
+/******************************************************************************/
+/*
+ * Generalized public API.  Emits using either JSON or table, according to
+ * settings in the emitter_t.
+ */
+
+/*
+ * The note emits an additional kv pair, but only in table mode.  The note is
+ * omitted if table_note_key is NULL.
+ */
+static inline void
+emitter_kv_note(emitter_t *emitter, const char *json_key, const char *table_key,
+    emitter_type_t value_type, const void *value,
+    const char *table_note_key, emitter_type_t table_note_value_type,
+    const void *table_note_value) {
+	if (emitter_outputs_json(emitter)) {
+		emitter_json_key(emitter, json_key);
+		emitter_json_value(emitter, value_type, value);
+	} else {
+		emitter_table_kv_note(emitter, table_key, value_type, value,
+		    table_note_key, table_note_value_type, table_note_value);
+	}
+	emitter->item_at_depth = true;
+}
+
+static inline void
+emitter_kv(emitter_t *emitter, const char *json_key, const char *table_key,
+    emitter_type_t value_type, const void *value) {
+	emitter_kv_note(emitter, json_key, table_key, value_type, value, NULL,
+	    emitter_type_bool, NULL);
+}
+
+static inline void
+emitter_dict_begin(emitter_t *emitter, const char *json_key,
+    const char *table_header) {
+	if (emitter_outputs_json(emitter)) {
+		emitter_json_key(emitter, json_key);
+		emitter_json_object_begin(emitter);
+	} else {
+		emitter_table_dict_begin(emitter, table_header);
+	}
+}
+
+static inline void
+emitter_dict_end(emitter_t *emitter) {
+	if (emitter_outputs_json(emitter)) {
+		emitter_json_object_end(emitter);
+	} else {
+		emitter_table_dict_end(emitter);
+	}
+}
+
+static inline void
+emitter_begin(emitter_t *emitter) {
+	if (emitter_outputs_json(emitter)) {
+		assert(emitter->nesting_depth == 0);
+		emitter_printf(emitter, "{");
+		emitter_nest_inc(emitter);
+	} else {
+		/*
+		 * This guarantees that we always call write_cb at least once.
+		 * This is useful if some invariant is established by each call
+		 * to write_cb, but doesn't hold initially: e.g., some buffer
+		 * holds a null-terminated string.
+		 */
+		emitter_printf(emitter, "%s", "");
+	}
+}
+
+static inline void
+emitter_end(emitter_t *emitter) {
+	if (emitter_outputs_json(emitter)) {
+		assert(emitter->nesting_depth == 1);
+		emitter_nest_dec(emitter);
+		emitter_printf(emitter, "%s", emitter->output ==
+		    emitter_output_json_compact ? "}" : "\n}\n");
+	}
+}
+
+#endif /* JEMALLOC_INTERNAL_EMITTER_H */
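A short usage sketch of the emitter follows. The example function is hypothetical and assumes write_cb_t has the (void *cbopaque, const char *s) shape used by jemalloc's write callbacks. In JSON mode the calls below produce the equivalent of {"version": "5.3.0", "stats": {"active": 123}}; in table mode, a "Version" line followed by an indented "Statistics" section.

#include <stdio.h>

static void
example_write_cb(void *cbopaque, const char *s) {
	fputs(s, (FILE *)cbopaque);	/* cbopaque is assumed to be a FILE *. */
}

static void
emit_example(FILE *out, emitter_output_t mode) {
	emitter_t emitter;
	emitter_init(&emitter, mode, example_write_cb, out);
	emitter_begin(&emitter);

	const char *version = "5.3.0";
	emitter_kv(&emitter, "version", "Version", emitter_type_string,
	    &version);

	/* Nested dict: a JSON object in JSON mode, an indented section in
	 * table mode. */
	emitter_dict_begin(&emitter, "stats", "Statistics");
	size_t active = 123;
	emitter_kv(&emitter, "active", "active", emitter_type_size, &active);
	emitter_dict_end(&emitter);

	emitter_end(&emitter);
}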

+ 77 - 0
BeefRT/JEMalloc/include/jemalloc/internal/eset.h

@@ -0,0 +1,77 @@
+#ifndef JEMALLOC_INTERNAL_ESET_H
+#define JEMALLOC_INTERNAL_ESET_H
+
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/fb.h"
+#include "jemalloc/internal/edata.h"
+#include "jemalloc/internal/mutex.h"
+
+/*
+ * An eset ("extent set") is a quantized collection of extents, with built-in
+ * LRU queue.
+ *
+ * This class is not thread-safe; synchronization must be done externally if
+ * there are mutating operations.  One exception is the stats counters, which
+ * may be read without any locking.
+ */
+
+typedef struct eset_bin_s eset_bin_t;
+struct eset_bin_s {
+	edata_heap_t heap;
+	/*
+	 * We do first-fit across multiple size classes.  If we compared against
+	 * the min element in each heap directly, we'd take a cache miss per
+	 * extent we looked at.  If we co-locate the edata summaries, we only
+	 * take a miss on the edata we're actually going to return (which is
+	 * inevitable anyways).
+	 */
+	edata_cmp_summary_t heap_min;
+};
+
+typedef struct eset_bin_stats_s eset_bin_stats_t;
+struct eset_bin_stats_s {
+	atomic_zu_t nextents;
+	atomic_zu_t nbytes;
+};
+
+typedef struct eset_s eset_t;
+struct eset_s {
+	/* Bitmap for which set bits correspond to non-empty heaps. */
+	fb_group_t bitmap[FB_NGROUPS(SC_NPSIZES + 1)];
+
+	/* Quantized per size class heaps of extents. */
+	eset_bin_t bins[SC_NPSIZES + 1];
+
+	eset_bin_stats_t bin_stats[SC_NPSIZES + 1];
+
+	/* LRU of all extents in heaps. */
+	edata_list_inactive_t lru;
+
+	/* Page sum for all extents in heaps. */
+	atomic_zu_t npages;
+
+	/*
+	 * A duplication of the data in the containing ecache.  We use this only
+	 * for assertions on the states of the passed-in extents.
+	 */
+	extent_state_t state;
+};
+
+void eset_init(eset_t *eset, extent_state_t state);
+
+size_t eset_npages_get(eset_t *eset);
+/* Get the number of extents in the given page size index. */
+size_t eset_nextents_get(eset_t *eset, pszind_t ind);
+/* Get the sum total bytes of the extents in the given page size index. */
+size_t eset_nbytes_get(eset_t *eset, pszind_t ind);
+
+void eset_insert(eset_t *eset, edata_t *edata);
+void eset_remove(eset_t *eset, edata_t *edata);
+/*
+ * Select an extent from this eset of the given size and alignment.  Returns
+ * null if no such item could be found.
+ */
+edata_t *eset_fit(eset_t *eset, size_t esize, size_t alignment, bool exact_only,
+    unsigned lg_max_fit);
+
+#endif /* JEMALLOC_INTERNAL_ESET_H */
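A hedged sketch of the intended call pattern: eset_fit only selects a candidate, so in this sketch the caller removes it explicitly, and the external synchronization mentioned above is assumed to be held. The value 6 for lg_max_fit mirrors LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT from extent.h below.

/* Sketch: first-fit extraction of an extent of at least esize bytes. */
static edata_t *
eset_take_sketch(eset_t *eset, size_t esize, size_t alignment) {
	/* exact_only=false permits returning a larger extent (which the caller
	 * may split); lg_max_fit bounds how much larger it may be. */
	edata_t *edata = eset_fit(eset, esize, alignment,
	    /* exact_only */ false, /* lg_max_fit */ 6);
	if (edata != NULL) {
		eset_remove(eset, edata);
	}
	return edata;
}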

+ 50 - 0
BeefRT/JEMalloc/include/jemalloc/internal/exp_grow.h

@@ -0,0 +1,50 @@
+#ifndef JEMALLOC_INTERNAL_EXP_GROW_H
+#define JEMALLOC_INTERNAL_EXP_GROW_H
+
+typedef struct exp_grow_s exp_grow_t;
+struct exp_grow_s {
+	/*
+	 * Next extent size class in a growing series to use when satisfying a
+	 * request via the extent hooks (only if opt_retain).  This limits the
+	 * number of disjoint virtual memory ranges so that extent merging can
+	 * be effective even if multiple arenas' extent allocation requests are
+	 * highly interleaved.
+	 *
+	 * retain_grow_limit is the max allowed size ind to expand (unless the
+	 * required size is greater).  Default is no limit, and controlled
+	 * through mallctl only.
+	 */
+	pszind_t next;
+	pszind_t limit;
+};
+
+static inline bool
+exp_grow_size_prepare(exp_grow_t *exp_grow, size_t alloc_size_min,
+    size_t *r_alloc_size, pszind_t *r_skip) {
+	*r_skip = 0;
+	*r_alloc_size = sz_pind2sz(exp_grow->next + *r_skip);
+	while (*r_alloc_size < alloc_size_min) {
+		(*r_skip)++;
+		if (exp_grow->next + *r_skip  >=
+		    sz_psz2ind(SC_LARGE_MAXCLASS)) {
+			/* Outside legal range. */
+			return true;
+		}
+		*r_alloc_size = sz_pind2sz(exp_grow->next + *r_skip);
+	}
+	return false;
+}
+
+static inline void
+exp_grow_size_commit(exp_grow_t *exp_grow, pszind_t skip) {
+	if (exp_grow->next + skip + 1 <= exp_grow->limit) {
+		exp_grow->next += skip + 1;
+	} else {
+		exp_grow->next = exp_grow->limit;
+	}
+
+}
+
+void exp_grow_init(exp_grow_t *exp_grow);
+
+#endif /* JEMALLOC_INTERNAL_EXP_GROW_H */
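The prepare/commit pair is meant to bracket the actual mapping attempt: prepare picks the size (and fails if the request exceeds the largest size class), and commit advances the series only once the allocation succeeded. A minimal sketch with the mapping step elided:

/* Sketch: choose the next size in the growth series for a request of at least
 * min_size bytes.  Returns 0 if min_size is beyond the largest size class. */
static size_t
exp_grow_pick_sketch(exp_grow_t *exp_grow, size_t min_size) {
	size_t alloc_size;
	pszind_t skip;
	if (exp_grow_size_prepare(exp_grow, min_size, &alloc_size, &skip)) {
		return 0;
	}
	/* ... map alloc_size bytes via the extent hooks here ... */
	/* Only advance the series once the mapping actually succeeded. */
	exp_grow_size_commit(exp_grow, skip);
	return alloc_size;
}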

+ 137 - 0
BeefRT/JEMalloc/include/jemalloc/internal/extent.h

@@ -0,0 +1,137 @@
+#ifndef JEMALLOC_INTERNAL_EXTENT_H
+#define JEMALLOC_INTERNAL_EXTENT_H
+
+#include "jemalloc/internal/ecache.h"
+#include "jemalloc/internal/ehooks.h"
+#include "jemalloc/internal/ph.h"
+#include "jemalloc/internal/rtree.h"
+
+/*
+ * This module contains the page-level allocator.  It chooses the addresses that
+ * allocations requested by other modules will inhabit, and updates the global
+ * metadata to reflect allocation/deallocation/purging decisions.
+ */
+
+/*
+ * When reusing (and splitting) an active extent, (1U <<
+ * opt_lg_extent_max_active_fit) is the max ratio between the size of the
+ * active extent and the new extent.
+ */
+#define LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT 6
+extern size_t opt_lg_extent_max_active_fit;
+
+edata_t *ecache_alloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    ecache_t *ecache, edata_t *expand_edata, size_t size, size_t alignment,
+    bool zero, bool guarded);
+edata_t *ecache_alloc_grow(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    ecache_t *ecache, edata_t *expand_edata, size_t size, size_t alignment,
+    bool zero, bool guarded);
+void ecache_dalloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    ecache_t *ecache, edata_t *edata);
+edata_t *ecache_evict(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    ecache_t *ecache, size_t npages_min);
+
+void extent_gdump_add(tsdn_t *tsdn, const edata_t *edata);
+void extent_record(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache,
+    edata_t *edata);
+void extent_dalloc_gap(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    edata_t *edata);
+edata_t *extent_alloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    void *new_addr, size_t size, size_t alignment, bool zero, bool *commit,
+    bool growing_retained);
+void extent_dalloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    edata_t *edata);
+void extent_destroy_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    edata_t *edata);
+bool extent_commit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
+    size_t offset, size_t length);
+bool extent_decommit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
+    size_t offset, size_t length);
+bool extent_purge_lazy_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
+    size_t offset, size_t length);
+bool extent_purge_forced_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
+    size_t offset, size_t length);
+edata_t *extent_split_wrapper(tsdn_t *tsdn, pac_t *pac,
+    ehooks_t *ehooks, edata_t *edata, size_t size_a, size_t size_b,
+    bool holding_core_locks);
+bool extent_merge_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    edata_t *a, edata_t *b);
+bool extent_commit_zero(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
+    bool commit, bool zero, bool growing_retained);
+size_t extent_sn_next(pac_t *pac);
+bool extent_boot(void);
+
+JEMALLOC_ALWAYS_INLINE bool
+extent_neighbor_head_state_mergeable(bool edata_is_head,
+    bool neighbor_is_head, bool forward) {
+	/*
+	 * Head state checking: disallow merging if the higher-addressed extent
+	 * is a head extent.  This helps preserve first-fit and, more
+	 * importantly, makes sure there is no merging across arenas.
+	 */
+	if (forward) {
+		if (neighbor_is_head) {
+			return false;
+		}
+	} else {
+		if (edata_is_head) {
+			return false;
+		}
+	}
+	return true;
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+extent_can_acquire_neighbor(edata_t *edata, rtree_contents_t contents,
+    extent_pai_t pai, extent_state_t expected_state, bool forward,
+    bool expanding) {
+	edata_t *neighbor = contents.edata;
+	if (neighbor == NULL) {
+		return false;
+	}
+	/* It's not safe to access *neighbor yet; must verify states first. */
+	bool neighbor_is_head = contents.metadata.is_head;
+	if (!extent_neighbor_head_state_mergeable(edata_is_head_get(edata),
+	    neighbor_is_head, forward)) {
+		return false;
+	}
+	extent_state_t neighbor_state = contents.metadata.state;
+	if (pai == EXTENT_PAI_PAC) {
+		if (neighbor_state != expected_state) {
+			return false;
+		}
+		/* From this point, it's safe to access *neighbor. */
+		if (!expanding && (edata_committed_get(edata) !=
+		    edata_committed_get(neighbor))) {
+			/*
+			 * Some platforms (e.g. Windows) require an explicit
+			 * commit step (and writing to uncommitted memory is not
+			 * allowed).
+			 */
+			return false;
+		}
+	} else {
+		if (neighbor_state == extent_state_active) {
+			return false;
+		}
+		/* From this point, it's safe to access *neighbor. */
+	}
+
+	assert(edata_pai_get(edata) == pai);
+	if (edata_pai_get(neighbor) != pai) {
+		return false;
+	}
+	if (opt_retain) {
+		assert(edata_arena_ind_get(edata) ==
+		    edata_arena_ind_get(neighbor));
+	} else {
+		if (edata_arena_ind_get(edata) !=
+		    edata_arena_ind_get(neighbor)) {
+			return false;
+		}
+	}
+	assert(!edata_guarded_get(edata) && !edata_guarded_get(neighbor));
+
+	return true;
+}
+
+#endif /* JEMALLOC_INTERNAL_EXTENT_H */
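The head-state rule in extent_neighbor_head_state_mergeable amounts to: never merge across a head-extent boundary in the direction of increasing addresses. A small assert-based self-check of that truth table (assuming this header is included):

#include <assert.h>

static void
head_rule_self_check(void) {
	/* Forward merge (neighbor at a higher address): allowed only when the
	 * neighbor is not a head extent. */
	assert(extent_neighbor_head_state_mergeable(
	    /* edata_is_head */ true, /* neighbor_is_head */ false,
	    /* forward */ true));
	assert(!extent_neighbor_head_state_mergeable(
	    /* edata_is_head */ false, /* neighbor_is_head */ true,
	    /* forward */ true));
	/* Backward merge (neighbor at a lower address): allowed only when the
	 * edata itself is not a head extent. */
	assert(extent_neighbor_head_state_mergeable(
	    /* edata_is_head */ false, /* neighbor_is_head */ true,
	    /* forward */ false));
	assert(!extent_neighbor_head_state_mergeable(
	    /* edata_is_head */ true, /* neighbor_is_head */ false,
	    /* forward */ false));
}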

+ 26 - 0
BeefRT/JEMalloc/include/jemalloc/internal/extent_dss.h

@@ -0,0 +1,26 @@
+#ifndef JEMALLOC_INTERNAL_EXTENT_DSS_H
+#define JEMALLOC_INTERNAL_EXTENT_DSS_H
+
+typedef enum {
+	dss_prec_disabled  = 0,
+	dss_prec_primary   = 1,
+	dss_prec_secondary = 2,
+
+	dss_prec_limit     = 3
+} dss_prec_t;
+#define DSS_PREC_DEFAULT dss_prec_secondary
+#define DSS_DEFAULT "secondary"
+
+extern const char *dss_prec_names[];
+
+extern const char *opt_dss;
+
+dss_prec_t extent_dss_prec_get(void);
+bool extent_dss_prec_set(dss_prec_t dss_prec);
+void *extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr,
+    size_t size, size_t alignment, bool *zero, bool *commit);
+bool extent_in_dss(void *addr);
+bool extent_dss_mergeable(void *addr_a, void *addr_b);
+void extent_dss_boot(void);
+
+#endif /* JEMALLOC_INTERNAL_EXTENT_DSS_H */

+ 10 - 0
BeefRT/JEMalloc/include/jemalloc/internal/extent_mmap.h

@@ -0,0 +1,10 @@
+#ifndef JEMALLOC_INTERNAL_EXTENT_MMAP_EXTERNS_H
+#define JEMALLOC_INTERNAL_EXTENT_MMAP_EXTERNS_H
+
+extern bool opt_retain;
+
+void *extent_alloc_mmap(void *new_addr, size_t size, size_t alignment,
+    bool *zero, bool *commit);
+bool extent_dalloc_mmap(void *addr, size_t size);
+
+#endif /* JEMALLOC_INTERNAL_EXTENT_MMAP_EXTERNS_H */

+ 373 - 0
BeefRT/JEMalloc/include/jemalloc/internal/fb.h

@@ -0,0 +1,373 @@
+#ifndef JEMALLOC_INTERNAL_FB_H
+#define JEMALLOC_INTERNAL_FB_H
+
+/*
+ * The flat bitmap module.  This has a larger API relative to the bitmap module
+ * (supporting things like backwards searches, and searching for both set and
+ * unset bits), at the cost of slower operations for very large bitmaps.
+ *
+ * Initialized flat bitmaps start at all-zeros (all bits unset).
+ */
+
+typedef unsigned long fb_group_t;
+#define FB_GROUP_BITS (ZU(1) << (LG_SIZEOF_LONG + 3))
+#define FB_NGROUPS(nbits) ((nbits) / FB_GROUP_BITS \
+    + ((nbits) % FB_GROUP_BITS == 0 ? 0 : 1))
+
+static inline void
+fb_init(fb_group_t *fb, size_t nbits) {
+	size_t ngroups = FB_NGROUPS(nbits);
+	memset(fb, 0, ngroups * sizeof(fb_group_t));
+}
+
+static inline bool
+fb_empty(fb_group_t *fb, size_t nbits) {
+	size_t ngroups = FB_NGROUPS(nbits);
+	for (size_t i = 0; i < ngroups; i++) {
+		if (fb[i] != 0) {
+			return false;
+		}
+	}
+	return true;
+}
+
+static inline bool
+fb_full(fb_group_t *fb, size_t nbits) {
+	size_t ngroups = FB_NGROUPS(nbits);
+	size_t trailing_bits = nbits % FB_GROUP_BITS;
+	size_t limit = (trailing_bits == 0 ? ngroups : ngroups - 1);
+	for (size_t i = 0; i < limit; i++) {
+		if (fb[i] != ~(fb_group_t)0) {
+			return false;
+		}
+	}
+	if (trailing_bits == 0) {
+		return true;
+	}
+	return fb[ngroups - 1] == ((fb_group_t)1 << trailing_bits) - 1;
+}
+
+static inline bool
+fb_get(fb_group_t *fb, size_t nbits, size_t bit) {
+	assert(bit < nbits);
+	size_t group_ind = bit / FB_GROUP_BITS;
+	size_t bit_ind = bit % FB_GROUP_BITS;
+	return (bool)(fb[group_ind] & ((fb_group_t)1 << bit_ind));
+}
+
+static inline void
+fb_set(fb_group_t *fb, size_t nbits, size_t bit) {
+	assert(bit < nbits);
+	size_t group_ind = bit / FB_GROUP_BITS;
+	size_t bit_ind = bit % FB_GROUP_BITS;
+	fb[group_ind] |= ((fb_group_t)1 << bit_ind);
+}
+
+static inline void
+fb_unset(fb_group_t *fb, size_t nbits, size_t bit) {
+	assert(bit < nbits);
+	size_t group_ind = bit / FB_GROUP_BITS;
+	size_t bit_ind = bit % FB_GROUP_BITS;
+	fb[group_ind] &= ~((fb_group_t)1 << bit_ind);
+}
+
+
+/*
+ * Some implementation details.  This visitation function lets us apply a group
+ * visitor to each group in the bitmap (potentially modifying it).  The mask
+ * indicates which bits are logically part of the visitation.
+ */
+typedef void (*fb_group_visitor_t)(void *ctx, fb_group_t *fb, fb_group_t mask);
+JEMALLOC_ALWAYS_INLINE void
+fb_visit_impl(fb_group_t *fb, size_t nbits, fb_group_visitor_t visit, void *ctx,
+    size_t start, size_t cnt) {
+	assert(cnt > 0);
+	assert(start + cnt <= nbits);
+	size_t group_ind = start / FB_GROUP_BITS;
+	size_t start_bit_ind = start % FB_GROUP_BITS;
+	/*
+	 * The first group is special; it's the only one we don't start writing
+	 * to from bit 0.
+	 */
+	size_t first_group_cnt = (start_bit_ind + cnt > FB_GROUP_BITS
+		? FB_GROUP_BITS - start_bit_ind : cnt);
+	/*
+	 * We can basically split affected words into:
+	 *   - The first group, where we touch only the high bits
+	 *   - The last group, where we touch only the low bits
+	 *   - The middle, where we set all the bits to the same thing.
+	 * We treat each case individually.  The last two could be merged, but
+	 * this can lead to bad codegen for those middle words.
+	 */
+	/* First group */
+	fb_group_t mask = ((~(fb_group_t)0)
+	    >> (FB_GROUP_BITS - first_group_cnt))
+	    << start_bit_ind;
+	visit(ctx, &fb[group_ind], mask);
+
+	cnt -= first_group_cnt;
+	group_ind++;
+	/* Middle groups */
+	while (cnt > FB_GROUP_BITS) {
+		visit(ctx, &fb[group_ind], ~(fb_group_t)0);
+		cnt -= FB_GROUP_BITS;
+		group_ind++;
+	}
+	/* Last group */
+	if (cnt != 0) {
+		mask = (~(fb_group_t)0) >> (FB_GROUP_BITS - cnt);
+		visit(ctx, &fb[group_ind], mask);
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE void
+fb_assign_visitor(void *ctx, fb_group_t *fb, fb_group_t mask) {
+	bool val = *(bool *)ctx;
+	if (val) {
+		*fb |= mask;
+	} else {
+		*fb &= ~mask;
+	}
+}
+
+/* Sets the cnt bits starting at position start.  Must not have a 0 count. */
+static inline void
+fb_set_range(fb_group_t *fb, size_t nbits, size_t start, size_t cnt) {
+	bool val = true;
+	fb_visit_impl(fb, nbits, &fb_assign_visitor, &val, start, cnt);
+}
+
+/* Unsets the cnt bits starting at position start.  Must not have a 0 count. */
+static inline void
+fb_unset_range(fb_group_t *fb, size_t nbits, size_t start, size_t cnt) {
+	bool val = false;
+	fb_visit_impl(fb, nbits, &fb_assign_visitor, &val, start, cnt);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+fb_scount_visitor(void *ctx, fb_group_t *fb, fb_group_t mask) {
+	size_t *scount = (size_t *)ctx;
+	*scount += popcount_lu(*fb & mask);
+}
+
+/* Finds the number of set bits in the range of length cnt starting at start. */
+JEMALLOC_ALWAYS_INLINE size_t
+fb_scount(fb_group_t *fb, size_t nbits, size_t start, size_t cnt) {
+	size_t scount = 0;
+	fb_visit_impl(fb, nbits, &fb_scount_visitor, &scount, start, cnt);
+	return scount;
+}
+
+/* Finds the number of unset bits in the range of length cnt starting at start. */
+JEMALLOC_ALWAYS_INLINE size_t
+fb_ucount(fb_group_t *fb, size_t nbits, size_t start, size_t cnt) {
+	size_t scount = fb_scount(fb, nbits, start, cnt);
+	return cnt - scount;
+}
+
+/*
+ * An implementation detail; find the first (forward) or last (backward) bit at
+ * a position >= (resp. <=) start with the value val.
+ *
+ * Returns the number of bits in the bitmap (forward) or -1 (backward) if no
+ * such bit exists.
+ */
+JEMALLOC_ALWAYS_INLINE ssize_t
+fb_find_impl(fb_group_t *fb, size_t nbits, size_t start, bool val,
+    bool forward) {
+	assert(start < nbits);
+	size_t ngroups = FB_NGROUPS(nbits);
+	ssize_t group_ind = start / FB_GROUP_BITS;
+	size_t bit_ind = start % FB_GROUP_BITS;
+
+	fb_group_t maybe_invert = (val ? 0 : (fb_group_t)-1);
+
+	fb_group_t group = fb[group_ind];
+	group ^= maybe_invert;
+	if (forward) {
+		/* Only keep ones in bits bit_ind and above. */
+		group &= ~((1LU << bit_ind) - 1);
+	} else {
+		/*
+		 * Only keep ones in bits bit_ind and below.  You might more
+		 * naturally express this as (1 << (bit_ind + 1)) - 1, but
+		 * that shifts by an invalid amount if bit_ind is one less than
+		 * FB_GROUP_BITS.
+		 */
+		group &= ((2LU << bit_ind) - 1);
+	}
+	ssize_t group_ind_bound = forward ? (ssize_t)ngroups : -1;
+	while (group == 0) {
+		group_ind += forward ? 1 : -1;
+		if (group_ind == group_ind_bound) {
+			return forward ? (ssize_t)nbits : (ssize_t)-1;
+		}
+		group = fb[group_ind];
+		group ^= maybe_invert;
+	}
+	assert(group != 0);
+	size_t bit = forward ? ffs_lu(group) : fls_lu(group);
+	size_t pos = group_ind * FB_GROUP_BITS + bit;
+	/*
+	 * The high bits of a partially filled last group are zeros, so if we're
+	 * looking for zeros we don't want to report an invalid result.
+	 */
+	if (forward && !val && pos > nbits) {
+		return nbits;
+	}
+	return pos;
+}
+
+/*
+ * Find the first unset bit in the bitmap with an index >= min_bit.  Returns the
+ * number of bits in the bitmap if no such bit exists.
+ */
+static inline size_t
+fb_ffu(fb_group_t *fb, size_t nbits, size_t min_bit) {
+	return (size_t)fb_find_impl(fb, nbits, min_bit, /* val */ false,
+	    /* forward */ true);
+}
+
+/* The same, but looks for a set bit. */
+static inline size_t
+fb_ffs(fb_group_t *fb, size_t nbits, size_t min_bit) {
+	return (size_t)fb_find_impl(fb, nbits, min_bit, /* val */ true,
+	    /* forward */ true);
+}
+
+/*
+ * Find the last unset bit in the bitmap with an index <= max_bit.  Returns -1
+ * if no such bit exists.
+ */
+static inline ssize_t
+fb_flu(fb_group_t *fb, size_t nbits, size_t max_bit) {
+	return fb_find_impl(fb, nbits, max_bit, /* val */ false,
+	    /* forward */ false);
+}
+
+static inline ssize_t
+fb_fls(fb_group_t *fb, size_t nbits, size_t max_bit) {
+	return fb_find_impl(fb, nbits, max_bit, /* val */ true,
+	    /* forward */ false);
+}
+
+/* Returns whether or not we found a range. */
+JEMALLOC_ALWAYS_INLINE bool
+fb_iter_range_impl(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin,
+    size_t *r_len, bool val, bool forward) {
+	assert(start < nbits);
+	ssize_t next_range_begin = fb_find_impl(fb, nbits, start, val, forward);
+	if ((forward && next_range_begin == (ssize_t)nbits)
+	    || (!forward && next_range_begin == (ssize_t)-1)) {
+		return false;
+	}
+	/* Half open range; the set bits are [begin, end). */
+	ssize_t next_range_end = fb_find_impl(fb, nbits, next_range_begin, !val,
+	    forward);
+	if (forward) {
+		*r_begin = next_range_begin;
+		*r_len = next_range_end - next_range_begin;
+	} else {
+		*r_begin = next_range_end + 1;
+		*r_len = next_range_begin - next_range_end;
+	}
+	return true;
+}
+
+/*
+ * Used to iterate through ranges of set bits.
+ *
+ * Tries to find the next contiguous sequence of set bits with a first index >=
+ * start.  If one exists, puts the earliest bit of the range in *r_begin, its
+ * length in *r_len, and returns true.  Otherwise, returns false (without
+ * touching *r_begin or *r_len).
+ */
+static inline bool
+fb_srange_iter(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin,
+    size_t *r_len) {
+	return fb_iter_range_impl(fb, nbits, start, r_begin, r_len,
+	    /* val */ true, /* forward */ true);
+}
+
+/*
+ * The same as fb_srange_iter, but searches backwards from start rather than
+ * forwards.  (The position returned is still the earliest bit in the range).
+ */
+static inline bool
+fb_srange_riter(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin,
+    size_t *r_len) {
+	return fb_iter_range_impl(fb, nbits, start, r_begin, r_len,
+	    /* val */ true, /* forward */ false);
+}
+
+/* Similar to fb_srange_iter, but searches for unset bits. */
+static inline bool
+fb_urange_iter(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin,
+    size_t *r_len) {
+	return fb_iter_range_impl(fb, nbits, start, r_begin, r_len,
+	    /* val */ false, /* forward */ true);
+}
+
+/* Similar to fb_srange_riter, but searches for unset bits. */
+static inline bool
+fb_urange_riter(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin,
+    size_t *r_len) {
+	return fb_iter_range_impl(fb, nbits, start, r_begin, r_len,
+	    /* val */ false, /* forward */ false);
+}
+
+JEMALLOC_ALWAYS_INLINE size_t
+fb_range_longest_impl(fb_group_t *fb, size_t nbits, bool val) {
+	size_t begin = 0;
+	size_t longest_len = 0;
+	size_t len = 0;
+	while (begin < nbits && fb_iter_range_impl(fb, nbits, begin, &begin,
+	    &len, val, /* forward */ true)) {
+		if (len > longest_len) {
+			longest_len = len;
+		}
+		begin += len;
+	}
+	return longest_len;
+}
+
+static inline size_t
+fb_srange_longest(fb_group_t *fb, size_t nbits) {
+	return fb_range_longest_impl(fb, nbits, /* val */ true);
+}
+
+static inline size_t
+fb_urange_longest(fb_group_t *fb, size_t nbits) {
+	return fb_range_longest_impl(fb, nbits, /* val */ false);
+}
+
+/*
+ * Initializes each bit of dst with the bitwise-AND of the corresponding bits of
+ * src1 and src2.  All bitmaps must be the same size.
+ */
+static inline void
+fb_bit_and(fb_group_t *dst, fb_group_t *src1, fb_group_t *src2, size_t nbits) {
+	size_t ngroups = FB_NGROUPS(nbits);
+	for (size_t i = 0; i < ngroups; i++) {
+		dst[i] = src1[i] & src2[i];
+	}
+}
+
+/* Like fb_bit_and, but with bitwise-OR. */
+static inline void
+fb_bit_or(fb_group_t *dst, fb_group_t *src1, fb_group_t *src2, size_t nbits) {
+	size_t ngroups = FB_NGROUPS(nbits);
+	for (size_t i = 0; i < ngroups; i++) {
+		dst[i] = src1[i] | src2[i];
+	}
+}
+
+/* Initializes dst bit i to the negation of source bit i. */
+static inline void
+fb_bit_not(fb_group_t *dst, fb_group_t *src, size_t nbits) {
+	size_t ngroups = FB_NGROUPS(nbits);
+	for (size_t i = 0; i < ngroups; i++) {
+		dst[i] = ~src[i];
+	}
+}
+
+#endif /* JEMALLOC_INTERNAL_FB_H */
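Since every operation above is inline, a complete usage example is straightforward. The sketch below (assuming the surrounding internal headers are available) exercises the basic set/count/find/iterate calls on a 100-bit map.

#include <assert.h>

static void
fb_example(void) {
	enum { NBITS = 100 };
	fb_group_t fb[FB_NGROUPS(NBITS)];
	fb_init(fb, NBITS);			/* All bits start unset. */

	fb_set_range(fb, NBITS, 10, 20);	/* Set bits [10, 30). */
	assert(fb_get(fb, NBITS, 10));
	assert(!fb_get(fb, NBITS, 30));
	assert(fb_scount(fb, NBITS, 0, NBITS) == 20);

	/* First set bit at or after 0 is 10; first unset bit at or after 10
	 * is 30. */
	assert(fb_ffs(fb, NBITS, 0) == 10);
	assert(fb_ffu(fb, NBITS, 10) == 30);

	/* Iterate the single set range. */
	size_t begin, len;
	bool found = fb_srange_iter(fb, NBITS, 0, &begin, &len);
	assert(found && begin == 10 && len == 20);
}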

+ 126 - 0
BeefRT/JEMalloc/include/jemalloc/internal/fxp.h

@@ -0,0 +1,126 @@
+#ifndef JEMALLOC_INTERNAL_FXP_H
+#define JEMALLOC_INTERNAL_FXP_H
+
+/*
+ * A simple fixed-point math implementation, supporting only unsigned values
+ * (with overflow being an error).
+ *
+ * It's not in general safe to use floating point in core code, because various
+ * libc implementations we get linked against can assume that malloc won't touch
+ * floating point state and call it with an unusual calling convention.
+ */
+
+/*
+ * High 16 bits are the integer part, low 16 are the fractional part.  Or
+ * equivalently, repr == 2**16 * val, where we use "val" to refer to the
+ * (imaginary) fractional representation of the true value.
+ *
+ * We pick a uint32_t here since it's convenient in some places to
+ * double the representation size (i.e. multiplication and division use
+ * 64-bit integer types), and a uint64_t is the largest type we're
+ * certain is available.
+ */
+typedef uint32_t fxp_t;
+#define FXP_INIT_INT(x) ((x) << 16)
+#define FXP_INIT_PERCENT(pct) (((pct) << 16) / 100)
+
+/*
+ * Amount of precision used in parsing and printing numbers.  The integer bound
+ * is simply because the integer part of the number gets 16 bits, and so is
+ * bounded by 65536.
+ *
+ * We use a lot of precision for the fractional part, even though most of it
+ * gets rounded off; this lets us get exact values for the important special
+ * case where the denominator is a small power of 2 (for instance,
+ * 1/512 == 0.001953125 is exactly representable even with only 16 bits of
+ * fractional precision).  We need to left-shift by 16 before dividing by
+ * 10**precision, so we pick precision to be floor(log10(2**48)) = 14.
+ */
+#define FXP_INTEGER_PART_DIGITS 5
+#define FXP_FRACTIONAL_PART_DIGITS 14
+
+/*
+ * In addition to the integer and fractional parts of the number, we need to
+ * include a null character and (possibly) a decimal point.
+ */
+#define FXP_BUF_SIZE (FXP_INTEGER_PART_DIGITS + FXP_FRACTIONAL_PART_DIGITS + 2)
+
+static inline fxp_t
+fxp_add(fxp_t a, fxp_t b) {
+	return a + b;
+}
+
+static inline fxp_t
+fxp_sub(fxp_t a, fxp_t b) {
+	assert(a >= b);
+	return a - b;
+}
+
+static inline fxp_t
+fxp_mul(fxp_t a, fxp_t b) {
+	uint64_t unshifted = (uint64_t)a * (uint64_t)b;
+	/*
+	 * Unshifted is (a.val * 2**16) * (b.val * 2**16)
+	 *   == (a.val * b.val) * 2**32, but we want
+	 * (a.val * b.val) * 2 ** 16.
+	 */
+	return (uint32_t)(unshifted >> 16);
+}
+
+static inline fxp_t
+fxp_div(fxp_t a, fxp_t b) {
+	assert(b != 0);
+	uint64_t unshifted = ((uint64_t)a << 32) / (uint64_t)b;
+	/*
+	 * Unshifted is (a.val * 2**16) * (2**32) / (b.val * 2**16)
+	 *   == (a.val / b.val) * (2 ** 32), which again corresponds to a right
+	 *   shift of 16.
+	 */
+	return (uint32_t)(unshifted >> 16);
+}
+
+static inline uint32_t
+fxp_round_down(fxp_t a) {
+	return a >> 16;
+}
+
+static inline uint32_t
+fxp_round_nearest(fxp_t a) {
+	uint32_t fractional_part = (a  & ((1U << 16) - 1));
+	uint32_t increment = (uint32_t)(fractional_part >= (1U << 15));
+	return (a >> 16) + increment;
+}
+
+/*
+ * Approximately computes x * frac, without the size limitations that would be
+ * imposed by converting x to an fxp_t.
+ */
+static inline size_t
+fxp_mul_frac(size_t x_orig, fxp_t frac) {
+	assert(frac <= (1U << 16));
+	/*
+	 * Work around an over-enthusiastic warning about type limits below (on
+	 * 32-bit platforms, a size_t is always less than 1ULL << 48).
+	 */
+	uint64_t x = (uint64_t)x_orig;
+	/*
+	 * If we can guarantee no overflow, multiply first before shifting, to
+	 * preserve some precision.  Otherwise, shift first and then multiply.
+	 * In the latter case, we only lose the low 16 bits of a 48-bit number,
+	 * so we're still accurate to within 1/2**32.
+	 */
+	if (x < (1ULL << 48)) {
+		return (size_t)((x * frac) >> 16);
+	} else {
+		return (size_t)((x >> 16) * (uint64_t)frac);
+	}
+}
+
+/*
+ * Returns true on error.  Otherwise, returns false and updates *ptr to point to
+ * the first character not parsed (because it wasn't a digit).
+ */
+bool fxp_parse(fxp_t *a, const char *ptr, char **end);
+void fxp_print(fxp_t a, char buf[FXP_BUF_SIZE]);
+
+#endif /* JEMALLOC_INTERNAL_FXP_H */
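A short worked example of the Q16.16 representation described above; this is an illustrative sketch using only the macros and inline functions declared in this header:

#include <assert.h>	/* plus fxp.h itself */

void
fxp_sketch(void) {
	fxp_t quarter = FXP_INIT_PERCENT(25);	/* 0.25 -> repr 0x4000 */
	fxp_t six = FXP_INIT_INT(6);		/* 6.0  -> repr 6 << 16 */

	fxp_t prod = fxp_mul(six, quarter);	/* 1.5  -> repr 0x18000 */
	assert(fxp_round_down(prod) == 1);
	assert(fxp_round_nearest(prod) == 2);	/* halves round up */

	/* Scale a size_t by a fraction without converting it to an fxp_t. */
	size_t mib = (size_t)1 << 20;
	assert(fxp_mul_frac(mib, quarter) == ((size_t)1 << 18));	/* 256 KiB */
}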

+ 320 - 0
BeefRT/JEMalloc/include/jemalloc/internal/hash.h

@@ -0,0 +1,320 @@
+#ifndef JEMALLOC_INTERNAL_HASH_H
+#define JEMALLOC_INTERNAL_HASH_H
+
+#include "jemalloc/internal/assert.h"
+
+/*
+ * The following hash function is based on MurmurHash3, placed into the public
+ * domain by Austin Appleby.  See https://github.com/aappleby/smhasher for
+ * details.
+ */
+
+/******************************************************************************/
+/* Internal implementation. */
+static inline uint32_t
+hash_rotl_32(uint32_t x, int8_t r) {
+	return ((x << r) | (x >> (32 - r)));
+}
+
+static inline uint64_t
+hash_rotl_64(uint64_t x, int8_t r) {
+	return ((x << r) | (x >> (64 - r)));
+}
+
+static inline uint32_t
+hash_get_block_32(const uint32_t *p, int i) {
+	/* Handle unaligned read. */
+	if (unlikely((uintptr_t)p & (sizeof(uint32_t)-1)) != 0) {
+		uint32_t ret;
+
+		memcpy(&ret, (uint8_t *)(p + i), sizeof(uint32_t));
+		return ret;
+	}
+
+	return p[i];
+}
+
+static inline uint64_t
+hash_get_block_64(const uint64_t *p, int i) {
+	/* Handle unaligned read. */
+	if (unlikely((uintptr_t)p & (sizeof(uint64_t)-1)) != 0) {
+		uint64_t ret;
+
+		memcpy(&ret, (uint8_t *)(p + i), sizeof(uint64_t));
+		return ret;
+	}
+
+	return p[i];
+}
+
+static inline uint32_t
+hash_fmix_32(uint32_t h) {
+	h ^= h >> 16;
+	h *= 0x85ebca6b;
+	h ^= h >> 13;
+	h *= 0xc2b2ae35;
+	h ^= h >> 16;
+
+	return h;
+}
+
+static inline uint64_t
+hash_fmix_64(uint64_t k) {
+	k ^= k >> 33;
+	k *= KQU(0xff51afd7ed558ccd);
+	k ^= k >> 33;
+	k *= KQU(0xc4ceb9fe1a85ec53);
+	k ^= k >> 33;
+
+	return k;
+}
+
+static inline uint32_t
+hash_x86_32(const void *key, int len, uint32_t seed) {
+	const uint8_t *data = (const uint8_t *) key;
+	const int nblocks = len / 4;
+
+	uint32_t h1 = seed;
+
+	const uint32_t c1 = 0xcc9e2d51;
+	const uint32_t c2 = 0x1b873593;
+
+	/* body */
+	{
+		const uint32_t *blocks = (const uint32_t *) (data + nblocks*4);
+		int i;
+
+		for (i = -nblocks; i; i++) {
+			uint32_t k1 = hash_get_block_32(blocks, i);
+
+			k1 *= c1;
+			k1 = hash_rotl_32(k1, 15);
+			k1 *= c2;
+
+			h1 ^= k1;
+			h1 = hash_rotl_32(h1, 13);
+			h1 = h1*5 + 0xe6546b64;
+		}
+	}
+
+	/* tail */
+	{
+		const uint8_t *tail = (const uint8_t *) (data + nblocks*4);
+
+		uint32_t k1 = 0;
+
+		switch (len & 3) {
+		case 3: k1 ^= tail[2] << 16; JEMALLOC_FALLTHROUGH;
+		case 2: k1 ^= tail[1] << 8; JEMALLOC_FALLTHROUGH;
+		case 1: k1 ^= tail[0]; k1 *= c1; k1 = hash_rotl_32(k1, 15);
+			k1 *= c2; h1 ^= k1;
+		}
+	}
+
+	/* finalization */
+	h1 ^= len;
+
+	h1 = hash_fmix_32(h1);
+
+	return h1;
+}
+
+static inline void
+hash_x86_128(const void *key, const int len, uint32_t seed,
+    uint64_t r_out[2]) {
+	const uint8_t * data = (const uint8_t *) key;
+	const int nblocks = len / 16;
+
+	uint32_t h1 = seed;
+	uint32_t h2 = seed;
+	uint32_t h3 = seed;
+	uint32_t h4 = seed;
+
+	const uint32_t c1 = 0x239b961b;
+	const uint32_t c2 = 0xab0e9789;
+	const uint32_t c3 = 0x38b34ae5;
+	const uint32_t c4 = 0xa1e38b93;
+
+	/* body */
+	{
+		const uint32_t *blocks = (const uint32_t *) (data + nblocks*16);
+		int i;
+
+		for (i = -nblocks; i; i++) {
+			uint32_t k1 = hash_get_block_32(blocks, i*4 + 0);
+			uint32_t k2 = hash_get_block_32(blocks, i*4 + 1);
+			uint32_t k3 = hash_get_block_32(blocks, i*4 + 2);
+			uint32_t k4 = hash_get_block_32(blocks, i*4 + 3);
+
+			k1 *= c1; k1 = hash_rotl_32(k1, 15); k1 *= c2; h1 ^= k1;
+
+			h1 = hash_rotl_32(h1, 19); h1 += h2;
+			h1 = h1*5 + 0x561ccd1b;
+
+			k2 *= c2; k2 = hash_rotl_32(k2, 16); k2 *= c3; h2 ^= k2;
+
+			h2 = hash_rotl_32(h2, 17); h2 += h3;
+			h2 = h2*5 + 0x0bcaa747;
+
+			k3 *= c3; k3 = hash_rotl_32(k3, 17); k3 *= c4; h3 ^= k3;
+
+			h3 = hash_rotl_32(h3, 15); h3 += h4;
+			h3 = h3*5 + 0x96cd1c35;
+
+			k4 *= c4; k4 = hash_rotl_32(k4, 18); k4 *= c1; h4 ^= k4;
+
+			h4 = hash_rotl_32(h4, 13); h4 += h1;
+			h4 = h4*5 + 0x32ac3b17;
+		}
+	}
+
+	/* tail */
+	{
+		const uint8_t *tail = (const uint8_t *) (data + nblocks*16);
+		uint32_t k1 = 0;
+		uint32_t k2 = 0;
+		uint32_t k3 = 0;
+		uint32_t k4 = 0;
+
+		switch (len & 15) {
+		case 15: k4 ^= tail[14] << 16; JEMALLOC_FALLTHROUGH;
+		case 14: k4 ^= tail[13] << 8; JEMALLOC_FALLTHROUGH;
+		case 13: k4 ^= tail[12] << 0;
+			k4 *= c4; k4 = hash_rotl_32(k4, 18); k4 *= c1; h4 ^= k4;
+			JEMALLOC_FALLTHROUGH;
+		case 12: k3 ^= (uint32_t) tail[11] << 24; JEMALLOC_FALLTHROUGH;
+		case 11: k3 ^= tail[10] << 16; JEMALLOC_FALLTHROUGH;
+		case 10: k3 ^= tail[ 9] << 8; JEMALLOC_FALLTHROUGH;
+		case  9: k3 ^= tail[ 8] << 0;
+			k3 *= c3; k3 = hash_rotl_32(k3, 17); k3 *= c4; h3 ^= k3;
+			JEMALLOC_FALLTHROUGH;
+		case  8: k2 ^= (uint32_t) tail[ 7] << 24; JEMALLOC_FALLTHROUGH;
+		case  7: k2 ^= tail[ 6] << 16; JEMALLOC_FALLTHROUGH;
+		case  6: k2 ^= tail[ 5] << 8; JEMALLOC_FALLTHROUGH;
+		case  5: k2 ^= tail[ 4] << 0;
+			k2 *= c2; k2 = hash_rotl_32(k2, 16); k2 *= c3; h2 ^= k2;
+			JEMALLOC_FALLTHROUGH;
+		case  4: k1 ^= (uint32_t) tail[ 3] << 24; JEMALLOC_FALLTHROUGH;
+		case  3: k1 ^= tail[ 2] << 16; JEMALLOC_FALLTHROUGH;
+		case  2: k1 ^= tail[ 1] << 8; JEMALLOC_FALLTHROUGH;
+		case  1: k1 ^= tail[ 0] << 0;
+			k1 *= c1; k1 = hash_rotl_32(k1, 15); k1 *= c2; h1 ^= k1;
+			break;
+		}
+	}
+
+	/* finalization */
+	h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
+
+	h1 += h2; h1 += h3; h1 += h4;
+	h2 += h1; h3 += h1; h4 += h1;
+
+	h1 = hash_fmix_32(h1);
+	h2 = hash_fmix_32(h2);
+	h3 = hash_fmix_32(h3);
+	h4 = hash_fmix_32(h4);
+
+	h1 += h2; h1 += h3; h1 += h4;
+	h2 += h1; h3 += h1; h4 += h1;
+
+	r_out[0] = (((uint64_t) h2) << 32) | h1;
+	r_out[1] = (((uint64_t) h4) << 32) | h3;
+}
+
+static inline void
+hash_x64_128(const void *key, const int len, const uint32_t seed,
+    uint64_t r_out[2]) {
+	const uint8_t *data = (const uint8_t *) key;
+	const int nblocks = len / 16;
+
+	uint64_t h1 = seed;
+	uint64_t h2 = seed;
+
+	const uint64_t c1 = KQU(0x87c37b91114253d5);
+	const uint64_t c2 = KQU(0x4cf5ad432745937f);
+
+	/* body */
+	{
+		const uint64_t *blocks = (const uint64_t *) (data);
+		int i;
+
+		for (i = 0; i < nblocks; i++) {
+			uint64_t k1 = hash_get_block_64(blocks, i*2 + 0);
+			uint64_t k2 = hash_get_block_64(blocks, i*2 + 1);
+
+			k1 *= c1; k1 = hash_rotl_64(k1, 31); k1 *= c2; h1 ^= k1;
+
+			h1 = hash_rotl_64(h1, 27); h1 += h2;
+			h1 = h1*5 + 0x52dce729;
+
+			k2 *= c2; k2 = hash_rotl_64(k2, 33); k2 *= c1; h2 ^= k2;
+
+			h2 = hash_rotl_64(h2, 31); h2 += h1;
+			h2 = h2*5 + 0x38495ab5;
+		}
+	}
+
+	/* tail */
+	{
+		const uint8_t *tail = (const uint8_t*)(data + nblocks*16);
+		uint64_t k1 = 0;
+		uint64_t k2 = 0;
+
+		switch (len & 15) {
+		case 15: k2 ^= ((uint64_t)(tail[14])) << 48; JEMALLOC_FALLTHROUGH;
+		case 14: k2 ^= ((uint64_t)(tail[13])) << 40; JEMALLOC_FALLTHROUGH;
+		case 13: k2 ^= ((uint64_t)(tail[12])) << 32; JEMALLOC_FALLTHROUGH;
+		case 12: k2 ^= ((uint64_t)(tail[11])) << 24; JEMALLOC_FALLTHROUGH;
+		case 11: k2 ^= ((uint64_t)(tail[10])) << 16; JEMALLOC_FALLTHROUGH;
+		case 10: k2 ^= ((uint64_t)(tail[ 9])) << 8;  JEMALLOC_FALLTHROUGH;
+		case  9: k2 ^= ((uint64_t)(tail[ 8])) << 0;
+			k2 *= c2; k2 = hash_rotl_64(k2, 33); k2 *= c1; h2 ^= k2;
+			JEMALLOC_FALLTHROUGH;
+		case  8: k1 ^= ((uint64_t)(tail[ 7])) << 56; JEMALLOC_FALLTHROUGH;
+		case  7: k1 ^= ((uint64_t)(tail[ 6])) << 48; JEMALLOC_FALLTHROUGH;
+		case  6: k1 ^= ((uint64_t)(tail[ 5])) << 40; JEMALLOC_FALLTHROUGH;
+		case  5: k1 ^= ((uint64_t)(tail[ 4])) << 32; JEMALLOC_FALLTHROUGH;
+		case  4: k1 ^= ((uint64_t)(tail[ 3])) << 24; JEMALLOC_FALLTHROUGH;
+		case  3: k1 ^= ((uint64_t)(tail[ 2])) << 16; JEMALLOC_FALLTHROUGH;
+		case  2: k1 ^= ((uint64_t)(tail[ 1])) << 8;  JEMALLOC_FALLTHROUGH;
+		case  1: k1 ^= ((uint64_t)(tail[ 0])) << 0;
+			k1 *= c1; k1 = hash_rotl_64(k1, 31); k1 *= c2; h1 ^= k1;
+			break;
+		}
+	}
+
+	/* finalization */
+	h1 ^= len; h2 ^= len;
+
+	h1 += h2;
+	h2 += h1;
+
+	h1 = hash_fmix_64(h1);
+	h2 = hash_fmix_64(h2);
+
+	h1 += h2;
+	h2 += h1;
+
+	r_out[0] = h1;
+	r_out[1] = h2;
+}
+
+/******************************************************************************/
+/* API. */
+static inline void
+hash(const void *key, size_t len, const uint32_t seed, size_t r_hash[2]) {
+	assert(len <= INT_MAX); /* Unfortunate implementation limitation. */
+
+#if (LG_SIZEOF_PTR == 3 && !defined(JEMALLOC_BIG_ENDIAN))
+	hash_x64_128(key, (int)len, seed, (uint64_t *)r_hash);
+#else
+	{
+		uint64_t hashes[2];
+		hash_x86_128(key, (int)len, seed, hashes);
+		r_hash[0] = (size_t)hashes[0];
+		r_hash[1] = (size_t)hashes[1];
+	}
+#endif
+}
+
+#endif /* JEMALLOC_INTERNAL_HASH_H */
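A minimal usage sketch of the hash() entry point above; the key, seed, and bucket-selection step are arbitrary illustrations of how callers typically consume r_hash[0]:

#include <string.h>

void
hash_sketch(void) {
	const char *key = "jemalloc";
	size_t r_hash[2];

	hash(key, strlen(key), /* seed */ 1234, r_hash);

	/* e.g. index into a power-of-two sized table. */
	size_t nbuckets = 1024;
	size_t bucket = r_hash[0] & (nbuckets - 1);
	(void)bucket;
}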

+ 163 - 0
BeefRT/JEMalloc/include/jemalloc/internal/hook.h

@@ -0,0 +1,163 @@
+#ifndef JEMALLOC_INTERNAL_HOOK_H
+#define JEMALLOC_INTERNAL_HOOK_H
+
+#include "jemalloc/internal/tsd.h"
+
+/*
+ * This API is *extremely* experimental, and may get ripped out, changed in API-
+ * and ABI-incompatible ways, be insufficiently or incorrectly documented, etc.
+ *
+ * It allows hooking the stateful parts of the API to see changes as they
+ * happen.
+ *
+ * Allocation hooks are called after the allocation is done, free hooks are
+ * called before the free is done, and expand hooks are called after the
+ * allocation is expanded.
+ *
+ * For realloc and rallocx, if the expansion happens in place, the expansion
+ * hook is called.  If it is moved, then the alloc hook is called on the new
+ * location, and then the free hook is called on the old location (i.e. both
+ * hooks are invoked in between the alloc and the dalloc).
+ *
+ * If we return NULL from OOM, then usize might not be trustworthy.  Calling
+ * realloc(NULL, size) only calls the alloc hook, and calling realloc(ptr, 0)
+ * only calls the free hook.  (Calling realloc(NULL, 0) is treated as malloc(0),
+ * and only calls the alloc hook).
+ *
+ * Reentrancy:
+ *   Reentrancy is guarded against from within the hook implementation.  If you
+ *   call allocator functions from within a hook, the hooks will not be invoked
+ *   again.
+ * Threading:
+ *   The installation of a hook synchronizes with all its uses.  If you can
+ *   prove the installation of a hook happens-before a jemalloc entry point,
+ *   then the hook will get invoked (unless there's a racing removal).
+ *
+ *   Hook insertion appears to be atomic at a per-thread level (i.e. if a thread
+ *   allocates and has the alloc hook invoked, then a subsequent free on the
+ *   same thread will also have the free hook invoked).
+ *
+ *   The *removal* of a hook does *not* block until all threads are done with
+ *   the hook.  Hook authors have to be resilient to this, and need some
+ *   out-of-band mechanism for cleaning up any dynamically allocated memory
+ *   associated with their hook.
+ * Ordering:
+ *   Order of hook execution is unspecified, and may be different than insertion
+ *   order.
+ */
+
+#define HOOK_MAX 4
+
+enum hook_alloc_e {
+	hook_alloc_malloc,
+	hook_alloc_posix_memalign,
+	hook_alloc_aligned_alloc,
+	hook_alloc_calloc,
+	hook_alloc_memalign,
+	hook_alloc_valloc,
+	hook_alloc_mallocx,
+
+	/* The reallocating functions have both alloc and dalloc variants */
+	hook_alloc_realloc,
+	hook_alloc_rallocx,
+};
+/*
+ * We put the enum typedef after the enum, since this file may get included by
+ * jemalloc_cpp.cpp, and C++ disallows enum forward declarations.
+ */
+typedef enum hook_alloc_e hook_alloc_t;
+
+enum hook_dalloc_e {
+	hook_dalloc_free,
+	hook_dalloc_dallocx,
+	hook_dalloc_sdallocx,
+
+	/*
+	 * The dalloc halves of reallocation (not called if in-place expansion
+	 * happens).
+	 */
+	hook_dalloc_realloc,
+	hook_dalloc_rallocx,
+};
+typedef enum hook_dalloc_e hook_dalloc_t;
+
+
+enum hook_expand_e {
+	hook_expand_realloc,
+	hook_expand_rallocx,
+	hook_expand_xallocx,
+};
+typedef enum hook_expand_e hook_expand_t;
+
+typedef void (*hook_alloc)(
+    void *extra, hook_alloc_t type, void *result, uintptr_t result_raw,
+    uintptr_t args_raw[3]);
+
+typedef void (*hook_dalloc)(
+    void *extra, hook_dalloc_t type, void *address, uintptr_t args_raw[3]);
+
+typedef void (*hook_expand)(
+    void *extra, hook_expand_t type, void *address, size_t old_usize,
+    size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]);
+
+typedef struct hooks_s hooks_t;
+struct hooks_s {
+	hook_alloc alloc_hook;
+	hook_dalloc dalloc_hook;
+	hook_expand expand_hook;
+	void *extra;
+};
+
+/*
+ * Begin implementation details; everything above this point might one day live
+ * in a public API.  Everything below this point never will.
+ */
+
+/*
+ * The realloc pathways haven't gotten any refactoring love in a while, and it's
+ * fairly difficult to pass information from the entry point to the hooks.  We
+ * put the information the hooks will need into a struct to encapsulate
+ * everything.
+ *
+ * Much of these pathways are force-inlined, so that the compiler can avoid
+ * materializing this struct until we hit an extern arena function.  For fairly
+ * goofy reasons, *many* of the realloc paths hit an extern arena function.
+ * These paths are cold enough that it doesn't matter; eventually, we should
+ * rewrite the realloc code to make the expand-in-place and the
+ * free-then-realloc paths more orthogonal, at which point we don't need to
+ * spread the hook logic all over the place.
+ */
+typedef struct hook_ralloc_args_s hook_ralloc_args_t;
+struct hook_ralloc_args_s {
+	/* I.e. as opposed to rallocx. */
+	bool is_realloc;
+	/*
+	 * The expand hook takes 4 arguments, even if only 3 are actually used;
+	 * we add an extra one in case the user decides to memcpy without
+	 * looking too closely at the hooked function.
+	 */
+	uintptr_t args[4];
+};
+
+bool hook_boot();
+
+/*
+ * Returns an opaque handle to be used when removing the hook.  NULL means that
+ * we couldn't install the hook.
+ */
+void *hook_install(tsdn_t *tsdn, hooks_t *hooks);
+/* Uninstalls the hook with the handle previously returned from hook_install. */
+void hook_remove(tsdn_t *tsdn, void *opaque);
+
+/* Hooks */
+
+void hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw,
+    uintptr_t args_raw[3]);
+
+void hook_invoke_dalloc(hook_dalloc_t type, void *address,
+    uintptr_t args_raw[3]);
+
+void hook_invoke_expand(hook_expand_t type, void *address, size_t old_usize,
+    size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]);
+
+#endif /* JEMALLOC_INTERNAL_HOOK_H */
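To make the shape of this API concrete, here is a hedged sketch of defining and installing a hook table; the my_* names are hypothetical, the callback bodies are placeholders, and the tsdn_t * is assumed to be supplied by the caller:

static void
my_alloc_hook(void *extra, hook_alloc_t type, void *result,
    uintptr_t result_raw, uintptr_t args_raw[3]) {
	/* Called after an allocation completes. */
	(void)extra; (void)type; (void)result; (void)result_raw; (void)args_raw;
}

static void
my_dalloc_hook(void *extra, hook_dalloc_t type, void *address,
    uintptr_t args_raw[3]) {
	/* Called before a deallocation happens. */
	(void)extra; (void)type; (void)address; (void)args_raw;
}

static void
my_expand_hook(void *extra, hook_expand_t type, void *address,
    size_t old_usize, size_t new_usize, uintptr_t result_raw,
    uintptr_t args_raw[4]) {
	/* Called after an in-place expansion. */
	(void)extra; (void)type; (void)address; (void)old_usize;
	(void)new_usize; (void)result_raw; (void)args_raw;
}

static void *
my_hooks_install(tsdn_t *tsdn) {
	static hooks_t my_hooks = {
		my_alloc_hook, my_dalloc_hook, my_expand_hook, /* extra */ NULL
	};
	/* NULL means the hook could not be installed. */
	return hook_install(tsdn, &my_hooks);
}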

+ 182 - 0
BeefRT/JEMalloc/include/jemalloc/internal/hpa.h

@@ -0,0 +1,182 @@
+#ifndef JEMALLOC_INTERNAL_HPA_H
+#define JEMALLOC_INTERNAL_HPA_H
+
+#include "jemalloc/internal/exp_grow.h"
+#include "jemalloc/internal/hpa_hooks.h"
+#include "jemalloc/internal/hpa_opts.h"
+#include "jemalloc/internal/pai.h"
+#include "jemalloc/internal/psset.h"
+
+typedef struct hpa_central_s hpa_central_t;
+struct hpa_central_s {
+	/*
+	 * The mutex guarding most of the operations on the central data
+	 * structure.
+	 */
+	malloc_mutex_t mtx;
+	/*
+	 * Guards expansion of eden.  We separate this from the regular mutex so
+	 * that cheaper operations can still continue while we're doing the OS
+	 * call.
+	 */
+	malloc_mutex_t grow_mtx;
+	/*
+	 * Either NULL (if empty), or some integer multiple of a
+	 * hugepage-aligned number of hugepages.  We carve them off one at a
+	 * time to satisfy new pageslab requests.
+	 *
+	 * Guarded by grow_mtx.
+	 */
+	void *eden;
+	size_t eden_len;
+	/* Source for metadata. */
+	base_t *base;
+	/* Number of grow operations done on this hpa_central_t. */
+	uint64_t age_counter;
+
+	/* The HPA hooks. */
+	hpa_hooks_t hooks;
+};
+
+typedef struct hpa_shard_nonderived_stats_s hpa_shard_nonderived_stats_t;
+struct hpa_shard_nonderived_stats_s {
+	/*
+	 * The number of times we've purged within a hugepage.
+	 *
+	 * Guarded by mtx.
+	 */
+	uint64_t npurge_passes;
+	/*
+	 * The number of individual purge calls we perform (which should always
+	 * be bigger than npurge_passes, since each pass purges at least one
+	 * extent within a hugepage).
+	 *
+	 * Guarded by mtx.
+	 */
+	uint64_t npurges;
+
+	/*
+	 * The number of times we've hugified a pageslab.
+	 *
+	 * Guarded by mtx.
+	 */
+	uint64_t nhugifies;
+	/*
+	 * The number of times we've dehugified a pageslab.
+	 *
+	 * Guarded by mtx.
+	 */
+	uint64_t ndehugifies;
+};
+
+/* Completely derived; only used by CTL. */
+typedef struct hpa_shard_stats_s hpa_shard_stats_t;
+struct hpa_shard_stats_s {
+	psset_stats_t psset_stats;
+	hpa_shard_nonderived_stats_t nonderived_stats;
+};
+
+typedef struct hpa_shard_s hpa_shard_t;
+struct hpa_shard_s {
+	/*
+	 * pai must be the first member; we cast from a pointer to it to a
+	 * pointer to the hpa_shard_t.
+	 */
+	pai_t pai;
+
+	/* The central allocator we get our hugepages from. */
+	hpa_central_t *central;
+	/* Protects most of this shard's state. */
+	malloc_mutex_t mtx;
+	/*
+	 * Guards the shard's access to the central allocator (preventing
+	 * multiple threads operating on this shard from accessing the central
+	 * allocator).
+	 */
+	malloc_mutex_t grow_mtx;
+	/* The base metadata allocator. */
+	base_t *base;
+
+	/*
+	 * This edata cache is the one we use when allocating a small extent
+	 * from a pageslab.  The pageslab itself comes from the centralized
+	 * allocator, and so will use its edata_cache.
+	 */
+	edata_cache_fast_t ecf;
+
+	psset_t psset;
+
+	/*
+	 * How many grow operations have occurred.
+	 *
+	 * Guarded by grow_mtx.
+	 */
+	uint64_t age_counter;
+
+	/* The arena ind we're associated with. */
+	unsigned ind;
+
+	/*
+	 * Our emap.  This is just a cache of the emap pointer in the associated
+	 * hpa_central.
+	 */
+	emap_t *emap;
+
+	/* The configuration choices for this hpa shard. */
+	hpa_shard_opts_t opts;
+
+	/*
+	 * How many pages have we started but not yet finished purging in this
+	 * hpa shard.
+	 */
+	size_t npending_purge;
+
+	/*
+	 * Those stats which are copied directly into the CTL-centric hpa shard
+	 * stats.
+	 */
+	hpa_shard_nonderived_stats_t stats;
+
+	/*
+	 * Last time we performed purge on this shard.
+	 */
+	nstime_t last_purge;
+};
+
+/*
+ * Whether or not the HPA can be used given the current configuration.  This is
+ * is not necessarily a guarantee that it backs its allocations by hugepages,
+ * just that it can function properly given the system it's running on.
+ */
+bool hpa_supported();
+bool hpa_central_init(hpa_central_t *central, base_t *base, const hpa_hooks_t *hooks);
+bool hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
+    base_t *base, edata_cache_t *edata_cache, unsigned ind,
+    const hpa_shard_opts_t *opts);
+
+void hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src);
+void hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard,
+    hpa_shard_stats_t *dst);
+
+/*
+ * Notify the shard that we won't use it for allocations much longer.  Due to
+ * the possibility of races, we don't actually prevent allocations; just flush
+ * and disable the embedded edata cache.
+ */
+void hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard);
+void hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard);
+
+void hpa_shard_set_deferral_allowed(tsdn_t *tsdn, hpa_shard_t *shard,
+    bool deferral_allowed);
+void hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard);
+
+/*
+ * We share the fork ordering with the PA and arena prefork handling; that's why
+ * these are 3 and 4 rather than 0 and 1.
+ */
+void hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard);
+void hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard);
+void hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard);
+void hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard);
+
+#endif /* JEMALLOC_INTERNAL_HPA_H */

+ 17 - 0
BeefRT/JEMalloc/include/jemalloc/internal/hpa_hooks.h

@@ -0,0 +1,17 @@
+#ifndef JEMALLOC_INTERNAL_HPA_HOOKS_H
+#define JEMALLOC_INTERNAL_HPA_HOOKS_H
+
+typedef struct hpa_hooks_s hpa_hooks_t;
+struct hpa_hooks_s {
+	void *(*map)(size_t size);
+	void (*unmap)(void *ptr, size_t size);
+	void (*purge)(void *ptr, size_t size);
+	void (*hugify)(void *ptr, size_t size);
+	void (*dehugify)(void *ptr, size_t size);
+	void (*curtime)(nstime_t *r_time, bool first_reading);
+	uint64_t (*ms_since)(nstime_t *r_time);
+};
+
+extern hpa_hooks_t hpa_hooks_default;
+
+#endif /* JEMALLOC_INTERNAL_HPA_HOOKS_H */
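For illustration, a hypothetical replacement hook table might look like the sketch below; the mmap-based map/unmap are just one possible backing, the NULL-on-failure convention for map is an assumption, and the remaining stubs are placeholders (hpa_hooks_default above is the real implementation):

#include <sys/mman.h>

static void *
my_map(size_t size) {
	void *ret = mmap(NULL, size, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	return (ret == MAP_FAILED) ? NULL : ret;
}

static void
my_unmap(void *ptr, size_t size) {
	munmap(ptr, size);
}

/* Placeholder stubs; a real table would purge/hugify via madvise(2) etc. */
static void my_purge(void *ptr, size_t size) { (void)ptr; (void)size; }
static void my_hugify(void *ptr, size_t size) { (void)ptr; (void)size; }
static void my_dehugify(void *ptr, size_t size) { (void)ptr; (void)size; }
static void my_curtime(nstime_t *r_time, bool first_reading) {
	(void)r_time; (void)first_reading;
}
static uint64_t my_ms_since(nstime_t *past) { (void)past; return 0; }

static const hpa_hooks_t my_hpa_hooks = {
	my_map, my_unmap, my_purge, my_hugify, my_dehugify,
	my_curtime, my_ms_since
};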

+ 74 - 0
BeefRT/JEMalloc/include/jemalloc/internal/hpa_opts.h

@@ -0,0 +1,74 @@
+#ifndef JEMALLOC_INTERNAL_HPA_OPTS_H
+#define JEMALLOC_INTERNAL_HPA_OPTS_H
+
+#include "jemalloc/internal/fxp.h"
+
+/*
+ * This file is morally part of hpa.h, but is split out for header-ordering
+ * reasons.
+ */
+
+typedef struct hpa_shard_opts_s hpa_shard_opts_t;
+struct hpa_shard_opts_s {
+	/*
+	 * The largest size we'll allocate out of the shard.  For those
+	 * allocations refused, the caller (in practice, the PA module) will
+	 * fall back to the more general (for now) PAC, which can always handle
+	 * any allocation request.
+	 */
+	size_t slab_max_alloc;
+
+	/*
+	 * When the number of active bytes in a hugepage is >=
+	 * hugification_threshold, we force hugify it.
+	 */
+	size_t hugification_threshold;
+
+	/*
+	 * The HPA purges whenever the number of dirty pages exceeds dirty_mult *
+	 * active_pages.  This may be set to (fxp_t)-1 to disable purging.
+	 */
+	fxp_t dirty_mult;
+
+	/*
+	 * Whether or not the PAI methods are allowed to defer work to a
+	 * subsequent hpa_shard_do_deferred_work() call.  Practically, this
+	 * corresponds to background threads being enabled.  We track this
+	 * ourselves for encapsulation purposes.
+	 */
+	bool deferral_allowed;
+
+	/*
+	 * How long a hugepage has to be a hugification candidate before it will
+	 * actually get hugified.
+	 */
+	uint64_t hugify_delay_ms;
+
+	/*
+	 * Minimum amount of time between purges.
+	 */
+	uint64_t min_purge_interval_ms;
+};
+
+#define HPA_SHARD_OPTS_DEFAULT {					\
+	/* slab_max_alloc */						\
+	64 * 1024,							\
+	/* hugification_threshold */					\
+	HUGEPAGE * 95 / 100,						\
+	/* dirty_mult */						\
+	FXP_INIT_PERCENT(25),						\
+	/*								\
+	 * deferral_allowed						\
+	 * 								\
+	 * Really, this is always set by the arena during creation	\
+	 * or by an hpa_shard_set_deferral_allowed call, so the value	\
+	 * we put here doesn't matter.					\
+	 */								\
+	false,								\
+	/* hugify_delay_ms */						\
+	10 * 1000,							\
+	/* min_purge_interval_ms */					\
+	5 * 1000							\
+}
+
+#endif /* JEMALLOC_INTERNAL_HPA_OPTS_H */
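A rough sketch of how dirty_mult combines with the fxp helpers from fxp.h; this paraphrases the comment on dirty_mult rather than reproducing the actual purging logic, and assumes dirty_mult <= 1:

static bool
hpa_should_purge_sketch(size_t nactive, size_t ndirty,
    const hpa_shard_opts_t *opts) {
	if (opts->dirty_mult == (fxp_t)-1) {
		return false;	/* Purging disabled. */
	}
	/* With the default FXP_INIT_PERCENT(25), this is nactive / 4. */
	size_t max_dirty = fxp_mul_frac(nactive, opts->dirty_mult);
	return ndirty > max_dirty;
}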

+ 413 - 0
BeefRT/JEMalloc/include/jemalloc/internal/hpdata.h

@@ -0,0 +1,413 @@
+#ifndef JEMALLOC_INTERNAL_HPDATA_H
+#define JEMALLOC_INTERNAL_HPDATA_H
+
+#include "jemalloc/internal/fb.h"
+#include "jemalloc/internal/ph.h"
+#include "jemalloc/internal/ql.h"
+#include "jemalloc/internal/typed_list.h"
+
+/*
+ * The metadata representation we use for extents in hugepages.  While the PAC
+ * uses the edata_t to represent both active and inactive extents, the HPA only
+ * uses the edata_t for active ones; instead, inactive extent state is tracked
+ * within hpdata associated with the enclosing hugepage-sized, hugepage-aligned
+ * region of virtual address space.
+ *
+ * An hpdata need not be "truly" backed by a hugepage (which is not necessarily
+ * an observable property of any given region of address space).  It's just
+ * hugepage-sized and hugepage-aligned; it's *potentially* huge.
+ */
+typedef struct hpdata_s hpdata_t;
+ph_structs(hpdata_age_heap, hpdata_t);
+struct hpdata_s {
+	/*
+	 * We likewise follow the edata convention of mangling names and forcing
+	 * the use of accessors -- this lets us add some consistency checks on
+	 * access.
+	 */
+
+	/*
+	 * The address of the hugepage in question.  This can't be named h_addr,
+	 * since that conflicts with a macro defined in Windows headers.
+	 */
+	void *h_address;
+	/* Its age (measured in psset operations). */
+	uint64_t h_age;
+	/* Whether or not we think the hugepage is mapped that way by the OS. */
+	bool h_huge;
+
+	/*
+	 * For some properties, we keep parallel sets of bools; h_foo_allowed
+	 * and h_in_psset_foo_container.  This is a decoupling mechanism that
+	 * keeps the hpa (which manages policies) separate from the psset
+	 * (which is the mechanism used to enforce those policies).  This allows
+	 * all the container management logic to live in one place, without the
+	 * HPA needing to know or care how that happens.
+	 */
+
+	/*
+	 * Whether or not the hpdata is allowed to be used to serve allocations,
+	 * and whether or not the psset is currently tracking it as such.
+	 */
+	bool h_alloc_allowed;
+	bool h_in_psset_alloc_container;
+
+	/*
+	 * The same, but with purging.  There's no corresponding
+	 * h_in_psset_purge_container, because the psset (currently) always
+	 * removes hpdatas from their containers during updates (to implement
+	 * LRU for purging).
+	 */
+	bool h_purge_allowed;
+
+	/* And with hugifying. */
+	bool h_hugify_allowed;
+	/* When we became a hugification candidate. */
+	nstime_t h_time_hugify_allowed;
+	bool h_in_psset_hugify_container;
+
+	/* Whether or not a purge or hugify is currently happening. */
+	bool h_mid_purge;
+	bool h_mid_hugify;
+
+	/*
+	 * Whether or not the hpdata is being updated in the psset (i.e. if
+	 * there has been a psset_update_begin call issued without a matching
+	 * psset_update_end call).  Eventually this will expand to other types
+	 * of updates.
+	 */
+	bool h_updating;
+
+	/* Whether or not the hpdata is in a psset. */
+	bool h_in_psset;
+
+	union {
+		/* When nonempty (and also nonfull), used by the psset bins. */
+		hpdata_age_heap_link_t age_link;
+		/*
+		 * When empty (or not corresponding to any hugepage), list
+		 * linkage.
+		 */
+		ql_elm(hpdata_t) ql_link_empty;
+	};
+
+	/*
+	 * Linkage for the psset to track candidates for purging and hugifying.
+	 */
+	ql_elm(hpdata_t) ql_link_purge;
+	ql_elm(hpdata_t) ql_link_hugify;
+
+	/* The length of the largest contiguous sequence of inactive pages. */
+	size_t h_longest_free_range;
+
+	/* Number of active pages. */
+	size_t h_nactive;
+
+	/* A bitmap with bits set in the active pages. */
+	fb_group_t active_pages[FB_NGROUPS(HUGEPAGE_PAGES)];
+
+	/*
+	 * Number of dirty or active pages, and a bitmap tracking them.  One
+	 * way to think of this is as which pages are dirty from the OS's
+	 * perspective.
+	 */
+	size_t h_ntouched;
+
+	/* The touched pages (using the same definition as above). */
+	fb_group_t touched_pages[FB_NGROUPS(HUGEPAGE_PAGES)];
+};
+
+TYPED_LIST(hpdata_empty_list, hpdata_t, ql_link_empty)
+TYPED_LIST(hpdata_purge_list, hpdata_t, ql_link_purge)
+TYPED_LIST(hpdata_hugify_list, hpdata_t, ql_link_hugify)
+
+ph_proto(, hpdata_age_heap, hpdata_t);
+
+static inline void *
+hpdata_addr_get(const hpdata_t *hpdata) {
+	return hpdata->h_address;
+}
+
+static inline void
+hpdata_addr_set(hpdata_t *hpdata, void *addr) {
+	assert(HUGEPAGE_ADDR2BASE(addr) == addr);
+	hpdata->h_address = addr;
+}
+
+static inline uint64_t
+hpdata_age_get(const hpdata_t *hpdata) {
+	return hpdata->h_age;
+}
+
+static inline void
+hpdata_age_set(hpdata_t *hpdata, uint64_t age) {
+	hpdata->h_age = age;
+}
+
+static inline bool
+hpdata_huge_get(const hpdata_t *hpdata) {
+	return hpdata->h_huge;
+}
+
+static inline bool
+hpdata_alloc_allowed_get(const hpdata_t *hpdata) {
+	return hpdata->h_alloc_allowed;
+}
+
+static inline void
+hpdata_alloc_allowed_set(hpdata_t *hpdata, bool alloc_allowed) {
+	hpdata->h_alloc_allowed = alloc_allowed;
+}
+
+static inline bool
+hpdata_in_psset_alloc_container_get(const hpdata_t *hpdata) {
+	return hpdata->h_in_psset_alloc_container;
+}
+
+static inline void
+hpdata_in_psset_alloc_container_set(hpdata_t *hpdata, bool in_container) {
+	assert(in_container != hpdata->h_in_psset_alloc_container);
+	hpdata->h_in_psset_alloc_container = in_container;
+}
+
+static inline bool
+hpdata_purge_allowed_get(const hpdata_t *hpdata) {
+	return hpdata->h_purge_allowed;
+}
+
+static inline void
+hpdata_purge_allowed_set(hpdata_t *hpdata, bool purge_allowed) {
+	assert(purge_allowed == false || !hpdata->h_mid_purge);
+	hpdata->h_purge_allowed = purge_allowed;
+}
+
+static inline bool
+hpdata_hugify_allowed_get(const hpdata_t *hpdata) {
+	return hpdata->h_hugify_allowed;
+}
+
+static inline void
+hpdata_allow_hugify(hpdata_t *hpdata, nstime_t now) {
+	assert(!hpdata->h_mid_hugify);
+	hpdata->h_hugify_allowed = true;
+	hpdata->h_time_hugify_allowed = now;
+}
+
+static inline nstime_t
+hpdata_time_hugify_allowed(hpdata_t *hpdata) {
+	return hpdata->h_time_hugify_allowed;
+}
+
+static inline void
+hpdata_disallow_hugify(hpdata_t *hpdata) {
+	hpdata->h_hugify_allowed = false;
+}
+
+static inline bool
+hpdata_in_psset_hugify_container_get(const hpdata_t *hpdata) {
+	return hpdata->h_in_psset_hugify_container;
+}
+
+static inline void
+hpdata_in_psset_hugify_container_set(hpdata_t *hpdata, bool in_container) {
+	assert(in_container != hpdata->h_in_psset_hugify_container);
+	hpdata->h_in_psset_hugify_container = in_container;
+}
+
+static inline bool
+hpdata_mid_purge_get(const hpdata_t *hpdata) {
+	return hpdata->h_mid_purge;
+}
+
+static inline void
+hpdata_mid_purge_set(hpdata_t *hpdata, bool mid_purge) {
+	assert(mid_purge != hpdata->h_mid_purge);
+	hpdata->h_mid_purge = mid_purge;
+}
+
+static inline bool
+hpdata_mid_hugify_get(const hpdata_t *hpdata) {
+	return hpdata->h_mid_hugify;
+}
+
+static inline void
+hpdata_mid_hugify_set(hpdata_t *hpdata, bool mid_hugify) {
+	assert(mid_hugify != hpdata->h_mid_hugify);
+	hpdata->h_mid_hugify = mid_hugify;
+}
+
+static inline bool
+hpdata_changing_state_get(const hpdata_t *hpdata) {
+	return hpdata->h_mid_purge || hpdata->h_mid_hugify;
+}
+
+
+static inline bool
+hpdata_updating_get(const hpdata_t *hpdata) {
+	return hpdata->h_updating;
+}
+
+static inline void
+hpdata_updating_set(hpdata_t *hpdata, bool updating) {
+	assert(updating != hpdata->h_updating);
+	hpdata->h_updating = updating;
+}
+
+static inline bool
+hpdata_in_psset_get(const hpdata_t *hpdata) {
+	return hpdata->h_in_psset;
+}
+
+static inline void
+hpdata_in_psset_set(hpdata_t *hpdata, bool in_psset) {
+	assert(in_psset != hpdata->h_in_psset);
+	hpdata->h_in_psset = in_psset;
+}
+
+static inline size_t
+hpdata_longest_free_range_get(const hpdata_t *hpdata) {
+	return hpdata->h_longest_free_range;
+}
+
+static inline void
+hpdata_longest_free_range_set(hpdata_t *hpdata, size_t longest_free_range) {
+	assert(longest_free_range <= HUGEPAGE_PAGES);
+	hpdata->h_longest_free_range = longest_free_range;
+}
+
+static inline size_t
+hpdata_nactive_get(hpdata_t *hpdata) {
+	return hpdata->h_nactive;
+}
+
+static inline size_t
+hpdata_ntouched_get(hpdata_t *hpdata) {
+	return hpdata->h_ntouched;
+}
+
+static inline size_t
+hpdata_ndirty_get(hpdata_t *hpdata) {
+	return hpdata->h_ntouched - hpdata->h_nactive;
+}
+
+static inline size_t
+hpdata_nretained_get(hpdata_t *hpdata) {
+	return HUGEPAGE_PAGES - hpdata->h_ntouched;
+}
+
+static inline void
+hpdata_assert_empty(hpdata_t *hpdata) {
+	assert(fb_empty(hpdata->active_pages, HUGEPAGE_PAGES));
+	assert(hpdata->h_nactive == 0);
+}
+
+/*
+ * Only used in tests, and in hpdata_assert_consistent, below.  Verifies some
+ * consistency properties of the hpdata (e.g. that cached counts of page stats
+ * match computed ones).
+ */
+static inline bool
+hpdata_consistent(hpdata_t *hpdata) {
+	if(fb_urange_longest(hpdata->active_pages, HUGEPAGE_PAGES)
+	    != hpdata_longest_free_range_get(hpdata)) {
+		return false;
+	}
+	if (fb_scount(hpdata->active_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES)
+	    != hpdata->h_nactive) {
+		return false;
+	}
+	if (fb_scount(hpdata->touched_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES)
+	    != hpdata->h_ntouched) {
+		return false;
+	}
+	if (hpdata->h_ntouched < hpdata->h_nactive) {
+		return false;
+	}
+	if (hpdata->h_huge && hpdata->h_ntouched != HUGEPAGE_PAGES) {
+		return false;
+	}
+	if (hpdata_changing_state_get(hpdata)
+	    && ((hpdata->h_purge_allowed) || hpdata->h_hugify_allowed)) {
+		return false;
+	}
+	if (hpdata_hugify_allowed_get(hpdata)
+	    != hpdata_in_psset_hugify_container_get(hpdata)) {
+		return false;
+	}
+	return true;
+}
+
+static inline void
+hpdata_assert_consistent(hpdata_t *hpdata) {
+	assert(hpdata_consistent(hpdata));
+}
+
+static inline bool
+hpdata_empty(hpdata_t *hpdata) {
+	return hpdata->h_nactive == 0;
+}
+
+static inline bool
+hpdata_full(hpdata_t *hpdata) {
+	return hpdata->h_nactive == HUGEPAGE_PAGES;
+}
+
+void hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age);
+
+/*
+ * Given an hpdata which can serve an allocation request, pick and reserve an
+ * offset within that allocation.
+ */
+void *hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz);
+void hpdata_unreserve(hpdata_t *hpdata, void *begin, size_t sz);
+
+/*
+ * The hpdata_purge_state_t allows grabbing the metadata required to purge
+ * subranges of a hugepage while holding a lock, dropping the lock during the
+ * actual purging, and reacquiring it to update the metadata again.
+ */
+typedef struct hpdata_purge_state_s hpdata_purge_state_t;
+struct hpdata_purge_state_s {
+	size_t npurged;
+	size_t ndirty_to_purge;
+	fb_group_t to_purge[FB_NGROUPS(HUGEPAGE_PAGES)];
+	size_t next_purge_search_begin;
+};
+
+/*
+ * Initializes purge state.  The access to hpdata must be externally
+ * synchronized with other hpdata_* calls.
+ *
+ * You can tell whether or not a thread is purging or hugifying a given hpdata
+ * via hpdata_changing_state_get(hpdata).  Racing hugification or purging
+ * operations aren't allowed.
+ *
+ * Once you begin purging, you have to follow through and call hpdata_purge_next
+ * until you're done, and then end.  Allocating out of an hpdata undergoing
+ * purging is not allowed.
+ *
+ * Returns the number of dirty pages that will be purged.
+ */
+size_t hpdata_purge_begin(hpdata_t *hpdata, hpdata_purge_state_t *purge_state);
+
+/*
+ * If there are more extents to purge, sets *r_purge_addr and *r_purge_size to
+ * the address and size of the next range to purge, and returns true.
+ * Otherwise, returns false to indicate that we're
+ * done.
+ *
+ * This requires exclusive access to the purge state, but *not* to the hpdata.
+ * In particular, unreserve calls are allowed while purging (i.e. you can dalloc
+ * into one part of the hpdata while purging a different part).
+ */
+bool hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state,
+    void **r_purge_addr, size_t *r_purge_size);
+/*
+ * Updates the hpdata metadata after all purging is done.  Needs external
+ * synchronization.
+ */
+void hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state);
+
+void hpdata_hugify(hpdata_t *hpdata);
+void hpdata_dehugify(hpdata_t *hpdata);
+
+#endif /* JEMALLOC_INTERNAL_HPDATA_H */
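The begin/next/end purge protocol described above is easiest to see as a loop. The sketch below elides locking, and purge_range() is a hypothetical stand-in for whatever actually returns pages to the OS (e.g. an hpa purge hook):

static void
hpdata_purge_sketch(hpdata_t *hpdata, void (*purge_range)(void *, size_t)) {
	hpdata_purge_state_t purge_state;

	size_t ndirty_to_purge = hpdata_purge_begin(hpdata, &purge_state);
	(void)ndirty_to_purge;

	void *purge_addr;
	size_t purge_size;
	while (hpdata_purge_next(hpdata, &purge_state, &purge_addr,
	    &purge_size)) {
		purge_range(purge_addr, purge_size);
	}
	hpdata_purge_end(hpdata, &purge_state);
}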

+ 40 - 0
BeefRT/JEMalloc/include/jemalloc/internal/inspect.h

@@ -0,0 +1,40 @@
+#ifndef JEMALLOC_INTERNAL_INSPECT_H
+#define JEMALLOC_INTERNAL_INSPECT_H
+
+/*
+ * This module contains the heap introspection capabilities.  For now they are
+ * exposed purely through mallctl APIs in the experimental namespace, but this
+ * may change over time.
+ */
+
+/*
+ * The following two structs are for experimental purposes. See
+ * experimental_utilization_query_ctl and
+ * experimental_utilization_batch_query_ctl in src/ctl.c.
+ */
+typedef struct inspect_extent_util_stats_s inspect_extent_util_stats_t;
+struct inspect_extent_util_stats_s {
+	size_t nfree;
+	size_t nregs;
+	size_t size;
+};
+
+typedef struct inspect_extent_util_stats_verbose_s
+    inspect_extent_util_stats_verbose_t;
+
+struct inspect_extent_util_stats_verbose_s {
+	void *slabcur_addr;
+	size_t nfree;
+	size_t nregs;
+	size_t size;
+	size_t bin_nfree;
+	size_t bin_nregs;
+};
+
+void inspect_extent_util_stats_get(tsdn_t *tsdn, const void *ptr,
+    size_t *nfree, size_t *nregs, size_t *size);
+void inspect_extent_util_stats_verbose_get(tsdn_t *tsdn, const void *ptr,
+    size_t *nfree, size_t *nregs, size_t *size,
+    size_t *bin_nfree, size_t *bin_nregs, void **slabcur_addr);
+
+#endif /* JEMALLOC_INTERNAL_INSPECT_H */
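A small sketch of consuming the first struct above, assuming nregs counts all regions in the extent and nfree the currently unused ones:

static inline unsigned
inspect_extent_util_percent(const inspect_extent_util_stats_t *stats) {
	if (stats->nregs == 0) {
		return 0;
	}
	/* Regions in use, as a percentage of all regions in the extent. */
	return (unsigned)(((stats->nregs - stats->nfree) * 100) / stats->nregs);
}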

+ 108 - 0
BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_decls.h

@@ -0,0 +1,108 @@
+#ifndef JEMALLOC_INTERNAL_DECLS_H
+#define JEMALLOC_INTERNAL_DECLS_H
+
+#include <math.h>
+#ifdef _WIN32
+#  include <windows.h>
+#  include "msvc_compat/windows_extra.h"
+#  include "msvc_compat/strings.h"
+#  ifdef _WIN64
+#    if LG_VADDR <= 32
+#      error Generate the headers using x64 vcargs
+#    endif
+#  else
+#    if LG_VADDR > 32
+#      undef LG_VADDR
+#      define LG_VADDR 32
+#    endif
+#  endif
+#else
+#  include <sys/param.h>
+#  include <sys/mman.h>
+#  if !defined(__pnacl__) && !defined(__native_client__)
+#    include <sys/syscall.h>
+#    if !defined(SYS_write) && defined(__NR_write)
+#      define SYS_write __NR_write
+#    endif
+#    if defined(SYS_open) && defined(__aarch64__)
+       /* Android headers may define SYS_open to __NR_open even though
+        * __NR_open may not exist on AArch64 (superseded by __NR_openat). */
+#      undef SYS_open
+#    endif
+#    include <sys/uio.h>
+#  endif
+#  include <pthread.h>
+#  if defined(__FreeBSD__) || defined(__DragonFly__)
+#  include <pthread_np.h>
+#  include <sched.h>
+#  if defined(__FreeBSD__)
+#    define cpu_set_t cpuset_t
+#  endif
+#  endif
+#  include <signal.h>
+#  ifdef JEMALLOC_OS_UNFAIR_LOCK
+#    include <os/lock.h>
+#  endif
+#  ifdef JEMALLOC_GLIBC_MALLOC_HOOK
+#    include <sched.h>
+#  endif
+#  include <errno.h>
+#  include <sys/time.h>
+#  include <time.h>
+#  ifdef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME
+#    include <mach/mach_time.h>
+#  endif
+#endif
+#include <sys/types.h>
+
+#include <limits.h>
+#ifndef SIZE_T_MAX
+#  define SIZE_T_MAX	SIZE_MAX
+#endif
+#ifndef SSIZE_MAX
+#  define SSIZE_MAX	((ssize_t)(SIZE_T_MAX >> 1))
+#endif
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stddef.h>
+#ifndef offsetof
+#  define offsetof(type, member)	((size_t)&(((type *)NULL)->member))
+#endif
+#include <string.h>
+#include <strings.h>
+#include <ctype.h>
+#ifdef _MSC_VER
+#  include <io.h>
+typedef intptr_t ssize_t;
+#  define PATH_MAX 1024
+#  define STDERR_FILENO 2
+#  define __func__ __FUNCTION__
+#  ifdef JEMALLOC_HAS_RESTRICT
+#    define restrict __restrict
+#  endif
+/* Disable warnings about deprecated system functions. */
+#  pragma warning(disable: 4996)
+#if _MSC_VER < 1800
+static int
+isblank(int c) {
+	return (c == '\t' || c == ' ');
+}
+#endif
+#else
+#  include <unistd.h>
+#endif
+#include <fcntl.h>
+
+/*
+ * The Win32 midl compiler has #define small char; we don't use midl, but
+ * "small" is a nice identifier to have available when talking about size
+ * classes.
+ */
+#ifdef small
+#  undef small
+#endif
+
+#endif /* JEMALLOC_INTERNAL_DECLS_H */

+ 428 - 0
BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_defs.h

@@ -0,0 +1,428 @@
+/* include/jemalloc/internal/jemalloc_internal_defs.h.  Generated from jemalloc_internal_defs.h.in by configure.  */
+#ifndef JEMALLOC_INTERNAL_DEFS_H_
+#define JEMALLOC_INTERNAL_DEFS_H_
+/*
+ * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all
+ * public APIs to be prefixed.  This makes it possible, with some care, to use
+ * multiple allocators simultaneously.
+ */
+#define JEMALLOC_PREFIX "je_"
+#define JEMALLOC_CPREFIX "JE_"
+
+/*
+ * Define overrides for non-standard allocator-related functions if they are
+ * present on the system.
+ */
+/* #undef JEMALLOC_OVERRIDE___LIBC_CALLOC */
+/* #undef JEMALLOC_OVERRIDE___LIBC_FREE */
+/* #undef JEMALLOC_OVERRIDE___LIBC_MALLOC */
+/* #undef JEMALLOC_OVERRIDE___LIBC_MEMALIGN */
+/* #undef JEMALLOC_OVERRIDE___LIBC_REALLOC */
+/* #undef JEMALLOC_OVERRIDE___LIBC_VALLOC */
+/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */
+
+/*
+ * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs.
+ * For shared libraries, symbol visibility mechanisms prevent these symbols
+ * from being exported, but for static libraries, naming collisions are a real
+ * possibility.
+ */
+#define JEMALLOC_PRIVATE_NAMESPACE je_
+
+/*
+ * Hyper-threaded CPUs may need a special instruction inside spin loops in
+ * order to yield to another virtual CPU.
+ */
+#define CPU_SPINWAIT _mm_pause()
+/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */
+#define HAVE_CPU_SPINWAIT 1
+
+/*
+ * Number of significant bits in virtual addresses.  This may be less than the
+ * total number of bits in a pointer, e.g. on x64, for which the uppermost 16
+ * bits are the same as bit 47.
+ */
+#define LG_VADDR 48
+
+/* Defined if C11 atomics are available. */
+/* #undef JEMALLOC_C11_ATOMICS */
+
+/* Defined if GCC __atomic atomics are available. */
+/* #undef JEMALLOC_GCC_ATOMIC_ATOMICS */
+/* and the 8-bit variant support. */
+/* #undef JEMALLOC_GCC_U8_ATOMIC_ATOMICS */
+
+/* Defined if GCC __sync atomics are available. */
+/* #undef JEMALLOC_GCC_SYNC_ATOMICS */
+/* and the 8-bit variant support. */
+/* #undef JEMALLOC_GCC_U8_SYNC_ATOMICS */
+
+/*
+ * Defined if __builtin_clz() and __builtin_clzl() are available.
+ */
+/* #undef JEMALLOC_HAVE_BUILTIN_CLZ */
+
+/*
+ * Defined if os_unfair_lock_*() functions are available, as provided by Darwin.
+ */
+/* #undef JEMALLOC_OS_UNFAIR_LOCK */
+
+/* Defined if syscall(2) is usable. */
+/* #undef JEMALLOC_USE_SYSCALL */
+
+/*
+ * Defined if secure_getenv(3) is available.
+ */
+/* #undef JEMALLOC_HAVE_SECURE_GETENV */
+
+/*
+ * Defined if issetugid(2) is available.
+ */
+/* #undef JEMALLOC_HAVE_ISSETUGID */
+
+/* Defined if pthread_atfork(3) is available. */
+/* #undef JEMALLOC_HAVE_PTHREAD_ATFORK */
+
+/* Defined if pthread_setname_np(3) is available. */
+/* #undef JEMALLOC_HAVE_PTHREAD_SETNAME_NP */
+
+/* Defined if pthread_getname_np(3) is available. */
+/* #undef JEMALLOC_HAVE_PTHREAD_GETNAME_NP */
+
+/* Defined if pthread_get_name_np(3) is available. */
+/* #undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP */
+
+/*
+ * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available.
+ */
+/* #undef JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE */
+
+/*
+ * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available.
+ */
+/* #undef JEMALLOC_HAVE_CLOCK_MONOTONIC */
+
+/*
+ * Defined if mach_absolute_time() is available.
+ */
+/* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */
+
+/*
+ * Defined if clock_gettime(CLOCK_REALTIME, ...) is available.
+ */
+/* #undef JEMALLOC_HAVE_CLOCK_REALTIME */
+
+/*
+ * Defined if _malloc_thread_cleanup() exists.  At least in the case of
+ * FreeBSD, pthread_key_create() allocates, which if used during malloc
+ * bootstrapping will cause recursion into the pthreads library.  Therefore, if
+ * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in
+ * malloc_tsd.
+ */
+/* #undef JEMALLOC_MALLOC_THREAD_CLEANUP */
+
+/*
+ * Defined if threaded initialization is known to be safe on this platform.
+ * Among other things, it must be possible to initialize a mutex without
+ * triggering allocation in order for threaded allocation to be safe.
+ */
+/* #undef JEMALLOC_THREADED_INIT */
+
+/*
+ * Defined if the pthreads implementation defines
+ * _pthread_mutex_init_calloc_cb(), in which case the function is used in order
+ * to avoid recursive allocation during mutex initialization.
+ */
+/* #undef JEMALLOC_MUTEX_INIT_CB */
+
+/* Non-empty if the tls_model attribute is supported. */
+#define JEMALLOC_TLS_MODEL 
+
+/*
+ * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables
+ * inline functions.
+ */
+/* #undef JEMALLOC_DEBUG */
+
+/* JEMALLOC_STATS enables statistics calculation. */
+#define JEMALLOC_STATS 
+
+/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */
+/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */
+
+/* JEMALLOC_PROF enables allocation profiling. */
+/* #undef JEMALLOC_PROF */
+
+/* Use libunwind for profile backtracing if defined. */
+/* #undef JEMALLOC_PROF_LIBUNWIND */
+
+/* Use libgcc for profile backtracing if defined. */
+/* #undef JEMALLOC_PROF_LIBGCC */
+
+/* Use gcc intrinsics for profile backtracing if defined. */
+/* #undef JEMALLOC_PROF_GCC */
+
+/*
+ * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage
+ * segment (DSS).
+ */
+/* #undef JEMALLOC_DSS */
+
+/* Support memory filling (junk/zero). */
+#define JEMALLOC_FILL 
+
+/* Support utrace(2)-based tracing. */
+/* #undef JEMALLOC_UTRACE */
+
+/* Support utrace(2)-based tracing (label based signature). */
+/* #undef JEMALLOC_UTRACE_LABEL */
+
+/* Support optional abort() on OOM. */
+/* #undef JEMALLOC_XMALLOC */
+
+/* Support lazy locking (avoid locking unless a second thread is launched). */
+/* #undef JEMALLOC_LAZY_LOCK */
+
+/*
+ * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size
+ * classes).
+ */
+/* #undef LG_QUANTUM */
+
+/* One page is 2^LG_PAGE bytes. */
+#define LG_PAGE 12
+
+/* Maximum number of regions in a slab. */
+/* #undef CONFIG_LG_SLAB_MAXREGS */
+
+/*
+ * One huge page is 2^LG_HUGEPAGE bytes.  Note that this is defined even if the
+ * system does not explicitly support huge pages; system calls that require
+ * explicit huge page support are separately configured.
+ */
+#define LG_HUGEPAGE 21
+
+/*
+ * If defined, adjacent virtual memory mappings with identical attributes
+ * automatically coalesce, and they fragment when changes are made to subranges.
+ * This is the normal order of things for mmap()/munmap(), but on Windows
+ * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e.
+ * mappings do *not* coalesce/fragment.
+ */
+/* #undef JEMALLOC_MAPS_COALESCE */
+
+/*
+ * If defined, retain memory for later reuse by default rather than using e.g.
+ * munmap() to unmap freed extents.  This is enabled on 64-bit Linux because
+ * common sequences of mmap()/munmap() calls will cause virtual memory map
+ * holes.
+ */
+/* #undef JEMALLOC_RETAIN */
+
+/* TLS is used to map arenas and magazine caches to threads. */
+/* #undef JEMALLOC_TLS */
+
+/*
+ * Used to mark unreachable code to quiet "end of non-void" compiler warnings.
+ * Don't use this directly; instead use unreachable() from util.h
+ */
+#define JEMALLOC_INTERNAL_UNREACHABLE abort
+
+/*
+ * ffs*() functions to use for bitmapping.  Don't use these directly; instead,
+ * use ffs_*() from util.h.
+ */
+#define JEMALLOC_INTERNAL_FFSLL ffsll
+#define JEMALLOC_INTERNAL_FFSL ffsl
+#define JEMALLOC_INTERNAL_FFS ffs
+
+/*
+ * popcount*() functions to use for bitmapping.
+ */
+/* #undef JEMALLOC_INTERNAL_POPCOUNTL */
+/* #undef JEMALLOC_INTERNAL_POPCOUNT */
+
+/*
+ * If defined, explicitly attempt to more uniformly distribute large allocation
+ * pointer alignments across all cache indices.
+ */
+#define JEMALLOC_CACHE_OBLIVIOUS 
+
+/*
+ * If defined, enable logging facilities.  We make this a configure option to
+ * avoid taking extra branches everywhere.
+ */
+/* #undef JEMALLOC_LOG */
+
+/*
+ * If defined, use readlinkat() (instead of readlink()) to follow
+ * /etc/malloc_conf.
+ */
+/* #undef JEMALLOC_READLINKAT */
+
+/*
+ * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
+ */
+/* #undef JEMALLOC_ZONE */
+
+/*
+ * Methods for determining whether the OS overcommits.
+ * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's
+ *                                         /proc/sys/vm/overcommit_memory file.
+ * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl.
+ */
+/* #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT */
+/* #undef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY */
+
+/* Defined if madvise(2) is available. */
+/* #undef JEMALLOC_HAVE_MADVISE */
+
+/*
+ * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE
+ * arguments to madvise(2).
+ */
+/* #undef JEMALLOC_HAVE_MADVISE_HUGE */
+
+/*
+ * Methods for purging unused pages differ between operating systems.
+ *
+ *   madvise(..., MADV_FREE) : This marks pages as being unused, such that they
+ *                             will be discarded rather than swapped out.
+ *   madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is
+ *                                 defined, this immediately discards pages,
+ *                                 such that new pages will be demand-zeroed if
+ *                                 the address region is later touched;
+ *                                 otherwise this behaves similarly to
+ *                                 MADV_FREE, though typically with higher
+ *                                 system overhead.
+ */
+/* #undef JEMALLOC_PURGE_MADVISE_FREE */
+/* #undef JEMALLOC_PURGE_MADVISE_DONTNEED */
+/* #undef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS */
+
+/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */
+/* #undef JEMALLOC_DEFINE_MADVISE_FREE */
+
+/*
+ * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise.
+ */
+/* #undef JEMALLOC_MADVISE_DONTDUMP */
+
+/*
+ * Defined if MADV_[NO]CORE is supported as an argument to madvise.
+ */
+/* #undef JEMALLOC_MADVISE_NOCORE */
+
+/* Defined if mprotect(2) is available. */
+/* #undef JEMALLOC_HAVE_MPROTECT */
+
+/*
+ * Defined if transparent huge pages (THPs) are supported via the
+ * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled.
+ */
+/* #undef JEMALLOC_THP */
+
+/* Defined if posix_madvise is available. */
+/* #undef JEMALLOC_HAVE_POSIX_MADVISE */
+
+/*
+ * Method for purging unused pages using posix_madvise.
+ *
+ *   posix_madvise(..., POSIX_MADV_DONTNEED)
+ */
+/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */
+/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */
+
+/*
+ * Defined if memcntl page admin call is supported
+ */
+/* #undef JEMALLOC_HAVE_MEMCNTL */
+
+/*
+ * Defined if malloc_size is supported
+ */
+/* #undef JEMALLOC_HAVE_MALLOC_SIZE */
+
+/* Define if operating system has alloca.h header. */
+/* #undef JEMALLOC_HAS_ALLOCA_H */
+
+/* C99 restrict keyword supported. */
+/* #undef JEMALLOC_HAS_RESTRICT */
+
+/* For use by hash code. */
+/* #undef JEMALLOC_BIG_ENDIAN */
+
+/* sizeof(int) == 2^LG_SIZEOF_INT. */
+#define LG_SIZEOF_INT 2
+
+/* sizeof(long) == 2^LG_SIZEOF_LONG. */
+#define LG_SIZEOF_LONG 2
+
+/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */
+#define LG_SIZEOF_LONG_LONG 3
+
+/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */
+#define LG_SIZEOF_INTMAX_T 3
+
+/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */
+/* #undef JEMALLOC_GLIBC_MALLOC_HOOK */
+
+/* glibc memalign hook. */
+/* #undef JEMALLOC_GLIBC_MEMALIGN_HOOK */
+
+/* pthread support */
+/* #undef JEMALLOC_HAVE_PTHREAD */
+
+/* dlsym() support */
+/* #undef JEMALLOC_HAVE_DLSYM */
+
+/* Adaptive mutex support in pthreads. */
+/* #undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP */
+
+/* GNU specific sched_getcpu support */
+/* #undef JEMALLOC_HAVE_SCHED_GETCPU */
+
+/* GNU specific sched_setaffinity support */
+/* #undef JEMALLOC_HAVE_SCHED_SETAFFINITY */
+
+/*
+ * If defined, all the features necessary for background threads are present.
+ */
+/* #undef JEMALLOC_BACKGROUND_THREAD */
+
+/*
+ * If defined, jemalloc symbols are not exported (doesn't work when
+ * JEMALLOC_PREFIX is not defined).
+ */
+/* #undef JEMALLOC_EXPORT */
+
+/* config.malloc_conf options string. */
+#define JEMALLOC_CONFIG_MALLOC_CONF ""
+
+/* If defined, jemalloc takes the malloc/free/etc. symbol names. */
+/* #undef JEMALLOC_IS_MALLOC */
+
+/*
+ * Defined if strerror_r returns char * if _GNU_SOURCE is defined.
+ */
+/* #undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE */
+
+/* Performs additional safety checks when defined. */
+/* #undef JEMALLOC_OPT_SAFETY_CHECKS */
+
+/* Is C++ support being built? */
+/* #undef JEMALLOC_ENABLE_CXX */
+
+/* Performs additional size checks when defined. */
+/* #undef JEMALLOC_OPT_SIZE_CHECKS */
+
+/* Allows sampled junk and stash for checking use-after-free when defined. */
+/* #undef JEMALLOC_UAF_DETECTION */
+
+/* Darwin VM_MAKE_TAG support */
+/* #undef JEMALLOC_HAVE_VM_MAKE_TAG */
+
+/* If defined, realloc(ptr, 0) defaults to "free" instead of "alloc". */
+#define JEMALLOC_ZERO_REALLOC_DEFAULT_FREE 
+
+#endif /* JEMALLOC_INTERNAL_DEFS_H_ */
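For orientation, a worked example of what the page-size values above imply for this configuration; the EXAMPLE_* names are illustrative only (the real PAGE/HUGEPAGE/HUGEPAGE_PAGES macros are derived elsewhere from these settings):

#define EXAMPLE_PAGE		((size_t)1 << 12)	/* LG_PAGE = 12     -> 4 KiB  */
#define EXAMPLE_HUGEPAGE	((size_t)1 << 21)	/* LG_HUGEPAGE = 21 -> 2 MiB  */
#define EXAMPLE_HUGEPAGE_PAGES	(EXAMPLE_HUGEPAGE / EXAMPLE_PAGE)	/* 512 pages, the
	bitmap size used by the hpdata_t active/touched page maps above. */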

+ 427 - 0
BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_defs.h.in

@@ -0,0 +1,427 @@
+#ifndef JEMALLOC_INTERNAL_DEFS_H_
+#define JEMALLOC_INTERNAL_DEFS_H_
+/*
+ * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all
+ * public APIs to be prefixed.  This makes it possible, with some care, to use
+ * multiple allocators simultaneously.
+ */
+#undef JEMALLOC_PREFIX
+#undef JEMALLOC_CPREFIX
+
+/*
+ * Define overrides for non-standard allocator-related functions if they are
+ * present on the system.
+ */
+#undef JEMALLOC_OVERRIDE___LIBC_CALLOC
+#undef JEMALLOC_OVERRIDE___LIBC_FREE
+#undef JEMALLOC_OVERRIDE___LIBC_MALLOC
+#undef JEMALLOC_OVERRIDE___LIBC_MEMALIGN
+#undef JEMALLOC_OVERRIDE___LIBC_REALLOC
+#undef JEMALLOC_OVERRIDE___LIBC_VALLOC
+#undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN
+
+/*
+ * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs.
+ * For shared libraries, symbol visibility mechanisms prevent these symbols
+ * from being exported, but for static libraries, naming collisions are a real
+ * possibility.
+ */
+#undef JEMALLOC_PRIVATE_NAMESPACE
+
+/*
+ * Hyper-threaded CPUs may need a special instruction inside spin loops in
+ * order to yield to another virtual CPU.
+ */
+#undef CPU_SPINWAIT
+/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */
+#undef HAVE_CPU_SPINWAIT
+
+/*
+ * Number of significant bits in virtual addresses.  This may be less than the
+ * total number of bits in a pointer, e.g. on x64, for which the uppermost 16
+ * bits are the same as bit 47.
+ */
+#undef LG_VADDR
+
+/* Defined if C11 atomics are available. */
+#undef JEMALLOC_C11_ATOMICS
+
+/* Defined if GCC __atomic atomics are available. */
+#undef JEMALLOC_GCC_ATOMIC_ATOMICS
+/* and the 8-bit variant support. */
+#undef JEMALLOC_GCC_U8_ATOMIC_ATOMICS
+
+/* Defined if GCC __sync atomics are available. */
+#undef JEMALLOC_GCC_SYNC_ATOMICS
+/* and the 8-bit variant support. */
+#undef JEMALLOC_GCC_U8_SYNC_ATOMICS
+
+/*
+ * Defined if __builtin_clz() and __builtin_clzl() are available.
+ */
+#undef JEMALLOC_HAVE_BUILTIN_CLZ
+
+/*
+ * Defined if os_unfair_lock_*() functions are available, as provided by Darwin.
+ */
+#undef JEMALLOC_OS_UNFAIR_LOCK
+
+/* Defined if syscall(2) is usable. */
+#undef JEMALLOC_USE_SYSCALL
+
+/*
+ * Defined if secure_getenv(3) is available.
+ */
+#undef JEMALLOC_HAVE_SECURE_GETENV
+
+/*
+ * Defined if issetugid(2) is available.
+ */
+#undef JEMALLOC_HAVE_ISSETUGID
+
+/* Defined if pthread_atfork(3) is available. */
+#undef JEMALLOC_HAVE_PTHREAD_ATFORK
+
+/* Defined if pthread_setname_np(3) is available. */
+#undef JEMALLOC_HAVE_PTHREAD_SETNAME_NP
+
+/* Defined if pthread_getname_np(3) is available. */
+#undef JEMALLOC_HAVE_PTHREAD_GETNAME_NP
+
+/* Defined if pthread_get_name_np(3) is available. */
+#undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP
+
+/*
+ * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available.
+ */
+#undef JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE
+
+/*
+ * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available.
+ */
+#undef JEMALLOC_HAVE_CLOCK_MONOTONIC
+
+/*
+ * Defined if mach_absolute_time() is available.
+ */
+#undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME
+
+/*
+ * Defined if clock_gettime(CLOCK_REALTIME, ...) is available.
+ */
+#undef JEMALLOC_HAVE_CLOCK_REALTIME
+
+/*
+ * Defined if _malloc_thread_cleanup() exists.  At least in the case of
+ * FreeBSD, pthread_key_create() allocates, which if used during malloc
+ * bootstrapping will cause recursion into the pthreads library.  Therefore, if
+ * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in
+ * malloc_tsd.
+ */
+#undef JEMALLOC_MALLOC_THREAD_CLEANUP
+
+/*
+ * Defined if threaded initialization is known to be safe on this platform.
+ * Among other things, it must be possible to initialize a mutex without
+ * triggering allocation in order for threaded allocation to be safe.
+ */
+#undef JEMALLOC_THREADED_INIT
+
+/*
+ * Defined if the pthreads implementation defines
+ * _pthread_mutex_init_calloc_cb(), in which case the function is used in order
+ * to avoid recursive allocation during mutex initialization.
+ */
+#undef JEMALLOC_MUTEX_INIT_CB
+
+/* Non-empty if the tls_model attribute is supported. */
+#undef JEMALLOC_TLS_MODEL
+
+/*
+ * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables
+ * inline functions.
+ */
+#undef JEMALLOC_DEBUG
+
+/* JEMALLOC_STATS enables statistics calculation. */
+#undef JEMALLOC_STATS
+
+/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */
+#undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API
+
+/* JEMALLOC_PROF enables allocation profiling. */
+#undef JEMALLOC_PROF
+
+/* Use libunwind for profile backtracing if defined. */
+#undef JEMALLOC_PROF_LIBUNWIND
+
+/* Use libgcc for profile backtracing if defined. */
+#undef JEMALLOC_PROF_LIBGCC
+
+/* Use gcc intrinsics for profile backtracing if defined. */
+#undef JEMALLOC_PROF_GCC
+
+/*
+ * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage
+ * segment (DSS).
+ */
+#undef JEMALLOC_DSS
+
+/* Support memory filling (junk/zero). */
+#undef JEMALLOC_FILL
+
+/* Support utrace(2)-based tracing. */
+#undef JEMALLOC_UTRACE
+
+/* Support utrace(2)-based tracing (label based signature). */
+#undef JEMALLOC_UTRACE_LABEL
+
+/* Support optional abort() on OOM. */
+#undef JEMALLOC_XMALLOC
+
+/* Support lazy locking (avoid locking unless a second thread is launched). */
+#undef JEMALLOC_LAZY_LOCK
+
+/*
+ * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size
+ * classes).
+ */
+#undef LG_QUANTUM
+
+/* One page is 2^LG_PAGE bytes. */
+#undef LG_PAGE
+
+/* Maximum number of regions in a slab. */
+#undef CONFIG_LG_SLAB_MAXREGS
+
+/*
+ * One huge page is 2^LG_HUGEPAGE bytes.  Note that this is defined even if the
+ * system does not explicitly support huge pages; system calls that require
+ * explicit huge page support are separately configured.
+ */
+#undef LG_HUGEPAGE
+
+/*
+ * If defined, adjacent virtual memory mappings with identical attributes
+ * automatically coalesce, and they fragment when changes are made to subranges.
+ * This is the normal order of things for mmap()/munmap(), but on Windows
+ * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e.
+ * mappings do *not* coalesce/fragment.
+ */
+#undef JEMALLOC_MAPS_COALESCE
+
+/*
+ * If defined, retain memory for later reuse by default rather than using e.g.
+ * munmap() to unmap freed extents.  This is enabled on 64-bit Linux because
+ * common sequences of mmap()/munmap() calls will cause virtual memory map
+ * holes.
+ */
+#undef JEMALLOC_RETAIN
+
+/* TLS is used to map arenas and magazine caches to threads. */
+#undef JEMALLOC_TLS
+
+/*
+ * Used to mark unreachable code to quiet "end of non-void" compiler warnings.
+ * Don't use this directly; instead use unreachable() from util.h
+ */
+#undef JEMALLOC_INTERNAL_UNREACHABLE
+
+/*
+ * ffs*() functions to use for bitmapping.  Don't use these directly; instead,
+ * use ffs_*() from util.h.
+ */
+#undef JEMALLOC_INTERNAL_FFSLL
+#undef JEMALLOC_INTERNAL_FFSL
+#undef JEMALLOC_INTERNAL_FFS
+
+/*
+ * popcount*() functions to use for bitmapping.
+ */
+#undef JEMALLOC_INTERNAL_POPCOUNTL
+#undef JEMALLOC_INTERNAL_POPCOUNT
+
+/*
+ * If defined, explicitly attempt to more uniformly distribute large allocation
+ * pointer alignments across all cache indices.
+ */
+#undef JEMALLOC_CACHE_OBLIVIOUS
+
+/*
+ * If defined, enable logging facilities.  We make this a configure option to
+ * avoid taking extra branches everywhere.
+ */
+#undef JEMALLOC_LOG
+
+/*
+ * If defined, use readlinkat() (instead of readlink()) to follow
+ * /etc/malloc_conf.
+ */
+#undef JEMALLOC_READLINKAT
+
+/*
+ * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
+ */
+#undef JEMALLOC_ZONE
+
+/*
+ * Methods for determining whether the OS overcommits.
+ * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's
+ *                                         /proc/sys/vm/overcommit_memory file.
+ * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl.
+ */
+#undef JEMALLOC_SYSCTL_VM_OVERCOMMIT
+#undef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY
+
+/* Defined if madvise(2) is available. */
+#undef JEMALLOC_HAVE_MADVISE
+
+/*
+ * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE
+ * arguments to madvise(2).
+ */
+#undef JEMALLOC_HAVE_MADVISE_HUGE
+
+/*
+ * Methods for purging unused pages differ between operating systems.
+ *
+ *   madvise(..., MADV_FREE) : This marks pages as being unused, such that they
+ *                             will be discarded rather than swapped out.
+ *   madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is
+ *                                 defined, this immediately discards pages,
+ *                                 such that new pages will be demand-zeroed if
+ *                                 the address region is later touched;
+ *                                 otherwise this behaves similarly to
+ *                                 MADV_FREE, though typically with higher
+ *                                 system overhead.
+ */
+#undef JEMALLOC_PURGE_MADVISE_FREE
+#undef JEMALLOC_PURGE_MADVISE_DONTNEED
+#undef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS
+
+/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */
+#undef JEMALLOC_DEFINE_MADVISE_FREE
+
+/*
+ * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise.
+ */
+#undef JEMALLOC_MADVISE_DONTDUMP
+
+/*
+ * Defined if MADV_[NO]CORE is supported as an argument to madvise.
+ */
+#undef JEMALLOC_MADVISE_NOCORE
+
+/* Defined if mprotect(2) is available. */
+#undef JEMALLOC_HAVE_MPROTECT
+
+/*
+ * Defined if transparent huge pages (THPs) are supported via the
+ * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled.
+ */
+#undef JEMALLOC_THP
+
+/* Defined if posix_madvise is available. */
+#undef JEMALLOC_HAVE_POSIX_MADVISE
+
+/*
+ * Method for purging unused pages using posix_madvise.
+ *
+ *   posix_madvise(..., POSIX_MADV_DONTNEED)
+ */
+#undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED
+#undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS
+
+/*
+ * Defined if memcntl page admin call is supported
+ */
+#undef JEMALLOC_HAVE_MEMCNTL
+
+/*
+ * Defined if malloc_size is supported
+ */
+#undef JEMALLOC_HAVE_MALLOC_SIZE
+
+/* Define if operating system has alloca.h header. */
+#undef JEMALLOC_HAS_ALLOCA_H
+
+/* C99 restrict keyword supported. */
+#undef JEMALLOC_HAS_RESTRICT
+
+/* For use by hash code. */
+#undef JEMALLOC_BIG_ENDIAN
+
+/* sizeof(int) == 2^LG_SIZEOF_INT. */
+#undef LG_SIZEOF_INT
+
+/* sizeof(long) == 2^LG_SIZEOF_LONG. */
+#undef LG_SIZEOF_LONG
+
+/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */
+#undef LG_SIZEOF_LONG_LONG
+
+/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */
+#undef LG_SIZEOF_INTMAX_T
+
+/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */
+#undef JEMALLOC_GLIBC_MALLOC_HOOK
+
+/* glibc memalign hook. */
+#undef JEMALLOC_GLIBC_MEMALIGN_HOOK
+
+/* pthread support */
+#undef JEMALLOC_HAVE_PTHREAD
+
+/* dlsym() support */
+#undef JEMALLOC_HAVE_DLSYM
+
+/* Adaptive mutex support in pthreads. */
+#undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP
+
+/* GNU specific sched_getcpu support */
+#undef JEMALLOC_HAVE_SCHED_GETCPU
+
+/* GNU specific sched_setaffinity support */
+#undef JEMALLOC_HAVE_SCHED_SETAFFINITY
+
+/*
+ * If defined, all the features necessary for background threads are present.
+ */
+#undef JEMALLOC_BACKGROUND_THREAD
+
+/*
+ * If defined, jemalloc symbols are not exported (doesn't work when
+ * JEMALLOC_PREFIX is not defined).
+ */
+#undef JEMALLOC_EXPORT
+
+/* config.malloc_conf options string. */
+#undef JEMALLOC_CONFIG_MALLOC_CONF
+
+/* If defined, jemalloc takes the malloc/free/etc. symbol names. */
+#undef JEMALLOC_IS_MALLOC
+
+/*
+ * Defined if strerror_r returns char * if _GNU_SOURCE is defined.
+ */
+#undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE
+
+/* Performs additional safety checks when defined. */
+#undef JEMALLOC_OPT_SAFETY_CHECKS
+
+/* Is C++ support being built? */
+#undef JEMALLOC_ENABLE_CXX
+
+/* Performs additional size checks when defined. */
+#undef JEMALLOC_OPT_SIZE_CHECKS
+
+/* Allows sampled junk and stash for checking use-after-free when defined. */
+#undef JEMALLOC_UAF_DETECTION
+
+/* Darwin VM_MAKE_TAG support */
+#undef JEMALLOC_HAVE_VM_MAKE_TAG
+
+/* If defined, realloc(ptr, 0) defaults to "free" instead of "alloc". */
+#undef JEMALLOC_ZERO_REALLOC_DEFAULT_FREE
+
+#endif /* JEMALLOC_INTERNAL_DEFS_H_ */
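The purge block above (JEMALLOC_PURGE_MADVISE_FREE / _DONTNEED / _DONTNEED_ZEROS) is a good illustration of how code elsewhere consumes this generated header. A minimal sketch of the selection logic, assuming only these three defines are consulted (the real page-purging code also covers posix_madvise and Windows):

#include <stdbool.h>
#include <stddef.h>
#include <sys/mman.h>

/* Hedged sketch: lazy purge keyed off the configure-generated defines. */
static bool
pages_purge_lazy_sketch(void *addr, size_t size) {
#if defined(JEMALLOC_PURGE_MADVISE_FREE)
	/* Pages stay mapped; the kernel may reclaim them instead of swapping. */
	return madvise(addr, size, MADV_FREE) != 0;
#elif defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
    !defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
	/* Without the "zeros" guarantee, DONTNEED acts as a lazy purge here. */
	return madvise(addr, size, MADV_DONTNEED) != 0;
#else
	return true; /* Lazy purging not available in this configuration. */
#endif
}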

+ 75 - 0
BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_externs.h

@@ -0,0 +1,75 @@
+#ifndef JEMALLOC_INTERNAL_EXTERNS_H
+#define JEMALLOC_INTERNAL_EXTERNS_H
+
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/hpa_opts.h"
+#include "jemalloc/internal/sec_opts.h"
+#include "jemalloc/internal/tsd_types.h"
+#include "jemalloc/internal/nstime.h"
+
+/* TSD checks this to set thread local slow state accordingly. */
+extern bool malloc_slow;
+
+/* Run-time options. */
+extern bool opt_abort;
+extern bool opt_abort_conf;
+extern bool opt_trust_madvise;
+extern bool opt_confirm_conf;
+extern bool opt_hpa;
+extern hpa_shard_opts_t opt_hpa_opts;
+extern sec_opts_t opt_hpa_sec_opts;
+
+extern const char *opt_junk;
+extern bool opt_junk_alloc;
+extern bool opt_junk_free;
+extern void (*junk_free_callback)(void *ptr, size_t size);
+extern void (*junk_alloc_callback)(void *ptr, size_t size);
+extern bool opt_utrace;
+extern bool opt_xmalloc;
+extern bool opt_experimental_infallible_new;
+extern bool opt_zero;
+extern unsigned opt_narenas;
+extern zero_realloc_action_t opt_zero_realloc_action;
+extern malloc_init_t malloc_init_state;
+extern const char *zero_realloc_mode_names[];
+extern atomic_zu_t zero_realloc_count;
+extern bool opt_cache_oblivious;
+
+/* Escape free-fastpath when ptr & mask == 0 (for sanitization purpose). */
+extern uintptr_t san_cache_bin_nonfast_mask;
+
+/* Number of CPUs. */
+extern unsigned ncpus;
+
+/* Number of arenas used for automatic multiplexing of threads and arenas. */
+extern unsigned narenas_auto;
+
+/* Base index for manual arenas. */
+extern unsigned manual_arena_base;
+
+/*
+ * Arenas that are used to service external requests.  Not all elements of the
+ * arenas array are necessarily used; arenas are created lazily as needed.
+ */
+extern atomic_p_t arenas[];
+
+void *a0malloc(size_t size);
+void a0dalloc(void *ptr);
+void *bootstrap_malloc(size_t size);
+void *bootstrap_calloc(size_t num, size_t size);
+void bootstrap_free(void *ptr);
+void arena_set(unsigned ind, arena_t *arena);
+unsigned narenas_total_get(void);
+arena_t *arena_init(tsdn_t *tsdn, unsigned ind, const arena_config_t *config);
+arena_t *arena_choose_hard(tsd_t *tsd, bool internal);
+void arena_migrate(tsd_t *tsd, arena_t *oldarena, arena_t *newarena);
+void iarena_cleanup(tsd_t *tsd);
+void arena_cleanup(tsd_t *tsd);
+size_t batch_alloc(void **ptrs, size_t num, size_t size, int flags);
+void jemalloc_prefork(void);
+void jemalloc_postfork_parent(void);
+void jemalloc_postfork_child(void);
+void je_sdallocx_noflags(void *ptr, size_t size);
+void *malloc_default(size_t size);
+
+#endif /* JEMALLOC_INTERNAL_EXTERNS_H */
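Most of these declarations are wiring for jemalloc.c, but batch_alloc() deserves a usage note: it fills the pointer array and returns how many allocations actually succeeded, so callers must handle partial fills. A hedged sketch (the object size and the flags value are illustrative):

/* Hedged sketch: batch-allocating small objects via the internal API. */
static size_t
fill_small_objects_sketch(void **ptrs, size_t want) {
	size_t filled = batch_alloc(ptrs, want, 48, /* flags */ 0);
	/* Only ptrs[0 .. filled) are valid; the rest were not allocated. */
	return filled;
}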

+ 84 - 0
BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_includes.h

@@ -0,0 +1,84 @@
+#ifndef JEMALLOC_INTERNAL_INCLUDES_H
+#define JEMALLOC_INTERNAL_INCLUDES_H
+
+/*
+ * jemalloc can conceptually be broken into components (arena, tcache, etc.),
+ * but there are circular dependencies that cannot be broken without
+ * substantial performance degradation.
+ *
+ * Historically, we dealt with this by splitting each header into four
+ * sections (types, structs, externs, and inlines), and included each header
+ * file multiple times in this file, picking out the portion we want on each
+ * pass using the
+ * following #defines:
+ *   JEMALLOC_H_TYPES   : Preprocessor-defined constants and pseudo-opaque data
+ *                        types.
+ *   JEMALLOC_H_STRUCTS : Data structures.
+ *   JEMALLOC_H_EXTERNS : Extern data declarations and function prototypes.
+ *   JEMALLOC_H_INLINES : Inline functions.
+ *
+ * We're moving toward a world in which the dependencies are explicit; each file
+ * will #include the headers it depends on (rather than relying on them being
+ * implicitly available via this file including every header file in the
+ * project).
+ *
+ * We're now in an intermediate state: we've broken up the header files to avoid
+ * having to include each one multiple times, but have not yet moved the
+ * dependency information into the header files (i.e. we still rely on the
+ * ordering in this file to ensure all a header's dependencies are available in
+ * its translation unit).  Each component is now broken up into multiple header
+ * files, corresponding to the sections above (e.g. instead of "foo.h", we now
+ * have "foo_types.h", "foo_structs.h", "foo_externs.h", "foo_inlines.h").
+ *
+ * Those files which have been converted to explicitly include their
+ * inter-component dependencies are now in the initial HERMETIC HEADERS
+ * section.  All headers may still rely on jemalloc_preamble.h (which, by fiat,
+ * must be included first in every translation unit) for system headers and
+ * global jemalloc definitions, however.
+ */
+
+/******************************************************************************/
+/* TYPES */
+/******************************************************************************/
+
+#include "jemalloc/internal/arena_types.h"
+#include "jemalloc/internal/tcache_types.h"
+#include "jemalloc/internal/prof_types.h"
+
+/******************************************************************************/
+/* STRUCTS */
+/******************************************************************************/
+
+#include "jemalloc/internal/prof_structs.h"
+#include "jemalloc/internal/arena_structs.h"
+#include "jemalloc/internal/tcache_structs.h"
+#include "jemalloc/internal/background_thread_structs.h"
+
+/******************************************************************************/
+/* EXTERNS */
+/******************************************************************************/
+
+#include "jemalloc/internal/jemalloc_internal_externs.h"
+#include "jemalloc/internal/arena_externs.h"
+#include "jemalloc/internal/large_externs.h"
+#include "jemalloc/internal/tcache_externs.h"
+#include "jemalloc/internal/prof_externs.h"
+#include "jemalloc/internal/background_thread_externs.h"
+
+/******************************************************************************/
+/* INLINES */
+/******************************************************************************/
+
+#include "jemalloc/internal/jemalloc_internal_inlines_a.h"
+/*
+ * Include portions of arena code interleaved with tcache code in order to
+ * resolve circular dependencies.
+ */
+#include "jemalloc/internal/arena_inlines_a.h"
+#include "jemalloc/internal/jemalloc_internal_inlines_b.h"
+#include "jemalloc/internal/tcache_inlines.h"
+#include "jemalloc/internal/arena_inlines_b.h"
+#include "jemalloc/internal/jemalloc_internal_inlines_c.h"
+#include "jemalloc/internal/prof_inlines.h"
+#include "jemalloc/internal/background_thread_inlines.h"
+
+#endif /* JEMALLOC_INTERNAL_INCLUDES_H */
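To make the "hermetic headers" goal concrete: a converted header names its own inter-component dependencies instead of relying on the ordering in this file. The component below is hypothetical.

/* Hypothetical jemalloc/internal/foo.h, written in the hermetic style. */
#ifndef JEMALLOC_INTERNAL_FOO_H
#define JEMALLOC_INTERNAL_FOO_H

#include "jemalloc/internal/atomic.h"   /* explicit, not order-dependent */
#include "jemalloc/internal/mutex.h"

typedef struct foo_s {
	atomic_zu_t count;
	malloc_mutex_t mtx;
} foo_t;

#endif /* JEMALLOC_INTERNAL_FOO_H */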

+ 122 - 0
BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_a.h

@@ -0,0 +1,122 @@
+#ifndef JEMALLOC_INTERNAL_INLINES_A_H
+#define JEMALLOC_INTERNAL_INLINES_A_H
+
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/bit_util.h"
+#include "jemalloc/internal/jemalloc_internal_types.h"
+#include "jemalloc/internal/sc.h"
+#include "jemalloc/internal/ticker.h"
+
+JEMALLOC_ALWAYS_INLINE malloc_cpuid_t
+malloc_getcpu(void) {
+	assert(have_percpu_arena);
+#if defined(_WIN32)
+	return GetCurrentProcessorNumber();
+#elif defined(JEMALLOC_HAVE_SCHED_GETCPU)
+	return (malloc_cpuid_t)sched_getcpu();
+#else
+	not_reached();
+	return -1;
+#endif
+}
+
+/* Return the chosen arena index based on current cpu. */
+JEMALLOC_ALWAYS_INLINE unsigned
+percpu_arena_choose(void) {
+	assert(have_percpu_arena && PERCPU_ARENA_ENABLED(opt_percpu_arena));
+
+	malloc_cpuid_t cpuid = malloc_getcpu();
+	assert(cpuid >= 0);
+
+	unsigned arena_ind;
+	if ((opt_percpu_arena == percpu_arena) || ((unsigned)cpuid < ncpus /
+	    2)) {
+		arena_ind = cpuid;
+	} else {
+		assert(opt_percpu_arena == per_phycpu_arena);
+		/* Hyper threads on the same physical CPU share arena. */
+		arena_ind = cpuid - ncpus / 2;
+	}
+
+	return arena_ind;
+}
+
+/* Return the limit of percpu auto arena range, i.e. arenas[0...ind_limit). */
+JEMALLOC_ALWAYS_INLINE unsigned
+percpu_arena_ind_limit(percpu_arena_mode_t mode) {
+	assert(have_percpu_arena && PERCPU_ARENA_ENABLED(mode));
+	if (mode == per_phycpu_arena && ncpus > 1) {
+		if (ncpus % 2) {
+			/* This likely means a misconfig. */
+			return ncpus / 2 + 1;
+		}
+		return ncpus / 2;
+	} else {
+		return ncpus;
+	}
+}
+
+static inline arena_t *
+arena_get(tsdn_t *tsdn, unsigned ind, bool init_if_missing) {
+	arena_t *ret;
+
+	assert(ind < MALLOCX_ARENA_LIMIT);
+
+	ret = (arena_t *)atomic_load_p(&arenas[ind], ATOMIC_ACQUIRE);
+	if (unlikely(ret == NULL)) {
+		if (init_if_missing) {
+			ret = arena_init(tsdn, ind, &arena_config_default);
+		}
+	}
+	return ret;
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+tcache_available(tsd_t *tsd) {
+	/*
+	 * Thread specific auto tcache might be unavailable if: 1) during tcache
+	 * initialization, or 2) disabled through thread.tcache.enabled mallctl
+	 * or config options.  This check covers all cases.
+	 */
+	if (likely(tsd_tcache_enabled_get(tsd))) {
+		/* Associated arena == NULL implies tcache init in progress. */
+		if (config_debug && tsd_tcache_slowp_get(tsd)->arena != NULL) {
+			tcache_assert_initialized(tsd_tcachep_get(tsd));
+		}
+		return true;
+	}
+
+	return false;
+}
+
+JEMALLOC_ALWAYS_INLINE tcache_t *
+tcache_get(tsd_t *tsd) {
+	if (!tcache_available(tsd)) {
+		return NULL;
+	}
+
+	return tsd_tcachep_get(tsd);
+}
+
+JEMALLOC_ALWAYS_INLINE tcache_slow_t *
+tcache_slow_get(tsd_t *tsd) {
+	if (!tcache_available(tsd)) {
+		return NULL;
+	}
+
+	return tsd_tcache_slowp_get(tsd);
+}
+
+static inline void
+pre_reentrancy(tsd_t *tsd, arena_t *arena) {
+	/* arena is the current context.  Reentry from a0 is not allowed. */
+	assert(arena != arena_get(tsd_tsdn(tsd), 0, false));
+	tsd_pre_reentrancy_raw(tsd);
+}
+
+static inline void
+post_reentrancy(tsd_t *tsd) {
+	tsd_post_reentrancy_raw(tsd);
+}
+
+#endif /* JEMALLOC_INTERNAL_INLINES_A_H */
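The cpuid-to-arena mapping in percpu_arena_choose() is easiest to see with numbers. A standalone restatement of the per_phycpu_arena case, assuming the layout the code expects (hyperthread siblings numbered ncpus/2 apart):

#include <assert.h>

/* Restatement of percpu_arena_choose() for per_phycpu_arena mode. */
static unsigned
per_phycpu_arena_ind(unsigned cpuid, unsigned ncpus) {
	return (cpuid < ncpus / 2) ? cpuid : cpuid - ncpus / 2;
}

int
main(void) {
	/* With 8 CPUs: cpu 1 and its sibling cpu 5 share arena 1. */
	assert(per_phycpu_arena_ind(1, 8) == 1);
	assert(per_phycpu_arena_ind(5, 8) == 1);
	/* percpu_arena mode simply uses the cpu id itself as the arena index. */
	return 0;
}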

+ 103 - 0
BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_b.h

@@ -0,0 +1,103 @@
+#ifndef JEMALLOC_INTERNAL_INLINES_B_H
+#define JEMALLOC_INTERNAL_INLINES_B_H
+
+#include "jemalloc/internal/extent.h"
+
+static inline void
+percpu_arena_update(tsd_t *tsd, unsigned cpu) {
+	assert(have_percpu_arena);
+	arena_t *oldarena = tsd_arena_get(tsd);
+	assert(oldarena != NULL);
+	unsigned oldind = arena_ind_get(oldarena);
+
+	if (oldind != cpu) {
+		unsigned newind = cpu;
+		arena_t *newarena = arena_get(tsd_tsdn(tsd), newind, true);
+		assert(newarena != NULL);
+
+		/* Set new arena/tcache associations. */
+		arena_migrate(tsd, oldarena, newarena);
+		tcache_t *tcache = tcache_get(tsd);
+		if (tcache != NULL) {
+			tcache_slow_t *tcache_slow = tsd_tcache_slowp_get(tsd);
+			tcache_arena_reassociate(tsd_tsdn(tsd), tcache_slow,
+			    tcache, newarena);
+		}
+	}
+}
+
+
+/* Choose an arena based on a per-thread value. */
+static inline arena_t *
+arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal) {
+	arena_t *ret;
+
+	if (arena != NULL) {
+		return arena;
+	}
+
+	/* During reentrancy, arena 0 is the safest bet. */
+	if (unlikely(tsd_reentrancy_level_get(tsd) > 0)) {
+		return arena_get(tsd_tsdn(tsd), 0, true);
+	}
+
+	ret = internal ? tsd_iarena_get(tsd) : tsd_arena_get(tsd);
+	if (unlikely(ret == NULL)) {
+		ret = arena_choose_hard(tsd, internal);
+		assert(ret);
+		if (tcache_available(tsd)) {
+			tcache_slow_t *tcache_slow = tsd_tcache_slowp_get(tsd);
+			tcache_t *tcache = tsd_tcachep_get(tsd);
+			if (tcache_slow->arena != NULL) {
+				/* See comments in tsd_tcache_data_init().*/
+				assert(tcache_slow->arena ==
+				    arena_get(tsd_tsdn(tsd), 0, false));
+				if (tcache_slow->arena != ret) {
+					tcache_arena_reassociate(tsd_tsdn(tsd),
+					    tcache_slow, tcache, ret);
+				}
+			} else {
+				tcache_arena_associate(tsd_tsdn(tsd),
+				    tcache_slow, tcache, ret);
+			}
+		}
+	}
+
+	/*
+	 * Note that for percpu arena, if the current arena is outside of the
+	 * auto percpu arena range, (i.e. thread is assigned to a manually
+	 * managed arena), then percpu arena is skipped.
+	 */
+	if (have_percpu_arena && PERCPU_ARENA_ENABLED(opt_percpu_arena) &&
+	    !internal && (arena_ind_get(ret) <
+	    percpu_arena_ind_limit(opt_percpu_arena)) && (ret->last_thd !=
+	    tsd_tsdn(tsd))) {
+		unsigned ind = percpu_arena_choose();
+		if (arena_ind_get(ret) != ind) {
+			percpu_arena_update(tsd, ind);
+			ret = tsd_arena_get(tsd);
+		}
+		ret->last_thd = tsd_tsdn(tsd);
+	}
+
+	return ret;
+}
+
+static inline arena_t *
+arena_choose(tsd_t *tsd, arena_t *arena) {
+	return arena_choose_impl(tsd, arena, false);
+}
+
+static inline arena_t *
+arena_ichoose(tsd_t *tsd, arena_t *arena) {
+	return arena_choose_impl(tsd, arena, true);
+}
+
+static inline bool
+arena_is_auto(arena_t *arena) {
+	assert(narenas_auto > 0);
+
+	return (arena_ind_get(arena) < manual_arena_base);
+}
+
+#endif /* JEMALLOC_INTERNAL_INLINES_B_H */

+ 340 - 0
BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h

@@ -0,0 +1,340 @@
+#ifndef JEMALLOC_INTERNAL_INLINES_C_H
+#define JEMALLOC_INTERNAL_INLINES_C_H
+
+#include "jemalloc/internal/hook.h"
+#include "jemalloc/internal/jemalloc_internal_types.h"
+#include "jemalloc/internal/log.h"
+#include "jemalloc/internal/sz.h"
+#include "jemalloc/internal/thread_event.h"
+#include "jemalloc/internal/witness.h"
+
+/*
+ * Translating the names of the 'i' functions:
+ *   Abbreviations used in the first part of the function name (before
+ *   alloc/dalloc) describe what that function accomplishes:
+ *     a: arena (query)
+ *     s: size (query, or sized deallocation)
+ *     e: extent (query)
+ *     p: aligned (allocates)
+ *     vs: size (query, without knowing that the pointer is into the heap)
+ *     r: rallocx implementation
+ *     x: xallocx implementation
+ *   Abbreviations used in the second part of the function name (after
+ *   alloc/dalloc) describe the arguments it takes
+ *     z: whether to return zeroed memory
+ *     t: accepts a tcache_t * parameter
+ *     m: accepts an arena_t * parameter
+ */
+
+JEMALLOC_ALWAYS_INLINE arena_t *
+iaalloc(tsdn_t *tsdn, const void *ptr) {
+	assert(ptr != NULL);
+
+	return arena_aalloc(tsdn, ptr);
+}
+
+JEMALLOC_ALWAYS_INLINE size_t
+isalloc(tsdn_t *tsdn, const void *ptr) {
+	assert(ptr != NULL);
+
+	return arena_salloc(tsdn, ptr);
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+iallocztm(tsdn_t *tsdn, size_t size, szind_t ind, bool zero, tcache_t *tcache,
+    bool is_internal, arena_t *arena, bool slow_path) {
+	void *ret;
+
+	assert(!is_internal || tcache == NULL);
+	assert(!is_internal || arena == NULL || arena_is_auto(arena));
+	if (!tsdn_null(tsdn) && tsd_reentrancy_level_get(tsdn_tsd(tsdn)) == 0) {
+		witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
+		    WITNESS_RANK_CORE, 0);
+	}
+
+	ret = arena_malloc(tsdn, arena, size, ind, zero, tcache, slow_path);
+	if (config_stats && is_internal && likely(ret != NULL)) {
+		arena_internal_add(iaalloc(tsdn, ret), isalloc(tsdn, ret));
+	}
+	return ret;
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+ialloc(tsd_t *tsd, size_t size, szind_t ind, bool zero, bool slow_path) {
+	return iallocztm(tsd_tsdn(tsd), size, ind, zero, tcache_get(tsd), false,
+	    NULL, slow_path);
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+ipallocztm(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero,
+    tcache_t *tcache, bool is_internal, arena_t *arena) {
+	void *ret;
+
+	assert(usize != 0);
+	assert(usize == sz_sa2u(usize, alignment));
+	assert(!is_internal || tcache == NULL);
+	assert(!is_internal || arena == NULL || arena_is_auto(arena));
+	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
+	    WITNESS_RANK_CORE, 0);
+
+	ret = arena_palloc(tsdn, arena, usize, alignment, zero, tcache);
+	assert(ALIGNMENT_ADDR2BASE(ret, alignment) == ret);
+	if (config_stats && is_internal && likely(ret != NULL)) {
+		arena_internal_add(iaalloc(tsdn, ret), isalloc(tsdn, ret));
+	}
+	return ret;
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+ipalloct(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero,
+    tcache_t *tcache, arena_t *arena) {
+	return ipallocztm(tsdn, usize, alignment, zero, tcache, false, arena);
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+ipalloc(tsd_t *tsd, size_t usize, size_t alignment, bool zero) {
+	return ipallocztm(tsd_tsdn(tsd), usize, alignment, zero,
+	    tcache_get(tsd), false, NULL);
+}
+
+JEMALLOC_ALWAYS_INLINE size_t
+ivsalloc(tsdn_t *tsdn, const void *ptr) {
+	return arena_vsalloc(tsdn, ptr);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+idalloctm(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
+    emap_alloc_ctx_t *alloc_ctx, bool is_internal, bool slow_path) {
+	assert(ptr != NULL);
+	assert(!is_internal || tcache == NULL);
+	assert(!is_internal || arena_is_auto(iaalloc(tsdn, ptr)));
+	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
+	    WITNESS_RANK_CORE, 0);
+	if (config_stats && is_internal) {
+		arena_internal_sub(iaalloc(tsdn, ptr), isalloc(tsdn, ptr));
+	}
+	if (!is_internal && !tsdn_null(tsdn) &&
+	    tsd_reentrancy_level_get(tsdn_tsd(tsdn)) != 0) {
+		assert(tcache == NULL);
+	}
+	arena_dalloc(tsdn, ptr, tcache, alloc_ctx, slow_path);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+idalloc(tsd_t *tsd, void *ptr) {
+	idalloctm(tsd_tsdn(tsd), ptr, tcache_get(tsd), NULL, false, true);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+isdalloct(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
+    emap_alloc_ctx_t *alloc_ctx, bool slow_path) {
+	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
+	    WITNESS_RANK_CORE, 0);
+	arena_sdalloc(tsdn, ptr, size, tcache, alloc_ctx, slow_path);
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
+    size_t alignment, bool zero, tcache_t *tcache, arena_t *arena,
+    hook_ralloc_args_t *hook_args) {
+	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
+	    WITNESS_RANK_CORE, 0);
+	void *p;
+	size_t usize, copysize;
+
+	usize = sz_sa2u(size, alignment);
+	if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) {
+		return NULL;
+	}
+	p = ipalloct(tsdn, usize, alignment, zero, tcache, arena);
+	if (p == NULL) {
+		return NULL;
+	}
+	/*
+	 * Copy at most size bytes (not size+extra), since the caller has no
+	 * expectation that the extra bytes will be reliably preserved.
+	 */
+	copysize = (size < oldsize) ? size : oldsize;
+	memcpy(p, ptr, copysize);
+	hook_invoke_alloc(hook_args->is_realloc
+	    ? hook_alloc_realloc : hook_alloc_rallocx, p, (uintptr_t)p,
+	    hook_args->args);
+	hook_invoke_dalloc(hook_args->is_realloc
+	    ? hook_dalloc_realloc : hook_dalloc_rallocx, ptr, hook_args->args);
+	isdalloct(tsdn, ptr, oldsize, tcache, NULL, true);
+	return p;
+}
+
+/*
+ * is_realloc threads through the knowledge of whether or not this call comes
+ * from je_realloc (as opposed to je_rallocx); this ensures that we pass the
+ * correct entry point into any hooks.
+ * Note that these functions are all force-inlined, so no actual bool gets
+ * passed-around anywhere.
+ */
+JEMALLOC_ALWAYS_INLINE void *
+iralloct(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t alignment,
+    bool zero, tcache_t *tcache, arena_t *arena, hook_ralloc_args_t *hook_args)
+{
+	assert(ptr != NULL);
+	assert(size != 0);
+	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
+	    WITNESS_RANK_CORE, 0);
+
+	if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1))
+	    != 0) {
+		/*
+		 * Existing object alignment is inadequate; allocate new space
+		 * and copy.
+		 */
+		return iralloct_realign(tsdn, ptr, oldsize, size, alignment,
+		    zero, tcache, arena, hook_args);
+	}
+
+	return arena_ralloc(tsdn, arena, ptr, oldsize, size, alignment, zero,
+	    tcache, hook_args);
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+iralloc(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment,
+    bool zero, hook_ralloc_args_t *hook_args) {
+	return iralloct(tsd_tsdn(tsd), ptr, oldsize, size, alignment, zero,
+	    tcache_get(tsd), NULL, hook_args);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra,
+    size_t alignment, bool zero, size_t *newsize) {
+	assert(ptr != NULL);
+	assert(size != 0);
+	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
+	    WITNESS_RANK_CORE, 0);
+
+	if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1))
+	    != 0) {
+		/* Existing object alignment is inadequate. */
+		*newsize = oldsize;
+		return true;
+	}
+
+	return arena_ralloc_no_move(tsdn, ptr, oldsize, size, extra, zero,
+	    newsize);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+fastpath_success_finish(tsd_t *tsd, uint64_t allocated_after,
+    cache_bin_t *bin, void *ret) {
+	thread_allocated_set(tsd, allocated_after);
+	if (config_stats) {
+		bin->tstats.nrequests++;
+	}
+
+	LOG("core.malloc.exit", "result: %p", ret);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+malloc_initialized(void) {
+	return (malloc_init_state == malloc_init_initialized);
+}
+
+/*
+ * malloc() fastpath.  Included here so that we can inline it into operator new;
+ * function call overhead there is non-negligible as a fraction of total CPU in
+ * allocation-heavy C++ programs.  We take the fallback alloc to allow malloc
+ * (which can return NULL) to differ in its behavior from operator new (which
+ * can't).  It matches the signature of malloc / operator new so that we can
+ * tail-call the fallback allocator, allowing us to avoid setting up the call
+ * frame in the common case.
+ *
+ * Fastpath assumes size <= SC_LOOKUP_MAXCLASS, and that we hit
+ * tcache.  If either of these is false, we tail-call to the slowpath,
+ * malloc_default().  Tail-calling is used to avoid any caller-saved
+ * registers.
+ *
+ * fastpath supports ticker and profiling, both of which will also
+ * tail-call to the slowpath if they fire.
+ */
+JEMALLOC_ALWAYS_INLINE void *
+imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) {
+	LOG("core.malloc.entry", "size: %zu", size);
+	if (tsd_get_allocates() && unlikely(!malloc_initialized())) {
+		return fallback_alloc(size);
+	}
+
+	tsd_t *tsd = tsd_get(false);
+	if (unlikely((size > SC_LOOKUP_MAXCLASS) || tsd == NULL)) {
+		return fallback_alloc(size);
+	}
+	/*
+	 * The code below till the branch checking the next_event threshold may
+	 * execute before malloc_init(), in which case the threshold is 0 to
+	 * trigger slow path and initialization.
+	 *
+	 * Note that when uninitialized, only the fast-path variants of the sz /
+	 * tsd facilities may be called.
+	 */
+	szind_t ind;
+	/*
+	 * The thread_allocated counter in tsd serves as a general purpose
+	 * accumulator for bytes of allocation to trigger different types of
+	 * events.  usize is always needed to advance thread_allocated, though
+	 * it's not always needed in the core allocation logic.
+	 */
+	size_t usize;
+	sz_size2index_usize_fastpath(size, &ind, &usize);
+	/* Fast path relies on size being a bin. */
+	assert(ind < SC_NBINS);
+	assert((SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS) &&
+	    (size <= SC_SMALL_MAXCLASS));
+
+	uint64_t allocated, threshold;
+	te_malloc_fastpath_ctx(tsd, &allocated, &threshold);
+	uint64_t allocated_after = allocated + usize;
+	/*
+	 * The ind and usize might be uninitialized (or partially) before
+	 * malloc_init().  The assertions check for: 1) full correctness (usize
+	 * & ind) when initialized; and 2) guaranteed slow-path (threshold == 0)
+	 * when !initialized.
+	 */
+	if (!malloc_initialized()) {
+		assert(threshold == 0);
+	} else {
+		assert(ind == sz_size2index(size));
+		assert(usize > 0 && usize == sz_index2size(ind));
+	}
+	/*
+	 * Check for events and tsd non-nominal (fast_threshold will be set to
+	 * 0) in a single branch.
+	 */
+	if (unlikely(allocated_after >= threshold)) {
+		return fallback_alloc(size);
+	}
+	assert(tsd_fast(tsd));
+
+	tcache_t *tcache = tsd_tcachep_get(tsd);
+	assert(tcache == tcache_get(tsd));
+	cache_bin_t *bin = &tcache->bins[ind];
+	bool tcache_success;
+	void *ret;
+
+	/*
+	 * We split up the code this way so that redundant low-water
+	 * computation doesn't happen on the (more common) case in which we
+	 * don't touch the low water mark.  The compiler won't do this
+	 * duplication on its own.
+	 */
+	ret = cache_bin_alloc_easy(bin, &tcache_success);
+	if (tcache_success) {
+		fastpath_success_finish(tsd, allocated_after, bin, ret);
+		return ret;
+	}
+	ret = cache_bin_alloc(bin, &tcache_success);
+	if (tcache_success) {
+		fastpath_success_finish(tsd, allocated_after, bin, ret);
+		return ret;
+	}
+
+	return fallback_alloc(size);
+}
+
+#endif /* JEMALLOC_INTERNAL_INLINES_C_H */
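Decoding the naming scheme above: ipallocztm, for example, is the aligned ('p') allocator taking zero ('z'), tcache ('t'), and arena ('m') parameters. As the imalloc_fastpath() comment says, the public entry point passes its slow path as fallback_alloc so misses become a tail call; a hedged sketch of that wiring (the wrapper name is illustrative):

/* Sketch: a public malloc wrapper layered on the fastpath.  malloc_default()
 * is the slow path declared in jemalloc_internal_externs.h. */
static void *
malloc_wrapper_sketch(size_t size) {
	return imalloc_fastpath(size, malloc_default);
}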

+ 111 - 0
BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_macros.h

@@ -0,0 +1,111 @@
+#ifndef JEMALLOC_INTERNAL_MACROS_H
+#define JEMALLOC_INTERNAL_MACROS_H
+
+#ifdef JEMALLOC_DEBUG
+#  define JEMALLOC_ALWAYS_INLINE static inline
+#else
+#  ifdef _MSC_VER
+#    define JEMALLOC_ALWAYS_INLINE static __forceinline
+#  else
+#    define JEMALLOC_ALWAYS_INLINE JEMALLOC_ATTR(always_inline) static inline
+#  endif
+#endif
+#ifdef _MSC_VER
+#  define inline _inline
+#endif
+
+#define UNUSED JEMALLOC_ATTR(unused)
+
+#define ZU(z)	((size_t)z)
+#define ZD(z)	((ssize_t)z)
+#define QU(q)	((uint64_t)q)
+#define QD(q)	((int64_t)q)
+
+#define KZU(z)	ZU(z##ULL)
+#define KZD(z)	ZD(z##LL)
+#define KQU(q)	QU(q##ULL)
+#define KQD(q)	QD(q##LL)
+
+#ifndef __DECONST
+#  define	__DECONST(type, var)	((type)(uintptr_t)(const void *)(var))
+#endif
+
+#if !defined(JEMALLOC_HAS_RESTRICT) || defined(__cplusplus)
+#  define restrict
+#endif
+
+/* Various function pointers are static and immutable except during testing. */
+#ifdef JEMALLOC_JET
+#  define JET_MUTABLE
+#else
+#  define JET_MUTABLE const
+#endif
+
+#define JEMALLOC_VA_ARGS_HEAD(head, ...) head
+#define JEMALLOC_VA_ARGS_TAIL(head, ...) __VA_ARGS__
+
+/* Diagnostic suppression macros */
+#if defined(_MSC_VER) && !defined(__clang__)
+#  define JEMALLOC_DIAGNOSTIC_PUSH __pragma(warning(push))
+#  define JEMALLOC_DIAGNOSTIC_POP __pragma(warning(pop))
+#  define JEMALLOC_DIAGNOSTIC_IGNORE(W) __pragma(warning(disable:W))
+#  define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
+#  define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS
+#  define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
+#  define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+/* #pragma GCC diagnostic first appeared in gcc 4.6. */
+#elif (defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && \
+  (__GNUC_MINOR__ > 5)))) || defined(__clang__)
+/*
+ * The JEMALLOC_PRAGMA__ macro is an implementation detail of the GCC and Clang
+ * diagnostic suppression macros and should not be used anywhere else.
+ */
+#  define JEMALLOC_PRAGMA__(X) _Pragma(#X)
+#  define JEMALLOC_DIAGNOSTIC_PUSH JEMALLOC_PRAGMA__(GCC diagnostic push)
+#  define JEMALLOC_DIAGNOSTIC_POP JEMALLOC_PRAGMA__(GCC diagnostic pop)
+#  define JEMALLOC_DIAGNOSTIC_IGNORE(W) \
+     JEMALLOC_PRAGMA__(GCC diagnostic ignored W)
+
+/*
+ * The -Wmissing-field-initializers warning is buggy in GCC versions < 5.1 and
+ * all clang versions up to version 7 (currently trunk, unreleased).  This macro
+ * suppresses the warning for the affected compiler versions only.
+ */
+#  if ((defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 5)) || \
+     defined(__clang__)
+#    define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS  \
+          JEMALLOC_DIAGNOSTIC_IGNORE("-Wmissing-field-initializers")
+#  else
+#    define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
+#  endif
+
+#  define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS  \
+     JEMALLOC_DIAGNOSTIC_IGNORE("-Wtype-limits")
+#  define JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER \
+     JEMALLOC_DIAGNOSTIC_IGNORE("-Wunused-parameter")
+#  if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ >= 7)
+#    define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN \
+       JEMALLOC_DIAGNOSTIC_IGNORE("-Walloc-size-larger-than=")
+#  else
+#    define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
+#  endif
+#  define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS \
+  JEMALLOC_DIAGNOSTIC_PUSH \
+  JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER
+#else
+#  define JEMALLOC_DIAGNOSTIC_PUSH
+#  define JEMALLOC_DIAGNOSTIC_POP
+#  define JEMALLOC_DIAGNOSTIC_IGNORE(W)
+#  define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
+#  define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS
+#  define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
+#  define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+#endif
+
+/*
+ * Disables spurious diagnostics for all headers.  Since these headers are not
+ * included by users directly, it does not affect their diagnostic settings.
+ */
+JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+
+#endif /* JEMALLOC_INTERNAL_MACROS_H */
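A typical use of the diagnostic macros above is to silence a known-spurious warning around a single construct and then restore the caller's settings. A minimal sketch:

/* Suppress the (buggy on old GCC/Clang) missing-field-initializers warning
 * for one designated initializer, then pop back to the previous settings. */
JEMALLOC_DIAGNOSTIC_PUSH
JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
static const struct sketch_pair_s { int a; int b; } sketch_pair = { .a = 1 };
JEMALLOC_DIAGNOSTIC_POP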

+ 130 - 0
BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_types.h

@@ -0,0 +1,130 @@
+#ifndef JEMALLOC_INTERNAL_TYPES_H
+#define JEMALLOC_INTERNAL_TYPES_H
+
+#include "jemalloc/internal/quantum.h"
+
+/* Processor / core id type. */
+typedef int malloc_cpuid_t;
+
+/* When realloc(non-null-ptr, 0) is called, what happens? */
+enum zero_realloc_action_e {
+	/* Realloc(ptr, 0) is free(ptr); return malloc(0); */
+	zero_realloc_action_alloc = 0,
+	/* Realloc(ptr, 0) is free(ptr); */
+	zero_realloc_action_free = 1,
+	/* Realloc(ptr, 0) aborts. */
+	zero_realloc_action_abort = 2
+};
+typedef enum zero_realloc_action_e zero_realloc_action_t;
+
+/* Signature of write callback. */
+typedef void (write_cb_t)(void *, const char *);
+
+enum malloc_init_e {
+	malloc_init_uninitialized	= 3,
+	malloc_init_a0_initialized	= 2,
+	malloc_init_recursible		= 1,
+	malloc_init_initialized		= 0 /* Common case --> jnz. */
+};
+typedef enum malloc_init_e malloc_init_t;
+
+/*
+ * Flags bits:
+ *
+ * a: arena
+ * t: tcache
+ * 0: unused
+ * z: zero
+ * n: alignment
+ *
+ * aaaaaaaa aaaatttt tttttttt 0znnnnnn
+ */
+#define MALLOCX_ARENA_BITS	12
+#define MALLOCX_TCACHE_BITS	12
+#define MALLOCX_LG_ALIGN_BITS	6
+#define MALLOCX_ARENA_SHIFT	20
+#define MALLOCX_TCACHE_SHIFT	8
+#define MALLOCX_ARENA_MASK \
+    (((1 << MALLOCX_ARENA_BITS) - 1) << MALLOCX_ARENA_SHIFT)
+/* NB: Arena index bias decreases the maximum number of arenas by 1. */
+#define MALLOCX_ARENA_LIMIT	((1 << MALLOCX_ARENA_BITS) - 1)
+#define MALLOCX_TCACHE_MASK \
+    (((1 << MALLOCX_TCACHE_BITS) - 1) << MALLOCX_TCACHE_SHIFT)
+#define MALLOCX_TCACHE_MAX	((1 << MALLOCX_TCACHE_BITS) - 3)
+#define MALLOCX_LG_ALIGN_MASK	((1 << MALLOCX_LG_ALIGN_BITS) - 1)
+/* Use MALLOCX_ALIGN_GET() if alignment may not be specified in flags. */
+#define MALLOCX_ALIGN_GET_SPECIFIED(flags)				\
+    (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK))
+#define MALLOCX_ALIGN_GET(flags)					\
+    (MALLOCX_ALIGN_GET_SPECIFIED(flags) & (SIZE_T_MAX-1))
+#define MALLOCX_ZERO_GET(flags)						\
+    ((bool)(flags & MALLOCX_ZERO))
+
+#define MALLOCX_TCACHE_GET(flags)					\
+    (((unsigned)((flags & MALLOCX_TCACHE_MASK) >> MALLOCX_TCACHE_SHIFT)) - 2)
+#define MALLOCX_ARENA_GET(flags)					\
+    (((unsigned)(((unsigned)flags) >> MALLOCX_ARENA_SHIFT)) - 1)
+
+/* Smallest size class to support. */
+#define TINY_MIN		(1U << LG_TINY_MIN)
+
+#define LONG			((size_t)(1U << LG_SIZEOF_LONG))
+#define LONG_MASK		(LONG - 1)
+
+/* Return the smallest long multiple that is >= a. */
+#define LONG_CEILING(a)							\
+	(((a) + LONG_MASK) & ~LONG_MASK)
+
+#define SIZEOF_PTR		(1U << LG_SIZEOF_PTR)
+#define PTR_MASK		(SIZEOF_PTR - 1)
+
+/* Return the smallest (void *) multiple that is >= a. */
+#define PTR_CEILING(a)							\
+	(((a) + PTR_MASK) & ~PTR_MASK)
+
+/*
+ * Maximum size of L1 cache line.  This is used to avoid cache line aliasing.
+ * In addition, this controls the spacing of cacheline-spaced size classes.
+ *
+ * CACHELINE cannot be based on LG_CACHELINE because __declspec(align()) can
+ * only handle raw constants.
+ */
+#define LG_CACHELINE		6
+#define CACHELINE		64
+#define CACHELINE_MASK		(CACHELINE - 1)
+
+/* Return the smallest cacheline multiple that is >= s. */
+#define CACHELINE_CEILING(s)						\
+	(((s) + CACHELINE_MASK) & ~CACHELINE_MASK)
+
+/* Return the nearest aligned address at or below a. */
+#define ALIGNMENT_ADDR2BASE(a, alignment)				\
+	((void *)((uintptr_t)(a) & ((~(alignment)) + 1)))
+
+/* Return the offset between a and the nearest aligned address at or below a. */
+#define ALIGNMENT_ADDR2OFFSET(a, alignment)				\
+	((size_t)((uintptr_t)(a) & (alignment - 1)))
+
+/* Return the smallest alignment multiple that is >= s. */
+#define ALIGNMENT_CEILING(s, alignment)					\
+	(((s) + (alignment - 1)) & ((~(alignment)) + 1))
+
+/* Declare a variable-length array. */
+#if __STDC_VERSION__ < 199901L
+#  ifdef _MSC_VER
+#    include <malloc.h>
+#    define alloca _alloca
+#  else
+#    ifdef JEMALLOC_HAS_ALLOCA_H
+#      include <alloca.h>
+#    else
+#      include <stdlib.h>
+#    endif
+#  endif
+#  define VARIABLE_ARRAY(type, name, count) \
+	type *name = alloca(sizeof(type) * (count))
+#else
+#  define VARIABLE_ARRAY(type, name, count) type name[(count)]
+#endif
+
+#endif /* JEMALLOC_INTERNAL_TYPES_H */
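The flags layout above pairs with the public MALLOCX_* encoding macros from jemalloc.h. A worked round trip, assuming arena index 3 and a 16-byte alignment request:

#include <assert.h>
#include <stdbool.h>

/* Encode with the public macros (jemalloc.h), decode with the getters above. */
static void
mallocx_flags_roundtrip_sketch(void) {
	int flags = MALLOCX_LG_ALIGN(4) | MALLOCX_ZERO | MALLOCX_ARENA(3);

	assert(MALLOCX_ALIGN_GET(flags) == 16);      /* 1 << 4 */
	assert(MALLOCX_ZERO_GET(flags) == true);
	assert(MALLOCX_ARENA_GET(flags) == 3);       /* bias of 1 removed */
}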

+ 263 - 0
BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_preamble.h

@@ -0,0 +1,263 @@
+#ifndef JEMALLOC_PREAMBLE_H
+#define JEMALLOC_PREAMBLE_H
+
+#include "jemalloc_internal_defs.h"
+#include "jemalloc/internal/jemalloc_internal_decls.h"
+
+#if defined(JEMALLOC_UTRACE) || defined(JEMALLOC_UTRACE_LABEL)
+#include <sys/ktrace.h>
+#  if defined(JEMALLOC_UTRACE)
+#    define UTRACE_CALL(p, l) utrace(p, l)
+#  else
+#    define UTRACE_CALL(p, l) utrace("jemalloc_process", p, l)
+#    define JEMALLOC_UTRACE
+#  endif
+#endif
+
+#define JEMALLOC_NO_DEMANGLE
+#ifdef JEMALLOC_JET
+#  undef JEMALLOC_IS_MALLOC
+#  define JEMALLOC_N(n) jet_##n
+#  include "jemalloc/internal/public_namespace.h"
+#  define JEMALLOC_NO_RENAME
+#  include "../jemalloc.h"
+#  undef JEMALLOC_NO_RENAME
+#else
+#  define JEMALLOC_N(n) je_##n
+#  include "../jemalloc.h"
+#endif
+
+#if defined(JEMALLOC_OSATOMIC)
+#include <libkern/OSAtomic.h>
+#endif
+
+#ifdef JEMALLOC_ZONE
+#include <mach/mach_error.h>
+#include <mach/mach_init.h>
+#include <mach/vm_map.h>
+#endif
+
+#include "jemalloc/internal/jemalloc_internal_macros.h"
+
+/*
+ * Note that the ordering matters here; the hook itself is name-mangled.  We
+ * want the inclusion of hooks to happen early, so that we hook as much as
+ * possible.
+ */
+#ifndef JEMALLOC_NO_PRIVATE_NAMESPACE
+#  ifndef JEMALLOC_JET
+#    include "jemalloc/internal/private_namespace.h"
+#  else
+#    include "jemalloc/internal/private_namespace_jet.h"
+#  endif
+#endif
+#include "jemalloc/internal/test_hooks.h"
+
+#ifdef JEMALLOC_DEFINE_MADVISE_FREE
+#  define JEMALLOC_MADV_FREE 8
+#endif
+
+static const bool config_debug =
+#ifdef JEMALLOC_DEBUG
+    true
+#else
+    false
+#endif
+    ;
+static const bool have_dss =
+#ifdef JEMALLOC_DSS
+    true
+#else
+    false
+#endif
+    ;
+static const bool have_madvise_huge =
+#ifdef JEMALLOC_HAVE_MADVISE_HUGE
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_fill =
+#ifdef JEMALLOC_FILL
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_lazy_lock =
+#ifdef JEMALLOC_LAZY_LOCK
+    true
+#else
+    false
+#endif
+    ;
+static const char * const config_malloc_conf = JEMALLOC_CONFIG_MALLOC_CONF;
+static const bool config_prof =
+#ifdef JEMALLOC_PROF
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_prof_libgcc =
+#ifdef JEMALLOC_PROF_LIBGCC
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_prof_libunwind =
+#ifdef JEMALLOC_PROF_LIBUNWIND
+    true
+#else
+    false
+#endif
+    ;
+static const bool maps_coalesce =
+#ifdef JEMALLOC_MAPS_COALESCE
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_stats =
+#ifdef JEMALLOC_STATS
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_tls =
+#ifdef JEMALLOC_TLS
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_utrace =
+#ifdef JEMALLOC_UTRACE
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_xmalloc =
+#ifdef JEMALLOC_XMALLOC
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_cache_oblivious =
+#ifdef JEMALLOC_CACHE_OBLIVIOUS
+    true
+#else
+    false
+#endif
+    ;
+/*
+ * Undocumented, for jemalloc development use only at the moment.  See the note
+ * in jemalloc/internal/log.h.
+ */
+static const bool config_log =
+#ifdef JEMALLOC_LOG
+    true
+#else
+    false
+#endif
+    ;
+/*
+ * Are extra safety checks enabled; things like checking the size of sized
+ * deallocations, double-frees, etc.
+ */
+static const bool config_opt_safety_checks =
+#ifdef JEMALLOC_OPT_SAFETY_CHECKS
+    true
+#elif defined(JEMALLOC_DEBUG)
+    /*
+     * This lets us only guard safety checks by one flag instead of two; fast
+     * checks can guard solely by config_opt_safety_checks and run in debug mode
+     * too.
+     */
+    true
+#else
+    false
+#endif
+    ;
+
+/*
+ * Extra debugging of sized deallocations too onerous to be included in the
+ * general safety checks.
+ */
+static const bool config_opt_size_checks =
+#if defined(JEMALLOC_OPT_SIZE_CHECKS) || defined(JEMALLOC_DEBUG)
+    true
+#else
+    false
+#endif
+    ;
+
+static const bool config_uaf_detection =
+#if defined(JEMALLOC_UAF_DETECTION) || defined(JEMALLOC_DEBUG)
+    true
+#else
+    false
+#endif
+    ;
+
+/* Whether or not the C++ extensions are enabled. */
+static const bool config_enable_cxx =
+#ifdef JEMALLOC_ENABLE_CXX
+    true
+#else
+    false
+#endif
+;
+
+#if defined(_WIN32) || defined(JEMALLOC_HAVE_SCHED_GETCPU)
+/* Currently percpu_arena depends on sched_getcpu. */
+#define JEMALLOC_PERCPU_ARENA
+#endif
+static const bool have_percpu_arena =
+#ifdef JEMALLOC_PERCPU_ARENA
+    true
+#else
+    false
+#endif
+    ;
+/*
+ * Undocumented, and not recommended; the application should take full
+ * responsibility for tracking provenance.
+ */
+static const bool force_ivsalloc =
+#ifdef JEMALLOC_FORCE_IVSALLOC
+    true
+#else
+    false
+#endif
+    ;
+static const bool have_background_thread =
+#ifdef JEMALLOC_BACKGROUND_THREAD
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_high_res_timer =
+#ifdef JEMALLOC_HAVE_CLOCK_REALTIME
+    true
+#else
+    false
+#endif
+    ;
+
+static const bool have_memcntl =
+#ifdef JEMALLOC_HAVE_MEMCNTL
+    true
+#else
+    false
+#endif
+    ;
+
+#endif /* JEMALLOC_PREAMBLE_H */
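The config_* constants let feature checks read as ordinary branches while costing nothing when the feature is compiled out: the branch is dead-code-eliminated, yet everything inside still gets type-checked, unlike an #ifdef. A minimal sketch (the counter is hypothetical):

#include <stddef.h>

static size_t sketch_allocated;    /* hypothetical stats accumulator */

static void
record_alloc_sketch(size_t usize) {
	/* With JEMALLOC_STATS absent, config_stats is the constant false and the
	 * compiler drops this block; the code is still parsed and type-checked. */
	if (config_stats) {
		sketch_allocated += usize;
	}
}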

+ 263 - 0
BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_preamble.h.in

@@ -0,0 +1,263 @@
+#ifndef JEMALLOC_PREAMBLE_H
+#define JEMALLOC_PREAMBLE_H
+
+#include "jemalloc_internal_defs.h"
+#include "jemalloc/internal/jemalloc_internal_decls.h"
+
+#if defined(JEMALLOC_UTRACE) || defined(JEMALLOC_UTRACE_LABEL)
+#include <sys/ktrace.h>
+#  if defined(JEMALLOC_UTRACE)
+#    define UTRACE_CALL(p, l) utrace(p, l)
+#  else
+#    define UTRACE_CALL(p, l) utrace("jemalloc_process", p, l)
+#    define JEMALLOC_UTRACE
+#  endif
+#endif
+
+#define JEMALLOC_NO_DEMANGLE
+#ifdef JEMALLOC_JET
+#  undef JEMALLOC_IS_MALLOC
+#  define JEMALLOC_N(n) jet_##n
+#  include "jemalloc/internal/public_namespace.h"
+#  define JEMALLOC_NO_RENAME
+#  include "../jemalloc@[email protected]"
+#  undef JEMALLOC_NO_RENAME
+#else
+#  define JEMALLOC_N(n) @private_namespace@##n
+#  include "../jemalloc@[email protected]"
+#endif
+
+#if defined(JEMALLOC_OSATOMIC)
+#include <libkern/OSAtomic.h>
+#endif
+
+#ifdef JEMALLOC_ZONE
+#include <mach/mach_error.h>
+#include <mach/mach_init.h>
+#include <mach/vm_map.h>
+#endif
+
+#include "jemalloc/internal/jemalloc_internal_macros.h"
+
+/*
+ * Note that the ordering matters here; the hook itself is name-mangled.  We
+ * want the inclusion of hooks to happen early, so that we hook as much as
+ * possible.
+ */
+#ifndef JEMALLOC_NO_PRIVATE_NAMESPACE
+#  ifndef JEMALLOC_JET
+#    include "jemalloc/internal/private_namespace.h"
+#  else
+#    include "jemalloc/internal/private_namespace_jet.h"
+#  endif
+#endif
+#include "jemalloc/internal/test_hooks.h"
+
+#ifdef JEMALLOC_DEFINE_MADVISE_FREE
+#  define JEMALLOC_MADV_FREE 8
+#endif
+
+static const bool config_debug =
+#ifdef JEMALLOC_DEBUG
+    true
+#else
+    false
+#endif
+    ;
+static const bool have_dss =
+#ifdef JEMALLOC_DSS
+    true
+#else
+    false
+#endif
+    ;
+static const bool have_madvise_huge =
+#ifdef JEMALLOC_HAVE_MADVISE_HUGE
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_fill =
+#ifdef JEMALLOC_FILL
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_lazy_lock =
+#ifdef JEMALLOC_LAZY_LOCK
+    true
+#else
+    false
+#endif
+    ;
+static const char * const config_malloc_conf = JEMALLOC_CONFIG_MALLOC_CONF;
+static const bool config_prof =
+#ifdef JEMALLOC_PROF
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_prof_libgcc =
+#ifdef JEMALLOC_PROF_LIBGCC
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_prof_libunwind =
+#ifdef JEMALLOC_PROF_LIBUNWIND
+    true
+#else
+    false
+#endif
+    ;
+static const bool maps_coalesce =
+#ifdef JEMALLOC_MAPS_COALESCE
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_stats =
+#ifdef JEMALLOC_STATS
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_tls =
+#ifdef JEMALLOC_TLS
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_utrace =
+#ifdef JEMALLOC_UTRACE
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_xmalloc =
+#ifdef JEMALLOC_XMALLOC
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_cache_oblivious =
+#ifdef JEMALLOC_CACHE_OBLIVIOUS
+    true
+#else
+    false
+#endif
+    ;
+/*
+ * Undocumented, for jemalloc development use only at the moment.  See the note
+ * in jemalloc/internal/log.h.
+ */
+static const bool config_log =
+#ifdef JEMALLOC_LOG
+    true
+#else
+    false
+#endif
+    ;
+/*
+ * Are extra safety checks enabled; things like checking the size of sized
+ * deallocations, double-frees, etc.
+ */
+static const bool config_opt_safety_checks =
+#ifdef JEMALLOC_OPT_SAFETY_CHECKS
+    true
+#elif defined(JEMALLOC_DEBUG)
+    /*
+     * This lets us only guard safety checks by one flag instead of two; fast
+     * checks can guard solely by config_opt_safety_checks and run in debug mode
+     * too.
+     */
+    true
+#else
+    false
+#endif
+    ;
+
+/*
+ * Extra debugging of sized deallocations too onerous to be included in the
+ * general safety checks.
+ */
+static const bool config_opt_size_checks =
+#if defined(JEMALLOC_OPT_SIZE_CHECKS) || defined(JEMALLOC_DEBUG)
+    true
+#else
+    false
+#endif
+    ;
+
+static const bool config_uaf_detection =
+#if defined(JEMALLOC_UAF_DETECTION) || defined(JEMALLOC_DEBUG)
+    true
+#else
+    false
+#endif
+    ;
+
+/* Whether or not the C++ extensions are enabled. */
+static const bool config_enable_cxx =
+#ifdef JEMALLOC_ENABLE_CXX
+    true
+#else
+    false
+#endif
+;
+
+#if defined(_WIN32) || defined(JEMALLOC_HAVE_SCHED_GETCPU)
+/* Currently percpu_arena depends on sched_getcpu. */
+#define JEMALLOC_PERCPU_ARENA
+#endif
+static const bool have_percpu_arena =
+#ifdef JEMALLOC_PERCPU_ARENA
+    true
+#else
+    false
+#endif
+    ;
+/*
+ * Undocumented, and not recommended; the application should take full
+ * responsibility for tracking provenance.
+ */
+static const bool force_ivsalloc =
+#ifdef JEMALLOC_FORCE_IVSALLOC
+    true
+#else
+    false
+#endif
+    ;
+static const bool have_background_thread =
+#ifdef JEMALLOC_BACKGROUND_THREAD
+    true
+#else
+    false
+#endif
+    ;
+static const bool config_high_res_timer =
+#ifdef JEMALLOC_HAVE_CLOCK_REALTIME
+    true
+#else
+    false
+#endif
+    ;
+
+static const bool have_memcntl =
+#ifdef JEMALLOC_HAVE_MEMCNTL
+    true
+#else
+    false
+#endif
+    ;
+
+#endif /* JEMALLOC_PREAMBLE_H */

+ 24 - 0
BeefRT/JEMalloc/include/jemalloc/internal/large_externs.h

@@ -0,0 +1,24 @@
+#ifndef JEMALLOC_INTERNAL_LARGE_EXTERNS_H
+#define JEMALLOC_INTERNAL_LARGE_EXTERNS_H
+
+#include "jemalloc/internal/hook.h"
+
+void *large_malloc(tsdn_t *tsdn, arena_t *arena, size_t usize, bool zero);
+void *large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment,
+    bool zero);
+bool large_ralloc_no_move(tsdn_t *tsdn, edata_t *edata, size_t usize_min,
+    size_t usize_max, bool zero);
+void *large_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t usize,
+    size_t alignment, bool zero, tcache_t *tcache,
+    hook_ralloc_args_t *hook_args);
+
+void large_dalloc_prep_locked(tsdn_t *tsdn, edata_t *edata);
+void large_dalloc_finish(tsdn_t *tsdn, edata_t *edata);
+void large_dalloc(tsdn_t *tsdn, edata_t *edata);
+size_t large_salloc(tsdn_t *tsdn, const edata_t *edata);
+void large_prof_info_get(tsd_t *tsd, edata_t *edata, prof_info_t *prof_info,
+    bool reset_recent);
+void large_prof_tctx_reset(edata_t *edata);
+void large_prof_info_set(edata_t *edata, prof_tctx_t *tctx, size_t size);
+
+#endif /* JEMALLOC_INTERNAL_LARGE_EXTERNS_H */

+ 204 - 0
BeefRT/JEMalloc/include/jemalloc/internal/lockedint.h

@@ -0,0 +1,204 @@
+#ifndef JEMALLOC_INTERNAL_LOCKEDINT_H
+#define JEMALLOC_INTERNAL_LOCKEDINT_H
+
+/*
+ * In those architectures that support 64-bit atomics, we use atomic updates for
+ * our 64-bit values.  Otherwise, we use a plain uint64_t and synchronize
+ * externally.
+ */
+
+typedef struct locked_u64_s locked_u64_t;
+#ifdef JEMALLOC_ATOMIC_U64
+struct locked_u64_s {
+	atomic_u64_t val;
+};
+#else
+/* Must hold the associated mutex. */
+struct locked_u64_s {
+	uint64_t val;
+};
+#endif
+
+typedef struct locked_zu_s locked_zu_t;
+struct locked_zu_s {
+	atomic_zu_t val;
+};
+
+#ifndef JEMALLOC_ATOMIC_U64
+#  define LOCKEDINT_MTX_DECLARE(name) malloc_mutex_t name;
+#  define LOCKEDINT_MTX_INIT(mu, name, rank, rank_mode)			\
+    malloc_mutex_init(&(mu), name, rank, rank_mode)
+#  define LOCKEDINT_MTX(mtx) (&(mtx))
+#  define LOCKEDINT_MTX_LOCK(tsdn, mu) malloc_mutex_lock(tsdn, &(mu))
+#  define LOCKEDINT_MTX_UNLOCK(tsdn, mu) malloc_mutex_unlock(tsdn, &(mu))
+#  define LOCKEDINT_MTX_PREFORK(tsdn, mu) malloc_mutex_prefork(tsdn, &(mu))
+#  define LOCKEDINT_MTX_POSTFORK_PARENT(tsdn, mu)			\
+    malloc_mutex_postfork_parent(tsdn, &(mu))
+#  define LOCKEDINT_MTX_POSTFORK_CHILD(tsdn, mu)			\
+    malloc_mutex_postfork_child(tsdn, &(mu))
+#else
+#  define LOCKEDINT_MTX_DECLARE(name)
+#  define LOCKEDINT_MTX(mtx) NULL
+#  define LOCKEDINT_MTX_INIT(mu, name, rank, rank_mode) false
+#  define LOCKEDINT_MTX_LOCK(tsdn, mu)
+#  define LOCKEDINT_MTX_UNLOCK(tsdn, mu)
+#  define LOCKEDINT_MTX_PREFORK(tsdn, mu)
+#  define LOCKEDINT_MTX_POSTFORK_PARENT(tsdn, mu)
+#  define LOCKEDINT_MTX_POSTFORK_CHILD(tsdn, mu)
+#endif
+
+#ifdef JEMALLOC_ATOMIC_U64
+#  define LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx) assert((mtx) == NULL)
+#else
+#  define LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx)			\
+    malloc_mutex_assert_owner(tsdn, (mtx))
+#endif
+
+static inline uint64_t
+locked_read_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p) {
+	LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx);
+#ifdef JEMALLOC_ATOMIC_U64
+	return atomic_load_u64(&p->val, ATOMIC_RELAXED);
+#else
+	return p->val;
+#endif
+}
+
+static inline void
+locked_inc_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p,
+    uint64_t x) {
+	LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx);
+#ifdef JEMALLOC_ATOMIC_U64
+	atomic_fetch_add_u64(&p->val, x, ATOMIC_RELAXED);
+#else
+	p->val += x;
+#endif
+}
+
+static inline void
+locked_dec_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p,
+    uint64_t x) {
+	LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx);
+#ifdef JEMALLOC_ATOMIC_U64
+	uint64_t r = atomic_fetch_sub_u64(&p->val, x, ATOMIC_RELAXED);
+	assert(r - x <= r);
+#else
+	p->val -= x;
+	assert(p->val + x >= p->val);
+#endif
+}
+
+/* Increment and take modulus.  Returns whether the modulo made any change.  */
+static inline bool
+locked_inc_mod_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p,
+    const uint64_t x, const uint64_t modulus) {
+	LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx);
+	uint64_t before, after;
+	bool overflow;
+#ifdef JEMALLOC_ATOMIC_U64
+	before = atomic_load_u64(&p->val, ATOMIC_RELAXED);
+	do {
+		after = before + x;
+		assert(after >= before);
+		overflow = (after >= modulus);
+		if (overflow) {
+			after %= modulus;
+		}
+	} while (!atomic_compare_exchange_weak_u64(&p->val, &before, after,
+	    ATOMIC_RELAXED, ATOMIC_RELAXED));
+#else
+	before = p->val;
+	after = before + x;
+	overflow = (after >= modulus);
+	if (overflow) {
+		after %= modulus;
+	}
+	p->val = after;
+#endif
+	return overflow;
+}
+
+/*
+ * Non-atomically sets *dst += src.  *dst needs external synchronization.
+ * This lets us avoid the cost of a fetch_add when it's unnecessary (note that
+ * the types here are atomic).
+ */
+static inline void
+locked_inc_u64_unsynchronized(locked_u64_t *dst, uint64_t src) {
+#ifdef JEMALLOC_ATOMIC_U64
+	uint64_t cur_dst = atomic_load_u64(&dst->val, ATOMIC_RELAXED);
+	atomic_store_u64(&dst->val, src + cur_dst, ATOMIC_RELAXED);
+#else
+	dst->val += src;
+#endif
+}
+
+static inline uint64_t
+locked_read_u64_unsynchronized(locked_u64_t *p) {
+#ifdef JEMALLOC_ATOMIC_U64
+	return atomic_load_u64(&p->val, ATOMIC_RELAXED);
+#else
+	return p->val;
+#endif
+}
+
+static inline void
+locked_init_u64_unsynchronized(locked_u64_t *p, uint64_t x) {
+#ifdef JEMALLOC_ATOMIC_U64
+	atomic_store_u64(&p->val, x, ATOMIC_RELAXED);
+#else
+	p->val = x;
+#endif
+}
+
+static inline size_t
+locked_read_zu(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_zu_t *p) {
+	LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx);
+#ifdef JEMALLOC_ATOMIC_U64
+	return atomic_load_zu(&p->val, ATOMIC_RELAXED);
+#else
+	return atomic_load_zu(&p->val, ATOMIC_RELAXED);
+#endif
+}
+
+static inline void
+locked_inc_zu(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_zu_t *p,
+    size_t x) {
+	LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx);
+#ifdef JEMALLOC_ATOMIC_U64
+	atomic_fetch_add_zu(&p->val, x, ATOMIC_RELAXED);
+#else
+	size_t cur = atomic_load_zu(&p->val, ATOMIC_RELAXED);
+	atomic_store_zu(&p->val, cur + x, ATOMIC_RELAXED);
+#endif
+}
+
+static inline void
+locked_dec_zu(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_zu_t *p,
+    size_t x) {
+	LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx);
+#ifdef JEMALLOC_ATOMIC_U64
+	size_t r = atomic_fetch_sub_zu(&p->val, x, ATOMIC_RELAXED);
+	assert(r - x <= r);
+#else
+	size_t cur = atomic_load_zu(&p->val, ATOMIC_RELAXED);
+	atomic_store_zu(&p->val, cur - x, ATOMIC_RELAXED);
+#endif
+}
+
+/* Like the _u64 variant, needs an externally synchronized *dst. */
+static inline void
+locked_inc_zu_unsynchronized(locked_zu_t *dst, size_t src) {
+	size_t cur_dst = atomic_load_zu(&dst->val, ATOMIC_RELAXED);
+	atomic_store_zu(&dst->val, src + cur_dst, ATOMIC_RELAXED);
+}
+
+/*
+ * Unlike the _u64 variant, this is safe to call unconditionally.
+ */
+static inline size_t
+locked_read_atomic_zu(locked_zu_t *p) {
+	return atomic_load_zu(&p->val, ATOMIC_RELAXED);
+}
+
+#endif /* JEMALLOC_INTERNAL_LOCKEDINT_H */
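
Usage sketch, for illustration only (not code from this commit): wiring a locked_u64_t counter up with the LOCKEDINT_MTX_* macros, in the same style the arena stats code uses. The my_stats_t struct, its field names, and the mutex name string are hypothetical; when 64-bit atomics are available the mutex macros expand to nothing and the counter becomes a relaxed atomic.

```c
/* Hypothetical stats bundle guarded by a LOCKEDINT mutex. */
typedef struct my_stats_s {
	LOCKEDINT_MTX_DECLARE(mtx)	/* expands to nothing with 64-bit atomics */
	locked_u64_t nrequests;
} my_stats_t;

static bool
my_stats_init(my_stats_t *stats) {
	/* Returns true on error, matching malloc_mutex_init(). */
	return LOCKEDINT_MTX_INIT(stats->mtx, "my_stats", WITNESS_RANK_OMIT,
	    malloc_mutex_rank_exclusive);
}

static void
my_stats_bump(tsdn_t *tsdn, my_stats_t *stats, uint64_t n) {
	LOCKEDINT_MTX_LOCK(tsdn, stats->mtx);
	locked_inc_u64(tsdn, LOCKEDINT_MTX(stats->mtx), &stats->nrequests, n);
	LOCKEDINT_MTX_UNLOCK(tsdn, stats->mtx);
}
```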

+ 115 - 0
BeefRT/JEMalloc/include/jemalloc/internal/log.h

@@ -0,0 +1,115 @@
+#ifndef JEMALLOC_INTERNAL_LOG_H
+#define JEMALLOC_INTERNAL_LOG_H
+
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/malloc_io.h"
+#include "jemalloc/internal/mutex.h"
+
+#ifdef JEMALLOC_LOG
+#  define JEMALLOC_LOG_VAR_BUFSIZE 1000
+#else
+#  define JEMALLOC_LOG_VAR_BUFSIZE 1
+#endif
+
+#define JEMALLOC_LOG_BUFSIZE 4096
+
+/*
+ * The log malloc_conf option is a '|'-delimited list of log_var name segments
+ * which should be logged.  The names are themselves hierarchical, with '.' as
+ * the delimiter (a "segment" is just a prefix in the log namespace).  So, if
+ * you have:
+ *
+ * log("arena", "log msg for arena"); // 1
+ * log("arena.a", "log msg for arena.a"); // 2
+ * log("arena.b", "log msg for arena.b"); // 3
+ * log("arena.a.a", "log msg for arena.a.a"); // 4
+ * log("extent.a", "log msg for extent.a"); // 5
+ * log("extent.b", "log msg for extent.b"); // 6
+ *
+ * And your malloc_conf option is "log=arena.a|extent", then lines 2, 4, 5, and
+ * 6 will print at runtime.  You can enable logging from all log vars by
+ * writing "log=.".
+ *
+ * None of this should be regarded as a stable API for right now.  It's intended
+ * as a debugging interface, to let us keep around some of our printf-debugging
+ * statements.
+ */
+
+extern char log_var_names[JEMALLOC_LOG_VAR_BUFSIZE];
+extern atomic_b_t log_init_done;
+
+typedef struct log_var_s log_var_t;
+struct log_var_s {
+	/*
+	 * Lowest bit is "inited", second lowest is "enabled".  Putting them in
+	 * a single word lets us avoid any fences on weak architectures.
+	 */
+	atomic_u_t state;
+	const char *name;
+};
+
+#define LOG_NOT_INITIALIZED 0U
+#define LOG_INITIALIZED_NOT_ENABLED 1U
+#define LOG_ENABLED 2U
+
+#define LOG_VAR_INIT(name_str) {ATOMIC_INIT(LOG_NOT_INITIALIZED), name_str}
+
+/*
+ * Returns the value we should assume for state (which is not necessarily
+ * accurate; if logging is done before logging has finished initializing, then
+ * we default to doing the safe thing by logging everything).
+ */
+unsigned log_var_update_state(log_var_t *log_var);
+
+/* We factor out the metadata management to allow us to test more easily. */
+#define log_do_begin(log_var)						\
+if (config_log) {							\
+	unsigned log_state = atomic_load_u(&(log_var).state,		\
+	    ATOMIC_RELAXED);						\
+	if (unlikely(log_state == LOG_NOT_INITIALIZED)) {		\
+		log_state = log_var_update_state(&(log_var));		\
+		assert(log_state != LOG_NOT_INITIALIZED);		\
+	}								\
+	if (log_state == LOG_ENABLED) {					\
+		{
+			/* User code executes here. */
+#define log_do_end(log_var)						\
+		}							\
+	}								\
+}
+
+/*
+ * MSVC has some preprocessor bugs in its expansion of __VA_ARGS__ during
+ * preprocessing.  To work around this, we take all potential extra arguments in
+ * a varargs function.  Since a varargs macro needs at least one argument in
+ * the "...", we accept the format string there, and require that the first
+ * argument in this "..." is a const char *.
+ */
+static inline void
+log_impl_varargs(const char *name, ...) {
+	char buf[JEMALLOC_LOG_BUFSIZE];
+	va_list ap;
+
+	va_start(ap, name);
+	const char *format = va_arg(ap, const char *);
+	size_t dst_offset = 0;
+	dst_offset += malloc_snprintf(buf, JEMALLOC_LOG_BUFSIZE, "%s: ", name);
+	dst_offset += malloc_vsnprintf(buf + dst_offset,
+	    JEMALLOC_LOG_BUFSIZE - dst_offset, format, ap);
+	dst_offset += malloc_snprintf(buf + dst_offset,
+	    JEMALLOC_LOG_BUFSIZE - dst_offset, "\n");
+	va_end(ap);
+
+	malloc_write(buf);
+}
+
+/* Call as log("log.var.str", "format_string %d", arg_for_format_string); */
+#define LOG(log_var_str, ...)						\
+do {									\
+	static log_var_t log_var = LOG_VAR_INIT(log_var_str);		\
+	log_do_begin(log_var)						\
+		log_impl_varargs((log_var).name, __VA_ARGS__);		\
+	log_do_end(log_var)						\
+} while (0)
+
+#endif /* JEMALLOC_INTERNAL_LOG_H */
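
Usage sketch, for illustration only: a hypothetical call site for the LOG macro. The log-var name "extent.split" is made up; with this in place, running with MALLOC_CONF="log=extent.split" (or "log=extent", or "log=." for everything) causes the message to be printed.

```c
/* Hypothetical call site; the body is compiled out unless config_log. */
static void
example_split_event(void *addr, size_t size) {
	LOG("extent.split", "splitting extent %p of size %zu", addr, size);
}
```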

+ 105 - 0
BeefRT/JEMalloc/include/jemalloc/internal/malloc_io.h

@@ -0,0 +1,105 @@
+#ifndef JEMALLOC_INTERNAL_MALLOC_IO_H
+#define JEMALLOC_INTERNAL_MALLOC_IO_H
+
+#include "jemalloc/internal/jemalloc_internal_types.h"
+
+#ifdef _WIN32
+#  ifdef _WIN64
+#    define FMT64_PREFIX "ll"
+#    define FMTPTR_PREFIX "ll"
+#  else
+#    define FMT64_PREFIX "ll"
+#    define FMTPTR_PREFIX ""
+#  endif
+#  define FMTd32 "d"
+#  define FMTu32 "u"
+#  define FMTx32 "x"
+#  define FMTd64 FMT64_PREFIX "d"
+#  define FMTu64 FMT64_PREFIX "u"
+#  define FMTx64 FMT64_PREFIX "x"
+#  define FMTdPTR FMTPTR_PREFIX "d"
+#  define FMTuPTR FMTPTR_PREFIX "u"
+#  define FMTxPTR FMTPTR_PREFIX "x"
+#else
+#  include <inttypes.h>
+#  define FMTd32 PRId32
+#  define FMTu32 PRIu32
+#  define FMTx32 PRIx32
+#  define FMTd64 PRId64
+#  define FMTu64 PRIu64
+#  define FMTx64 PRIx64
+#  define FMTdPTR PRIdPTR
+#  define FMTuPTR PRIuPTR
+#  define FMTxPTR PRIxPTR
+#endif
+
+/* Size of stack-allocated buffer passed to buferror(). */
+#define BUFERROR_BUF		64
+
+/*
+ * Size of stack-allocated buffer used by malloc_{,v,vc}printf().  This must be
+ * large enough for all possible uses within jemalloc.
+ */
+#define MALLOC_PRINTF_BUFSIZE	4096
+
+write_cb_t wrtmessage;
+int buferror(int err, char *buf, size_t buflen);
+uintmax_t malloc_strtoumax(const char *restrict nptr, char **restrict endptr,
+    int base);
+void malloc_write(const char *s);
+
+/*
+ * malloc_vsnprintf() supports a subset of snprintf(3) that avoids floating
+ * point math.
+ */
+size_t malloc_vsnprintf(char *str, size_t size, const char *format,
+    va_list ap);
+size_t malloc_snprintf(char *str, size_t size, const char *format, ...)
+    JEMALLOC_FORMAT_PRINTF(3, 4);
+/*
+ * The caller can set write_cb to null to choose to print with the
+ * je_malloc_message hook.
+ */
+void malloc_vcprintf(write_cb_t *write_cb, void *cbopaque, const char *format,
+    va_list ap);
+void malloc_cprintf(write_cb_t *write_cb, void *cbopaque, const char *format,
+    ...) JEMALLOC_FORMAT_PRINTF(3, 4);
+void malloc_printf(const char *format, ...) JEMALLOC_FORMAT_PRINTF(1, 2);
+
+static inline ssize_t
+malloc_write_fd(int fd, const void *buf, size_t count) {
+#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_write)
+	/*
+	 * Use syscall(2) rather than write(2) when possible in order to avoid
+	 * the possibility of memory allocation within libc.  This is necessary
+	 * on FreeBSD; most operating systems do not have this problem though.
+	 *
+	 * syscall() returns long or int, depending on platform, so capture the
+	 * result in the widest plausible type to avoid compiler warnings.
+	 */
+	long result = syscall(SYS_write, fd, buf, count);
+#else
+	ssize_t result = (ssize_t)write(fd, buf,
+#ifdef _WIN32
+	    (unsigned int)
+#endif
+	    count);
+#endif
+	return (ssize_t)result;
+}
+
+static inline ssize_t
+malloc_read_fd(int fd, void *buf, size_t count) {
+#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_read)
+	long result = syscall(SYS_read, fd, buf, count);
+#else
+	ssize_t result = read(fd, buf,
+#ifdef _WIN32
+	    (unsigned int)
+#endif
+	    count);
+#endif
+	return (ssize_t)result;
+}
+
+#endif /* JEMALLOC_INTERNAL_MALLOC_IO_H */
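
Usage sketch, for illustration only: printing 64-bit and pointer-sized values portably with the FMT macros and malloc_printf. The report_extent helper is invented for this example.

```c
/* Print a 64-bit serial number and a pointer-sized value portably. */
static void
report_extent(void *addr, uint64_t sn) {
	malloc_printf("extent %p: serial %"FMTu64", addr 0x%"FMTxPTR"\n",
	    addr, sn, (uintptr_t)addr);
}
```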

+ 134 - 0
BeefRT/JEMalloc/include/jemalloc/internal/mpsc_queue.h

@@ -0,0 +1,134 @@
+#ifndef JEMALLOC_INTERNAL_MPSC_QUEUE_H
+#define JEMALLOC_INTERNAL_MPSC_QUEUE_H
+
+#include "jemalloc/internal/atomic.h"
+
+/*
+ * A concurrent implementation of a multi-producer, single-consumer queue.  It
+ * supports three concurrent operations:
+ * - Push
+ * - Push batch
+ * - Pop batch
+ *
+ * These operations are all lock-free.
+ *
+ * The implementation is the simple two-stack queue built on a Treiber stack.
+ * It's not terribly efficient, but this isn't expected to go into anywhere with
+ * hot code.  In fact, we don't really even need queue semantics in any
+ * anticipated use cases; we could get away with just the stack.  But this way
+ * lets us frame the API in terms of the existing list types, which is a nice
+ * convenience.  We can save on cache misses by introducing our own (parallel)
+ * single-linked list type here, and dropping FIFO semantics, if we need this to
+ * get faster.  Since we're currently providing queue semantics though, we use
+ * the prev field in the link rather than the next field for Treiber-stack
+ * linkage, so that we can preserve order for bash-pushed lists (recall that the
+ * two-stack tricks reverses orders in the lock-free first stack).
+ */
+
+#define mpsc_queue(a_type)						\
+struct {								\
+	atomic_p_t tail;						\
+}
+
+#define mpsc_queue_proto(a_attr, a_prefix, a_queue_type, a_type,	\
+    a_list_type)							\
+/* Initialize a queue. */						\
+a_attr void								\
+a_prefix##new(a_queue_type *queue);					\
+/* Insert all items in src into the queue, clearing src. */		\
+a_attr void								\
+a_prefix##push_batch(a_queue_type *queue, a_list_type *src);		\
+/* Insert node into the queue. */					\
+a_attr void								\
+a_prefix##push(a_queue_type *queue, a_type *node);			\
+/*									\
+ * Pop all items in the queue into the list at dst.  dst should already	\
+ * be initialized (and may contain existing items, which then remain	\
+ * in dst).								\
+ */									\
+a_attr void								\
+a_prefix##pop_batch(a_queue_type *queue, a_list_type *dst);
+
+#define mpsc_queue_gen(a_attr, a_prefix, a_queue_type, a_type,		\
+    a_list_type, a_link)						\
+a_attr void								\
+a_prefix##new(a_queue_type *queue) {					\
+	atomic_store_p(&queue->tail, NULL, ATOMIC_RELAXED);		\
+}									\
+a_attr void								\
+a_prefix##push_batch(a_queue_type *queue, a_list_type *src) {		\
+	/*								\
+	 * Reuse the ql list next field as the Treiber stack next	\
+	 * field.							\
+	 */								\
+	a_type *first = ql_first(src);					\
+	a_type *last = ql_last(src, a_link);				\
+	void* cur_tail = atomic_load_p(&queue->tail, ATOMIC_RELAXED);	\
+	do {								\
+		/*							\
+		 * Note that this breaks the queue ring structure;	\
+		 * it's not a ring any more!				\
+		 */							\
+		first->a_link.qre_prev = cur_tail;			\
+		/*							\
+		 * Note: the upcoming CAS doesn't need an acquire; every	\
+		 * push only needs to synchronize with the next pop,	\
+		 * which we get from the release sequence rules.	\
+		 */							\
+	} while (!atomic_compare_exchange_weak_p(&queue->tail,		\
+	    &cur_tail, last, ATOMIC_RELEASE, ATOMIC_RELAXED));		\
+	ql_new(src);							\
+}									\
+a_attr void								\
+a_prefix##push(a_queue_type *queue, a_type *node) {			\
+	ql_elm_new(node, a_link);					\
+	a_list_type list;						\
+	ql_new(&list);							\
+	ql_head_insert(&list, node, a_link);				\
+	a_prefix##push_batch(queue, &list);				\
+}									\
+a_attr void								\
+a_prefix##pop_batch(a_queue_type *queue, a_list_type *dst) {		\
+	a_type *tail = atomic_load_p(&queue->tail, ATOMIC_RELAXED);	\
+	if (tail == NULL) {						\
+		/*							\
+		 * In the common special case where there are no	\
+		 * pending elements, bail early without a costly RMW.	\
+		 */							\
+		return;							\
+	}								\
+	tail = atomic_exchange_p(&queue->tail, NULL, ATOMIC_ACQUIRE);	\
+	/*								\
+	 * It's a single-consumer queue, so if cur started non-NULL,	\
+	 * it'd better stay non-NULL.					\
+	 */								\
+	assert(tail != NULL);						\
+	/*								\
+	 * We iterate through the stack and fix up the link		\
+	 * structure (stack insertion broke the list requirement that	\
+	 * the list be circularly linked).  It's just as efficient at	\
+	 * this point to make the queue a "real" queue, so do that as	\
+	 * well.							\
+	 * If this ever gets to be a hot spot, we can omit this fixup	\
+	 * and make the queue a bag (i.e. not necessarily ordered), but	\
+	 * that would mean jettisoning the existing list API as the 	\
+	 * batch pushing/popping interface.				\
+	 */								\
+	a_list_type reversed;						\
+	ql_new(&reversed);						\
+	while (tail != NULL) {						\
+		/*							\
+		 * Pop an item off the stack, prepend it onto the list	\
+		 * (reversing the order).  Recall that we use the	\
+		 * list prev field as the Treiber stack next field to	\
+		 * preserve order of batch-pushed items when reversed.	\
+		 */							\
+		a_type *next = tail->a_link.qre_prev;			\
+		ql_elm_new(tail, a_link);				\
+		ql_head_insert(&reversed, tail, a_link);		\
+		tail = next;						\
+	}								\
+	ql_concat(dst, &reversed, a_link);				\
+}
+
+#endif /* JEMALLOC_INTERNAL_MPSC_QUEUE_H */
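
Instantiation sketch, for illustration only: how the generator macros are meant to be used, along the lines of jemalloc's own unit test. The work_t type, work_list_t, and the work_queue_ prefix are made-up names; ql_elm/ql_head come from jemalloc's ql.h list module.

```c
/* Hypothetical node type; its ql link doubles as the Treiber-stack link. */
typedef struct work_s work_t;
struct work_s {
	void		*payload;
	ql_elm(work_t)	link;
};
typedef ql_head(work_t) work_list_t;
typedef mpsc_queue(work_t) work_queue_t;

/* Declares and defines work_queue_new/push/push_batch/pop_batch. */
mpsc_queue_proto(static, work_queue_, work_queue_t, work_t, work_list_t);
mpsc_queue_gen(static, work_queue_, work_queue_t, work_t, work_list_t, link);
```

Producers then call work_queue_push(&queue, node) concurrently, while the single consumer periodically drains everything with work_queue_pop_batch(&queue, &list) and walks the result with the usual ql iteration macros.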

+ 319 - 0
BeefRT/JEMalloc/include/jemalloc/internal/mutex.h

@@ -0,0 +1,319 @@
+#ifndef JEMALLOC_INTERNAL_MUTEX_H
+#define JEMALLOC_INTERNAL_MUTEX_H
+
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/mutex_prof.h"
+#include "jemalloc/internal/tsd.h"
+#include "jemalloc/internal/witness.h"
+
+extern int64_t opt_mutex_max_spin;
+
+typedef enum {
+	/* Can only acquire one mutex of a given witness rank at a time. */
+	malloc_mutex_rank_exclusive,
+	/*
+	 * Can acquire multiple mutexes of the same witness rank, but in
+	 * address-ascending order only.
+	 */
+	malloc_mutex_address_ordered
+} malloc_mutex_lock_order_t;
+
+typedef struct malloc_mutex_s malloc_mutex_t;
+struct malloc_mutex_s {
+	union {
+		struct {
+			/*
+			 * prof_data is defined first to reduce cacheline
+			 * bouncing: the data is not touched by the mutex holder
+			 * during unlocking, while it might be modified by
+			 * contenders.  Having it before the mutex itself could
+			 * avoid prefetching a modified cacheline (for the
+			 * unlocking thread).
+			 */
+			mutex_prof_data_t	prof_data;
+#ifdef _WIN32
+#  if _WIN32_WINNT >= 0x0600
+			SRWLOCK         	lock;
+#  else
+			CRITICAL_SECTION	lock;
+#  endif
+#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
+			os_unfair_lock		lock;
+#elif (defined(JEMALLOC_MUTEX_INIT_CB))
+			pthread_mutex_t		lock;
+			malloc_mutex_t		*postponed_next;
+#else
+			pthread_mutex_t		lock;
+#endif
+			/*
+			 * Hint flag to avoid exclusive cache line contention
+			 * during spin waiting
+			 */
+			atomic_b_t		locked;
+		};
+		/*
+		 * We only touch witness when configured w/ debug.  However we
+		 * keep the field in a union when !debug so that we don't have
+		 * to pollute the code base with #ifdefs, while avoiding the
+		 * memory cost.
+		 */
+#if !defined(JEMALLOC_DEBUG)
+		witness_t			witness;
+		malloc_mutex_lock_order_t	lock_order;
+#endif
+	};
+
+#if defined(JEMALLOC_DEBUG)
+	witness_t			witness;
+	malloc_mutex_lock_order_t	lock_order;
+#endif
+};
+
+#ifdef _WIN32
+#  if _WIN32_WINNT >= 0x0600
+#    define MALLOC_MUTEX_LOCK(m)    AcquireSRWLockExclusive(&(m)->lock)
+#    define MALLOC_MUTEX_UNLOCK(m)  ReleaseSRWLockExclusive(&(m)->lock)
+#    define MALLOC_MUTEX_TRYLOCK(m) (!TryAcquireSRWLockExclusive(&(m)->lock))
+#  else
+#    define MALLOC_MUTEX_LOCK(m)    EnterCriticalSection(&(m)->lock)
+#    define MALLOC_MUTEX_UNLOCK(m)  LeaveCriticalSection(&(m)->lock)
+#    define MALLOC_MUTEX_TRYLOCK(m) (!TryEnterCriticalSection(&(m)->lock))
+#  endif
+#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
+#    define MALLOC_MUTEX_LOCK(m)    os_unfair_lock_lock(&(m)->lock)
+#    define MALLOC_MUTEX_UNLOCK(m)  os_unfair_lock_unlock(&(m)->lock)
+#    define MALLOC_MUTEX_TRYLOCK(m) (!os_unfair_lock_trylock(&(m)->lock))
+#else
+#    define MALLOC_MUTEX_LOCK(m)    pthread_mutex_lock(&(m)->lock)
+#    define MALLOC_MUTEX_UNLOCK(m)  pthread_mutex_unlock(&(m)->lock)
+#    define MALLOC_MUTEX_TRYLOCK(m) (pthread_mutex_trylock(&(m)->lock) != 0)
+#endif
+
+#define LOCK_PROF_DATA_INITIALIZER					\
+    {NSTIME_ZERO_INITIALIZER, NSTIME_ZERO_INITIALIZER, 0, 0, 0,		\
+	    ATOMIC_INIT(0), 0, NULL, 0}
+
+#ifdef _WIN32
+#  define MALLOC_MUTEX_INITIALIZER
+#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
+#  if defined(JEMALLOC_DEBUG)
+#    define MALLOC_MUTEX_INITIALIZER					\
+  {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \
+         WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
+#  else
+#    define MALLOC_MUTEX_INITIALIZER                      \
+  {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}},  \
+      WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
+#  endif
+#elif (defined(JEMALLOC_MUTEX_INIT_CB))
+#  if (defined(JEMALLOC_DEBUG))
+#     define MALLOC_MUTEX_INITIALIZER					\
+      {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}},	\
+           WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
+#  else
+#     define MALLOC_MUTEX_INITIALIZER					\
+      {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}},	\
+           WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
+#  endif
+
+#else
+#    define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_DEFAULT
+#  if defined(JEMALLOC_DEBUG)
+#    define MALLOC_MUTEX_INITIALIZER					\
+     {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \
+           WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
+#  else
+#    define MALLOC_MUTEX_INITIALIZER                          \
+     {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}},	\
+      WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
+#  endif
+#endif
+
+#ifdef JEMALLOC_LAZY_LOCK
+extern bool isthreaded;
+#else
+#  undef isthreaded /* Undo private_namespace.h definition. */
+#  define isthreaded true
+#endif
+
+bool malloc_mutex_init(malloc_mutex_t *mutex, const char *name,
+    witness_rank_t rank, malloc_mutex_lock_order_t lock_order);
+void malloc_mutex_prefork(tsdn_t *tsdn, malloc_mutex_t *mutex);
+void malloc_mutex_postfork_parent(tsdn_t *tsdn, malloc_mutex_t *mutex);
+void malloc_mutex_postfork_child(tsdn_t *tsdn, malloc_mutex_t *mutex);
+bool malloc_mutex_boot(void);
+void malloc_mutex_prof_data_reset(tsdn_t *tsdn, malloc_mutex_t *mutex);
+
+void malloc_mutex_lock_slow(malloc_mutex_t *mutex);
+
+static inline void
+malloc_mutex_lock_final(malloc_mutex_t *mutex) {
+	MALLOC_MUTEX_LOCK(mutex);
+	atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
+}
+
+static inline bool
+malloc_mutex_trylock_final(malloc_mutex_t *mutex) {
+	return MALLOC_MUTEX_TRYLOCK(mutex);
+}
+
+static inline void
+mutex_owner_stats_update(tsdn_t *tsdn, malloc_mutex_t *mutex) {
+	if (config_stats) {
+		mutex_prof_data_t *data = &mutex->prof_data;
+		data->n_lock_ops++;
+		if (data->prev_owner != tsdn) {
+			data->prev_owner = tsdn;
+			data->n_owner_switches++;
+		}
+	}
+}
+
+/* Trylock: return false if the lock is successfully acquired. */
+static inline bool
+malloc_mutex_trylock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
+	witness_assert_not_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
+	if (isthreaded) {
+		if (malloc_mutex_trylock_final(mutex)) {
+			atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
+			return true;
+		}
+		mutex_owner_stats_update(tsdn, mutex);
+	}
+	witness_lock(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
+
+	return false;
+}
+
+/* Aggregate lock prof data. */
+static inline void
+malloc_mutex_prof_merge(mutex_prof_data_t *sum, mutex_prof_data_t *data) {
+	nstime_add(&sum->tot_wait_time, &data->tot_wait_time);
+	if (nstime_compare(&sum->max_wait_time, &data->max_wait_time) < 0) {
+		nstime_copy(&sum->max_wait_time, &data->max_wait_time);
+	}
+
+	sum->n_wait_times += data->n_wait_times;
+	sum->n_spin_acquired += data->n_spin_acquired;
+
+	if (sum->max_n_thds < data->max_n_thds) {
+		sum->max_n_thds = data->max_n_thds;
+	}
+	uint32_t cur_n_waiting_thds = atomic_load_u32(&sum->n_waiting_thds,
+	    ATOMIC_RELAXED);
+	uint32_t new_n_waiting_thds = cur_n_waiting_thds + atomic_load_u32(
+	    &data->n_waiting_thds, ATOMIC_RELAXED);
+	atomic_store_u32(&sum->n_waiting_thds, new_n_waiting_thds,
+	    ATOMIC_RELAXED);
+	sum->n_owner_switches += data->n_owner_switches;
+	sum->n_lock_ops += data->n_lock_ops;
+}
+
+static inline void
+malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
+	witness_assert_not_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
+	if (isthreaded) {
+		if (malloc_mutex_trylock_final(mutex)) {
+			malloc_mutex_lock_slow(mutex);
+			atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
+		}
+		mutex_owner_stats_update(tsdn, mutex);
+	}
+	witness_lock(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
+}
+
+static inline void
+malloc_mutex_unlock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
+	atomic_store_b(&mutex->locked, false, ATOMIC_RELAXED);
+	witness_unlock(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
+	if (isthreaded) {
+		MALLOC_MUTEX_UNLOCK(mutex);
+	}
+}
+
+static inline void
+malloc_mutex_assert_owner(tsdn_t *tsdn, malloc_mutex_t *mutex) {
+	witness_assert_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
+}
+
+static inline void
+malloc_mutex_assert_not_owner(tsdn_t *tsdn, malloc_mutex_t *mutex) {
+	witness_assert_not_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
+}
+
+static inline void
+malloc_mutex_prof_copy(mutex_prof_data_t *dst, mutex_prof_data_t *source) {
+	/*
+	 * Not *really* allowed (we shouldn't be doing non-atomic loads of
+	 * atomic data), but the mutex protection makes this safe, and writing
+	 * a member-for-member copy is tedious for this situation.
+	 */
+	*dst = *source;
+	/* n_waiting_thds is not reported (modified w/o locking). */
+	atomic_store_u32(&dst->n_waiting_thds, 0, ATOMIC_RELAXED);
+}
+
+/* Copy the prof data from mutex for processing. */
+static inline void
+malloc_mutex_prof_read(tsdn_t *tsdn, mutex_prof_data_t *data,
+    malloc_mutex_t *mutex) {
+	/* Can only read holding the mutex. */
+	malloc_mutex_assert_owner(tsdn, mutex);
+	malloc_mutex_prof_copy(data, &mutex->prof_data);
+}
+
+static inline void
+malloc_mutex_prof_accum(tsdn_t *tsdn, mutex_prof_data_t *data,
+    malloc_mutex_t *mutex) {
+	mutex_prof_data_t *source = &mutex->prof_data;
+	/* Can only read holding the mutex. */
+	malloc_mutex_assert_owner(tsdn, mutex);
+
+	nstime_add(&data->tot_wait_time, &source->tot_wait_time);
+	if (nstime_compare(&source->max_wait_time, &data->max_wait_time) > 0) {
+		nstime_copy(&data->max_wait_time, &source->max_wait_time);
+	}
+	data->n_wait_times += source->n_wait_times;
+	data->n_spin_acquired += source->n_spin_acquired;
+	if (data->max_n_thds < source->max_n_thds) {
+		data->max_n_thds = source->max_n_thds;
+	}
+	/* n_waiting_thds is not reported. */
+	atomic_store_u32(&data->n_waiting_thds, 0, ATOMIC_RELAXED);
+	data->n_owner_switches += source->n_owner_switches;
+	data->n_lock_ops += source->n_lock_ops;
+}
+
+/* Compare the prof data and update to the maximum. */
+static inline void
+malloc_mutex_prof_max_update(tsdn_t *tsdn, mutex_prof_data_t *data,
+    malloc_mutex_t *mutex) {
+	mutex_prof_data_t *source = &mutex->prof_data;
+	/* Can only read holding the mutex. */
+	malloc_mutex_assert_owner(tsdn, mutex);
+
+	if (nstime_compare(&source->tot_wait_time, &data->tot_wait_time) > 0) {
+		nstime_copy(&data->tot_wait_time, &source->tot_wait_time);
+	}
+	if (nstime_compare(&source->max_wait_time, &data->max_wait_time) > 0) {
+		nstime_copy(&data->max_wait_time, &source->max_wait_time);
+	}
+	if (source->n_wait_times > data->n_wait_times) {
+		data->n_wait_times = source->n_wait_times;
+	}
+	if (source->n_spin_acquired > data->n_spin_acquired) {
+		data->n_spin_acquired = source->n_spin_acquired;
+	}
+	if (source->max_n_thds > data->max_n_thds) {
+		data->max_n_thds = source->max_n_thds;
+	}
+	if (source->n_owner_switches > data->n_owner_switches) {
+		data->n_owner_switches = source->n_owner_switches;
+	}
+	if (source->n_lock_ops > data->n_lock_ops) {
+		data->n_lock_ops = source->n_lock_ops;
+	}
+	/* n_waiting_thds is not reported. */
+}
+
+#endif /* JEMALLOC_INTERNAL_MUTEX_H */
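
Usage sketch, for illustration only: the usual init/lock/unlock lifecycle of a malloc_mutex_t. The mutex, its witness name, and the wrapper functions are placeholders; the witness rank is omitted here as in the initializers above.

```c
/* Hypothetical module-local lock following the usual jemalloc pattern. */
static malloc_mutex_t example_mtx;

static bool
example_boot(void) {
	/* Returns true on error, like most jemalloc init functions. */
	return malloc_mutex_init(&example_mtx, "example", WITNESS_RANK_OMIT,
	    malloc_mutex_rank_exclusive);
}

static void
example_do_locked(tsdn_t *tsdn) {
	malloc_mutex_lock(tsdn, &example_mtx);
	/* ... critical section ... */
	malloc_mutex_assert_owner(tsdn, &example_mtx);
	malloc_mutex_unlock(tsdn, &example_mtx);
}
```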

+ 117 - 0
BeefRT/JEMalloc/include/jemalloc/internal/mutex_prof.h

@@ -0,0 +1,117 @@
+#ifndef JEMALLOC_INTERNAL_MUTEX_PROF_H
+#define JEMALLOC_INTERNAL_MUTEX_PROF_H
+
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/nstime.h"
+#include "jemalloc/internal/tsd_types.h"
+
+#define MUTEX_PROF_GLOBAL_MUTEXES					\
+    OP(background_thread)						\
+    OP(max_per_bg_thd)							\
+    OP(ctl)								\
+    OP(prof)								\
+    OP(prof_thds_data)							\
+    OP(prof_dump)							\
+    OP(prof_recent_alloc)						\
+    OP(prof_recent_dump)						\
+    OP(prof_stats)
+
+typedef enum {
+#define OP(mtx) global_prof_mutex_##mtx,
+	MUTEX_PROF_GLOBAL_MUTEXES
+#undef OP
+	mutex_prof_num_global_mutexes
+} mutex_prof_global_ind_t;
+
+#define MUTEX_PROF_ARENA_MUTEXES					\
+    OP(large)								\
+    OP(extent_avail)							\
+    OP(extents_dirty)							\
+    OP(extents_muzzy)							\
+    OP(extents_retained)						\
+    OP(decay_dirty)							\
+    OP(decay_muzzy)							\
+    OP(base)								\
+    OP(tcache_list)							\
+    OP(hpa_shard)							\
+    OP(hpa_shard_grow)							\
+    OP(hpa_sec)
+
+typedef enum {
+#define OP(mtx) arena_prof_mutex_##mtx,
+	MUTEX_PROF_ARENA_MUTEXES
+#undef OP
+	mutex_prof_num_arena_mutexes
+} mutex_prof_arena_ind_t;
+
+/*
+ * The fourth parameter is a boolean value that is true for derived rate counters
+ * and false for real ones.
+ */
+#define MUTEX_PROF_UINT64_COUNTERS					\
+    OP(num_ops, uint64_t, "n_lock_ops", false, num_ops)					\
+    OP(num_ops_ps, uint64_t, "(#/sec)", true, num_ops)				\
+    OP(num_wait, uint64_t, "n_waiting", false, num_wait)				\
+    OP(num_wait_ps, uint64_t, "(#/sec)", true, num_wait)				\
+    OP(num_spin_acq, uint64_t, "n_spin_acq", false, num_spin_acq)			\
+    OP(num_spin_acq_ps, uint64_t, "(#/sec)", true, num_spin_acq)			\
+    OP(num_owner_switch, uint64_t, "n_owner_switch", false, num_owner_switch)		\
+    OP(num_owner_switch_ps, uint64_t, "(#/sec)", true, num_owner_switch)	\
+    OP(total_wait_time, uint64_t, "total_wait_ns", false, total_wait_time)		\
+    OP(total_wait_time_ps, uint64_t, "(#/sec)", true, total_wait_time)		\
+    OP(max_wait_time, uint64_t, "max_wait_ns", false, max_wait_time)
+
+#define MUTEX_PROF_UINT32_COUNTERS					\
+    OP(max_num_thds, uint32_t, "max_n_thds", false, max_num_thds)
+
+#define MUTEX_PROF_COUNTERS						\
+		MUTEX_PROF_UINT64_COUNTERS				\
+		MUTEX_PROF_UINT32_COUNTERS
+
+#define OP(counter, type, human, derived, base_counter) mutex_counter_##counter,
+
+#define COUNTER_ENUM(counter_list, t)					\
+		typedef enum {						\
+			counter_list					\
+			mutex_prof_num_##t##_counters			\
+		} mutex_prof_##t##_counter_ind_t;
+
+COUNTER_ENUM(MUTEX_PROF_UINT64_COUNTERS, uint64_t)
+COUNTER_ENUM(MUTEX_PROF_UINT32_COUNTERS, uint32_t)
+
+#undef COUNTER_ENUM
+#undef OP
+
+typedef struct {
+	/*
+	 * Counters touched on the slow path, i.e. when there is lock
+	 * contention.  We update them once we have the lock.
+	 */
+	/* Total time (in nanoseconds) spent waiting on this mutex. */
+	nstime_t		tot_wait_time;
+	/* Max time (in nanoseconds) spent on a single lock operation. */
+	nstime_t		max_wait_time;
+	/* # of times have to wait for this mutex (after spinning). */
+	uint64_t		n_wait_times;
+	/* # of times acquired the mutex through local spinning. */
+	uint64_t		n_spin_acquired;
+	/* Max # of threads waiting for the mutex at the same time. */
+	uint32_t		max_n_thds;
+	/* Current # of threads waiting on the lock.  Atomic synced. */
+	atomic_u32_t		n_waiting_thds;
+
+	/*
+	 * Data touched on the fast path.  These are modified right after we
+	 * grab the lock, so they're placed closest to the end (i.e. right before
+	 * the lock) so that we have a higher chance of them being on the same
+	 * cacheline.
+	 */
+	/* # of times the mutex holder is different than the previous one. */
+	uint64_t		n_owner_switches;
+	/* Previous mutex holder, to facilitate n_owner_switches. */
+	tsdn_t			*prev_owner;
+	/* # of lock() operations in total. */
+	uint64_t		n_lock_ops;
+} mutex_prof_data_t;
+
+#endif /* JEMALLOC_INTERNAL_MUTEX_PROF_H */
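
The OP lists above are X-macros. As an illustration (not code from this commit), they can be expanded into a table of human-readable mutex names, mirroring the pattern the stats/ctl code uses:

```c
/* Names for each arena mutex, indexed by mutex_prof_arena_ind_t. */
static const char *arena_mutex_names[mutex_prof_num_arena_mutexes] = {
#define OP(mtx) #mtx,
	MUTEX_PROF_ARENA_MUTEXES
#undef OP
};
```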

+ 73 - 0
BeefRT/JEMalloc/include/jemalloc/internal/nstime.h

@@ -0,0 +1,73 @@
+#ifndef JEMALLOC_INTERNAL_NSTIME_H
+#define JEMALLOC_INTERNAL_NSTIME_H
+
+/* Maximum supported number of seconds (~584 years). */
+#define NSTIME_SEC_MAX KQU(18446744072)
+
+#define NSTIME_MAGIC ((uint32_t)0xb8a9ce37)
+#ifdef JEMALLOC_DEBUG
+#  define NSTIME_ZERO_INITIALIZER {0, NSTIME_MAGIC}
+#else
+#  define NSTIME_ZERO_INITIALIZER {0}
+#endif
+
+typedef struct {
+	uint64_t ns;
+#ifdef JEMALLOC_DEBUG
+	uint32_t magic; /* Tracks if initialized. */
+#endif
+} nstime_t;
+
+static const nstime_t nstime_zero = NSTIME_ZERO_INITIALIZER;
+
+void nstime_init(nstime_t *time, uint64_t ns);
+void nstime_init2(nstime_t *time, uint64_t sec, uint64_t nsec);
+uint64_t nstime_ns(const nstime_t *time);
+uint64_t nstime_sec(const nstime_t *time);
+uint64_t nstime_msec(const nstime_t *time);
+uint64_t nstime_nsec(const nstime_t *time);
+void nstime_copy(nstime_t *time, const nstime_t *source);
+int nstime_compare(const nstime_t *a, const nstime_t *b);
+void nstime_add(nstime_t *time, const nstime_t *addend);
+void nstime_iadd(nstime_t *time, uint64_t addend);
+void nstime_subtract(nstime_t *time, const nstime_t *subtrahend);
+void nstime_isubtract(nstime_t *time, uint64_t subtrahend);
+void nstime_imultiply(nstime_t *time, uint64_t multiplier);
+void nstime_idivide(nstime_t *time, uint64_t divisor);
+uint64_t nstime_divide(const nstime_t *time, const nstime_t *divisor);
+uint64_t nstime_ns_since(const nstime_t *past);
+
+typedef bool (nstime_monotonic_t)(void);
+extern nstime_monotonic_t *JET_MUTABLE nstime_monotonic;
+
+typedef void (nstime_update_t)(nstime_t *);
+extern nstime_update_t *JET_MUTABLE nstime_update;
+
+typedef void (nstime_prof_update_t)(nstime_t *);
+extern nstime_prof_update_t *JET_MUTABLE nstime_prof_update;
+
+void nstime_init_update(nstime_t *time);
+void nstime_prof_init_update(nstime_t *time);
+
+enum prof_time_res_e {
+	prof_time_res_default = 0,
+	prof_time_res_high = 1
+};
+typedef enum prof_time_res_e prof_time_res_t;
+
+extern prof_time_res_t opt_prof_time_res;
+extern const char *prof_time_res_mode_names[];
+
+JEMALLOC_ALWAYS_INLINE void
+nstime_init_zero(nstime_t *time) {
+	nstime_copy(time, &nstime_zero);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+nstime_equals_zero(nstime_t *time) {
+	int diff = nstime_compare(time, &nstime_zero);
+	assert(diff >= 0);
+	return diff == 0;
+}
+
+#endif /* JEMALLOC_INTERNAL_NSTIME_H */
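
Usage sketch, for illustration only: the intended read-then-elapsed pattern. The time_region helper is hypothetical; only calls declared above are used.

```c
/* Measure the wall-clock cost of fn() in nanoseconds. */
static uint64_t
time_region(void (*fn)(void)) {
	nstime_t start;
	nstime_init_update(&start);	/* initialize and read the clock */
	fn();
	return nstime_ns_since(&start);	/* elapsed nanoseconds since start */
}
```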

+ 243 - 0
BeefRT/JEMalloc/include/jemalloc/internal/pa.h

@@ -0,0 +1,243 @@
+#ifndef JEMALLOC_INTERNAL_PA_H
+#define JEMALLOC_INTERNAL_PA_H
+
+#include "jemalloc/internal/base.h"
+#include "jemalloc/internal/decay.h"
+#include "jemalloc/internal/ecache.h"
+#include "jemalloc/internal/edata_cache.h"
+#include "jemalloc/internal/emap.h"
+#include "jemalloc/internal/hpa.h"
+#include "jemalloc/internal/lockedint.h"
+#include "jemalloc/internal/pac.h"
+#include "jemalloc/internal/pai.h"
+#include "jemalloc/internal/sec.h"
+
+/*
+ * The page allocator; responsible for acquiring pages of memory for
+ * allocations.  It picks the implementation of the page allocator interface
+ * (i.e. a pai_t) to handle a given page-level allocation request.  For now, the
+ * only such implementation is the PAC code ("page allocator classic"), but
+ * others will be coming soon.
+ */
+
+typedef struct pa_central_s pa_central_t;
+struct pa_central_s {
+	hpa_central_t hpa;
+};
+
+/*
+ * The stats for a particular pa_shard.  Because of the way the ctl module
+ * handles stats epoch data collection (it has its own arena_stats, and merges
+ * the stats from each arena into it), this needs to live in the arena_stats_t;
+ * hence we define it here and let the pa_shard have a pointer (rather than the
+ * more natural approach of just embedding it in the pa_shard itself).
+ *
+ * We follow the arena_stats_t approach of marking the derived fields.  These
+ * are the ones that are not maintained on their own; instead, their values are
+ * derived during those stats merges.
+ */
+typedef struct pa_shard_stats_s pa_shard_stats_t;
+struct pa_shard_stats_s {
+	/* Number of edata_t structs allocated by base, but not being used. */
+	size_t edata_avail; /* Derived. */
+	/*
+	 * Stats specific to the PAC.  For now, these are the only stats that
+	 * exist, but there will eventually be other page allocators.  Things
+	 * like edata_avail make sense in a cross-PA sense, but things like
+	 * npurges don't.
+	 */
+	pac_stats_t pac_stats;
+};
+
+/*
+ * The local allocator handle.  Keeps the state necessary to satisfy page-sized
+ * allocations.
+ *
+ * The contents are mostly internal to the PA module.  The key exception is that
+ * arena decay code is allowed to grab pointers to the dirty and muzzy ecaches
+ * decay_ts, for a couple of queries, passing them back to a PA function, or
+ * acquiring decay.mtx and looking at decay.purging.  The reasoning is that,
+ * while PA decides what and how to purge, the arena code decides when and where
+ * (e.g. on what thread).  It's allowed to use the presence of another purger to
+ * decide.
+ * (The background thread code also touches some other decay internals, but
+ * that's not fundamental; it's just an artifact of a partial refactoring, and
+ * its accesses could be straightforwardly moved inside the decay module).
+ */
+typedef struct pa_shard_s pa_shard_t;
+struct pa_shard_s {
+	/* The central PA this shard is associated with. */
+	pa_central_t *central;
+
+	/*
+	 * Number of pages in active extents.
+	 *
+	 * Synchronization: atomic.
+	 */
+	atomic_zu_t nactive;
+
+	/*
+	 * Whether or not we should prefer the hugepage allocator.  Atomic since
+	 * it may be concurrently modified by a thread setting extent hooks.
+	 * Note that we still may do HPA operations in this arena; if use_hpa is
+	 * changed from true to false, we'll free back to the hugepage allocator
+	 * for those allocations.
+	 */
+	atomic_b_t use_hpa;
+
+	/*
+	 * If we never used the HPA to begin with, it wasn't initialized, and so
+	 * we shouldn't try to e.g. acquire its mutexes during fork.  This
+	 * tracks that knowledge.
+	 */
+	bool ever_used_hpa;
+
+	/* Allocates from a PAC. */
+	pac_t pac;
+
+	/*
+	 * We place a small extent cache in front of the HPA, since we intend
+	 * these configurations to use many fewer arenas, and therefore have a
+	 * higher risk of hot locks.
+	 */
+	sec_t hpa_sec;
+	hpa_shard_t hpa_shard;
+
+	/* The source of edata_t objects. */
+	edata_cache_t edata_cache;
+
+	unsigned ind;
+
+	malloc_mutex_t *stats_mtx;
+	pa_shard_stats_t *stats;
+
+	/* The emap this shard is tied to. */
+	emap_t *emap;
+
+	/* The base from which we get the ehooks and allocate metadata. */
+	base_t *base;
+};
+
+static inline bool
+pa_shard_dont_decay_muzzy(pa_shard_t *shard) {
+	return ecache_npages_get(&shard->pac.ecache_muzzy) == 0 &&
+	    pac_decay_ms_get(&shard->pac, extent_state_muzzy) <= 0;
+}
+
+static inline ehooks_t *
+pa_shard_ehooks_get(pa_shard_t *shard) {
+	return base_ehooks_get(shard->base);
+}
+
+/* Returns true on error. */
+bool pa_central_init(pa_central_t *central, base_t *base, bool hpa,
+    hpa_hooks_t *hpa_hooks);
+
+/* Returns true on error. */
+bool pa_shard_init(tsdn_t *tsdn, pa_shard_t *shard, pa_central_t *central,
+    emap_t *emap, base_t *base, unsigned ind, pa_shard_stats_t *stats,
+    malloc_mutex_t *stats_mtx, nstime_t *cur_time, size_t oversize_threshold,
+    ssize_t dirty_decay_ms, ssize_t muzzy_decay_ms);
+
+/*
+ * This isn't exposed to users; we allow late enablement of the HPA shard so
+ * that we can boot without worrying about the HPA, then turn it on in a0.
+ */
+bool pa_shard_enable_hpa(tsdn_t *tsdn, pa_shard_t *shard,
+    const hpa_shard_opts_t *hpa_opts, const sec_opts_t *hpa_sec_opts);
+
+/*
+ * We stop using the HPA when custom extent hooks are installed, but still
+ * redirect deallocations to it.
+ */
+void pa_shard_disable_hpa(tsdn_t *tsdn, pa_shard_t *shard);
+
+/*
+ * This does the PA-specific parts of arena reset (i.e. freeing all active
+ * allocations).
+ */
+void pa_shard_reset(tsdn_t *tsdn, pa_shard_t *shard);
+
+/*
+ * Destroy all the remaining retained extents.  Should only be called after
+ * decaying all active, dirty, and muzzy extents to the retained state, as the
+ * last step in destroying the shard.
+ */
+void pa_shard_destroy(tsdn_t *tsdn, pa_shard_t *shard);
+
+/* Gets an edata for the given allocation. */
+edata_t *pa_alloc(tsdn_t *tsdn, pa_shard_t *shard, size_t size,
+    size_t alignment, bool slab, szind_t szind, bool zero, bool guarded,
+    bool *deferred_work_generated);
+/* Returns true on error, in which case nothing changed. */
+bool pa_expand(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata, size_t old_size,
+    size_t new_size, szind_t szind, bool zero, bool *deferred_work_generated);
+/*
+ * The same.  Sets *deferred_work_generated to true if the shrink produced
+ * deferred work (e.g. new dirty pages that may later need purging).
+ */
+bool pa_shrink(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata, size_t old_size,
+    size_t new_size, szind_t szind, bool *deferred_work_generated);
+/*
+ * Frees the given edata back to the pa.  Sets *deferred_work_generated if we
+ * produced new dirty pages (currently we always set it, but this need not be
+ * the case).
+ * (We could make deferred_work_generated the return value of course, but this
+ * is more consistent with the shrink pathway and our error codes here).
+ */
+void pa_dalloc(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata,
+    bool *deferred_work_generated);
+bool pa_decay_ms_set(tsdn_t *tsdn, pa_shard_t *shard, extent_state_t state,
+    ssize_t decay_ms, pac_purge_eagerness_t eagerness);
+ssize_t pa_decay_ms_get(pa_shard_t *shard, extent_state_t state);
+
+/*
+ * Do deferred work on this PA shard.
+ *
+ * Morally, this should do both PAC decay and the HPA deferred work.  For now,
+ * though, the arena, background thread, and PAC modules are tightly interwoven
+ * in a way that's tricky to extricate, so we only do the HPA-specific parts.
+ */
+void pa_shard_set_deferral_allowed(tsdn_t *tsdn, pa_shard_t *shard,
+    bool deferral_allowed);
+void pa_shard_do_deferred_work(tsdn_t *tsdn, pa_shard_t *shard);
+void pa_shard_try_deferred_work(tsdn_t *tsdn, pa_shard_t *shard);
+uint64_t pa_shard_time_until_deferred_work(tsdn_t *tsdn, pa_shard_t *shard);
+
+/******************************************************************************/
+/*
+ * Various bits of "boring" functionality that are still part of this module,
+ * but that we relegate to pa_extra.c, to keep the core logic in pa.c as
+ * readable as possible.
+ */
+
+/*
+ * These fork phases are synchronized with the arena fork phase numbering to
+ * make it easy to keep straight. That's why there's no prefork1.
+ */
+void pa_shard_prefork0(tsdn_t *tsdn, pa_shard_t *shard);
+void pa_shard_prefork2(tsdn_t *tsdn, pa_shard_t *shard);
+void pa_shard_prefork3(tsdn_t *tsdn, pa_shard_t *shard);
+void pa_shard_prefork4(tsdn_t *tsdn, pa_shard_t *shard);
+void pa_shard_prefork5(tsdn_t *tsdn, pa_shard_t *shard);
+void pa_shard_postfork_parent(tsdn_t *tsdn, pa_shard_t *shard);
+void pa_shard_postfork_child(tsdn_t *tsdn, pa_shard_t *shard);
+
+void pa_shard_basic_stats_merge(pa_shard_t *shard, size_t *nactive,
+    size_t *ndirty, size_t *nmuzzy);
+
+void pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard,
+    pa_shard_stats_t *pa_shard_stats_out, pac_estats_t *estats_out,
+    hpa_shard_stats_t *hpa_stats_out, sec_stats_t *sec_stats_out,
+    size_t *resident);
+
+/*
+ * Reads the PA-owned mutex stats into the output stats array, at the
+ * appropriate positions.  Morally, these stats should really live in
+ * pa_shard_stats_t, but the indices are sort of baked into the various mutex
+ * prof macros.  This would be a good thing to do at some point.
+ */
+void pa_shard_mtx_stats_read(tsdn_t *tsdn, pa_shard_t *shard,
+    mutex_prof_data_t mutex_prof_data[mutex_prof_num_arena_mutexes]);
+
+#endif /* JEMALLOC_INTERNAL_PA_H */
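
Rough sketch, for illustration only, of the call shape arena code uses against this interface. The wrapper names and the particular argument values (PAGE alignment, szind, the zero/slab/guarded flags) are placeholders, not a prescription.

```c
/* Hypothetical wrappers showing the pa_alloc / pa_dalloc round trip. */
static edata_t *
shard_grab_pages(tsdn_t *tsdn, pa_shard_t *shard, size_t size, szind_t szind) {
	bool deferred_work_generated = false;
	return pa_alloc(tsdn, shard, size, /* alignment */ PAGE,
	    /* slab */ false, szind, /* zero */ false, /* guarded */ false,
	    &deferred_work_generated);	/* NULL on failure */
}

static void
shard_release_pages(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata) {
	bool deferred_work_generated = false;
	pa_dalloc(tsdn, shard, edata, &deferred_work_generated);
}
```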

+ 179 - 0
BeefRT/JEMalloc/include/jemalloc/internal/pac.h

@@ -0,0 +1,179 @@
+#ifndef JEMALLOC_INTERNAL_PAC_H
+#define JEMALLOC_INTERNAL_PAC_H
+
+#include "jemalloc/internal/exp_grow.h"
+#include "jemalloc/internal/pai.h"
+#include "san_bump.h"
+
+
+/*
+ * Page allocator classic; an implementation of the PAI interface that:
+ * - Can be used for arenas with custom extent hooks.
+ * - Can always satisfy any allocation request (including highly-fragmentary
+ *   ones).
+ * - Can use efficient OS-level zeroing primitives for demand-filled pages.
+ */
+
+/* How "eager" decay/purging should be. */
+enum pac_purge_eagerness_e {
+	PAC_PURGE_ALWAYS,
+	PAC_PURGE_NEVER,
+	PAC_PURGE_ON_EPOCH_ADVANCE
+};
+typedef enum pac_purge_eagerness_e pac_purge_eagerness_t;
+
+typedef struct pac_decay_stats_s pac_decay_stats_t;
+struct pac_decay_stats_s {
+	/* Total number of purge sweeps. */
+	locked_u64_t npurge;
+	/* Total number of madvise calls made. */
+	locked_u64_t nmadvise;
+	/* Total number of pages purged. */
+	locked_u64_t purged;
+};
+
+typedef struct pac_estats_s pac_estats_t;
+struct pac_estats_s {
+	/*
+	 * Stats for a given index in the range [0, SC_NPSIZES] in the various
+	 * ecache_ts.
+	 * We track both bytes and # of extents: two extents in the same bucket
+	 * may have different sizes if adjacent size classes differ by more than
+	 * a page, so bytes cannot always be derived from # of extents.
+	 */
+	size_t ndirty;
+	size_t dirty_bytes;
+	size_t nmuzzy;
+	size_t muzzy_bytes;
+	size_t nretained;
+	size_t retained_bytes;
+};
+
+typedef struct pac_stats_s pac_stats_t;
+struct pac_stats_s {
+	pac_decay_stats_t decay_dirty;
+	pac_decay_stats_t decay_muzzy;
+
+	/*
+	 * Number of unused virtual memory bytes currently retained.  Retained
+	 * bytes are technically mapped (though always decommitted or purged),
+	 * but they are excluded from the mapped statistic (above).
+	 */
+	size_t retained; /* Derived. */
+
+	/*
+	 * Number of bytes currently mapped, excluding retained memory (and any
+	 * base-allocated memory, which is tracked by the arena stats).
+	 *
+	 * We name this "pac_mapped" to avoid confusion with the arena_stats
+	 * "mapped".
+	 */
+	atomic_zu_t pac_mapped;
+
+	/* VM space had to be leaked (undocumented).  Normally 0. */
+	atomic_zu_t abandoned_vm;
+};
+
+typedef struct pac_s pac_t;
+struct pac_s {
+	/*
+	 * Must be the first member (we convert it to a PAC given only a
+	 * pointer).  The handle to the allocation interface.
+	 */
+	pai_t pai;
+	/*
+	 * Collections of extents that were previously allocated.  These are
+	 * used when allocating extents, in an attempt to re-use address space.
+	 *
+	 * Synchronization: internal.
+	 */
+	ecache_t ecache_dirty;
+	ecache_t ecache_muzzy;
+	ecache_t ecache_retained;
+
+	base_t *base;
+	emap_t *emap;
+	edata_cache_t *edata_cache;
+
+	/* The grow info for the retained ecache. */
+	exp_grow_t exp_grow;
+	malloc_mutex_t grow_mtx;
+
+	/* Special allocator for guarded frequently reused extents. */
+	san_bump_alloc_t sba;
+
+	/* How large extents should be before getting auto-purged. */
+	atomic_zu_t oversize_threshold;
+
+	/*
+	 * Decay-based purging state, responsible for scheduling extent state
+	 * transitions.
+	 *
+	 * Synchronization: via the internal mutex.
+	 */
+	decay_t decay_dirty; /* dirty --> muzzy */
+	decay_t decay_muzzy; /* muzzy --> retained */
+
+	malloc_mutex_t *stats_mtx;
+	pac_stats_t *stats;
+
+	/* Extent serial number generator state. */
+	atomic_zu_t extent_sn_next;
+};
+
+bool pac_init(tsdn_t *tsdn, pac_t *pac, base_t *base, emap_t *emap,
+    edata_cache_t *edata_cache, nstime_t *cur_time, size_t oversize_threshold,
+    ssize_t dirty_decay_ms, ssize_t muzzy_decay_ms, pac_stats_t *pac_stats,
+    malloc_mutex_t *stats_mtx);
+
+static inline size_t
+pac_mapped(pac_t *pac) {
+	return atomic_load_zu(&pac->stats->pac_mapped, ATOMIC_RELAXED);
+}
+
+static inline ehooks_t *
+pac_ehooks_get(pac_t *pac) {
+	return base_ehooks_get(pac->base);
+}
+
+/*
+ * All purging functions require holding decay->mtx.  This is one of the few
+ * places external modules are allowed to peek inside pa_shard_t internals.
+ */
+
+/*
+ * Decays the number of pages currently in the ecache.  This might not leave the
+ * ecache empty if other threads are inserting dirty objects into it
+ * concurrently with the call.
+ */
+void pac_decay_all(tsdn_t *tsdn, pac_t *pac, decay_t *decay,
+    pac_decay_stats_t *decay_stats, ecache_t *ecache, bool fully_decay);
+/*
+ * Updates decay settings for the current time, and conditionally purges in
+ * response (depending on decay_purge_setting).  Returns whether or not the
+ * epoch advanced.
+ */
+bool pac_maybe_decay_purge(tsdn_t *tsdn, pac_t *pac, decay_t *decay,
+    pac_decay_stats_t *decay_stats, ecache_t *ecache,
+    pac_purge_eagerness_t eagerness);
+
+/*
+ * Gets / sets the maximum amount that we'll grow an arena down the
+ * grow-retained pathways (unless forced to by an allocation request).
+ *
+ * Set new_limit to NULL if it's just a query, or old_limit to NULL if you don't
+ * care about the previous value.
+ *
+ * Returns true on error (if the new limit is not valid).
+ */
+bool pac_retain_grow_limit_get_set(tsdn_t *tsdn, pac_t *pac, size_t *old_limit,
+    size_t *new_limit);
+
+bool pac_decay_ms_set(tsdn_t *tsdn, pac_t *pac, extent_state_t state,
+    ssize_t decay_ms, pac_purge_eagerness_t eagerness);
+ssize_t pac_decay_ms_get(pac_t *pac, extent_state_t state);
+
+void pac_reset(tsdn_t *tsdn, pac_t *pac);
+void pac_destroy(tsdn_t *tsdn, pac_t *pac);
+
+#endif /* JEMALLOC_INTERNAL_PAC_H */
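
Usage sketch, for illustration only: adjusting the dirty-decay timeout on a pac_t. The wrapper name is invented, and extent_state_dirty is assumed to come from the extent-state enum defined elsewhere in these headers.

```c
/* Set dirty decay to decay_ms milliseconds; returns true on error. */
static bool
example_set_dirty_decay(tsdn_t *tsdn, pac_t *pac, ssize_t decay_ms) {
	return pac_decay_ms_set(tsdn, pac, extent_state_dirty, decay_ms,
	    PAC_PURGE_ON_EPOCH_ADVANCE);
}
```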

+ 119 - 0
BeefRT/JEMalloc/include/jemalloc/internal/pages.h

@@ -0,0 +1,119 @@
+#ifndef JEMALLOC_INTERNAL_PAGES_EXTERNS_H
+#define JEMALLOC_INTERNAL_PAGES_EXTERNS_H
+
+/* Page size.  LG_PAGE is determined by the configure script. */
+#ifdef PAGE_MASK
+#  undef PAGE_MASK
+#endif
+#define PAGE		((size_t)(1U << LG_PAGE))
+#define PAGE_MASK	((size_t)(PAGE - 1))
+/* Return the page base address for the page containing address a. */
+#define PAGE_ADDR2BASE(a)						\
+	((void *)((uintptr_t)(a) & ~PAGE_MASK))
+/* Return the smallest pagesize multiple that is >= s. */
+#define PAGE_CEILING(s)							\
+	(((s) + PAGE_MASK) & ~PAGE_MASK)
+/* Return the largest pagesize multiple that is <= s. */
+#define PAGE_FLOOR(s) 							\
+	((s) & ~PAGE_MASK)
+
+/* Huge page size.  LG_HUGEPAGE is determined by the configure script. */
+#define HUGEPAGE	((size_t)(1U << LG_HUGEPAGE))
+#define HUGEPAGE_MASK	((size_t)(HUGEPAGE - 1))
+
+#if LG_HUGEPAGE != 0
+#  define HUGEPAGE_PAGES (HUGEPAGE / PAGE)
+#else
+/*
+ * It's convenient to define arrays (or bitmaps) of HUGEPAGE_PAGES lengths.  If
+ * we can't autodetect the hugepage size, it gets treated as 0, in which case
+ * we'll trigger a compiler error in those arrays.  Avoid this case by ensuring
+ * that this value is at least 1.  (We won't ever run in this degraded state;
+ * hpa_supported() returns false in this case.)
+ */
+#  define HUGEPAGE_PAGES 1
+#endif
+
+/* Return the huge page base address for the huge page containing address a. */
+#define HUGEPAGE_ADDR2BASE(a)						\
+	((void *)((uintptr_t)(a) & ~HUGEPAGE_MASK))
+/* Return the smallest pagesize multiple that is >= s. */
+#define HUGEPAGE_CEILING(s)						\
+	(((s) + HUGEPAGE_MASK) & ~HUGEPAGE_MASK)
+
+/* PAGES_CAN_PURGE_LAZY is defined if lazy purging is supported. */
+#if defined(_WIN32) || defined(JEMALLOC_PURGE_MADVISE_FREE)
+#  define PAGES_CAN_PURGE_LAZY
+#endif
+/*
+ * PAGES_CAN_PURGE_FORCED is defined if forced purging is supported.
+ *
+ * The only supported way to hard-purge on Windows is to decommit and then
+ * re-commit, but doing so is racy, and if re-commit fails it's a pain to
+ * propagate the "poisoned" memory state.  Since we typically decommit as the
+ * next step after purging on Windows anyway, there's no point in adding such
+ * complexity.
+ */
+#if !defined(_WIN32) && ((defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \
+    defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)) || \
+    defined(JEMALLOC_MAPS_COALESCE))
+#  define PAGES_CAN_PURGE_FORCED
+#endif
+
+static const bool pages_can_purge_lazy =
+#ifdef PAGES_CAN_PURGE_LAZY
+    true
+#else
+    false
+#endif
+    ;
+static const bool pages_can_purge_forced =
+#ifdef PAGES_CAN_PURGE_FORCED
+    true
+#else
+    false
+#endif
+    ;
+
+#if defined(JEMALLOC_HAVE_MADVISE_HUGE) || defined(JEMALLOC_HAVE_MEMCNTL)
+#  define PAGES_CAN_HUGIFY
+#endif
+
+static const bool pages_can_hugify =
+#ifdef PAGES_CAN_HUGIFY
+    true
+#else
+    false
+#endif
+    ;
+
+typedef enum {
+	thp_mode_default       = 0, /* Do not change hugepage settings. */
+	thp_mode_always        = 1, /* Always set MADV_HUGEPAGE. */
+	thp_mode_never         = 2, /* Always set MADV_NOHUGEPAGE. */
+
+	thp_mode_names_limit   = 3, /* Used for option processing. */
+	thp_mode_not_supported = 3  /* No THP support detected. */
+} thp_mode_t;
+
+#define THP_MODE_DEFAULT thp_mode_default
+extern thp_mode_t opt_thp;
+extern thp_mode_t init_system_thp_mode; /* Initial system wide state. */
+extern const char *thp_mode_names[];
+
+void *pages_map(void *addr, size_t size, size_t alignment, bool *commit);
+void pages_unmap(void *addr, size_t size);
+bool pages_commit(void *addr, size_t size);
+bool pages_decommit(void *addr, size_t size);
+bool pages_purge_lazy(void *addr, size_t size);
+bool pages_purge_forced(void *addr, size_t size);
+bool pages_huge(void *addr, size_t size);
+bool pages_nohuge(void *addr, size_t size);
+bool pages_dontdump(void *addr, size_t size);
+bool pages_dodump(void *addr, size_t size);
+bool pages_boot(void);
+void pages_set_thp_state(void *ptr, size_t size);
+void pages_mark_guards(void *head, void *tail);
+void pages_unmark_guards(void *head, void *tail);
+
+#endif /* JEMALLOC_INTERNAL_PAGES_EXTERNS_H */
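
The rounding macros are simple mask operations; a quick illustration (the demo function is hypothetical, and the commented values assume LG_PAGE == 12, i.e. 4 KiB pages):

```c
static void
page_macro_demo(void *addr) {
	void *base = PAGE_ADDR2BASE(addr);	/* addr rounded down to its page */
	size_t up = PAGE_CEILING(5000);		/* 8192 with 4 KiB pages */
	size_t down = PAGE_FLOOR(5000);		/* 4096 with 4 KiB pages */
	malloc_printf("%p -> %p, %zu %zu\n", addr, base, up, down);
}
```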

+ 95 - 0
BeefRT/JEMalloc/include/jemalloc/internal/pai.h

@@ -0,0 +1,95 @@
+#ifndef JEMALLOC_INTERNAL_PAI_H
+#define JEMALLOC_INTERNAL_PAI_H
+
+/* An interface for page allocation. */
+
+typedef struct pai_s pai_t;
+struct pai_s {
+	/* Returns NULL on failure. */
+	edata_t *(*alloc)(tsdn_t *tsdn, pai_t *self, size_t size,
+	    size_t alignment, bool zero, bool guarded, bool frequent_reuse,
+	    bool *deferred_work_generated);
+	/*
+	 * Returns the number of extents added to the list (which may be fewer
+	 * than requested, in case of OOM).  The list should already be
+	 * initialized.  The only alignment guarantee is page-alignment, and
+	 * the results are not necessarily zeroed.
+	 */
+	size_t (*alloc_batch)(tsdn_t *tsdn, pai_t *self, size_t size,
+	    size_t nallocs, edata_list_active_t *results,
+	    bool *deferred_work_generated);
+	bool (*expand)(tsdn_t *tsdn, pai_t *self, edata_t *edata,
+	    size_t old_size, size_t new_size, bool zero,
+	    bool *deferred_work_generated);
+	bool (*shrink)(tsdn_t *tsdn, pai_t *self, edata_t *edata,
+	    size_t old_size, size_t new_size, bool *deferred_work_generated);
+	void (*dalloc)(tsdn_t *tsdn, pai_t *self, edata_t *edata,
+	    bool *deferred_work_generated);
+	/* This function empties out list as a side-effect of being called. */
+	void (*dalloc_batch)(tsdn_t *tsdn, pai_t *self,
+	    edata_list_active_t *list, bool *deferred_work_generated);
+	uint64_t (*time_until_deferred_work)(tsdn_t *tsdn, pai_t *self);
+};
+
+/*
+ * These are just simple convenience functions to avoid having to reference the
+ * same pai_t twice on every invocation.
+ */
+
+static inline edata_t *
+pai_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment,
+    bool zero, bool guarded, bool frequent_reuse,
+    bool *deferred_work_generated) {
+	return self->alloc(tsdn, self, size, alignment, zero, guarded,
+	    frequent_reuse, deferred_work_generated);
+}
+
+static inline size_t
+pai_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs,
+    edata_list_active_t *results, bool *deferred_work_generated) {
+	return self->alloc_batch(tsdn, self, size, nallocs, results,
+	    deferred_work_generated);
+}
+
+static inline bool
+pai_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
+    size_t new_size, bool zero, bool *deferred_work_generated) {
+	return self->expand(tsdn, self, edata, old_size, new_size, zero,
+	    deferred_work_generated);
+}
+
+static inline bool
+pai_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
+    size_t new_size, bool *deferred_work_generated) {
+	return self->shrink(tsdn, self, edata, old_size, new_size,
+	    deferred_work_generated);
+}
+
+static inline void
+pai_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata,
+    bool *deferred_work_generated) {
+	self->dalloc(tsdn, self, edata, deferred_work_generated);
+}
+
+static inline void
+pai_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list,
+    bool *deferred_work_generated) {
+	self->dalloc_batch(tsdn, self, list, deferred_work_generated);
+}
+
+static inline uint64_t
+pai_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) {
+	return self->time_until_deferred_work(tsdn, self);
+}
+
+/*
+ * An implementation of batch allocation that simply calls alloc once for
+ * each item in the list.
+ */
+size_t pai_alloc_batch_default(tsdn_t *tsdn, pai_t *self, size_t size,
+    size_t nallocs, edata_list_active_t *results, bool *deferred_work_generated);
+/* Ditto, for dalloc. */
+void pai_dalloc_batch_default(tsdn_t *tsdn, pai_t *self,
+    edata_list_active_t *list, bool *deferred_work_generated);
+
+#endif /* JEMALLOC_INTERNAL_PAI_H */
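
Skeleton sketch, for illustration only, of how a new backend plugs into this interface. The toy_pai_t type and its functions are invented; the point is the vtable wiring and the reuse of the default batch helpers, which simply loop over the single-item hooks.

```c
/* Hypothetical pai_t backend; only the vtable wiring is shown. */
typedef struct toy_pai_s {
	pai_t	pai;		/* interface handle, embedded first */
	/* ... backend-specific state ... */
} toy_pai_t;

static edata_t *
toy_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero,
    bool guarded, bool frequent_reuse, bool *deferred_work_generated) {
	toy_pai_t *toy = (toy_pai_t *)self;
	(void)toy;
	/* ... obtain pages and return an edata_t, or NULL on failure ... */
	return NULL;
}

static void
toy_pai_init(toy_pai_t *toy) {
	toy->pai.alloc = toy_alloc;
	toy->pai.alloc_batch = pai_alloc_batch_default;
	toy->pai.dalloc_batch = pai_dalloc_batch_default;
	/* expand, shrink, dalloc, and time_until_deferred_work would be
	 * wired up the same way. */
}
```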

+ 37 - 0
BeefRT/JEMalloc/include/jemalloc/internal/peak.h

@@ -0,0 +1,37 @@
+#ifndef JEMALLOC_INTERNAL_PEAK_H
+#define JEMALLOC_INTERNAL_PEAK_H
+
+typedef struct peak_s peak_t;
+struct peak_s {
+	/* The highest recorded peak value, after adjustment (see below). */
+	uint64_t cur_max;
+	/*
+	 * The difference between alloc and dalloc at the last set_zero call;
+	 * this lets us cancel out the appropriate amount of excess.
+	 */
+	uint64_t adjustment;
+};
+
+#define PEAK_INITIALIZER {0, 0}
+
+static inline uint64_t
+peak_max(peak_t *peak) {
+	return peak->cur_max;
+}
+
+static inline void
+peak_update(peak_t *peak, uint64_t alloc, uint64_t dalloc) {
+	int64_t candidate_max = (int64_t)(alloc - dalloc - peak->adjustment);
+	if (candidate_max > (int64_t)peak->cur_max) {
+		peak->cur_max = candidate_max;
+	}
+}
+
+/* Resets the counter to zero; all peaks are now relative to this point. */
+static inline void
+peak_set_zero(peak_t *peak, uint64_t alloc, uint64_t dalloc) {
+	peak->cur_max = 0;
+	peak->adjustment = alloc - dalloc;
+}
+
+#endif /* JEMALLOC_INTERNAL_PEAK_H */
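
Concretely, peak_update() tracks the maximum of alloc - dalloc - adjustment, and peak_set_zero() rebases that maximum by folding the current net allocation into adjustment. A short usage sketch with made-up byte counts, assuming the definitions above are in scope:

#include <stdint.h>

static uint64_t
peak_demo(void) {
	peak_t peak = PEAK_INITIALIZER;
	uint64_t alloc = 0, dalloc = 0;

	alloc += 1000;                       /* net live bytes: 1000 */
	peak_update(&peak, alloc, dalloc);   /* peak_max() == 1000 */
	dalloc += 400;                       /* net live bytes: 600 */
	peak_update(&peak, alloc, dalloc);   /* peak_max() still 1000 */

	peak_set_zero(&peak, alloc, dalloc); /* adjustment = 600, cur_max = 0 */
	alloc += 100;                        /* net live 700; growth since zero: 100 */
	peak_update(&peak, alloc, dalloc);   /* peak_max() == 100 */

	return peak_max(&peak);
}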

+ 24 - 0
BeefRT/JEMalloc/include/jemalloc/internal/peak_event.h

@@ -0,0 +1,24 @@
+#ifndef JEMALLOC_INTERNAL_PEAK_EVENT_H
+#define JEMALLOC_INTERNAL_PEAK_EVENT_H
+
+/*
+ * While peak.h contains the simple helper struct that tracks state, this
+ * contains the allocator tie-ins (and knows about tsd, the event module, etc.).
+ */
+
+/* Update the peak with current tsd state. */
+void peak_event_update(tsd_t *tsd);
+/* Set current state to zero. */
+void peak_event_zero(tsd_t *tsd);
+uint64_t peak_event_max(tsd_t *tsd);
+
+/* Manual hooks. */
+/* The activity-triggered hooks. */
+uint64_t peak_alloc_new_event_wait(tsd_t *tsd);
+uint64_t peak_alloc_postponed_event_wait(tsd_t *tsd);
+void peak_alloc_event_handler(tsd_t *tsd, uint64_t elapsed);
+uint64_t peak_dalloc_new_event_wait(tsd_t *tsd);
+uint64_t peak_dalloc_postponed_event_wait(tsd_t *tsd);
+void peak_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed);
+
+#endif /* JEMALLOC_INTERNAL_PEAK_EVENT_H */

+ 520 - 0
BeefRT/JEMalloc/include/jemalloc/internal/ph.h

@@ -0,0 +1,520 @@
+#ifndef JEMALLOC_INTERNAL_PH_H
+#define JEMALLOC_INTERNAL_PH_H
+
+/*
+ * A Pairing Heap implementation.
+ *
+ * "The Pairing Heap: A New Form of Self-Adjusting Heap"
+ * https://www.cs.cmu.edu/~sleator/papers/pairing-heaps.pdf
+ *
+ * With an auxiliary two-pass list, described in a follow-on paper.
+ *
+ * "Pairing Heaps: Experiments and Analysis"
+ * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.106.2988&rep=rep1&type=pdf
+ *
+ *******************************************************************************
+ *
+ * We include a non-obvious optimization:
+ * - First, we introduce a new pop-and-link operation; pop the two most
+ *   recently-inserted items off the aux-list, link them, and push the resulting
+ *   heap.
+ * - We maintain a count of the number of insertions since the last time we
+ *   merged the aux-list (i.e. via first() or remove_first()).  After N inserts,
+ *   we do ffs(N) pop-and-link operations.
+ *
+ * One way to think of this is that we're progressively building up a tree in
+ * the aux-list, rather than a linked-list (think of the series of merges that
+ * will be performed as the aux-count grows).
+ *
+ * There are a couple of reasons we benefit from this:
+ * - Ordinarily, after N insertions, the aux-list is of size N.  With our
+ *   strategy, it's of size O(log(N)).  So we decrease the worst-case time of
+ *   first() calls, and reduce the average cost of remove_first() calls.  Since
+ *   these almost always occur while holding a lock, we practically reduce the
+ *   frequency of unusually long hold times.
+ * - This moves the bulk of the work of merging the aux-list onto the threads
+ *   that are inserting into the heap.  In some common scenarios, insertions
+ *   happen in bulk, from a single thread (think tcache flushing; we potentially
+ *   move many slabs from slabs_full to slabs_nonfull).  All the nodes in this
+ *   case are in the inserting thread's cache, and linking them is very cheap
+ *   (cache misses dominate linking cost).  Without this optimization, linking
+ *   happens on the next call to remove_first.  Since that remove_first call
+ *   likely happens on a different thread (or at least, after the cache has
+ *   gotten cold if done on the same thread), deferring linking trades cheap
+ *   link operations now for expensive ones later.
+ *
+ * The ffs trick keeps amortized insert cost at constant time.  Similar
+ * strategies based on periodically sorting the list after a batch of operations
+ * perform worse than this in practice, even with various fancy tricks; they
+ * all raised the amortized complexity of an insert from O(1) to O(log(n)).
+ */
+
+typedef int (*ph_cmp_t)(void *, void *);
+
+/* Node structure. */
+typedef struct phn_link_s phn_link_t;
+struct phn_link_s {
+	void *prev;
+	void *next;
+	void *lchild;
+};
+
+typedef struct ph_s ph_t;
+struct ph_s {
+	void *root;
+	/*
+	 * Inserts done since the last aux-list merge.  This is not necessarily
+	 * the size of the aux-list, since it's possible that removals have
+	 * happened since, and we don't track whether or not those removals are
+	 * from the aux list.
+	 */
+	size_t auxcount;
+};
+
+JEMALLOC_ALWAYS_INLINE phn_link_t *
+phn_link_get(void *phn, size_t offset) {
+	return (phn_link_t *)(((uintptr_t)phn) + offset);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+phn_link_init(void *phn, size_t offset) {
+	phn_link_get(phn, offset)->prev = NULL;
+	phn_link_get(phn, offset)->next = NULL;
+	phn_link_get(phn, offset)->lchild = NULL;
+}
+
+/* Internal utility helpers. */
+JEMALLOC_ALWAYS_INLINE void *
+phn_lchild_get(void *phn, size_t offset) {
+	return phn_link_get(phn, offset)->lchild;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+phn_lchild_set(void *phn, void *lchild, size_t offset) {
+	phn_link_get(phn, offset)->lchild = lchild;
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+phn_next_get(void *phn, size_t offset) {
+	return phn_link_get(phn, offset)->next;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+phn_next_set(void *phn, void *next, size_t offset) {
+	phn_link_get(phn, offset)->next = next;
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+phn_prev_get(void *phn, size_t offset) {
+	return phn_link_get(phn, offset)->prev;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+phn_prev_set(void *phn, void *prev, size_t offset) {
+	phn_link_get(phn, offset)->prev = prev;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+phn_merge_ordered(void *phn0, void *phn1, size_t offset,
+    ph_cmp_t cmp) {
+	void *phn0child;
+
+	assert(phn0 != NULL);
+	assert(phn1 != NULL);
+	assert(cmp(phn0, phn1) <= 0);
+
+	phn_prev_set(phn1, phn0, offset);
+	phn0child = phn_lchild_get(phn0, offset);
+	phn_next_set(phn1, phn0child, offset);
+	if (phn0child != NULL) {
+		phn_prev_set(phn0child, phn1, offset);
+	}
+	phn_lchild_set(phn0, phn1, offset);
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+phn_merge(void *phn0, void *phn1, size_t offset, ph_cmp_t cmp) {
+	void *result;
+	if (phn0 == NULL) {
+		result = phn1;
+	} else if (phn1 == NULL) {
+		result = phn0;
+	} else if (cmp(phn0, phn1) < 0) {
+		phn_merge_ordered(phn0, phn1, offset, cmp);
+		result = phn0;
+	} else {
+		phn_merge_ordered(phn1, phn0, offset, cmp);
+		result = phn1;
+	}
+	return result;
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+phn_merge_siblings(void *phn, size_t offset, ph_cmp_t cmp) {
+	void *head = NULL;
+	void *tail = NULL;
+	void *phn0 = phn;
+	void *phn1 = phn_next_get(phn0, offset);
+
+	/*
+	 * Multipass merge, wherein the first two elements of a FIFO
+	 * are repeatedly merged, and each result is appended to the
+	 * singly linked FIFO, until the FIFO contains only a single
+	 * element.  We start with a sibling list but no reference to
+	 * its tail, so we do a single pass over the sibling list to
+	 * populate the FIFO.
+	 */
+	if (phn1 != NULL) {
+		void *phnrest = phn_next_get(phn1, offset);
+		if (phnrest != NULL) {
+			phn_prev_set(phnrest, NULL, offset);
+		}
+		phn_prev_set(phn0, NULL, offset);
+		phn_next_set(phn0, NULL, offset);
+		phn_prev_set(phn1, NULL, offset);
+		phn_next_set(phn1, NULL, offset);
+		phn0 = phn_merge(phn0, phn1, offset, cmp);
+		head = tail = phn0;
+		phn0 = phnrest;
+		while (phn0 != NULL) {
+			phn1 = phn_next_get(phn0, offset);
+			if (phn1 != NULL) {
+				phnrest = phn_next_get(phn1, offset);
+				if (phnrest != NULL) {
+					phn_prev_set(phnrest, NULL, offset);
+				}
+				phn_prev_set(phn0, NULL, offset);
+				phn_next_set(phn0, NULL, offset);
+				phn_prev_set(phn1, NULL, offset);
+				phn_next_set(phn1, NULL, offset);
+				phn0 = phn_merge(phn0, phn1, offset, cmp);
+				phn_next_set(tail, phn0, offset);
+				tail = phn0;
+				phn0 = phnrest;
+			} else {
+				phn_next_set(tail, phn0, offset);
+				tail = phn0;
+				phn0 = NULL;
+			}
+		}
+		phn0 = head;
+		phn1 = phn_next_get(phn0, offset);
+		if (phn1 != NULL) {
+			while (true) {
+				head = phn_next_get(phn1, offset);
+				assert(phn_prev_get(phn0, offset) == NULL);
+				phn_next_set(phn0, NULL, offset);
+				assert(phn_prev_get(phn1, offset) == NULL);
+				phn_next_set(phn1, NULL, offset);
+				phn0 = phn_merge(phn0, phn1, offset, cmp);
+				if (head == NULL) {
+					break;
+				}
+				phn_next_set(tail, phn0, offset);
+				tail = phn0;
+				phn0 = head;
+				phn1 = phn_next_get(phn0, offset);
+			}
+		}
+	}
+	return phn0;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+ph_merge_aux(ph_t *ph, size_t offset, ph_cmp_t cmp) {
+	ph->auxcount = 0;
+	void *phn = phn_next_get(ph->root, offset);
+	if (phn != NULL) {
+		phn_prev_set(ph->root, NULL, offset);
+		phn_next_set(ph->root, NULL, offset);
+		phn_prev_set(phn, NULL, offset);
+		phn = phn_merge_siblings(phn, offset, cmp);
+		assert(phn_next_get(phn, offset) == NULL);
+		ph->root = phn_merge(ph->root, phn, offset, cmp);
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+ph_merge_children(void *phn, size_t offset, ph_cmp_t cmp) {
+	void *result;
+	void *lchild = phn_lchild_get(phn, offset);
+	if (lchild == NULL) {
+		result = NULL;
+	} else {
+		result = phn_merge_siblings(lchild, offset, cmp);
+	}
+	return result;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+ph_new(ph_t *ph) {
+	ph->root = NULL;
+	ph->auxcount = 0;
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+ph_empty(ph_t *ph) {
+	return ph->root == NULL;
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+ph_first(ph_t *ph, size_t offset, ph_cmp_t cmp) {
+	if (ph->root == NULL) {
+		return NULL;
+	}
+	ph_merge_aux(ph, offset, cmp);
+	return ph->root;
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+ph_any(ph_t *ph, size_t offset) {
+	if (ph->root == NULL) {
+		return NULL;
+	}
+	void *aux = phn_next_get(ph->root, offset);
+	if (aux != NULL) {
+		return aux;
+	}
+	return ph->root;
+}
+
+/* Returns true if we should stop trying to merge. */
+JEMALLOC_ALWAYS_INLINE bool
+ph_try_aux_merge_pair(ph_t *ph, size_t offset, ph_cmp_t cmp) {
+	assert(ph->root != NULL);
+	void *phn0 = phn_next_get(ph->root, offset);
+	if (phn0 == NULL) {
+		return true;
+	}
+	void *phn1 = phn_next_get(phn0, offset);
+	if (phn1 == NULL) {
+		return true;
+	}
+	void *next_phn1 = phn_next_get(phn1, offset);
+	phn_next_set(phn0, NULL, offset);
+	phn_prev_set(phn0, NULL, offset);
+	phn_next_set(phn1, NULL, offset);
+	phn_prev_set(phn1, NULL, offset);
+	phn0 = phn_merge(phn0, phn1, offset, cmp);
+	phn_next_set(phn0, next_phn1, offset);
+	if (next_phn1 != NULL) {
+		phn_prev_set(next_phn1, phn0, offset);
+	}
+	phn_next_set(ph->root, phn0, offset);
+	phn_prev_set(phn0, ph->root, offset);
+	return next_phn1 == NULL;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+ph_insert(ph_t *ph, void *phn, size_t offset, ph_cmp_t cmp) {
+	phn_link_init(phn, offset);
+
+	/*
+	 * Treat the root as an aux list during insertion, and lazily merge
+	 * during a_prefix##remove_first().  For elements that are inserted,
+	 * then removed via a_prefix##remove() before the aux list is ever
+	 * processed, this makes insert/remove constant-time, whereas eager
+	 * merging would make insert O(log n).
+	 */
+	if (ph->root == NULL) {
+		ph->root = phn;
+	} else {
+		/*
+		 * As a special case, check to see if we can replace the root.
+		 * This is practically common in some important cases, and lets
+		 * us defer some insertions (hopefully, until the point where
+		 * some of the items in the aux list have been removed, savings
+		 * us from linking them at all).
+		 */
+		if (cmp(phn, ph->root) < 0) {
+			phn_lchild_set(phn, ph->root, offset);
+			phn_prev_set(ph->root, phn, offset);
+			ph->root = phn;
+			ph->auxcount = 0;
+			return;
+		}
+		ph->auxcount++;
+		phn_next_set(phn, phn_next_get(ph->root, offset), offset);
+		if (phn_next_get(ph->root, offset) != NULL) {
+			phn_prev_set(phn_next_get(ph->root, offset), phn,
+			    offset);
+		}
+		phn_prev_set(phn, ph->root, offset);
+		phn_next_set(ph->root, phn, offset);
+	}
+	if (ph->auxcount > 1) {
+		unsigned nmerges = ffs_zu(ph->auxcount - 1);
+		bool done = false;
+		for (unsigned i = 0; i < nmerges && !done; i++) {
+			done = ph_try_aux_merge_pair(ph, offset, cmp);
+		}
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+ph_remove_first(ph_t *ph, size_t offset, ph_cmp_t cmp) {
+	void *ret;
+
+	if (ph->root == NULL) {
+		return NULL;
+	}
+	ph_merge_aux(ph, offset, cmp);
+	ret = ph->root;
+	ph->root = ph_merge_children(ph->root, offset, cmp);
+
+	return ret;
+
+}
+
+JEMALLOC_ALWAYS_INLINE void
+ph_remove(ph_t *ph, void *phn, size_t offset, ph_cmp_t cmp) {
+	void *replace;
+	void *parent;
+
+	if (ph->root == phn) {
+		/*
+		 * We can delete from aux list without merging it, but we need
+		 * to merge if we are dealing with the root node and it has
+		 * children.
+		 */
+		if (phn_lchild_get(phn, offset) == NULL) {
+			ph->root = phn_next_get(phn, offset);
+			if (ph->root != NULL) {
+				phn_prev_set(ph->root, NULL, offset);
+			}
+			return;
+		}
+		ph_merge_aux(ph, offset, cmp);
+		if (ph->root == phn) {
+			ph->root = ph_merge_children(ph->root, offset, cmp);
+			return;
+		}
+	}
+
+	/* Get parent (if phn is leftmost child) before mutating. */
+	if ((parent = phn_prev_get(phn, offset)) != NULL) {
+		if (phn_lchild_get(parent, offset) != phn) {
+			parent = NULL;
+		}
+	}
+	/* Find a possible replacement node, and link to parent. */
+	replace = ph_merge_children(phn, offset, cmp);
+	/* Set next/prev for sibling linked list. */
+	if (replace != NULL) {
+		if (parent != NULL) {
+			phn_prev_set(replace, parent, offset);
+			phn_lchild_set(parent, replace, offset);
+		} else {
+			phn_prev_set(replace, phn_prev_get(phn, offset),
+			    offset);
+			if (phn_prev_get(phn, offset) != NULL) {
+				phn_next_set(phn_prev_get(phn, offset), replace,
+				    offset);
+			}
+		}
+		phn_next_set(replace, phn_next_get(phn, offset), offset);
+		if (phn_next_get(phn, offset) != NULL) {
+			phn_prev_set(phn_next_get(phn, offset), replace,
+			    offset);
+		}
+	} else {
+		if (parent != NULL) {
+			void *next = phn_next_get(phn, offset);
+			phn_lchild_set(parent, next, offset);
+			if (next != NULL) {
+				phn_prev_set(next, parent, offset);
+			}
+		} else {
+			assert(phn_prev_get(phn, offset) != NULL);
+			phn_next_set(
+			    phn_prev_get(phn, offset),
+			    phn_next_get(phn, offset), offset);
+		}
+		if (phn_next_get(phn, offset) != NULL) {
+			phn_prev_set(
+			    phn_next_get(phn, offset),
+			    phn_prev_get(phn, offset), offset);
+		}
+	}
+}
+
+#define ph_structs(a_prefix, a_type)					\
+typedef struct {							\
+	phn_link_t link;						\
+} a_prefix##_link_t;							\
+									\
+typedef struct {							\
+	ph_t ph;							\
+} a_prefix##_t;
+
+/*
+ * The ph_proto() macro generates function prototypes that correspond to the
+ * functions generated by an equivalently parameterized call to ph_gen().
+ */
+#define ph_proto(a_attr, a_prefix, a_type)				\
+									\
+a_attr void a_prefix##_new(a_prefix##_t *ph);				\
+a_attr bool a_prefix##_empty(a_prefix##_t *ph);				\
+a_attr a_type *a_prefix##_first(a_prefix##_t *ph);			\
+a_attr a_type *a_prefix##_any(a_prefix##_t *ph);			\
+a_attr void a_prefix##_insert(a_prefix##_t *ph, a_type *phn);		\
+a_attr a_type *a_prefix##_remove_first(a_prefix##_t *ph);		\
+a_attr void a_prefix##_remove(a_prefix##_t *ph, a_type *phn);		\
+a_attr a_type *a_prefix##_remove_any(a_prefix##_t *ph);
+
+/* The ph_gen() macro generates a type-specific pairing heap implementation. */
+#define ph_gen(a_attr, a_prefix, a_type, a_field, a_cmp)		\
+JEMALLOC_ALWAYS_INLINE int						\
+a_prefix##_ph_cmp(void *a, void *b) {					\
+	return a_cmp((a_type *)a, (a_type *)b);				\
+}									\
+									\
+a_attr void								\
+a_prefix##_new(a_prefix##_t *ph) {					\
+	ph_new(&ph->ph);						\
+}									\
+									\
+a_attr bool								\
+a_prefix##_empty(a_prefix##_t *ph) {					\
+	return ph_empty(&ph->ph);					\
+}									\
+									\
+a_attr a_type *								\
+a_prefix##_first(a_prefix##_t *ph) {					\
+	return ph_first(&ph->ph, offsetof(a_type, a_field),		\
+	    &a_prefix##_ph_cmp);					\
+}									\
+									\
+a_attr a_type *								\
+a_prefix##_any(a_prefix##_t *ph) {					\
+	return ph_any(&ph->ph, offsetof(a_type, a_field));		\
+}									\
+									\
+a_attr void								\
+a_prefix##_insert(a_prefix##_t *ph, a_type *phn) {			\
+	ph_insert(&ph->ph, phn, offsetof(a_type, a_field),		\
+	    a_prefix##_ph_cmp);						\
+}									\
+									\
+a_attr a_type *								\
+a_prefix##_remove_first(a_prefix##_t *ph) {				\
+	return ph_remove_first(&ph->ph, offsetof(a_type, a_field),	\
+	    a_prefix##_ph_cmp);						\
+}									\
+									\
+a_attr void								\
+a_prefix##_remove(a_prefix##_t *ph, a_type *phn) {			\
+	ph_remove(&ph->ph, phn, offsetof(a_type, a_field),		\
+	    a_prefix##_ph_cmp);						\
+}									\
+									\
+a_attr a_type *								\
+a_prefix##_remove_any(a_prefix##_t *ph) {				\
+	a_type *ret = a_prefix##_any(ph);				\
+	if (ret != NULL) {						\
+		a_prefix##_remove(ph, ret);				\
+	}								\
+	return ret;							\
+}
+
+#endif /* JEMALLOC_INTERNAL_PH_H */
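
For reference, a hedged sketch of how the ph_structs/ph_proto/ph_gen macros above might be instantiated for a caller-defined type; the node_t/node_heap names and the key field are invented, and the sketch assumes jemalloc's internal headers (JEMALLOC_ALWAYS_INLINE, ffs_zu, offsetof, etc.) are available.

typedef struct node_s node_t;

/* Declares node_heap_link_t (the intrusive link) and node_heap_t (the heap). */
ph_structs(node_heap, node_t);

struct node_s {
	int key;
	node_heap_link_t heap_link;
};

/* Three-way comparison; the heap yields the smallest key first. */
static int
node_cmp(node_t *a, node_t *b) {
	return (a->key > b->key) - (a->key < b->key);
}

ph_proto(static, node_heap, node_t)
ph_gen(static, node_heap, node_t, heap_link, node_cmp)

static node_t *
pop_min(node_heap_t *heap, node_t *nodes, size_t n) {
	node_heap_new(heap);
	for (size_t i = 0; i < n; i++) {
		node_heap_insert(heap, &nodes[i]);
	}
	return node_heap_remove_first(heap); /* smallest key, or NULL if empty */
}

The link field is intrusive: each node carries its own heap linkage, so the heap operations themselves never allocate.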

+ 5 - 0
BeefRT/JEMalloc/include/jemalloc/internal/private_namespace.sh

@@ -0,0 +1,5 @@
+#!/bin/sh
+
+for symbol in `cat "$@"` ; do
+  echo "#define ${symbol} JEMALLOC_N(${symbol})"
+done

+ 52 - 0
BeefRT/JEMalloc/include/jemalloc/internal/private_symbols.awk

@@ -0,0 +1,52 @@
+#!/usr/bin/env awk -f
+
+BEGIN {
+  sym_prefix = ""
+  split("\
+        je_aligned_alloc \
+        je_calloc \
+        je_dallocx \
+        je_free \
+        je_mallctl \
+        je_mallctlbymib \
+        je_mallctlnametomib \
+        je_malloc \
+        je_malloc_conf \
+        je_malloc_conf_2_conf_harder \
+        je_malloc_message \
+        je_malloc_stats_print \
+        je_malloc_usable_size \
+        je_mallocx \
+        je_smallocx_54eaed1d8b56b1aa528be3bdd1877e59c56fa90c \
+        je_nallocx \
+        je_posix_memalign \
+        je_rallocx \
+        je_realloc \
+        je_sallocx \
+        je_sdallocx \
+        je_xallocx \
+        tls_callback \
+        ", exported_symbol_names)
+  # Store exported symbol names as keys in exported_symbols.
+  for (i in exported_symbol_names) {
+    exported_symbols[exported_symbol_names[i]] = 1
+  }
+}
+
+# Process 'nm -a <c_source.o>' output.
+#
+# Handle lines like:
+#   0000000000000008 D opt_junk
+#   0000000000007574 T malloc_initialized
+(NF == 3 && $2 ~ /^[ABCDGRSTVW]$/ && !($3 in exported_symbols) && $3 ~ /^[A-Za-z0-9_]+$/) {
+  print substr($3, 1+length(sym_prefix), length($3)-length(sym_prefix))
+}
+
+# Process 'dumpbin /SYMBOLS <c_source.obj>' output.
+#
+# Handle lines like:
+#   353 00008098 SECT4  notype       External     | opt_junk
+#   3F1 00000000 SECT7  notype ()    External     | malloc_initialized
+($3 ~ /^SECT[0-9]+/ && $(NF-2) == "External" && !($NF in exported_symbols)) {
+  print $NF
+}

+ 51 - 0
BeefRT/JEMalloc/include/jemalloc/internal/private_symbols.sh

@@ -0,0 +1,51 @@
+#!/bin/sh
+#
+# Generate private_symbols[_jet].awk.
+#
+# Usage: private_symbols.sh <sym_prefix> <sym>*
+#
+# <sym_prefix> is typically "" or "_".
+
+sym_prefix=$1
+shift
+
+cat <<EOF
+#!/usr/bin/env awk -f
+
+BEGIN {
+  sym_prefix = "${sym_prefix}"
+  split("\\
+EOF
+
+for public_sym in "$@" ; do
+  cat <<EOF
+        ${sym_prefix}${public_sym} \\
+EOF
+done
+
+cat <<"EOF"
+        ", exported_symbol_names)
+  # Store exported symbol names as keys in exported_symbols.
+  for (i in exported_symbol_names) {
+    exported_symbols[exported_symbol_names[i]] = 1
+  }
+}
+
+# Process 'nm -a <c_source.o>' output.
+#
+# Handle lines like:
+#   0000000000000008 D opt_junk
+#   0000000000007574 T malloc_initialized
+(NF == 3 && $2 ~ /^[ABCDGRSTVW]$/ && !($3 in exported_symbols) && $3 ~ /^[A-Za-z0-9_]+$/) {
+  print substr($3, 1+length(sym_prefix), length($3)-length(sym_prefix))
+}
+
+# Process 'dumpbin /SYMBOLS <c_source.obj>' output.
+#
+# Handle lines like:
+#   353 00008098 SECT4  notype       External     | opt_junk
+#   3F1 00000000 SECT7  notype ()    External     | malloc_initialized
+($3 ~ /^SECT[0-9]+/ && $(NF-2) == "External" && !($NF in exported_symbols)) {
+  print $NF
+}
+EOF

+ 52 - 0
BeefRT/JEMalloc/include/jemalloc/internal/private_symbols_jet.awk

@@ -0,0 +1,52 @@
+#!/usr/bin/env awk -f
+
+BEGIN {
+  sym_prefix = ""
+  split("\
+        jet_aligned_alloc \
+        jet_calloc \
+        jet_dallocx \
+        jet_free \
+        jet_mallctl \
+        jet_mallctlbymib \
+        jet_mallctlnametomib \
+        jet_malloc \
+        jet_malloc_conf \
+        jet_malloc_conf_2_conf_harder \
+        jet_malloc_message \
+        jet_malloc_stats_print \
+        jet_malloc_usable_size \
+        jet_mallocx \
+        jet_smallocx_54eaed1d8b56b1aa528be3bdd1877e59c56fa90c \
+        jet_nallocx \
+        jet_posix_memalign \
+        jet_rallocx \
+        jet_realloc \
+        jet_sallocx \
+        jet_sdallocx \
+        jet_xallocx \
+        tls_callback \
+        ", exported_symbol_names)
+  # Store exported symbol names as keys in exported_symbols.
+  for (i in exported_symbol_names) {
+    exported_symbols[exported_symbol_names[i]] = 1
+  }
+}
+
+# Process 'nm -a <c_source.o>' output.
+#
+# Handle lines like:
+#   0000000000000008 D opt_junk
+#   0000000000007574 T malloc_initialized
+(NF == 3 && $2 ~ /^[ABCDGRSTVW]$/ && !($3 in exported_symbols) && $3 ~ /^[A-Za-z0-9_]+$/) {
+  print substr($3, 1+length(sym_prefix), length($3)-length(sym_prefix))
+}
+
+# Process 'dumpbin /SYMBOLS <c_source.obj>' output.
+#
+# Handle lines like:
+#   353 00008098 SECT4  notype       External     | opt_junk
+#   3F1 00000000 SECT7  notype ()    External     | malloc_initialized
+($3 ~ /^SECT[0-9]+/ && $(NF-2) == "External" && !($NF in exported_symbols)) {
+  print $NF
+}

+ 168 - 0
BeefRT/JEMalloc/include/jemalloc/internal/prng.h

@@ -0,0 +1,168 @@
+#ifndef JEMALLOC_INTERNAL_PRNG_H
+#define JEMALLOC_INTERNAL_PRNG_H
+
+#include "jemalloc/internal/bit_util.h"
+
+/*
+ * Simple linear congruential pseudo-random number generator:
+ *
+ *   prng(x) = (a*x + c) % m
+ *
+ * where the following constants ensure maximal period:
+ *
+ *   a == Odd number (relatively prime to 2^n), and (a-1) is a multiple of 4.
+ *   c == Odd number (relatively prime to 2^n).
+ *   m == 2^32
+ *
+ * See Knuth's TAOCP 3rd Ed., Vol. 2, pg. 17 for details on these constraints.
+ *
+ * This choice of m has the disadvantage that the quality of the bits is
+ * proportional to bit position.  For example, the lowest bit has a cycle of 2,
+ * the next has a cycle of 4, etc.  For this reason, we prefer to use the upper
+ * bits.
+ */
+
+/******************************************************************************/
+/* INTERNAL DEFINITIONS -- IGNORE */
+/******************************************************************************/
+#define PRNG_A_32	UINT32_C(1103515241)
+#define PRNG_C_32	UINT32_C(12347)
+
+#define PRNG_A_64	UINT64_C(6364136223846793005)
+#define PRNG_C_64	UINT64_C(1442695040888963407)
+
+JEMALLOC_ALWAYS_INLINE uint32_t
+prng_state_next_u32(uint32_t state) {
+	return (state * PRNG_A_32) + PRNG_C_32;
+}
+
+JEMALLOC_ALWAYS_INLINE uint64_t
+prng_state_next_u64(uint64_t state) {
+	return (state * PRNG_A_64) + PRNG_C_64;
+}
+
+JEMALLOC_ALWAYS_INLINE size_t
+prng_state_next_zu(size_t state) {
+#if LG_SIZEOF_PTR == 2
+	return (state * PRNG_A_32) + PRNG_C_32;
+#elif LG_SIZEOF_PTR == 3
+	return (state * PRNG_A_64) + PRNG_C_64;
+#else
+#error Unsupported pointer size
+#endif
+}
+
+/******************************************************************************/
+/* BEGIN PUBLIC API */
+/******************************************************************************/
+
+/*
+ * The prng_lg_range functions give a uniform int in the half-open range [0,
+ * 2**lg_range).
+ */
+
+JEMALLOC_ALWAYS_INLINE uint32_t
+prng_lg_range_u32(uint32_t *state, unsigned lg_range) {
+	assert(lg_range > 0);
+	assert(lg_range <= 32);
+
+	*state = prng_state_next_u32(*state);
+	uint32_t ret = *state >> (32 - lg_range);
+
+	return ret;
+}
+
+JEMALLOC_ALWAYS_INLINE uint64_t
+prng_lg_range_u64(uint64_t *state, unsigned lg_range) {
+	assert(lg_range > 0);
+	assert(lg_range <= 64);
+
+	*state = prng_state_next_u64(*state);
+	uint64_t ret = *state >> (64 - lg_range);
+
+	return ret;
+}
+
+JEMALLOC_ALWAYS_INLINE size_t
+prng_lg_range_zu(size_t *state, unsigned lg_range) {
+	assert(lg_range > 0);
+	assert(lg_range <= ZU(1) << (3 + LG_SIZEOF_PTR));
+
+	*state = prng_state_next_zu(*state);
+	size_t ret = *state >> ((ZU(1) << (3 + LG_SIZEOF_PTR)) - lg_range);
+
+	return ret;
+}
+
+/*
+ * The prng_range functions behave like the prng_lg_range, but return a result
+ * in [0, range) instead of [0, 2**lg_range).
+ */
+
+JEMALLOC_ALWAYS_INLINE uint32_t
+prng_range_u32(uint32_t *state, uint32_t range) {
+	assert(range != 0);
+	/*
+	 * If range were 1, lg_range would be 0, so the shift in
+	 * prng_lg_range_u32 would be a shift of a 32-bit variable by 32 bits,
+	 * which is UB.  Just handle this case as a one-off.
+	 */
+	if (range == 1) {
+		return 0;
+	}
+
+	/* Compute the ceiling of lg(range). */
+	unsigned lg_range = ffs_u32(pow2_ceil_u32(range));
+
+	/* Generate a result in [0..range) via repeated trial. */
+	uint32_t ret;
+	do {
+		ret = prng_lg_range_u32(state, lg_range);
+	} while (ret >= range);
+
+	return ret;
+}
+
+JEMALLOC_ALWAYS_INLINE uint64_t
+prng_range_u64(uint64_t *state, uint64_t range) {
+	assert(range != 0);
+
+	/* See the note in prng_range_u32. */
+	if (range == 1) {
+		return 0;
+	}
+
+	/* Compute the ceiling of lg(range). */
+	unsigned lg_range = ffs_u64(pow2_ceil_u64(range));
+
+	/* Generate a result in [0..range) via repeated trial. */
+	uint64_t ret;
+	do {
+		ret = prng_lg_range_u64(state, lg_range);
+	} while (ret >= range);
+
+	return ret;
+}
+
+JEMALLOC_ALWAYS_INLINE size_t
+prng_range_zu(size_t *state, size_t range) {
+	assert(range != 0);
+
+	/* See the note in prng_range_u32. */
+	if (range == 1) {
+		return 0;
+	}
+
+	/* Compute the ceiling of lg(range). */
+	unsigned lg_range = ffs_u64(pow2_ceil_u64(range));
+
+	/* Generate a result in [0..range) via repeated trial. */
+	size_t ret;
+	do {
+		ret = prng_lg_range_zu(state, lg_range);
+	} while (ret >= range);
+
+	return ret;
+}
+
+#endif /* JEMALLOC_INTERNAL_PRNG_H */
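
A brief usage sketch of the helpers above, assuming jemalloc's internal headers are available; the seed value is arbitrary. The lg_range variants take bits from the top of the state because the low LCG bits have short cycles, and the prng_range_* variants avoid modulo bias by rounding the range up to a power of two and redrawing out-of-range results.

static void
prng_demo(void) {
	uint64_t state = 0x9e3779b97f4a7c15ULL;  /* arbitrary seed */

	/* Uniform in [0, 2^8) == [0, 256): the top 8 bits of the new state. */
	uint64_t byte = prng_lg_range_u64(&state, 8);

	/* Uniform in [0, 1000): rounds 1000 up to 1024 internally and redraws
	 * any result >= 1000 (rejection sampling), so there is no modulo bias. */
	uint64_t die = prng_range_u64(&state, 1000);

	(void)byte;
	(void)die;
}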

+ 37 - 0
BeefRT/JEMalloc/include/jemalloc/internal/prof_data.h

@@ -0,0 +1,37 @@
+#ifndef JEMALLOC_INTERNAL_PROF_DATA_H
+#define JEMALLOC_INTERNAL_PROF_DATA_H
+
+#include "jemalloc/internal/mutex.h"
+
+extern malloc_mutex_t bt2gctx_mtx;
+extern malloc_mutex_t tdatas_mtx;
+extern malloc_mutex_t prof_dump_mtx;
+
+extern malloc_mutex_t *gctx_locks;
+extern malloc_mutex_t *tdata_locks;
+
+extern size_t prof_unbiased_sz[PROF_SC_NSIZES];
+extern size_t prof_shifted_unbiased_cnt[PROF_SC_NSIZES];
+
+void prof_bt_hash(const void *key, size_t r_hash[2]);
+bool prof_bt_keycomp(const void *k1, const void *k2);
+
+bool prof_data_init(tsd_t *tsd);
+prof_tctx_t *prof_lookup(tsd_t *tsd, prof_bt_t *bt);
+char *prof_thread_name_alloc(tsd_t *tsd, const char *thread_name);
+int prof_thread_name_set_impl(tsd_t *tsd, const char *thread_name);
+void prof_unbias_map_init();
+void prof_dump_impl(tsd_t *tsd, write_cb_t *prof_dump_write, void *cbopaque,
+    prof_tdata_t *tdata, bool leakcheck);
+prof_tdata_t * prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid,
+    uint64_t thr_discrim, char *thread_name, bool active);
+void prof_tdata_detach(tsd_t *tsd, prof_tdata_t *tdata);
+void prof_reset(tsd_t *tsd, size_t lg_sample);
+void prof_tctx_try_destroy(tsd_t *tsd, prof_tctx_t *tctx);
+
+/* Used in unit tests. */
+size_t prof_tdata_count(void);
+size_t prof_bt_count(void);
+void prof_cnt_all(prof_cnt_t *cnt_all);
+
+#endif /* JEMALLOC_INTERNAL_PROF_DATA_H */

+ 95 - 0
BeefRT/JEMalloc/include/jemalloc/internal/prof_externs.h

@@ -0,0 +1,95 @@
+#ifndef JEMALLOC_INTERNAL_PROF_EXTERNS_H
+#define JEMALLOC_INTERNAL_PROF_EXTERNS_H
+
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/prof_hook.h"
+
+extern bool opt_prof;
+extern bool opt_prof_active;
+extern bool opt_prof_thread_active_init;
+extern size_t opt_lg_prof_sample;    /* Mean bytes between samples. */
+extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */
+extern bool opt_prof_gdump;          /* High-water memory dumping. */
+extern bool opt_prof_final;          /* Final profile dumping. */
+extern bool opt_prof_leak;           /* Dump leak summary at exit. */
+extern bool opt_prof_leak_error;     /* Exit with error code if memory leaked */
+extern bool opt_prof_accum;          /* Report cumulative bytes. */
+extern bool opt_prof_log;            /* Turn logging on at boot. */
+extern char opt_prof_prefix[
+    /* Minimize memory bloat for non-prof builds. */
+#ifdef JEMALLOC_PROF
+    PATH_MAX +
+#endif
+    1];
+extern bool opt_prof_unbias;
+
+/* For recording recent allocations */
+extern ssize_t opt_prof_recent_alloc_max;
+
+/* Whether to use thread name provided by the system or by mallctl. */
+extern bool opt_prof_sys_thread_name;
+
+/* Whether to record per size class counts and request size totals. */
+extern bool opt_prof_stats;
+
+/* Accessed via prof_active_[gs]et{_unlocked,}(). */
+extern bool prof_active_state;
+
+/* Accessed via prof_gdump_[gs]et{_unlocked,}(). */
+extern bool prof_gdump_val;
+
+/* Profile dump interval, measured in bytes allocated. */
+extern uint64_t prof_interval;
+
+/*
+ * Initialized as opt_lg_prof_sample, and potentially modified during profiling
+ * resets.
+ */
+extern size_t lg_prof_sample;
+
+extern bool prof_booted;
+
+void prof_backtrace_hook_set(prof_backtrace_hook_t hook);
+prof_backtrace_hook_t prof_backtrace_hook_get();
+
+void prof_dump_hook_set(prof_dump_hook_t hook);
+prof_dump_hook_t prof_dump_hook_get();
+
+/* Functions only accessed in prof_inlines.h */
+prof_tdata_t *prof_tdata_init(tsd_t *tsd);
+prof_tdata_t *prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata);
+
+void prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx);
+void prof_malloc_sample_object(tsd_t *tsd, const void *ptr, size_t size,
+    size_t usize, prof_tctx_t *tctx);
+void prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_info_t *prof_info);
+prof_tctx_t *prof_tctx_create(tsd_t *tsd);
+void prof_idump(tsdn_t *tsdn);
+bool prof_mdump(tsd_t *tsd, const char *filename);
+void prof_gdump(tsdn_t *tsdn);
+
+void prof_tdata_cleanup(tsd_t *tsd);
+bool prof_active_get(tsdn_t *tsdn);
+bool prof_active_set(tsdn_t *tsdn, bool active);
+const char *prof_thread_name_get(tsd_t *tsd);
+int prof_thread_name_set(tsd_t *tsd, const char *thread_name);
+bool prof_thread_active_get(tsd_t *tsd);
+bool prof_thread_active_set(tsd_t *tsd, bool active);
+bool prof_thread_active_init_get(tsdn_t *tsdn);
+bool prof_thread_active_init_set(tsdn_t *tsdn, bool active_init);
+bool prof_gdump_get(tsdn_t *tsdn);
+bool prof_gdump_set(tsdn_t *tsdn, bool active);
+void prof_boot0(void);
+void prof_boot1(void);
+bool prof_boot2(tsd_t *tsd, base_t *base);
+void prof_prefork0(tsdn_t *tsdn);
+void prof_prefork1(tsdn_t *tsdn);
+void prof_postfork_parent(tsdn_t *tsdn);
+void prof_postfork_child(tsdn_t *tsdn);
+
+/* Only accessed by thread event. */
+uint64_t prof_sample_new_event_wait(tsd_t *tsd);
+uint64_t prof_sample_postponed_event_wait(tsd_t *tsd);
+void prof_sample_event_handler(tsd_t *tsd, uint64_t elapsed);
+
+#endif /* JEMALLOC_INTERNAL_PROF_EXTERNS_H */

+ 21 - 0
BeefRT/JEMalloc/include/jemalloc/internal/prof_hook.h

@@ -0,0 +1,21 @@
+#ifndef JEMALLOC_INTERNAL_PROF_HOOK_H
+#define JEMALLOC_INTERNAL_PROF_HOOK_H
+
+/*
+ * The hook types declared in this file are experimental and undocumented;
+ * thus the typedefs are located in an 'internal' header.
+ */
+
+/*
+ * A hook to mock out backtrace functionality.  This can be handy, since it's
+ * otherwise difficult to guarantee that two allocations are reported as coming
+ * from the exact same stack trace in the presence of an optimizing compiler.
+ */
+typedef void (*prof_backtrace_hook_t)(void **, unsigned *, unsigned);
+
+/*
+ * A callback hook that notifies about a recently dumped heap profile.
+ */
+typedef void (*prof_dump_hook_t)(const char *filename);
+
+#endif /* JEMALLOC_INTERNAL_PROF_HOOK_H */
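
As a hedged illustration of the backtrace hook's shape, here is a hook that reports an empty trace; the parameter interpretation (frame buffer, out-parameter for the frame count, maximum frame capacity) is a reading of the typedef rather than documented behavior, and the function name is invented.

/* Matches prof_backtrace_hook_t: void (*)(void **, unsigned *, unsigned). */
static void
empty_backtrace_hook(void **vec, unsigned *len, unsigned max_len) {
	(void)vec;
	(void)max_len;
	*len = 0;  /* report an empty stack for every sampled allocation */
}

/* Such a hook would be installed (e.g. in a test) via
 * prof_backtrace_hook_set(empty_backtrace_hook), declared in prof_externs.h
 * above. */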

+ 261 - 0
BeefRT/JEMalloc/include/jemalloc/internal/prof_inlines.h

@@ -0,0 +1,261 @@
+#ifndef JEMALLOC_INTERNAL_PROF_INLINES_H
+#define JEMALLOC_INTERNAL_PROF_INLINES_H
+
+#include "jemalloc/internal/safety_check.h"
+#include "jemalloc/internal/sz.h"
+#include "jemalloc/internal/thread_event.h"
+
+JEMALLOC_ALWAYS_INLINE void
+prof_active_assert() {
+	cassert(config_prof);
+	/*
+	 * If opt_prof is off, then prof_active must always be off, regardless
+	 * of whether prof_active_mtx is in effect or not.
+	 */
+	assert(opt_prof || !prof_active_state);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+prof_active_get_unlocked(void) {
+	prof_active_assert();
+	/*
+	 * Even if opt_prof is true, sampling can be temporarily disabled by
+	 * setting prof_active to false.  No locking is used when reading
+	 * prof_active in the fast path, so there are no guarantees regarding
+	 * how long it will take for all threads to notice state changes.
+	 */
+	return prof_active_state;
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+prof_gdump_get_unlocked(void) {
+	/*
+	 * No locking is used when reading prof_gdump_val in the fast path, so
+	 * there are no guarantees regarding how long it will take for all
+	 * threads to notice state changes.
+	 */
+	return prof_gdump_val;
+}
+
+JEMALLOC_ALWAYS_INLINE prof_tdata_t *
+prof_tdata_get(tsd_t *tsd, bool create) {
+	prof_tdata_t *tdata;
+
+	cassert(config_prof);
+
+	tdata = tsd_prof_tdata_get(tsd);
+	if (create) {
+		assert(tsd_reentrancy_level_get(tsd) == 0);
+		if (unlikely(tdata == NULL)) {
+			if (tsd_nominal(tsd)) {
+				tdata = prof_tdata_init(tsd);
+				tsd_prof_tdata_set(tsd, tdata);
+			}
+		} else if (unlikely(tdata->expired)) {
+			tdata = prof_tdata_reinit(tsd, tdata);
+			tsd_prof_tdata_set(tsd, tdata);
+		}
+		assert(tdata == NULL || tdata->attached);
+	}
+
+	return tdata;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_info_get(tsd_t *tsd, const void *ptr, emap_alloc_ctx_t *alloc_ctx,
+    prof_info_t *prof_info) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+	assert(prof_info != NULL);
+
+	arena_prof_info_get(tsd, ptr, alloc_ctx, prof_info, false);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_info_get_and_reset_recent(tsd_t *tsd, const void *ptr,
+    emap_alloc_ctx_t *alloc_ctx, prof_info_t *prof_info) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+	assert(prof_info != NULL);
+
+	arena_prof_info_get(tsd, ptr, alloc_ctx, prof_info, true);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_tctx_reset(tsd_t *tsd, const void *ptr, emap_alloc_ctx_t *alloc_ctx) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+
+	arena_prof_tctx_reset(tsd, ptr, alloc_ctx);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_tctx_reset_sampled(tsd_t *tsd, const void *ptr) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+
+	arena_prof_tctx_reset_sampled(tsd, ptr);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_info_set(tsd_t *tsd, edata_t *edata, prof_tctx_t *tctx, size_t size) {
+	cassert(config_prof);
+	assert(edata != NULL);
+	assert((uintptr_t)tctx > (uintptr_t)1U);
+
+	arena_prof_info_set(tsd, edata, tctx, size);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+prof_sample_should_skip(tsd_t *tsd, bool sample_event) {
+	cassert(config_prof);
+
+	/* Fastpath: no need to load tdata */
+	if (likely(!sample_event)) {
+		return true;
+	}
+
+	/*
+	 * sample_event is always obtained from the thread event module, and
+	 * whenever it's true, it means that the thread event module has
+	 * already checked the reentrancy level.
+	 */
+	assert(tsd_reentrancy_level_get(tsd) == 0);
+
+	prof_tdata_t *tdata = prof_tdata_get(tsd, true);
+	if (unlikely(tdata == NULL)) {
+		return true;
+	}
+
+	return !tdata->active;
+}
+
+JEMALLOC_ALWAYS_INLINE prof_tctx_t *
+prof_alloc_prep(tsd_t *tsd, bool prof_active, bool sample_event) {
+	prof_tctx_t *ret;
+
+	if (!prof_active ||
+	    likely(prof_sample_should_skip(tsd, sample_event))) {
+		ret = (prof_tctx_t *)(uintptr_t)1U;
+	} else {
+		ret = prof_tctx_create(tsd);
+	}
+
+	return ret;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_malloc(tsd_t *tsd, const void *ptr, size_t size, size_t usize,
+    emap_alloc_ctx_t *alloc_ctx, prof_tctx_t *tctx) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+	assert(usize == isalloc(tsd_tsdn(tsd), ptr));
+
+	if (unlikely((uintptr_t)tctx > (uintptr_t)1U)) {
+		prof_malloc_sample_object(tsd, ptr, size, usize, tctx);
+	} else {
+		prof_tctx_reset(tsd, ptr, alloc_ctx);
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_realloc(tsd_t *tsd, const void *ptr, size_t size, size_t usize,
+    prof_tctx_t *tctx, bool prof_active, const void *old_ptr, size_t old_usize,
+    prof_info_t *old_prof_info, bool sample_event) {
+	bool sampled, old_sampled, moved;
+
+	cassert(config_prof);
+	assert(ptr != NULL || (uintptr_t)tctx <= (uintptr_t)1U);
+
+	if (prof_active && ptr != NULL) {
+		assert(usize == isalloc(tsd_tsdn(tsd), ptr));
+		if (prof_sample_should_skip(tsd, sample_event)) {
+			/*
+			 * Don't sample.  The usize passed to prof_alloc_prep()
+			 * was larger than what actually got allocated, so a
+			 * backtrace was captured for this allocation, even
+			 * though its actual usize was insufficient to cross the
+			 * sample threshold.
+			 */
+			prof_alloc_rollback(tsd, tctx);
+			tctx = (prof_tctx_t *)(uintptr_t)1U;
+		}
+	}
+
+	sampled = ((uintptr_t)tctx > (uintptr_t)1U);
+	old_sampled = ((uintptr_t)old_prof_info->alloc_tctx > (uintptr_t)1U);
+	moved = (ptr != old_ptr);
+
+	if (unlikely(sampled)) {
+		prof_malloc_sample_object(tsd, ptr, size, usize, tctx);
+	} else if (moved) {
+		prof_tctx_reset(tsd, ptr, NULL);
+	} else if (unlikely(old_sampled)) {
+		/*
+		 * prof_tctx_reset() would work for the !moved case as well,
+		 * but prof_tctx_reset_sampled() is slightly cheaper, and the
+		 * proper thing to do here in the presence of explicit
+		 * knowledge re: moved state.
+		 */
+		prof_tctx_reset_sampled(tsd, ptr);
+	} else {
+		prof_info_t prof_info;
+		prof_info_get(tsd, ptr, NULL, &prof_info);
+		assert((uintptr_t)prof_info.alloc_tctx == (uintptr_t)1U);
+	}
+
+	/*
+	 * The prof_free_sampled_object() call must come after the
+	 * prof_malloc_sample_object() call, because tctx and old_tctx may be
+	 * the same, in which case reversing the call order could cause the tctx
+	 * to be prematurely destroyed as a side effect of momentarily zeroed
+	 * counters.
+	 */
+	if (unlikely(old_sampled)) {
+		prof_free_sampled_object(tsd, old_usize, old_prof_info);
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE size_t
+prof_sample_align(size_t orig_align) {
+	/*
+	 * Enforce page alignment, so that sampled allocations can be identified
+	 * w/o metadata lookup.
+	 */
+	assert(opt_prof);
+	return (opt_cache_oblivious && orig_align < PAGE) ? PAGE :
+	    orig_align;
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+prof_sample_aligned(const void *ptr) {
+	return ((uintptr_t)ptr & PAGE_MASK) == 0;
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+prof_sampled(tsd_t *tsd, const void *ptr) {
+	prof_info_t prof_info;
+	prof_info_get(tsd, ptr, NULL, &prof_info);
+	bool sampled = (uintptr_t)prof_info.alloc_tctx > (uintptr_t)1U;
+	if (sampled) {
+		assert(prof_sample_aligned(ptr));
+	}
+	return sampled;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+prof_free(tsd_t *tsd, const void *ptr, size_t usize,
+    emap_alloc_ctx_t *alloc_ctx) {
+	prof_info_t prof_info;
+	prof_info_get_and_reset_recent(tsd, ptr, alloc_ctx, &prof_info);
+
+	cassert(config_prof);
+	assert(usize == isalloc(tsd_tsdn(tsd), ptr));
+
+	if (unlikely((uintptr_t)prof_info.alloc_tctx > (uintptr_t)1U)) {
+		assert(prof_sample_aligned(ptr));
+		prof_free_sampled_object(tsd, usize, &prof_info);
+	}
+}
+
+#endif /* JEMALLOC_INTERNAL_PROF_INLINES_H */
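
One consequence of prof_sample_align()/prof_sample_aligned() above is that a deallocation path can rule out sampling without any metadata lookup: only page-aligned pointers can be sampled, which prof_sampled() asserts. A hedged sketch of such a check follows; the helper name is invented.

/* Sketch: a fast-path predicate for "might this pointer be a sampled
 * allocation?".  Pointers that are not page-aligned can never be sampled, so
 * callers can skip the profiling slow path for them entirely. */
JEMALLOC_ALWAYS_INLINE bool
free_might_be_sampled(const void *ptr) {
	return config_prof && opt_prof && prof_sample_aligned(ptr);
}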

+ 22 - 0
BeefRT/JEMalloc/include/jemalloc/internal/prof_log.h

@@ -0,0 +1,22 @@
+#ifndef JEMALLOC_INTERNAL_PROF_LOG_H
+#define JEMALLOC_INTERNAL_PROF_LOG_H
+
+#include "jemalloc/internal/mutex.h"
+
+extern malloc_mutex_t log_mtx;
+
+void prof_try_log(tsd_t *tsd, size_t usize, prof_info_t *prof_info);
+bool prof_log_init(tsd_t *tsdn);
+
+/* Used in unit tests. */
+size_t prof_log_bt_count(void);
+size_t prof_log_alloc_count(void);
+size_t prof_log_thr_count(void);
+bool prof_log_is_logging(void);
+bool prof_log_rep_check(void);
+void prof_log_dummy_set(bool new_value);
+
+bool prof_log_start(tsdn_t *tsdn, const char *filename);
+bool prof_log_stop(tsdn_t *tsdn);
+
+#endif /* JEMALLOC_INTERNAL_PROF_LOG_H */

Some files were not shown because the diff is too large