Browse Source

Low-level optimistic implementations of SysRelocateThreadvar that directly read TEB.

Rika Ichinose 1 year ago
parent
commit
c68788e03e
1 changed files with 85 additions and 2 deletions
  1. 85 2
      rtl/win/systhrd.inc

+ 85 - 2
rtl/win/systhrd.inc

@@ -150,7 +150,90 @@ function WinTryEnterCriticalSection(var cs : TRTLCriticalSection):longint;
         TLSKey^:=$ffffffff;
       end;
 
-    function SysRelocateThreadvar(offset : dword) : pointer;
+
+{ Directly access the thread environment block (TEB). If the slot holds a value, use it. If it does not, jump to TrulyRelocateThreadvar, which can allocate it.
+  TrulyRelocateThreadvar is several (5+) times slower by itself; shortcutting SetLastError on errorsave = 0 helps a bit (reduces it to maybe 3.5× :D).
+
+  General info (in particular, stories about de facto stability guarantees):
+  https://en.wikipedia.org/wiki/Win32_Thread_Information_Block
+
+  TEB layout:
+  https://github.com/wine-mirror/wine/blob/badaed641928edb8f2426d9f12d16c88b479e1e8/include/winternl.h#L431
+
+  “Why load fs:[0x18] into a register and then dereference that, instead of just going for fs:[n] directly?”
+  https://devblogs.microsoft.com/oldnewthing/20220919-00/?p=107195
+  TL;DR: even in the Windows sources, TlsGetValue is written in a relatively high-level manner and is not overly optimized. }
+
+{$ifndef wince} { WinCE is excluded because it is unknown whether its TEB is compatible: https://stackoverflow.com/questions/1099311/windows-ce-internals-teb-thread-environment-block }
+{$if defined(cpui386)}
+    function TrulyRelocateThreadvar(offset : dword) : pointer; forward;
+
+    function SysRelocateThreadvar(offset : dword) : pointer; assembler; nostackframe;
+      { Fast path: reads the TLSKey^-th TLS slot straight from the TEB (through fs) and returns slot value + offset; jumps to TrulyRelocateThreadvar when the slot is empty or cannot be reached. On entry eax = offset; the result is returned in eax. }
+      const
+        TlsSlots = $E10; { void* TlsSlots[64] @ fs:[E10h]: the 64 static slots stored inline in the TEB. }
+        TlsExpansionSlots = $F94; { void** TlsExpansionSlots @ fs:[F94h]: pointer to the dynamic slot array; tested for nil below. }
+      asm
+        mov  TLSKey, %edx { edx = TLSKey (pointer to the TLS index). }
+        mov  (%edx), %edx { edx = TLSKey^. }
+
+        cmp  $0x40, %edx { There are 64 static slots + 1024 dynamic slots. }
+        jae  .LExp { Index >= 64 (unsigned): not a static slot, try the expansion slots. }
+        mov  %fs:TlsSlots(,%edx,4), %edx { Read TLSKey^-th slot (4 bytes per slot). }
+        test %edx, %edx
+        jz   .LOops { Slot is empty: take the slow path. }
+        add  %edx, %eax { result := TlsGetValue(TLSKey^) + offset. }
+        ret { Explicit return so the fast path does not fall through into .LOops. }
+
+.LOops: jmp  TrulyRelocateThreadvar { Shared slow-path exit; eax still holds offset. Original note: save on relative jumps :) }
+
+.LExp:  cmp  $0x440, %edx
+        jae  .LOops { Will fail as 0x440 = 1088 = 64 static + 1024 dynamic is the limit on TLS indices. }
+        mov  %fs:TlsExpansionSlots, %ecx { ecx = TlsExpansionSlots. }
+        test %ecx, %ecx
+        jz   .LOops { No TlsExpansionSlots allocated. }
+        mov  -0x100(%ecx,%edx,4), %edx { Read (TLSKey^ − 64)-th dynamic slot; −0x100 = −64 * 4 rebases the index. }
+        test %edx, %edx
+        jz   .LOops { Dynamic slot is empty: take the slow path. }
+        add  %edx, %eax { result := TlsGetValue(TLSKey^) + offset. }
+      end; { NOTE(review): no explicit ret on this path — presumably the compiler emits the return at the end of the asm block; confirm. }
+{$elseif defined(cpux86_64)}
+    function TrulyRelocateThreadvar(offset : dword) : pointer; forward;
+
+    function SysRelocateThreadvar(offset : dword) : pointer; assembler; nostackframe;
+      { Fast path: reads the TLSKey^-th TLS slot straight from the TEB (through gs) and returns slot value + offset; jumps to TrulyRelocateThreadvar when the slot is empty or cannot be reached. On entry ecx = offset; the result is returned in rax. }
+      const { Same as the 32-bit version but 64-bit: TEB pointer is in GS register, different offsets, 8-byte slots. }
+        TlsSlots = $1480; { 64 static slots stored inline in the TEB. }
+        TlsExpansionSlots = $1780; { Pointer to the dynamic slot array; tested for nil below. }
+      asm
+        mov  %ecx, %ecx { Zero-extend offset into rcx: the Win64 ABI leaves the upper 32 bits of a 32-bit register argument undefined. }
+        mov  TLSKey(%rip), %rdx { rdx = TLSKey (pointer to the TLS index). }
+        mov  (%rdx), %edx { edx = TLSKey^; the 32-bit write also clears the upper half of rdx. }
+        cmp  $0x40, %edx { There are 64 static slots + 1024 dynamic slots. }
+        jae  .LExp { Index >= 64 (unsigned): not a static slot, try the expansion slots. }
+        mov  %gs:TlsSlots(,%rdx,8), %rax { Read TLSKey^-th slot. }
+        test %rax, %rax
+        jz   .LOops { Slot is empty: take the slow path. }
+        add  %rcx, %rax { result := slot value + offset (rcx was zero-extended on entry). }
+        ret { Explicit return so the fast path does not fall through into .LOops. }
+
+.LOops: jmp  TrulyRelocateThreadvar { Shared slow-path exit; ecx still holds offset. }
+
+.LExp:  cmp  $0x440, %edx
+        jae  .LOops { 0x440 = 1088 = 64 static + 1024 dynamic is the limit on TLS indices. }
+        mov  %gs:TlsExpansionSlots, %rax { rax = TlsExpansionSlots. }
+        test %rax, %rax
+        jz   .LOops { No TlsExpansionSlots allocated. }
+        mov  -0x200(%rax,%rdx,8), %rax { Read (TLSKey^ − 64)-th dynamic slot; −0x200 = −64 * 8 rebases the index. }
+        test %rax, %rax
+        jz   .LOops { Dynamic slot is empty: take the slow path. }
+        add  %rcx, %rax { result := slot value + offset. }
+      end;
+{$endif implement SysRelocateThreadvar with assembly}
+{$endif not wince}
+
+
+    function {$if declared(SysRelocateThreadvar)} TrulyRelocateThreadvar {$else} SysRelocateThreadvar {$endif} (offset : dword) : pointer;
       var
         dataindex : pointer;
         errorsave : dword;
@@ -164,7 +247,7 @@ function WinTryEnterCriticalSection(var cs : TRTLCriticalSection):longint;
             InitThread($1000000);
           end;
         SetLastError(errorsave);
-        SysRelocateThreadvar:=DataIndex+Offset;
+        Result:=DataIndex+Offset;
       end;