Low-level optimistic implementations of SysRelocateThreadvar that directly read TEB.

This commit is contained in:
Rika Ichinose 2024-05-24 15:39:15 +03:00 committed by FPK
parent 0ca608243c
commit c68788e03e

View File

@ -150,7 +150,90 @@ function WinTryEnterCriticalSection(var cs : TRTLCriticalSection):longint;
TLSKey^:=$ffffffff;
end;
function SysRelocateThreadvar(offset : dword) : pointer;
{ Directly access thread environment block (TEB). If there is a value, use it. If there is not, jump to TrulyRelocateThreadvar that can allocate it.
TrulyRelocateThreadvar is several (5+) times slower by itself; shortcutting SetLastError on errorsave = 0 helps a bit (reduces to 3.5× maybe :D).
General info (in particular, stories about de facto stability guarantees):
https://en.wikipedia.org/wiki/Win32_Thread_Information_Block
TEB layout:
https://github.com/wine-mirror/wine/blob/badaed641928edb8f2426d9f12d16c88b479e1e8/include/winternl.h#L431
“Why load fs:[0x18] into a register and then dereference that, instead of just going for fs:[n] directly?”
https://devblogs.microsoft.com/oldnewthing/20220919-00/?p=107195
TL;DR: even in Windows sources, TlsGetValue is written in relatively high-level manner and not overly optimized. }
{$ifndef wince} { Don't know a thing, maybe WinCE TEB is compatible... :D https://stackoverflow.com/questions/1099311/windows-ce-internals-teb-thread-environment-block }
{$if defined(cpui386)}
function TrulyRelocateThreadvar(offset : dword) : pointer; forward;
{ Fast path for threadvar relocation on i386: reads the TLS slot selected by TLSKey^ straight from
  the FS-based thread block and returns slot value + offset. Whenever the slot cannot be read this
  way (index out of range, expansion array missing, or slot value is nil) control is handed over to
  TrulyRelocateThreadvar with the offset still untouched in eax. }
function SysRelocateThreadvar(offset : dword) : pointer; assembler; nostackframe;
{ eax = offset on entry; eax also carries the result on the two success paths. }
const
TlsSlots = $E10; { void* TlsSlots[64] @ fs:[E10h]. }
TlsExpansionSlots = $F94; { void** TlsExpansionSlots @ fs:[F94h] }
asm
mov TLSKey, %edx
mov (%edx), %edx { edx = TLSKey^. }
cmp $0x40, %edx { There are 64 static slots + 1024 dynamic slots. }
jae .LExp { Unsigned compare: index >= 64 is not a static slot. }
mov %fs:TlsSlots(,%edx,4), %edx { Read TLSKey^-th slot. }
test %edx, %edx
jz .LOops { Slot is nil: nothing to add the offset to, take the slow path. }
add %edx, %eax { result := TlsGetValue(TLSKey^) + offset. }
ret
{ Shared slow-path exit. eax has not been modified on any path that reaches this label. }
.LOops: jmp TrulyRelocateThreadvar { Save on relative jumps :) }
.LExp: cmp $0x440, %edx
jae .LOops { Will fail as 0x440 = 1088 = 64 static + 1024 dynamic is the limit on TLS indices. }
mov %fs:TlsExpansionSlots, %ecx { ecx = TlsExpansionSlots. }
test %ecx, %ecx
jz .LOops { No TlsExpansionSlots allocated. }
mov -0x100(%ecx,%edx,4), %edx { Read (TLSKey^ - 64)-th dynamic slot; -0x100 = -64 * 4 rebases the index. }
test %edx, %edx
jz .LOops { Dynamic slot is nil: take the slow path. }
add %edx, %eax { result := TlsGetValue(TLSKey^) + offset. }
end;
{$elseif defined(cpux86_64)}
function TrulyRelocateThreadvar(offset : dword) : pointer; forward;
{ Fast path for threadvar relocation on x86-64: reads the TLS slot selected by TLSKey^ straight from
  the GS-based thread block and returns slot value + offset. Whenever the slot cannot be read this
  way (index out of range, expansion array missing, or slot value is nil) control is handed over to
  TrulyRelocateThreadvar, which receives the same offset in ecx. }
function SysRelocateThreadvar(offset : dword) : pointer; assembler; nostackframe;
{ ecx = offset on entry; rax carries the result on the two success paths. }
const { Same as the i386 version but 64-bit: TEB pointer is in GS register, different offsets. }
  TlsSlots = $1480;          { void* TlsSlots[64] @ gs:[1480h]. }
  TlsExpansionSlots = $1780; { void** TlsExpansionSlots @ gs:[1780h]. }
asm
    { offset is a dword, but it is added below with 64-bit arithmetic. The calling convention does }
    { not promise that bits 63..32 of rcx are clear, so zero-extend explicitly. Low 32 bits are kept, }
    { hence the slow path still sees the original offset. }
    mov  %ecx, %ecx
    mov  TLSKey(%rip), %rdx
    mov  (%rdx), %edx                   { edx = TLSKey^ (32-bit load also clears the upper half of rdx). }
    cmp  $0x40, %edx                    { There are 64 static slots + 1024 dynamic slots. }
    jae  .LExp                          { Unsigned compare: index >= 64 is not a static slot. }
    mov  %gs:TlsSlots(,%rdx,8), %rax    { Read TLSKey^-th slot. }
    test %rax, %rax
    jz   .LOops                         { Slot is nil: take the slow path. }
    add  %rcx, %rax                     { result := TlsGetValue(TLSKey^) + offset. }
    ret
    { Shared slow-path exit. ecx still holds the offset on every path that reaches this label. }
.LOops:
    jmp  TrulyRelocateThreadvar
.LExp:
    cmp  $0x440, %edx
    jae  .LOops                         { 0x440 = 1088 = 64 static + 1024 dynamic is the limit on TLS indices. }
    mov  %gs:TlsExpansionSlots, %rax    { rax = TlsExpansionSlots. }
    test %rax, %rax
    jz   .LOops                         { No TlsExpansionSlots allocated. }
    mov  -0x200(%rax,%rdx,8), %rax      { Read (TLSKey^ - 64)-th dynamic slot; -0x200 = -64 * 8 rebases the index. }
    test %rax, %rax
    jz   .LOops                         { Dynamic slot is nil: take the slow path. }
    add  %rcx, %rax                     { result := TlsGetValue(TLSKey^) + offset. }
end;
{$endif implement SysRelocateThreadvar with assembly}
{$endif not wince}
function {$if declared(SysRelocateThreadvar)} TrulyRelocateThreadvar {$else} SysRelocateThreadvar {$endif} (offset : dword) : pointer;
var
dataindex : pointer;
errorsave : dword;
@ -164,7 +247,7 @@ function WinTryEnterCriticalSection(var cs : TRTLCriticalSection):longint;
InitThread($1000000);
end;
SetLastError(errorsave);
SysRelocateThreadvar:=DataIndex+Offset;
Result:=DataIndex+Offset;
end;