Compare commits

...

4 Commits

Author SHA1 Message Date
J. Gareth "Curious Kit" Moreton
f90e42a791 * Added tests for #41210 2025-04-03 18:00:13 +00:00
J. Gareth "Curious Kit" Moreton
233f7e5a05 * x86: Fixed bug and refactored optimisations where SHR instructions were
merged incorrectly when a condition appears between them
2025-04-03 18:00:13 +00:00
Michaël Van Canneyt
fc43e66f05 * Wake main thread when a thread is auto freed 2025-04-03 17:12:20 +02:00
Michaël Van Canneyt
1a21ea41b8 * Correctly set current thread 2025-04-03 16:26:31 +02:00
5 changed files with 245 additions and 202 deletions

View File

@ -164,6 +164,7 @@ unit aoptx86;
function DoArithCombineOpt(var p : tai) : Boolean;
function DoMovCmpMemOpt(var p : tai; const hp1: tai) : Boolean;
function DoSETccLblRETOpt(var p: tai; const hp_label: tai_label) : Boolean;
function HandleSHRMerge(var p: tai; const PostPeephole: Boolean): Boolean;
function PrePeepholeOptSxx(var p : tai) : boolean;
function PrePeepholeOptIMUL(var p : tai) : boolean;
@ -7732,19 +7733,18 @@ unit aoptx86;
end;
end;
function TX86AsmOptimizer.OptPass1SHR(var p : tai) : boolean;
function TX86AsmOptimizer.HandleSHRMerge(var p: tai; const PostPeephole: Boolean): Boolean;
var
hp1, hp2: tai;
Shift: TCGInt;
IdentityMask, Shift: TCGInt;
LimitSize: Topsize;
DoNotMerge: Boolean;
begin
if not MatchInstruction(p, A_SHR, []) then
InternalError(2025040301);
Result := False;
{ All these optimisations work on "shr const,%reg" }
if not MatchOpType(taicpu(p), top_const, top_reg) then
Exit;
DoNotMerge := False;
Shift := taicpu(p).oper[0]^.val;
LimitSize := taicpu(p).opsize;
@ -7755,7 +7755,77 @@ unit aoptx86;
Exit;
case taicpu(hp1).opcode of
A_TEST, A_CMP, A_Jcc:
A_AND:
{ Detect:
shr x, %reg
and y, %reg
If and y, %reg doesn't actually change the value of %reg (e.g. with
"shrl $24,%reg; andl $255,%reg", remove the AND instruction.
(Post-peephole only)
}
if PostPeephole and
(taicpu(hp1).opsize = taicpu(p).opsize) and
MatchOpType(taicpu(hp1), top_const, top_reg) and
(taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
begin
{ Make sure the FLAGS register isn't in use }
TransferUsedRegs(TmpUsedRegs);
hp2 := p;
repeat
UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
if not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
begin
{ Generate the identity mask }
case taicpu(p).opsize of
S_B:
IdentityMask := $FF shr Shift;
S_W:
IdentityMask := $FFFF shr Shift;
S_L:
IdentityMask := $FFFFFFFF shr Shift;
{$ifdef x86_64}
S_Q:
{ We need to force the operands to be unsigned 64-bit
integers otherwise the wrong value is generated }
IdentityMask := TCGInt(QWord($FFFFFFFFFFFFFFFF) shr QWord(Shift));
{$endif x86_64}
else
InternalError(2022081501);
end;
if (taicpu(hp1).oper[0]^.val and IdentityMask) = IdentityMask then
begin
DebugMsg(SPeepholeOptimization + 'Removed AND instruction since previous SHR makes this an identity operation (ShrAnd2Shr)', hp1);
{ All the possible 1 bits are covered, so we can remove the AND }
hp2 := tai(hp1.Previous);
RemoveInstruction(hp1);
{ p wasn't actually changed, so don't set Result to True,
but a change was nonetheless made elsewhere }
Include(OptsToCheck, aoc_ForceNewIteration);
{ Do another pass in case other AND or MOVZX instructions
follow }
hp1 := hp2;
Continue;
end;
end;
end;
A_TEST, A_CMP:
{ Skip over relevant comparisons, but shift instructions must
now not be merged since the original value is being read }
begin
DoNotMerge := True;
Continue;
end;
A_Jcc:
{ Skip over conditional jumps and relevant comparisons }
Continue;
@ -7769,7 +7839,21 @@ unit aoptx86;
if IsShrMovZFoldable(taicpu(p).opsize, taicpu(hp1).opsize, Shift) then
begin
if not SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then { Different register target }
if SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
begin
{ If the MOVZX instruction reads and writes the same register,
defer this to the post-peephole optimisation stage }
if PostPeephole then
begin
DebugMsg(SPeepholeOptimization + 'Removed MOVZX instruction since previous SHR makes it unnecessary (ShrMovz2Shr)', hp1);
{ All the possible 1 bits are covered, so we can remove the MOVZX }
hp2 := tai(hp1.Previous);
RemoveInstruction(hp1);
hp1 := hp2;
end;
end
else { Different register target }
begin
DebugMsg(SPeepholeOptimization + 'Converted MOVZX instruction to MOV since previous SHR makes zero-extension unnecessary (ShrMovz2ShrMov 1)', hp1);
taicpu(hp1).opcode := A_MOV;
@ -7791,10 +7875,49 @@ unit aoptx86;
Continue;
end;
{ NOTE: If the MOVZX instruction reads and writes the same
register, defer this to the post-peephole optimisation stage }
Exit;
end
else if PostPeephole and
(Shift > 0) and
(taicpu(p).opsize = S_W) and
(taicpu(hp1).opsize = S_WL) and
(taicpu(hp1).oper[0]^.reg = NR_AX) and
(taicpu(hp1).oper[1]^.reg = NR_EAX) then
begin
{ Detect:
shr x, %ax (x > 0)
...
movzwl %ax,%eax
-
Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
But first, check to see if movzwl %ax,%eax can be removed...
}
hp2 := tai(hp1.Previous);
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegsBetween(UsedRegs, p, hp1);
if PostPeepholeOptMovZX(hp1) then
hp1 := hp2
else
begin
DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1);
taicpu(hp1).opcode := A_CWDE;
taicpu(hp1).clearop(0);
taicpu(hp1).clearop(1);
taicpu(hp1).ops := 0;
end;
RestoreUsedRegs(TmpUsedRegs);
{ Don't need to set aoc_ForceNewIteration if
PostPeepholeOptMovZX returned True because it's the
post-peephole stage }
end;
{ Move onto the next instruction }
Continue;
end;
A_SHL, A_SAL, A_SHR:
if (taicpu(hp1).opsize <= LimitSize) and
@ -7842,11 +7965,36 @@ unit aoptx86;
;
end;
{ If the register isn't actually modified, move onto the next instruction,
but set DoNotMerge to True since the register is being read }
if (
{ Under -O2 and below, GetNextInstructionUsingReg only returns
the next instruction, whether or not it contains the register }
(cs_opt_level3 in current_settings.optimizerswitches) or
RegReadByInstruction(taicpu(p).oper[1]^.reg, hp1)
) and not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) then
begin
DoNotMerge := True;
Continue;
end;
Break;
until False;
end;
function TX86AsmOptimizer.OptPass1SHR(var p : tai) : boolean;
begin
Result := False;
{ All these optimisations work on "shr const,%reg" }
if not MatchOpType(taicpu(p), top_const, top_reg) then
Exit;
Result := HandleSHRMerge(p, False);
end;
function TX86AsmOptimizer.CheckMemoryWrite(var first_mov, second_mov: taicpu): Boolean;
var
CurrentRef: TReference;
@ -16576,10 +16724,7 @@ unit aoptx86;
function TX86AsmOptimizer.PostPeepholeOptShr(var p : tai) : boolean;
var
hp1, hp2: tai;
IdentityMask, Shift: TCGInt;
LimitSize: Topsize;
DoNotMerge: Boolean;
hp1: tai;
begin
Result := False;
@ -16587,193 +16732,12 @@ unit aoptx86;
if not MatchOpType(taicpu(p), top_const, top_reg) then
Exit;
DoNotMerge := False;
Shift := taicpu(p).oper[0]^.val;
LimitSize := taicpu(p).opsize;
hp1 := p;
repeat
if not GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[1]^.reg) or (hp1.typ <> ait_instruction) then
Break;
{ Detect:
shr x, %reg
and y, %reg
If and y, %reg doesn't actually change the value of %reg (e.g. with
"shrl $24,%reg; andl $255,%reg", remove the AND instruction.
}
case taicpu(hp1).opcode of
A_AND:
if (taicpu(hp1).opsize = taicpu(p).opsize) and
MatchOpType(taicpu(hp1), top_const, top_reg) and
(taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
begin
{ Make sure the FLAGS register isn't in use }
TransferUsedRegs(TmpUsedRegs);
hp2 := p;
repeat
UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
if not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
begin
{ Generate the identity mask }
case taicpu(p).opsize of
S_B:
IdentityMask := $FF shr Shift;
S_W:
IdentityMask := $FFFF shr Shift;
S_L:
IdentityMask := $FFFFFFFF shr Shift;
{$ifdef x86_64}
S_Q:
{ We need to force the operands to be unsigned 64-bit
integers otherwise the wrong value is generated }
IdentityMask := TCGInt(QWord($FFFFFFFFFFFFFFFF) shr QWord(Shift));
{$endif x86_64}
else
InternalError(2022081501);
end;
if (taicpu(hp1).oper[0]^.val and IdentityMask) = IdentityMask then
begin
DebugMsg(SPeepholeOptimization + 'Removed AND instruction since previous SHR makes this an identity operation (ShrAnd2Shr)', hp1);
{ All the possible 1 bits are covered, so we can remove the AND }
hp2 := tai(hp1.Previous);
RemoveInstruction(hp1);
{ p wasn't actually changed, so don't set Result to True,
but a change was nonetheless made elsewhere }
Include(OptsToCheck, aoc_ForceNewIteration);
{ Do another pass in case other AND or MOVZX instructions
follow }
hp1 := hp2;
Continue;
end;
end;
end;
A_TEST, A_CMP, A_Jcc:
{ Skip over conditional jumps and relevant comparisons }
Continue;
A_MOVZX:
if MatchOpType(taicpu(hp1), top_reg, top_reg) and
SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg) then
begin
{ Since the original register is being read as is, subsequent
SHRs must not be merged at this point }
DoNotMerge := True;
if IsShrMovZFoldable(taicpu(p).opsize, taicpu(hp1).opsize, Shift) then
begin
if SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
begin
DebugMsg(SPeepholeOptimization + 'Removed MOVZX instruction since previous SHR makes it unnecessary (ShrMovz2Shr)', hp1);
{ All the possible 1 bits are covered, so we can remove the AND }
hp2 := tai(hp1.Previous);
RemoveInstruction(hp1);
hp1 := hp2;
end
else { Different register target }
begin
DebugMsg(SPeepholeOptimization + 'Converted MOVZX instruction to MOV since previous SHR makes zero-extension unnecessary (ShrMovz2ShrMov 2)', hp1);
taicpu(hp1).opcode := A_MOV;
setsubreg(taicpu(hp1).oper[0]^.reg, getsubreg(taicpu(hp1).oper[1]^.reg));
case taicpu(hp1).opsize of
S_BW:
taicpu(hp1).opsize := S_W;
S_BL, S_WL:
taicpu(hp1).opsize := S_L;
else
InternalError(2022081503);
end;
end;
end
else if (Shift > 0) and
(taicpu(p).opsize = S_W) and
(taicpu(hp1).opsize = S_WL) and
(taicpu(hp1).oper[0]^.reg = NR_AX) and
(taicpu(hp1).oper[1]^.reg = NR_EAX) then
begin
{ Detect:
shr x, %ax (x > 0)
...
movzwl %ax,%eax
Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
}
DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1);
taicpu(hp1).opcode := A_CWDE;
taicpu(hp1).clearop(0);
taicpu(hp1).clearop(1);
taicpu(hp1).ops := 0;
end;
{ Move onto the next instruction }
Continue;
end;
A_SHL, A_SAL, A_SHR:
if (taicpu(hp1).opsize <= LimitSize) and
MatchOpType(taicpu(hp1), top_const, top_reg) and
SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
begin
{ Make sure the sizes don't exceed the register size limit
(measured by the shift value falling below the limit) }
if taicpu(hp1).opsize < LimitSize then
LimitSize := taicpu(hp1).opsize;
if taicpu(hp1).opcode = A_SHR then
Inc(Shift, taicpu(hp1).oper[0]^.val)
else
begin
Dec(Shift, taicpu(hp1).oper[0]^.val);
DoNotMerge := True;
end;
if Shift < topsize2memsize[taicpu(p).opsize] - topsize2memsize[LimitSize] then
Break;
{ Since we've established that the combined shift is within
limits, we can actually combine the adjacent SHR
instructions even if they're different sizes }
if not DoNotMerge and (taicpu(hp1).opcode = A_SHR) then
begin
hp2 := tai(hp1.Previous);
DebugMsg(SPeepholeOptimization + 'ShrShr2Shr 2', p);
Inc(taicpu(p).oper[0]^.val, taicpu(hp1).oper[0]^.val);
RemoveInstruction(hp1);
hp1 := hp2;
end;
{ Move onto the next instruction }
Continue;
end;
else
{ If the register isn't actually modified, move onto the next instruction,
but set DoNotMerge to True since the register is being read }
if (
{ Under -O2 and below, GetNextInstructionUsingReg only returns
the next instruction, whether or not it contains the register }
(cs_opt_level3 in current_settings.optimizerswitches) or
RegReadByInstruction(taicpu(p).oper[1]^.reg, hp1)
) and not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) then
begin
DoNotMerge := True;
Continue;
end;
if HandleSHRMerge(p, True) then
begin
Result := True;
Exit;
end;
Break;
until False;
{ Detect the following (looking backwards):
shr %cl,%reg
shr x, %reg

View File

@ -399,12 +399,14 @@ procedure WasiAllocateThreadVars; forward;
{$push}{$S-} // no stack checking for this procedure
procedure FPCWasmThreadStartPascal(tid: longint; start_arg: PWasmThread);
begin
{$IFDEF DEBUGWASMTHREADS}DebugWriteln('FPCWasmThreadStartPascal(...)');{$ENDIF}
{$IFDEF DEBUGWASMTHREADS}DebugWriteln('FPCWasmThreadStartPascal('+IntToStr(tid)+','+IntToStr(ptrint(start_arg))+')');{$ENDIF}
start_arg^.ID:=tid;
GlobalCurrentThread:=@start_arg;
GlobalCurrentThread:=start_arg;
GlobalIsMainThread:=0;
GlobalIsWorkerThread:=1;
{$IFDEF DEBUGWASMTHREADS}DebugWriteln('Check : TID='+IntToStr(tid)+', start_arg_id='+IntToStr(start_arg^.ID)+', currentthread= '+IntTostr(ptrint(GetCurrentThreadID))+')');{$ENDIF}
{$IFDEF FPC_WASM_WORKER_THREADS_CAN_WAIT}
GlobalIsThreadBlockable:=1;
{$ELSE FPC_WASM_WORKER_THREADS_CAN_WAIT}
@ -610,6 +612,7 @@ begin
Result:=0;
end;
function WasiGetCurrentThreadId : TThreadID;
begin
Result:=GetSelfThread;

View File

@ -184,6 +184,9 @@ begin
Resume;
end;
{$IFDEF DEBUGWASMTHREADS}DebugWriteln('TThread.SysDestroy: waiting on thread');{$ENDIF}
// Before calling WaitFor, signal main thread with WakeMainThread, so pending checksynchronize calls are handled.
if assigned(WakeMainThread) then
WakeMainThread(Self);
WaitFor;
end;
end;

19
tests/webtbs/tw41210.pp Normal file
View File

@ -0,0 +1,19 @@
{ %OPT=-O3 }
{$mode objfpc} {$coperators on}
program tw41210;
var
x, oops: uint32;
begin
x := random(0) + $123456;
oops := 0;
x := x shr 8;
if byte(x) <> $34 then oops += 1;
x := x shr 8;
if byte(x) <> $12 then oops += 2;
if oops <> 0 then
begin
writeln('FAILED: oops = ', oops);
Halt(1);
end;
Writeln('ok');
end.

54
tests/webtbs/tw41210a.pp Normal file
View File

@ -0,0 +1,54 @@
{ %OPT=-O2 }
program tw41210b;
{$MODE OBJFPC}
function strspn(s, accept: pointer): integer;
var
p: PCardinal;
c: AnsiChar;
d: cardinal;
begin
// returns size of initial segment of s which are in accept
result := 0;
repeat
c := PAnsiChar(s)[result];
if c = #0 then
break;
p := accept;
repeat // stop as soon as we find any character not from accept
d := p^;
inc(p);
if AnsiChar(d) = c then
break
else if AnsiChar(d) = #0 then
exit;
d := d shr 8;
if AnsiChar(d) = c then
break
else if AnsiChar(d) = #0 then
exit;
d := d shr 8;
if AnsiChar(d) = c then
break
else if AnsiChar(d) = #0 then
exit;
d := d shr 8;
if AnsiChar(d) = c then
break
else if AnsiChar(d) = #0 then
exit;
until false;
inc(result);
until false;
end;
var
Output: integer;
begin
Output := strspn(PAnsiChar('abcdef'), PAnsiChar('debca'));
if Output <> 5 then
begin
WriteLn('FAILED: Returned ', Output, ' instead of 5');
Halt(1);
end;
WriteLn('ok');
end.