From 5726428dccfb87159f31ce505ab1bef04a5f59e6 Mon Sep 17 00:00:00 2001 From: florian Date: Fri, 21 May 2021 20:36:15 +0000 Subject: [PATCH] * patch by J. Gareth Moreton: Additional SETcc optimisations, resolves #38767 git-svn-id: trunk@49386 - --- compiler/cgutils.pas | 4 +- compiler/i386/aoptcpu.pas | 4 +- compiler/x86/aoptx86.pas | 541 ++++++++++++++++++++++++++++++------ compiler/x86_64/aoptcpu.pas | 4 +- 4 files changed, 465 insertions(+), 88 deletions(-) diff --git a/compiler/cgutils.pas b/compiler/cgutils.pas index 1a78c2082d..926541111b 100644 --- a/compiler/cgutils.pas +++ b/compiler/cgutils.pas @@ -197,7 +197,7 @@ unit cgutils; { This routine verifies if two references are the same, and if so, returns TRUE, otherwise returns false. } - function references_equal(const sref,dref : treference) : boolean; + function references_equal(const sref,dref : treference) : boolean; {$ifdef USEINLINE}inline;{$endif USEINLINE} { tlocation handling } @@ -262,7 +262,7 @@ uses end; - function references_equal(const sref,dref : treference):boolean; + function references_equal(const sref,dref : treference):boolean; {$ifdef USEINLINE}inline;{$endif USEINLINE} begin references_equal:=CompareByte(sref,dref,sizeof(treference))=0; end; diff --git a/compiler/i386/aoptcpu.pas b/compiler/i386/aoptcpu.pas index b1899bebac..d1ee73f105 100644 --- a/compiler/i386/aoptcpu.pas +++ b/compiler/i386/aoptcpu.pas @@ -221,8 +221,6 @@ unit aoptcpu; A_MOVSD, A_MOVSS: Result:=OptPass1MOVXX(p); - A_SETcc: - Result:=OptPass1SETcc(p); else ; end; @@ -260,6 +258,8 @@ unit aoptcpu; Result:=OptPass2Movx(p); A_SUB: Result:=OptPass2SUB(p); + A_SETcc: + Result:=OptPass2SETcc(p); else ; end; diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas index d363cfca50..a4aab212c9 100644 --- a/compiler/x86/aoptx86.pas +++ b/compiler/x86/aoptx86.pas @@ -134,7 +134,6 @@ unit aoptx86; function OptPass1LEA(var p : tai) : boolean; function OptPass1Sub(var p : tai) : boolean; function OptPass1SHLSAL(var p : tai) : boolean; - function OptPass1SETcc(var p : tai) : boolean; function OptPass1FSTP(var p : tai) : boolean; function OptPass1FLD(var p : tai) : boolean; function OptPass1Cmp(var p : tai) : boolean; @@ -150,6 +149,9 @@ unit aoptx86; function OptPass2Lea(var p: tai): Boolean; function OptPass2SUB(var p: tai): Boolean; function OptPass2ADD(var p : tai): Boolean; + function OptPass2SETcc(var p : tai) : boolean; + + function CheckMemoryWrite(var first_mov, second_mov: taicpu): Boolean; function PostPeepholeOptMov(var p : tai) : Boolean; function PostPeepholeOptMovzx(var p : tai) : Boolean; @@ -215,6 +217,7 @@ unit aoptx86; const SPeepholeOptimization = ''; {$endif DEBUG_AOPTCPU} + LIST_STEP_SIZE = 4; function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean; begin @@ -4153,97 +4156,120 @@ unit aoptx86; end; - function TX86AsmOptimizer.OptPass1SETcc(var p: tai): boolean; + function TX86AsmOptimizer.CheckMemoryWrite(var first_mov, second_mov: taicpu): Boolean; var - hp1,hp2,next: tai; SetC, JumpC: TAsmCond; Unconditional: Boolean; + CurrentRef: TReference; + FullReg: TRegister; + hp1, hp2: tai; begin - Result:=false; + Result := False; + if (first_mov.opsize <> S_B) or (second_mov.opsize <> S_B) then + Exit; - if MatchOpType(taicpu(p),top_reg) and GetNextInstruction(p, hp1) then + { We assume you've checked if the operand is actually a reference by + this point. If it isn't, you'll most likely get an access violation } + CurrentRef := first_mov.oper[1]^.ref^; + + { Memory must be aligned } + if (CurrentRef.offset mod 4) <> 0 then + Exit; + + Inc(CurrentRef.offset); + CurrentRef.alignment := 1; { Otherwise references_equal will return False } + + if MatchOperand(second_mov.oper[0]^, 0) and + references_equal(second_mov.oper[1]^.ref^, CurrentRef) and + GetNextInstruction(second_mov, hp1) and + (hp1.typ = ait_instruction) and + (taicpu(hp1).opcode = A_MOV) and + MatchOpType(taicpu(hp1), top_const, top_ref) and + (taicpu(hp1).oper[0]^.val = 0) then begin - if ((MatchInstruction(hp1, A_TEST, [S_B]) and - MatchOpType(taicpu(hp1),top_reg,top_reg) and - (taicpu(hp1).oper[0]^.reg = taicpu(hp1).oper[1]^.reg)) or - (MatchInstruction(hp1, A_CMP, [S_B]) and - MatchOpType(taicpu(hp1),top_const,top_reg) and - (taicpu(hp1).oper[0]^.val=0)) - ) and - (taicpu(p).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and - GetNextInstruction(hp1, hp2) and - MatchInstruction(hp2, A_Jcc, []) then - { Change from: To: + Inc(CurrentRef.offset); + CurrentRef.alignment := taicpu(hp1).oper[1]^.ref^.alignment; { Otherwise references_equal might return False } - set(C) %reg j(~C) label - test %reg,%reg/cmp $0,%reg - je label + FullReg := newreg(R_INTREGISTER,getsupreg(first_mov.oper[0]^.reg), R_SUBD); - - set(C) %reg j(C) label - test %reg,%reg/cmp $0,%reg - jne label - } + if references_equal(taicpu(hp1).oper[1]^.ref^, CurrentRef) then begin - next := tai(p.Next); + case taicpu(hp1).opsize of + S_B: + if GetNextInstruction(hp1, hp2) and + MatchInstruction(taicpu(hp2), A_MOV, [S_B]) and + MatchOpType(taicpu(hp2), top_const, top_ref) and + (taicpu(hp2).oper[0]^.val = 0) then + begin + Inc(CurrentRef.offset); + CurrentRef.alignment := 1; { Otherwise references_equal will return False } - TransferUsedRegs(TmpUsedRegs); - UpdateUsedRegs(TmpUsedRegs, next); - UpdateUsedRegs(TmpUsedRegs, tai(hp1.next)); + if references_equal(taicpu(hp2).oper[1]^.ref^, CurrentRef) and + (taicpu(hp2).opsize = S_B) then + begin + RemoveInstruction(hp1); + RemoveInstruction(hp2); - JumpC := taicpu(hp2).condition; - Unconditional := False; + first_mov.opsize := S_L; - if conditions_equal(JumpC, C_E) then - SetC := inverse_cond(taicpu(p).condition) - else if conditions_equal(JumpC, C_NE) then - SetC := taicpu(p).condition - else - { We've got something weird here (and inefficent) } - begin - DebugMsg('DEBUG: Inefficient jump - check code generation', p); - SetC := C_NONE; + if first_mov.oper[0]^.typ = top_reg then + begin + DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVb/MOVb -> MOVZX/MOVl', first_mov); - { JAE/JNB will always branch (use 'condition_in', since C_AE <> C_NB normally) } - if condition_in(C_AE, JumpC) then - Unconditional := True - else - { Not sure what to do with this jump - drop out } + { Reuse second_mov as a MOVZX instruction } + second_mov.opcode := A_MOVZX; + second_mov.opsize := S_BL; + second_mov.loadreg(0, first_mov.oper[0]^.reg); + second_mov.loadreg(1, FullReg); + + first_mov.oper[0]^.reg := FullReg; + + asml.Remove(second_mov); + asml.InsertBefore(second_mov, first_mov); + end + else + { It's a value } + begin + DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVb/MOVb -> MOVl', first_mov); + RemoveInstruction(second_mov); + end; + + Result := True; + Exit; + end; + end; + S_W: + begin + RemoveInstruction(hp1); + + first_mov.opsize := S_L; + + if first_mov.oper[0]^.typ = top_reg then + begin + DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVw -> MOVZX/MOVl', first_mov); + + { Reuse second_mov as a MOVZX instruction } + second_mov.opcode := A_MOVZX; + second_mov.opsize := S_BL; + second_mov.loadreg(0, first_mov.oper[0]^.reg); + second_mov.loadreg(1, FullReg); + + first_mov.oper[0]^.reg := FullReg; + + asml.Remove(second_mov); + asml.InsertBefore(second_mov, first_mov); + end + else + { It's a value } + begin + DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVw -> MOVl', first_mov); + RemoveInstruction(second_mov); + end; + + Result := True; Exit; - end; - - RemoveInstruction(hp1); - - if Unconditional then - MakeUnconditional(taicpu(hp2)) - else - begin - if SetC = C_NONE then - InternalError(2018061402); - - taicpu(hp2).SetCondition(SetC); - end; - - if not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp2, TmpUsedRegs) then - begin - RemoveCurrentp(p, hp2); - Result := True; - end; - - DebugMsg(SPeepholeOptimization + 'SETcc/TESTCmp/Jcc -> Jcc',p); - end - else if MatchInstruction(hp1, A_MOV, [S_B]) and - MatchOpType(taicpu(hp1),top_reg,top_reg) and - MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) then - begin - TransferUsedRegs(TmpUsedRegs); - UpdateUsedRegs(TmpUsedRegs, tai(p.Next)); - if not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp1, TmpUsedRegs) then - begin - AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,UsedRegs); - taicpu(p).oper[0]^.reg:=taicpu(hp1).oper[1]^.reg; - RemoveInstruction(hp1); - DebugMsg(SPeepholeOptimization + 'SETcc/Mov -> SETcc',p); - Result := true; - end; + end; + else + ; + end; end; end; end; @@ -5265,8 +5291,6 @@ unit aoptx86; function TX86AsmOptimizer.OptPass2Movx(var p : tai) : boolean; - const - LIST_STEP_SIZE = 4; var ThisReg: TRegister; MinSize, MaxSize, TrySmaller, TargetSize: TOpSize; @@ -5973,6 +5997,359 @@ unit aoptx86; end; + function TX86AsmOptimizer.OptPass2SETcc(var p: tai): boolean; + var + hp1,hp2,next: tai; SetC, JumpC: TAsmCond; + Unconditional, PotentialModified: Boolean; + OperPtr: POper; + + NewRef: TReference; + + InstrList: array of taicpu; + InstrMax, Index: Integer; + const +{$ifdef DEBUG_AOPTCPU} + SNoFlags: shortstring = ' so the flags aren''t modified'; +{$else DEBUG_AOPTCPU} + SNoFlags = ''; +{$endif DEBUG_AOPTCPU} + begin + Result:=false; + + if MatchOpType(taicpu(p),top_reg) and GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then + begin + if MatchInstruction(hp1, A_TEST, [S_B]) and + MatchOpType(taicpu(hp1),top_reg,top_reg) and + (taicpu(hp1).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and + (taicpu(p).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and + GetNextInstruction(hp1, hp2) and + MatchInstruction(hp2, A_Jcc, []) then + { Change from: To: + + set(C) %reg j(~C) label + test %reg,%reg/cmp $0,%reg + je label + + + set(C) %reg j(C) label + test %reg,%reg/cmp $0,%reg + jne label + } + begin + { Before we do anything else, we need to check the instructions + in between SETcc and TEST to make sure they don't modify the + FLAGS register - if -O2 or under, there won't be any + instructions between SET and TEST } + + TransferUsedRegs(TmpUsedRegs); + UpdateUsedRegs(TmpUsedRegs, tai(p.next)); + + if (cs_opt_level3 in current_settings.optimizerswitches) then + begin + next := p; + SetLength(InstrList, 0); + InstrMax := -1; + PotentialModified := False; + + { Make a note of every instruction that modifies the FLAGS + register } + while GetNextInstruction(next, next) and (next <> hp1) do + begin + if next.typ <> ait_instruction then + { GetNextInstructionUsingReg should have returned False } + InternalError(2021051701); + + if RegModifiedByInstruction(NR_DEFAULTFLAGS, next) then + begin + case taicpu(next).opcode of + A_SETcc, + A_CMOVcc, + A_Jcc: + begin + if PotentialModified then + { Not safe because the flags were modified earlier } + Exit + else + { Condition is the same as the initial SETcc, so this is safe + (don't add to instruction list though) } + Continue; + end; + A_ADD: + begin + if (taicpu(next).opsize = S_B) or + { LEA doesn't support 8-bit operands } + (taicpu(next).oper[1]^.typ <> top_reg) or + { Must write to a register } + (taicpu(next).oper[0]^.typ = top_ref) then + { Require a constant or a register } + Exit; + + PotentialModified := True; + end; + A_SUB: + begin + if (taicpu(next).opsize = S_B) or + { LEA doesn't support 8-bit operands } + (taicpu(next).oper[1]^.typ <> top_reg) or + { Must write to a register } + (taicpu(next).oper[0]^.typ <> top_const) or + (taicpu(next).oper[0]^.val = $80000000) then + { Can't subtract a register with LEA - also + check that the value isn't -2^31, as this + can't be negated } + Exit; + + PotentialModified := True; + end; + A_SAL, + A_SHL: + begin + if (taicpu(next).opsize = S_B) or + { LEA doesn't support 8-bit operands } + (taicpu(next).oper[1]^.typ <> top_reg) or + { Must write to a register } + (taicpu(next).oper[0]^.typ <> top_const) or + (taicpu(next).oper[0]^.val < 0) or + (taicpu(next).oper[0]^.val > 3) then + Exit; + + PotentialModified := True; + end; + A_IMUL: + begin + if (taicpu(next).ops <> 3) or + (taicpu(next).oper[1]^.typ <> top_reg) or + { Must write to a register } + (taicpu(next).oper[2]^.val in [2,3,4,5,8,9]) then + { We can convert "imul x,%reg1,%reg2" (where x = 2, 4 or 8) + to "lea (%reg1,x),%reg2". If x = 3, 5 or 9, we can + change this to "lea (%reg1,%reg1,(x-1)),%reg2" } + Exit + else + PotentialModified := True; + end; + else + { Don't know how to change this, so abort } + Exit; + end; + + { Contains highest index (so instruction count - 1) } + Inc(InstrMax); + if InstrMax > High(InstrList) then + SetLength(InstrList, InstrMax + LIST_STEP_SIZE); + + InstrList[InstrMax] := taicpu(next); + end; + UpdateUsedRegs(TmpUsedRegs, tai(next.next)); + end; + + if not Assigned(next) or (next <> hp1) then + { It should be equal to hp1 } + InternalError(2021051702); + + { Cycle through each instruction and check to see if we can + change them to versions that don't modify the flags } + if (InstrMax >= 0) then + begin + for Index := 0 to InstrMax do + case InstrList[Index].opcode of + A_ADD: + begin + DebugMsg(SPeepholeOptimization + 'ADD -> LEA' + SNoFlags, InstrList[Index]); + InstrList[Index].opcode := A_LEA; + reference_reset(NewRef, 1, []); + NewRef.base := InstrList[Index].oper[1]^.reg; + + if InstrList[Index].oper[0]^.typ = top_reg then + begin + NewRef.index := InstrList[Index].oper[0]^.reg; + NewRef.scalefactor := 1; + end + else + NewRef.offset := InstrList[Index].oper[0]^.val; + + InstrList[Index].loadref(0, NewRef); + end; + A_SUB: + begin + DebugMsg(SPeepholeOptimization + 'SUB -> LEA' + SNoFlags, InstrList[Index]); + InstrList[Index].opcode := A_LEA; + reference_reset(NewRef, 1, []); + NewRef.base := InstrList[Index].oper[1]^.reg; + NewRef.offset := -InstrList[Index].oper[0]^.val; + + InstrList[Index].loadref(0, NewRef); + end; + A_SHL, + A_SAL: + begin + DebugMsg(SPeepholeOptimization + 'SHL -> LEA' + SNoFlags, InstrList[Index]); + InstrList[Index].opcode := A_LEA; + reference_reset(NewRef, 1, []); + NewRef.index := InstrList[Index].oper[1]^.reg; + NewRef.scalefactor := 1 shl (InstrList[Index].oper[0]^.val); + + InstrList[Index].loadref(0, NewRef); + end; + A_IMUL: + begin + DebugMsg(SPeepholeOptimization + 'IMUL -> LEA' + SNoFlags, InstrList[Index]); + InstrList[Index].opcode := A_LEA; + reference_reset(NewRef, 1, []); + NewRef.index := InstrList[Index].oper[1]^.reg; + case InstrList[Index].oper[0]^.val of + 2, 4, 8: + NewRef.scalefactor := InstrList[Index].oper[0]^.val; + else {3, 5 and 9} + begin + NewRef.scalefactor := InstrList[Index].oper[0]^.val - 1; + NewRef.base := InstrList[Index].oper[1]^.reg; + end; + end; + + InstrList[Index].loadref(0, NewRef); + end; + else + InternalError(2021051710); + end; + + end; + + { Mark the FLAGS register as used across this whole block } + AllocRegBetween(NR_DEFAULTFLAGS, p, hp1, UsedRegs); + end; + + UpdateUsedRegs(TmpUsedRegs, tai(hp1.next)); + + JumpC := taicpu(hp2).condition; + Unconditional := False; + + if conditions_equal(JumpC, C_E) then + SetC := inverse_cond(taicpu(p).condition) + else if conditions_equal(JumpC, C_NE) then + SetC := taicpu(p).condition + else + { We've got something weird here (and inefficent) } + begin + DebugMsg('DEBUG: Inefficient jump - check code generation', p); + SetC := C_NONE; + + { JAE/JNB will always branch (use 'condition_in', since C_AE <> C_NB normally) } + if condition_in(C_AE, JumpC) then + Unconditional := True + else + { Not sure what to do with this jump - drop out } + Exit; + end; + + RemoveInstruction(hp1); + + if Unconditional then + MakeUnconditional(taicpu(hp2)) + else + begin + if SetC = C_NONE then + InternalError(2018061402); + + taicpu(hp2).SetCondition(SetC); + end; + + if not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp2, TmpUsedRegs) then + begin + RemoveCurrentp(p, hp2); + DebugMsg(SPeepholeOptimization + 'SETcc/TEST/Jcc -> Jcc',p); + end + else + DebugMsg(SPeepholeOptimization + 'SETcc/TEST/Jcc -> SETcc/Jcc',p); + + Result := True; + end + else if + { Make sure the instructions are adjacent } + ( + not (cs_opt_level3 in current_settings.optimizerswitches) or + GetNextInstruction(p, hp1) + ) and + MatchInstruction(hp1, A_MOV, [S_B]) and + { Writing to memory is allowed } + MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[0]^.reg) then + begin + { + Watch out for sequences such as: + + set(c)b %regb + movb %regb,(ref) + movb $0,1(ref) + movb $0,2(ref) + movb $0,3(ref) + + Much more efficient to turn it into: + movl $0,%regl + set(c)b %regb + movl %regl,(ref) + + Or: + set(c)b %regb + movzbl %regb,%regl + movl %regl,(ref) + } + if (taicpu(hp1).oper[1]^.typ = top_ref) and + GetNextInstruction(hp1, hp2) and + MatchInstruction(hp2, A_MOV, [S_B]) and + (taicpu(hp2).oper[1]^.typ = top_ref) and + CheckMemoryWrite(taicpu(hp1), taicpu(hp2)) then + begin + { Don't do anything else except set Result to True } + end + else + begin + if taicpu(p).oper[0]^.typ = top_reg then + begin + TransferUsedRegs(TmpUsedRegs); + UpdateUsedRegs(TmpUsedRegs, tai(p.Next)); + end; + + { If it's not a register, it's a memory address } + if (taicpu(p).oper[0]^.typ <> top_reg) or RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp1, TmpUsedRegs) then + begin + { Even if the register is still in use, we can minimise the + pipeline stall by changing the MOV into another SETcc. } + taicpu(hp1).opcode := A_SETcc; + taicpu(hp1).condition := taicpu(p).condition; + if taicpu(hp1).oper[1]^.typ = top_ref then + begin + { Swapping the operand pointers like this is probably a + bit naughty, but it is far faster than using loadoper + to transfer the reference from oper[1] to oper[0] if + you take into account the extra procedure calls and + the memory allocation and deallocation required } + OperPtr := taicpu(hp1).oper[1]; + taicpu(hp1).oper[1] := taicpu(hp1).oper[0]; + taicpu(hp1).oper[0] := OperPtr; + end + else + taicpu(hp1).oper[0]^.reg := taicpu(hp1).oper[1]^.reg; + + taicpu(hp1).clearop(1); + taicpu(hp1).ops := 1; + DebugMsg(SPeepholeOptimization + 'SETcc/Mov -> SETcc/SETcc',p); + end + else + begin + if taicpu(hp1).oper[1]^.typ = top_reg then + AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,UsedRegs); + + taicpu(p).loadoper(0, taicpu(hp1).oper[1]^); + RemoveInstruction(hp1); + DebugMsg(SPeepholeOptimization + 'SETcc/Mov -> SETcc',p); + end + end; + Result := True; + end; + end; + end; + + function TX86AsmOptimizer.OptPass2Jmp(var p : tai) : boolean; var hp1, hp2, hp3: tai; diff --git a/compiler/x86_64/aoptcpu.pas b/compiler/x86_64/aoptcpu.pas index 1e2fdbde26..6957d69b41 100644 --- a/compiler/x86_64/aoptcpu.pas +++ b/compiler/x86_64/aoptcpu.pas @@ -127,8 +127,6 @@ uses result:=OptPass1Sub(p); A_SHL,A_SAL: result:=OptPass1SHLSAL(p); - A_SETcc: - result:=OptPass1SETcc(p); A_FSTP,A_FISTP: result:=OptPass1FSTP(p); A_FLD: @@ -181,6 +179,8 @@ uses Result:=OptPass2SUB(p); A_ADD: Result:=OptPass2ADD(p); + A_SETcc: + result:=OptPass2SETcc(p); else ; end;