From 45883c2cc0db3a2d13948fbc62a549032c1389ff Mon Sep 17 00:00:00 2001 From: "J. Gareth \"Curious Kit\" Moreton" Date: Fri, 7 Jan 2022 03:20:29 +0000 Subject: [PATCH 1/3] * x86: New Add/ref -> ref optimisation --- compiler/x86/aoptx86.pas | 167 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 166 insertions(+), 1 deletion(-) diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas index 4318c8e961..ecf5c74e13 100644 --- a/compiler/x86/aoptx86.pas +++ b/compiler/x86/aoptx86.pas @@ -156,6 +156,7 @@ unit aoptx86; procedure RemoveLastDeallocForFuncRes(p : tai); function DoArithCombineOpt(var p : tai) : Boolean; + function DoAddRefOpt(var p, hp1: tai; Reg: TRegister; Value: TCGInt): Boolean; function DoMovCmpMemOpt(var p : tai; const hp1: tai) : Boolean; function DoSETccLblRETOpt(var p: tai; const hp_label: tai_label) : Boolean; @@ -5788,6 +5789,7 @@ unit aoptx86; else DebugMsg(SPeepholeOptimization + 'ADD; ADD/SUB -> ADD',p); RemoveInstruction(hp1); + hp1 := nil; { This permits DoAddRefOpt to work in a single call } end; end else @@ -5814,6 +5816,28 @@ unit aoptx86; end; end; + { Change: + add $x,%reg1 + ... + ??? #(%reg2,%reg1,y) (Instructions with a reference) + (dealloc %reg1) + + To: + ??? 
x*y+#(%reg2,%reg1,y) + + (Do similar when %reg1 appears in (or also in) the index) + } + + if ( + { Save calling GetNextInstructionUsingReg again } + Assigned(hp1) or + GetNextInstructionUsingReg(p,hp1, ActiveReg) + ) and DoAddRefOpt(p, hp1, ActiveReg, taicpu(p).oper[0]^.val) then + begin + Result := True; + Exit; + end; + if DoArithCombineOpt(p) then Result:=true; end; @@ -6310,6 +6334,121 @@ unit aoptx86; end; + function TX86AsmOptimizer.DoAddRefOpt(var p, hp1: tai; Reg: TRegister; Value: TCGInt): Boolean; { Tries to fold Value (the signed constant that the ADD/SUB at p applies to Reg; callers pass the negated constant for SUB) into the offset of the first reference operand of hp1 that uses Reg as its base and/or index; for the index, Value is multiplied by the scale factor. Returns True if the reference was updated. No change is made if hp1 is not an instruction, if the flags are in use, if hp1 modifies Reg without loading it with a new value, if the new offset is outside the signed 32-bit range, or if optimising for size and the ADD/SUB would have to be kept alongside a non-zero offset. } + var + ThisConst: TCGInt; + X: Integer; + NewAssign, RegUsed: Boolean; + hp2, p_next: tai; + begin + Result := False; + + if taicpu(hp1).typ <> ait_instruction then + Exit; + + { If thef flags are in use, do not make any changes } + if RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then + Exit; + + NewAssign := + { MOV is a common case and faster to check than calling MatchInstruction } + ( + (taicpu(hp1).opcode = A_MOV) and + (taicpu(hp1).oper[1]^.typ = top_reg) and + SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, Reg) + ) or + ( + MatchInstruction(hp1, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}, []) and + SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, Reg) + ) or + RegLoadedWithNewValue(Reg, hp1); + + if not NewAssign and + RegModifiedByInstruction(Reg, hp1) then + Exit; + + TransferUsedRegs(TmpUsedRegs); + if not GetNextInstruction(p, p_next) then + { We should have hit hp1 at least } + InternalError(2022010801); + + UpdateUsedRegs(TmpUsedRegs, tai(p_next.Next)); { NOTE(review): tracking starts at p_next.Next rather than p.Next, and the loop below repeats p_next.Next on its first pass when p_next <> hp1 - confirm this is intended } + + if p_next <> hp1 then + begin + hp2 := p_next; + repeat + UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next)); + until not GetNextInstruction(hp2, hp2) or (hp2 = hp1); + end; + + RegUsed := RegUsedAfterInstruction(Reg, hp1, TmpUsedRegs); + + { If the register is used afterwards, we have to take some precautions } + if RegUsed and RegInInstruction(NR_DEFAULTFLAGS, hp1) then + { Swapping the instructions will scramble the flags; e.g.
+ subq $1,%rax + cmpb $10,(%rax) + + Swapping them will cause the flags to depend on SUB instead + } + Exit; + + for X := 0 to taicpu(hp1).ops - 1 do + if taicpu(hp1).oper[X]^.typ = top_ref then + begin + ThisConst := taicpu(hp1).oper[X]^.ref^.offset; + + if (taicpu(hp1).oper[X]^.ref^.base = Reg) then + begin + Inc(ThisConst, Value); + Result := True; + end; + + if (taicpu(hp1).oper[X]^.ref^.index = Reg) then + begin + Inc(ThisConst, Value * max(taicpu(hp1).oper[X]^.ref^.scalefactor, 1)); + Result := True; + end; + + if Result then + if (ThisConst > $7FFFFFFF) or (ThisConst < -2147483648) then + { Overflow - the combined offset is outside the signed 32-bit range; abort } + Result := False + else if (cs_opt_size in current_settings.optimizerswitches) and + not NewAssign and RegUsed and + (ThisConst <> 0) then + { Will increase code size } + Result := False + else + taicpu(hp1).oper[X]^.ref^.offset := ThisConst; + + Break; { Only the first reference operand of hp1 is ever examined } + end; + + if Result then + begin + if not NewAssign and RegUsed then + begin + DebugMsg(SPeepholeOptimization + 'Merged arithmetic instruction into following reference (OpRef2RefOp)', p); + UpdateUsedRegs(tai(p.Next)); + Asml.Remove(p); + Asml.InsertAfter(p, hp1); { NOTE(review): presumably re-inserts p directly after hp1 so that Reg still receives its update - confirm argument order } + p := p_next; + end + else + begin + DebugMsg(SPeepholeOptimization + 'Merged arithmetic instruction into following reference (OpRef2Ref)', p); + if (cs_opt_level3 in current_settings.optimizerswitches) then + RemoveCurrentp(p) + else + RemoveCurrentp(p, p_next); + end; + Exit; + end; + end; + + + function TX86AsmOptimizer.DoArithCombineOpt(var p: tai): Boolean; var hp1 : tai; @@ -6863,6 +7002,7 @@ unit aoptx86; else DebugMsg(SPeepholeOptimization + 'SUB; ADD/SUB -> SUB',p); RemoveInstruction(hp1); + hp1 := nil; { This permits DoAddRefOpt to work in a single call } end; end else @@ -6922,7 +7062,32 @@ unit aoptx86; end; {$endif i386} if DoArithCombineOpt(p) then - Result:=true; + begin + Result:=true; + Exit; + end; + + { Change: + sub $x,%reg1 + ... + ??? #(%reg2,%reg1,y) (Instructions with a reference) + (dealloc %reg1) + + To: + ???
#-x*y(%reg2,%reg1,y) + + (Do similar when %reg1 appears in (or also in) the index) + } + + if ( + { Save calling GetNextInstructionUsingReg again } + Assigned(hp1) or + GetNextInstructionUsingReg(p,hp1, ActiveReg) + ) and DoAddRefOpt(p, hp1, ActiveReg, -taicpu(p).oper[0]^.val) then + begin + Result := True; + Exit; + end; end; end; From 1d86db3100ea9c2d92baf2e75db333d69f208568 Mon Sep 17 00:00:00 2001 From: "J. Gareth \"Curious Kit\" Moreton" Date: Thu, 16 Jun 2022 19:44:11 +0100 Subject: [PATCH 2/3] * x86: Reversal of DoAddRefOpt optimisation if no pipeline saving is made --- compiler/x86/aoptx86.pas | 318 ++++++++++++++++++++++++--------------- 1 file changed, 200 insertions(+), 118 deletions(-) diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas index ecf5c74e13..c627ed9730 100644 --- a/compiler/x86/aoptx86.pas +++ b/compiler/x86/aoptx86.pas @@ -157,6 +157,7 @@ unit aoptx86; function DoArithCombineOpt(var p : tai) : Boolean; function DoAddRefOpt(var p, hp1: tai; Reg: TRegister; Value: TCGInt): Boolean; + function DoReverseAddRefOpt(var p: tai; Value: TCGInt): Boolean; function DoMovCmpMemOpt(var p : tai; const hp1: tai) : Boolean; function DoSETccLblRETOpt(var p: tai; const hp_label: tai_label) : Boolean; @@ -6449,6 +6450,73 @@ unit aoptx86; end; + function TX86AsmOptimizer.DoReverseAddRefOpt(var p: tai; Value: TCGInt): Boolean; { Undoes a DoAddRefOpt merge where it is not profitable: if the instruction before p contains a reference that uses the register updated by p, that instruction is moved to after p and Value (scaled when the register is the index) is subtracted from the reference offset. Value is the signed amount p adds to its register (negated by the caller for SUB). Only done when the reference has another register that is modified by the instruction preceding it. Returns True if the instructions were rearranged; p stays the current instruction. } + var + CurrentRef: PReference; + OtherRegister: TRegister; + X: Integer; + hp1, hp2: tai; + begin + { Assume that p has been checked and confirmed to be of the form + "ADD/SUB const,%reg" } + Result := False; + + if GetLastInstruction(p, hp1) and (hp1.typ = ait_instruction) and + { Make sure this instruction doesn't also modify the register used in + the ADD/SUB instruction } + not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) then + begin + { Find the reference } + for X := 0 to taicpu(Hp1).ops - 1 do + if (taicpu(hp1).oper[X]^.typ = top_ref) then + begin + { Locally store the pointer to the reference } +
CurrentRef := taicpu(hp1).oper[X]^.ref; + + if + { Only references of the form x(%reg1,%reg2,scale) can be + optimised here } + (CurrentRef^.refaddr <> addr_no) or + not RegInRef(taicpu(p).oper[1]^.reg, CurrentRef^) then + Exit; + + { Account for the scale factor on the value } + if SuperRegistersEqual(taicpu(p).oper[1]^.reg, CurrentRef^.index) then + begin + OtherRegister := CurrentRef^.base; + if SuperRegistersEqual(taicpu(p).oper[1]^.reg, CurrentRef^.base) then + Inc(Value, Value * max(CurrentRef^.scalefactor, 1)) + else + Value := Value * max(CurrentRef^.scalefactor, 1); + end + else + OtherRegister := CurrentRef^.index; + + if (OtherRegister <> NR_NO) and + { If we can't set the offset to zero, this is wasted effort } + (CurrentRef^.offset = Value) + and GetLastInstruction(hp1, hp2) and + { Make sure there is a pipeline stall between hp2 and hp1, + otherwise a saving won't be made } + RegModifiedByInstruction(OtherRegister, hp2) then + begin + CurrentRef^.offset := CurrentRef^.offset - Value; { The register will already include Value once hp1 sits after p, so compensate by exactly Value: this is zero when offset = Value and remains correct when the guard above is widened to accept other reducible offsets } + AsmL.Remove(hp1); + AsmL.InsertAfter(hp1, p); + + { In case OtherRegister got deallocated right after the reference } + AllocRegBetween(OtherRegister, hp2, hp1, UsedRegs); + + DebugMsg(SPeepholeOptimization + 'Rearranged MOV; (ref); ADD/SUB to MOV; ADD/SUB; (ref) to remove offset and minimise cache pollution', hp2); + + { Keep p as the current instruction } + Result := True; + end; + end; + end; + end; + + + function TX86AsmOptimizer.DoArithCombineOpt(var p: tai): Boolean; var hp1 : tai; @@ -15495,131 +15563,145 @@ unit aoptx86; begin Result := False; - { Change: - add/sub 128,(dest) - - To: - sub/add -128,(dest) - - This generaally takes fewer bytes to encode because -128 can be stored - in a signed byte, whereas +128 cannot.
- } - if (taicpu(p).opsize <> S_B) and MatchOperand(taicpu(p).oper[0]^, 128) then + if taicpu(p).oper[0]^.typ = top_const then begin - if taicpu(p).opcode = A_ADD then - Opposite := A_SUB - else - Opposite := A_ADD; - - { Be careful if the flags are in use, because the CF flag inverts - when changing from ADD to SUB and vice versa } - if RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and - GetNextInstruction(p, hp1) then + { Sometimes, DoAddRefOpt makes an optimisation that doesn't + improve code speed and only increases cache pollution. If these + aren't cleared by other optimisations, rectify it here } + if taicpu(p).oper[1]^.typ = top_reg then begin - TransferUsedRegs(TmpUsedRegs); - TmpUsedRegs[R_SPECIALREGISTER].Update(tai(p.Next), True); - - hp2 := hp1; - - { Scan ahead to check if everything's safe } - while Assigned(hp1) and RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) do - begin - if (hp1.typ <> ait_instruction) then - { Probably unsafe since the flags are still in use } - Exit; - - if MatchInstruction(hp1, A_CALL, A_JMP, A_RET, []) then - { Stop searching at an unconditional jump } - Break; - - if not - ( - MatchInstruction(hp1, A_ADC, A_SBB, []) and - (taicpu(hp1).oper[0]^.typ = top_const) { We need to be able to invert a constant } - ) and - (taicpu(hp1).condition = C_None) and RegInInstruction(NR_DEFAULTFLAGS, hp1) then - { Instruction depends on FLAGS (and is not ADC or SBB); break out } - Exit; - - UpdateUsedRegs(TmpUsedRegs, tai(p.Next)); - TmpUsedRegs[R_SPECIALREGISTER].Update(tai(hp1.Next), True); - - { Move to the next instruction } - GetNextInstruction(hp1, hp1); - end; - - while Assigned(hp2) and (hp2 <> hp1) do - begin - NewCond := C_None; - - case taicpu(hp2).condition of - C_A, C_NBE: - NewCond := C_BE; - C_B, C_C, C_NAE: - NewCond := C_AE; - C_AE, C_NB, C_NC: - NewCond := C_B; - C_BE, C_NA: - NewCond := C_A; - else - { No change needed }; - end; - - if NewCond <> C_None then - begin - DebugMsg(SPeepholeOptimization + 'Condition changed from ' + 
cond2str[taicpu(hp2).condition] + ' to ' + cond2str[NewCond] + - ' to accommodate ' + debug_op2str(taicpu(p).opcode) + ' -> ' + debug_op2str(opposite) + ' above', hp2); - - taicpu(hp2).condition := NewCond; - end - else - if MatchInstruction(hp2, A_ADC, A_SBB, []) then - begin - { Because of the flipping of the carry bit, to ensure - the operation remains equivalent, ADC becomes SBB - and vice versa, and the constant is not-inverted. - - If multiple ADCs or SBBs appear in a row, each one - changed causes the carry bit to invert, so they all - need to be flipped } - if taicpu(hp2).opcode = A_ADC then - SecondOpposite := A_SBB - else - SecondOpposite := A_ADC; - - if taicpu(hp2).oper[0]^.typ <> top_const then - { Should have broken out of this optimisation already } - InternalError(2021112901); - - DebugMsg(SPeepholeOptimization + debug_op2str(taicpu(hp2).opcode) + debug_opsize2str(taicpu(hp2).opsize) + ' $' + debug_tostr(taicpu(hp2).oper[0]^.val) + ',' + debug_operstr(taicpu(hp2).oper[1]^) + ' -> ' + - debug_op2str(SecondOpposite) + debug_opsize2str(taicpu(hp2).opsize) + ' $' + debug_tostr(not taicpu(hp2).oper[0]^.val) + ',' + debug_operstr(taicpu(hp2).oper[1]^) + ' to accommodate inverted carry bit', hp2); - - { Bit-invert the constant (effectively equivalent to "-1 - val") } - taicpu(hp2).opcode := SecondOpposite; - taicpu(hp2).oper[0]^.val := not taicpu(hp2).oper[0]^.val; - end; - - { Move to the next instruction } - GetNextInstruction(hp2, hp2); - end; - - if (hp2 <> hp1) then - InternalError(2021111501); + if taicpu(p).opcode = A_ADD then + Result := DoReverseAddRefOpt(p, taicpu(p).oper[0]^.val) + else + Result := DoReverseAddRefOpt(p, -taicpu(p).oper[0]^.val); end; - DebugMsg(SPeepholeOptimization + debug_op2str(taicpu(p).opcode) + debug_opsize2str(taicpu(p).opsize) + ' $128,' + debug_operstr(taicpu(p).oper[1]^) + ' changed to ' + - debug_op2str(opposite) + debug_opsize2str(taicpu(p).opsize) + ' $-128,' + debug_operstr(taicpu(p).oper[1]^) + ' to reduce 
instruction size', p); + { Change: + add/sub 128,(dest) - taicpu(p).opcode := Opposite; - taicpu(p).oper[0]^.val := -128; + To: + sub/add -128,(dest) - { No further optimisations can be made on this instruction, so move - onto the next one to save time } - p := tai(p.Next); - UpdateUsedRegs(p); + This generaally takes fewer bytes to encode because -128 can be stored + in a signed byte, whereas +128 cannot. + } + if (taicpu(p).opsize <> S_B) and (taicpu(p).oper[0]^.val = 128) then + begin + if taicpu(p).opcode = A_ADD then + Opposite := A_SUB + else + Opposite := A_ADD; - Result := True; - Exit; + { Be careful if the flags are in use, because the CF flag inverts + when changing from ADD to SUB and vice versa } + if RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and + GetNextInstruction(p, hp1) then + begin + TransferUsedRegs(TmpUsedRegs); + TmpUsedRegs[R_SPECIALREGISTER].Update(tai(p.Next), True); + + hp2 := hp1; + + { Scan ahead to check if everything's safe } + while Assigned(hp1) and RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) do + begin + if (hp1.typ <> ait_instruction) then + { Probably unsafe since the flags are still in use } + Exit; + + if MatchInstruction(hp1, A_CALL, A_JMP, A_RET, []) then + { Stop searching at an unconditional jump } + Break; + + if not + ( + MatchInstruction(hp1, A_ADC, A_SBB, []) and + (taicpu(hp1).oper[0]^.typ = top_const) { We need to be able to invert a constant } + ) and + (taicpu(hp1).condition = C_None) and RegInInstruction(NR_DEFAULTFLAGS, hp1) then + { Instruction depends on FLAGS (and is not ADC or SBB); break out } + Exit; + + UpdateUsedRegs(TmpUsedRegs, tai(p.Next)); + TmpUsedRegs[R_SPECIALREGISTER].Update(tai(hp1.Next), True); + + { Move to the next instruction } + GetNextInstruction(hp1, hp1); + end; + + while Assigned(hp2) and (hp2 <> hp1) do + begin + NewCond := C_None; + + case taicpu(hp2).condition of + C_A, C_NBE: + NewCond := C_BE; + C_B, C_C, C_NAE: + NewCond := C_AE; + C_AE, C_NB, C_NC: + NewCond := C_B; + C_BE, C_NA: + 
NewCond := C_A; + else + { No change needed }; + end; + + if NewCond <> C_None then + begin + DebugMsg(SPeepholeOptimization + 'Condition changed from ' + cond2str[taicpu(hp2).condition] + ' to ' + cond2str[NewCond] + + ' to accommodate ' + debug_op2str(taicpu(p).opcode) + ' -> ' + debug_op2str(opposite) + ' above', hp2); + + taicpu(hp2).condition := NewCond; + end + else + if MatchInstruction(hp2, A_ADC, A_SBB, []) then + begin + { Because of the flipping of the carry bit, to ensure + the operation remains equivalent, ADC becomes SBB + and vice versa, and the constant is not-inverted. + + If multiple ADCs or SBBs appear in a row, each one + changed causes the carry bit to invert, so they all + need to be flipped } + if taicpu(hp2).opcode = A_ADC then + SecondOpposite := A_SBB + else + SecondOpposite := A_ADC; + + if taicpu(hp2).oper[0]^.typ <> top_const then + { Should have broken out of this optimisation already } + InternalError(2021112901); + + DebugMsg(SPeepholeOptimization + debug_op2str(taicpu(hp2).opcode) + debug_opsize2str(taicpu(hp2).opsize) + ' $' + debug_tostr(taicpu(hp2).oper[0]^.val) + ',' + debug_operstr(taicpu(hp2).oper[1]^) + ' -> ' + + debug_op2str(SecondOpposite) + debug_opsize2str(taicpu(hp2).opsize) + ' $' + debug_tostr(not taicpu(hp2).oper[0]^.val) + ',' + debug_operstr(taicpu(hp2).oper[1]^) + ' to accommodate inverted carry bit', hp2); + + { Bit-invert the constant (effectively equivalent to "-1 - val") } + taicpu(hp2).opcode := SecondOpposite; + taicpu(hp2).oper[0]^.val := not taicpu(hp2).oper[0]^.val; + end; + + { Move to the next instruction } + GetNextInstruction(hp2, hp2); + end; + + if (hp2 <> hp1) then + InternalError(2021111501); + end; + + DebugMsg(SPeepholeOptimization + debug_op2str(taicpu(p).opcode) + debug_opsize2str(taicpu(p).opsize) + ' $128,' + debug_operstr(taicpu(p).oper[1]^) + ' changed to ' + + debug_op2str(opposite) + debug_opsize2str(taicpu(p).opsize) + ' $-128,' + debug_operstr(taicpu(p).oper[1]^) + ' to reduce 
instruction size', p); + + taicpu(p).opcode := Opposite; + taicpu(p).oper[0]^.val := -128; + + { No further optimisations can be made on this instruction, so move + onto the next one to save time } + p := tai(p.Next); + UpdateUsedRegs(p); + + Result := True; + Exit; + end; end; { Detect: From 2d392b863d79110f31d3e3cb944cafa55ebf0e98 Mon Sep 17 00:00:00 2001 From: "J. Gareth \"Curious Kit\" Moreton" Date: Fri, 7 Jan 2022 21:59:52 +0000 Subject: [PATCH 3/3] * x86: Updates to DoArithCombineOpt to cover more eventualities --- compiler/x86/aoptx86.pas | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas index c627ed9730..9ce0b3838d 100644 --- a/compiler/x86/aoptx86.pas +++ b/compiler/x86/aoptx86.pas @@ -5817,6 +5817,11 @@ unit aoptx86; end; end; + if DoArithCombineOpt(p) then + begin + Result:=true; + Exit; + end; { Change: add $x,%reg1 ... @@ -6344,7 +6349,7 @@ unit aoptx86; begin Result := False; - if taicpu(hp1).typ <> ait_instruction then + if hp1.typ <> ait_instruction then Exit; { If thef flags are in use, do not make any changes } @@ -6493,8 +6498,16 @@ unit aoptx86; OtherRegister := CurrentRef^.index; if (OtherRegister <> NR_NO) and - { If we can't set the offset to zero, this is wasted effort } - (CurrentRef^.offset = Value) + ( + { If we can't set the offset to zero, this is wasted effort } + (CurrentRef^.offset = Value) or + { But maybe not if we can reduce the byte count } + ( + ((CurrentRef^.offset > 127) or (CurrentRef^.offset < -128)) and + ((CurrentRef^.offset - Value) >= -128) and + ((CurrentRef^.offset - Value) <= 127) + ) + ) and GetLastInstruction(hp1, hp2) and { Make sure there is a pipeline stall between hp2 and hp1, otherwise a saving won't be made }