+ factored out TX86AsmOptimizer.PrePeepholeOptIMUL, used now by x86-64 and i386

* generalized and simplified the code

git-svn-id: trunk@40162 -
This commit is contained in:
florian 2018-11-01 20:49:20 +00:00
parent 47c9bca7f2
commit 59d5d6ec95
4 changed files with 97 additions and 199 deletions

View File

@ -31,6 +31,7 @@ unit aoptutils;
function MatchOpType(const p : taicpu;type0: toptype) : Boolean;
function MatchOpType(const p : taicpu;type0,type1 : toptype) : Boolean;
function MatchOpType(const p : taicpu; type0,type1,type2 : toptype) : Boolean;
{ skips all labels and returns the next "real" instruction }
function SkipLabels(hp: tai; var hp2: tai): boolean;
@ -49,6 +50,12 @@ unit aoptutils;
end;
{ Returns true if p has exactly three operands whose types are, in operand
  order, type0, type1 and type2. }
function MatchOpType(const p : taicpu; type0,type1,type2 : toptype) : Boolean;
begin
  { the third operand must be compared against type2, not type1 again }
  Result:=(p.ops=3) and
    (p.oper[0]^.typ=type0) and
    (p.oper[1]^.typ=type1) and
    (p.oper[2]^.typ=type2);
end;
{ skips all labels and returns the next "real" instruction }
function SkipLabels(hp: tai; var hp2: tai): boolean;
begin

View File

@ -169,205 +169,8 @@ begin
end;
case taicpu(p).opcode Of
A_IMUL:
{changes certain "imul const, %reg"'s to lea sequences}
begin
if (taicpu(p).oper[0]^.typ = Top_Const) and
(taicpu(p).oper[1]^.typ = Top_Reg) and
(taicpu(p).opsize = S_L) then
if (taicpu(p).oper[0]^.val = 1) then
if (taicpu(p).ops = 2) then
{remove "imul $1, reg"}
begin
hp1 := tai(p.Next);
asml.remove(p);
p.free;
p := hp1;
continue;
end
else
{change "imul $1, reg1, reg2" to "mov reg1, reg2"}
begin
hp1 := taicpu.Op_Reg_Reg(A_MOV, S_L, taicpu(p).oper[1]^.reg,taicpu(p).oper[2]^.reg);
InsertLLItem(p.previous, p.next, hp1);
p.free;
p := hp1;
end
else if
((taicpu(p).ops <= 2) or
(taicpu(p).oper[2]^.typ = Top_Reg)) and
(taicpu(p).oper[0]^.val <= 12) and
not(cs_opt_size in current_settings.optimizerswitches) and
(not(GetNextInstruction(p, hp1)) or
{GetNextInstruction(p, hp1) and}
not((tai(hp1).typ = ait_instruction) and
((taicpu(hp1).opcode=A_Jcc) and
(taicpu(hp1).condition in [C_O,C_NO])))) then
begin
reference_reset(tmpref,1,[]);
case taicpu(p).oper[0]^.val Of
3: begin
{imul 3, reg1, reg2 to
lea (reg1,reg1,2), reg2
imul 3, reg1 to
lea (reg1,reg1,2), reg1}
TmpRef.base := taicpu(p).oper[1]^.reg;
TmpRef.index := taicpu(p).oper[1]^.reg;
TmpRef.ScaleFactor := 2;
if (taicpu(p).ops = 2) then
hp1 := taicpu.op_ref_reg(A_LEA, S_L, TmpRef, taicpu(p).oper[1]^.reg)
else
hp1 := taicpu.op_ref_reg(A_LEA, S_L, TmpRef, taicpu(p).oper[2]^.reg);
InsertLLItem(p.previous, p.next, hp1);
p.free;
p := hp1;
end;
5: begin
{imul 5, reg1, reg2 to
lea (reg1,reg1,4), reg2
imul 5, reg1 to
lea (reg1,reg1,4), reg1}
TmpRef.base := taicpu(p).oper[1]^.reg;
TmpRef.index := taicpu(p).oper[1]^.reg;
TmpRef.ScaleFactor := 4;
if (taicpu(p).ops = 2) then
hp1 := taicpu.op_ref_reg(A_LEA, S_L, TmpRef, taicpu(p).oper[1]^.reg)
else
hp1 := taicpu.op_ref_reg(A_LEA, S_L, TmpRef, taicpu(p).oper[2]^.reg);
InsertLLItem(p.previous, p.next, hp1);
p.free;
p := hp1;
end;
6: begin
{imul 6, reg1, reg2 to
lea (,reg1,2), reg2
lea (reg2,reg1,4), reg2
imul 6, reg1 to
lea (reg1,reg1,2), reg1
add reg1, reg1}
if (current_settings.optimizecputype <= cpu_386) then
begin
TmpRef.index := taicpu(p).oper[1]^.reg;
if (taicpu(p).ops = 3) then
begin
TmpRef.base := taicpu(p).oper[2]^.reg;
TmpRef.ScaleFactor := 4;
hp1 := taicpu.op_ref_reg(A_LEA, S_L, TmpRef, taicpu(p).oper[1]^.reg);
end
else
begin
hp1 := taicpu.op_reg_reg(A_ADD, S_L,
taicpu(p).oper[1]^.reg,taicpu(p).oper[1]^.reg);
end;
InsertLLItem(p, p.next, hp1);
reference_reset(tmpref,2,[]);
TmpRef.index := taicpu(p).oper[1]^.reg;
TmpRef.ScaleFactor := 2;
if (taicpu(p).ops = 3) then
begin
TmpRef.base := NR_NO;
hp1 := taicpu.op_ref_reg(A_LEA, S_L, TmpRef,
taicpu(p).oper[2]^.reg);
end
else
begin
TmpRef.base := taicpu(p).oper[1]^.reg;
hp1 := taicpu.op_ref_reg(A_LEA, S_L, TmpRef, taicpu(p).oper[1]^.reg);
end;
InsertLLItem(p.previous, p.next, hp1);
p.free;
p := tai(hp1.next);
end
end;
9: begin
{imul 9, reg1, reg2 to
lea (reg1,reg1,8), reg2
imul 9, reg1 to
lea (reg1,reg1,8), reg1}
TmpRef.base := taicpu(p).oper[1]^.reg;
TmpRef.index := taicpu(p).oper[1]^.reg;
TmpRef.ScaleFactor := 8;
if (taicpu(p).ops = 2) then
hp1 := taicpu.op_ref_reg(A_LEA, S_L, TmpRef, taicpu(p).oper[1]^.reg)
else
hp1 := taicpu.op_ref_reg(A_LEA, S_L, TmpRef, taicpu(p).oper[2]^.reg);
InsertLLItem(p.previous, p.next, hp1);
p.free;
p := hp1;
end;
10: begin
{imul 10, reg1, reg2 to
lea (reg1,reg1,4), reg2
add reg2, reg2
imul 10, reg1 to
lea (reg1,reg1,4), reg1
add reg1, reg1}
if (current_settings.optimizecputype <= cpu_386) then
begin
if (taicpu(p).ops = 3) then
hp1 := taicpu.op_reg_reg(A_ADD, S_L,
taicpu(p).oper[2]^.reg,taicpu(p).oper[2]^.reg)
else
hp1 := taicpu.op_reg_reg(A_ADD, S_L,
taicpu(p).oper[1]^.reg,taicpu(p).oper[1]^.reg);
InsertLLItem(p, p.next, hp1);
TmpRef.base := taicpu(p).oper[1]^.reg;
TmpRef.index := taicpu(p).oper[1]^.reg;
TmpRef.ScaleFactor := 4;
if (taicpu(p).ops = 3) then
hp1 := taicpu.op_ref_reg(A_LEA, S_L, TmpRef, taicpu(p).oper[2]^.reg)
else
hp1 := taicpu.op_ref_reg(A_LEA, S_L, TmpRef, taicpu(p).oper[1]^.reg);
InsertLLItem(p.previous, p.next, hp1);
p.free;
p := tai(hp1.next);
end
end;
12: begin
{imul 12, reg1, reg2 to
lea (,reg1,4), reg2
lea (reg2,reg1,8), reg2
imul 12, reg1 to
lea (reg1,reg1,2), reg1
lea (,reg1,4), reg1}
if (current_settings.optimizecputype <= cpu_386)
then
begin
TmpRef.index := taicpu(p).oper[1]^.reg;
if (taicpu(p).ops = 3) then
begin
TmpRef.base := taicpu(p).oper[2]^.reg;
TmpRef.ScaleFactor := 8;
hp1 := taicpu.op_ref_reg(A_LEA, S_L, TmpRef, taicpu(p).oper[2]^.reg);
end
else
begin
TmpRef.base := NR_NO;
TmpRef.ScaleFactor := 4;
hp1 := taicpu.op_ref_reg(A_LEA, S_L, TmpRef, taicpu(p).oper[1]^.reg);
end;
InsertLLItem(p, p.next, hp1);
reference_reset(tmpref,2,[]);
TmpRef.index := taicpu(p).oper[1]^.reg;
if (taicpu(p).ops = 3) then
begin
TmpRef.base := NR_NO;
TmpRef.ScaleFactor := 4;
hp1 := taicpu.op_ref_reg(A_LEA, S_L, TmpRef, taicpu(p).oper[2]^.reg);
end
else
begin
TmpRef.base := taicpu(p).oper[1]^.reg;
TmpRef.ScaleFactor := 2;
hp1 := taicpu.op_ref_reg(A_LEA, S_L, TmpRef, taicpu(p).oper[1]^.reg);
end;
InsertLLItem(p.previous, p.next, hp1);
p.free;
p := tai(hp1.next);
end
end
end;
end;
end;
if PrePeepholeOptIMUL(p) then
Continue;
A_SAR,A_SHR:
if PrePeepholeOptSxx(p) then
continue;

View File

@ -57,6 +57,7 @@ unit aoptx86;
function DoSubAddOpt(var p : tai) : Boolean;
function PrePeepholeOptSxx(var p : tai) : boolean;
function PrePeepholeOptIMUL(var p : tai) : boolean;
function OptPass1AND(var p : tai) : boolean;
function OptPass1VMOVAP(var p : tai) : boolean;
@ -718,6 +719,91 @@ unit aoptx86;
end;
{ Pre-peephole optimization for IMUL with a constant first operand.

  Handles "imul const,reg" and "imul const,reg1,reg2" of size S_L (and S_Q
  when compiled for x86_64):
    - "imul $1,reg" is removed; p is set to the following list item and the
      result is true.
    - "imul $1,reg1,reg2" is replaced by "mov reg1,reg2"; p is set to the mov.
    - a constant with exactly two bits set, at most three bit positions
      apart, is replaced by "lea (reg1,reg1,scale),dest", followed by
      "shl shift,dest" when the lowest set bit is not bit 0; p is set to the
      lea. This is skipped when optimizing for size or when the next
      instruction is a Jcc on C_O/C_NO.

  The result is true only for the removal case; otherwise it is false. }
function TX86AsmOptimizer.PrePeepholeOptIMUL(var p : tai) : boolean;
var
  opsize : topsize;
  hp1 : tai;
  tmpref : treference;
  ShiftValue : Cardinal;
  BaseValue : TCGInt;
begin
  result:=false;
  opsize:=taicpu(p).opsize;
  { changes certain "imul const, %reg"'s to lea sequences }
  if (MatchOpType(taicpu(p),top_const,top_reg) or
      MatchOpType(taicpu(p),top_const,top_reg,top_reg)) and
     (opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) then
    if (taicpu(p).oper[0]^.val = 1) then
      if (taicpu(p).ops = 2) then
        { remove "imul $1, reg" }
        begin
          { report while p is still linked into asml, in the same order as
            the lea branch below (DebugMsg first, then unlink) }
          DebugMsg(SPeepholeOptimization + 'Imul2Nop done',p);
          hp1 := tai(p.Next);
          asml.remove(p);
          p.free;
          p := hp1;
          { p already points at the next item }
          result:=true;
        end
      else
        { change "imul $1, reg1, reg2" to "mov reg1, reg2" }
        begin
          hp1 := taicpu.Op_Reg_Reg(A_MOV, opsize, taicpu(p).oper[1]^.reg,taicpu(p).oper[2]^.reg);
          InsertLLItem(p.previous, p.next, hp1);
          DebugMsg(SPeepholeOptimization + 'Imul2Mov done',p);
          p.free;
          p := hp1;
        end
    else if
      ((taicpu(p).ops <= 2) or
       (taicpu(p).oper[2]^.typ = Top_Reg)) and
      not(cs_opt_size in current_settings.optimizerswitches) and
      { keep the imul when a jump on overflow follows; presumably because the
        lea/shl replacement does not reproduce imul's overflow flag }
      (not(GetNextInstruction(p, hp1)) or
       not((tai(hp1).typ = ait_instruction) and
           ((taicpu(hp1).opcode=A_Jcc) and
            (taicpu(hp1).condition in [C_O,C_NO])))) then
      begin
        {
          imul X, reg1, reg2 to
            lea (reg1,reg1,Y), reg2
            shl ZZ,reg2
          imul XX, reg1 to
            lea (reg1,reg1,YY), reg1
            shl ZZ,reg1

          This optimization makes sense for pretty much every x86, except the VIA Nano3000: it has IMUL latency 2, lea/shl pair as well,
          it does not exist as a separate optimization target in FPC though.

          This optimization can be applied as long as only two bits are set in the constant and those two bits are separated by
          at most two zeros
        }
        reference_reset(tmpref,1,[]);
        if (PopCnt(QWord(taicpu(p).oper[0]^.val))=2) and (BsrQWord(taicpu(p).oper[0]^.val)-BsfQWord(taicpu(p).oper[0]^.val)<=3) then
          begin
            { split the constant into BaseValue shl ShiftValue, BaseValue being odd }
            ShiftValue:=BsfQWord(taicpu(p).oper[0]^.val);
            BaseValue:=taicpu(p).oper[0]^.val shr ShiftValue;
            TmpRef.base := taicpu(p).oper[1]^.reg;
            TmpRef.index := taicpu(p).oper[1]^.reg;
            { two set bits at most three positions apart can only give 3, 5 or 9 }
            if not(BaseValue in [3,5,9]) then
              Internalerror(2018110101);
            { reg + reg*(BaseValue-1) = reg*BaseValue }
            TmpRef.ScaleFactor := BaseValue-1;
            if (taicpu(p).ops = 2) then
              hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[1]^.reg)
            else
              hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[2]^.reg);
            AsmL.InsertAfter(hp1,p);
            DebugMsg(SPeepholeOptimization + 'Imul2LeaShl done',p);
            AsmL.Remove(p);
            taicpu(hp1).fileinfo:=taicpu(p).fileinfo;
            p.free;
            p := hp1;
            { apply the remaining power-of-two factor to the lea destination }
            if ShiftValue>0 then
              AsmL.InsertAfter(taicpu.op_const_reg(A_SHL, opsize, ShiftValue, taicpu(hp1).oper[1]^.reg),hp1);
          end;
      end;
end;
function TX86AsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
var
p: taicpu;

View File

@ -51,6 +51,8 @@ uses
ait_instruction:
begin
case taicpu(p).opcode of
A_IMUL:
result:=PrePeepholeOptIMUL(p);
A_SAR,A_SHR:
result:=PrePeepholeOptSxx(p);
end;