mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-15 07:19:35 +02:00
+ AArch64: optimize divisions by constant
git-svn-id: trunk@44204 -
This commit is contained in:
parent
f05b51d7d8
commit
e50c4f6373
@ -76,9 +76,58 @@ implementation
|
||||
resultreg : tregister;
|
||||
hl : tasmlabel;
|
||||
overflowloc: tlocation;
|
||||
power: longint;
|
||||
|
||||
{ Emits the division of `numerator` by the ordinal constant held in `right`
  into `resultreg`. The constants 1, -1 and the powers of two get dedicated
  instruction sequences here; every other non-zero constant is forwarded to
  cg.g_div_const_reg_reg. A constant of zero raises an internal error. }
procedure genOrdConstNodeDiv;
|
||||
var
|
||||
helper1, helper2: TRegister;
|
||||
so: tshifterop;
|
||||
begin
|
||||
{ a zero divisor is rejected with an internal error instead of emitting code }
if tordconstnode(right).value=0 then
|
||||
internalerror(2020021601)
|
||||
{ x div 1 = x: a plain register-to-register copy is enough }
else if tordconstnode(right).value=1 then
|
||||
cg.a_load_reg_reg(current_asmdata.CurrAsmList, OS_INT, OS_INT, numerator, resultreg)
|
||||
{ x div -1 = -x: emitted as a single NEG }
else if (tordconstnode(right).value = int64(-1)) then
|
||||
begin
|
||||
// note: only in the signed case possible..., may overflow
|
||||
{ with overflow checking enabled, the flags register is allocated before
  the negation is emitted }
if cs_check_overflow in current_settings.localswitches then
|
||||
cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
|
||||
|
||||
{ ord(boolean)*ord(PF_S) yields PF_S when overflow checking is on and
  toppostfix(0) otherwise (presumably PF_None - TODO confirm), so the
  flag-setting form of NEG is only used when overflow checking is enabled }
current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_NEG,
|
||||
resultreg,numerator),toppostfix(ord(cs_check_overflow in current_settings.localswitches)*ord(PF_S))));
|
||||
end
|
||||
{ `power` is passed to ispowerof2 and used below as the shift amount;
  presumably it receives log2 of the constant - TODO confirm against
  ispowerof2 }
else if ispowerof2(tordconstnode(right).value,power) then
|
||||
begin
|
||||
{ NOTE(review): signedness is taken from right.resultdef here, whereas the
  surrounding code selects SDIV/UDIV based on left.resultdef - confirm that
  both always agree at this point }
if (is_signed(right.resultdef)) then
|
||||
begin
|
||||
{ signed case: a bias is added to the numerator before the final
  arithmetic shift. helper1 shifted logically right by 64-power gives
  2^power-1 for negative numerators and 0 otherwise (assuming OS_INT is
  64 bits wide, as the constants 63 and 64 suggest - TODO confirm), which
  makes the SAR round towards zero }
helper2:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
|
||||
{ for power = 1 the LSR by 63 applied below extracts the sign bit of the
  numerator directly, so no separate sign-mask register is needed }
if power = 1 then
|
||||
helper1:=numerator
|
||||
else
|
||||
begin
|
||||
{ helper1 := numerator SAR 63 }
helper1:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
|
||||
cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,OS_INT,63,numerator,helper1);
|
||||
end;
|
||||
{ helper2 := numerator + (helper1 LSR (64-power)), using the shifted
  register operand form of ADD }
shifterop_reset(so);
|
||||
so.shiftmode:=SM_LSR;
|
||||
so.shiftimm:=64-power;
|
||||
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,helper2,numerator,helper1,so));
|
||||
{ resultreg := helper2 SAR power }
cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,OS_INT,power,helper2,resultreg);
|
||||
end
|
||||
else
|
||||
{ unsigned case: a logical shift right by `power` is the whole division }
cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SHR,OS_INT,power,numerator,resultreg)
|
||||
end
|
||||
else
|
||||
{ Everything else is handled in the generic code }
|
||||
cg.g_div_const_reg_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),
|
||||
tordconstnode(right).value.svalue,numerator,resultreg);
|
||||
end;
|
||||
|
||||
begin
|
||||
secondpass(left);
|
||||
secondpass(right);
|
||||
{ avoid warning }
|
||||
divider:=NR_NO;
|
||||
|
||||
{ set result location }
|
||||
location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
|
||||
@ -89,16 +138,32 @@ implementation
|
||||
hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
|
||||
numerator:=left.location.register;
|
||||
|
||||
{ load divider in a register }
|
||||
hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);
|
||||
divider:=right.location.register;
|
||||
|
||||
{ start division }
|
||||
if is_signed(left.resultdef) then
|
||||
op:=A_SDIV
|
||||
if (right.nodetype=ordconstn) and
|
||||
((tordconstnode(right).value=1) or
|
||||
(tordconstnode(right).value=int64(-1)) or
|
||||
(tordconstnode(right).value=0) or
|
||||
ispowerof2(tordconstnode(right).value,power)) then
|
||||
begin
|
||||
genOrdConstNodeDiv;
|
||||
if nodetype=modn then
|
||||
begin
|
||||
divider:=cg.getintregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
|
||||
cg.a_load_const_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),int64(tordconstnode(right).value),divider);
|
||||
end;
|
||||
end
|
||||
else
|
||||
op:=A_UDIV;
|
||||
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,location.register,numerator,divider));
|
||||
begin
|
||||
{ load divider in a register }
|
||||
hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);
|
||||
divider:=right.location.register;
|
||||
|
||||
{ start division }
|
||||
if is_signed(left.resultdef) then
|
||||
op:=A_SDIV
|
||||
else
|
||||
op:=A_UDIV;
|
||||
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,location.register,numerator,divider));
|
||||
end;
|
||||
|
||||
{ no divide-by-zero detection available in hardware, emulate (if it's a
|
||||
constant, this will have been detected earlier already) }
|
||||
|
@ -1914,29 +1914,148 @@ unit aoptx86;
|
||||
{ Depending on the DeepMOVOpt above, it may turn out that hp1 completely
|
||||
overwrites the original destination register. e.g.
|
||||
|
||||
movl %reg1d,%reg2d
|
||||
movslq %reg1d,%reg2q
|
||||
movl ###,%reg2d
|
||||
movslq ###,%reg2q (### doesn't have to be the same as the first one)
|
||||
|
||||
In this case, we can remove the MOV
|
||||
In this case, we can remove the MOV (Go to "Mov2Nop 5" below)
|
||||
}
|
||||
if (taicpu(p).oper[1]^.typ = top_reg) and
|
||||
MatchInstruction(hp1, [A_LEA, A_MOV, A_MOVSX, A_MOVZX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}], []) and
|
||||
{ The RegInOp check makes sure that movb r/m,%reg1b; movzbl %reg1b,%reg1l"
|
||||
and "movl r/m,%reg1; leal $1(%reg1,%reg2),%reg1" etc. are not incorrectly
|
||||
optimised }
|
||||
(taicpu(hp1).oper[1]^.typ = top_reg) and
|
||||
not RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) and
|
||||
Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
|
||||
begin
|
||||
DebugMsg(SPeepholeOptimization + 'Mov2Nop 5 done',p);
|
||||
{ take care of the register (de)allocs following p }
|
||||
UpdateUsedRegs(tai(p.next));
|
||||
asml.remove(p);
|
||||
p.free;
|
||||
p:=hp1;
|
||||
Result := True;
|
||||
Exit;
|
||||
end;
|
||||
begin
|
||||
if RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) then
|
||||
begin
|
||||
if (taicpu(hp1).oper[0]^.typ = top_reg) then
|
||||
case taicpu(p).oper[0]^.typ of
|
||||
top_const:
|
||||
{ We have something like:
|
||||
|
||||
movb $x, %regb
|
||||
movzbl %regb,%regd
|
||||
|
||||
Change to:
|
||||
|
||||
movl $x, %regd
|
||||
}
|
||||
begin
|
||||
case taicpu(hp1).opsize of
|
||||
S_BW:
|
||||
if (taicpu(hp1).opcode <> A_MOVSX) or
|
||||
(
|
||||
(taicpu(p).oper[0]^.val >= 0) and
|
||||
(taicpu(p).oper[0]^.val <= $7F)
|
||||
) then
|
||||
begin
|
||||
setsubreg(taicpu(p).oper[1]^.reg, R_SUBW);
|
||||
taicpu(p).opsize := S_W;
|
||||
end;
|
||||
S_BL:
|
||||
if (taicpu(hp1).opcode <> A_MOVSX) or
|
||||
(
|
||||
(taicpu(p).oper[0]^.val >= 0) and
|
||||
(taicpu(p).oper[0]^.val <= $7F)
|
||||
) then
|
||||
begin
|
||||
setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
|
||||
taicpu(p).opsize := S_L;
|
||||
end;
|
||||
S_WL:
|
||||
if (taicpu(hp1).opcode <> A_MOVSX) or
|
||||
(
|
||||
(taicpu(p).oper[0]^.val >= 0) and
|
||||
(taicpu(p).oper[0]^.val <= $7FFF)
|
||||
) then
|
||||
begin
|
||||
setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
|
||||
taicpu(p).opsize := S_L;
|
||||
end;
|
||||
{$ifdef x86_64}
|
||||
S_BQ:
|
||||
if (taicpu(hp1).opcode <> A_MOVSX) or
|
||||
(
|
||||
(taicpu(p).oper[0]^.val >= 0) and
|
||||
(taicpu(p).oper[0]^.val <= $7F)
|
||||
) then
|
||||
begin
|
||||
setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
|
||||
taicpu(p).opsize := S_Q;
|
||||
end;
|
||||
S_WQ:
|
||||
if (taicpu(hp1).opcode <> A_MOVSX) or
|
||||
(
|
||||
(taicpu(p).oper[0]^.val >= 0) and
|
||||
(taicpu(p).oper[0]^.val <= $7FFF)
|
||||
) then
|
||||
begin
|
||||
setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
|
||||
taicpu(p).opsize := S_Q;
|
||||
end;
|
||||
S_LQ:
|
||||
if (taicpu(hp1).opcode <> A_MOVSXD) or
|
||||
(
|
||||
(taicpu(p).oper[0]^.val >= 0) and
|
||||
(taicpu(p).oper[0]^.val <= $7FFFFFFF)
|
||||
) then
|
||||
begin
|
||||
setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
|
||||
taicpu(p).opsize := S_Q;
|
||||
end;
|
||||
{$endif x86_64}
|
||||
else
|
||||
{ If hp1 was a MOV instruction, it should have been
|
||||
optimised already }
|
||||
InternalError(2020021001);
|
||||
end;
|
||||
DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 2 done',p);
|
||||
asml.Remove(hp1);
|
||||
hp1.Free;
|
||||
Result := True;
|
||||
Exit;
|
||||
end;
|
||||
top_ref:
|
||||
{ We have something like:
|
||||
|
||||
movb mem, %regb
|
||||
movzbl %regb,%regd
|
||||
|
||||
Change to:
|
||||
|
||||
movzbl mem, %regd
|
||||
}
|
||||
if IsMOVZXAcceptable or (taicpu(hp1).opcode <> A_MOVZX) then
|
||||
begin
|
||||
DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 1 done',p);
|
||||
taicpu(hp1).loadref(0, taicpu(p).oper[0]^.ref^);
|
||||
{ take care of the register (de)allocs following p }
|
||||
UpdateUsedRegs(tai(p.next));
|
||||
asml.remove(p);
|
||||
p.free;
|
||||
p:=hp1;
|
||||
Result := True;
|
||||
Exit;
|
||||
end;
|
||||
else
|
||||
if (taicpu(hp1).opcode <> A_MOV) and (taicpu(hp1).opcode <> A_LEA) then
|
||||
{ Just to make a saving, since there are no more optimisations with MOVZX and MOVSX/D }
|
||||
Exit;
|
||||
end;
|
||||
end
|
||||
{ The RegInOp check makes sure that movl r/m,%reg1l; movzbl (%reg1l),%reg1l"
|
||||
and "movl r/m,%reg1; leal $1(%reg1,%reg2),%reg1" etc. are not incorrectly
|
||||
optimised }
|
||||
else
|
||||
begin
|
||||
DebugMsg(SPeepholeOptimization + 'Mov2Nop 5 done',p);
|
||||
{ take care of the register (de)allocs following p }
|
||||
UpdateUsedRegs(tai(p.next));
|
||||
asml.remove(p);
|
||||
p.free;
|
||||
p:=hp1;
|
||||
Result := True;
|
||||
Exit;
|
||||
end;
|
||||
end;
|
||||
|
||||
if (taicpu(hp1).opcode = A_AND) and
|
||||
(taicpu(p).oper[1]^.typ = top_reg) and
|
||||
@ -2339,27 +2458,8 @@ unit aoptx86;
|
||||
Result:=true;
|
||||
exit;
|
||||
end;
|
||||
{
|
||||
mov* x,reg1
|
||||
mov* y,reg1
|
||||
|
||||
to
|
||||
|
||||
mov* y,reg1
|
||||
}
|
||||
if (taicpu(p).oper[1]^.typ=top_reg) and
|
||||
MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) and
|
||||
not(RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^)) then
|
||||
begin
|
||||
DebugMsg(SPeepholeOptimization + 'MovMov2Mov 4 done',p);
|
||||
{ take care of the register (de)allocs following p }
|
||||
UpdateUsedRegs(tai(p.next));
|
||||
asml.remove(p);
|
||||
p.free;
|
||||
p:=hp1;
|
||||
Result:=true;
|
||||
exit;
|
||||
end;
|
||||
{ mov x,reg1; mov y,reg1 -> mov y,reg1 is handled by the Mov2Nop 5 optimisation }
|
||||
end;
|
||||
|
||||
{ search further than the next instruction for a mov }
|
||||
|
Loading…
Reference in New Issue
Block a user