+ AArch64: optimize divisions by constant

git-svn-id: trunk@44204 -
This commit is contained in:
florian 2020-02-17 20:11:32 +00:00
parent f05b51d7d8
commit e50c4f6373
2 changed files with 211 additions and 46 deletions

View File

@@ -76,9 +76,58 @@ implementation
resultreg : tregister;
hl : tasmlabel;
overflowloc: tlocation;
power: longint;
{ Emit AArch64 code for a division whose right operand is an ordinal
  constant, handling the special values 0, 1, -1 and powers of two
  inline; anything else is delegated to the generic constant-division
  helper. Reads numerator/resultreg and the node fields from the
  enclosing method's scope. }
procedure genOrdConstNodeDiv;
var
helper1, helper2: TRegister;
so: tshifterop;
begin
if tordconstnode(right).value=0 then
{ a constant divisor of zero is rejected earlier in the compiler
  (see the divide-by-zero note below in pass_2); reaching this
  point indicates a compiler bug }
internalerror(2020021601)
else if tordconstnode(right).value=1 then
{ x div 1 = x: a plain register copy suffices }
cg.a_load_reg_reg(current_asmdata.CurrAsmList, OS_INT, OS_INT, numerator, resultreg)
else if (tordconstnode(right).value = int64(-1)) then
begin
// note: only in the signed case possible..., may overflow
{ x div -1 = -x; append the S postfix (NEGS) so the flags are set
  when overflow checking is enabled }
if cs_check_overflow in current_settings.localswitches then
cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_NEG,
resultreg,numerator),toppostfix(ord(cs_check_overflow in current_settings.localswitches)*ord(PF_S))));
end
else if ispowerof2(tordconstnode(right).value,power) then
begin
if (is_signed(right.resultdef)) then
begin
{ signed division by 2^power: a plain arithmetic shift would round
  towards minus infinity, so first add (2^power)-1 to negative
  numerators to get round-towards-zero semantics:
    helper1 := numerator sar 63   (all-ones iff numerator < 0)
    helper2 := numerator + (helper1 lsr (64-power))
    result  := helper2 sar power }
helper2:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
if power = 1 then
{ for power=1 only the sign bit is needed after the LSR by 63,
  so the numerator itself can serve as helper1 }
helper1:=numerator
else
begin
helper1:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,OS_INT,63,numerator,helper1);
end;
shifterop_reset(so);
so.shiftmode:=SM_LSR;
so.shiftimm:=64-power;
{ ADD with an LSR-shifted register operand folds the rounding
  correction into a single instruction }
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,helper2,numerator,helper1,so));
cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,OS_INT,power,helper2,resultreg);
end
else
{ unsigned division by 2^power is just a logical right shift }
cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SHR,OS_INT,power,numerator,resultreg)
end
else
{ Everything else is handled in the generic code }
cg.g_div_const_reg_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),
tordconstnode(right).value.svalue,numerator,resultreg);
end;
begin
secondpass(left);
secondpass(right);
{ avoid warning }
divider:=NR_NO;
{ set result location }
location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
@@ -89,16 +138,32 @@ implementation
hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
numerator:=left.location.register;
{ load divider in a register }
hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);
divider:=right.location.register;
{ start division }
if is_signed(left.resultdef) then
op:=A_SDIV
if (right.nodetype=ordconstn) and
((tordconstnode(right).value=1) or
(tordconstnode(right).value=int64(-1)) or
(tordconstnode(right).value=0) or
ispowerof2(tordconstnode(right).value,power)) then
begin
genOrdConstNodeDiv;
if nodetype=modn then
begin
divider:=cg.getintregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
cg.a_load_const_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),int64(tordconstnode(right).value),divider);
end;
end
else
op:=A_UDIV;
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,location.register,numerator,divider));
begin
{ load divider in a register }
hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);
divider:=right.location.register;
{ start division }
if is_signed(left.resultdef) then
op:=A_SDIV
else
op:=A_UDIV;
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,location.register,numerator,divider));
end;
{ no divide-by-zero detection available in hardware, emulate (if it's a
constant, this will have been detected earlier already) }

View File

@@ -1914,29 +1914,148 @@ unit aoptx86;
{ Depending on the DeepMOVOpt above, it may turn out that hp1 completely
overwrites the original destination register. e.g.
movl %reg1d,%reg2d
movslq %reg1d,%reg2q
movl ###,%reg2d
movslq ###,%reg2q (### doesn't have to be the same as the first one)
In this case, we can remove the MOV
In this case, we can remove the MOV (Go to "Mov2Nop 5" below)
}
if (taicpu(p).oper[1]^.typ = top_reg) and
MatchInstruction(hp1, [A_LEA, A_MOV, A_MOVSX, A_MOVZX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}], []) and
{ The RegInOp check makes sure that movb r/m,%reg1b; movzbl %reg1b,%reg1l"
and "movl r/m,%reg1; leal $1(%reg1,%reg2),%reg1" etc. are not incorrectly
optimised }
(taicpu(hp1).oper[1]^.typ = top_reg) and
not RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) and
Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
begin
DebugMsg(SPeepholeOptimization + 'Mov2Nop 5 done',p);
{ take care of the register (de)allocs following p }
UpdateUsedRegs(tai(p.next));
asml.remove(p);
p.free;
p:=hp1;
Result := True;
Exit;
end;
begin
if RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) then
begin
if (taicpu(hp1).oper[0]^.typ = top_reg) then
case taicpu(p).oper[0]^.typ of
top_const:
{ We have something like:
movb $x, %regb
movzbl %regb,%regd
Change to:
movl $x, %regd
}
begin
case taicpu(hp1).opsize of
S_BW:
if (taicpu(hp1).opcode <> A_MOVSX) or
(
(taicpu(p).oper[0]^.val >= 0) and
(taicpu(p).oper[0]^.val <= $7F)
) then
begin
setsubreg(taicpu(p).oper[1]^.reg, R_SUBW);
taicpu(p).opsize := S_W;
end;
S_BL:
if (taicpu(hp1).opcode <> A_MOVSX) or
(
(taicpu(p).oper[0]^.val >= 0) and
(taicpu(p).oper[0]^.val <= $7F)
) then
begin
setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
taicpu(p).opsize := S_L;
end;
S_WL:
if (taicpu(hp1).opcode <> A_MOVSX) or
(
(taicpu(p).oper[0]^.val >= 0) and
(taicpu(p).oper[0]^.val <= $7FFF)
) then
begin
setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
taicpu(p).opsize := S_L;
end;
{$ifdef x86_64}
S_BQ:
if (taicpu(hp1).opcode <> A_MOVSX) or
(
(taicpu(p).oper[0]^.val >= 0) and
(taicpu(p).oper[0]^.val <= $7F)
) then
begin
setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
taicpu(p).opsize := S_Q;
end;
S_WQ:
if (taicpu(hp1).opcode <> A_MOVSX) or
(
(taicpu(p).oper[0]^.val >= 0) and
(taicpu(p).oper[0]^.val <= $7FFF)
) then
begin
setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
taicpu(p).opsize := S_Q;
end;
S_LQ:
if (taicpu(hp1).opcode <> A_MOVSXD) or
(
(taicpu(p).oper[0]^.val >= 0) and
(taicpu(p).oper[0]^.val <= $7FFFFFFF)
) then
begin
setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
taicpu(p).opsize := S_Q;
end;
{$endif x86_64}
else
{ If hp1 was a MOV instruction, it should have been
optimised already }
InternalError(2020021001);
end;
DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 2 done',p);
asml.Remove(hp1);
hp1.Free;
Result := True;
Exit;
end;
top_ref:
{ We have something like:
movb mem, %regb
movzbl %regb,%regd
Change to:
movzbl mem, %regd
}
if IsMOVZXAcceptable or (taicpu(hp1).opcode <> A_MOVZX) then
begin
DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 1 done',p);
taicpu(hp1).loadref(0, taicpu(p).oper[0]^.ref^);
{ take care of the register (de)allocs following p }
UpdateUsedRegs(tai(p.next));
asml.remove(p);
p.free;
p:=hp1;
Result := True;
Exit;
end;
else
if (taicpu(hp1).opcode <> A_MOV) and (taicpu(hp1).opcode <> A_LEA) then
{ Just to make a saving, since there are no more optimisations with MOVZX and MOVSX/D }
Exit;
end;
end
{ The RegInOp check makes sure that movl r/m,%reg1l; movzbl (%reg1l),%reg1l"
and "movl r/m,%reg1; leal $1(%reg1,%reg2),%reg1" etc. are not incorrectly
optimised }
else
begin
DebugMsg(SPeepholeOptimization + 'Mov2Nop 5 done',p);
{ take care of the register (de)allocs following p }
UpdateUsedRegs(tai(p.next));
asml.remove(p);
p.free;
p:=hp1;
Result := True;
Exit;
end;
end;
if (taicpu(hp1).opcode = A_AND) and
(taicpu(p).oper[1]^.typ = top_reg) and
@@ -2339,27 +2458,8 @@ unit aoptx86;
Result:=true;
exit;
end;
{
mov* x,reg1
mov* y,reg1
to
mov* y,reg1
}
if (taicpu(p).oper[1]^.typ=top_reg) and
MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) and
not(RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^)) then
begin
DebugMsg(SPeepholeOptimization + 'MovMov2Mov 4 done',p);
{ take care of the register (de)allocs following p }
UpdateUsedRegs(tai(p.next));
asml.remove(p);
p.free;
p:=hp1;
Result:=true;
exit;
end;
{ mov x,reg1; mov y,reg1 -> mov y,reg1 is handled by the Mov2Nop 5 optimisation }
end;
{ search further than the next instruction for a mov }