mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-08-17 03:59:13 +02:00
+ AArch64: optimize divisions by constant
git-svn-id: trunk@44204 -
This commit is contained in:
parent
f05b51d7d8
commit
e50c4f6373
@ -76,9 +76,58 @@ implementation
|
|||||||
resultreg : tregister;
|
resultreg : tregister;
|
||||||
hl : tasmlabel;
|
hl : tasmlabel;
|
||||||
overflowloc: tlocation;
|
overflowloc: tlocation;
|
||||||
|
power: longint;
|
||||||
|
|
||||||
|
procedure genOrdConstNodeDiv;
|
||||||
|
var
|
||||||
|
helper1, helper2: TRegister;
|
||||||
|
so: tshifterop;
|
||||||
|
begin
|
||||||
|
if tordconstnode(right).value=0 then
|
||||||
|
internalerror(2020021601)
|
||||||
|
else if tordconstnode(right).value=1 then
|
||||||
|
cg.a_load_reg_reg(current_asmdata.CurrAsmList, OS_INT, OS_INT, numerator, resultreg)
|
||||||
|
else if (tordconstnode(right).value = int64(-1)) then
|
||||||
|
begin
|
||||||
|
// note: a divisor of -1 is only possible in the signed case; negating the numerator may overflow
|
||||||
|
if cs_check_overflow in current_settings.localswitches then
|
||||||
|
cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
|
||||||
|
|
||||||
|
current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_NEG,
|
||||||
|
resultreg,numerator),toppostfix(ord(cs_check_overflow in current_settings.localswitches)*ord(PF_S))));
|
||||||
|
end
|
||||||
|
else if ispowerof2(tordconstnode(right).value,power) then
|
||||||
|
begin
|
||||||
|
if (is_signed(right.resultdef)) then
|
||||||
|
begin
|
||||||
|
helper2:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
|
||||||
|
if power = 1 then
|
||||||
|
helper1:=numerator
|
||||||
|
else
|
||||||
|
begin
|
||||||
|
helper1:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
|
||||||
|
cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,OS_INT,63,numerator,helper1);
|
||||||
|
end;
|
||||||
|
shifterop_reset(so);
|
||||||
|
so.shiftmode:=SM_LSR;
|
||||||
|
so.shiftimm:=64-power;
|
||||||
|
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,helper2,numerator,helper1,so));
|
||||||
|
cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,OS_INT,power,helper2,resultreg);
|
||||||
|
end
|
||||||
|
else
|
||||||
|
cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SHR,OS_INT,power,numerator,resultreg)
|
||||||
|
end
|
||||||
|
else
|
||||||
|
{ Everything else is handled in the generic code }
|
||||||
|
cg.g_div_const_reg_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),
|
||||||
|
tordconstnode(right).value.svalue,numerator,resultreg);
|
||||||
|
end;
|
||||||
|
|
||||||
begin
|
begin
|
||||||
secondpass(left);
|
secondpass(left);
|
||||||
secondpass(right);
|
secondpass(right);
|
||||||
|
{ avoid warning }
|
||||||
|
divider:=NR_NO;
|
||||||
|
|
||||||
{ set result location }
|
{ set result location }
|
||||||
location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
|
location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
|
||||||
@ -89,16 +138,32 @@ implementation
|
|||||||
hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
|
hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
|
||||||
numerator:=left.location.register;
|
numerator:=left.location.register;
|
||||||
|
|
||||||
{ load divider in a register }
|
if (right.nodetype=ordconstn) and
|
||||||
hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);
|
((tordconstnode(right).value=1) or
|
||||||
divider:=right.location.register;
|
(tordconstnode(right).value=int64(-1)) or
|
||||||
|
(tordconstnode(right).value=0) or
|
||||||
{ start division }
|
ispowerof2(tordconstnode(right).value,power)) then
|
||||||
if is_signed(left.resultdef) then
|
begin
|
||||||
op:=A_SDIV
|
genOrdConstNodeDiv;
|
||||||
|
if nodetype=modn then
|
||||||
|
begin
|
||||||
|
divider:=cg.getintregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
|
||||||
|
cg.a_load_const_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),int64(tordconstnode(right).value),divider);
|
||||||
|
end;
|
||||||
|
end
|
||||||
else
|
else
|
||||||
op:=A_UDIV;
|
begin
|
||||||
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,location.register,numerator,divider));
|
{ load divider in a register }
|
||||||
|
hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);
|
||||||
|
divider:=right.location.register;
|
||||||
|
|
||||||
|
{ start division }
|
||||||
|
if is_signed(left.resultdef) then
|
||||||
|
op:=A_SDIV
|
||||||
|
else
|
||||||
|
op:=A_UDIV;
|
||||||
|
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,location.register,numerator,divider));
|
||||||
|
end;
|
||||||
|
|
||||||
{ no divide-by-zero detection available in hardware, emulate (if it's a
|
{ no divide-by-zero detection available in hardware, emulate (if it's a
|
||||||
constant, this will have been detected earlier already) }
|
constant, this will have been detected earlier already) }
|
||||||
|
@ -1914,29 +1914,148 @@ unit aoptx86;
|
|||||||
{ Depending on the DeepMOVOpt above, it may turn out that hp1 completely
|
{ Depending on the DeepMOVOpt above, it may turn out that hp1 completely
|
||||||
overwrites the original destination register. e.g.
|
overwrites the original destination register. e.g.
|
||||||
|
|
||||||
movl %reg1d,%reg2d
|
movl ###,%reg2d
|
||||||
movslq %reg1d,%reg2q
|
movslq ###,%reg2q (### doesn't have to be the same as the first one)
|
||||||
|
|
||||||
In this case, we can remove the MOV
|
In this case, we can remove the MOV (Go to "Mov2Nop 5" below)
|
||||||
}
|
}
|
||||||
if (taicpu(p).oper[1]^.typ = top_reg) and
|
if (taicpu(p).oper[1]^.typ = top_reg) and
|
||||||
MatchInstruction(hp1, [A_LEA, A_MOV, A_MOVSX, A_MOVZX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}], []) and
|
MatchInstruction(hp1, [A_LEA, A_MOV, A_MOVSX, A_MOVZX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}], []) and
|
||||||
{ The RegInOp check makes sure that "movb r/m,%reg1b; movzbl %reg1b,%reg1l"
|
|
||||||
and "movl r/m,%reg1; leal $1(%reg1,%reg2),%reg1" etc. are not incorrectly
|
|
||||||
optimised }
|
|
||||||
(taicpu(hp1).oper[1]^.typ = top_reg) and
|
(taicpu(hp1).oper[1]^.typ = top_reg) and
|
||||||
not RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) and
|
|
||||||
Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
|
Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
|
||||||
begin
|
begin
|
||||||
DebugMsg(SPeepholeOptimization + 'Mov2Nop 5 done',p);
|
if RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) then
|
||||||
{ take care of the register (de)allocs following p }
|
begin
|
||||||
UpdateUsedRegs(tai(p.next));
|
if (taicpu(hp1).oper[0]^.typ = top_reg) then
|
||||||
asml.remove(p);
|
case taicpu(p).oper[0]^.typ of
|
||||||
p.free;
|
top_const:
|
||||||
p:=hp1;
|
{ We have something like:
|
||||||
Result := True;
|
|
||||||
Exit;
|
movb $x, %regb
|
||||||
end;
|
movzbl %regb,%regd
|
||||||
|
|
||||||
|
Change to:
|
||||||
|
|
||||||
|
movl $x, %regd
|
||||||
|
}
|
||||||
|
begin
|
||||||
|
case taicpu(hp1).opsize of
|
||||||
|
S_BW:
|
||||||
|
if (taicpu(hp1).opcode <> A_MOVSX) or
|
||||||
|
(
|
||||||
|
(taicpu(p).oper[0]^.val >= 0) and
|
||||||
|
(taicpu(p).oper[0]^.val <= $7F)
|
||||||
|
) then
|
||||||
|
begin
|
||||||
|
setsubreg(taicpu(p).oper[1]^.reg, R_SUBW);
|
||||||
|
taicpu(p).opsize := S_W;
|
||||||
|
end;
|
||||||
|
S_BL:
|
||||||
|
if (taicpu(hp1).opcode <> A_MOVSX) or
|
||||||
|
(
|
||||||
|
(taicpu(p).oper[0]^.val >= 0) and
|
||||||
|
(taicpu(p).oper[0]^.val <= $7F)
|
||||||
|
) then
|
||||||
|
begin
|
||||||
|
setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
|
||||||
|
taicpu(p).opsize := S_L;
|
||||||
|
end;
|
||||||
|
S_WL:
|
||||||
|
if (taicpu(hp1).opcode <> A_MOVSX) or
|
||||||
|
(
|
||||||
|
(taicpu(p).oper[0]^.val >= 0) and
|
||||||
|
(taicpu(p).oper[0]^.val <= $7FFF)
|
||||||
|
) then
|
||||||
|
begin
|
||||||
|
setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
|
||||||
|
taicpu(p).opsize := S_L;
|
||||||
|
end;
|
||||||
|
{$ifdef x86_64}
|
||||||
|
S_BQ:
|
||||||
|
if (taicpu(hp1).opcode <> A_MOVSX) or
|
||||||
|
(
|
||||||
|
(taicpu(p).oper[0]^.val >= 0) and
|
||||||
|
(taicpu(p).oper[0]^.val <= $7F)
|
||||||
|
) then
|
||||||
|
begin
|
||||||
|
setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
|
||||||
|
taicpu(p).opsize := S_Q;
|
||||||
|
end;
|
||||||
|
S_WQ:
|
||||||
|
if (taicpu(hp1).opcode <> A_MOVSX) or
|
||||||
|
(
|
||||||
|
(taicpu(p).oper[0]^.val >= 0) and
|
||||||
|
(taicpu(p).oper[0]^.val <= $7FFF)
|
||||||
|
) then
|
||||||
|
begin
|
||||||
|
setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
|
||||||
|
taicpu(p).opsize := S_Q;
|
||||||
|
end;
|
||||||
|
S_LQ:
|
||||||
|
if (taicpu(hp1).opcode <> A_MOVSXD) or
|
||||||
|
(
|
||||||
|
(taicpu(p).oper[0]^.val >= 0) and
|
||||||
|
(taicpu(p).oper[0]^.val <= $7FFFFFFF)
|
||||||
|
) then
|
||||||
|
begin
|
||||||
|
setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
|
||||||
|
taicpu(p).opsize := S_Q;
|
||||||
|
end;
|
||||||
|
{$endif x86_64}
|
||||||
|
else
|
||||||
|
{ If hp1 was a MOV instruction, it should have been
|
||||||
|
optimised already }
|
||||||
|
InternalError(2020021001);
|
||||||
|
end;
|
||||||
|
DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 2 done',p);
|
||||||
|
asml.Remove(hp1);
|
||||||
|
hp1.Free;
|
||||||
|
Result := True;
|
||||||
|
Exit;
|
||||||
|
end;
|
||||||
|
top_ref:
|
||||||
|
{ We have something like:
|
||||||
|
|
||||||
|
movb mem, %regb
|
||||||
|
movzbl %regb,%regd
|
||||||
|
|
||||||
|
Change to:
|
||||||
|
|
||||||
|
movzbl mem, %regd
|
||||||
|
}
|
||||||
|
if IsMOVZXAcceptable or (taicpu(hp1).opcode <> A_MOVZX) then
|
||||||
|
begin
|
||||||
|
DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 1 done',p);
|
||||||
|
taicpu(hp1).loadref(0, taicpu(p).oper[0]^.ref^);
|
||||||
|
{ take care of the register (de)allocs following p }
|
||||||
|
UpdateUsedRegs(tai(p.next));
|
||||||
|
asml.remove(p);
|
||||||
|
p.free;
|
||||||
|
p:=hp1;
|
||||||
|
Result := True;
|
||||||
|
Exit;
|
||||||
|
end;
|
||||||
|
else
|
||||||
|
if (taicpu(hp1).opcode <> A_MOV) and (taicpu(hp1).opcode <> A_LEA) then
|
||||||
|
{ Just to make a saving, since there are no more optimisations with MOVZX and MOVSX/D }
|
||||||
|
Exit;
|
||||||
|
end;
|
||||||
|
end
|
||||||
|
{ The RegInOp check makes sure that "movl r/m,%reg1l; movzbl (%reg1l),%reg1l"
|
||||||
|
and "movl r/m,%reg1; leal $1(%reg1,%reg2),%reg1" etc. are not incorrectly
|
||||||
|
optimised }
|
||||||
|
else
|
||||||
|
begin
|
||||||
|
DebugMsg(SPeepholeOptimization + 'Mov2Nop 5 done',p);
|
||||||
|
{ take care of the register (de)allocs following p }
|
||||||
|
UpdateUsedRegs(tai(p.next));
|
||||||
|
asml.remove(p);
|
||||||
|
p.free;
|
||||||
|
p:=hp1;
|
||||||
|
Result := True;
|
||||||
|
Exit;
|
||||||
|
end;
|
||||||
|
end;
|
||||||
|
|
||||||
if (taicpu(hp1).opcode = A_AND) and
|
if (taicpu(hp1).opcode = A_AND) and
|
||||||
(taicpu(p).oper[1]^.typ = top_reg) and
|
(taicpu(p).oper[1]^.typ = top_reg) and
|
||||||
@ -2339,27 +2458,8 @@ unit aoptx86;
|
|||||||
Result:=true;
|
Result:=true;
|
||||||
exit;
|
exit;
|
||||||
end;
|
end;
|
||||||
{
|
|
||||||
mov* x,reg1
|
|
||||||
mov* y,reg1
|
|
||||||
|
|
||||||
to
|
{ mov x,reg1; mov y,reg1 -> mov y,reg1 is handled by the Mov2Nop 5 optimisation }
|
||||||
|
|
||||||
mov* y,reg1
|
|
||||||
}
|
|
||||||
if (taicpu(p).oper[1]^.typ=top_reg) and
|
|
||||||
MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) and
|
|
||||||
not(RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^)) then
|
|
||||||
begin
|
|
||||||
DebugMsg(SPeepholeOptimization + 'MovMov2Mov 4 done',p);
|
|
||||||
{ take care of the register (de)allocs following p }
|
|
||||||
UpdateUsedRegs(tai(p.next));
|
|
||||||
asml.remove(p);
|
|
||||||
p.free;
|
|
||||||
p:=hp1;
|
|
||||||
Result:=true;
|
|
||||||
exit;
|
|
||||||
end;
|
|
||||||
end;
|
end;
|
||||||
|
|
||||||
{ search further than the next instruction for a mov }
|
{ search further than the next instruction for a mov }
|
||||||
|
Loading…
Reference in New Issue
Block a user