* use add/sub instead of inc/dec on modern CPUs when optimizing for speed

git-svn-id: trunk@25057 -
This commit is contained in:
florian 2013-07-07 19:50:13 +00:00
parent af1d33767d
commit 507edb16de
3 changed files with 49 additions and 16 deletions

View File

@ -41,7 +41,8 @@ uses
{$ifdef finaldestdebug}
cobjects,
{$endif finaldestdebug}
cpuinfo,cpubase,cgutils,daopt386;
cpuinfo,cpubase,cgutils,daopt386,
cgx86;
function isFoldableArithOp(hp1: taicpu; reg: tregister): boolean;
@ -960,13 +961,13 @@ begin
if (base = taicpu(p).oper[1]^.reg) then
begin
l := offset;
if (l=1) then
if (l=1) and UseIncDec then
begin
taicpu(p).opcode := A_INC;
taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
taicpu(p).ops := 1
end
else if (l=-1) then
else if (l=-1) and UseIncDec then
begin
taicpu(p).opcode := A_DEC;
taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
@ -2121,6 +2122,8 @@ begin
end;
case taicpu(p).opcode Of
A_CALL:
{ don't do this on modern CPUs: it really hurts them because it
breaks call/ret pairing }
if (current_settings.optimizecputype < cpu_Pentium2) and
not(cs_create_pic in current_settings.moduleswitches) and
GetNextInstruction(p, hp1) and

View File

@ -167,6 +167,8 @@ unit cgx86;
function UseAVX: boolean;
function UseIncDec: boolean;
implementation
uses
@ -180,6 +182,21 @@ unit cgx86;
Result:=current_settings.fputype in fpu_avx_instructionsets;
end;
{ Returns true when inc/dec should be emitted instead of add/sub with a
  constant of 1.

  Modern CPUs prefer add/sub over inc/dec: add/sub modify all flags and
  thereby break the instruction's dependency on the previous flag state.
  inc/dec are therefore only selected when optimizing for size or, on the
  i386 and i8086 targets, when the selected CPU type is cpu_386
  (i386 target) or lies in cpu_8086..cpu_386 (i8086 target). }
function UseIncDec: boolean;
begin
  { Size optimization selects inc/dec on every target. Assigning this first
    also guarantees that Result is defined when none of the target-specific
    branches below is compiled in. }
  Result:=cs_opt_size in current_settings.optimizerswitches;
{$if defined(i386)}
  Result:=Result or (current_settings.cputype in [cpu_386]);
{$elseif defined(i8086)}
  Result:=Result or (current_settings.cputype in [cpu_8086..cpu_386]);
{$endif}
end;
const
TOpCG2AsmOp: Array[topcg] of TAsmOp = (A_NONE,A_MOV,A_ADD,A_AND,A_DIV,
A_IDIV,A_IMUL,A_MUL,A_NEG,A_NOT,A_OR,
@ -1596,11 +1613,14 @@ unit cgx86;
OP_ADD, OP_AND, OP_OR, OP_SUB, OP_XOR:
if not(cs_check_overflow in current_settings.localswitches) and
(a = 1) and
(op in [OP_ADD,OP_SUB]) then
if op = OP_ADD then
list.concat(taicpu.op_reg(A_INC,TCgSize2OpSize[size],reg))
else
list.concat(taicpu.op_reg(A_DEC,TCgSize2OpSize[size],reg))
(op in [OP_ADD,OP_SUB]) and
UseIncDec then
begin
if op = OP_ADD then
list.concat(taicpu.op_reg(A_INC,TCgSize2OpSize[size],reg))
else
list.concat(taicpu.op_reg(A_DEC,TCgSize2OpSize[size],reg))
end
else if (a = 0) then
if (op <> OP_AND) then
exit
@ -1727,11 +1747,14 @@ unit cgx86;
OP_ADD, OP_AND, OP_OR, OP_SUB, OP_XOR:
if not(cs_check_overflow in current_settings.localswitches) and
(a = 1) and
(op in [OP_ADD,OP_SUB]) then
if op = OP_ADD then
list.concat(taicpu.op_ref(A_INC,TCgSize2OpSize[size],tmpref))
else
list.concat(taicpu.op_ref(A_DEC,TCgSize2OpSize[size],tmpref))
(op in [OP_ADD,OP_SUB]) and
UseIncDec then
begin
if op = OP_ADD then
list.concat(taicpu.op_ref(A_INC,TCgSize2OpSize[size],tmpref))
else
list.concat(taicpu.op_ref(A_DEC,TCgSize2OpSize[size],tmpref))
end
else if (a = 0) then
if (op <> OP_AND) then
exit
@ -2371,7 +2394,10 @@ unit cgx86;
a_label(list,again);
decrease_sp(winstackpagesize-4);
list.concat(Taicpu.op_reg(A_PUSH,S_L,NR_EAX));
list.concat(Taicpu.op_reg(A_DEC,S_L,NR_EDI));
if UseIncDec then
list.concat(Taicpu.op_reg(A_DEC,S_L,NR_EDI))
else
list.concat(Taicpu.op_const_reg(A_SUB,S_L,1,NR_EDI));
a_jmp_cond(list,OC_NE,again);
decrease_sp(localsize mod winstackpagesize-4);
reference_reset_base(href,NR_ESP,localsize-4,4);
@ -2409,7 +2435,10 @@ unit cgx86;
decrease_sp(winstackpagesize);
reference_reset_base(href,NR_RSP,0,4);
list.concat(Taicpu.op_reg_ref(A_MOV,S_L,NR_EAX,href));
list.concat(Taicpu.op_reg(A_DEC,S_Q,NR_R10));
if UseIncDec then
list.concat(Taicpu.op_reg(A_DEC,S_Q,NR_R10))
else
list.concat(Taicpu.op_const_reg(A_SUB,S_Q,1,NR_R10));
a_jmp_cond(list,OC_NE,again);
decrease_sp(localsize mod winstackpagesize);
ungetcpuregister(list,NR_R10);

View File

@ -143,7 +143,8 @@ unit nx86add;
if (op=A_SUB) and
(right.location.loc=LOC_CONSTANT) and
(right.location.value=1) and
not(cs_check_overflow in current_settings.localswitches) then
not(cs_check_overflow in current_settings.localswitches) and
UseIncDec then
begin
emit_reg(A_DEC,TCGSize2Opsize[opsize],left.location.register);
end