* patch by Nico Erfurth:

Reorder unaligned Load sequence on ARM

The old version produced code like this:

ldrb rDEST, [rBASE]
ldrb rTEMP, [rBASE, #1]
orr  rDEST, rDEST, rTEMP, lsl #8 (2 stall cycles)
ldrb rTEMP, [rBASE, #2]
orr  rDEST, rDEST, rTEMP, lsl #16 (2 stall cycles)
ldrb rTEMP, [rBASE, #3]
orr  rDEST, rDEST, rTEMP, lsl #24 (2 stall cycles)

This creates a lot of stall cycles on ARM implementations with load
delay slots, such as Marvell Kirkwood or Intel XScale. With the usual up to 2
stall cycles per load, this code requires a total of 13 cycles (7 instructions
+ 6 stall cycles) in the best case.

The new code uses a second temp register to avoid the stall cycles.

ldrb rDEST, [rBASE]
ldrb rTEMP1, [rBASE, #1]
ldrb rTEMP2, [rBASE, #2]
orr  rDEST, rDEST, rTEMP1, lsl #8
ldrb rTEMP1, [rBASE, #3]
orr  rDEST, rDEST, rTEMP2, lsl #16
orr  rDEST, rDEST, rTEMP1, lsl #24 (1 stall cycle)

The rescheduling and the second temp register bring the total down to 8 cycles
(7 instructions + 1 stall cycle). If the last orr is rescheduled later on, the
total can even go down to 7.

git-svn-id: trunk@21363 -
This commit is contained in:
florian 2012-05-22 19:09:20 +00:00
parent dc03282cb7
commit c75486db89

View File

@ -177,7 +177,7 @@ unit cgcpu;
uses
globals,verbose,systems,cutils,
globals,verbose,systems,cutils,sysutils,
aopt,aoptcpu,
fmodule,
symconst,symsym,
@ -388,19 +388,26 @@ unit cgcpu;
end
else
begin
tmpreg2:=getintregister(list,OS_INT);
if target_info.endian=endian_big then
inc(usedtmpref.offset,3);
a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,reg);
inc(usedtmpref.offset,dir);
a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,tmpreg);
inc(usedtmpref.offset,dir);
a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,tmpreg2);
so.shiftimm:=8;
list.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,reg,reg,tmpreg,so));
inc(usedtmpref.offset,dir);
a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,tmpreg);
so.shiftimm:=16;
list.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,reg,reg,tmpreg,so));
inc(usedtmpref.offset,dir);
a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,tmpreg);
list.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,reg,reg,tmpreg2,so));
so.shiftimm:=24;
list.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,reg,reg,tmpreg,so));
end;
@ -706,7 +713,7 @@ unit cgcpu;
OP_SAR:
begin
if a>32 then
internalerror(200308295);
internalerror(200308298);
if a<>0 then
begin
shifterop_reset(so);
@ -1081,7 +1088,7 @@ unit cgcpu;
OS_F32:
oppostfix:=PF_None;
else
InternalError(200308295);
InternalError(200308299);
end;
if (ref.alignment in [1,2]) and (ref.alignment<tcgsize2size[tosize]) then
begin