o merging r22801 of Jeppe Johansen

git-svn-id: trunk@22812 -
This commit is contained in:
florian 2012-10-21 19:05:59 +00:00
commit 970405c0f3
7 changed files with 291 additions and 128 deletions

View File

@ -735,7 +735,7 @@ implementation
{ check for pre/post indexed }
result := operand_read;
//Thumb2
A_LSL, A_LSR, A_ROR, A_ASR, A_SDIV, A_UDIV,A_MOVT:
A_LSL, A_LSR, A_ROR, A_ASR, A_SDIV, A_UDIV, A_MOVW, A_MOVT, A_MLS:
if opnr in [0] then
result:=operand_write
else

View File

@ -342,7 +342,8 @@ Implementation
{There is a special requirement for MUL and MLA, oper[0] and oper[1] are not allowed to be the same}
not (
(taicpu(p).opcode in [A_MLA, A_MUL]) and
(taicpu(p).oper[1]^.reg = taicpu(movp).oper[0]^.reg)
(taicpu(p).oper[1]^.reg = taicpu(movp).oper[0]^.reg) and
(current_settings.cputype < cpu_armv6)
) and
{ Take care to only do this for instructions which REALLY load to the first register.
Otherwise
@ -1170,7 +1171,10 @@ Implementation
add reg2, ...
}
if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
RemoveSuperfluousMove(p, hp1, 'DataMov2Data');
begin
if (taicpu(p).ops=3) then
RemoveSuperfluousMove(p, hp1, 'DataMov2Data');
end;
end;
A_MVN:
begin
@ -1260,6 +1264,52 @@ Implementation
asml.remove(p);
p.free;
p:=hp1;
end
{
change
uxtb reg2,reg1
uxtb reg3,reg2
dealloc reg2
to
uxtb reg3,reg1
}
else if MatchInstruction(p, A_UXTB, [C_None], [PF_None]) and
GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
MatchInstruction(hp1, A_UXTB, [C_None], [PF_None]) and
(assigned(FindRegDealloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) or
(taicpu(p).oper[0]^.reg = taicpu(hp1).oper[0]^.reg)) and
{ reg1 must not be modified in between }
not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
begin
DebugMsg('Peephole UxtbUxtb2Uxtb done', p);
taicpu(hp1).opcode:=A_UXTB;
taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
asml.remove(p);
p.free;
p:=hp1;
end
{
change
uxth reg2,reg1
uxth reg3,reg2
dealloc reg2
to
uxth reg3,reg1
}
else if MatchInstruction(p, A_UXTH, [C_None], [PF_None]) and
GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
MatchInstruction(hp1, A_UXTH, [C_None], [PF_None]) and
(assigned(FindRegDealloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) or
(taicpu(p).oper[0]^.reg = taicpu(hp1).oper[0]^.reg)) and
{ reg1 must not be modified in between }
not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
begin
DebugMsg('Peephole UxthUxth2Uxth done', p);
taicpu(hp1).opcode:=A_UXTH;
taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
asml.remove(p);
p.free;
p:=hp1;
end;
end;
A_UXTH:
@ -1858,7 +1908,17 @@ Implementation
result:=true;
end
else if (p.typ=ait_instruction) and
MatchInstruction(p, [A_AND,A_ORR,A_EOR,A_LSL,A_LSR,A_ASR,A_ROR], [C_None], [PF_None,PF_S]) and
MatchInstruction(p, [A_ADD], [C_None], [PF_None]) and
(taicpu(p).ops = 3) and
MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) and
(taicpu(p).oper[2]^.typ=top_reg) then
begin
taicpu(p).ops := 2;
taicpu(p).loadreg(1,taicpu(p).oper[2]^.reg);
result:=true;
end
else if (p.typ=ait_instruction) and
MatchInstruction(p, [A_AND,A_ORR,A_EOR,A_BIC,A_LSL,A_LSR,A_ASR,A_ROR], [C_None], [PF_None]) and
(taicpu(p).ops = 3) and
MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) and
(taicpu(p).oper[2]^.typ=top_reg) and
@ -1873,7 +1933,7 @@ Implementation
result:=true;
end
else if (p.typ=ait_instruction) and
MatchInstruction(p, [A_AND,A_ORR,A_EOR], [], [PF_None,PF_S]) and
MatchInstruction(p, [A_AND,A_ORR,A_EOR], [C_None], [PF_None,PF_S]) and
(taicpu(p).ops = 3) and
MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[2]^) and
(not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
@ -1885,6 +1945,33 @@ Implementation
taicpu(p).ops := 2;
result:=true;
end
else if (p.typ=ait_instruction) and
MatchInstruction(p, [A_MOV], [C_None], [PF_None]) and
(taicpu(p).ops=3) and
(taicpu(p).oper[2]^.typ=top_shifterop) and
(taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_LSL,SM_LSR,SM_ASR,SM_ROR]) and
MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) and
(not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
begin
asml.InsertBefore(tai_regalloc.alloc(NR_DEFAULTFLAGS,p), p);
asml.InsertAfter(tai_regalloc.dealloc(NR_DEFAULTFLAGS,p), p);
IncludeRegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs);
taicpu(p).oppostfix:=PF_S;
taicpu(p).ops := 2;
if taicpu(p).oper[2]^.shifterop^.rs<>NR_NO then
taicpu(p).loadreg(1, taicpu(p).oper[2]^.shifterop^.rs)
else
taicpu(p).loadconst(1, taicpu(p).oper[2]^.shifterop^.shiftimm);
case taicpu(p).oper[2]^.shifterop^.shiftmode of
SM_LSL: taicpu(p).opcode:=A_LSL;
SM_LSR: taicpu(p).opcode:=A_LSR;
SM_ASR: taicpu(p).opcode:=A_ASR;
SM_ROR: taicpu(p).opcode:=A_ROR;
end;
result:=true;
end
else if (p.typ=ait_instruction) and
MatchInstruction(p, [A_AND], [], [PF_None]) and
(taicpu(p).ops = 2) and
@ -1917,6 +2004,76 @@ Implementation
result := true;
end
{
Turn
mul reg0, z,w
sub/add x, y, reg0
dealloc reg0
into
mls/mla x,y,z,w
}
else if (p.typ=ait_instruction) and
MatchInstruction(p, [A_MUL], [C_None], [PF_None]) and
(taicpu(p).ops=3) and
(taicpu(p).oper[0]^.typ = top_reg) and
(taicpu(p).oper[1]^.typ = top_reg) and
(taicpu(p).oper[2]^.typ = top_reg) and
GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
MatchInstruction(hp1,[A_ADD,A_SUB],[C_None],[PF_None]) and
(((taicpu(hp1).ops=3) and
(taicpu(hp1).oper[2]^.typ=top_reg) and
(MatchOperand(taicpu(hp1).oper[2]^, taicpu(p).oper[0]^.reg) or
(MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
(taicpu(hp1).opcode=A_ADD)))) or
((taicpu(hp1).ops=2) and
(taicpu(hp1).oper[1]^.typ=top_reg) and
MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg))) and
assigned(FindRegDealloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) and
not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
not(RegModifiedBetween(taicpu(p).oper[2]^.reg,p,hp1)) then
begin
if taicpu(hp1).opcode=A_ADD then
begin
taicpu(hp1).opcode:=A_MLA;
if taicpu(hp1).ops=3 then
if MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^) then
taicpu(hp1).loadreg(1,taicpu(hp1).oper[2]^.reg);
taicpu(hp1).loadreg(2,taicpu(p).oper[1]^.reg);
taicpu(hp1).loadreg(3,taicpu(p).oper[2]^.reg);
DebugMsg('MulAdd2MLA done', p);
taicpu(hp1).ops:=4;
asml.remove(p);
p.free;
p:=hp1;
end
else
begin
taicpu(hp1).opcode:=A_MLS;
if taicpu(hp1).ops=2 then
taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
taicpu(hp1).loadreg(2,taicpu(p).oper[1]^.reg);
taicpu(hp1).loadreg(3,taicpu(p).oper[2]^.reg);
DebugMsg('MulSub2MLS done', p);
taicpu(hp1).ops:=4;
asml.remove(p);
p.free;
p:=hp1;
end;
result:=true;
end
{else if (p.typ=ait_instruction) and
MatchInstruction(p, [A_CMP], [C_None], [PF_None]) and
(taicpu(p).oper[1]^.typ=top_const) and

View File

@ -160,6 +160,8 @@ unit cgcpu;
procedure g_proc_entry(list : TAsmList;localsize : longint;nostackframe:boolean);override;
procedure g_proc_exit(list : TAsmList;parasize : longint;nostackframe:boolean); override;
procedure a_bit_scan_reg_reg(list: TAsmList; reverse: boolean; size: TCGSize; src, dst: TRegister); override;
function handle_load_store(list:TAsmList;op: tasmop;oppostfix : toppostfix;reg:tregister;ref: treference):treference; override;
procedure a_loadmm_reg_reg(list: TAsmList; fromsize, tosize : tcgsize;reg1, reg2: tregister;shuffle : pmmshuffle); override;
@ -3170,24 +3172,12 @@ unit cgcpu;
begin
if not(size in [OS_8,OS_S8,OS_16,OS_S16,OS_32,OS_S32]) then
internalerror(2002090902);
if is_shifter_const(a,imm_shift) then
if is_thumb_imm(a) then
list.concat(taicpu.op_reg_const(A_MOV,reg,a))
{ loading of constants with mov and orr }
else if (is_shifter_const(a-byte(a),imm_shift)) then
begin
list.concat(taicpu.op_reg_const(A_MOV,reg,a-byte(a)));
list.concat(taicpu.op_reg_reg_const(A_ORR,reg,reg,byte(a)));
end
else if (is_shifter_const(a-word(a),imm_shift)) and (is_shifter_const(word(a),imm_shift)) then
begin
list.concat(taicpu.op_reg_const(A_MOV,reg,a-word(a)));
list.concat(taicpu.op_reg_reg_const(A_ORR,reg,reg,word(a)));
end
else if (is_shifter_const(a-(dword(a) shl 8) shr 8,imm_shift)) and (is_shifter_const((dword(a) shl 8) shr 8,imm_shift)) then
begin
list.concat(taicpu.op_reg_const(A_MOV,reg,a-(dword(a) shl 8) shr 8));
list.concat(taicpu.op_reg_reg_const(A_ORR,reg,reg,(dword(a) shl 8) shr 8));
end
else if is_thumb_imm(not(a)) then
list.concat(taicpu.op_reg_const(A_MVN,reg,not(a)))
else if (a and $FFFF)=a then
list.concat(taicpu.op_reg_const(A_MOVW,reg,a))
else
begin
reference_reset(hr,4);
@ -3198,6 +3188,7 @@ unit cgcpu;
current_procinfo.aktlocaldata.concat(tai_const.Create_32bit(longint(a)));
hr.symbol:=l;
hr.base:=NR_PC;
list.concat(taicpu.op_reg_ref(A_LDR,reg,hr));
end;
end;
@ -3478,6 +3469,35 @@ unit cgcpu;
so.shiftimm:=l1;
list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,dst,src,src,so));
end
{ for example : b=a*7 -> b=a*8-a with rsb instruction and shl }
else if (op in [OP_MUL,OP_IMUL]) and ispowerof2(a+1,l1) and not(cgsetflags or setflags) then
begin
if l1>32 then{does this ever happen?}
internalerror(201205181);
shifterop_reset(so);
so.shiftmode:=SM_LSL;
so.shiftimm:=l1;
list.concat(taicpu.op_reg_reg_reg_shifterop(A_RSB,dst,src,src,so));
end
else if (op in [OP_MUL,OP_IMUL]) and not(cgsetflags or setflags) and try_optimized_mul32_const_reg_reg(list,a,src,dst) then
begin
{ nothing to do on success }
end
{ x := y and 0; just clears a register, this sometimes gets generated on 64bit ops.
Just using mov x, #0 might allow some easier optimizations down the line. }
else if (op = OP_AND) and (dword(a)=0) then
list.concat(taicpu.op_reg_const(A_MOV,dst,0))
{ x := y AND $FFFFFFFF just copies the register, so use mov for better optimizations }
else if (op = OP_AND) and (not(dword(a))=0) then
list.concat(taicpu.op_reg_reg(A_MOV,dst,src))
{ BIC clears the specified bits, while AND keeps them, using BIC allows to use a
broader range of shifterconstants.}
{else if (op = OP_AND) and is_shifter_const(not(dword(a)),shift) then
list.concat(taicpu.op_reg_reg_const(A_BIC,dst,src,not(dword(a))))}
else if (op = OP_AND) and is_thumb_imm(a) then
list.concat(taicpu.op_reg_reg_const(A_MOV,dst,src,dword(a)))
else if (op = OP_AND) and is_thumb_imm(not(dword(a))) then
list.concat(taicpu.op_reg_reg_const(A_BIC,dst,src,not(dword(a))))
else
begin
tmpreg:=getintregister(list,size);
@ -3810,6 +3830,22 @@ unit cgcpu;
list.concat(taicpu.op_reg_reg(A_MOV,NR_PC,NR_R14));
end;
{ Emits a Thumb-2 bit scan of src into dst.

  reverse=true : dst := index of the highest set bit of src (31-clz(src)).
  reverse=false: dst := index of the lowest set bit of src (clz(rbit(src))).

  In both directions a zero src ends up as $FF in dst after the final UXTB.
  The size parameter is not used by this implementation. }
procedure Tthumb2cgarm.a_bit_scan_reg_reg(list: TAsmList; reverse: boolean; size: TCGSize; src, dst: TRegister);
  var
    tmpreg : tregister;
    so : tshifterop;
  begin
    if reverse then
      begin
        { clz counts the zeros above the highest set bit, so 31-clz is its
          index; for src=0 clz is 32, giving -1, which UXTB turns into $FF }
        list.Concat(taicpu.op_reg_reg(A_CLZ,dst,src));
        list.Concat(taicpu.op_reg_reg_const(A_RSB,dst,dst,31));
        list.Concat(taicpu.op_reg_reg(A_UXTB,dst,dst));
      end
    else
      begin
        { After rbit the lowest set bit of src is the highest set bit of dst,
          so clz already yields the wanted index (0..31). It must NOT be
          subtracted from 31 as in the reverse case. }
        list.Concat(taicpu.op_reg_reg(A_RBIT,dst,src));
        list.Concat(taicpu.op_reg_reg(A_CLZ,dst,dst));
        { dst is 32 only when src was 0. 31-dst is negative exactly in that
          case, so OR-ing in its sign mask (asr #31) leaves 0..31 untouched
          and sets all bits for zero input; UXTB then yields $FF, matching
          the reverse path. }
        tmpreg:=getintregister(list,OS_32);
        list.Concat(taicpu.op_reg_reg_const(A_RSB,tmpreg,dst,31));
        shifterop_reset(so);
        so.shiftmode:=SM_ASR;
        so.shiftimm:=31;
        list.Concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,dst,dst,tmpreg,so));
        list.Concat(taicpu.op_reg_reg(A_UXTB,dst,dst));
      end
  end;
function Tthumb2cgarm.handle_load_store(list:TAsmList;op: tasmop;oppostfix : toppostfix;reg:tregister;ref: treference):treference;
var

View File

@ -365,6 +365,7 @@ unit cpubase;
function is_pc(const r : tregister) : boolean; {$ifdef USEINLINE}inline;{$endif USEINLINE}
function is_shifter_const(d : aint;var imm_shift : byte) : boolean;
function is_thumb_imm(d : aint) : boolean; { Doesn't handle ROR_C detection }
function split_into_shifter_const(value : aint;var imm1: dword; var imm2: dword):boolean;
function dwarf_reg(r:tregister):shortint;
@ -550,6 +551,43 @@ unit cpubase;
result:=false;
end;
{ Returns true if the low 32 bits of d can be encoded as a Thumb-2
  "modified immediate" operand, i.e. if the bit pattern has one of the forms

    $000000XY
    $00XY00XY
    $XY00XY00
    $XYXYXYXY
    an 8-bit value shifted left by 1..24 bits (no wrap-around)

  The test only looks at the value itself; it does not deal with the carry
  out some flag-setting instructions derive from a rotated constant. }
function is_thumb_imm(d: aint): boolean;
  var
    v : dword;
    s : longint;
  begin
    { work on the raw 32-bit pattern so that negative values are treated
      like their unsigned encoding }
    v:=dword(d);
    result:=true;

    { $000000XY }
    if (v and $FF)=v then
      exit;

    { replicated forms: upper and lower halfword must be identical }
    if (v shr 16)=(v and $FFFF) then
      begin
        { $00XY00XY }
        if (v and $FF00FF00)=0 then
          exit;
        { $XY00XY00 }
        if (v and $00FF00FF)=0 then
          exit;
        { $XYXYXYXY: with equal halfwords it is enough that the two low
          bytes match; the second byte has to be masked before comparing }
        if ((v shr 8) and $FF)=(v and $FF) then
          exit;
      end;

    { 8-bit value shifted left: all set bits must fit into one 8-bit window
      that starts at bit 1..24. Patterns that wrap around from bit 31 to
      bit 0 are not accepted here. }
    for s:=1 to 24 do
      if ((v shr s) shl s=v) and ((v shr s)<=$FF) then
        exit;

    result:=false;
  end;
function split_into_shifter_const(value : aint;var imm1: dword; var imm2: dword) : boolean;
var
d, i, i2: Dword;

View File

@ -1066,6 +1066,7 @@ Const
CPUARM_HAS_CLZ, { CPU supports the CLZ instruction }
CPUARM_HAS_EDSP, { CPU supports the PLD,STRD,LDRD,MCRR and MRRC instructions }
CPUARM_HAS_REV, { CPU supports the REV instruction }
CPUARM_HAS_RBIT, { CPU supports the RBIT instruction }
CPUARM_HAS_LDREX,
CPUARM_HAS_IDIV
);
@ -1088,8 +1089,8 @@ Const
{ cpu_armv7 } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_LDREX],
{ cpu_armv7a } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_LDREX],
{ cpu_armv7r } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_LDREX],
{ cpu_armv7m } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_LDREX,CPUARM_HAS_IDIV],
{ cpu_armv7em } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_LDREX,CPUARM_HAS_IDIV]
{ cpu_armv7m } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_IDIV],
{ cpu_armv7em } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_IDIV]
);
Implementation

View File

@ -3268,7 +3268,11 @@ if (target_info.abi = abi_eabihf) then
this is not perfect but the current implementation bsf/bsr does not allow another
solution }
if CPUARM_HAS_CLZ in cpu_capabilities[init_settings.cputype] then
def_system_macro('FPC_HAS_INTERNAL_BSR');
begin
def_system_macro('FPC_HAS_INTERNAL_BSR');
if CPUARM_HAS_RBIT in cpu_capabilities[init_settings.cputype] then
def_system_macro('FPC_HAS_INTERNAL_BSF');
end;
{$endif}

View File

@ -505,140 +505,67 @@ asm
end;
{$endif}
var
fpc_system_lock: longint; export name 'fpc_system_lock';
function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
asm
// lock
ldr r3, .Lfpc_system_lock
mov r1, #1
.Lloop:
ldrex r2, [r3]
cmp r2, #0
itt eq
strexeq r2, r1, [r3]
cmpeq r2, #0
ldrex ip, [r0]
sub ip, #1
strex r3, ip, [r0]
cmp r3, #0
bne .Lloop
// do the job
ldr r1, [r0]
sub r1, r1, #1
str r1, [r0]
mov r0, r1
// unlock and return
str r2, [r3]
mov pc, lr
.Lfpc_system_lock:
.long fpc_system_lock
mov r0, ip
end;
function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
asm
// lock
ldr r3, .Lfpc_system_lock
mov r1, #1
.Lloop:
ldrex r2, [r3]
cmp r2, #0
itt eq
strexeq r2, r1, [r3]
cmpeq r2, #0
ldrex ip, [r0]
add ip, #1
strex r3, ip, [r0]
cmp r3, #0
bne .Lloop
// do the job
ldr r1, [r0]
add r1, r1, #1
str r1, [r0]
mov r0, r1
// unlock and return
str r2, [r3]
mov pc, lr
.Lfpc_system_lock:
.long fpc_system_lock
mov r0, ip
end;
function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
// lock
ldr r3, .Lfpc_system_lock
mov r2, #1
.Lloop:
ldrex r2, [r3]
cmp r2, #0
itt eq
strexeq r2, r12, [r3]
cmpeq r2, #0
ldrex ip, [r0]
strex r3, r1, [r0]
cmp r3, #0
bne .Lloop
// do the job
ldr r2, [r0]
str r1, [r0]
mov r0, r2
// unlock and return
mov r2, #0
str r2, [r3]
mov pc, lr
.Lfpc_system_lock:
.long fpc_system_lock
mov r0, ip
end;
function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
// lock
ldr r3, .Lfpc_system_lock
mov r2, #1
.Lloop:
ldrex r2, [r3]
cmp r2, #0
itt eq
strexeq r2, r12, [r3]
cmpeq r2, #0
ldrex ip, [r0]
add r2, ip, r1
strex r3, r2, [r0]
cmp r3, #0
bne .Lloop
// do the job
ldr r2, [r0]
add r1, r1, r2
str r1, [r0]
mov r0, r2
// unlock and return
mov r2, #0
str r2, [r3]
mov pc, lr
.Lfpc_system_lock:
.long fpc_system_lock
mov r0, ip
end;
function InterlockedCompareExchange(var Target: longint; NewValue: longint; Comperand: longint): longint; assembler; nostackframe;
asm
// lock
ldr r12, .Lfpc_system_lock
mov r3, #1
.Lloop:
ldrex r2, [r12]
cmp r2, #0
itt eq
strexeq r2, r1, [r12]
cmpeq r2, #0
ldrex ip, [r0]
cmp ip, r2
ite eq
strexeq r3, r1, [r0]
movne r3, #0
cmp r3, #0
bne .Lloop
// do the job
ldr r3, [r0]
cmp r3, r2
it eq
streq r1, [r0]
mov r0, r3
// unlock and return
mov r3, #0
str r3, [r12]
mov pc, lr
.Lfpc_system_lock:
.long fpc_system_lock
mov r0, ip
end;
{$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}