* ARMv7A / A64: Constant writes to memory merged to larger forms where possible

This commit is contained in:
J. Gareth "Curious Kit" Moreton 2021-12-19 00:51:33 +00:00 committed by FPK
parent 7343a12908
commit 2a50d5abf8
4 changed files with 1051 additions and 11 deletions

View File

@ -383,6 +383,9 @@ Implementation
if inherited OptPass1STR(p) or
LookForPostindexedPattern(p) then
Exit(True);
if getsupreg(taicpu(p).oper[0]^.reg) = RS_WZR then
Result := TryConstMerge(p, nil);
end;
@ -645,10 +648,12 @@ Implementation
function TCpuAsmOptimizer.OptPass1MOVZ(var p: tai): boolean;
var
hp1: tai;
ZeroReg: TRegister;
TargetReg: TRegister;
begin
Result := False;
hp1 := nil;
TargetReg := taicpu(p).oper[0]^.reg;
if (taicpu(p).oppostfix = PF_None) and (taicpu(p).condition = C_None) then
begin
if
@ -658,7 +663,7 @@ Implementation
not GetNextInstruction(p, hp1) or
{ MOVZ and MOVK/MOVN instructions undergo macro-fusion. }
not MatchInstruction(hp1, [A_MOVK, A_MOVN], [C_None], [PF_None]) or
(taicpu(hp1).oper[0]^.reg <> taicpu(p).oper[0]^.reg) then
(taicpu(hp1).oper[0]^.reg <> TargetReg) then
begin
if (taicpu(p).oper[1]^.val = 0) then
begin
@ -672,12 +677,11 @@ Implementation
}
DebugMsg(SPeepholeOptimization + 'Movz0ToMovZeroReg', p);
{ Make sure the zero register is the correct size }
ZeroReg := taicpu(p).oper[0]^.reg;
setsupreg(ZeroReg, RS_XZR);
{ Convert TargetReg to the correctly-sized zero register }
setsupreg(TargetReg, RS_XZR);
taicpu(p).opcode := A_MOV;
taicpu(p).loadreg(1, ZeroReg);
taicpu(p).loadreg(1, TargetReg);
Result := True;
Exit;
end;
@ -698,6 +702,48 @@ Implementation
exit;
end;
end;
if (getsupreg(TargetReg) <= RS_X30) and { Mostly to play safe }
GetNextInstructionUsingReg(p, hp1, TargetReg) and
(hp1.typ = ait_instruction) then
begin
case taicpu(hp1).opcode of
{$ifdef AARCH64}
A_MOVK:
{ Try to avoid too much unnecessary processing by checking to see
if the register is 32-bit }
if (getsubreg(TargetReg) = R_SUBD) and
(taicpu(hp1).oper[0]^.reg = TargetReg) and
TryConstMerge(p, hp1) then
begin
Result := True;
Exit;
end;
{$endif AARCH64}
A_STR:
{
With sequences such as:
movz w0,x
strb w0,[sp, #ofs]
movz w0,y
strb w0,[sp, #ofs+1]
Merge the constants to:
movz w0,x + (y shl 8)
strh w0,[sp, #ofs]
Only use the stack pointer or frame pointer and an even offset though
to guarantee alignment
}
if TryConstMerge(p, hp1) then
begin
Result := True;
Exit;
end;
else
;
end;
end;
end;

View File

@ -79,6 +79,7 @@ Type
function OptPass1CMP(var p: tai): Boolean;
function OptPass1STM(var p: tai): Boolean;
function OptPass1MOV(var p: tai): Boolean;
function OptPass1MOVW(var p: tai): Boolean;
function OptPass1MUL(var p: tai): Boolean;
function OptPass1MVN(var p: tai): Boolean;
function OptPass1VMov(var p: tai): Boolean;
@ -1484,6 +1485,13 @@ Implementation
if Result then
Exit;
{ If no changes were made, now try constant merging }
if TryConstMerge(p, hpfar1) then
begin
Result := True;
Exit;
end;
end;
end;
{
@ -1824,6 +1832,58 @@ Implementation
end;
{ Pass 1 peephole handler for MOVW.

  p - the MOVW instruction being examined.

  Two things are attempted with the instruction that follows p:
    * a MOVW/MOVT pair writing the same register is collapsed into a single
      MOV or MVN when the combined 32-bit value (or its complement) passes
      is_shifter_const;
    * a MOVW followed by a half-word STR of the same register is handed to
      TryConstMerge.

  Returns True if the instruction stream was changed. }
function TCpuAsmOptimizer.OptPass1MOVW(var p: tai): Boolean;
  var
    ThisReg: TRegister;
    a: aint;
    imm_shift: byte;
    hp1: tai;
  begin
    Result := False;

    { Everything below reads oper[1]^.val, which is only meaningful for a
      constant operand, so bail out for anything else }
    if taicpu(p).oper[1]^.typ <> top_const then
      Exit;

    ThisReg := taicpu(p).oper[0]^.reg;

    if GetNextInstruction(p, hp1) then
      begin
        { Can the MOVW/MOVT pair be represented by a single MOV instruction? }
        if MatchInstruction(hp1, A_MOVT, [taicpu(p).condition], []) and
          (taicpu(hp1).oper[0]^.reg = ThisReg) and
          (taicpu(hp1).oper[1]^.typ = top_const) then
          begin
            { MOVW supplies the low half-word, MOVT the high half-word }
            a := (aint(taicpu(p).oper[1]^.val) and $FFFF) or aint(taicpu(hp1).oper[1]^.val shl 16);

            if is_shifter_const(a,imm_shift) then
              begin
                DebugMsg(SPeepholeOptimization + 'MOVW/MOVT pair can encode value as a single MOV instruction (MovwMovT2Mov)', p);
                taicpu(p).opcode := A_MOV;
                taicpu(p).oper[1]^.val := a;
                RemoveInstruction(hp1);
                Result := True;
                Exit;
              end
            else if is_shifter_const(not(a),imm_shift) then
              begin
                DebugMsg(SPeepholeOptimization + 'MOVW/MOVT pair can encode value as a single MVN instruction (MovwMovT2Mvn)', p);
                taicpu(p).opcode := A_MVN;
                taicpu(p).oper[1]^.val := not(a);
                RemoveInstruction(hp1);
                Result := True;
                Exit;
              end;
          end;

        { A half-word store of the freshly loaded constant may be mergeable
          with a neighbouring store }
        if MatchInstruction(hp1, A_STR, [taicpu(p).condition], [PF_H]) and
          (taicpu(hp1).oper[0]^.reg = ThisReg) and
          TryConstMerge(p, hp1) then
          begin
            Result := True;
            Exit;
          end;
      end;
  end;
function TCpuAsmOptimizer.OptPass1MVN(var p: tai): Boolean;
var
hp1: tai;
@ -2351,6 +2411,8 @@ Implementation
Result := OptPass1LDR(p);
A_MOV:
Result := OptPass1MOV(p);
A_MOVW:
Result := OptPass1MOVW(p);
A_AND:
Result := OptPass1And(p);
A_ADD,

View File

@ -1101,7 +1101,8 @@ Const
CPUARM_HAS_IDIV,
CPUARM_HAS_THUMB_IDIV,
CPUARM_HAS_THUMB2,
CPUARM_HAS_UMULL
CPUARM_HAS_UMULL,
CPUARM_HAS_EXTENDED_CONSTANTS { has MOVW and MOVT instructions }
);
tfpuflags =
@ -1132,9 +1133,9 @@ Const
{ cpu_armv6t2 } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL],
{ cpu_armv6z } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_LDREX,CPUARM_HAS_UMULL],
{ cpu_armv6m } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_REV],
{ the identifier armv7 is should not be used, it is considered being equal to armv7a }
{ cpu_armv7 } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL],
{ cpu_armv7a } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL],
{ the identifier armv7 should not be used; it is considered equal to armv7a }
{ cpu_armv7 } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL,CPUARM_HAS_EXTENDED_CONSTANTS],
{ cpu_armv7a } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL,CPUARM_HAS_EXTENDED_CONSTANTS],
{ cpu_armv7r } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_THUMB_IDIV,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL],
{ cpu_armv7m } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_THUMB_IDIV,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL],
{ cpu_armv7em } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_THUMB_IDIV,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL]

View File

@ -61,6 +61,9 @@ Type
function OptPass2Bitwise(var p: tai): Boolean;
function OptPass2TST(var p: tai): Boolean;
{ Common code that tries to merge constant writes to sequential memory }
function TryConstMerge(var p: tai; hp1: tai): Boolean;
protected
function DoXTArithOp(var p: tai; hp1: tai): Boolean;
End;
@ -81,7 +84,7 @@ Type
Implementation
uses
cutils,verbose,globals,
cutils,verbose,globals,aoptutils,
systems,
cpuinfo,
cgobj,procinfo,
@ -2003,5 +2006,933 @@ Implementation
end;
end;
{ Common code that tries to merge constant writes to sequential stack memory
  into a single, wider write.

  p    - the instruction that starts the pattern.  In the code below this is
         a constant load (MOV/MOVW on ARM, MOVZ on AArch64) or, on AArch64
         only, an STR whose source is the zero register.  p may be advanced
         when the instruction it pointed to is removed.
  hp1  - the instruction that consumes p's register (an STR or, on AArch64,
         a MOVK), or nil when p is itself a zero-register STR.

  Returns True if the instruction stream was changed.

  Merging is only attempted for plain offset references (no index register)
  based on the stack pointer or the frame pointer, and only when the first
  offset is aligned for the merged store.

  NOTE(review): constant loads that carry their own shifter operand (for
  example "movz w0,#1,lsl #16") are not rejected by most branches below;
  confirm that callers never pass them. }
function TARMAsmOptimizer.TryConstMerge(var p: tai; hp1: tai): Boolean;
  const
{$ifdef ARM}
    LO_16_WRITE: TAsmOp = A_MOVW;
    HI_16_WRITE: TAsmOp = A_MOVT;
{$endif ARM}
{$ifdef AARCH64}
    LO_16_WRITE: TAsmOp = A_MOVZ;
    HI_16_WRITE: TAsmOp = A_MOVK;
{$endif AARCH64}
  var
    hp2, hp2_second, hp3, hp3_second, p_second, hp1_second: tai;
    ThisReg: TRegister;
    ThisRef: TReference;
    so: TShifterOp;

  { After a successful merge, looks at the instructions that follow the
    merged store and, if they form another mergeable pattern, runs
    TryConstMerge on them as well so that a later pass can combine both
    results into an even wider write. }
  procedure SearchAhead;
    begin
      { If p.opcode = A_STR, then ThisReg will be NR_NO }
      if (
{$ifdef ARM}
          { Test p itself here: p_second has not been assigned yet in this
            procedure and, at the call site, refers to an instruction that
            has already been removed }
          (p.typ = ait_instruction) and
          (
            (taicpu(p).opcode = A_MOV) or
            (taicpu(p).opcode = A_MOVW)
          )
{$endif ARM}
{$ifdef AARCH64}
          MatchInstruction(p, A_MOVZ, []) or
          (
            MatchInstruction(p, A_STR, []) and
            SetAndTest(p, hp1)
          )
{$endif AARCH64}
        ) and
        (
          (
            (ThisReg <> NR_NO) and
            (
{$ifdef AARCH64}
              (
                (getsubreg(ThisReg) = R_SUBD) and
                MatchInstruction(hp1, A_MOVK, []) and
                (taicpu(hp1).oper[0]^.reg = ThisReg) and
                GetNextInstruction(hp1, hp2) and
                MatchInstruction(hp2, A_STR, []) and
                (taicpu(hp2).oper[0]^.reg = ThisReg) and
                GetNextInstruction(hp2, p_second)
              ) or
{$endif AARCH64}
              (
                MatchInstruction(hp1, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, []) and
                (taicpu(hp1).oper[0]^.reg = ThisReg) and
                GetNextInstruction(hp1, p_second)
              )
            )
          ) or (
            { Just search one ahead if ThisReg is NR_NO }
            (ThisReg = NR_NO) and
            GetNextInstruction(hp1, p_second)
          )
        ) and
        (
          (
{$ifdef ARM}
            (p_second.typ = ait_instruction) and
            (taicpu(p_second).condition = taicpu(p).condition) and
            (
              (taicpu(p_second).opcode = A_MOV) or
              (taicpu(p_second).opcode = A_MOVW)
            ) and
{$endif ARM}
{$ifdef AARCH64}
            MatchInstruction(p_second, A_MOVZ, []) and
{$endif AARCH64}
            { Don't use ThisReg because it may be NR_NO }
            GetNextInstruction(p_second, hp1_second) and
            (
{$ifdef AARCH64}
              (
                MatchInstruction(hp1_second, A_MOVK, []) and
                GetNextInstruction(hp1_second, hp2_second) and
                MatchInstruction(hp2_second, A_STR, [PF_None])
              ) or
{$endif AARCH64}
              MatchInstruction(hp1_second, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [])
            )
          )
{$ifdef AARCH64}
          or (
            MatchInstruction(p_second, A_STR, []) and
            (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) and
            { Negate the result because we're setting hp1_second to nil }
            not SetAndTest(nil, hp1_second)
          )
{$endif AARCH64}
        ) then
        TryConstMerge(p_second, hp1_second);
    end;

  begin
    Result := False;

{$ifdef ARM}
    { We need a Cortex-A ARM processor that supports MOVW and MOVT }
    if not (CPUARM_HAS_EXTENDED_CONSTANTS in cpu_capabilities[current_settings.cputype]) then
      Exit;
{$endif ARM}

    ThisReg := NR_NO; { Safe initialisation }

    case taicpu(p).opcode of
{$ifdef ARM}
      A_MOV,
      A_MOVW:
        if (taicpu(p).opcode <> A_MOV) or (taicpu(p).oper[1]^.typ = top_const) then
{$endif ARM}
{$ifdef AARCH64}
      A_MOVZ:
{$endif AARCH64}
          begin
            ThisReg := taicpu(p).oper[0]^.reg;
            if Assigned(hp1){$ifdef ARM} and (taicpu(hp1).condition = taicpu(p).condition){$endif ARM} then
              case taicpu(hp1).opcode of
                A_STR:
                  if {$ifdef ARM}(taicpu(hp1).ops = 2) and {$endif ARM}SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, ThisReg) then
                    begin
                      { Work on a copy of the reference so the offset can be
                        advanced while probing the following stores }
                      ThisRef := taicpu(hp1).oper[1]^.ref^;
                      if (ThisRef.addressmode = AM_OFFSET) and
                        (ThisRef.index = NR_NO) and
                        { Only permit writes to the stack, since we can guarantee alignment with that }
                        (
                          (ThisRef.base = NR_STACK_POINTER_REG) or
                          (ThisRef.base = current_procinfo.framepointer)
                        ) then
                        begin
                          case taicpu(hp1).oppostfix of
                            PF_B:
                              {
                                With sequences such as:
                                  movz w0,x
                                  strb w0,[sp, #ofs]
                                  movz w0,y
                                  strb w0,[sp, #ofs+1]

                                Merge the constants to:
                                  movz w0,x + (y shl 8)
                                  strh w0,[sp, #ofs]

                                Only use the stack pointer or frame pointer and an even offset though
                                to guarantee alignment
                              }
                              if ((ThisRef.offset mod 2) = 0) and
                                GetNextInstruction(hp1, p_second) and
                                (p_second.typ = ait_instruction)
{$ifdef ARM}
                                and (taicpu(p_second).condition = taicpu(p).condition)
{$endif ARM}
                                then
                                begin
                                  case taicpu(p_second).opcode of
{$ifdef ARM}
                                    A_MOV,
                                    A_MOVW:
                                      if (taicpu(p_second).oppostfix = PF_None) and
                                        ((taicpu(p_second).opcode <> A_MOV) or (taicpu(p_second).oper[1]^.typ = top_const)) then
{$endif ARM}
{$ifdef AARCH64}
                                    A_MOVZ:
{$endif AARCH64}
                                        begin
                                          if SuperRegistersEqual(taicpu(p_second).oper[0]^.reg, ThisReg) and
                                            GetNextInstruction(p_second, hp1_second) and
                                            MatchInstruction(hp1_second, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [PF_B]) and
                                            SuperRegistersEqual(taicpu(hp1_second).oper[0]^.reg, ThisReg) then
                                            begin
                                              { Is the second storage location exactly one byte ahead? }
                                              Inc(ThisRef.offset);
                                              if RefsEqual(taicpu(hp1_second).oper[1]^.ref^, ThisRef) and
                                                { The final safety check... make sure the register used
                                                  to store the constant isn't used afterwards }
                                                RegEndOfLife(ThisReg, taicpu(hp1_second)) then
                                                begin
                                                  { See if we can merge 4 bytes at once (this benefits ARM mostly, but provides a speed boost for AArch64 too) }
                                                  { NOTE(review): on ARM both alternatives below match MOVW
                                                    only, since LO_16_WRITE is A_MOVW there; confirm whether
                                                    a constant MOV was meant to be accepted as well }
                                                  if
                                                    { The merged store is four bytes wide, so the first
                                                      offset (ThisRef.offset - 1 at this point) must be a
                                                      multiple of 4 }
                                                    (((ThisRef.offset - 1) mod 4) = 0) and
                                                    GetNextInstruction(hp1_second, hp2) and
                                                    (
{$ifdef ARM}
                                                      MatchInstruction(hp2, A_MOVW, [taicpu(p).condition], []) or
{$endif ARM}
                                                      (
                                                        MatchInstruction(hp2, LO_16_WRITE{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [])
{$ifdef ARM}
                                                        and (taicpu(hp2).oper[1]^.typ = top_const)
{$endif ARM}
                                                      )
                                                    ) and
                                                    SuperRegistersEqual(taicpu(hp2).oper[0]^.reg, ThisReg) and
                                                    GetNextInstruction(hp2, hp2_second) and
                                                    MatchInstruction(hp2_second, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [PF_B]) and
                                                    SuperRegistersEqual(taicpu(hp2_second).oper[0]^.reg, ThisReg) and
                                                    GetNextInstruction(hp2_second, hp3) and
                                                    (
{$ifdef ARM}
                                                      MatchInstruction(hp3, A_MOVW, [taicpu(p).condition], []) or
{$endif ARM}
                                                      (
                                                        MatchInstruction(hp3, LO_16_WRITE{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [])
{$ifdef ARM}
                                                        and (taicpu(hp3).oper[1]^.typ = top_const)
{$endif ARM}
                                                      )
                                                    ) and
                                                    SuperRegistersEqual(taicpu(hp3).oper[0]^.reg, ThisReg) and
                                                    GetNextInstruction(hp3, hp3_second) and
                                                    MatchInstruction(hp3_second, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [PF_B]) and
                                                    SuperRegistersEqual(taicpu(hp3_second).oper[0]^.reg, ThisReg) and
                                                    { The register ends up holding the merged 32-bit value
                                                      rather than the fourth byte, so it must also be dead
                                                      after the last store }
                                                    RegEndOfLife(ThisReg, taicpu(hp3_second)) then
                                                    begin
                                                      Inc(ThisRef.offset);
                                                      if RefsEqual(taicpu(hp2_second).oper[1]^.ref^, ThisRef) then
                                                        begin
                                                          Inc(ThisRef.offset);
                                                          if RefsEqual(taicpu(hp3_second).oper[1]^.ref^, ThisRef) then
                                                            begin
                                                              { Merge the constants }
                                                              DebugMsg(SPeepholeOptimization + 'Merged four byte-writes to memory into a single word-write (MovzStrbMovzStrbMovzStrbMovzStrb2MovzMovkStr)', p);
{$ifdef ARM}
                                                              taicpu(p).opcode := A_MOVW;
{$endif ARM}
                                                              taicpu(p).oper[1]^.val := (taicpu(p).oper[1]^.val and $FF) or ((taicpu(p_second).oper[1]^.val and $FF) shl 8);
                                                              taicpu(hp2).opcode := HI_16_WRITE;
                                                              taicpu(hp2).oper[1]^.val := (taicpu(hp2).oper[1]^.val and $FF) or ((taicpu(hp3).oper[1]^.val and $FF) shl 8);
{$ifdef AARCH64}
                                                              { Only MOVK needs the explicit shift; ARM's MOVT
                                                                always writes the upper half-word and takes no
                                                                shifter operand }
                                                              so.shiftimm := 16;
                                                              so.shiftmode := SM_LSL;
                                                              taicpu(hp2).loadshifterop(2, so);
                                                              taicpu(hp2).ops := 3;
{$endif AARCH64}
                                                              taicpu(hp1).oppostfix := PF_None;

                                                              { Move the upper half-word write directly after the lower one }
                                                              AsmL.Remove(hp2);
                                                              AsmL.InsertAfter(hp2, p);

                                                              RemoveInstruction(p_second);
                                                              RemoveInstruction(hp1_second);
                                                              RemoveInstruction(hp2_second);
                                                              RemoveInstruction(hp3);
                                                              RemoveInstruction(hp3_second);
                                                              Result := True;
{$ifdef AARCH64}
                                                              { Searching ahead only benefits AArch64 here }
                                                              SearchAhead;
{$endif AARCH64}
                                                              Exit;
                                                            end;
                                                          { Reset the offset so the range check below is correct }
                                                          Dec(ThisRef.offset);
                                                        end;
                                                      Dec(ThisRef.offset);
                                                    end;
{$ifdef ARM}
                                                  { Be careful. strb and str support offsets between -4095 and +4095, but
                                                    strh only supports offsets between -255 and +255.  However, we might be
                                                    able to bypass this if there are four bytes in a row (for AArch64, just
                                                    use SearchAhead below) }
                                                  if { Remember we added 1 to the offset }
                                                    (ThisRef.offset >= -254) and (ThisRef.offset <= 256) then
{$endif ARM}
                                                    begin
                                                      { Merge the constants and remove the second pair of instructions }
                                                      DebugMsg(SPeepholeOptimization + 'Merged two byte-writes to memory into a single half-write (MovzStrbMovzStrb2MovzStrh)', p);
{$ifdef ARM}
                                                      taicpu(p).opcode := A_MOVW;
{$endif ARM}
                                                      taicpu(p).oper[1]^.val := (taicpu(p).oper[1]^.val and $FF) or ((taicpu(p_second).oper[1]^.val and $FF) shl 8);
                                                      taicpu(hp1).oppostfix := PF_H;
                                                      RemoveInstruction(p_second);
                                                      RemoveInstruction(hp1_second);
                                                      Result := True;
                                                    end;
                                                end;
                                            end;
                                        end;
{$ifdef AARCH64}
                                    A_STR:
                                      { Sometimes, the second mov might not be present as we're writing the
                                        zero register to the next address - that is:
                                          movz w0,x
                                          strb w0,[sp, #ofs]
                                          strb wzr,[sp, #ofs+1]

                                        Which becomes:
                                          movz w0,x
                                          strh w0,[sp, #ofs]
                                      }
                                      if RegEndOfLife(ThisReg, taicpu(hp1)) and
                                        (taicpu(p_second).oppostfix = PF_B) and
                                        (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) then
                                        begin
                                          { Is the second storage location exactly one byte ahead? }
                                          Inc(ThisRef.offset);
                                          if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
                                            begin
                                              { Merge the constants and remove the zero-register store }
                                              DebugMsg(SPeepholeOptimization + 'Merged a byte-write and a zero-register byte-write to memory into a single half-write (MovzStrbStrb2MovzStrh 1)', p);
                                              taicpu(p).oper[1]^.val := taicpu(p).oper[1]^.val and $FF; { In case there's some extraneous bits }
                                              taicpu(hp1).oppostfix := PF_H;
                                              RemoveInstruction(p_second);
                                              Result := True;
                                            end;
                                        end;
{$endif AARCH64}
                                    else
                                      ;
                                  end;

                                  { Search ahead to see if more bytes are written individually,
                                    because then we may be able to merge 4 bytes into a full
                                    word write in a single pass }
                                  if Result then
                                    begin
                                      SearchAhead;
                                      Exit;
                                    end;
                                end;
                            PF_H:
                              {
                                With sequences such as:
                                  movz w0,x
                                  strh w0,[sp, #ofs]
                                  movz w0,y
                                  strh w0,[sp, #ofs+2]

                                Merge the constants to:
                                  movz w0,x
                                  movk w0,y,lsl #16
                                  str  w0,[sp, #ofs]

                                Only use the stack pointer or frame pointer and an offset
                                that's a multiple of 4 though to guarantee alignment
                              }
                              if ((ThisRef.offset mod 4) = 0) and
                                GetNextInstruction(hp1, p_second) and
                                (p_second.typ = ait_instruction)
{$ifdef ARM}
                                and (taicpu(p_second).condition = taicpu(p).condition)
{$endif ARM}
                                then
                                begin
                                  case taicpu(p_second).opcode of
{$ifdef ARM}
                                    A_MOV,
                                    A_MOVW:
                                      { It is the second constant load that must be validated
                                        here; p was already checked on entry }
                                      if (taicpu(p_second).oppostfix = PF_None) and
                                        ((taicpu(p_second).opcode <> A_MOV) or (taicpu(p_second).oper[1]^.typ = top_const)) then
{$endif ARM}
{$ifdef AARCH64}
                                    A_MOVZ:
{$endif AARCH64}
                                        begin
                                          if SuperRegistersEqual(taicpu(p_second).oper[0]^.reg, ThisReg) and
                                            GetNextInstruction(p_second, hp1_second) and
                                            MatchInstruction(hp1_second, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [PF_H]) and
                                            SuperRegistersEqual(taicpu(hp1_second).oper[0]^.reg, ThisReg) then
                                            begin
                                              { Is the second storage location exactly two bytes ahead? }
                                              Inc(ThisRef.offset, 2);
                                              if RefsEqual(taicpu(hp1_second).oper[1]^.ref^, ThisRef) and
                                                { The final safety check... make sure the register used
                                                  to store the constant isn't used afterwards }
                                                RegEndOfLife(ThisReg, taicpu(hp1_second)) then
                                                begin
                                                  { Merge the constants }
                                                  DebugMsg(SPeepholeOptimization + 'Merged two half-writes to memory into a single word-write (MovzStrhMovzStrh2MovzMovkStr)', p);

                                                  { Repurpose the second MOVZ instruction into a MOVK instruction }
                                                  if taicpu(p_second).oper[1]^.val = 0 then
                                                    begin
                                                      { Or just remove it if it's not needed }
                                                      RemoveInstruction(p_second);
{$ifdef ARM}
                                                      { If within the range 0..255, MOV suffices (256 can also be encoded this way) }
                                                      if (taicpu(p).oper[1]^.val < 0) or (taicpu(p).oper[1]^.val > 256) then
                                                        taicpu(p).opcode := A_MOVW;
{$endif ARM}
                                                    end
                                                  else
                                                    begin
                                                      { Place the upper half-word write directly after the lower one }
                                                      asml.Remove(p_second);
                                                      asml.InsertAfter(p_second, p);
{$ifdef ARM}
                                                      taicpu(p).opcode := A_MOVW;
{$endif ARM}
                                                      taicpu(p_second).opcode := HI_16_WRITE;
{$ifdef AARCH64}
                                                      so.shiftmode := SM_LSL;
                                                      so.shiftimm := 16;
                                                      taicpu(p_second).ops := 3;
                                                      taicpu(p_second).loadshifterop(2, so);

                                                      { Make doubly sure we're only using the 32-bit register, otherwise STR could write 64 bits }
                                                      setsubreg(ThisReg, R_SUBD);
                                                      taicpu(p).oper[0]^.reg := ThisReg;
                                                      taicpu(p_second).oper[0]^.reg := ThisReg;
                                                      taicpu(hp1).oper[0]^.reg := ThisReg;
{$endif AARCH64}
                                                      { TODO: Confirm that the A_MOVZ / A_MOVK combination is the most efficient }
                                                    end;

                                                  taicpu(hp1).oppostfix := PF_None;
                                                  RemoveInstruction(hp1_second);
                                                  Result := True;
                                                end;
                                            end;
                                        end;
{$ifdef AARCH64}
                                    A_STR:
                                      { Sometimes, the second mov might not be present as we're writing the
                                        zero register to the next address - that is:
                                          movz w0,x
                                          strh w0,[sp, #ofs]
                                          strh wzr,[sp, #ofs+2]

                                        Which becomes:
                                          movz w0,x
                                          str  w0,[sp, #ofs]
                                      }
                                      if RegEndOfLife(ThisReg, taicpu(hp1)) and
                                        (taicpu(p_second).oppostfix = PF_H) and
                                        (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) then
                                        begin
                                          { Is the second storage location exactly two bytes ahead? }
                                          Inc(ThisRef.offset, 2);
                                          if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
                                            begin
                                              { Merge the constants and remove the zero-register store }
                                              DebugMsg(SPeepholeOptimization + 'Merged a half-write and a zero-register half-write to memory into a single word-write (MovzStrhStrh2MovzStr)', p);

                                              { Make doubly sure we're only using the 32-bit register, otherwise STR could write 64 bits }
                                              setsubreg(ThisReg, R_SUBD);
                                              taicpu(p).oper[0]^.reg := ThisReg;
                                              taicpu(hp1).oper[0]^.reg := ThisReg;

                                              taicpu(hp1).oppostfix := PF_None;
                                              RemoveInstruction(p_second);
                                              Result := True;
                                            end;
                                        end;
{$endif AARCH64}
                                    else
                                      ;
                                  end;
{$ifdef AARCH64}
                                  { Search ahead to see if more half-words are written
                                    individually, because then we may be able to merge
                                    4 half-words into a full extended write in a single pass }
                                  if Result then
                                    begin
                                      SearchAhead;
                                      Exit;
                                    end;
{$endif AARCH64}
                                end;
                            else
                              ;
                          end;
                        end;
                    end;
{$ifdef AARCH64}
                A_MOVK:
                  if (getsubreg(ThisReg) = R_SUBD) and
                    (taicpu(hp1).oper[0]^.reg = ThisReg) and
                    (taicpu(hp1).ops = 3) and
                    (taicpu(hp1).oper[2]^.shifterop^.shiftmode = SM_LSL) and
                    (taicpu(hp1).oper[2]^.shifterop^.shiftimm = 16) and
                    GetNextInstruction(hp1, hp2) and
                    MatchInstruction(hp2, A_STR, [PF_None]) and
                    (taicpu(hp2).oper[0]^.reg = ThisReg) then
                    begin
                      {
                        With sequences such as:
                          movz w0,x
                          movk w0,y,lsl #16
                          str  w0,[sp, #ofs]
                          movz w0,z
                          movk w0,q,lsl #16
                          str  w0,[sp, #ofs+4]

                        Merge the constants to:
                          movz x0,x
                          movk x0,y,lsl #16
                          movk x0,z,lsl #32
                          movk x0,q,lsl #48
                          str  x0,[sp, #ofs]

                        Only use the stack pointer or frame pointer and an offset
                        that's a multiple of 8 though to guarantee alignment
                      }
                      { NOTE(review): unlike the STR branches, this branch does not
                        check the reference's base register, index or addressing
                        mode - confirm whether that is intended }
                      ThisRef := taicpu(hp2).oper[1]^.ref^;
                      if ((ThisRef.offset mod 8) = 0) and
                        GetNextInstruction(hp2, p_second) and
                        (p_second.typ = ait_instruction) then
                        case taicpu(p_second).opcode of
                          A_MOVZ:
                            if (
                                (taicpu(p_second).oper[0]^.reg = ThisReg) or
                                (
                                  RegEndOfLife(ThisReg, taicpu(hp2)) and
                                  (getsubreg(taicpu(p_second).oper[0]^.reg) = R_SUBD)
                                )
                              ) and GetNextInstruction(p_second, hp1_second) and
                              { The opcode is read below, so this must be an instruction }
                              (hp1_second.typ = ait_instruction) then
                              begin
                                case taicpu(hp1_second).opcode of
                                  A_MOVK:
                                    if (taicpu(p_second).oper[1]^.val <= $FFFF) and
                                      (taicpu(hp1_second).oper[0]^.reg = taicpu(p_second).oper[0]^.reg) and
                                      (taicpu(hp1_second).ops = 3) and
                                      (taicpu(hp1_second).oper[2]^.shifterop^.shiftmode = SM_LSL) and
                                      (taicpu(hp1_second).oper[2]^.shifterop^.shiftimm = 16) and
                                      GetNextInstruction(hp1_second, hp2_second) and
                                      MatchInstruction(hp2_second, A_STR, [PF_None]) and
                                      { The second store must write the register that was just built }
                                      (taicpu(hp2_second).oper[0]^.reg = taicpu(p_second).oper[0]^.reg) then
                                      begin
                                        { Is the second storage location exactly four bytes ahead? }
                                        Inc(ThisRef.offset, 4);
                                        if RefsEqual(taicpu(hp2_second).oper[1]^.ref^, ThisRef) and
                                          { The final safety check... make sure the register used
                                            to store the constant isn't used afterwards }
                                          RegEndOfLife(taicpu(p_second).oper[0]^.reg, taicpu(hp2_second)) then
                                          begin
                                            DebugMsg(SPeepholeOptimization + 'Merged two word-writes to memory into a single extended-write (MovzMovkStrMovzMovkStr2MovzMovkMovkMovkStr)', p);

                                            { Extend register to 64-bit and repurpose second MOVZ to a MOVK with lsl 32 }
                                            setsubreg(ThisReg, R_SUBQ);
                                            taicpu(p).oper[0]^.reg := ThisReg;
                                            taicpu(hp1).oper[0]^.reg := ThisReg;

                                            { If the 3rd word is zero, we can remove the instruction entirely }
                                            if taicpu(p_second).oper[1]^.val = 0 then
                                              RemoveInstruction(p_second)
                                            else
                                              begin
                                                taicpu(p_second).oper[0]^.reg := ThisReg;

                                                so.shiftimm := 32;
                                                so.shiftmode := SM_LSL;
                                                taicpu(p_second).opcode := A_MOVK;
                                                taicpu(p_second).ops := 3;
                                                taicpu(p_second).loadshifterop(2, so);

                                                { Move it ahead of the (now 64-bit) store }
                                                AsmL.Remove(p_second);
                                                AsmL.InsertBefore(p_second, hp2);
                                              end;

                                            taicpu(hp1_second).oper[0]^.reg := ThisReg;
                                            taicpu(hp1_second).oper[2]^.shifterop^.shiftimm := 48;

                                            taicpu(hp2).oper[0]^.reg := ThisReg;

                                            AsmL.Remove(hp1_second);
                                            AsmL.InsertBefore(hp1_second, hp2);

                                            RemoveInstruction(hp2_second);
                                            Result := True;
                                          end;
                                      end;
                                  else
                                    ;
                                end;
                              end;
                          A_STR:
                            { Sometimes, the second mov might not be present as we're writing the
                              zero register to the next address - that is:
                                movz w0,x
                                movk w0,y,lsl #16
                                str  w0,[sp, #ofs]
                                str  wzr,[sp, #ofs+4]

                              Which becomes:
                                movz x0,x
                                movk x0,y,lsl #16
                                str  x0,[sp, #ofs]
                            }
                            begin
                              { Don't need to check end-of-life because the upper 32 bits are zero
                                and the overall value isn't being modified }
                              if (taicpu(p_second).oppostfix = PF_None) and
                                (taicpu(p_second).oper[0]^.reg = NR_WZR) then
                                begin
                                  { Is the second storage location exactly four bytes ahead? }
                                  Inc(ThisRef.offset, 4);
                                  if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
                                    begin
                                      { Widen the register and remove the zero-register store }
                                      DebugMsg(SPeepholeOptimization + 'Merged a word-write and a zero-register word-write to memory into a single extended-write (MovzStrStr2MovzStr)', p);
                                      setsubreg(taicpu(p).oper[0]^.reg, R_SUBQ);
                                      setsubreg(taicpu(hp1).oper[0]^.reg, R_SUBQ);
                                      setsubreg(taicpu(hp2).oper[0]^.reg, R_SUBQ);
                                      RemoveInstruction(p_second);
                                      Result := True;
                                    end;
                                end;
                            end
                          else
                            ;
                        end;
                    end;
{$endif AARCH64}
                else
                  ;
              end;
          end;
{$ifdef AARCH64}
      A_STR:
        { hp1 is probably nil }
        if getsupreg(taicpu(p).oper[0]^.reg) = RS_WZR then
          begin
            ThisRef := taicpu(p).oper[1]^.ref^;
            if (ThisRef.addressmode = AM_OFFSET) and
              (ThisRef.index = NR_NO) and
              { Only permit writes to the stack, since we can guarantee alignment with that }
              (
                (ThisRef.base = NR_STACK_POINTER_REG) or
                (ThisRef.base = current_procinfo.framepointer)
              ) then
              begin
                case taicpu(p).oppostfix of
                  PF_B:
                    {
                      With sequences such as:
                        strb wzr,[sp, #ofs]
                        movz w0,x
                        strb w0,[sp, #ofs+1]

                      Merge the constants to:
                        movz w0,x shl 8
                        strh w0,[sp, #ofs]

                      Only use the stack pointer or frame pointer and an even offset though
                      to guarantee alignment
                    }
                    if ((ThisRef.offset mod 2) = 0) and
                      GetNextInstruction(p, p_second) and
                      (p_second.typ = ait_instruction) then
                      begin
                        case taicpu(p_second).opcode of
                          A_MOVZ:
                            begin
                              ThisReg := taicpu(p_second).oper[0]^.reg;
                              if GetNextInstruction(p_second, hp1_second) and
                                MatchInstruction(hp1_second, A_STR, [PF_B]) and
                                SuperRegistersEqual(taicpu(hp1_second).oper[0]^.reg, ThisReg) then
                                begin
                                  { Is the second storage location exactly one byte ahead? }
                                  Inc(ThisRef.offset);
                                  if RefsEqual(taicpu(hp1_second).oper[1]^.ref^, ThisRef) and
                                    { The final safety check... make sure the register used
                                      to store the constant isn't used afterwards }
                                    RegEndOfLife(ThisReg, taicpu(hp1_second)) then
                                    begin
                                      { Merge the constants by shifting the constant in the move, widening
                                        the second STR and removing the zero-register STR }
                                      DebugMsg(SPeepholeOptimization + 'Merged a zero-register byte-write and a byte-write to memory into a single half-write (MovzStrbStrb2MovzStrh 2)', p);
                                      taicpu(p_second).oper[1]^.val := (taicpu(p_second).oper[1]^.val and $FF) shl 8;
                                      taicpu(hp1_second).oppostfix := PF_H;
                                      Dec(taicpu(hp1_second).oper[1]^.ref^.offset, 1);
                                      RemoveCurrentP(p, p_second);
                                      Result := True;
                                      hp1 := hp1_second; { So SearchAhead works properly below }
                                    end;
                                end;
                            end;
                          A_STR:
                            { Change:
                                strb wzr,[sp, #ofs]
                                strb wzr,[sp, #ofs+1]

                              To:
                                strh wzr,[sp, #ofs]
                            }
                            if (taicpu(p_second).oppostfix = PF_B) and
                              (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) then
                              begin
                                { Is the second storage location exactly one byte ahead? }
                                Inc(ThisRef.offset);
                                if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
                                  begin
                                    DebugMsg(SPeepholeOptimization + 'Merged two zero-register byte-writes to memory into a single zero-register half-write (StrbStrb2Strh)', p);
                                    taicpu(p).oppostfix := PF_H;
                                    RemoveInstruction(p_second);
                                    Result := True;
                                  end;
                              end;
                          else
                            ;
                        end;

                        { Search ahead to see if more bytes are written individually,
                          because then we may be able to merge 4 bytes into a full
                          word write in a single pass }
                        if Result then
                          begin
                            SearchAhead;
                            Exit;
                          end;
                      end;
                  PF_H:
                    {
                      With sequences such as:
                        strh wzr,[sp, #ofs]
                        movz w0,x
                        strh w0,[sp, #ofs+2]

                      Merge the constants to:
                        movz w0,#0
                        movk w0,x,lsl #16
                        str  w0,[sp, #ofs]

                      Only use the stack pointer or frame pointer and an offset
                      that's a multiple of 4 though to guarantee alignment
                    }
                    if ((ThisRef.offset mod 4) = 0) and
                      GetNextInstruction(p, p_second) and
                      (p_second.typ = ait_instruction) then
                      begin
                        case taicpu(p_second).opcode of
                          A_MOVZ:
                            begin
                              ThisReg := taicpu(p_second).oper[0]^.reg;
                              if GetNextInstruction(p_second, hp1_second) and
                                MatchInstruction(hp1_second, A_STR, [PF_H]) and
                                SuperRegistersEqual(taicpu(hp1_second).oper[0]^.reg, ThisReg) then
                                begin
                                  { Is the second storage location exactly two bytes ahead? }
                                  Inc(ThisRef.offset, 2);
                                  if RefsEqual(taicpu(hp1_second).oper[1]^.ref^, ThisRef) and
                                    { The final safety check... make sure the register used
                                      to store the constant isn't used afterwards }
                                    RegEndOfLife(ThisReg, taicpu(hp1_second)) then
                                    begin
                                      { Merge the constants }
                                      DebugMsg(SPeepholeOptimization + 'Merged a zero-register half-write and a half-write to memory into a single word-write (StrhMovzStrh2MovzMovkStr)', p);

                                      { Repurpose the first STR to a MOVZ instruction }
                                      taicpu(p).opcode := A_MOVZ;
                                      taicpu(p).oppostfix := PF_None;
                                      taicpu(p).oper[0]^.reg := ThisReg;
                                      taicpu(p).loadconst(1, 0);

                                      so.shiftmode := SM_LSL;
                                      so.shiftimm := 16;
                                      taicpu(p_second).opcode := A_MOVK;
                                      taicpu(p_second).ops := 3;
                                      taicpu(p_second).loadshifterop(2, so);

                                      { Make doubly sure we're only using the 32-bit register, otherwise STR could write 64 bits }
                                      setsubreg(ThisReg, R_SUBD);
                                      taicpu(p).oper[0]^.reg := ThisReg;
                                      taicpu(p_second).oper[0]^.reg := ThisReg;
                                      taicpu(hp1_second).oper[0]^.reg := ThisReg;

                                      { TODO: Confirm that the A_MOVZ / A_MOVK combination is the most efficient }

                                      { The surviving store now covers both half-words, starting at the first address }
                                      taicpu(hp1_second).oppostfix := PF_None;
                                      Dec(taicpu(hp1_second).oper[1]^.ref^.offset, 2);
                                      Result := True;
                                    end;
                                end;
                            end;
                          A_STR:
                            { Change:
                                strh wzr,[sp, #ofs]
                                strh wzr,[sp, #ofs+2]

                              To:
                                str  wzr,[sp, #ofs]
                            }
                            if (taicpu(p_second).oppostfix = PF_H) and
                              (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) then
                              begin
                                { Is the second storage location exactly two bytes ahead? }
                                Inc(ThisRef.offset, 2);
                                if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
                                  begin
                                    DebugMsg(SPeepholeOptimization + 'Merged two zero-register half-writes to memory into a single zero-register word-write (StrhStrh2Str)', p);

                                    { Make doubly sure we're only using the 32-bit register, otherwise STR could write 64 bits }
                                    taicpu(p).oper[0]^.reg := NR_WZR;
                                    taicpu(p).oppostfix := PF_None;
                                    RemoveInstruction(p_second);
                                    Result := True;
                                  end;
                              end;
                          else
                            ;
                        end;
                      end;
                  PF_None:
                    {
                      With sequences such as:
                        str  wzr,[sp, #ofs]
                        movz w0,x
                        movk w0,y,lsl #16
                        str  w0,[sp, #ofs+4]

                      Merge the constants to:
                        movz x0,#0
                        movk x0,x,lsl #32
                        movk x0,y,lsl #48
                        str  x0,[sp, #ofs]

                      Only use the stack pointer or frame pointer and an offset
                      that's a multiple of 8 though to guarantee alignment
                    }
                    if ((ThisRef.offset mod 8) = 0) and
                      GetNextInstruction(p, p_second) and
                      (p_second.typ = ait_instruction) then
                      begin
                        case taicpu(p_second).opcode of
                          A_MOVZ:
                            begin
                              ThisReg := taicpu(p_second).oper[0]^.reg;
                              if
                                { The MOVZ is turned into "MOVK ...,lsl #32" below, so it
                                  must not already carry a shift of its own }
                                (taicpu(p_second).ops = 2) and
                                GetNextInstruction(p_second, hp1_second) and
                                MatchInstruction(hp1_second, A_MOVK, []) and
                                { The MOVK must fill the upper half-word of the same register;
                                  its shifter operand is rewritten below, so it has to exist }
                                (taicpu(hp1_second).oper[0]^.reg = ThisReg) and
                                (taicpu(hp1_second).ops = 3) and
                                (taicpu(hp1_second).oper[2]^.shifterop^.shiftmode = SM_LSL) and
                                (taicpu(hp1_second).oper[2]^.shifterop^.shiftimm = 16) and
                                GetNextInstruction(hp1_second, hp2_second) and
                                MatchInstruction(hp2_second, A_STR, [PF_None]) and
                                (taicpu(hp2_second).oper[0]^.reg = ThisReg) then
                                begin
                                  { Is the second storage location exactly four bytes ahead? }
                                  Inc(ThisRef.offset, 4);
                                  if RefsEqual(taicpu(hp2_second).oper[1]^.ref^, ThisRef) and
                                    { The final safety check... make sure the register used
                                      to store the constant isn't used after the store }
                                    RegEndOfLife(ThisReg, taicpu(hp2_second)) then
                                    begin
                                      { Merge the constants }
                                      DebugMsg(SPeepholeOptimization + 'Merged a zero-register word-write and a word-write to memory into a single extended-write (StrMovzMovkStr2MovzMovkMovkStr)', p);

                                      setsubreg(ThisReg, R_SUBQ);

                                      { Repurpose the first STR to a MOVZ instruction }
                                      taicpu(p).opcode := A_MOVZ;
                                      taicpu(p).oppostfix := PF_None;
                                      taicpu(p).oper[0]^.reg := ThisReg;
                                      taicpu(p).loadconst(1, 0);

                                      { If the 3rd word is zero, we can remove the instruction entirely }
                                      if taicpu(p_second).oper[1]^.val = 0 then
                                        RemoveInstruction(p_second)
                                      else
                                        begin
                                          so.shiftmode := SM_LSL;
                                          so.shiftimm := 32;
                                          taicpu(p_second).opcode := A_MOVK;
                                          taicpu(p_second).ops := 3;
                                          taicpu(p_second).loadshifterop(2, so);
                                          taicpu(p_second).oper[0]^.reg := ThisReg;
                                        end;

                                      taicpu(p).oper[0]^.reg := ThisReg;
                                      taicpu(hp1_second).oper[0]^.reg := ThisReg;
                                      taicpu(hp1_second).oper[2]^.shifterop^.shiftimm := 48;

                                      { TODO: Confirm that the A_MOVZ / A_MOVK / A_MOVK combination is the most efficient }

                                      { The surviving store now covers both words, starting at the first address }
                                      taicpu(hp2_second).oppostfix := PF_None;
                                      Dec(taicpu(hp2_second).oper[1]^.ref^.offset, 4);
                                      taicpu(hp2_second).oper[0]^.reg := ThisReg; { Remember to change the register to its 64-bit counterpart }
                                      Result := True;
                                    end;
                                end;
                            end;
                          A_STR:
                            { Change:
                                str  wzr,[sp, #ofs]
                                str  wzr,[sp, #ofs+4]

                              To:
                                str  xzr,[sp, #ofs]
                            }
                            if (taicpu(p_second).oppostfix = PF_None) and
                              (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) then
                              begin
                                { Is the second storage location exactly four bytes ahead? }
                                Inc(ThisRef.offset, 4);
                                if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
                                  begin
                                    DebugMsg(SPeepholeOptimization + 'Merged two zero-register word-writes to memory into a single zero-register extended-write (StrStr2Str)', p);
                                    taicpu(p).oper[0]^.reg := NR_XZR;
                                    RemoveInstruction(p_second);
                                    Result := True;
                                  end;
                              end;
                          else
                            ;
                        end;
                      end;
                  else
                    ;
                end;
              end;
          end;
{$endif AARCH64}
      else
        ;
    end;
  end;
end.