fpc/compiler/x86/aoptx86.pas
J. Gareth "Curious Kit" Moreton 15a5a62eb4 New backward MOV optimisation
2022-01-06 20:57:48 +00:00

12748 lines
544 KiB
ObjectPascal

{
Copyright (c) 1998-2002 by Florian Klaempfl and Jonas Maebe
This unit contains the peephole optimizer.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
****************************************************************************
}
unit aoptx86;
{$i fpcdefs.inc}
{ $define DEBUG_AOPTCPU}
{$ifdef EXTDEBUG}
{$define DEBUG_AOPTCPU}
{$endif EXTDEBUG}
interface
uses
globtype,
cpubase,
aasmtai,aasmcpu,
cgbase,cgutils,
aopt,aoptobj;
type
TOptsToCheck = (
aoc_MovAnd2Mov_3
);
{ x86-specific peephole optimizer. Extends the generic TAsmOptimizer with
register-usage analysis and per-opcode optimisation passes. }
TX86AsmOptimizer = class(TAsmOptimizer)
{ some optimizations are very expensive to check, so the
pre opt pass can be used to set some flags, depending on the found
instructions if it is worth to check a certain optimization }
OptsToCheck : set of TOptsToCheck;
{ --- Register usage analysis helpers --- }
function RegLoadedWithNewValue(reg : tregister; hp : tai) : boolean; override;
function InstructionLoadsFromReg(const reg : TRegister; const hp : tai) : boolean; override;
function RegReadByInstruction(reg : TRegister; hp : tai) : boolean;
function RegInInstruction(Reg: TRegister; p1: tai): Boolean;override;
function GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
{ This version of GetNextInstructionUsingReg will look across conditional jumps,
potentially allowing further optimisation (although it might need to know if
it crossed a conditional jump. }
function GetNextInstructionUsingRegCond(Current: tai; out Next: tai; reg: TRegister; var CrossJump: Boolean): Boolean;
{
In comparison with GetNextInstructionUsingReg, GetNextInstructionUsingRegTrackingUse tracks
the use of a register by allocs/dealloc, so it can ignore calls.
In the following example, GetNextInstructionUsingReg will return the second movq,
GetNextInstructionUsingRegTrackingUse won't.
movq %rdi,%rax
# Register rdi released
# Register rdi allocated
movq %rax,%rdi
While in this example:
movq %rdi,%rax
call proc
movq %rdi,%rax
GetNextInstructionUsingRegTrackingUse will return the second instruction while GetNextInstructionUsingReg
won't.
}
function GetNextInstructionUsingRegTrackingUse(Current: tai; out Next: tai; reg: TRegister): Boolean;
function RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean; override;
private
function SkipSimpleInstructions(var hp1: tai): Boolean;
protected
class function IsMOVZXAcceptable: Boolean; static; inline;
{ Attempts to allocate a volatile integer register for use between p and hp,
using AUsedRegs for the current register usage information. Returns NR_NO
if no free register could be found }
function GetIntRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai): TRegister;
{ Attempts to allocate a volatile MM register for use between p and hp,
using AUsedRegs for the current register usage information. Returns NR_NO
if no free register could be found }
function GetMMRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai): TRegister;
{ checks whether loading a new value in reg1 overwrites the entirety of reg2 }
function Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean;
{ checks whether reading the value in reg1 depends on the value of reg2. This
is very similar to SuperRegisterEquals, except it takes into account that
R_SUBH and R_SUBL are independendent (e.g. reading from AL does not
depend on the value in AH). }
function Reg1ReadDependsOnReg2(reg1, reg2: tregister): boolean;
{ Replaces all references to AOldReg in a memory reference to ANewReg }
class function ReplaceRegisterInRef(var ref: TReference; const AOldReg, ANewReg: TRegister): Boolean; static;
{ Replaces all references to AOldReg in an operand to ANewReg }
class function ReplaceRegisterInOper(const p: taicpu; const OperIdx: Integer; const AOldReg, ANewReg: TRegister): Boolean; static;
{ Replaces all references to AOldReg in an instruction to ANewReg,
except where the register is being written }
class function ReplaceRegisterInInstruction(const p: taicpu; const AOldReg, ANewReg: TRegister): Boolean; static;
{ Returns true if the reference only refers to ESP or EBP (or their 64-bit equivalents),
or writes to a global symbol }
class function IsRefSafe(const ref: PReference): Boolean; static;
{ Returns true if the given MOV instruction can be safely converted to CMOV }
class function CanBeCMOV(p : tai) : boolean; static;
{ Converts the LEA instruction to ADD/INC/SUB/DEC. Returns True if the
conversion was successful }
function ConvertLEA(const p : taicpu): Boolean;
function DeepMOVOpt(const p_mov: taicpu; const hp: taicpu): Boolean;
procedure DebugMsg(const s : string; p : tai);inline;
class function IsExitCode(p : tai) : boolean; static;
class function isFoldableArithOp(hp1 : taicpu; reg : tregister) : boolean; static;
procedure RemoveLastDeallocForFuncRes(p : tai);
function DoSubAddOpt(var p : tai) : Boolean;
function DoMovCmpMemOpt(var p : tai; const hp1: tai; UpdateTmpUsedRegs: Boolean) : Boolean;
{ --- Pre-peephole pass, one routine per opcode family --- }
function PrePeepholeOptSxx(var p : tai) : boolean;
function PrePeepholeOptIMUL(var p : tai) : boolean;
function PrePeepholeOptAND(var p : tai) : boolean;
{ --- Pass 1 peephole optimisations, one routine per opcode family --- }
function OptPass1Test(var p: tai): boolean;
function OptPass1Add(var p: tai): boolean;
function OptPass1AND(var p : tai) : boolean;
function OptPass1_V_MOVAP(var p : tai) : boolean;
function OptPass1VOP(var p : tai) : boolean;
function OptPass1MOV(var p : tai) : boolean;
function OptPass1Movx(var p : tai) : boolean;
function OptPass1MOVXX(var p : tai) : boolean;
function OptPass1OP(var p : tai) : boolean;
function OptPass1LEA(var p : tai) : boolean;
function OptPass1Sub(var p : tai) : boolean;
function OptPass1SHLSAL(var p : tai) : boolean;
function OptPass1FSTP(var p : tai) : boolean;
function OptPass1FLD(var p : tai) : boolean;
function OptPass1Cmp(var p : tai) : boolean;
function OptPass1PXor(var p : tai) : boolean;
function OptPass1VPXor(var p: tai): boolean;
function OptPass1Imul(var p : tai) : boolean;
function OptPass1Jcc(var p : tai) : boolean;
function OptPass1SHXX(var p: tai): boolean;
function OptPass1VMOVDQ(var p: tai): Boolean;
function OptPass1_V_Cvtss2sd(var p: tai): boolean;
{ --- Pass 2 peephole optimisations --- }
function OptPass2Movx(var p : tai): Boolean;
function OptPass2MOV(var p : tai) : boolean;
function OptPass2Imul(var p : tai) : boolean;
function OptPass2Jmp(var p : tai) : boolean;
function OptPass2Jcc(var p : tai) : boolean;
function OptPass2Lea(var p: tai): Boolean;
function OptPass2SUB(var p: tai): Boolean;
function OptPass2ADD(var p : tai): Boolean;
function OptPass2SETcc(var p : tai) : boolean;
function CheckMemoryWrite(var first_mov, second_mov: taicpu): Boolean;
{ --- Post-peephole optimisations --- }
function PostPeepholeOptMov(var p : tai) : Boolean;
function PostPeepholeOptMovzx(var p : tai) : Boolean;
{$ifdef x86_64} { These post-peephole optimisations only affect 64-bit registers. [Kit] }
function PostPeepholeOptXor(var p : tai) : Boolean;
{$endif x86_64}
function PostPeepholeOptAnd(var p : tai) : boolean;
function PostPeepholeOptMOVSX(var p : tai) : boolean;
function PostPeepholeOptCmp(var p : tai) : Boolean;
function PostPeepholeOptTestOr(var p : tai) : Boolean;
function PostPeepholeOptCall(var p : tai) : Boolean;
function PostPeepholeOptLea(var p : tai) : Boolean;
function PostPeepholeOptPush(var p: tai): Boolean;
function PostPeepholeOptShr(var p : tai) : boolean;
function PostPeepholeOptADDSUB(var p : tai) : Boolean;
function PostPeepholeOptVPXOR(var p: tai): Boolean;
procedure ConvertJumpToRET(const p: tai; const ret_p: tai);
function CheckJumpMovTransferOpt(var p: tai; hp1: tai; LoopCount: Integer; out Count: Integer): Boolean;
function TrySwapMovCmp(var p, hp1: tai): Boolean;
{ Processor-dependent reference optimisation }
class procedure OptimizeRefs(var p: taicpu); static;
end;
function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const op1,op2: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const op1,op2,op3: TAsmOp; const opsize: topsizes): boolean;
function MatchInstruction(const instr: tai; const ops: array of TAsmOp; const opsize: topsizes): boolean;
function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
function MatchOperand(const oper: TOper; const a: tcgint): boolean; inline;
function MatchOperand(const oper1: TOper; const oper2: TOper): boolean;
{$if max_operands>2}
function MatchOperand(const oper1: TOper; const oper2: TOper; const oper3: TOper): boolean;
{$endif max_operands>2}
function RefsEqual(const r1, r2: treference): boolean;
function MatchReference(const ref : treference;base,index : TRegister) : Boolean;
{ returns true, if ref is a reference using only the registers passed as base and index
and having an offset }
function MatchReferenceWithOffset(const ref : treference;base,index : TRegister) : Boolean;
implementation
uses
cutils,verbose,
systems,
globals,
cpuinfo,
procinfo,
paramgr,
aasmbase,
aoptbase,aoptutils,
symconst,symsym,
cgx86,
itcpugas;
{$ifdef DEBUG_AOPTCPU}
const
SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
{$else DEBUG_AOPTCPU}
{ Empty strings help the optimizer to remove string concatenations that won't
ever appear to the user on release builds. [Kit] }
const
SPeepholeOptimization = '';
{$endif DEBUG_AOPTCPU}
LIST_STEP_SIZE = 4;
function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
  begin
    { True when instr is a real instruction with the given opcode and,
      unless opsize is the empty (wildcard) set, a size listed in opsize. }
    if instr.typ <> ait_instruction then
      begin
        Result := False;
        Exit;
      end;
    Result := (taicpu(instr).opcode = op) and
      ((opsize = []) or (taicpu(instr).opsize in opsize));
  end;
function MatchInstruction(const instr: tai; const op1,op2: TAsmOp; const opsize: topsizes): boolean;
  var
    opcode: TAsmOp;
  begin
    { True when instr is a real instruction whose opcode is op1 or op2 and,
      unless opsize is the empty (wildcard) set, whose size is in opsize. }
    Result := False;
    if instr.typ <> ait_instruction then
      Exit;
    opcode := taicpu(instr).opcode;
    if (opcode <> op1) and (opcode <> op2) then
      Exit;
    Result := (opsize = []) or (taicpu(instr).opsize in opsize);
  end;
function MatchInstruction(const instr: tai; const op1,op2,op3: TAsmOp; const opsize: topsizes): boolean;
  var
    opcode: TAsmOp;
  begin
    { True when instr is a real instruction whose opcode is one of op1..op3
      and, unless opsize is the empty (wildcard) set, whose size is in opsize. }
    Result := False;
    if instr.typ <> ait_instruction then
      Exit;
    opcode := taicpu(instr).opcode;
    if (opcode <> op1) and (opcode <> op2) and (opcode <> op3) then
      Exit;
    Result := (opsize = []) or (taicpu(instr).opsize in opsize);
  end;
function MatchInstruction(const instr : tai;const ops : array of TAsmOp;
  const opsize : topsizes) : boolean;
  var
    i : Integer;
  begin
    { True when instr is a real instruction whose opcode appears in ops and,
      unless opsize is the empty (wildcard) set, whose size is in opsize. }
    Result := False;
    { Reject non-instructions and size mismatches up front }
    if (instr.typ <> ait_instruction) or
      ((opsize <> []) and not (taicpu(instr).opsize in opsize)) then
      Exit;
    { Accept the first listed opcode that matches }
    for i := Low(ops) to High(ops) do
      if taicpu(instr).opcode = ops[i] then
        begin
          Result := True;
          Exit;
        end;
  end;
function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
  begin
    { True when the operand is exactly the given register }
    if oper.typ <> top_reg then
      Result := False
    else
      Result := (oper.reg = reg);
  end;
function MatchOperand(const oper: TOper; const a: tcgint): boolean; inline;
  begin
    { True when the operand is exactly the given immediate constant }
    if oper.typ <> top_const then
      Result := False
    else
      Result := (oper.val = a);
  end;
function MatchOperand(const oper1: TOper; const oper2: TOper): boolean;
  begin
    { Operands of different kinds never match }
    if oper1.typ <> oper2.typ then
      begin
        Result := False;
        Exit;
      end;
    { Same kind: compare the kind-specific payload }
    case oper1.typ of
      top_const:
        Result := oper1.val = oper2.val;
      top_reg:
        Result := oper1.reg = oper2.reg;
      top_ref:
        Result := RefsEqual(oper1.ref^, oper2.ref^);
      else
        internalerror(2013102801);
    end;
  end;
{$if max_operands>2}
{ Returns True when all three operands are of the same kind and pairwise
  equal. Guarded by max_operands so the implementation stays consistent
  with the interface declaration, which is only visible when
  max_operands > 2. }
function MatchOperand(const oper1: TOper; const oper2: TOper; const oper3: TOper): boolean;
begin
result := (oper1.typ = oper2.typ) and (oper1.typ = oper3.typ);
if result then
case oper1.typ of
top_const:
Result:=(oper1.val = oper2.val) and (oper1.val = oper3.val);
top_reg:
Result:=(oper1.reg = oper2.reg) and (oper1.reg = oper3.reg);
top_ref:
Result:=RefsEqual(oper1.ref^, oper2.ref^) and RefsEqual(oper1.ref^, oper3.ref^);
else
internalerror(2020052401);
end
end;
{$endif max_operands>2}
function RefsEqual(const r1, r2: treference): boolean;
  begin
    { Two references are equal when every addressing component matches.
      References carrying volatility markers are never considered equal. }
    Result :=
      (r1.volatility = []) and
      (r2.volatility = []) and
      (r1.offset = r2.offset) and
      (r1.segment = r2.segment) and
      (r1.base = r2.base) and
      (r1.index = r2.index) and
      (r1.scalefactor = r2.scalefactor) and
      (r1.symbol = r2.symbol) and
      (r1.refaddr = r2.refaddr) and
      (r1.relsymbol = r2.relsymbol);
  end;
function MatchReference(const ref : treference;base,index : TRegister) : Boolean;
  begin
    { True for a plain, zero-offset, unscaled, symbol-free, non-volatile
      reference using the given base and index registers.
      NR_INVALID acts as a wildcard for base and/or index. }
    Result :=
      (ref.offset = 0) and
      (ref.scalefactor in [0,1]) and
      (ref.segment = NR_NO) and
      (ref.symbol = nil) and
      (ref.relsymbol = nil) and
      (ref.volatility = []) and
      ((base = NR_INVALID) or (ref.base = base)) and
      ((index = NR_INVALID) or (ref.index = index));
  end;
function MatchReferenceWithOffset(const ref : treference;base,index : TRegister) : Boolean;
  begin
    { Like MatchReference, but any offset is permitted.
      NR_INVALID acts as a wildcard for base and/or index. }
    Result :=
      (ref.scalefactor in [0,1]) and
      (ref.segment = NR_NO) and
      (ref.symbol = nil) and
      (ref.relsymbol = nil) and
      (ref.volatility = []) and
      ((base = NR_INVALID) or (ref.base = base)) and
      ((index = NR_INVALID) or (ref.index = index));
  end;
function InstrReadsFlags(p: tai): boolean;
  begin
    { Conservatively reports whether p may read any CPU flag.
      Labels count as readers, since control may arrive from anywhere. }
    case p.typ of
      ait_instruction:
        Result :=
          (InsProp[taicpu(p).opcode].Ch *
           [Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
            Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
            Ch_RFlags,Ch_RWFlags,Ch_RFLAGScc,Ch_All] <> []);
      ait_label:
        Result := true;
      else
        Result := false;
    end;
  end;
function TX86AsmOptimizer.GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
  begin
    { Advances Next until an instruction mentioning reg is found, or until a
      call/jump, a non-instruction entry, or the end of the list is reached.
      Below -O3, the very first instruction terminates the scan. }
    Next := Current;
    while true do
      begin
        Result := GetNextInstruction(Next,Next);
        if not Result or
          not (cs_opt_level3 in current_settings.optimizerswitches) or
          (Next.typ <> ait_instruction) or
          RegInInstruction(reg,Next) or
          is_calljmp(taicpu(Next).opcode) then
          Break;
      end;
  end;
{ As GetNextInstructionUsingReg, but additionally scans across conditional
  jumps, setting CrossJump to True when one is crossed. Unconditional
  jumps, calls and returns terminate the search with Result = False. }
function TX86AsmOptimizer.GetNextInstructionUsingRegCond(Current: tai; out Next: tai; reg: TRegister; var CrossJump: Boolean): Boolean;
begin
{ Note, CrossJump keeps its input value if a conditional jump is not found - it doesn't get set to False }
Next := Current;
repeat
Result := GetNextInstruction(Next,Next);
if Result and (Next.typ=ait_instruction) and is_calljmp(taicpu(Next).opcode) then
if is_calljmpuncondret(taicpu(Next).opcode) then
begin
{ Unconditional control transfer - give up }
Result := False;
Exit;
end
else
{ A conditional jump was crossed; tell the caller }
CrossJump := True;
until not Result or
not (cs_opt_level3 in current_settings.optimizerswitches) or
(Next.typ <> ait_instruction) or
RegInInstruction(reg,Next);
end;
{ Scans forward for the next instruction using reg, following register
  alloc/dealloc markers so that CALL instructions can be skipped (see the
  examples in the interface declaration). Stops at non-call jumps, at any
  alloc/dealloc marker for reg's super-register, and at non-skippable
  labels. Below -O3 it degrades to a plain GetNextInstruction. }
function TX86AsmOptimizer.GetNextInstructionUsingRegTrackingUse(Current: tai; out Next: tai; reg: TRegister): Boolean;
begin
if not(cs_opt_level3 in current_settings.optimizerswitches) then
begin
Result:=GetNextInstruction(Current,Next);
exit;
end;
Next:=tai(Current.Next);
Result:=false;
while assigned(Next) do
begin
{ Stop at non-call control flow, reg alloc/dealloc markers and real labels }
if ((Next.typ=ait_instruction) and is_calljmp(taicpu(Next).opcode) and not(taicpu(Next).opcode=A_CALL)) or
((Next.typ=ait_regalloc) and (getsupreg(tai_regalloc(Next).reg)=getsupreg(reg))) or
((Next.typ=ait_label) and not(labelCanBeSkipped(Tai_Label(Next)))) then
exit
else if (Next.typ=ait_instruction) and RegInInstruction(reg,Next) and not(taicpu(Next).opcode=A_CALL) then
begin
Result:=true;
exit;
end;
Next:=tai(Next.Next);
end;
end;
{ On x86, an instruction "loads from" a register exactly when it reads it,
  so delegate to RegReadByInstruction. }
function TX86AsmOptimizer.InstructionLoadsFromReg(const reg: TRegister;const hp: tai): boolean;
begin
Result:=RegReadByInstruction(reg,hp);
end;
{ Returns True if instruction hp reads the value of reg. Handles implicit
  operands (MUL/IMUL/DIV/IDIV use EAX/EDX), registers used inside memory
  references, the SSE MOVSD quirk, individual flag bits and the generic
  instruction property table. Conservatively True for CALL. }
function TX86AsmOptimizer.RegReadByInstruction(reg: TRegister; hp: tai): boolean;
var
p: taicpu;
opcount: longint;
begin
RegReadByInstruction := false;
if hp.typ <> ait_instruction then
exit;
p := taicpu(hp);
case p.opcode of
A_CALL:
{ A called routine may read any register }
regreadbyinstruction := true;
A_IMUL:
case p.ops of
1:
{ Single-operand IMUL implicitly reads (E)AX;
the byte-sized form does not read AH }
regReadByInstruction := RegInOp(reg,p.oper[0]^) or
(
((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
);
2,3:
regReadByInstruction :=
reginop(reg,p.oper[0]^) or
reginop(reg,p.oper[1]^);
else
InternalError(2019112801);
end;
A_MUL:
begin
{ MUL implicitly reads (E)AX; the byte-sized form does not read AH }
regReadByInstruction := RegInOp(reg,p.oper[0]^) or
(
((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and
((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B))
);
end;
A_IDIV,A_DIV:
begin
{ DIV/IDIV implicitly read (E)AX, and also (E)DX except in the byte form }
regReadByInstruction := RegInOp(reg,p.oper[0]^) or
(
(getregtype(reg)=R_INTREGISTER) and
(
(getsupreg(reg)=RS_EAX) or ((getsupreg(reg)=RS_EDX) and (p.opsize<>S_B))
)
);
end;
else
begin
{ LEA only computes an address, so it does not read segment registers }
if (p.opcode=A_LEA) and is_segment_reg(reg) then
begin
RegReadByInstruction := false;
exit;
end;
{ Any register appearing inside a memory reference is read }
for opcount := 0 to p.ops-1 do
if (p.oper[opCount]^.typ = top_ref) and
RegInRef(reg,p.oper[opcount]^.ref^) then
begin
RegReadByInstruction := true;
exit
end;
{ special handling for SSE MOVSD }
if (p.opcode=A_MOVSD) and (p.ops>0) then
begin
if p.ops<>2 then
internalerror(2017042702);
{ reg-to-reg MOVSD only merges the low half, so the destination
register is read as well }
regReadByInstruction := reginop(reg,p.oper[0]^) or
(
(p.oper[1]^.typ=top_reg) and (p.oper[0]^.typ=top_reg) and reginop(reg, p.oper[1]^)
);
exit;
end;
{ Fall back to the instruction property table }
with insprop[p.opcode] do
begin
case getregtype(reg) of
R_INTREGISTER:
begin
{ Implicit reads of specific integer registers }
case getsupreg(reg) of
RS_EAX:
if [Ch_REAX,Ch_RWEAX,Ch_MEAX]*Ch<>[] then
begin
RegReadByInstruction := true;
exit
end;
RS_ECX:
if [Ch_RECX,Ch_RWECX,Ch_MECX]*Ch<>[] then
begin
RegReadByInstruction := true;
exit
end;
RS_EDX:
if [Ch_REDX,Ch_RWEDX,Ch_MEDX]*Ch<>[] then
begin
RegReadByInstruction := true;
exit
end;
RS_EBX:
if [Ch_REBX,Ch_RWEBX,Ch_MEBX]*Ch<>[] then
begin
RegReadByInstruction := true;
exit
end;
RS_ESP:
if [Ch_RESP,Ch_RWESP,Ch_MESP]*Ch<>[] then
begin
RegReadByInstruction := true;
exit
end;
RS_EBP:
if [Ch_REBP,Ch_RWEBP,Ch_MEBP]*Ch<>[] then
begin
RegReadByInstruction := true;
exit
end;
RS_ESI:
if [Ch_RESI,Ch_RWESI,Ch_MESI]*Ch<>[] then
begin
RegReadByInstruction := true;
exit
end;
RS_EDI:
if [Ch_REDI,Ch_RWEDI,Ch_MEDI]*Ch<>[] then
begin
RegReadByInstruction := true;
exit
end;
end;
end;
R_MMREGISTER:
begin
case getsupreg(reg) of
RS_XMM0:
if [Ch_RXMM0,Ch_RWXMM0,Ch_MXMM0]*Ch<>[] then
begin
RegReadByInstruction := true;
exit
end;
end;
end;
else
;
end;
if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
begin
{ For a condition-code consumer, map each condition to the
individual flag bits it actually examines }
if (Ch_RFLAGScc in Ch) and not(getsubreg(reg) in [R_SUBW,R_SUBD,R_SUBQ]) then
begin
case p.condition of
C_A,C_NBE, { CF=0 and ZF=0 }
C_BE,C_NA: { CF=1 or ZF=1 }
RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY,R_SUBFLAGZERO];
C_AE,C_NB,C_NC, { CF=0 }
C_B,C_NAE,C_C: { CF=1 }
RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGCARRY];
C_NE,C_NZ, { ZF=0 }
C_E,C_Z: { ZF=1 }
RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO];
C_G,C_NLE, { ZF=0 and SF=OF }
C_LE,C_NG: { ZF=1 or SF<>OF }
RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGZERO,R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
C_GE,C_NL, { SF=OF }
C_L,C_NGE: { SF<>OF }
RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN,R_SUBFLAGOVERFLOW];
C_NO, { OF=0 }
C_O: { OF=1 }
RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGOVERFLOW];
C_NP,C_PO, { PF=0 }
C_P,C_PE: { PF=1 }
RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGPARITY];
C_NS, { SF=0 }
C_S: { SF=1 }
RegReadByInstruction:=getsubreg(reg) in [R_SUBFLAGSIGN];
else
internalerror(2017042701);
end;
if RegReadByInstruction then
exit;
end;
{ Otherwise consult the property table per flag bit }
case getsubreg(reg) of
R_SUBW,R_SUBD,R_SUBQ:
RegReadByInstruction :=
[Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
Ch_RDirFlag,Ch_RFlags,Ch_RWFlags,Ch_RFLAGScc]*Ch<>[];
R_SUBFLAGCARRY:
RegReadByInstruction:=[Ch_RCarryFlag,Ch_RWCarryFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
R_SUBFLAGPARITY:
RegReadByInstruction:=[Ch_RParityFlag,Ch_RWParityFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
R_SUBFLAGAUXILIARY:
RegReadByInstruction:=[Ch_RAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
R_SUBFLAGZERO:
RegReadByInstruction:=[Ch_RZeroFlag,Ch_RWZeroFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
R_SUBFLAGSIGN:
RegReadByInstruction:=[Ch_RSignFlag,Ch_RWSignFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
R_SUBFLAGOVERFLOW:
RegReadByInstruction:=[Ch_ROverflowFlag,Ch_RWOverflowFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
R_SUBFLAGINTERRUPT:
RegReadByInstruction:=[Ch_RFlags,Ch_RWFlags]*Ch<>[];
R_SUBFLAGDIRECTION:
RegReadByInstruction:=[Ch_RDirFlag,Ch_RFlags,Ch_RWFlags]*Ch<>[];
else
internalerror(2017042601);
end;
exit;
end;
{ Some instructions (e.g. XOR reg,reg) do not really read their
operands when both are the same register }
if (Ch_NoReadIfEqualRegs in Ch) and (p.ops=2) and
(p.oper[0]^.typ=top_reg) and (p.oper[1]^.typ=top_reg) and
(p.oper[0]^.reg=p.oper[1]^.reg) then
exit;
{ Explicit operand reads, per operand position }
if ([CH_RWOP1,CH_ROP1,CH_MOP1]*Ch<>[]) and reginop(reg,p.oper[0]^) then
begin
RegReadByInstruction := true;
exit
end;
if ([Ch_RWOP2,Ch_ROP2,Ch_MOP2]*Ch<>[]) and reginop(reg,p.oper[1]^) then
begin
RegReadByInstruction := true;
exit
end;
if ([Ch_RWOP3,Ch_ROP3,Ch_MOP3]*Ch<>[]) and reginop(reg,p.oper[2]^) then
begin
RegReadByInstruction := true;
exit
end;
if ([Ch_RWOP4,Ch_ROP4,Ch_MOP4]*Ch<>[]) and reginop(reg,p.oper[3]^) then
begin
RegReadByInstruction := true;
exit
end;
end;
end;
end;
end;
{ Returns True if reg appears in instruction p1 in any role (read, write or
  modify), either explicitly or implicitly via the instruction property
  table. Falls back to the inherited operand scan when the table gives no
  answer. }
function TX86AsmOptimizer.RegInInstruction(Reg: TRegister; p1: tai): Boolean;
begin
result:=false;
if p1.typ<>ait_instruction then
exit;
{ Ch_All marks instructions that may touch anything }
if (Ch_All in insprop[taicpu(p1).opcode].Ch) then
exit(true);
if (getregtype(reg)=R_INTREGISTER) and
{ change information for xmm movsd are not correct }
((taicpu(p1).opcode<>A_MOVSD) or (taicpu(p1).ops=0)) then
begin
case getsupreg(reg) of
{ RS_EAX = RS_RAX on x86-64 }
RS_EAX:
result:=([Ch_REAX,Ch_RRAX,Ch_WEAX,Ch_WRAX,Ch_RWEAX,Ch_RWRAX,Ch_MEAX,Ch_MRAX]*insprop[taicpu(p1).opcode].Ch)<>[];
RS_ECX:
result:=([Ch_RECX,Ch_RRCX,Ch_WECX,Ch_WRCX,Ch_RWECX,Ch_RWRCX,Ch_MECX,Ch_MRCX]*insprop[taicpu(p1).opcode].Ch)<>[];
RS_EDX:
result:=([Ch_REDX,Ch_RRDX,Ch_WEDX,Ch_WRDX,Ch_RWEDX,Ch_RWRDX,Ch_MEDX,Ch_MRDX]*insprop[taicpu(p1).opcode].Ch)<>[];
RS_EBX:
result:=([Ch_REBX,Ch_RRBX,Ch_WEBX,Ch_WRBX,Ch_RWEBX,Ch_RWRBX,Ch_MEBX,Ch_MRBX]*insprop[taicpu(p1).opcode].Ch)<>[];
RS_ESP:
result:=([Ch_RESP,Ch_RRSP,Ch_WESP,Ch_WRSP,Ch_RWESP,Ch_RWRSP,Ch_MESP,Ch_MRSP]*insprop[taicpu(p1).opcode].Ch)<>[];
RS_EBP:
result:=([Ch_REBP,Ch_RRBP,Ch_WEBP,Ch_WRBP,Ch_RWEBP,Ch_RWRBP,Ch_MEBP,Ch_MRBP]*insprop[taicpu(p1).opcode].Ch)<>[];
RS_ESI:
result:=([Ch_RESI,Ch_RRSI,Ch_WESI,Ch_WRSI,Ch_RWESI,Ch_RWRSI,Ch_MESI,Ch_MRSI,Ch_RMemEDI]*insprop[taicpu(p1).opcode].Ch)<>[];
RS_EDI:
result:=([Ch_REDI,Ch_RRDI,Ch_WEDI,Ch_WRDI,Ch_RWEDI,Ch_RWRDI,Ch_MEDI,Ch_MRDI,Ch_WMemEDI]*insprop[taicpu(p1).opcode].Ch)<>[];
else
;
end;
if result then
exit;
end
else if getregtype(reg)=R_MMREGISTER then
begin
case getsupreg(reg) of
RS_XMM0:
result:=([Ch_RXMM0,Ch_WXMM0,Ch_RWXMM0,Ch_MXMM0]*insprop[taicpu(p1).opcode].Ch)<>[];
else
;
end;
if result then
exit;
end
else if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
begin
{ Whole-flags accesses cover every flag bit }
if ([Ch_RFlags,Ch_WFlags,Ch_RWFlags,Ch_RFLAGScc]*insprop[taicpu(p1).opcode].Ch)<>[] then
exit(true);
{ Otherwise check the specific flag bit requested }
case getsubreg(reg) of
R_SUBFLAGCARRY:
Result:=([Ch_RCarryFlag,Ch_RWCarryFlag,Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
R_SUBFLAGPARITY:
Result:=([Ch_RParityFlag,Ch_RWParityFlag,Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
R_SUBFLAGAUXILIARY:
Result:=([Ch_RAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
R_SUBFLAGZERO:
Result:=([Ch_RZeroFlag,Ch_RWZeroFlag,Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
R_SUBFLAGSIGN:
Result:=([Ch_RSignFlag,Ch_RWSignFlag,Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
R_SUBFLAGOVERFLOW:
Result:=([Ch_ROverflowFlag,Ch_RWOverflowFlag,Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag]*insprop[taicpu(p1).opcode].Ch)<>[];
R_SUBFLAGINTERRUPT:
Result:=([Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags]*insprop[taicpu(p1).opcode].Ch)<>[];
R_SUBFLAGDIRECTION:
Result:=([Ch_RDirFlag,Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*insprop[taicpu(p1).opcode].Ch)<>[];
R_SUBW,R_SUBD,R_SUBQ:
{ Everything except the direction bits }
Result:=
([Ch_RCarryFlag,Ch_RParityFlag,Ch_RAuxiliaryFlag,Ch_RZeroFlag,Ch_RSignFlag,Ch_ROverflowFlag,
Ch_WCarryFlag,Ch_WParityFlag,Ch_WAuxiliaryFlag,Ch_WZeroFlag,Ch_WSignFlag,Ch_WOverflowFlag,
Ch_W0CarryFlag,Ch_W0ParityFlag,Ch_W0AuxiliaryFlag,Ch_W0ZeroFlag,Ch_W0SignFlag,Ch_W0OverflowFlag,
Ch_W1CarryFlag,Ch_W1ParityFlag,Ch_W1AuxiliaryFlag,Ch_W1ZeroFlag,Ch_W1SignFlag,Ch_W1OverflowFlag,
Ch_WUCarryFlag,Ch_WUParityFlag,Ch_WUAuxiliaryFlag,Ch_WUZeroFlag,Ch_WUSignFlag,Ch_WUOverflowFlag,
Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag
]*insprop[taicpu(p1).opcode].Ch)<>[];
else
;
end;
if result then
exit;
end
else if (getregtype(reg)=R_FPUREGISTER) and (Ch_FPU in insprop[taicpu(p1).opcode].Ch) then
exit(true);
{ No implicit use found: scan the explicit operands }
Result:=inherited RegInInstruction(Reg, p1);
end;
{ Returns True if instruction p1 may modify reg (written or read-modified),
  checking flag bits, special-cased opcodes (CALL, SSE MOVSD, VMOVSS/VMOVSD,
  IMUL), implicit register writes from the property table, and finally the
  explicit write operands. }
function TX86AsmOptimizer.RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean;
const
{ Per-operand-position "may write" channel sets }
WriteOps: array[0..3] of set of TInsChange =
([CH_RWOP1,CH_WOP1,CH_MOP1],
[Ch_RWOP2,Ch_WOP2,Ch_MOP2],
[Ch_RWOP3,Ch_WOP3,Ch_MOP3],
[Ch_RWOP4,Ch_WOP4,Ch_MOP4]);
var
OperIdx: Integer;
begin
Result := False;
if p1.typ <> ait_instruction then
exit;
with insprop[taicpu(p1).opcode] do
if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
begin
{ Flag registers: map the requested flag bit to its write channels }
case getsubreg(reg) of
R_SUBW,R_SUBD,R_SUBQ:
Result :=
[Ch_WCarryFlag,Ch_WParityFlag,Ch_WAuxiliaryFlag,Ch_WZeroFlag,Ch_WSignFlag,Ch_WOverflowFlag,
Ch_RWCarryFlag,Ch_RWParityFlag,Ch_RWAuxiliaryFlag,Ch_RWZeroFlag,Ch_RWSignFlag,Ch_RWOverflowFlag,
Ch_W0DirFlag,Ch_W1DirFlag,Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
R_SUBFLAGCARRY:
Result:=[Ch_WCarryFlag,Ch_RWCarryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
R_SUBFLAGPARITY:
Result:=[Ch_WParityFlag,Ch_RWParityFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
R_SUBFLAGAUXILIARY:
Result:=[Ch_WAuxiliaryFlag,Ch_RWAuxiliaryFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
R_SUBFLAGZERO:
Result:=[Ch_WZeroFlag,Ch_RWZeroFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
R_SUBFLAGSIGN:
Result:=[Ch_WSignFlag,Ch_RWSignFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
R_SUBFLAGOVERFLOW:
Result:=[Ch_WOverflowFlag,Ch_RWOverflowFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
R_SUBFLAGINTERRUPT:
Result:=[Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
R_SUBFLAGDIRECTION:
Result:=[Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags,Ch_RWFlags]*Ch<>[];
else
internalerror(2017042602);
end;
exit;
end;
case taicpu(p1).opcode of
A_CALL:
{ We could potentially set Result to False if the register in
question is non-volatile for the subroutine's calling convention,
but this would require detecting the calling convention in use and
also assuming that the routine doesn't contain malformed assembly
language, for example... so it could only be done under -O4 as it
would be considered a side-effect. [Kit] }
Result := True;
A_MOVSD:
{ special handling for SSE MOVSD }
if (taicpu(p1).ops>0) then
begin
if taicpu(p1).ops<>2 then
internalerror(2017042703);
Result := (taicpu(p1).oper[1]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[1]^);
end;
{ VMOVSS and VMOVSD has two and three operand flavours, this cannot modelled by x86ins.dat
so fix it here (FK)
}
A_VMOVSS,
A_VMOVSD:
begin
Result := (taicpu(p1).ops=3) and (taicpu(p1).oper[2]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[2]^);
exit;
end;
A_IMUL:
{ The last operand is the destination for all IMUL forms }
Result := (taicpu(p1).oper[taicpu(p1).ops-1]^.typ=top_reg) and RegInOp(reg,taicpu(p1).oper[taicpu(p1).ops-1]^);
else
;
end;
if Result then
exit;
with insprop[taicpu(p1).opcode] do
begin
if getregtype(reg)=R_INTREGISTER then
begin
{ Implicit writes to specific integer registers }
case getsupreg(reg) of
RS_EAX:
if [Ch_WEAX,Ch_RWEAX,Ch_MEAX]*Ch<>[] then
begin
Result := True;
exit
end;
RS_ECX:
if [Ch_WECX,Ch_RWECX,Ch_MECX]*Ch<>[] then
begin
Result := True;
exit
end;
RS_EDX:
if [Ch_WEDX,Ch_RWEDX,Ch_MEDX]*Ch<>[] then
begin
Result := True;
exit
end;
RS_EBX:
if [Ch_WEBX,Ch_RWEBX,Ch_MEBX]*Ch<>[] then
begin
Result := True;
exit
end;
RS_ESP:
if [Ch_WESP,Ch_RWESP,Ch_MESP]*Ch<>[] then
begin
Result := True;
exit
end;
RS_EBP:
if [Ch_WEBP,Ch_RWEBP,Ch_MEBP]*Ch<>[] then
begin
Result := True;
exit
end;
RS_ESI:
if [Ch_WESI,Ch_RWESI,Ch_MESI]*Ch<>[] then
begin
Result := True;
exit
end;
RS_EDI:
if [Ch_WEDI,Ch_RWEDI,Ch_MEDI]*Ch<>[] then
begin
Result := True;
exit
end;
end;
end;
{ Explicit write operands }
for OperIdx := 0 to taicpu(p1).ops - 1 do
if (WriteOps[OperIdx]*Ch<>[]) and
{ The register doesn't get modified inside a reference }
(taicpu(p1).oper[OperIdx]^.typ = top_reg) and
SuperRegistersEqual(reg,taicpu(p1).oper[OperIdx]^.reg) then
begin
Result := true;
exit
end;
end;
end;
{$ifdef DEBUG_AOPTCPU}
{ Debug builds: emit s as an assembler comment immediately before p }
procedure TX86AsmOptimizer.DebugMsg(const s: string;p : tai);
begin
asml.insertbefore(tai_comment.Create(strpnew(s)), p);
end;
{ Debug builds: decimal string representation of a target integer }
function debug_tostr(i: tcgint): string; inline;
begin
Result := tostr(i);
end;
{ Debug builds: AT&T-style register name, e.g. "%eax" }
function debug_regname(r: TRegister): string; inline;
begin
Result := '%' + std_regname(r);
end;
{ Debug output function - renders an operand in AT&T syntax:
  "$imm" for constants, "%reg" for registers and
  "offset(base,index,scale)" for memory references }
function debug_operstr(oper: TOper): string;
  var
    has_base, has_index: Boolean;
  begin
    case oper.typ of
      top_const:
        Result := '$' + debug_tostr(oper.val);
      top_reg:
        Result := debug_regname(oper.reg);
      top_ref:
        begin
          { A zero displacement is simply omitted }
          if oper.ref^.offset = 0 then
            Result := '('
          else
            Result := debug_tostr(oper.ref^.offset) + '(';
          has_base := (oper.ref^.base <> NR_INVALID) and (oper.ref^.base <> NR_NO);
          has_index := (oper.ref^.index <> NR_INVALID) and (oper.ref^.index <> NR_NO);
          if has_base then
            Result := Result + debug_regname(oper.ref^.base);
          { The separating comma only appears when a base precedes the index }
          if has_index then
            if has_base then
              Result := Result + ',' + debug_regname(oper.ref^.index)
            else
              Result := Result + debug_regname(oper.ref^.index);
          if oper.ref^.scalefactor > 1 then
            Result := Result + ',' + debug_tostr(oper.ref^.scalefactor) + ')'
          else
            Result := Result + ')';
        end;
      else
        Result := '[UNKNOWN]';
    end;
  end;
{ Debug builds: mnemonic of an opcode }
function debug_op2str(opcode: tasmop): string; inline;
begin
Result := std_op2str[opcode];
end;
{ Debug builds: GAS suffix for an operand size }
function debug_opsize2str(opsize: topsize): string; inline;
begin
Result := gas_opsize2str[opsize];
end;
{$else DEBUG_AOPTCPU}
{ Release builds: no-op stub, debug messages are compiled out }
procedure TX86AsmOptimizer.DebugMsg(const s: string;p : tai);inline;
begin
end;
{ Release builds: empty string so concatenations optimise away }
function debug_tostr(i: tcgint): string; inline;
begin
Result := '';
end;
{ Release builds: empty string so concatenations optimise away }
function debug_regname(r: TRegister): string; inline;
begin
Result := '';
end;
{ Release builds: empty string so concatenations optimise away }
function debug_operstr(oper: TOper): string; inline;
begin
Result := '';
end;
{ Release builds: empty string so concatenations optimise away }
function debug_op2str(opcode: tasmop): string; inline;
begin
Result := '';
end;
{ Release builds: empty string so concatenations optimise away }
function debug_opsize2str(opsize: topsize): string; inline;
begin
Result := '';
end;
{$endif DEBUG_AOPTCPU}
{ Returns True when using MOVZX is acceptable on the current target:
  always on x86-64; otherwise when the CPU supports it (386+ for i8086
  targets) and either size is being optimised for or the scheduled CPU
  executes it quickly (Pentium II or later). }
class function TX86AsmOptimizer.IsMOVZXAcceptable: Boolean; inline;
begin
{$ifdef x86_64}
{ Always fine on x86-64 }
Result := True;
{$else x86_64}
Result :=
{$ifdef i8086}
(current_settings.cputype >= cpu_386) and
{$endif i8086}
(
{ Always accept if optimising for size }
(cs_opt_size in current_settings.optimizerswitches) or
{ From the Pentium II onwards, MOVZX only takes 1 cycle. [Kit] }
(current_settings.optimizecputype >= cpu_Pentium2)
);
{$endif x86_64}
end;
{ Attempts to allocate a volatile integer register of size RegSize for use
  between instructions p and hp, using AUsedRegs for the current register
  usage information. On success the register is marked as allocated over
  the whole range and returned; returns NR_NO if no free register could be
  found }
function TX86AsmOptimizer.GetIntRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai): TRegister;
  var
    RegSet: TCPURegisterSet;
    CurrentSuperReg: Integer;
    CurrentReg: TRegister;
    Currentp: tai;
    Breakout: Boolean;
  begin
    { TODO: Currently, only the volatile registers are checked - can this be extended to use any register the procedure has preserved? }
    Result := NR_NO;
    RegSet := paramanager.get_volatile_registers_int(current_procinfo.procdef.proccalloption);
    for CurrentSuperReg in RegSet do
      begin
        CurrentReg := newreg(R_INTREGISTER, TSuperRegister(CurrentSuperReg), RegSize);
        if not AUsedRegs[R_INTREGISTER].IsUsed(CurrentReg)
{$if defined(i386) or defined(i8086)}
          { If the target size is 8-bit, make sure we can actually encode it }
          and (
            (RegSize >= R_SUBW) or { Not R_SUBL or R_SUBH }
            { Under i386/i8086, only EAX, EBX, ECX and EDX have 8-bit forms }
            (GetSupReg(CurrentReg) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX])
          )
{$endif i386 or i8086}
          then
          begin
            { Walk forward from p towards hp to verify that the candidate
              register is not referenced or allocated anywhere in between }
            Currentp := p;
            Breakout := False;
            while not Breakout and GetNextInstruction(Currentp, Currentp) and (Currentp <> hp) do
              begin
                case Currentp.typ of
                  ait_instruction:
                    begin
                      if RegInInstruction(CurrentReg, Currentp) then
                        begin
                          Breakout := True;
                          Break;
                        end;
                      { Cannot allocate across an unconditional jump }
                      if is_calljmpuncondret(taicpu(Currentp).opcode) then
                        Exit;
                    end;
                  ait_marker:
                    { Don't try anything more if a marker is hit }
                    Exit;
                  ait_regalloc:
                    { An allocation of the same super-register inside the
                      range means it is live there - not usable }
                    if (tai_regalloc(Currentp).ratype <> ra_dealloc) and SuperRegistersEqual(CurrentReg, tai_regalloc(Currentp).reg) then
                      begin
                        Breakout := True;
                        Break;
                      end;
                  else
                    ;
                end;
              end;
            if Breakout then
              { Try the next register }
              Continue;
            { We have a free register available }
            Result := CurrentReg;
            AllocRegBetween(CurrentReg, p, hp, AUsedRegs);
            Exit;
          end;
      end;
  end;
{ Attempts to allocate a volatile MM (vector) register of size RegSize for
  use between instructions p and hp, using AUsedRegs for the current
  register usage information. On success the register is marked as
  allocated over the whole range and returned; returns NR_NO if no free
  register could be found. Mirrors GetIntRegisterBetween }
function TX86AsmOptimizer.GetMMRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai): TRegister;
  var
    RegSet: TCPURegisterSet;
    CurrentSuperReg: Integer;
    CurrentReg: TRegister;
    Currentp: tai;
    Breakout: Boolean;
  begin
    { TODO: Currently, only the volatile registers are checked - can this be extended to use any register the procedure has preserved? }
    Result := NR_NO;
    RegSet := paramanager.get_volatile_registers_mm(current_procinfo.procdef.proccalloption);
    for CurrentSuperReg in RegSet do
      begin
        CurrentReg := newreg(R_MMREGISTER, TSuperRegister(CurrentSuperReg), RegSize);
        if not AUsedRegs[R_MMREGISTER].IsUsed(CurrentReg) then
          begin
            { Walk forward from p towards hp to verify that the candidate
              register is not referenced or allocated anywhere in between }
            Currentp := p;
            Breakout := False;
            while not Breakout and GetNextInstruction(Currentp, Currentp) and (Currentp <> hp) do
              begin
                case Currentp.typ of
                  ait_instruction:
                    begin
                      if RegInInstruction(CurrentReg, Currentp) then
                        begin
                          Breakout := True;
                          Break;
                        end;
                      { Cannot allocate across an unconditional jump }
                      if is_calljmpuncondret(taicpu(Currentp).opcode) then
                        Exit;
                    end;
                  ait_marker:
                    { Don't try anything more if a marker is hit }
                    Exit;
                  ait_regalloc:
                    { An allocation of the same super-register inside the
                      range means it is live there - not usable }
                    if (tai_regalloc(Currentp).ratype <> ra_dealloc) and SuperRegistersEqual(CurrentReg, tai_regalloc(Currentp).reg) then
                      begin
                        Breakout := True;
                        Break;
                      end;
                  else
                    ;
                end;
              end;
            if Breakout then
              { Try the next register }
              Continue;
            { We have a free register available }
            Result := CurrentReg;
            AllocRegBetween(CurrentReg, p, hp, AUsedRegs);
            Exit;
          end;
      end;
  end;
{ Returns True if a write to reg1 replaces every bit of reg2, i.e. the new
  contents of reg2 do not depend on its previous value }
function TX86AsmOptimizer.Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean;
  var
    TargetSub: TSubRegister;
  begin
    { Different super-registers never alias each other }
    if not SuperRegistersEqual(reg1,reg2) then
      begin
        Result:=false;
        exit;
      end;
    { Only integer registers have partial sub-register views; for any other
      register type, equal super-registers imply a full overwrite }
    if getregtype(reg1)<>R_INTREGISTER then
      begin
        Result:=true;
        exit;
      end;
    TargetSub:=getsubreg(reg2);
    case getsubreg(reg1) of
      { A write to R_SUBL doesn't change R_SUBH, and if reg2 is R_SUBW or
        higher it preserves the high bits; it is equivalent to:
        reg2 := (reg2 and $ffffff00) or byte(reg1) }
      R_SUBL:
        Result:=TargetSub=R_SUBL;
      { A write to R_SUBH doesn't change R_SUBL, and for wider reg2 it does:
        reg2 := (reg2 and $ffff00ff) or (reg1 and $ff00) }
      R_SUBH:
        Result:=TargetSub=R_SUBH;
      { A 16-bit write preserves the upper 16 bits of an R_SUBD or larger
        reg2: reg2 := (reg2 and $ffff0000) or word(reg1) }
      R_SUBW:
        Result:=TargetSub in [R_SUBL,R_SUBH,R_SUBW];
      { A 32-bit write clears the upper half of R_SUBQ on x86_64, so both
        32-bit and 64-bit writes overwrite every sub-register view }
      R_SUBD,
      R_SUBQ:
        Result:=true;
      else
        internalerror(2017042801);
    end;
  end;
{ Returns True if reading reg1 observes (part of) the value held in reg2 }
function TX86AsmOptimizer.Reg1ReadDependsOnReg2(reg1, reg2: tregister): boolean;
  begin
    { Different super-registers never alias each other }
    if not SuperRegistersEqual(reg1,reg2) then
      begin
        Result:=false;
        exit;
      end;
    { Non-integer registers have no partial views, so equal super-registers
      always overlap }
    if getregtype(reg1)<>R_INTREGISTER then
      begin
        Result:=true;
        exit;
      end;
    case getsubreg(reg1) of
      { The low byte overlaps everything except the high byte }
      R_SUBL:
        Result:=getsubreg(reg2)<>R_SUBH;
      { The high byte overlaps everything except the low byte }
      R_SUBH:
        Result:=getsubreg(reg2)<>R_SUBL;
      { Word-sized and larger reads cover both byte halves }
      R_SUBW,
      R_SUBD,
      R_SUBQ:
        Result:=true;
      else
        internalerror(2017042802);
    end;
  end;
{ Pre-peephole pass for SHR/SAR: combines a right shift followed by a left
  shift by a constant into "sar/and", "shl/and" or just "and", depending on
  the two shift counts.
  NOTE(review): Result is initialised to false and never set to True even
  when instructions are rewritten or removed - confirm whether the
  pre-pass caller depends on the return value }
function TX86AsmOptimizer.PrePeepholeOptSxx(var p : tai) : boolean;
  var
    hp1 : tai;
    l : TCGInt;
  begin
    result:=false;
    { changes the code sequence
      shr/sar const1, x
      shl     const2, x
      to
      either "sar/and", "shl/and" or just "and" depending on const1 and const2 }
    if GetNextInstruction(p, hp1) and
      MatchInstruction(hp1,A_SHL,[]) and
      (taicpu(p).oper[0]^.typ = top_const) and
      (taicpu(hp1).oper[0]^.typ = top_const) and
      (taicpu(hp1).opsize = taicpu(p).opsize) and
      (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[1]^.typ) and
      OpsEqual(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^) then
      begin
        if (taicpu(p).oper[0]^.val > taicpu(hp1).oper[0]^.val) and
          not(cs_opt_size in current_settings.optimizerswitches) then
          begin
            { shr/sar const1, %reg
              shl     const2, %reg
              with const1 > const2:
              reduce the right shift to (const1-const2) and replace the SHL
              by an AND that clears the bits the net shift would discard }
            taicpu(p).loadConst(0,taicpu(p).oper[0]^.val-taicpu(hp1).oper[0]^.val);
            taicpu(hp1).opcode := A_AND;
            l := (1 shl (taicpu(hp1).oper[0]^.val)) - 1;
            case taicpu(p).opsize Of
              S_B: taicpu(hp1).loadConst(0,l Xor $ff);
              S_W: taicpu(hp1).loadConst(0,l Xor $ffff);
              S_L: taicpu(hp1).loadConst(0,l Xor tcgint($ffffffff));
              S_Q: taicpu(hp1).loadConst(0,l Xor tcgint($ffffffffffffffff));
              else
                Internalerror(2017050703)
            end;
          end
        else if (taicpu(p).oper[0]^.val<taicpu(hp1).oper[0]^.val) and
          not(cs_opt_size in current_settings.optimizerswitches) then
          begin
            { shr/sar const1, %reg
              shl     const2, %reg
              with const1 < const2:
              reduce the left shift to (const2-const1) and replace the
              right shift by an AND that clears the low const1 bits }
            taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val-taicpu(p).oper[0]^.val);
            taicpu(p).opcode := A_AND;
            l := (1 shl (taicpu(p).oper[0]^.val))-1;
            case taicpu(p).opsize Of
              S_B: taicpu(p).loadConst(0,l Xor $ff);
              S_W: taicpu(p).loadConst(0,l Xor $ffff);
              S_L: taicpu(p).loadConst(0,l Xor tcgint($ffffffff));
              S_Q: taicpu(p).loadConst(0,l Xor tcgint($ffffffffffffffff));
              else
                Internalerror(2017050702)
            end;
          end
        else if (taicpu(p).oper[0]^.val = taicpu(hp1).oper[0]^.val) then
          begin
            { shr/sar const1, %reg
              shl     const2, %reg
              with const1 = const2:
              both shifts cancel except for clearing the low const1 bits,
              so a single AND suffices and the SHL can be removed }
            taicpu(p).opcode := A_AND;
            l := (1 shl (taicpu(p).oper[0]^.val))-1;
            case taicpu(p).opsize Of
              S_B: taicpu(p).loadConst(0,l Xor $ff);
              S_W: taicpu(p).loadConst(0,l Xor $ffff);
              S_L: taicpu(p).loadConst(0,l Xor tcgint($ffffffff));
              S_Q: taicpu(p).loadConst(0,l Xor tcgint($ffffffffffffffff));
              else
                Internalerror(2017050701)
            end;
            RemoveInstruction(hp1);
          end;
      end;
  end;
{ Pre-peephole pass for IMUL: removes/simplifies multiplications by 1 and
  converts certain "imul const,%reg" forms into LEA(+SHL) sequences.
  NOTE(review): only the Imul2Nop branch sets Result - confirm whether the
  Imul2Mov and Imul2LeaShl branches should also return True }
function TX86AsmOptimizer.PrePeepholeOptIMUL(var p : tai) : boolean;
  var
    opsize : topsize;
    hp1 : tai;
    tmpref : treference;
    ShiftValue : Cardinal;
    BaseValue : TCGInt;
  begin
    result:=false;
    opsize:=taicpu(p).opsize;
    { changes certain "imul const, %reg"'s to lea sequences }
    if (MatchOpType(taicpu(p),top_const,top_reg) or
      MatchOpType(taicpu(p),top_const,top_reg,top_reg)) and
      (opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) then
      if (taicpu(p).oper[0]^.val = 1) then
        if (taicpu(p).ops = 2) then
          { remove "imul $1, reg" }
          begin
            DebugMsg(SPeepholeOptimization + 'Imul2Nop done',p);
            Result := RemoveCurrentP(p);
          end
        else
          { change "imul $1, reg1, reg2" to "mov reg1, reg2" }
          begin
            hp1 := taicpu.Op_Reg_Reg(A_MOV, opsize, taicpu(p).oper[1]^.reg,taicpu(p).oper[2]^.reg);
            InsertLLItem(p.previous, p.next, hp1);
            DebugMsg(SPeepholeOptimization + 'Imul2Mov done',p);
            p.free;
            p := hp1;
          end
      else if ((taicpu(p).ops <= 2) or
        (taicpu(p).oper[2]^.typ = Top_Reg)) and
        not(cs_opt_size in current_settings.optimizerswitches) and
        { do not apply if the next instruction tests the overflow flag,
          since LEA/SHL do not set it the same way IMUL does }
        (not(GetNextInstruction(p, hp1)) or
          not((tai(hp1).typ = ait_instruction) and
            ((taicpu(hp1).opcode=A_Jcc) and
              (taicpu(hp1).condition in [C_O,C_NO])))) then
        begin
          {
            imul X, reg1, reg2 to
              lea (reg1,reg1,Y), reg2
              shl ZZ,reg2
            imul XX, reg1 to
              lea (reg1,reg1,YY), reg1
              shl ZZ,reg2
            This optimziation makes sense for pretty much every x86, except the VIA Nano3000: it has IMUL latency 2, lea/shl pair as well,
            it does not exist as a separate optimization target in FPC though.
            This optimziation can be applied as long as only two bits are set in the constant and those two bits are separated by
            at most two zeros
          }
          reference_reset(tmpref,1,[]);
          { exactly two set bits no more than three positions apart, so the
            constant is (2^a + 2^b) = (3, 5 or 9) shl min(a,b) }
          if (PopCnt(QWord(taicpu(p).oper[0]^.val))=2) and (BsrQWord(taicpu(p).oper[0]^.val)-BsfQWord(taicpu(p).oper[0]^.val)<=3) then
            begin
              ShiftValue:=BsfQWord(taicpu(p).oper[0]^.val);
              BaseValue:=taicpu(p).oper[0]^.val shr ShiftValue;
              TmpRef.base := taicpu(p).oper[1]^.reg;
              TmpRef.index := taicpu(p).oper[1]^.reg;
              if not(BaseValue in [3,5,9]) then
                Internalerror(2018110101);
              { lea with scale factor 2, 4 or 8 multiplies by 3, 5 or 9 }
              TmpRef.ScaleFactor := BaseValue-1;
              if (taicpu(p).ops = 2) then
                hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[1]^.reg)
              else
                hp1 := taicpu.op_ref_reg(A_LEA, opsize, TmpRef, taicpu(p).oper[2]^.reg);
              AsmL.InsertAfter(hp1,p);
              DebugMsg(SPeepholeOptimization + 'Imul2LeaShl done',p);
              taicpu(hp1).fileinfo:=taicpu(p).fileinfo;
              RemoveCurrentP(p, hp1);
              { apply the remaining power-of-two factor with a shift }
              if ShiftValue>0 then
                AsmL.InsertAfter(taicpu.op_const_reg(A_SHL, opsize, ShiftValue, taicpu(hp1).oper[1]^.reg),hp1);
            end;
        end;
  end;
{ Pre-peephole pass for AND: "and $0,x" always yields zero, so when no
  later instruction reads the flags it can become the cheaper "mov $0,x".
  Returns True if the instruction was changed }
function TX86AsmOptimizer.PrePeepholeOptAND(var p : tai) : boolean;
  begin
    Result :=
      MatchOperand(taicpu(p).oper[0]^, 0) and
      { MOV does not set flags, so they must not be in use afterwards }
      not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs);
    if Result then
      begin
        DebugMsg(SPeepholeOptimization + 'AND 0 -> MOV 0', p);
        taicpu(p).opcode := A_MOV;
      end;
  end;
{ Returns True if instruction hp loads reg with a completely new value,
  i.e. a value that does not depend on reg's previous contents (reg is
  fully overwritten and does not appear in any source operand or memory
  reference that is read). Flag sub-registers are handled via the
  instruction property (insprop) change sets.
  Fix: the RS_EDX case previously tested Ch_REDX (a *read* flag) instead
  of Ch_WEDX, wrongly reporting a new value for instructions that merely
  read EDX. }
function TX86AsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
  var
    p: taicpu absolute hp;
    i: Integer;
  begin
    Result := False;
    if not assigned(hp) or
      (hp.typ <> ait_instruction) then
      Exit;
    // p := taicpu(hp);
    Prefetch(insprop[p.opcode]);
    { Flag registers: decide purely from the instruction property tables }
    if SuperRegistersEqual(reg,NR_DEFAULTFLAGS) then
      with insprop[p.opcode] do
        begin
          case getsubreg(reg) of
            { The whole flags register is "new" only if every individual
              arithmetic flag is written }
            R_SUBW,R_SUBD,R_SUBQ:
              Result:=
                RegLoadedWithNewValue(NR_CARRYFLAG,hp) and
                RegLoadedWithNewValue(NR_PARITYFLAG,hp) and
                RegLoadedWithNewValue(NR_AUXILIARYFLAG,hp) and
                RegLoadedWithNewValue(NR_ZEROFLAG,hp) and
                RegLoadedWithNewValue(NR_SIGNFLAG,hp) and
                RegLoadedWithNewValue(NR_OVERFLOWFLAG,hp);
            R_SUBFLAGCARRY:
              Result:=[Ch_W0CarryFlag,Ch_W1CarryFlag,Ch_WCarryFlag,Ch_WUCarryFlag,Ch_WFlags]*Ch<>[];
            R_SUBFLAGPARITY:
              Result:=[Ch_W0ParityFlag,Ch_W1ParityFlag,Ch_WParityFlag,Ch_WUParityFlag,Ch_WFlags]*Ch<>[];
            R_SUBFLAGAUXILIARY:
              Result:=[Ch_W0AuxiliaryFlag,Ch_W1AuxiliaryFlag,Ch_WAuxiliaryFlag,Ch_WUAuxiliaryFlag,Ch_WFlags]*Ch<>[];
            R_SUBFLAGZERO:
              Result:=[Ch_W0ZeroFlag,Ch_W1ZeroFlag,Ch_WZeroFlag,Ch_WUZeroFlag,Ch_WFlags]*Ch<>[];
            R_SUBFLAGSIGN:
              Result:=[Ch_W0SignFlag,Ch_W1SignFlag,Ch_WSignFlag,Ch_WUSignFlag,Ch_WFlags]*Ch<>[];
            R_SUBFLAGOVERFLOW:
              Result:=[Ch_W0OverflowFlag,Ch_W1OverflowFlag,Ch_WOverflowFlag,Ch_WUOverflowFlag,Ch_WFlags]*Ch<>[];
            R_SUBFLAGINTERRUPT:
              Result:=[Ch_W0IntFlag,Ch_W1IntFlag,Ch_WFlags]*Ch<>[];
            R_SUBFLAGDIRECTION:
              Result:=[Ch_W0DirFlag,Ch_W1DirFlag,Ch_WFlags]*Ch<>[];
            else
              begin
                { NOTE(review): writeln in compiler code - debug output on
                  the internal-error path; consider removing }
                writeln(getsubreg(reg));
                internalerror(2017050501);
              end;
          end;
          exit;
        end;
    { Handle special cases first }
    case p.opcode of
      { Plain moves/loads: new value if the destination fully overwrites reg
        and the source does not read reg (directly or via a reference) }
      A_MOV, A_MOVZX, A_MOVSX, A_LEA, A_VMOVSS, A_VMOVSD, A_VMOVAPD,
      A_VMOVAPS, A_VMOVQ, A_MOVSS, A_MOVSD, A_MOVQ, A_MOVAPD, A_MOVAPS:
        begin
          Result :=
            (p.ops=2) and { A_MOVSD can have zero operands, so this check is needed }
            (p.oper[1]^.typ = top_reg) and
            (Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg,reg)) and
            (
              (p.oper[0]^.typ = top_const) or
              (
                (p.oper[0]^.typ = top_reg) and
                not(Reg1ReadDependsOnReg2(p.oper[0]^.reg,reg))
              ) or (
                (p.oper[0]^.typ = top_ref) and
                not RegInRef(reg,p.oper[0]^.ref^)
              )
            );
        end;
      { Multiplies: the 3-operand IMUL has an explicit destination; the
        1-operand forms write implicit result registers (AX, DX:AX,
        EDX:EAX, RDX:RAX) }
      A_MUL, A_IMUL:
        Result :=
          (
            (p.ops=3) and { IMUL only }
            (Reg1WriteOverwritesReg2Entirely(p.oper[2]^.reg,reg)) and
            (
              (
                (p.oper[1]^.typ=top_reg) and
                not Reg1ReadDependsOnReg2(p.oper[1]^.reg,reg)
              ) or (
                (p.oper[1]^.typ=top_ref) and
                not RegInRef(reg,p.oper[1]^.ref^)
              )
            )
          ) or (
            (
              (p.ops=1) and
              (
                (
                  (
                    (p.oper[0]^.typ=top_reg) and
                    not Reg1ReadDependsOnReg2(p.oper[0]^.reg,reg)
                  )
                ) or (
                  (p.oper[0]^.typ=top_ref) and
                  not RegInRef(reg,p.oper[0]^.ref^)
                )
              ) and (
                (
                  { Byte-sized MUL writes AX but reads AL }
                  (p.opsize=S_B) and
                  Reg1WriteOverwritesReg2Entirely(NR_AX,reg) and
                  not Reg1ReadDependsOnReg2(NR_AL,reg)
                ) or (
                  (p.opsize=S_W) and
                  Reg1WriteOverwritesReg2Entirely(NR_DX,reg)
                ) or (
                  (p.opsize=S_L) and
                  Reg1WriteOverwritesReg2Entirely(NR_EDX,reg)
{$ifdef x86_64}
                ) or (
                  (p.opsize=S_Q) and
                  Reg1WriteOverwritesReg2Entirely(NR_RDX,reg)
{$endif x86_64}
                )
              )
            )
          );
      { CBW writes AX but reads AL }
      A_CBW:
        Result := Reg1WriteOverwritesReg2Entirely(NR_AX,reg) and not(Reg1ReadDependsOnReg2(NR_AL,reg));
{$ifndef x86_64}
      { Far-pointer loads write a segment register }
      A_LDS:
        Result := (reg=NR_DS) and not(RegInRef(reg,p.oper[0]^.ref^));
      A_LES:
        Result := (reg=NR_ES) and not(RegInRef(reg,p.oper[0]^.ref^));
{$endif not x86_64}
      A_LFS:
        Result := (reg=NR_FS) and not(RegInRef(reg,p.oper[0]^.ref^));
      A_LGS:
        Result := (reg=NR_GS) and not(RegInRef(reg,p.oper[0]^.ref^));
      A_LSS:
        Result := (reg=NR_SS) and not(RegInRef(reg,p.oper[0]^.ref^));
      A_LAHF{$ifndef x86_64}, A_AAM{$endif not x86_64}:
        Result := Reg1WriteOverwritesReg2Entirely(NR_AH,reg);
      { String loads write the accumulator }
      A_LODSB:
        Result := Reg1WriteOverwritesReg2Entirely(NR_AL,reg);
      A_LODSW:
        Result := Reg1WriteOverwritesReg2Entirely(NR_AX,reg);
{$ifdef x86_64}
      A_LODSQ:
        Result := Reg1WriteOverwritesReg2Entirely(NR_RAX,reg);
{$endif x86_64}
      A_LODSD:
        Result := Reg1WriteOverwritesReg2Entirely(NR_EAX,reg);
      A_FSTSW, A_FNSTSW:
        Result := (p.oper[0]^.typ=top_reg) and Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg,reg);
      else
        begin
          { Generic path: use the instruction property change sets }
          with insprop[p.opcode] do
            begin
              if (
                { xor %reg,%reg etc. is classed as a new value }
                (([Ch_NoReadIfEqualRegs]*Ch)<>[]) and
                MatchOpType(p, top_reg, top_reg) and
                (p.oper[0]^.reg = p.oper[1]^.reg) and
                Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg,reg)
              ) then
                begin
                  Result := True;
                  Exit;
                end;
              { Make sure the entire register is overwritten }
              if (getregtype(reg) = R_INTREGISTER) then
                begin
                  { Inspect each explicit operand: any read of reg (as a
                    register operand or inside a reference) disqualifies;
                    a pure write qualifies only if it covers reg fully }
                  if (p.ops > 0) then
                    begin
                      if RegInOp(reg, p.oper[0]^) then
                        begin
                          if (p.oper[0]^.typ = top_ref) then
                            begin
                              if RegInRef(reg, p.oper[0]^.ref^) then
                                begin
                                  Result := False;
                                  Exit;
                                end;
                            end
                          else if (p.oper[0]^.typ = top_reg) then
                            begin
                              if ([Ch_ROp1, Ch_RWOp1, Ch_MOp1]*Ch<>[]) then
                                begin
                                  Result := False;
                                  Exit;
                                end
                              else if ([Ch_WOp1]*Ch<>[]) then
                                begin
                                  if Reg1WriteOverwritesReg2Entirely(p.oper[0]^.reg, reg) then
                                    Result := True
                                  else
                                    begin
                                      Result := False;
                                      Exit;
                                    end;
                                end;
                            end;
                        end;
                      if (p.ops > 1) then
                        begin
                          if RegInOp(reg, p.oper[1]^) then
                            begin
                              if (p.oper[1]^.typ = top_ref) then
                                begin
                                  if RegInRef(reg, p.oper[1]^.ref^) then
                                    begin
                                      Result := False;
                                      Exit;
                                    end;
                                end
                              else if (p.oper[1]^.typ = top_reg) then
                                begin
                                  if ([Ch_ROp2, Ch_RWOp2, Ch_MOp2]*Ch<>[]) then
                                    begin
                                      Result := False;
                                      Exit;
                                    end
                                  else if ([Ch_WOp2]*Ch<>[]) then
                                    begin
                                      if Reg1WriteOverwritesReg2Entirely(p.oper[1]^.reg, reg) then
                                        Result := True
                                      else
                                        begin
                                          Result := False;
                                          Exit;
                                        end;
                                    end;
                                end;
                            end;
                          if (p.ops > 2) then
                            begin
                              if RegInOp(reg, p.oper[2]^) then
                                begin
                                  if (p.oper[2]^.typ = top_ref) then
                                    begin
                                      if RegInRef(reg, p.oper[2]^.ref^) then
                                        begin
                                          Result := False;
                                          Exit;
                                        end;
                                    end
                                  else if (p.oper[2]^.typ = top_reg) then
                                    begin
                                      if ([Ch_ROp3, Ch_RWOp3, Ch_MOp3]*Ch<>[]) then
                                        begin
                                          Result := False;
                                          Exit;
                                        end
                                      else if ([Ch_WOp3]*Ch<>[]) then
                                        begin
                                          if Reg1WriteOverwritesReg2Entirely(p.oper[2]^.reg, reg) then
                                            Result := True
                                          else
                                            begin
                                              Result := False;
                                              Exit;
                                            end;
                                        end;
                                    end;
                                end;
                              if (p.ops > 3) and RegInOp(reg, p.oper[3]^) then
                                begin
                                  if (p.oper[3]^.typ = top_ref) then
                                    begin
                                      if RegInRef(reg, p.oper[3]^.ref^) then
                                        begin
                                          Result := False;
                                          Exit;
                                        end;
                                    end
                                  else if (p.oper[3]^.typ = top_reg) then
                                    begin
                                      if ([Ch_ROp4, Ch_RWOp4, Ch_MOp4]*Ch<>[]) then
                                        begin
                                          Result := False;
                                          Exit;
                                        end
                                      else if ([Ch_WOp4]*Ch<>[]) then
                                        begin
                                          if Reg1WriteOverwritesReg2Entirely(p.oper[3]^.reg, reg) then
                                            Result := True
                                          else
                                            begin
                                              Result := False;
                                              Exit;
                                            end;
                                        end;
                                    end;
                                end;
                            end;
                        end;
                    end;
                  { Don't do these ones first in case an input operand is equal to an explicit output registers }
                  case getsupreg(reg) of
                    RS_EAX:
                      if ([Ch_WEAX{$ifdef x86_64},Ch_WRAX{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EAX, reg) then
                        begin
                          Result := True;
                          Exit;
                        end;
                    RS_ECX:
                      if ([Ch_WECX{$ifdef x86_64},Ch_WRCX{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_ECX, reg) then
                        begin
                          Result := True;
                          Exit;
                        end;
                    RS_EDX:
                      { Fixed: was Ch_REDX (read flag), which misreported
                        instructions that only read EDX as writing it }
                      if ([Ch_WEDX{$ifdef x86_64},Ch_WRDX{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EDX, reg) then
                        begin
                          Result := True;
                          Exit;
                        end;
                    RS_EBX:
                      if ([Ch_WEBX{$ifdef x86_64},Ch_WRBX{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EBX, reg) then
                        begin
                          Result := True;
                          Exit;
                        end;
                    RS_ESP:
                      if ([Ch_WESP{$ifdef x86_64},Ch_WRSP{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_ESP, reg) then
                        begin
                          Result := True;
                          Exit;
                        end;
                    RS_EBP:
                      if ([Ch_WEBP{$ifdef x86_64},Ch_WRBP{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EBP, reg) then
                        begin
                          Result := True;
                          Exit;
                        end;
                    RS_ESI:
                      if ([Ch_WESI{$ifdef x86_64},Ch_WRSI{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_ESI, reg) then
                        begin
                          Result := True;
                          Exit;
                        end;
                    RS_EDI:
                      if ([Ch_WEDI{$ifdef x86_64},Ch_WRDI{$endif x86_64}]*Ch<>[]) and Reg1WriteOverwritesReg2Entirely(NR_EDI, reg) then
                        begin
                          Result := True;
                          Exit;
                        end;
                    else
                      ;
                  end;
                end;
            end;
        end;
    end;
  end;
{ Returns True if p starts a recognised function epilogue:
  - RET
  - LEAVE; RET
  - LEA x(%esp),%esp; RET
  - MOV %fp,%sp (or LEA x(%fp),%sp); POP %fp; RET }
class function TX86AsmOptimizer.IsExitCode(p : tai) : boolean;
  var
    hp2,hp3 : tai;
  begin
    { some x86-64 targets issue a NOP before the real exit code }
    if MatchInstruction(p,A_NOP,[]) then
      GetNextInstruction(p,p);
    result:=assigned(p) and (p.typ=ait_instruction) and
      { bare return }
      ((taicpu(p).opcode = A_RET) or
      { LEAVE followed by RET }
      ((taicpu(p).opcode=A_LEAVE) and
        GetNextInstruction(p,hp2) and
        MatchInstruction(hp2,A_RET,[S_NO])
      ) or
      { stack-pointer adjustment via LEA, followed by RET }
      (((taicpu(p).opcode=A_LEA) and
        MatchOpType(taicpu(p),top_ref,top_reg) and
        (taicpu(p).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
        (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)
       ) and
       GetNextInstruction(p,hp2) and
       MatchInstruction(hp2,A_RET,[S_NO])
      ) or
      { restore stack pointer from the frame pointer (MOV or LEA form),
        pop the saved frame pointer, then RET }
      ((((taicpu(p).opcode=A_MOV) and
         MatchOpType(taicpu(p),top_reg,top_reg) and
         (taicpu(p).oper[0]^.reg=current_procinfo.framepointer) and
         (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)) or
        ((taicpu(p).opcode=A_LEA) and
         MatchOpType(taicpu(p),top_ref,top_reg) and
         (taicpu(p).oper[0]^.ref^.base=current_procinfo.framepointer) and
         (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG)
        )
       ) and
       GetNextInstruction(p,hp2) and
       MatchInstruction(hp2,A_POP,[reg2opsize(current_procinfo.framepointer)]) and
       MatchOpType(taicpu(hp2),top_reg) and
       (taicpu(hp2).oper[0]^.reg=current_procinfo.framepointer) and
       GetNextInstruction(hp2,hp3) and
       MatchInstruction(hp3,A_RET,[S_NO])
      )
      );
  end;
{ Returns True if hp1 is an arithmetic/logical instruction whose
  destination is reg and whose source operand does not read reg, so that a
  preceding load of reg can be folded into it }
class function TX86AsmOptimizer.isFoldableArithOp(hp1: taicpu; reg: tregister): boolean;
  begin
    Result := False;
    case hp1.opcode of
      A_ADD,A_SUB,A_OR,A_XOR,A_AND,A_SHL,A_SHR,A_SAR:
        { Two-operand form: destination must be reg; the source may be a
          constant or any register other than reg itself }
        if (hp1.oper[1]^.typ = top_reg) and
           (hp1.oper[1]^.reg = reg) then
          Result :=
            (hp1.oper[0]^.typ = top_const) or
            ((hp1.oper[0]^.typ = top_reg) and
             (hp1.oper[0]^.reg <> reg));
      A_INC,A_DEC,A_NEG,A_NOT:
        { Single-operand form: must operate directly on reg }
        Result :=
          (hp1.oper[0]^.typ = top_reg) and
          (hp1.oper[0]^.reg = reg);
      else
        ;
    end;
  end;
{ Removes the last deallocation marker(s) before p for the register(s)
  holding the function result, so the result registers remain marked as
  live through the exit code }
procedure TX86AsmOptimizer.RemoveLastDeallocForFuncRes(p: tai);

  { Scans backwards from p and deletes the first ra_dealloc found for the
    given integer super-register; stops early if an instruction using the
    register is encountered }
  procedure DoRemoveLastDeallocForFuncRes( supreg: tsuperregister);
    var
      hp2: tai;
    begin
      hp2 := p;
      repeat
        hp2 := tai(hp2.previous);
        if assigned(hp2) and
          (hp2.typ = ait_regalloc) and
          (tai_regalloc(hp2).ratype=ra_dealloc) and
          (getregtype(tai_regalloc(hp2).reg) = R_INTREGISTER) and
          (getsupreg(tai_regalloc(hp2).reg) = supreg) then
          begin
            RemoveInstruction(hp2);
            break;
          end;
      until not(assigned(hp2)) or regInInstruction(newreg(R_INTREGISTER,supreg,R_SUBWHOLE),hp2);
    end;

  begin
    case current_procinfo.procdef.returndef.typ of
      { results returned in EAX }
      arraydef,recorddef,pointerdef,
         stringdef,enumdef,procdef,objectdef,errordef,
         filedef,setdef,procvardef,
         classrefdef,forwarddef:
        DoRemoveLastDeallocForFuncRes(RS_EAX);
      orddef:
        if current_procinfo.procdef.returndef.size <> 0 then
          begin
            DoRemoveLastDeallocForFuncRes(RS_EAX);
            { for int64/qword }
            if current_procinfo.procdef.returndef.size = 8 then
              DoRemoveLastDeallocForFuncRes(RS_EDX);
          end;
      else
        ;
    end;
  end;
{ Pass-1 optimisations for (V)MOVAP* instructions: removes no-op moves,
  collapses move chains, and folds register copies into following FMA or
  scalar arithmetic instructions. Returns True if the code was changed }
function TX86AsmOptimizer.OptPass1_V_MOVAP(var p : tai) : boolean;
  var
    hp1,hp2 : tai;
  begin
    result:=false;
    if MatchOpType(taicpu(p),top_reg,top_reg) then
      begin
        { vmova* reg1,reg1
          =>
          <nop> }
        if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
          begin
            RemoveCurrentP(p);
            result:=true;
            exit;
          end
        else if GetNextInstruction(p,hp1) then
          begin
            if MatchInstruction(hp1,[taicpu(p).opcode],[S_NO]) and
              MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
              begin
                { vmova* reg1,reg2
                  vmova* reg2,reg3
                  dealloc reg2
                  =>
                  vmova* reg1,reg3 }
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                if MatchOpType(taicpu(hp1),top_reg,top_reg) and
                  not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVA*2(V)MOVA* 1',p);
                    taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                    RemoveInstruction(hp1);
                    result:=true;
                    exit;
                  end
                { special case:
                  vmova* reg1,<op>
                  vmova* <op>,reg1
                  =>
                  vmova* reg1,<op> }
                else if MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) and
                  { must not drop a read of a volatile memory location }
                  ((taicpu(p).oper[0]^.typ<>top_ref) or
                   (not(vol_read in taicpu(p).oper[0]^.ref^.volatility))
                  ) then
                  begin
                    DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVA*2(V)MOVA* 2',p);
                    RemoveInstruction(hp1);
                    result:=true;
                    exit;
                  end
              end
            else if ((MatchInstruction(p,[A_MOVAPS,A_VMOVAPS],[S_NO]) and
               MatchInstruction(hp1,[A_MOVSS,A_VMOVSS],[S_NO])) or
               ((MatchInstruction(p,[A_MOVAPD,A_VMOVAPD],[S_NO]) and
                 MatchInstruction(hp1,[A_MOVSD,A_VMOVSD],[S_NO])))
              ) and
              MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
              begin
                { vmova* reg1,reg2
                  vmovs* reg2,<op>
                  dealloc reg2
                  =>
                  vmovs* reg1,reg3 }
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + '(V)MOVA*(V)MOVS*2(V)MOVS* 1',p);
                    taicpu(p).opcode:=taicpu(hp1).opcode;
                    taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                    RemoveInstruction(hp1);
                    result:=true;
                    exit;
                  end
              end;
          end;
        if GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) then
          begin
            { vmova* reg1,reg2
              <FMA using reg2 as operand 2>
              vmova* result,reg1
              =>
              <FMA using reg1 directly>, when reg2 dies after the sequence }
            if MatchInstruction(hp1,[A_VFMADDPD,
                                     A_VFMADD132PD,
                                     A_VFMADD132PS,
                                     A_VFMADD132SD,
                                     A_VFMADD132SS,
                                     A_VFMADD213PD,
                                     A_VFMADD213PS,
                                     A_VFMADD213SD,
                                     A_VFMADD213SS,
                                     A_VFMADD231PD,
                                     A_VFMADD231PS,
                                     A_VFMADD231SD,
                                     A_VFMADD231SS,
                                     A_VFMADDSUB132PD,
                                     A_VFMADDSUB132PS,
                                     A_VFMADDSUB213PD,
                                     A_VFMADDSUB213PS,
                                     A_VFMADDSUB231PD,
                                     A_VFMADDSUB231PS,
                                     A_VFMSUB132PD,
                                     A_VFMSUB132PS,
                                     A_VFMSUB132SD,
                                     A_VFMSUB132SS,
                                     A_VFMSUB213PD,
                                     A_VFMSUB213PS,
                                     A_VFMSUB213SD,
                                     A_VFMSUB213SS,
                                     A_VFMSUB231PD,
                                     A_VFMSUB231PS,
                                     A_VFMSUB231SD,
                                     A_VFMSUB231SS,
                                     A_VFMSUBADD132PD,
                                     A_VFMSUBADD132PS,
                                     A_VFMSUBADD213PD,
                                     A_VFMSUBADD213PS,
                                     A_VFMSUBADD231PD,
                                     A_VFMSUBADD231PS,
                                     A_VFNMADD132PD,
                                     A_VFNMADD132PS,
                                     A_VFNMADD132SD,
                                     A_VFNMADD132SS,
                                     A_VFNMADD213PD,
                                     A_VFNMADD213PS,
                                     A_VFNMADD213SD,
                                     A_VFNMADD213SS,
                                     A_VFNMADD231PD,
                                     A_VFNMADD231PS,
                                     A_VFNMADD231SD,
                                     A_VFNMADD231SS,
                                     A_VFNMSUB132PD,
                                     A_VFNMSUB132PS,
                                     A_VFNMSUB132SD,
                                     A_VFNMSUB132SS,
                                     A_VFNMSUB213PD,
                                     A_VFNMSUB213PS,
                                     A_VFNMSUB213SD,
                                     A_VFNMSUB213SS,
                                     A_VFNMSUB231PD,
                                     A_VFNMSUB231PS,
                                     A_VFNMSUB231SD,
                                     A_VFNMSUB231SS],[S_NO]) and
              { we mix single and double opperations here because we assume that the compiler
                generates vmovapd only after double operations and vmovaps only after single operations }
              MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[2]^) and
              GetNextInstruction(hp1,hp2) and
              MatchInstruction(hp2,[A_VMOVAPD,A_VMOVAPS,A_MOVAPD,A_MOVAPS],[S_NO]) and
              MatchOperand(taicpu(p).oper[0]^,taicpu(hp2).oper[1]^) then
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                  begin
                    taicpu(hp1).loadoper(2,taicpu(p).oper[0]^);
                    RemoveCurrentP(p, hp1); // <-- Is this actually safe? hp1 is not necessarily the next instruction. [Kit]
                    RemoveInstruction(hp2);
                  end;
              end
            else if (hp1.typ = ait_instruction) and
              GetNextInstruction(hp1, hp2) and
              MatchInstruction(hp2,taicpu(p).opcode,[]) and
              OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[0]^) and
              MatchOpType(taicpu(hp2),top_reg,top_reg) and
              MatchOperand(taicpu(hp2).oper[0]^,taicpu(p).oper[1]^) and
              (((taicpu(p).opcode=A_MOVAPS) and
                ((taicpu(hp1).opcode=A_ADDSS) or (taicpu(hp1).opcode=A_SUBSS) or
                 (taicpu(hp1).opcode=A_MULSS) or (taicpu(hp1).opcode=A_DIVSS))) or
               ((taicpu(p).opcode=A_MOVAPD) and
                ((taicpu(hp1).opcode=A_ADDSD) or (taicpu(hp1).opcode=A_SUBSD) or
                 (taicpu(hp1).opcode=A_MULSD) or (taicpu(hp1).opcode=A_DIVSD)))
              ) then
              { change
                movapX    reg,reg2
                addsX/subsX/... reg3, reg2
                movapX    reg2,reg
                to
                addsX/subsX/... reg3,reg
              }
              begin
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MovapXOpMovapX2Op ('+
                          debug_op2str(taicpu(p).opcode)+' '+
                          debug_op2str(taicpu(hp1).opcode)+' '+
                          debug_op2str(taicpu(hp2).opcode)+') done',p);
                    { we cannot eliminate the first move if
                      the operations uses the same register for source and dest }
                    if not(OpsEqual(taicpu(hp1).oper[1]^,taicpu(hp1).oper[0]^)) then
                      RemoveCurrentP(p, nil);
                    p:=hp1;
                    taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
                    RemoveInstruction(hp2);
                    result:=true;
                  end;
              end;
          end;
      end;
  end;
{ Pass-1 optimisation for three-operand AVX arithmetic: folds a following
  register-to-register VMOVAP* of the result into the operation's
  destination operand when the intermediate register dies }
function TX86AsmOptimizer.OptPass1VOP(var p : tai) : boolean;
  var
    hp1 : tai;
  begin
    result:=false;
    { replace
        V<Op>X   %mreg1,%mreg2,%mreg3
        VMovX    %mreg3,%mreg4
        dealloc  %mreg3
      by
        V<Op>X   %mreg1,%mreg2,%mreg4
      ?
    }
    if GetNextInstruction(p,hp1) and
      { we mix single and double operations here because we assume that the compiler
        generates vmovapd only after double operations and vmovaps only after single operations }
      MatchInstruction(hp1,A_VMOVAPD,A_VMOVAPS,[S_NO]) and
      MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
      (taicpu(hp1).oper[1]^.typ=top_reg) then
      begin
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
        { only valid if %mreg3 is not used after the move }
        if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
          begin
            taicpu(p).loadoper(2,taicpu(hp1).oper[1]^);
            DebugMsg(SPeepholeOptimization + 'VOpVmov2VOp done',p);
            RemoveInstruction(hp1);
            result:=true;
          end;
      end;
  end;
{ Substitutes ANewReg for exact occurrences of AOldReg in the base and
  index fields of a memory reference; returns True if anything changed }
class function TX86AsmOptimizer.ReplaceRegisterInRef(var ref: TReference; const AOldReg, ANewReg: TRegister): Boolean;
  var
    BaseMatched, IndexMatched: Boolean;
  begin
    { For safety reasons, only exact register matches are replaced }
    BaseMatched := (ref.base = AOldReg);
    if BaseMatched then
      ref.base := ANewReg;
    IndexMatched := (ref.index = AOldReg);
    if IndexMatched then
      ref.index := ANewReg;
    Result := BaseMatched or IndexMatched;
  end;
{ Replaces all references to AOldReg in an operand with ANewReg (matching
  sub-register of AOldReg for register operands, exact base/index match
  inside references). Returns True if the operand was changed }
class function TX86AsmOptimizer.ReplaceRegisterInOper(const p: taicpu; const OperIdx: Integer; const AOldReg, ANewReg: TRegister): Boolean;
  var
    OldSupReg, NewSupReg: TSuperRegister;
    OldSubReg, NewSubReg: TSubRegister;
    OldRegType: TRegisterType;
    ThisOper: POper;
  begin
    ThisOper := p.oper[OperIdx]; { Faster to access overall }
    Result := False;
    if (AOldReg = NR_NO) or (ANewReg = NR_NO) then
      InternalError(2020011801);
    OldSupReg := getsupreg(AOldReg);
    OldSubReg := getsubreg(AOldReg);
    OldRegType := getregtype(AOldReg);
    NewSupReg := getsupreg(ANewReg);
    NewSubReg := getsubreg(ANewReg);
    { Old and new register must be of the same type and sub-register size }
    if OldRegType <> getregtype(ANewReg) then
      InternalError(2020011802);
    if OldSubReg <> NewSubReg then
      InternalError(2020011803);
    case ThisOper^.typ of
      top_reg:
        { Replace an exact match, or - for integer registers - any view of
          the same super-register that is no larger than AOldReg (the
          operand keeps its own sub-register size after the swap) }
        if (
          (ThisOper^.reg = AOldReg) or
          (
            (OldRegType = R_INTREGISTER) and
            (getsupreg(ThisOper^.reg) = OldSupReg) and
            (getregtype(ThisOper^.reg) = R_INTREGISTER) and
            (
              (getsubreg(ThisOper^.reg) <= OldSubReg)
{$ifndef x86_64}
              and (
                { Under i386 and i8086, ESI, EDI, EBP and ESP
                  don't have an 8-bit representation }
                (getsubreg(ThisOper^.reg) >= R_SUBW) or
                not (NewSupReg in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
              )
{$endif x86_64}
            )
          )
        ) then
          begin
            ThisOper^.reg := newreg(getregtype(ANewReg), NewSupReg, getsubreg(p.oper[OperIdx]^.reg));
            Result := True;
          end;
      top_ref:
        if ReplaceRegisterInRef(ThisOper^.ref^, AOldReg, ANewReg) then
          Result := True;
      else
        ;
    end;
  end;
{ Replaces all *read* references to AOldReg in an instruction with ANewReg.
  Operands that are only written are left alone, except for memory
  references, whose address registers are always read. Returns True if
  anything was changed }
class function TX86AsmOptimizer.ReplaceRegisterInInstruction(const p: taicpu; const AOldReg, ANewReg: TRegister): Boolean;
  const
    { Per-operand "is read" flags from the instruction property tables }
    ReadFlag: array[0..3] of TInsChange = (Ch_Rop1, Ch_Rop2, Ch_Rop3, Ch_Rop4);
  var
    OperIdx: Integer;
  begin
    Result := False;
    for OperIdx := 0 to p.ops - 1 do
      if (ReadFlag[OperIdx] in InsProp[p.Opcode].Ch) then
        begin
          { The shift and rotate instructions can only use CL }
          if not (
              (OperIdx = 0) and
              { This second condition just helps to avoid unnecessarily
                calling MatchInstruction for 10 different opcodes }
              (p.oper[0]^.reg = NR_CL) and
              MatchInstruction(p, [A_RCL, A_RCR, A_ROL, A_ROR, A_SAL, A_SAR, A_SHL, A_SHLD, A_SHR, A_SHRD], [])
            ) then
            Result := ReplaceRegisterInOper(p, OperIdx, AOldReg, ANewReg) or Result;
        end
      else if p.oper[OperIdx]^.typ = top_ref then
        { It's okay to replace registers in references that get written to }
        Result := ReplaceRegisterInOper(p, OperIdx, AOldReg, ANewReg) or Result;
  end;
{ Returns True for references with no index register whose base is the
  stack pointer or the current frame pointer (or, on x86-64, a
  RIP-relative PIC address) }
class function TX86AsmOptimizer.IsRefSafe(const ref: PReference): Boolean;
  begin
    { An index register disqualifies the reference immediately }
    if ref^.index <> NR_NO then
      begin
        Result := False;
        Exit;
      end;
    Result :=
      (ref^.base = NR_STACK_POINTER_REG) or
      (ref^.base = current_procinfo.framepointer)
{$ifdef x86_64}
      or (
        (ref^.base = NR_RIP) and
        (ref^.refaddr in [addr_pic, addr_pic_no_got])
      )
{$endif x86_64}
      ;
  end;
{ Converts "lea x(%reg),%reg" (same base and destination, no index, no
  symbol) into the equivalent INC/DEC/ADD/SUB instruction. Returns True if
  the instruction was converted }
function TX86AsmOptimizer.ConvertLEA(const p: taicpu): Boolean;
  var
    l: asizeint;
  begin
    Result := False;
    { Should have been checked previously }
    if p.opcode <> A_LEA then
      InternalError(2020072501);
    { Do not touch the stack pointer: adjusting it via LEA is recommended
      (it leaves the flags untouched), except when optimising for size }
    if (p.oper[1]^.reg=NR_STACK_POINTER_REG) and
      not(cs_opt_size in current_settings.optimizerswitches) then
     exit;
    with p.oper[0]^.ref^ do
      begin
        { Only the simple form "offset(%reg),%reg" can be converted }
        if (base <> p.oper[1]^.reg) or
          (index <> NR_NO) or
          assigned(symbol) then
          exit;
        l:=offset;
        if (l=1) and UseIncDec then
          begin
            p.opcode:=A_INC;
            p.loadreg(0,p.oper[1]^.reg);
            p.ops:=1;
            DebugMsg(SPeepholeOptimization + 'Lea2Inc done',p);
          end
        else if (l=-1) and UseIncDec then
          begin
            p.opcode:=A_DEC;
            p.loadreg(0,p.oper[1]^.reg);
            p.ops:=1;
            DebugMsg(SPeepholeOptimization + 'Lea2Dec done',p);
          end
        else
          begin
            { -(-2147483648) cannot be represented, so that offset must
              stay as an ADD rather than become SUB of the negation }
            if (l<0) and (l<>-2147483648) then
              begin
                p.opcode:=A_SUB;
                p.loadConst(0,-l);
                DebugMsg(SPeepholeOptimization + 'Lea2Sub done',p);
              end
            else
              begin
                p.opcode:=A_ADD;
                p.loadConst(0,l);
                DebugMsg(SPeepholeOptimization + 'Lea2Add done',p);
              end;
          end;
      end;
    Result := True;
  end;
{ Given "mov %reg1,%reg2" in p_mov (oper[0] = source, oper[1] =
  destination), tries to rewrite reads of the destination register in the
  later instruction hp to read the source register instead, shortening the
  dependency chain. Returns True if hp was changed }
function TX86AsmOptimizer.DeepMOVOpt(const p_mov: taicpu; const hp: taicpu): Boolean;
  var
    CurrentReg, ReplaceReg: TRegister;
  begin
    Result := False;
    ReplaceReg := taicpu(p_mov).oper[0]^.reg;
    CurrentReg := taicpu(p_mov).oper[1]^.reg;
    case hp.opcode of
      A_FSTSW, A_FNSTSW,
      A_IN,   A_INS,  A_OUT,  A_OUTS,
      A_CMPS, A_LODS, A_MOVS, A_SCAS, A_STOS:
        { These routines have explicit operands, but they are restricted in
          what they can be (e.g. IN and OUT can only read from AL, AX or
          EAX. }
        Exit;
      A_IMUL:
        begin
          { The 1-operand version writes to implicit registers
            The 2-operand version reads from the first operator, and reads
            from and writes to the second (equivalent to Ch_ROp1, ChRWOp2).
            the 3-operand version reads from a register that it doesn't write to
          }
          case hp.ops of
            1:
              { Only replace if the register is not one of the implicit
                result registers (AL/AX for byte-sized, EAX/EDX otherwise) }
              if (
                  (
                    (hp.opsize = S_B) and (getsupreg(CurrentReg) <> RS_EAX)
                  ) or
                    not (getsupreg(CurrentReg) in [RS_EAX, RS_EDX])
                ) and ReplaceRegisterInOper(hp, 0, CurrentReg, ReplaceReg) then
                begin
                  Result := True;
                  DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 1)', hp);
                  AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                end;
            2:
              { Only modify the first parameter }
              if ReplaceRegisterInOper(hp, 0, CurrentReg, ReplaceReg) then
                begin
                  Result := True;
                  DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 2)', hp);
                  AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                end;
            3:
              { Only modify the second parameter }
              if ReplaceRegisterInOper(hp, 1, CurrentReg, ReplaceReg) then
                begin
                  Result := True;
                  DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovIMul2MovIMul 3)', hp);
                  AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
                end;
            else
              InternalError(2020012901);
          end;
        end;
      else
        { Generic instructions: replace in all operands that are read }
        if (hp.ops > 0) and
          ReplaceRegisterInInstruction(hp, CurrentReg, ReplaceReg) then
          begin
            Result := True;
            DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + debug_regname(ReplaceReg) + '; changed to minimise pipeline stall (MovXXX2MovXXX)', hp);
            AllocRegBetween(ReplaceReg, p_mov, hp, UsedRegs);
          end;
    end;
  end;
function TX86AsmOptimizer.OptPass1MOV(var p : tai) : boolean;
var
hp1, hp2, hp3: tai;
DoOptimisation, TempBool: Boolean;
{$ifdef x86_64}
NewConst: TCGInt;
{$endif x86_64}
{ Adjusts the constant of the MOV at p (outer scope) when it is followed
  by a sign- or zero-extending move at hp1 (outer scope).  If hp1 is the
  signed variant (signed_movop), constants with the top bit set within
  max_value are converted to their negative (signed) representation;
  otherwise the constant is masked down to the unsigned range.
  max_value is the all-ones mask for the source size (e.g. $FF for a
  byte source). }
procedure convert_mov_value(signed_movop: tasmop; max_value: tcgint); inline;
  begin
    if taicpu(hp1).opcode = signed_movop then
      begin
        { e.g. for a byte source: values above $7F represent negatives,
          so subtract (max_value + 1) to sign-extend }
        if taicpu(p).oper[0]^.val > max_value shr 1 then
          taicpu(p).oper[0]^.val := taicpu(p).oper[0]^.val - max_value - 1 { Convert to signed }
      end
    else
      taicpu(p).oper[0]^.val := taicpu(p).oper[0]^.val and max_value; { Trim to unsigned }
  end;
{ Tries to merge the two consecutive constant-to-stack MOVs at p1 and p2
  into a single wider write (byte+byte -> word, word+word -> longword,
  and on x86-64 longword+longword -> quadword where the combined value
  fits a sign-extended 32-bit immediate).  The writes must target
  adjacent, suitably aligned stack locations.  On success, p1 is widened
  in place, p2 is removed and True is returned.
  BUGFIX: the size dispatch must use taicpu(p1).opsize, not the outer
  p - when called as TryConstMerge(hp1, hp2) after an earlier merge has
  widened p, the two differ and the wrong branch would merge byte
  writes with word-sized masks and shifts. }
function TryConstMerge(var p1, p2: tai): Boolean;
  var
    ThisRef: TReference;
  begin
    Result := False;
    ThisRef := taicpu(p2).oper[1]^.ref^;
    { Only permit writes to the stack, since we can guarantee alignment with that }
    if (ThisRef.index = NR_NO) and
      (
        (ThisRef.base = NR_STACK_POINTER_REG) or
        (ThisRef.base = current_procinfo.framepointer)
      ) then
      begin
        { Dispatch on the size of the first write of the pair (p1) }
        case taicpu(p1).opsize of
          S_B:
            begin
              { Word writes must be on a 2-byte boundary }
              if (taicpu(p1).oper[1]^.ref^.offset mod 2) = 0 then
                begin
                  { Reduce offset of second reference to see if it is sequential with the first }
                  Dec(ThisRef.offset, 1);
                  if RefsEqual(taicpu(p1).oper[1]^.ref^, ThisRef) then
                    begin
                      { Make sure the constants aren't represented as a
                        negative number, as these won't merge properly }
                      taicpu(p1).opsize := S_W;
                      taicpu(p1).oper[0]^.val := (taicpu(p1).oper[0]^.val and $FF) or ((taicpu(p2).oper[0]^.val and $FF) shl 8);
                      DebugMsg(SPeepholeOptimization + 'Merged two byte-sized constant writes to stack (MovMov2Mov 2a)', p1);
                      RemoveInstruction(p2);
                      Result := True;
                    end;
                end;
            end;
          S_W:
            begin
              { Longword writes must be on a 4-byte boundary }
              if (taicpu(p1).oper[1]^.ref^.offset mod 4) = 0 then
                begin
                  { Reduce offset of second reference to see if it is sequential with the first }
                  Dec(ThisRef.offset, 2);
                  if RefsEqual(taicpu(p1).oper[1]^.ref^, ThisRef) then
                    begin
                      { Make sure the constants aren't represented as a
                        negative number, as these won't merge properly }
                      taicpu(p1).opsize := S_L;
                      taicpu(p1).oper[0]^.val := (taicpu(p1).oper[0]^.val and $FFFF) or ((taicpu(p2).oper[0]^.val and $FFFF) shl 16);
                      DebugMsg(SPeepholeOptimization + 'Merged two word-sized constant writes to stack (MovMov2Mov 2b)', p1);
                      RemoveInstruction(p2);
                      Result := True;
                    end;
                end;
            end;
{$ifdef x86_64}
          S_L:
            begin
              { Only sign-extended 32-bit constants can be written to 64-bit memory directly, so check to
                see if the constants can be encoded this way. }
              NewConst := (taicpu(p1).oper[0]^.val and $FFFFFFFF) or (taicpu(p2).oper[0]^.val shl 32);
              if (NewConst >= -2147483648) and (NewConst <= 2147483647) and
                { Quadword writes must be on an 8-byte boundary }
                ((taicpu(p1).oper[1]^.ref^.offset mod 8) = 0) then
                begin
                  { Reduce offset of second reference to see if it is sequential with the first }
                  Dec(ThisRef.offset, 4);
                  if RefsEqual(taicpu(p1).oper[1]^.ref^, ThisRef) then
                    begin
                      { Make sure the constants aren't represented as a
                        negative number, as these won't merge properly }
                      taicpu(p1).opsize := S_Q;
                      { Force a typecast into a 32-bit signed integer (that will then be sign-extended to 64-bit) }
                      taicpu(p1).oper[0]^.val := NewConst;
                      DebugMsg(SPeepholeOptimization + 'Merged two longword-sized constant writes to stack (MovMov2Mov 2c)', p1);
                      RemoveInstruction(p2);
                      Result := True;
                    end;
                end;
            end;
{$endif x86_64}
          else
            ;
        end;
      end;
  end;
var
GetNextInstruction_p, TempRegUsed, CrossJump: Boolean;
PreMessage, RegName1, RegName2, InputVal, MaskNum: string;
NewSize: topsize;
CurrentReg, ActiveReg: TRegister;
SourceRef, TargetRef: TReference;
MovAligned, MovUnaligned: TAsmOp;
begin
Result:=false;
GetNextInstruction_p:=GetNextInstruction(p, hp1);
{ remove mov reg1,reg1? }
if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^)
then
begin
DebugMsg(SPeepholeOptimization + 'Mov2Nop 1 done',p);
{ take care of the register (de)allocs following p }
RemoveCurrentP(p, hp1);
Result:=true;
exit;
end;
{ All the next optimisations require a next instruction }
if not GetNextInstruction_p or (hp1.typ <> ait_instruction) then
Exit;
{ Look for:
mov %reg1,%reg2
??? %reg2,r/m
Change to:
mov %reg1,%reg2
??? %reg1,r/m
}
if MatchOpType(taicpu(p), top_reg, top_reg) then
begin
CurrentReg := taicpu(p).oper[1]^.reg;
if RegReadByInstruction(CurrentReg, hp1) and
DeepMOVOpt(taicpu(p), taicpu(hp1)) then
begin
{ A change has occurred, just not in p }
Result := True;
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
if not RegUsedAfterInstruction(CurrentReg, hp1, TmpUsedRegs) and
{ Just in case something didn't get modified (e.g. an
implicit register) }
not RegReadByInstruction(CurrentReg, hp1) then
begin
{ We can remove the original MOV }
DebugMsg(SPeepholeOptimization + 'Mov2Nop 3 done',p);
RemoveCurrentp(p, hp1);
{ UsedRegs got updated by RemoveCurrentp }
Result := True;
Exit;
end;
{ If we know a MOV instruction has become a null operation, we might as well
get rid of it now to save time. }
if (taicpu(hp1).opcode = A_MOV) and
(taicpu(hp1).oper[1]^.typ = top_reg) and
SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[0]^.reg) and
{ Just being a register is enough to confirm it's a null operation }
(taicpu(hp1).oper[0]^.typ = top_reg) then
begin
Result := True;
{ Speed-up to reduce a pipeline stall... if we had something like...
movl %eax,%edx
movw %dx,%ax
... the second instruction would change to movw %ax,%ax, but
given that it is now %ax that's active rather than %eax,
penalties might occur due to a partial register write, so instead,
change it to a MOVZX instruction when optimising for speed.
}
if not (cs_opt_size in current_settings.optimizerswitches) and
IsMOVZXAcceptable and
(taicpu(hp1).opsize < taicpu(p).opsize)
{$ifdef x86_64}
{ operations already implicitly set the upper 64 bits to zero }
and not ((taicpu(hp1).opsize = S_L) and (taicpu(p).opsize = S_Q))
{$endif x86_64}
then
begin
CurrentReg := taicpu(hp1).oper[1]^.reg;
DebugMsg(SPeepholeOptimization + 'Zero-extension to minimise pipeline stall (Mov2Movz)',hp1);
case taicpu(p).opsize of
S_W:
if taicpu(hp1).opsize = S_B then
taicpu(hp1).opsize := S_BL
else
InternalError(2020012911);
S_L{$ifdef x86_64}, S_Q{$endif x86_64}:
case taicpu(hp1).opsize of
S_B:
taicpu(hp1).opsize := S_BL;
S_W:
taicpu(hp1).opsize := S_WL;
else
InternalError(2020012912);
end;
else
InternalError(2020012910);
end;
taicpu(hp1).opcode := A_MOVZX;
taicpu(hp1).oper[1]^.reg := newreg(getregtype(CurrentReg), getsupreg(CurrentReg), R_SUBD)
end
else
begin
GetNextInstruction_p := GetNextInstruction(hp1, hp2);
DebugMsg(SPeepholeOptimization + 'Mov2Nop 4 done',hp1);
RemoveInstruction(hp1);
{ The instruction after what was hp1 is now the immediate next instruction,
so we can continue to make optimisations if it's present }
if not GetNextInstruction_p or (hp2.typ <> ait_instruction) then
Exit;
hp1 := hp2;
end;
end;
end;
end;
{ Depending on the DeepMOVOpt above, it may turn out that hp1 completely
overwrites the original destination register. e.g.
movl ###,%reg2d
movslq ###,%reg2q (### doesn't have to be the same as the first one)
In this case, we can remove the MOV (Go to "Mov2Nop 5" below)
}
if (taicpu(p).oper[1]^.typ = top_reg) and
MatchInstruction(hp1, [A_LEA, A_MOV, A_MOVSX, A_MOVZX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}], []) and
(taicpu(hp1).oper[1]^.typ = top_reg) and
Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
begin
if RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) then
begin
if (taicpu(hp1).oper[0]^.typ = top_reg) then
case taicpu(p).oper[0]^.typ of
top_const:
{ We have something like:
movb $x, %regb
movzbl %regb,%regd
Change to:
movl $x, %regd
}
begin
case taicpu(hp1).opsize of
S_BW:
begin
convert_mov_value(A_MOVSX, $FF);
setsubreg(taicpu(p).oper[1]^.reg, R_SUBW);
taicpu(p).opsize := S_W;
end;
S_BL:
begin
convert_mov_value(A_MOVSX, $FF);
setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
taicpu(p).opsize := S_L;
end;
S_WL:
begin
convert_mov_value(A_MOVSX, $FFFF);
setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
taicpu(p).opsize := S_L;
end;
{$ifdef x86_64}
S_BQ:
begin
convert_mov_value(A_MOVSX, $FF);
setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
taicpu(p).opsize := S_Q;
end;
S_WQ:
begin
convert_mov_value(A_MOVSX, $FFFF);
setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
taicpu(p).opsize := S_Q;
end;
S_LQ:
begin
convert_mov_value(A_MOVSXD, $FFFFFFFF); { Note it's MOVSXD, not MOVSX }
setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
taicpu(p).opsize := S_Q;
end;
{$endif x86_64}
else
{ If hp1 was a MOV instruction, it should have been
optimised already }
InternalError(2020021001);
end;
DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 2 done',p);
RemoveInstruction(hp1);
Result := True;
Exit;
end;
top_ref:
{ We have something like:
movb mem, %regb
movzbl %regb,%regd
Change to:
movzbl mem, %regd
}
if (taicpu(p).oper[0]^.ref^.refaddr<>addr_full) and (IsMOVZXAcceptable or (taicpu(hp1).opcode<>A_MOVZX)) then
begin
DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 1 done',p);
taicpu(hp1).loadref(0,taicpu(p).oper[0]^.ref^);
RemoveCurrentP(p, hp1);
Result:=True;
Exit;
end;
else
if (taicpu(hp1).opcode <> A_MOV) and (taicpu(hp1).opcode <> A_LEA) then
{ Just to make a saving, since there are no more optimisations with MOVZX and MOVSX/D }
Exit;
end;
end
{ The RegInOp check makes sure that movl r/m,%reg1l; movzbl (%reg1l),%reg1l"
and "movl r/m,%reg1; leal $1(%reg1,%reg2),%reg1" etc. are not incorrectly
optimised }
else
begin
DebugMsg(SPeepholeOptimization + 'Mov2Nop 5 done',p);
RemoveCurrentP(p, hp1);
Result := True;
Exit;
end;
end;
if (taicpu(hp1).opcode = A_AND) and
(taicpu(p).oper[1]^.typ = top_reg) and
MatchOpType(taicpu(hp1),top_const,top_reg) then
begin
if MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) then
begin
case taicpu(p).opsize of
S_L:
if (taicpu(hp1).oper[0]^.val = $ffffffff) then
begin
{ Optimize out:
mov x, %reg
and ffffffffh, %reg
}
DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 1 done',p);
RemoveInstruction(hp1);
Result:=true;
exit;
end;
S_Q: { TODO: Confirm if this is even possible }
if (taicpu(hp1).oper[0]^.val = $ffffffffffffffff) then
begin
{ Optimize out:
mov x, %reg
and ffffffffffffffffh, %reg
}
DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 2 done',p);
RemoveInstruction(hp1);
Result:=true;
exit;
end;
else
;
end;
if ((taicpu(p).oper[0]^.typ=top_reg) or
((taicpu(p).oper[0]^.typ=top_ref) and (taicpu(p).oper[0]^.ref^.refaddr<>addr_full))) and
GetNextInstruction(hp1,hp2) and
MatchInstruction(hp2,A_TEST,[taicpu(p).opsize]) and
MatchOperand(taicpu(hp1).oper[1]^,taicpu(hp2).oper[1]^) and
(MatchOperand(taicpu(hp2).oper[0]^,taicpu(hp2).oper[1]^) or
MatchOperand(taicpu(hp2).oper[0]^,-1)) and
GetNextInstruction(hp2,hp3) and
MatchInstruction(hp3,A_Jcc,A_Setcc,[]) and
(taicpu(hp3).condition in [C_E,C_NE]) then
begin
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
if not(RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp2, TmpUsedRegs)) then
begin
DebugMsg(SPeepholeOptimization + 'MovAndTest2Test done',p);
taicpu(hp1).loadoper(1,taicpu(p).oper[0]^);
taicpu(hp1).opcode:=A_TEST;
RemoveInstruction(hp2);
RemoveCurrentP(p, hp1);
Result:=true;
exit;
end;
end;
end
else if IsMOVZXAcceptable and
(taicpu(p).oper[1]^.typ = top_reg) and (taicpu(hp1).oper[1]^.typ = top_reg) and
(taicpu(p).oper[0]^.typ <> top_const) and { MOVZX only supports registers and memory, not immediates (use MOV for that!) }
(getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
then
begin
InputVal := debug_operstr(taicpu(p).oper[0]^);
MaskNum := debug_tostr(taicpu(hp1).oper[0]^.val);
case taicpu(p).opsize of
S_B:
if (taicpu(hp1).oper[0]^.val = $ff) then
begin
{ Convert:
movb x, %regl movb x, %regl
andw ffh, %regw andl ffh, %regd
To:
movzbw x, %regd movzbl x, %regd
(Identical registers, just different sizes)
}
RegName1 := debug_regname(taicpu(p).oper[1]^.reg); { 8-bit register name }
RegName2 := debug_regname(taicpu(hp1).oper[1]^.reg); { 16/32-bit register name }
case taicpu(hp1).opsize of
S_W: NewSize := S_BW;
S_L: NewSize := S_BL;
{$ifdef x86_64}
S_Q: NewSize := S_BQ;
{$endif x86_64}
else
InternalError(2018011510);
end;
end
else
NewSize := S_NO;
S_W:
if (taicpu(hp1).oper[0]^.val = $ffff) then
begin
{ Convert:
movw x, %regw
andl ffffh, %regd
To:
movzwl x, %regd
(Identical registers, just different sizes)
}
RegName1 := debug_regname(taicpu(p).oper[1]^.reg); { 16-bit register name }
RegName2 := debug_regname(taicpu(hp1).oper[1]^.reg); { 32-bit register name }
case taicpu(hp1).opsize of
S_L: NewSize := S_WL;
{$ifdef x86_64}
S_Q: NewSize := S_WQ;
{$endif x86_64}
else
InternalError(2018011511);
end;
end
else
NewSize := S_NO;
else
NewSize := S_NO;
end;
if NewSize <> S_NO then
begin
PreMessage := 'mov' + debug_opsize2str(taicpu(p).opsize) + ' ' + InputVal + ',' + RegName1;
{ The actual optimization }
taicpu(p).opcode := A_MOVZX;
taicpu(p).changeopsize(NewSize);
taicpu(p).oper[1]^ := taicpu(hp1).oper[1]^;
{ Safeguard if "and" is followed by a conditional command }
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs,tai(p.next));
if (RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)) then
begin
{ At this point, the "and" command is effectively equivalent to
"test %reg,%reg". This will be handled separately by the
Peephole Optimizer. [Kit] }
DebugMsg(SPeepholeOptimization + PreMessage +
' -> movz' + debug_opsize2str(NewSize) + ' ' + InputVal + ',' + RegName2, p);
end
else
begin
DebugMsg(SPeepholeOptimization + PreMessage + '; and' + debug_opsize2str(taicpu(hp1).opsize) + ' $' + MaskNum + ',' + RegName2 +
' -> movz' + debug_opsize2str(NewSize) + ' ' + InputVal + ',' + RegName2, p);
RemoveInstruction(hp1);
end;
Result := True;
Exit;
end;
end;
end;
if (taicpu(hp1).opcode = A_OR) and
(taicpu(p).oper[1]^.typ = top_reg) and
MatchOperand(taicpu(p).oper[0]^, 0) and
MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) then
begin
{ mov 0, %reg
or ###,%reg
Change to (only if the flags are not used):
mov ###,%reg
}
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
DoOptimisation := True;
{ Even if the flags are used, we might be able to do the optimisation
if the conditions are predictable }
if RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
begin
{ Only perform if ### = %reg (the same register) or equal to 0,
so %reg is guaranteed to still have a value of zero }
if MatchOperand(taicpu(hp1).oper[0]^, 0) or
MatchOperand(taicpu(hp1).oper[0]^, taicpu(hp1).oper[1]^.reg) then
begin
hp2 := hp1;
UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
while RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) and
GetNextInstruction(hp2, hp3) do
begin
{ Don't continue modifying if the flags state is getting changed }
if RegModifiedByInstruction(NR_DEFAULTFLAGS, hp3) then
Break;
UpdateUsedRegs(TmpUsedRegs, tai(hp3.Next));
if MatchInstruction(hp3, A_Jcc, A_SETcc, A_CMOVcc, []) then
begin
if condition_in(C_E, taicpu(hp3).condition) or (taicpu(hp3).condition in [C_NC, C_NS, C_NO]) then
begin
{ Condition is always true }
case taicpu(hp3).opcode of
A_Jcc:
begin
DebugMsg(SPeepholeOptimization + 'Condition is always true (jump made unconditional)', hp3);
{ Check for jump shortcuts before we destroy the condition }
DoJumpOptimizations(hp3, TempBool);
MakeUnconditional(taicpu(hp3));
Result := True;
end;
A_CMOVcc:
begin
DebugMsg(SPeepholeOptimization + 'Condition is always true (CMOVcc -> MOV)', hp3);
taicpu(hp3).opcode := A_MOV;
taicpu(hp3).condition := C_None;
Result := True;
end;
A_SETcc:
begin
DebugMsg(SPeepholeOptimization + 'Condition is always true (changed to MOV 1)', hp3);
{ Convert "set(c) %reg" instruction to "movb 1,%reg" }
taicpu(hp3).opcode := A_MOV;
taicpu(hp3).ops := 2;
taicpu(hp3).condition := C_None;
taicpu(hp3).opsize := S_B;
taicpu(hp3).loadreg(1,taicpu(hp3).oper[0]^.reg);
taicpu(hp3).loadconst(0, 1);
Result := True;
end;
else
InternalError(2021090701);
end;
end
else if (taicpu(hp3).condition in [C_A, C_B, C_C, C_G, C_L, C_NE, C_NZ, C_O, C_S]) then
begin
{ Condition is always false }
case taicpu(hp3).opcode of
A_Jcc:
begin
DebugMsg(SPeepholeOptimization + 'Condition is always false (jump removed)', hp3);
TAsmLabel(taicpu(hp3).oper[0]^.ref^.symbol).decrefs;
RemoveInstruction(hp3);
Result := True;
{ Since hp3 was deleted, hp2 must not be updated }
Continue;
end;
A_CMOVcc:
begin
DebugMsg(SPeepholeOptimization + 'Condition is always false (conditional load removed)', hp3);
RemoveInstruction(hp3);
Result := True;
{ Since hp3 was deleted, hp2 must not be updated }
Continue;
end;
A_SETcc:
begin
DebugMsg(SPeepholeOptimization + 'Condition is always false (changed to MOV 0)', hp3);
{ Convert "set(c) %reg" instruction to "movb 0,%reg" }
taicpu(hp3).opcode := A_MOV;
taicpu(hp3).ops := 2;
taicpu(hp3).condition := C_None;
taicpu(hp3).opsize := S_B;
taicpu(hp3).loadreg(1,taicpu(hp3).oper[0]^.reg);
taicpu(hp3).loadconst(0, 0);
Result := True;
end;
else
InternalError(2021090702);
end;
end
else
{ Uncertain what to do - don't optimise (although optimise other conditional statements if present) }
DoOptimisation := False;
end;
hp2 := hp3;
end;
{ Flags are still in use - don't optimise }
if DoOptimisation and RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
DoOptimisation := False;
end
else
DoOptimisation := False;
end;
if DoOptimisation then
begin
{$ifdef x86_64}
{ OR only supports 32-bit sign-extended constants for 64-bit
instructions, so compensate for this if the constant is
encoded as a value greater than or equal to 2^31 }
if (taicpu(hp1).opsize = S_Q) and
(taicpu(hp1).oper[0]^.typ = top_const) and
(taicpu(hp1).oper[0]^.val >= $80000000) then
taicpu(hp1).oper[0]^.val := taicpu(hp1).oper[0]^.val or $FFFFFFFF00000000;
{$endif x86_64}
DebugMsg(SPeepholeOptimization + 'MOV 0 / OR -> MOV', p);
taicpu(hp1).opcode := A_MOV;
RemoveCurrentP(p, hp1);
Result := True;
Exit;
end;
end;
{ Next instruction is also a MOV ? }
if MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) then
begin
if MatchOpType(taicpu(p), top_const, top_ref) and
MatchOpType(taicpu(hp1), top_const, top_ref) and
TryConstMerge(p, hp1) then
begin
Result := True;
{ In case we have four byte writes in a row, check for 2 more
right now so we don't have to wait for another iteration of
pass 1
}
{ If two byte-writes were merged, the opsize is now S_W, not S_B }
case taicpu(p).opsize of
S_W:
begin
if GetNextInstruction(p, hp1) and
MatchInstruction(hp1, A_MOV, [S_B]) and
MatchOpType(taicpu(hp1), top_const, top_ref) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2, A_MOV, [S_B]) and
MatchOpType(taicpu(hp2), top_const, top_ref) and
{ Try to merge the two bytes }
TryConstMerge(hp1, hp2) then
{ Now try to merge the two words (hp2 will get deleted) }
TryConstMerge(p, hp1);
end;
S_L:
begin
{ Though this only really benefits x86_64 and not i386, it
gets a potential optimisation done faster and hence
reduces the number of times OptPass1MOV is entered }
if GetNextInstruction(p, hp1) and
MatchInstruction(hp1, A_MOV, [S_W]) and
MatchOpType(taicpu(hp1), top_const, top_ref) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2, A_MOV, [S_W]) and
MatchOpType(taicpu(hp2), top_const, top_ref) and
{ Try to merge the two words }
TryConstMerge(hp1, hp2) then
{ This will always fail on i386, so don't bother
calling it unless we're doing x86_64 }
{$ifdef x86_64}
{ Now try to merge the two longwords (hp2 will get deleted) }
TryConstMerge(p, hp1)
{$endif x86_64}
;
end;
else
;
end;
Exit;
end;
if (taicpu(p).oper[1]^.typ = top_reg) and
MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) then
begin
CurrentReg := taicpu(p).oper[1]^.reg;
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
{ we have
mov x, %treg
mov %treg, y
}
if not(RegInOp(CurrentReg, taicpu(hp1).oper[1]^)) then
if not(RegUsedAfterInstruction(CurrentReg, hp1, TmpUsedRegs)) then
{ we've got
mov x, %treg
mov %treg, y
with %treg is not used after }
case taicpu(p).oper[0]^.typ Of
{ top_reg is covered by DeepMOVOpt }
top_const:
begin
{ change
mov const, %treg
mov %treg, y
to
mov const, y
}
if (taicpu(hp1).oper[1]^.typ=top_reg) or
((taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))) then
begin
if taicpu(hp1).oper[1]^.typ=top_reg then
AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
taicpu(p).loadOper(1,taicpu(hp1).oper[1]^);
DebugMsg(SPeepholeOptimization + 'MovMov2Mov 5 done',p);
RemoveInstruction(hp1);
Result:=true;
Exit;
end;
end;
top_ref:
case taicpu(hp1).oper[1]^.typ of
top_reg:
begin
{ change
mov mem, %treg
mov %treg, %reg
to
mov mem, %reg"
}
AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
taicpu(p).loadreg(1, taicpu(hp1).oper[1]^.reg);
DebugMsg(SPeepholeOptimization + 'MovMov2Mov 3 done',p);
RemoveInstruction(hp1);
Result:=true;
Exit;
end;
top_ref:
begin
{$ifdef x86_64}
{ Look for the following to simplify:
mov x(mem1), %reg
mov %reg, y(mem2)
mov x+8(mem1), %reg
mov %reg, y+8(mem2)
Change to:
movdqu x(mem1), %xmmreg
movdqu %xmmreg, y(mem2)
}
SourceRef := taicpu(p).oper[0]^.ref^;
TargetRef := taicpu(hp1).oper[1]^.ref^;
if (taicpu(p).opsize = S_Q) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2, A_MOV, [taicpu(p).opsize]) and
MatchOpType(taicpu(hp2), top_ref, top_reg) then
begin
{ Delay calling GetNextInstruction(hp2, hp3) for as long as possible }
UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
Inc(SourceRef.offset, 8);
if UseAVX then
begin
MovAligned := A_VMOVDQA;
MovUnaligned := A_VMOVDQU;
end
else
begin
MovAligned := A_MOVDQA;
MovUnaligned := A_MOVDQU;
end;
if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) then
begin
UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
Inc(TargetRef.offset, 8);
if GetNextInstruction(hp2, hp3) and
MatchInstruction(hp3, A_MOV, [taicpu(p).opsize]) and
MatchOpType(taicpu(hp3), top_reg, top_ref) and
(taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
begin
CurrentReg := GetMMRegisterBetween(R_SUBMMX, UsedRegs, p, hp3);
if CurrentReg <> NR_NO then
begin
{ Remember that the offsets are 8 ahead }
if ((SourceRef.offset mod 16) = 8) and
(
{ Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
(SourceRef.base = current_procinfo.framepointer) or
((SourceRef.alignment >= 16) and ((SourceRef.alignment mod 16) = 0))
) then
taicpu(p).opcode := MovAligned
else
taicpu(p).opcode := MovUnaligned;
taicpu(p).opsize := S_XMM;
taicpu(p).oper[1]^.reg := CurrentReg;
if ((TargetRef.offset mod 16) = 8) and
(
{ Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
(TargetRef.base = current_procinfo.framepointer) or
((TargetRef.alignment >= 16) and ((TargetRef.alignment mod 16) = 0))
) then
taicpu(hp1).opcode := MovAligned
else
taicpu(hp1).opcode := MovUnaligned;
taicpu(hp1).opsize := S_XMM;
taicpu(hp1).oper[0]^.reg := CurrentReg;
DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (MovMovMovMov2MovdqMovdq 1)', p);
RemoveInstruction(hp2);
RemoveInstruction(hp3);
Result := True;
Exit;
end;
end;
end
else
begin
{ See if the next references are 8 less rather than 8 greater }
Dec(SourceRef.offset, 16); { -8 the other way }
if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) then
begin
UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
Dec(TargetRef.offset, 8); { Only 8, not 16, as it wasn't incremented unlike SourceRef }
if GetNextInstruction(hp2, hp3) and
MatchInstruction(hp3, A_MOV, [taicpu(p).opsize]) and
MatchOpType(taicpu(hp3), top_reg, top_ref) and
(taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
begin
CurrentReg := GetMMRegisterBetween(R_SUBMMX, UsedRegs, p, hp3);
if CurrentReg <> NR_NO then
begin
{ hp2 and hp3 are the starting offsets, so mod = 0 this time }
if ((SourceRef.offset mod 16) = 0) and
(
{ Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
(SourceRef.base = current_procinfo.framepointer) or
((SourceRef.alignment >= 16) and ((SourceRef.alignment mod 16) = 0))
) then
taicpu(hp2).opcode := MovAligned
else
taicpu(hp2).opcode := MovUnaligned;
taicpu(hp2).opsize := S_XMM;
taicpu(hp2).oper[1]^.reg := CurrentReg;
if ((TargetRef.offset mod 16) = 0) and
(
{ Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
(TargetRef.base = current_procinfo.framepointer) or
((TargetRef.alignment >= 16) and ((TargetRef.alignment mod 16) = 0))
) then
taicpu(hp3).opcode := MovAligned
else
taicpu(hp3).opcode := MovUnaligned;
taicpu(hp3).opsize := S_XMM;
taicpu(hp3).oper[0]^.reg := CurrentReg;
DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (MovMovMovMov2MovdqMovdq 2)', p);
RemoveInstruction(hp1);
RemoveCurrentP(p, hp2);
Result := True;
Exit;
end;
end;
end;
end;
end;
{$endif x86_64}
end;
else
{ The write target should be a reg or a ref }
InternalError(2021091601);
end;
else
;
end
else
{ %treg is used afterwards, but all eventualities
other than the first MOV instruction being a constant
are covered by DeepMOVOpt, so only check for that }
if (taicpu(p).oper[0]^.typ = top_const) and
(
{ For MOV operations, a size saving is only made if the register/const is byte-sized }
not (cs_opt_size in current_settings.optimizerswitches) or
(taicpu(hp1).opsize = S_B)
) and
(
(taicpu(hp1).oper[1]^.typ = top_reg) or
((taicpu(p).oper[0]^.val >= low(longint)) and (taicpu(p).oper[0]^.val <= high(longint)))
) then
begin
DebugMsg(SPeepholeOptimization + debug_operstr(taicpu(hp1).oper[0]^) + ' = $' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 6b)',hp1);
taicpu(hp1).loadconst(0, taicpu(p).oper[0]^.val);
end;
end;
if (taicpu(hp1).oper[0]^.typ = taicpu(p).oper[1]^.typ) and
(taicpu(hp1).oper[1]^.typ = taicpu(p).oper[0]^.typ) then
{ mov reg1, mem1 or mov mem1, reg1
mov mem2, reg2 mov reg2, mem2}
begin
if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[0]^) then
{ mov reg1, mem1 or mov mem1, reg1
mov mem2, reg1 mov reg2, mem1}
begin
if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
{ Removes the second statement from
mov reg1, mem1/reg2
mov mem1/reg2, reg1 }
begin
if taicpu(p).oper[0]^.typ=top_reg then
AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
DebugMsg(SPeepholeOptimization + 'MovMov2Mov 1',p);
RemoveInstruction(hp1);
Result:=true;
exit;
end
else
begin
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
if (taicpu(p).oper[1]^.typ = top_ref) and
{ mov reg1, mem1
mov mem2, reg1 }
(taicpu(hp1).oper[0]^.ref^.refaddr = addr_no) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2,A_CMP,[taicpu(p).opsize]) and
OpsEqual(taicpu(p).oper[1]^,taicpu(hp2).oper[0]^) and
OpsEqual(taicpu(p).oper[0]^,taicpu(hp2).oper[1]^) and
not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp2, TmpUsedRegs)) then
{ change to
mov reg1, mem1 mov reg1, mem1
mov mem2, reg1 cmp reg1, mem2
cmp mem1, reg1
}
begin
RemoveInstruction(hp2);
taicpu(hp1).opcode := A_CMP;
taicpu(hp1).loadref(1,taicpu(hp1).oper[0]^.ref^);
taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,UsedRegs);
DebugMsg(SPeepholeOptimization + 'MovMovCmp2MovCmp done',hp1);
end;
end;
end
else if (taicpu(p).oper[1]^.typ=top_ref) and
OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
begin
AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,UsedRegs);
taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
DebugMsg(SPeepholeOptimization + 'MovMov2MovMov1 done',p);
end
else
begin
TransferUsedRegs(TmpUsedRegs);
if GetNextInstruction(hp1, hp2) and
MatchOpType(taicpu(p),top_ref,top_reg) and
MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
(taicpu(hp1).oper[1]^.typ = top_ref) and
MatchInstruction(hp2,A_MOV,[taicpu(p).opsize]) and
MatchOpType(taicpu(hp2),top_ref,top_reg) and
RefsEqual(taicpu(hp2).oper[0]^.ref^, taicpu(hp1).oper[1]^.ref^) then
if not RegInRef(taicpu(hp2).oper[1]^.reg,taicpu(hp2).oper[0]^.ref^) and
not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,tmpUsedRegs)) then
{ mov mem1, %reg1
mov %reg1, mem2
mov mem2, reg2
to:
mov mem1, reg2
mov reg2, mem2}
begin
AllocRegBetween(taicpu(hp2).oper[1]^.reg,p,hp2,usedregs);
DebugMsg(SPeepholeOptimization + 'MovMovMov2MovMov 1 done',p);
taicpu(p).loadoper(1,taicpu(hp2).oper[1]^);
taicpu(hp1).loadoper(0,taicpu(hp2).oper[1]^);
RemoveInstruction(hp2);
Result := True;
end
{$ifdef i386}
{ this is enabled for i386 only, as the rules to create the reg sets below
are too complicated for x86-64, so this makes this code too error prone
on x86-64
}
else if (taicpu(p).oper[1]^.reg <> taicpu(hp2).oper[1]^.reg) and
not(RegInRef(taicpu(p).oper[1]^.reg,taicpu(p).oper[0]^.ref^)) and
not(RegInRef(taicpu(hp2).oper[1]^.reg,taicpu(hp2).oper[0]^.ref^)) then
{ mov mem1, reg1 mov mem1, reg1
mov reg1, mem2 mov reg1, mem2
mov mem2, reg2 mov mem2, reg1
to: to:
mov mem1, reg1 mov mem1, reg1
mov mem1, reg2 mov reg1, mem2
mov reg1, mem2
or (if mem1 depends on reg1
and/or if mem2 depends on reg2)
to:
mov mem1, reg1
mov reg1, mem2
mov reg1, reg2
}
begin
taicpu(hp1).loadRef(0,taicpu(p).oper[0]^.ref^);
taicpu(hp1).loadReg(1,taicpu(hp2).oper[1]^.reg);
taicpu(hp2).loadRef(1,taicpu(hp2).oper[0]^.ref^);
taicpu(hp2).loadReg(0,taicpu(p).oper[1]^.reg);
AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp2,usedregs);
if (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
(getsupreg(taicpu(p).oper[0]^.ref^.base) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]) then
AllocRegBetween(taicpu(p).oper[0]^.ref^.base,p,hp2,usedregs);
if (taicpu(p).oper[0]^.ref^.index <> NR_NO) and
(getsupreg(taicpu(p).oper[0]^.ref^.index) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]) then
AllocRegBetween(taicpu(p).oper[0]^.ref^.index,p,hp2,usedregs);
end
else if (taicpu(hp1).Oper[0]^.reg <> taicpu(hp2).Oper[1]^.reg) then
begin
taicpu(hp2).loadReg(0,taicpu(hp1).Oper[0]^.reg);
AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp2,usedregs);
end
else
begin
RemoveInstruction(hp2);
end
{$endif i386}
;
end;
end
{ movl [mem1],reg1
movl [mem1],reg2
to
movl [mem1],reg1
movl reg1,reg2
}
else if MatchOpType(taicpu(p),top_ref,top_reg) and
MatchOpType(taicpu(hp1),top_ref,top_reg) and
(taicpu(p).opsize = taicpu(hp1).opsize) and
RefsEqual(taicpu(p).oper[0]^.ref^,taicpu(hp1).oper[0]^.ref^) and
(taicpu(p).oper[0]^.ref^.volatility=[]) and
(taicpu(hp1).oper[0]^.ref^.volatility=[]) and
not(SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^.base)) and
not(SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^.index)) then
begin
DebugMsg(SPeepholeOptimization + 'MovMov2MovMov 2',p);
taicpu(hp1).loadReg(0,taicpu(p).oper[1]^.reg);
end;
{ movl const1,[mem1]
movl [mem1],reg1
to
movl const1,reg1
movl reg1,[mem1]
}
if MatchOpType(Taicpu(p),top_const,top_ref) and
MatchOpType(Taicpu(hp1),top_ref,top_reg) and
(taicpu(p).opsize = taicpu(hp1).opsize) and
RefsEqual(taicpu(hp1).oper[0]^.ref^,taicpu(p).oper[1]^.ref^) and
not(RegInRef(taicpu(hp1).oper[1]^.reg,taicpu(hp1).oper[0]^.ref^)) then
begin
AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
taicpu(hp1).loadReg(0,taicpu(hp1).oper[1]^.reg);
taicpu(hp1).loadRef(1,taicpu(p).oper[1]^.ref^);
taicpu(p).loadReg(1,taicpu(hp1).oper[0]^.reg);
taicpu(hp1).fileinfo := taicpu(p).fileinfo;
DebugMsg(SPeepholeOptimization + 'MovMov2MovMov 1',p);
Result:=true;
exit;
end;
{ mov x,reg1; mov y,reg1 -> mov y,reg1 is handled by the Mov2Nop 5 optimisation }
end;
{ search further than the next instruction for a mov (as long as it's not a jump) }
if not is_calljmpuncondret(taicpu(hp1).opcode) and
{ check as much as possible before the expensive GetNextInstructionUsingRegCond call }
(taicpu(p).oper[1]^.typ = top_reg) and
(taicpu(p).oper[0]^.typ in [top_reg,top_const]) and
not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) then
begin
{ we work with hp2 here, so hp1 can be still used later on when
checking for GetNextInstruction_p }
hp3 := hp1;
{ Initialise CrossJump (if it becomes True at any point, it will remain True) }
CrossJump := (taicpu(hp1).opcode = A_Jcc);
{ Saves on a large number of dereferences }
ActiveReg := taicpu(p).oper[1]^.reg;
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
while GetNextInstructionUsingRegCond(hp3,hp2,ActiveReg,CrossJump) and
{ GetNextInstructionUsingRegCond only searches one instruction ahead unless -O3 is specified }
(hp2.typ=ait_instruction) do
begin
case taicpu(hp2).opcode of
A_POP:
if MatchOperand(taicpu(hp2).oper[0]^,ActiveReg) then
begin
if not CrossJump and
not RegUsedBetween(ActiveReg, p, hp2) then
begin
{ We can remove the original MOV since the register
wasn't used between it and its popping from the stack }
DebugMsg(SPeepholeOptimization + 'Mov2Nop 3c done',p);
RemoveCurrentp(p, hp1);
Result := True;
Exit;
end;
{ Can't go any further }
Break;
end;
A_MOV:
if MatchOperand(taicpu(hp2).oper[0]^,ActiveReg) and
((taicpu(p).oper[0]^.typ=top_const) or
((taicpu(p).oper[0]^.typ=top_reg) and
not(RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp2))
)
) then
begin
{ we have
mov x, %treg
mov %treg, y
}
{ We don't need to call UpdateUsedRegs for every instruction between
p and hp2 because the register we're concerned about will not
become deallocated (otherwise GetNextInstructionUsingReg would
have stopped at an earlier instruction). [Kit] }
TempRegUsed :=
CrossJump { Assume the register is in use if it crossed a conditional jump } or
RegReadByInstruction(ActiveReg, hp3) or
RegUsedAfterInstruction(ActiveReg, hp2, TmpUsedRegs);
case taicpu(p).oper[0]^.typ Of
top_reg:
begin
{ change
mov %reg, %treg
mov %treg, y
to
mov %reg, y
}
CurrentReg := taicpu(p).oper[0]^.reg; { Saves on a handful of pointer dereferences }
RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg);
if MatchOperand(taicpu(hp2).oper[1]^, CurrentReg) then
begin
{ %reg = y - remove hp2 completely (doing it here instead of relying on
the "mov %reg,%reg" optimisation might cut down on a pass iteration) }
if TempRegUsed then
begin
DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + RegName1 + '; removed unnecessary instruction (MovMov2MovNop 6b}',hp2);
AllocRegBetween(CurrentReg, p, hp2, UsedRegs);
{ Set the start of the next GetNextInstructionUsingRegCond search
to start at the entry right before hp2 (which is about to be removed) }
hp3 := tai(hp2.Previous);
RemoveInstruction(hp2);
{ See if there's more we can optimise }
Continue;
end
else
begin
RemoveInstruction(hp2);
{ We can remove the original MOV too }
DebugMsg(SPeepholeOptimization + 'MovMov2NopNop 6b done',p);
RemoveCurrentP(p, hp1);
Result:=true;
Exit;
end;
end
else
begin
AllocRegBetween(CurrentReg, p, hp2, UsedRegs);
taicpu(hp2).loadReg(0, CurrentReg);
DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_regname(CurrentReg) + '; changed to minimise pipeline stall (MovMov2Mov 6a}',hp2);
{ Check to see if the register also appears in the reference }
if (taicpu(hp2).oper[1]^.typ = top_ref) then
ReplaceRegisterInRef(taicpu(hp2).oper[1]^.ref^, ActiveReg, CurrentReg);
{ Don't remove the first instruction if the temporary register is in use }
if not TempRegUsed and
{ ReplaceRegisterInRef won't actually replace the register if it's a different size }
not RegInOp(ActiveReg, taicpu(hp2).oper[1]^) then
begin
DebugMsg(SPeepholeOptimization + 'MovMov2Mov 6 done',p);
RemoveCurrentP(p, hp1);
Result:=true;
Exit;
end;
{ No need to set Result to True here. If there's another instruction later
on that can be optimised, it will be detected when the main Pass 1 loop
reaches what is now hp2 and passes it through OptPass1MOV. [Kit] }
end;
end;
top_const:
if not (cs_opt_size in current_settings.optimizerswitches) or (taicpu(hp2).opsize = S_B) then
begin
{ change
mov const, %treg
mov %treg, y
to
mov const, y
}
if (taicpu(hp2).oper[1]^.typ=top_reg) or
((taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))) then
begin
RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg);
taicpu(hp2).loadOper(0,taicpu(p).oper[0]^);
if TempRegUsed then
begin
{ Don't remove the first instruction if the temporary register is in use }
DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 7a)',hp2);
{ No need to set Result to True. If there's another instruction later on
that can be optimised, it will be detected when the main Pass 1 loop
reaches what is now hp2 and passes it through OptPass1MOV. [Kit] };
end
else
begin
DebugMsg(SPeepholeOptimization + 'MovMov2Mov 7 done',p);
RemoveCurrentP(p, hp1);
Result:=true;
Exit;
end;
end;
end;
else
Internalerror(2019103001);
end;
end
else
if MatchOperand(taicpu(hp2).oper[1]^, ActiveReg) then
begin
if not CrossJump and
not RegUsedBetween(ActiveReg, p, hp2) and
not RegReadByInstruction(ActiveReg, hp2) then
begin
{ Register is not used before it is overwritten }
DebugMsg(SPeepholeOptimization + 'Mov2Nop 3a done',p);
RemoveCurrentp(p, hp1);
Result := True;
Exit;
end;
if (taicpu(p).oper[0]^.typ = top_const) and
(taicpu(hp2).oper[0]^.typ = top_const) then
begin
if taicpu(p).oper[0]^.val = taicpu(hp2).oper[0]^.val then
begin
{ Same value - register hasn't changed }
DebugMsg(SPeepholeOptimization + 'Mov2Nop 2 done', hp2);
RemoveInstruction(hp2);
Result := True;
{ See if there's more we can optimise }
Continue;
end;
end;
end;
A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
if MatchOpType(taicpu(hp2), top_reg, top_reg) and
MatchOperand(taicpu(hp2).oper[0]^, ActiveReg) and
SuperRegistersEqual(taicpu(hp2).oper[1]^.reg, ActiveReg) then
begin
{
Change from:
mov ###, %reg
...
movs/z %reg,%reg (Same register, just different sizes)
To:
movs/z ###, %reg (Longer version)
...
(remove)
}
DebugMsg(SPeepholeOptimization + 'MovMovs/z2Mov/s/z done', p);
taicpu(p).oper[1]^.reg := taicpu(hp2).oper[1]^.reg;
{ Keep the first instruction as mov if ### is a constant }
if taicpu(p).oper[0]^.typ = top_const then
taicpu(p).opsize := reg2opsize(taicpu(hp2).oper[1]^.reg)
else
begin
taicpu(p).opcode := taicpu(hp2).opcode;
taicpu(p).opsize := taicpu(hp2).opsize;
end;
DebugMsg(SPeepholeOptimization + 'Removed movs/z instruction and extended earlier write (MovMovs/z2Mov/s/z)', hp2);
AllocRegBetween(taicpu(hp2).oper[1]^.reg, p, hp2, UsedRegs);
RemoveInstruction(hp2);
Result := True;
Exit;
end;
else
{ Move down to the MatchOpType if-block below };
end;
{ Also catches MOV/S/Z instructions that aren't modified }
if taicpu(p).oper[0]^.typ = top_reg then
begin
CurrentReg := taicpu(p).oper[0]^.reg;
if
not RegModifiedByInstruction(CurrentReg, hp3) and
not RegModifiedBetween(CurrentReg, hp3, hp2) and
DeepMOVOpt(taicpu(p), taicpu(hp2)) then
begin
Result := True;
{ Just in case something didn't get modified (e.g. an
implicit register). Also, if it does read from this
register, then there's no longer an advantage to
changing the register on subsequent instructions.}
if not RegReadByInstruction(ActiveReg, hp2) then
begin
{ If a conditional jump was crossed, do not delete
the original MOV no matter what }
if not CrossJump and
{ RegEndOfLife returns True if the register is
deallocated before the next instruction or has
been loaded with a new value }
RegEndOfLife(ActiveReg, taicpu(hp2)) then
begin
{ We can remove the original MOV }
DebugMsg(SPeepholeOptimization + 'Mov2Nop 3b done',p);
RemoveCurrentp(p, hp1);
Exit;
end;
if not RegModifiedByInstruction(ActiveReg, hp2) then
begin
{ See if there's more we can optimise }
hp3 := hp2;
Continue;
end;
end;
end;
end;
{ Break out of the while loop under normal circumstances }
Break;
end;
end;
if (aoc_MovAnd2Mov_3 in OptsToCheck) and
(taicpu(p).oper[1]^.typ = top_reg) and
(taicpu(p).opsize = S_L) and
GetNextInstructionUsingRegTrackingUse(p,hp2,taicpu(p).oper[1]^.reg) and
(taicpu(hp2).opcode = A_AND) and
(MatchOpType(taicpu(hp2),top_const,top_reg) or
(MatchOpType(taicpu(hp2),top_reg,top_reg) and
MatchOperand(taicpu(hp2).oper[0]^,taicpu(hp2).oper[1]^))
) then
begin
if SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp2).oper[1]^.reg) then
begin
if ((taicpu(hp2).oper[0]^.typ=top_const) and (taicpu(hp2).oper[0]^.val = $ffffffff)) or
((taicpu(hp2).oper[0]^.typ=top_reg) and (taicpu(hp2).opsize=S_L)) then
begin
{ Optimize out:
mov x, %reg
and ffffffffh, %reg
}
DebugMsg(SPeepholeOptimization + 'MovAnd2Mov 3 done',p);
RemoveInstruction(hp2);
Result:=true;
exit;
end;
end;
end;
{ leave out the mov from "mov reg, x(%frame_pointer); leave/ret" (with
x >= RetOffset) as it doesn't do anything (it writes either to a
parameter or to the temporary storage room for the function
result)
}
if IsExitCode(hp1) and
(taicpu(p).oper[1]^.typ = top_ref) and
(taicpu(p).oper[1]^.ref^.index = NR_NO) and
(
(
(taicpu(p).oper[1]^.ref^.base = current_procinfo.FramePointer) and
not (
assigned(current_procinfo.procdef.funcretsym) and
(taicpu(p).oper[1]^.ref^.offset <= tabstractnormalvarsym(current_procinfo.procdef.funcretsym).localloc.reference.offset)
)
) or
{ Also discard writes to the stack that are below the base pointer,
as this is temporary storage rather than a function result on the
stack, say. }
(
(taicpu(p).oper[1]^.ref^.base = NR_STACK_POINTER_REG) and
(taicpu(p).oper[1]^.ref^.offset < current_procinfo.final_localsize)
)
) then
begin
RemoveCurrentp(p, hp1);
DebugMsg(SPeepholeOptimization + 'removed deadstore before leave/ret',p);
RemoveLastDeallocForFuncRes(p);
Result:=true;
exit;
end;
if MatchInstruction(hp1,A_CMP,A_TEST,[taicpu(p).opsize]) then
begin
if MatchOpType(taicpu(p),top_reg,top_ref) and
(taicpu(hp1).oper[1]^.typ = top_ref) and
RefsEqual(taicpu(p).oper[1]^.ref^, taicpu(hp1).oper[1]^.ref^) then
begin
{ change
mov reg1, mem1
test/cmp x, mem1
to
mov reg1, mem1
test/cmp x, reg1
}
taicpu(hp1).loadreg(1,taicpu(p).oper[0]^.reg);
DebugMsg(SPeepholeOptimization + 'MovTestCmp2MovTestCmp 1',hp1);
AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
Result := True;
Exit;
end;
if DoMovCmpMemOpt(p, hp1, True) then
begin
Result := True;
Exit;
end;
end;
if MatchInstruction(hp1,A_LEA,[S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
{ If the flags register is in use, don't change the instruction to an
ADD otherwise this will scramble the flags. [Kit] }
not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
begin
if MatchOpType(Taicpu(p),top_ref,top_reg) and
((MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(hp1).oper[1]^.reg,Taicpu(p).oper[1]^.reg) and
(Taicpu(hp1).oper[0]^.ref^.base<>Taicpu(p).oper[1]^.reg)
) or
(MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(p).oper[1]^.reg,Taicpu(hp1).oper[1]^.reg) and
(Taicpu(hp1).oper[0]^.ref^.index<>Taicpu(p).oper[1]^.reg)
)
) then
{ mov reg1,ref
lea reg2,[reg1,reg2]
to
add reg2,ref}
begin
TransferUsedRegs(TmpUsedRegs);
{ reg1 may not be used afterwards }
if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)) then
begin
Taicpu(hp1).opcode:=A_ADD;
Taicpu(hp1).oper[0]^.ref^:=Taicpu(p).oper[0]^.ref^;
DebugMsg(SPeepholeOptimization + 'MovLea2Add done',hp1);
RemoveCurrentp(p, hp1);
result:=true;
exit;
end;
end;
{ If the LEA instruction can be converted into an arithmetic instruction,
it may be possible to then fold it in the next optimisation, otherwise
there's nothing more that can be optimised here. }
if not ConvertLEA(taicpu(hp1)) then
Exit;
end;
if (taicpu(p).oper[1]^.typ = top_reg) and
(hp1.typ = ait_instruction) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2,A_MOV,[]) and
(SuperRegistersEqual(taicpu(hp2).oper[0]^.reg,taicpu(p).oper[1]^.reg)) and
(topsize2memsize[taicpu(hp1).opsize]>=topsize2memsize[taicpu(hp2).opsize]) and
(
IsFoldableArithOp(taicpu(hp1), taicpu(p).oper[1]^.reg)
{$ifdef x86_64}
or
(
(taicpu(p).opsize=S_L) and (taicpu(hp1).opsize=S_Q) and (taicpu(hp2).opsize=S_L) and
IsFoldableArithOp(taicpu(hp1), newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[1]^.reg),R_SUBQ))
)
{$endif x86_64}
) then
begin
if OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[0]^) and
(taicpu(hp2).oper[0]^.typ=top_reg) then
{ change movsX/movzX reg/ref, reg2
add/sub/or/... reg3/$const, reg2
mov reg2 reg/ref
dealloc reg2
to
add/sub/or/... reg3/$const, reg/ref }
begin
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.next));
UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
begin
{ by example:
movswl %si,%eax movswl %si,%eax p
decl %eax addl %edx,%eax hp1
movw %ax,%si movw %ax,%si hp2
->
movswl %si,%eax movswl %si,%eax p
decw %eax addw %edx,%eax hp1
movw %ax,%si movw %ax,%si hp2
}
DebugMsg(SPeepholeOptimization + 'MovOpMov2Op ('+
debug_op2str(taicpu(p).opcode)+debug_opsize2str(taicpu(p).opsize)+' '+
debug_op2str(taicpu(hp1).opcode)+debug_opsize2str(taicpu(hp1).opsize)+' '+
debug_op2str(taicpu(hp2).opcode)+debug_opsize2str(taicpu(hp2).opsize)+')',p);
taicpu(hp1).changeopsize(taicpu(hp2).opsize);
{
->
movswl %si,%eax movswl %si,%eax p
decw %si addw %dx,%si hp1
movw %ax,%si movw %ax,%si hp2
}
case taicpu(hp1).ops of
1:
begin
taicpu(hp1).loadoper(0, taicpu(hp2).oper[1]^);
if taicpu(hp1).oper[0]^.typ=top_reg then
setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
end;
2:
begin
taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
if (taicpu(hp1).oper[0]^.typ=top_reg) and
(taicpu(hp1).opcode<>A_SHL) and
(taicpu(hp1).opcode<>A_SHR) and
(taicpu(hp1).opcode<>A_SAR) then
setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
end;
else
internalerror(2008042701);
end;
{
->
decw %si addw %dx,%si p
}
RemoveInstruction(hp2);
RemoveCurrentP(p, hp1);
Result:=True;
Exit;
end;
end;
if MatchOpType(taicpu(hp2),top_reg,top_reg) and
not(SuperRegistersEqual(taicpu(hp1).oper[0]^.reg,taicpu(hp2).oper[1]^.reg)) and
((topsize2memsize[taicpu(hp1).opsize]<= topsize2memsize[taicpu(hp2).opsize]) or
{ opsize matters for these opcodes, we could probably work around this, but it is not worth the effort }
((taicpu(hp1).opcode<>A_SHL) and (taicpu(hp1).opcode<>A_SHR) and (taicpu(hp1).opcode<>A_SAR))
)
{$ifdef i386}
{ byte registers of esi, edi, ebp, esp are not available on i386 }
and ((taicpu(hp2).opsize<>S_B) or not(getsupreg(taicpu(hp1).oper[0]^.reg) in [RS_ESI,RS_EDI,RS_EBP,RS_ESP]))
and ((taicpu(hp2).opsize<>S_B) or not(getsupreg(taicpu(p).oper[0]^.reg) in [RS_ESI,RS_EDI,RS_EBP,RS_ESP]))
{$endif i386}
then
{ change movsX/movzX reg/ref, reg2
add/sub/or/... regX/$const, reg2
mov reg2, reg3
dealloc reg2
to
movsX/movzX reg/ref, reg3
add/sub/or/... reg3/$const, reg3
}
begin
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.next));
UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
If not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs)) then
begin
{ by example:
movswl %si,%eax movswl %si,%eax p
decl %eax addl %edx,%eax hp1
movw %ax,%si movw %ax,%si hp2
->
movswl %si,%eax movswl %si,%eax p
decw %eax addw %edx,%eax hp1
movw %ax,%si movw %ax,%si hp2
}
DebugMsg(SPeepholeOptimization + 'MovOpMov2MovOp ('+
debug_op2str(taicpu(p).opcode)+debug_opsize2str(taicpu(p).opsize)+' '+
debug_op2str(taicpu(hp1).opcode)+debug_opsize2str(taicpu(hp1).opsize)+' '+
debug_op2str(taicpu(hp2).opcode)+debug_opsize2str(taicpu(hp2).opsize)+')',p);
{ limit size of constants as well to avoid assembler errors, but
check opsize to avoid overflow when left shifting the 1 }
if (taicpu(p).oper[0]^.typ=top_const) and (topsize2memsize[taicpu(hp2).opsize]<=63) then
taicpu(p).oper[0]^.val:=taicpu(p).oper[0]^.val and ((qword(1) shl topsize2memsize[taicpu(hp2).opsize])-1);
{$ifdef x86_64}
{ Be careful of, for example:
movl %reg1,%reg2
addl %reg3,%reg2
movq %reg2,%reg4
This will cause problems if the upper 32-bits of %reg3 or %reg4 are non-zero
}
if (taicpu(hp1).opsize = S_L) and (taicpu(hp2).opsize = S_Q) then
begin
taicpu(hp2).changeopsize(S_L);
setsubreg(taicpu(hp2).oper[0]^.reg, R_SUBD);
setsubreg(taicpu(hp2).oper[1]^.reg, R_SUBD);
end;
{$endif x86_64}
taicpu(hp1).changeopsize(taicpu(hp2).opsize);
taicpu(p).changeopsize(taicpu(hp2).opsize);
if taicpu(p).oper[0]^.typ=top_reg then
setsubreg(taicpu(p).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
taicpu(p).loadoper(1, taicpu(hp2).oper[1]^);
AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp1,usedregs);
{
->
movswl %si,%eax movswl %si,%eax p
decw %si addw %dx,%si hp1
movw %ax,%si movw %ax,%si hp2
}
case taicpu(hp1).ops of
1:
begin
taicpu(hp1).loadoper(0, taicpu(hp2).oper[1]^);
if taicpu(hp1).oper[0]^.typ=top_reg then
setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
end;
2:
begin
taicpu(hp1).loadoper(1, taicpu(hp2).oper[1]^);
if (taicpu(hp1).oper[0]^.typ=top_reg) and
(taicpu(hp1).opcode<>A_SHL) and
(taicpu(hp1).opcode<>A_SHR) and
(taicpu(hp1).opcode<>A_SAR) then
setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
end;
else
internalerror(2018111801);
end;
{
->
decw %si addw %dx,%si p
}
RemoveInstruction(hp2);
end;
end;
end;
if MatchInstruction(hp1,A_BTS,A_BTR,[Taicpu(p).opsize]) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2,A_OR,[Taicpu(p).opsize]) and
MatchOperand(Taicpu(p).oper[0]^,0) and
(Taicpu(p).oper[1]^.typ = top_reg) and
MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp1).oper[1]^) and
MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp2).oper[1]^) then
{ mov reg1,0
bts reg1,operand1 --> mov reg1,operand2
or reg1,operand2 bts reg1,operand1}
begin
Taicpu(hp2).opcode:=A_MOV;
DebugMsg(SPeepholeOptimization + 'MovBtsOr2MovBts done',hp1);
asml.remove(hp1);
insertllitem(hp2,hp2.next,hp1);
RemoveCurrentp(p, hp1);
Result:=true;
exit;
end;
{
mov ref,reg0
<op> reg0,reg1
dealloc reg0
to
<op> ref,reg1
}
if MatchOpType(taicpu(p),top_ref,top_reg) and
MatchOpType(taicpu(hp1),top_reg,top_reg) and
MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
MatchInstruction(hp1,[A_AND,A_OR,A_XOR,A_ADD,A_SUB,A_CMP],[Taicpu(p).opsize]) and
not(MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^)) and
RegEndOfLife(taicpu(p).oper[1]^.reg,taicpu(hp1)) then
begin
taicpu(hp1).loadoper(0,taicpu(p).oper[0]^);
DebugMsg(SPeepholeOptimization + 'MovOp2Op done',hp1);
RemoveCurrentp(p, hp1);
Result:=true;
exit;
end;
{$ifdef x86_64}
{ Convert:
movq x(ref),%reg64
shrq y,%reg64
To:
movl x+4(ref),%reg32
shrl y-32,%reg32 (Remove if y = 32)
}
if (taicpu(p).opsize = S_Q) and
(taicpu(p).oper[0]^.typ = top_ref) and { Second operand will be a register }
(taicpu(p).oper[0]^.ref^.offset <= $7FFFFFFB) and
MatchInstruction(hp1, A_SHR, [taicpu(p).opsize]) and
MatchOpType(taicpu(hp1), top_const, top_reg) and
(taicpu(hp1).oper[0]^.val >= 32) and
(taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
begin
RegName1 := debug_regname(taicpu(hp1).oper[1]^.reg);
PreMessage := 'movq ' + debug_operstr(taicpu(p).oper[0]^) + ',' + RegName1 + '; ' +
'shrq $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + RegName1 + ' -> movl ';
{ Convert to 32-bit }
setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
taicpu(p).opsize := S_L;
Inc(taicpu(p).oper[0]^.ref^.offset, 4);
PreMessage := PreMessage + debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(p).oper[1]^.reg);
if (taicpu(hp1).oper[0]^.val = 32) then
begin
DebugMsg(SPeepholeOptimization + PreMessage + ' (MovShr2Mov)', p);
RemoveInstruction(hp1);
end
else
begin
{ This will potentially open up more arithmetic operations since
the peephole optimizer now has a big hint that only the lower
32 bits are currently in use (and opcodes are smaller in size) }
setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
taicpu(hp1).opsize := S_L;
Dec(taicpu(hp1).oper[0]^.val, 32);
DebugMsg(SPeepholeOptimization + PreMessage +
'; shrl $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' (MovShr2MovShr)', p);
end;
Result := True;
Exit;
end;
{$endif x86_64}
{ Backward optimisation. If we have:
func. %reg1,%reg2
mov %reg2,%reg3
(dealloc %reg2)
Change to:
func. %reg1,%reg3 (see comment below for what a valid func. is)
}
if MatchOpType(taicpu(p), top_reg, top_reg) then
begin
CurrentReg := taicpu(p).oper[0]^.reg;
ActiveReg := taicpu(p).oper[1]^.reg;
TransferUsedRegs(TmpUsedRegs);
if not RegUsedAfterInstruction(CurrentReg, p, TmpUsedRegs) and
GetLastInstruction(p, hp2) and
(hp2.typ = ait_instruction) and
{ Have to make sure it's an instruction that only reads from
operand 1 and only writes (not reads or modifies) from operand 2;
in essence, a one-operand pure function such as BSR or POPCNT }
(taicpu(hp2).ops = 2) and
(insprop[taicpu(hp2).opcode].Ch * [Ch_Rop1, Ch_Wop2] = [Ch_Rop1, Ch_Wop2]) and
(taicpu(hp2).oper[1]^.typ = top_reg) and
(taicpu(hp2).oper[1]^.reg = CurrentReg) then
begin
case taicpu(hp2).opcode of
A_FSTSW, A_FNSTSW,
A_IN, A_INS, A_OUT, A_OUTS,
A_CMPS, A_LODS, A_MOVS, A_SCAS, A_STOS,
{ These routines have explicit operands, but they are restricted in
what they can be (e.g. IN and OUT can only read from AL, AX or
EAX. }
A_CMOVcc:
{ CMOV is not valid either because then CurrentReg will depend
on an unknown value if the condition is False and hence is
not a pure write }
;
else
begin
DebugMsg(SPeepholeOptimization + 'Removed MOV and changed destination on previous instruction to optimise register usage (FuncMov2Func)', p);
taicpu(hp2).oper[1]^.reg := ActiveReg;
AllocRegBetween(ActiveReg, hp2, p, TmpUsedRegs);
RemoveCurrentp(p, hp1);
Result := True;
Exit;
end;
end;
end;
end;
end;
{ Pass 1 optimisation for MOVSS/MOVSD/MOVAPS/... style register/memory copies:
  removes a second movXX that merely copies the value back to where it came
  from (either deleting both instructions when the destination register dies,
  or just the redundant copy-back otherwise).

  p is updated to the next instruction to process; True is returned when the
  instruction stream was changed. }
function TX86AsmOptimizer.OptPass1MOVXX(var p : tai) : boolean;
  var
    hp1 : tai;
  begin
    Result:=false;
    if taicpu(p).ops <> 2 then
      exit;
    { For reg,reg copies we may search further ahead (past instructions that
      do not touch the destination register); otherwise only look at the
      immediately following instruction }
    if (MatchOpType(taicpu(p),top_reg,top_reg) and GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg)) or
      GetNextInstruction(p,hp1) then
      begin
        if MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
          (taicpu(hp1).ops = 2) then
          begin
            if (taicpu(hp1).oper[0]^.typ = taicpu(p).oper[1]^.typ) and
              (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[0]^.typ) then
              { movXX reg1, mem1     or     movXX mem1, reg1
                movXX mem2, reg2            movXX reg2, mem2}
              begin
                if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[0]^) then
                  { movXX reg1, mem1     or     movXX mem1, reg1
                    movXX mem2, reg1            movXX reg2, mem1}
                  begin
                    if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
                      begin
                        { Removes the second statement from
                            movXX reg1, mem1/reg2
                            movXX mem1/reg2, reg1
                        }
                        if taicpu(p).oper[0]^.typ=top_reg then
                          AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,usedregs);
                        { Removes both statements from
                            movXX mem1/reg1, reg2
                            movXX reg2, mem1/reg1
                          when reg2 is no longer used afterwards
                        }
                        if (taicpu(p).oper[1]^.typ=top_reg) and
                          not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,UsedRegs)) then
                          begin
                            DebugMsg(SPeepholeOptimization + 'MovXXMovXX2Nop 1 done',p);
                            RemoveInstruction(hp1);
                            RemoveCurrentp(p); { p will now be equal to the instruction that follows what was hp1 }
                            Result:=true;
                            exit;
                          end
                        { The copy-back may only be removed when neither of its
                          operands is a volatile reference.  Note the explicit
                          parentheses: "and" binds more tightly than "or" in
                          Pascal, so without them this condition degenerates to
                          A or (B and C) or D, which both wrongly fires for
                          volatile references and may evaluate ref^ on a
                          non-reference operand. }
                        else if ((taicpu(hp1).oper[1]^.typ<>top_ref) or (not(vol_write in taicpu(hp1).oper[1]^.ref^.volatility))) and
                          ((taicpu(hp1).oper[0]^.typ<>top_ref) or (not(vol_read in taicpu(hp1).oper[0]^.ref^.volatility))) then
                          begin
                            DebugMsg(SPeepholeOptimization + 'MovXXMovXX2MoVXX 1 done',p);
                            RemoveInstruction(hp1);
                            Result:=true;
                            exit;
                          end;
                      end
                  end;
              end;
          end;
      end;
  end;
{ Pass 1 optimisation for two-operand SSE/AVX arithmetic:
    <Op>X %mreg1,%mreg2   // Op in [ADD,MUL]
    MovX  %mreg2,%mreg1
    dealloc %mreg2
  becomes
    <Op>X %mreg2,%mreg1
  Single and double operations are mixed here because the compiler is
  assumed to generate vmovapd only after double operations and vmovaps
  only after single operations. }
function TX86AsmOptimizer.OptPass1OP(var p : tai) : boolean;
  var
    NextInstr : tai;
  begin
    result:=false;
    if not GetNextInstruction(p,NextInstr) then
      exit;
    if not MatchInstruction(NextInstr,A_MOVAPD,A_MOVAPS,[S_NO]) then
      exit;
    { The move must copy the op's destination back into its (register)
      source operand }
    if (taicpu(p).oper[0]^.typ<>top_reg) or
      not MatchOperand(taicpu(p).oper[1]^,taicpu(NextInstr).oper[0]^) or
      not MatchOperand(taicpu(p).oper[0]^,taicpu(NextInstr).oper[1]^) then
      exit;
    TransferUsedRegs(TmpUsedRegs);
    UpdateUsedRegs(TmpUsedRegs, tai(p.next));
    { %mreg2 must die after the move, otherwise the transformation is unsafe }
    if RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,NextInstr,TmpUsedRegs) then
      exit;
    taicpu(p).loadoper(0,taicpu(NextInstr).oper[0]^);
    taicpu(p).loadoper(1,taicpu(NextInstr).oper[1]^);
    DebugMsg(SPeepholeOptimization + 'OpMov2Op done',p);
    RemoveInstruction(NextInstr);
    result:=true;
  end;
{ Pass 1 optimisations for TEST instructions.  Three independent steps:
  1. Normalise "all bits set" constants (e.g. $FF for byte size) to -1 so
     later comparisons between TEST constants are size-agnostic;
  2. Try to swap a preceding MOV with the TEST (TrySwapMovCmp);
  3. Merge or prune TEST/Jcc/TEST/Jcc chains that examine the same operand,
     and redirect a jump whose target label is immediately followed by an
     equivalent TEST/Jcc pair.
  Returns True whenever the instruction stream was changed. }
function TX86AsmOptimizer.OptPass1Test(var p: tai) : boolean;
  var
    hp1, p_label, p_dist, hp1_dist: tai;
    JumpLabel, JumpLabel_dist: TAsmLabel;
    FirstValue, SecondValue: TCGInt;
  begin
    Result := False;
    if (taicpu(p).oper[0]^.typ = top_const) and
      (taicpu(p).oper[0]^.val <> -1) then
      begin
        { Convert unsigned maximum constants to -1 to aid optimisation }
        case taicpu(p).opsize of
          S_B:
            if (taicpu(p).oper[0]^.val and $FF) = $FF then
              begin
                taicpu(p).oper[0]^.val := -1;
                Result := True;
                Exit;
              end;
          S_W:
            if (taicpu(p).oper[0]^.val and $FFFF) = $FFFF then
              begin
                taicpu(p).oper[0]^.val := -1;
                Result := True;
                Exit;
              end;
          S_L:
            if (taicpu(p).oper[0]^.val and $FFFFFFFF) = $FFFFFFFF then
              begin
                taicpu(p).oper[0]^.val := -1;
                Result := True;
                Exit;
              end;
        {$ifdef x86_64}
          S_Q:
            { Storing anything greater than $7FFFFFFF is not possible so do
              nothing };
        {$endif x86_64}
          else
            InternalError(2021121001);
        end;
      end;
    if GetNextInstruction(p, hp1) and
      TrySwapMovCmp(p, hp1) then
      begin
        Result := True;
        Exit;
      end;
    { NOTE(review): if GetNextInstruction above returned False, hp1 is used
      uninitialised/nil in the MatchInstruction calls below; this relies on a
      TEST always being followed by a flag-consuming instruction — confirm. }
    { Search for:
        test  $x,(reg/ref)
        jne   @lbl1
        test  $y,(reg/ref)  (same register or reference)
        jne   @lbl1
      Change to:
        test  $(x or y),(reg/ref)
        jne   @lbl1
      (Note, this doesn't work with je instead of jne)
      Also catch cases where "cmp $0,(reg/ref)" and "test %reg,%reg" are used.
      Also search for:
        test  $x,(reg/ref)
        je    @lbl1
        test  $y,(reg/ref)
        je/jne @lbl2
      If (x or y) = x, then the second jump is deterministic
    }
    if (
      (
        (taicpu(p).oper[0]^.typ = top_const) or
        (
          { test %reg,%reg can be considered equivalent to test, -1,%reg }
          (taicpu(p).oper[0]^.typ = top_reg) and
          MatchOperand(taicpu(p).oper[1]^, taicpu(p).oper[0]^.reg)
        )
      ) and
      MatchInstruction(hp1, A_JCC, [])
      ) then
      begin
        { FirstValue/SecondValue hold the effective TEST masks, with -1
          standing in for "tests all bits" (test %reg,%reg / cmp $0) }
        if (taicpu(p).oper[0]^.typ = top_reg) and
          MatchOperand(taicpu(p).oper[1]^, taicpu(p).oper[0]^.reg) then
          FirstValue := -1
        else
          FirstValue := taicpu(p).oper[0]^.val;
        { If we have several test/jne's in a row, it might be the case that
          the second label doesn't go to the same location, but the one
          after it might (e.g. test; jne @lbl1; test; jne @lbl2; test @lbl1),
          so accommodate for this with a while loop.
        }
        hp1_dist := hp1;
        if GetNextInstruction(hp1, p_dist) and
          (p_dist.typ = ait_instruction) and
          (
            (
              (taicpu(p_dist).opcode = A_TEST) and
              (
                (taicpu(p_dist).oper[0]^.typ = top_const) or
                { test %reg,%reg can be considered equivalent to test, -1,%reg }
                MatchOperand(taicpu(p_dist).oper[1]^, taicpu(p_dist).oper[0]^)
              )
            ) or
            (
              { cmp 0,%reg = test %reg,%reg }
              (taicpu(p_dist).opcode = A_CMP) and
              MatchOperand(taicpu(p_dist).oper[0]^, 0)
            )
          ) and
          { Make sure the destination operands are actually the same }
          MatchOperand(taicpu(p_dist).oper[1]^, taicpu(p).oper[1]^) and
          GetNextInstruction(p_dist, hp1_dist) and
          MatchInstruction(hp1_dist, A_JCC, []) then
          begin
            if
              (taicpu(p_dist).opcode = A_CMP) { constant will be zero } or
              (
                (taicpu(p_dist).oper[0]^.typ = top_reg) and
                MatchOperand(taicpu(p_dist).oper[1]^, taicpu(p_dist).oper[0]^.reg)
              ) then
              SecondValue := -1
            else
              SecondValue := taicpu(p_dist).oper[0]^.val;
            { If both of the TEST constants are identical, delete the second
              TEST that is unnecessary. }
            if (FirstValue = SecondValue) then
              begin
                DebugMsg(SPeepholeOptimization + 'TEST/Jcc/TEST; removed superfluous TEST', p_dist);
                RemoveInstruction(p_dist);
                { Don't let the flags register become deallocated and reallocated between the jumps }
                AllocRegBetween(NR_DEFAULTFLAGS, hp1, hp1_dist, UsedRegs);
                Result := True;
                if condition_in(taicpu(hp1_dist).condition, taicpu(hp1).condition) then
                  begin
                    { Since the second jump's condition is a subset of the first, we
                      know it will never branch because the first jump dominates it.
                      Get it out of the way now rather than wait for the jump
                      optimisations for a speed boost. }
                    if IsJumpToLabel(taicpu(hp1_dist)) then
                      TAsmLabel(taicpu(hp1_dist).oper[0]^.ref^.symbol).DecRefs;
                    DebugMsg(SPeepholeOptimization + 'Removed dominated jump (via TEST/Jcc/TEST)', hp1_dist);
                    RemoveInstruction(hp1_dist);
                  end
                else if condition_in(inverse_cond(taicpu(hp1).condition), taicpu(hp1_dist).condition) then
                  begin
                    { If the inverse of the first condition is a subset of the second,
                      the second one will definitely branch if the first one doesn't }
                    DebugMsg(SPeepholeOptimization + 'Conditional jump will always branch (via TEST/Jcc/TEST)', hp1_dist);
                    MakeUnconditional(taicpu(hp1_dist));
                    RemoveDeadCodeAfterJump(hp1_dist);
                  end;
                Exit;
              end;
            if (taicpu(hp1).condition in [C_NE, C_NZ]) and
              (taicpu(hp1_dist).condition in [C_NE, C_NZ]) and
              { If the first instruction is test %reg,%reg or test $-1,%reg,
                then the second jump will never branch, so it can also be
                removed regardless of where it goes }
              (
                (FirstValue = -1) or
                (SecondValue = -1) or
                MatchOperand(taicpu(hp1_dist).oper[0]^, taicpu(hp1).oper[0]^)
              ) then
              begin
                { Same jump location... can be a register since nothing's changed }
                { If any of the entries are equivalent to test %reg,%reg, then the
                  merged $(x or y) is also test %reg,%reg / test $-1,%reg }
                taicpu(p).loadconst(0, FirstValue or SecondValue);
                if IsJumpToLabel(taicpu(hp1_dist)) then
                  TAsmLabel(taicpu(hp1_dist).oper[0]^.ref^.symbol).DecRefs;
                DebugMsg(SPeepholeOptimization + 'TEST/JNE/TEST/JNE merged', p);
                RemoveInstruction(hp1_dist);
                { Only remove the second test if no jumps or other conditional instructions follow }
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
                if not RegUsedAfterInstruction(NR_DEFAULTFLAGS, p_dist, TmpUsedRegs) then
                  RemoveInstruction(p_dist);
                Result := True;
                Exit;
              end;
          end;
      end;
    { Search for:
        test  %reg,%reg
        j(c1) @lbl1
        ...
      @lbl:
        test  %reg,%reg  (same register)
        j(c2) @lbl2
      If c2 is a subset of c1, change to:
        test  %reg,%reg
        j(c1) @lbl2
      (@lbl1 may become a dead label as a result)
    }
    if (taicpu(p).oper[1]^.typ = top_reg) and
      (taicpu(p).oper[0]^.typ = top_reg) and
      (taicpu(p).oper[0]^.reg = taicpu(p).oper[1]^.reg) and
      MatchInstruction(hp1, A_JCC, []) and
      IsJumpToLabel(taicpu(hp1)) then
      begin
        JumpLabel := TAsmLabel(taicpu(hp1).oper[0]^.ref^.symbol);
        p_label := nil;
        if Assigned(JumpLabel) then
          p_label := getlabelwithsym(JumpLabel);
        if Assigned(p_label) and
          GetNextInstruction(p_label, p_dist) and
          MatchInstruction(p_dist, A_TEST, []) and
          { It's fine if the second test uses smaller sub-registers }
          (taicpu(p_dist).opsize <= taicpu(p).opsize) and
          MatchOpType(taicpu(p_dist), top_reg, top_reg) and
          SuperRegistersEqual(taicpu(p_dist).oper[0]^.reg, taicpu(p).oper[0]^.reg) and
          SuperRegistersEqual(taicpu(p_dist).oper[1]^.reg, taicpu(p).oper[1]^.reg) and
          GetNextInstruction(p_dist, hp1_dist) and
          MatchInstruction(hp1_dist, A_JCC, []) then { This doesn't have to be an explicit label }
          begin
            JumpLabel_dist := TAsmLabel(taicpu(hp1_dist).oper[0]^.ref^.symbol);
            if JumpLabel = JumpLabel_dist then
              { This is an infinite loop }
              Exit;
            { Best optimisation when the first condition is a subset (or equal) of the second }
            if condition_in(taicpu(hp1).condition, taicpu(hp1_dist).condition) then
              begin
                { Any registers used here will already be allocated }
                if Assigned(JumpLabel_dist) then
                  JumpLabel_dist.IncRefs;
                if Assigned(JumpLabel) then
                  JumpLabel.DecRefs;
                DebugMsg(SPeepholeOptimization + 'TEST/Jcc/@Lbl/TEST/Jcc -> TEST/Jcc, redirecting first jump', hp1);
                taicpu(hp1).loadref(0, taicpu(hp1_dist).oper[0]^.ref^);
                Result := True;
                Exit;
              end;
          end;
      end;
  end;
{ Pass-1 peephole handler for ADD.  All patterns require "add $const,%reg":
    1. AddLea2Lea: fold the constant into the offset of a following LEA that
       uses %reg as base and/or index, when %reg is overwritten or dies.
    2. Merge "add $c1,%reg" with a later "add/sub $c2,%reg" on the same
       register into a single instruction (emitted as SUB of the negated
       value if the combined constant comes out negative).
    3. If the later add/sub is not a constant, move the constant addition
       after it (flags must be dead) to expose further optimisations.
  Returns True when p was changed or removed. }
function TX86AsmOptimizer.OptPass1Add(var p : tai) : boolean;
  var
    hp1, hp2: tai;
    ActiveReg: TRegister;   { destination register of the ADD at p }
    OldOffset: asizeint;    { saved LEA offset so an overflow can be rolled back }
    ThisConst: TCGInt;      { combined constant of two merged instructions }

  { True if ActiveReg is no longer live after hp1 }
  function RegDeallocated: Boolean;
    begin
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(p.next));
      Result := not(RegUsedAfterInstruction(ActiveReg,hp1,TmpUsedRegs))
    end;

  begin
    result:=false;
    hp1 := nil;
    { replace
        addX const,%reg1
        leaX (%reg1,%reg1,Y),%reg2 // Base or index might not be equal to reg1
        dealloc %reg1
      by
        leaX const+const*Y(%reg1,%reg1,Y),%reg2
    }
    if MatchOpType(taicpu(p),top_const,top_reg) then
      begin
        ActiveReg := taicpu(p).oper[1]^.reg;
        { Ensures the entire register was updated }
        if (taicpu(p).opsize >= S_L) and
          GetNextInstructionUsingReg(p,hp1, ActiveReg) and
          MatchInstruction(hp1,A_LEA,[]) and
          (SuperRegistersEqual(ActiveReg, taicpu(hp1).oper[0]^.ref^.base) or
           SuperRegistersEqual(ActiveReg, taicpu(hp1).oper[0]^.ref^.index)) and
          (
            { Cover the case where the register in the reference is also the destination register }
            Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, ActiveReg) or
            (
              { Try to avoid the expensive check of RegUsedAfterInstruction if we know it will return False }
              not SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ActiveReg) and
              RegDeallocated
            )
          ) then
          begin
            OldOffset := taicpu(hp1).oper[0]^.ref^.offset;
{$push}
{$R-}{$Q-}
            { Explicitly disable overflow checking for these offset calculation
              as those do not matter for the final result }
            if ActiveReg=taicpu(hp1).oper[0]^.ref^.base then
              inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.val);
            { If the register is used as the index, the constant is multiplied
              by the scale factor (a scale of 0 counts as 1) }
            if ActiveReg=taicpu(hp1).oper[0]^.ref^.index then
              inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.val*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
{$pop}
{$ifdef x86_64}
            if (taicpu(hp1).oper[0]^.ref^.offset > $7FFFFFFF) or (taicpu(hp1).oper[0]^.ref^.offset < -2147483648) then
              begin
                { Overflow; abort }
                taicpu(hp1).oper[0]^.ref^.offset := OldOffset;
              end
            else
{$endif x86_64}
              begin
                DebugMsg(SPeepholeOptimization + 'AddLea2Lea done',p);
                if not (cs_opt_level3 in current_settings.optimizerswitches) then
                  { hp1 is the immediate next instruction for sure - good for a quick speed boost }
                  RemoveCurrentP(p, hp1)
                else
                  RemoveCurrentP(p);
                result:=true;
                Exit;
              end;
          end;
        if (
          { Save calling GetNextInstructionUsingReg again }
          Assigned(hp1) or
          GetNextInstructionUsingReg(p,hp1, ActiveReg)
          ) and
          MatchInstruction(hp1,A_ADD,A_SUB,[taicpu(p).opsize]) and
          (taicpu(hp1).oper[1]^.reg = ActiveReg) then
          begin
            if taicpu(hp1).oper[0]^.typ = top_const then
              begin
                { Merge add const1,%reg; add/sub const2,%reg to add const1+/-const2,%reg }
                if taicpu(hp1).opcode = A_ADD then
                  ThisConst := taicpu(p).oper[0]^.val + taicpu(hp1).oper[0]^.val
                else
                  ThisConst := taicpu(p).oper[0]^.val - taicpu(hp1).oper[0]^.val;
                Result := True;
                { Handle any overflows }
                case taicpu(p).opsize of
                  S_B:
                    taicpu(p).oper[0]^.val := ThisConst and $FF;
                  S_W:
                    taicpu(p).oper[0]^.val := ThisConst and $FFFF;
                  S_L:
                    taicpu(p).oper[0]^.val := ThisConst and $FFFFFFFF;
{$ifdef x86_64}
                  S_Q:
                    if (ThisConst > $7FFFFFFF) or (ThisConst < -2147483648) then
                      { Overflow; abort }
                      Result := False
                    else
                      taicpu(p).oper[0]^.val := ThisConst;
{$endif x86_64}
                  else
                    InternalError(2021102610);
                end;
                { Result may get set to False again if the combined immediate overflows for S_Q sizes }
                if Result then
                  begin
                    { Emit a negative combined constant as a subtraction of the
                      positive value; -128/-32768/-2147483648 cannot be negated
                      within their operand size, so those are left as ADD }
                    if (taicpu(p).oper[0]^.val < 0) and
                      (
                        ((taicpu(p).opsize = S_B) and (taicpu(p).oper[0]^.val <> -128)) or
                        ((taicpu(p).opsize = S_W) and (taicpu(p).oper[0]^.val <> -32768)) or
                        ((taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and (taicpu(p).oper[0]^.val <> -2147483648))
                      ) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'ADD; ADD/SUB -> SUB',p);
                        taicpu(p).opcode := A_SUB;
                        taicpu(p).oper[0]^.val := -taicpu(p).oper[0]^.val;
                      end
                    else
                      DebugMsg(SPeepholeOptimization + 'ADD; ADD/SUB -> ADD',p);
                    RemoveInstruction(hp1);
                  end;
              end
            else
              begin
                { Make doubly sure the flags aren't in use because the order of additions may affect them }
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                hp2 := p;
                { At -O3 and above, hp1 may not be the immediate successor of p,
                  so walk the intermediate instructions to keep the register
                  tracking accurate }
                while not (cs_opt_level3 in current_settings.optimizerswitches) and
                  GetNextInstruction(hp2, hp2) and (hp2 <> hp1) do
                  UpdateUsedRegs(TmpUsedRegs, tai(hp2.next));
                if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
                  begin
                    { Move the constant addition to after the reg/ref addition to improve optimisation }
                    DebugMsg(SPeepholeOptimization + 'Add/sub swap 1a done',p);
                    Asml.Remove(p);
                    Asml.InsertAfter(p, hp1);
                    p := hp1;
                    Result := True;
                  end;
              end;
          end;
      end;
  end;
{ Pass-1 peephole handler for LEA.  Performs, in order:
    - strips meaningless segment prefixes;
    - Lea2Mov / Lea2Nop: "lea (%reg1),%reg2" -> "mov %reg1,%reg2" or nothing;
    - ConvertLEA: lea -> add when flags are free (not when adjusting %esp/%rsp
      unless optimising for size);
    - LeaMov2Lea: lea <ref>,%reg1; mov %reg1,%reg2 -> lea <ref>,%reg2 when
      %reg1 dies;
    - LeaOp2Op: substitute the lea's reference directly into a following
      instruction that addresses through the destination register;
    - LeaLea2Lea 1/2: merge two dependent lea instructions (offset merging
      and scale-factor combining, final scale limited to 8);
    - LeaShl2Lea: fold a following "shl $y" (y <= 3) into the lea's scale
      factor and offset.
  Returns True when the instruction stream was changed. }
function TX86AsmOptimizer.OptPass1LEA(var p : tai) : boolean;
  var
    hp1: tai;
    ref: Integer;           { operand number of the reference being rewritten }
    saveref: treference;    { backup for restoring the reference on failure }
    TempReg: TRegister;
    Multiple: TCGInt;       { 2^y for the LeaShl2Lea transformation }
  begin
    Result:=false;
    { removes seg register prefixes from LEA operations, as they
      don't do anything}
    taicpu(p).oper[0]^.ref^.Segment:=NR_NO;
    { changes "lea (%reg1), %reg2" into "mov %reg1, %reg2" }
    if (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
       (taicpu(p).oper[0]^.ref^.index = NR_NO) and
       (
         { do not mess with leas accessing the stack pointer
           unless it's a null operation }
         (taicpu(p).oper[1]^.reg <> NR_STACK_POINTER_REG) or
         (
           (taicpu(p).oper[0]^.ref^.base = NR_STACK_POINTER_REG) and
           (taicpu(p).oper[0]^.ref^.offset = 0)
         )
       ) and
       (not(Assigned(taicpu(p).oper[0]^.ref^.Symbol))) then
      begin
        if (taicpu(p).oper[0]^.ref^.offset = 0) then
          begin
            if (taicpu(p).oper[0]^.ref^.base <> taicpu(p).oper[1]^.reg) then
              begin
                hp1:=taicpu.op_reg_reg(A_MOV,taicpu(p).opsize,taicpu(p).oper[0]^.ref^.base,
                  taicpu(p).oper[1]^.reg);
                InsertLLItem(p.previous,p.next, hp1);
                DebugMsg(SPeepholeOptimization + 'Lea2Mov done',hp1);
                p.free;
                p:=hp1;
              end
            else
              begin
                { lea (%reg1),%reg1 does nothing at all }
                DebugMsg(SPeepholeOptimization + 'Lea2Nop done',p);
                RemoveCurrentP(p);
              end;
            Result:=true;
            exit;
          end
        else if (
          { continue to use lea to adjust the stack pointer,
            it is the recommended way, but only if not optimizing for size }
          (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) or
          (cs_opt_size in current_settings.optimizerswitches)
          ) and
          { If the flags register is in use, don't change the instruction
            to an ADD otherwise this will scramble the flags. [Kit] }
          not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
          ConvertLEA(taicpu(p)) then
          begin
            Result:=true;
            exit;
          end;
      end;
    if GetNextInstruction(p,hp1) and
      (hp1.typ=ait_instruction) then
      begin
        { lea <ref>,%reg1; mov %reg1,%reg2 -> lea <ref>,%reg2 (reg1 dead) }
        if MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
           MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
           MatchOpType(Taicpu(hp1),top_reg,top_reg) and
           (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
              begin
                taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
                DebugMsg(SPeepholeOptimization + 'LeaMov2Lea done',p);
                RemoveInstruction(hp1);
                result:=true;
                exit;
              end;
          end;
        { changes
            lea <ref1>, reg1
            <op> ...,<ref. with reg1>,...
          to
            <op> ...,<ref1>,... }
        if (taicpu(p).oper[1]^.reg<>current_procinfo.framepointer) and
           (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) and
           not(MatchInstruction(hp1,A_LEA,[])) then
          begin
            { find a reference which uses reg1 }
            if (taicpu(hp1).ops>=1) and (taicpu(hp1).oper[0]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^) then
              ref:=0
            else if (taicpu(hp1).ops>=2) and (taicpu(hp1).oper[1]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^) then
              ref:=1
            else
              ref:=-1;
            if (ref<>-1) and
              { reg1 must be either the base or the index }
              ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) xor (taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg)) then
              begin
                { reg1 can be removed from the reference }
                saveref:=taicpu(hp1).oper[ref]^.ref^;
                if taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg then
                  taicpu(hp1).oper[ref]^.ref^.base:=NR_NO
                else if taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg then
                  taicpu(hp1).oper[ref]^.ref^.index:=NR_NO
                else
                  Internalerror(2019111201);
                { check if we can insert all data of the lea into the second instruction }
                if ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) or (taicpu(hp1).oper[ref]^.ref^.scalefactor <= 1)) and
                   ((taicpu(p).oper[0]^.ref^.base=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.base=NR_NO)) and
                   ((taicpu(p).oper[0]^.ref^.index=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.index=NR_NO)) and
                   ((taicpu(p).oper[0]^.ref^.symbol=nil) or (taicpu(hp1).oper[ref]^.ref^.symbol=nil)) and
                   ((taicpu(p).oper[0]^.ref^.relsymbol=nil) or (taicpu(hp1).oper[ref]^.ref^.relsymbol=nil)) and
                   ((taicpu(p).oper[0]^.ref^.scalefactor <= 1) or (taicpu(hp1).oper[ref]^.ref^.scalefactor <= 1)) and
                   (taicpu(p).oper[0]^.ref^.segment=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.segment=NR_NO)
{$ifdef x86_64}
                   and (abs(taicpu(hp1).oper[ref]^.ref^.offset+taicpu(p).oper[0]^.ref^.offset)<=$7fffffff)
                   and (((taicpu(p).oper[0]^.ref^.base<>NR_RIP) and (taicpu(p).oper[0]^.ref^.index<>NR_RIP)) or
                        ((taicpu(hp1).oper[ref]^.ref^.base=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.index=NR_NO))
                       )
{$endif x86_64}
                   then
                  begin
                    { reg1 might not be used by the second instruction after it is removed from the reference }
                    if not(RegInInstruction(taicpu(p).oper[1]^.reg,taicpu(hp1))) then
                      begin
                        TransferUsedRegs(TmpUsedRegs);
                        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                        { reg1 is not updated so it might not be used afterwards }
                        if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                          begin
                            DebugMsg(SPeepholeOptimization + 'LeaOp2Op done',p);
                            if taicpu(p).oper[0]^.ref^.base<>NR_NO then
                              taicpu(hp1).oper[ref]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                            if taicpu(p).oper[0]^.ref^.index<>NR_NO then
                              taicpu(hp1).oper[ref]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
                            if taicpu(p).oper[0]^.ref^.symbol<>nil then
                              taicpu(hp1).oper[ref]^.ref^.symbol:=taicpu(p).oper[0]^.ref^.symbol;
                            if taicpu(p).oper[0]^.ref^.relsymbol<>nil then
                              taicpu(hp1).oper[ref]^.ref^.relsymbol:=taicpu(p).oper[0]^.ref^.relsymbol;
                            if taicpu(p).oper[0]^.ref^.scalefactor > 1 then
                              taicpu(hp1).oper[ref]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
                            inc(taicpu(hp1).oper[ref]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                            RemoveCurrentP(p, hp1);
                            result:=true;
                            exit;
                          end
                      end;
                  end;
                { recover }
                taicpu(hp1).oper[ref]^.ref^:=saveref;
              end;
          end;
      end;
    { for now, we do not mess with the stack pointer, though it might be useful to remove
      unneeded lea sequences on the stack pointer, it needs to be tested in detail }
    if (taicpu(p).oper[1]^.reg <> NR_STACK_POINTER_REG) and
      GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) then
      begin
        { Check common LEA/LEA conditions }
        if MatchInstruction(hp1,A_LEA,[taicpu(p).opsize]) and
          (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
          (taicpu(p).oper[0]^.ref^.relsymbol = nil) and
          (taicpu(p).oper[0]^.ref^.segment = NR_NO) and
          (taicpu(p).oper[0]^.ref^.symbol = nil) and
          (taicpu(hp1).oper[0]^.ref^.relsymbol = nil) and
          (taicpu(hp1).oper[0]^.ref^.segment = NR_NO) and
          (taicpu(hp1).oper[0]^.ref^.symbol = nil) and
          (
            (taicpu(p).oper[0]^.ref^.base = NR_NO) or { Don't call RegModifiedBetween unnecessarily }
            not(RegModifiedBetween(taicpu(p).oper[0]^.ref^.base,p,hp1))
          ) and (
            (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) or { Don't call RegModifiedBetween unnecessarily }
            (taicpu(p).oper[0]^.ref^.index = NR_NO) or
            not(RegModifiedBetween(taicpu(p).oper[0]^.ref^.index,p,hp1))
          ) then
          begin
            { changes
                lea (regX,scale), reg1
                lea offset(reg1,reg1), reg1
              to
                lea offset(regX,scale*2), reg1
              and
                lea (regX,scale1), reg1
                lea offset(reg1,scale2), reg1
              to
                lea offset(regX,scale1*scale2), reg1
              ... so long as the final scale does not exceed 8
              (Similarly, allow the first instruction to be "lea (regX,regX),reg1")
            }
            if (taicpu(p).oper[0]^.ref^.offset = 0) and
              (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
              (
                (
                  (taicpu(p).oper[0]^.ref^.base = NR_NO)
                ) or (
                  (taicpu(p).oper[0]^.ref^.scalefactor <= 1) and
                  (
                    (taicpu(p).oper[0]^.ref^.base = taicpu(p).oper[0]^.ref^.index) and
                    not(RegUsedBetween(taicpu(p).oper[0]^.ref^.index, p, hp1))
                  )
                )
              ) and (
                (
                  { lea (reg1,scale2), reg1 variant }
                  (taicpu(hp1).oper[0]^.ref^.base = NR_NO) and
                  (
                    (
                      (taicpu(p).oper[0]^.ref^.base = NR_NO) and
                      (taicpu(hp1).oper[0]^.ref^.scalefactor * taicpu(p).oper[0]^.ref^.scalefactor <= 8)
                    ) or (
                      { lea (regX,regX), reg1 variant }
                      (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
                      (taicpu(hp1).oper[0]^.ref^.scalefactor <= 4)
                    )
                  )
                ) or (
                  { lea (reg1,reg1), reg1 variant }
                  (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
                  (taicpu(hp1).oper[0]^.ref^.scalefactor <= 1)
                )
              ) then
              begin
                DebugMsg(SPeepholeOptimization + 'LeaLea2Lea 2 done',p);
                { Make everything homogeneous to make calculations easier }
                if (taicpu(p).oper[0]^.ref^.base <> NR_NO) then
                  begin
                    if taicpu(p).oper[0]^.ref^.index <> NR_NO then
                      { Convert lea (regX,regX),reg1 to lea (regX,2),reg1 }
                      taicpu(p).oper[0]^.ref^.scalefactor := 2
                    else
                      taicpu(p).oper[0]^.ref^.index := taicpu(p).oper[0]^.ref^.base;
                    taicpu(p).oper[0]^.ref^.base := NR_NO;
                  end;
                if (taicpu(hp1).oper[0]^.ref^.base = NR_NO) then
                  begin
                    { Just to prevent miscalculations }
                    if (taicpu(hp1).oper[0]^.ref^.scalefactor = 0) then
                      taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(p).oper[0]^.ref^.scalefactor
                    else
                      taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(hp1).oper[0]^.ref^.scalefactor * taicpu(p).oper[0]^.ref^.scalefactor;
                  end
                else
                  begin
                    { base = index here, so base+index doubles the scale }
                    taicpu(hp1).oper[0]^.ref^.base := NR_NO;
                    taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(p).oper[0]^.ref^.scalefactor * 2;
                  end;
                taicpu(hp1).oper[0]^.ref^.index := taicpu(p).oper[0]^.ref^.index;
                RemoveCurrentP(p);
                result:=true;
                exit;
              end
            { changes
                lea offset1(regX), reg1
                lea offset2(reg1), reg1
              to
                lea offset1+offset2(regX), reg1 }
            else if
              (
                (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
                (taicpu(p).oper[0]^.ref^.index = NR_NO)
              ) or (
                (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
                (taicpu(hp1).oper[0]^.ref^.scalefactor <= 1) and
                (
                  (
                    (taicpu(p).oper[0]^.ref^.index = NR_NO) or
                    (taicpu(p).oper[0]^.ref^.base = NR_NO)
                  ) or (
                    (taicpu(p).oper[0]^.ref^.scalefactor <= 1) and
                    (
                      (taicpu(p).oper[0]^.ref^.index = NR_NO) or
                      (
                        (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) and
                        (
                          (taicpu(hp1).oper[0]^.ref^.index = NR_NO) or
                          (taicpu(hp1).oper[0]^.ref^.base = NR_NO)
                        )
                      )
                    )
                  )
                )
              ) then
              begin
                DebugMsg(SPeepholeOptimization + 'LeaLea2Lea 1 done',p);
                if taicpu(hp1).oper[0]^.ref^.index=taicpu(p).oper[1]^.reg then
                  begin
                    taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.base;
                    inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
                    { if the register is used as index and base, we have to increase for base as well
                      and adapt base }
                    if taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg then
                      begin
                        taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                        inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                      end;
                  end
                else
                  begin
                    inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                    taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                  end;
                if taicpu(p).oper[0]^.ref^.index<>NR_NO then
                  begin
                    taicpu(hp1).oper[0]^.ref^.base:=taicpu(hp1).oper[0]^.ref^.index;
                    taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
                    taicpu(hp1).oper[0]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
                  end;
                RemoveCurrentP(p);
                result:=true;
                exit;
              end;
          end;
        { Change:
            leal/q $x(%reg1),%reg2
            ...
            shll/q $y,%reg2
          To:
            leal/q $(x*2^y)(%reg1,2^y),%reg2 (if y <= 3)
        }
        if MatchInstruction(hp1, A_SHL, [taicpu(p).opsize]) and
          MatchOpType(taicpu(hp1), top_const, top_reg) and
          (taicpu(hp1).oper[0]^.val <= 3) then
          begin
            Multiple := 1 shl taicpu(hp1).oper[0]^.val;
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
            TempReg := taicpu(hp1).oper[1]^.reg; { Store locally to reduce the number of dereferences }
            if
              { This allows the optimisation in some circumstances even if the lea instruction already has a scale factor
                (this works even if scalefactor is zero) }
              ((Multiple * taicpu(p).oper[0]^.ref^.scalefactor) <= 8) and
              { Ensure offset doesn't go out of bounds }
              (abs(taicpu(p).oper[0]^.ref^.offset * Multiple) <= $7FFFFFFF) and
              not (RegInUsedRegs(NR_DEFAULTFLAGS,TmpUsedRegs)) and
              MatchOperand(taicpu(p).oper[1]^, TempReg) and
              (
                (
                  not SuperRegistersEqual(taicpu(p).oper[0]^.ref^.base, TempReg) and
                  (
                    (taicpu(p).oper[0]^.ref^.index = NR_NO) or
                    (taicpu(p).oper[0]^.ref^.index = NR_INVALID) or
                    (
                      { Check for lea $x(%reg1,%reg1),%reg2 and treat it as if it were lea $x(%reg1,2),%reg2 }
                      (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) and
                      (taicpu(p).oper[0]^.ref^.scalefactor <= 1)
                    )
                  )
                ) or (
                  (
                    (taicpu(p).oper[0]^.ref^.base = NR_NO) or
                    (taicpu(p).oper[0]^.ref^.base = NR_INVALID)
                  ) and
                  not SuperRegistersEqual(taicpu(p).oper[0]^.ref^.index, TempReg)
                )
              ) then
              begin
                repeat
                  with taicpu(p).oper[0]^.ref^ do
                    begin
                      { Convert lea $x(%reg1,%reg1),%reg2 to lea $x(%reg1,2),%reg2 }
                      if index = base then
                        begin
                          if Multiple > 4 then
                            { Optimisation will no longer work because resultant
                              scale factor will exceed 8 }
                            Break;
                          base := NR_NO;
                          scalefactor := 2;
                          DebugMsg(SPeepholeOptimization + 'lea $x(%reg1,%reg1),%reg2 -> lea $x(%reg1,2),%reg2 for following optimisation', p);
                        end
                      else if (base <> NR_NO) and (base <> NR_INVALID) then
                        begin
                          { Scale factor only works on the index register }
                          index := base;
                          base := NR_NO;
                        end;
                      { For safety }
                      if scalefactor <= 1 then
                        begin
                          DebugMsg(SPeepholeOptimization + 'LeaShl2Lea 1', p);
                          scalefactor := Multiple;
                        end
                      else
                        begin
                          DebugMsg(SPeepholeOptimization + 'LeaShl2Lea 2', p);
                          scalefactor := scalefactor * Multiple;
                        end;
                      offset := offset * Multiple;
                    end;
                  RemoveInstruction(hp1);
                  Result := True;
                  Exit;
                { This repeat..until loop exists for the benefit of Break }
                until True;
              end;
          end;
      end;
  end;
{ Tries to merge a "sub $const,x" at p with the immediately preceding
  instruction of the same operand size.  Handled predecessors on the same
  destination operand: DEC, SUB $const and ADD $const — the constants are
  folded into p.  If a preceding ADD exactly cancels the subtraction, both
  instructions are removed and p is repositioned to the previous
  instruction (or the next one if there is no previous).  Returns True
  only in that case, i.e. only when p itself was removed.
  p must have a constant first operand (enforced with an internal error). }
function TX86AsmOptimizer.DoSubAddOpt(var p: tai): Boolean;
  var
    hp1 : tai;
  begin
    DoSubAddOpt := False;
    if taicpu(p).oper[0]^.typ <> top_const then
      { Should have been confirmed before calling }
      InternalError(2021102601);
    if GetLastInstruction(p, hp1) and
      (hp1.typ = ait_instruction) and
      (taicpu(hp1).opsize = taicpu(p).opsize) then
      case taicpu(hp1).opcode Of
        A_DEC:
          { dec x; sub $c,x -> sub $(c+1),x }
          if MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) then
            begin
              taicpu(p).loadConst(0,taicpu(p).oper[0]^.val+1);
              RemoveInstruction(hp1);
            end;
        A_SUB:
          { sub $c1,x; sub $c2,x -> sub $(c1+c2),x }
          if (taicpu(hp1).oper[0]^.typ = top_const) and
            MatchOperand(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) then
            begin
              taicpu(p).loadConst(0,taicpu(p).oper[0]^.val+taicpu(hp1).oper[0]^.val);
              RemoveInstruction(hp1);
            end;
        A_ADD:
          begin
            { add $c1,x; sub $c2,x -> sub $(c2-c1),x; both removed if zero }
            if (taicpu(hp1).oper[0]^.typ = top_const) and
              MatchOperand(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) then
              begin
                taicpu(p).loadConst(0,taicpu(p).oper[0]^.val-taicpu(hp1).oper[0]^.val);
                RemoveInstruction(hp1);
                if (taicpu(p).oper[0]^.val = 0) then
                  begin
                    hp1 := tai(p.next);
                    RemoveInstruction(p); { Note, the choice to not use RemoveCurrentp is deliberate }
                    { Reposition p so the caller continues from a valid node }
                    if not GetLastInstruction(hp1, p) then
                      p := hp1;
                    DoSubAddOpt := True;
                  end
              end;
          end;
        else
          ;
      end;
  end;
{ Folds a memory load into a following comparison:
    mov   mem,%reg
    cmp/test  x,%reg  (or "test %reg,%reg" / "test $-1,%reg")
    (%reg deallocated)
  becomes
    cmp/test  x,mem   (or "cmp $0,mem" for the test-self forms)
  p is the MOV, hp1 the CMP/TEST.  When UpdateTmpUsedRegs is True, the
  routine primes TmpUsedRegs itself; otherwise the caller must already
  have called TransferUsedRegs.  Returns True (and removes p) when the
  fold was performed. }
function TX86AsmOptimizer.DoMovCmpMemOpt(var p : tai; const hp1: tai; UpdateTmpUsedRegs: Boolean) : Boolean;
  begin
    Result := False;
    if UpdateTmpUsedRegs then
      TransferUsedRegs(TmpUsedRegs);
    if MatchOpType(taicpu(p),top_ref,top_reg) and
      { The x86 assemblers have difficulty comparing values against absolute addresses }
      (taicpu(p).oper[0]^.ref^.refaddr <> addr_full) and
      { The CMP/TEST source must not itself be memory - x86 has no mem,mem forms }
      (taicpu(hp1).oper[0]^.typ <> top_ref) and
      MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
      (
        (
          (taicpu(hp1).opcode = A_TEST)
        ) or (
          (taicpu(hp1).opcode = A_CMP) and
          { A sanity check more than anything }
          not MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg)
        )
      ) then
      begin
        { change
            mov mem, %reg
            cmp/test x, %reg / test %reg,%reg
            (reg deallocated)
          to
            cmp/test x, mem / cmp 0, mem
        }
        UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
        if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) then
          begin
            { Convert test %reg,%reg or test $-1,%reg to cmp $0,mem }
            if (taicpu(hp1).opcode = A_TEST) and
              (
                MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) or
                MatchOperand(taicpu(hp1).oper[0]^, -1)
              ) then
              begin
                taicpu(hp1).opcode := A_CMP;
                taicpu(hp1).loadconst(0, 0);
              end;
            taicpu(hp1).loadref(1, taicpu(p).oper[0]^.ref^);
            DebugMsg(SPeepholeOptimization + 'MOV/CMP -> CMP (memory check)', p);
            RemoveCurrentP(p, hp1);
            Result := True;
            Exit;
          end;
      end;
  end;
{ Pass-1 peephole handler for SUB.  All patterns require "sub $const,%reg":
    1. SubLea2Lea: fold the subtracted constant into the offset of a
       following LEA that uses %reg as base and/or index, when %reg is
       overwritten or dies.
    2. Merge with a later "sub $const2,%reg" into one instruction (emitted
       as ADD of the negated value if the combined constant is negative).
    3. If the later SUB is not a constant, move the constant subtraction
       after it (flags must be dead) to expose further optimisations.
    4. (i386 only) "subl $2,%esp; pushw x" -> "pushl x".
    5. Merge with the *previous* instruction via DoSubAddOpt.
  Returns True when the instruction stream was changed. }
function TX86AsmOptimizer.OptPass1Sub(var p : tai) : boolean;
  var
    hp1, hp2: tai;
    ActiveReg: TRegister;   { destination register of the SUB at p }
    OldOffset: asizeint;    { saved LEA offset so an overflow can be rolled back }
    ThisConst: TCGInt;      { combined constant of two merged instructions }

  { True if ActiveReg is no longer live after hp1 }
  function RegDeallocated: Boolean;
    begin
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(p.next));
      Result := not(RegUsedAfterInstruction(ActiveReg,hp1,TmpUsedRegs))
    end;

  begin
    Result:=false;
    hp1 := nil;
    { replace
        subX const,%reg1
        leaX (%reg1,%reg1,Y),%reg2 // Base or index might not be equal to reg1
        dealloc %reg1
      by
        leaX -const-const*Y(%reg1,%reg1,Y),%reg2
    }
    if MatchOpType(taicpu(p),top_const,top_reg) then
      begin
        ActiveReg := taicpu(p).oper[1]^.reg;
        { Ensures the entire register was updated }
        if (taicpu(p).opsize >= S_L) and
          GetNextInstructionUsingReg(p,hp1, ActiveReg) and
          MatchInstruction(hp1,A_LEA,[]) and
          (SuperRegistersEqual(ActiveReg, taicpu(hp1).oper[0]^.ref^.base) or
           SuperRegistersEqual(ActiveReg, taicpu(hp1).oper[0]^.ref^.index)) and
          (
            { Cover the case where the register in the reference is also the destination register }
            Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, ActiveReg) or
            (
              { Try to avoid the expensive check of RegUsedAfterInstruction if we know it will return False }
              not SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ActiveReg) and
              RegDeallocated
            )
          ) then
          begin
            OldOffset := taicpu(hp1).oper[0]^.ref^.offset;
{$push}
{$R-}{$Q-}
            { Explicitly disable overflow/range checking for these offset
              calculations (as in OptPass1Add) as intermediate overflows do
              not matter for the final result }
            if ActiveReg=taicpu(hp1).oper[0]^.ref^.base then
              Dec(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.val);
            { If the register is used as the index, the constant is multiplied
              by the scale factor (a scale of 0 counts as 1) }
            if ActiveReg=taicpu(hp1).oper[0]^.ref^.index then
              Dec(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.val*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
{$pop}
{$ifdef x86_64}
            if (taicpu(hp1).oper[0]^.ref^.offset > $7FFFFFFF) or (taicpu(hp1).oper[0]^.ref^.offset < -2147483648) then
              begin
                { Overflow; abort }
                taicpu(hp1).oper[0]^.ref^.offset := OldOffset;
              end
            else
{$endif x86_64}
              begin
                DebugMsg(SPeepholeOptimization + 'SubLea2Lea done',p);
                if not (cs_opt_level3 in current_settings.optimizerswitches) then
                  { hp1 is the immediate next instruction for sure - good for a quick speed boost }
                  RemoveCurrentP(p, hp1)
                else
                  RemoveCurrentP(p);
                result:=true;
                Exit;
              end;
          end;
        if (
          { Save calling GetNextInstructionUsingReg again }
          Assigned(hp1) or
          GetNextInstructionUsingReg(p,hp1, ActiveReg)
          ) and
          MatchInstruction(hp1,A_SUB,[taicpu(p).opsize]) and
          (taicpu(hp1).oper[1]^.reg = ActiveReg) then
          begin
            if taicpu(hp1).oper[0]^.typ = top_const then
              begin
                { Merge sub const1,%reg; sub const2,%reg to sub const1+const2,%reg }
                ThisConst := taicpu(p).oper[0]^.val + taicpu(hp1).oper[0]^.val;
                Result := True;
                { Handle any overflows }
                case taicpu(p).opsize of
                  S_B:
                    taicpu(p).oper[0]^.val := ThisConst and $FF;
                  S_W:
                    taicpu(p).oper[0]^.val := ThisConst and $FFFF;
                  S_L:
                    taicpu(p).oper[0]^.val := ThisConst and $FFFFFFFF;
{$ifdef x86_64}
                  S_Q:
                    if (ThisConst > $7FFFFFFF) or (ThisConst < -2147483648) then
                      { Overflow; abort }
                      Result := False
                    else
                      taicpu(p).oper[0]^.val := ThisConst;
{$endif x86_64}
                  else
                    InternalError(2021102610);
                end;
                { Result may get set to False again if the combined immediate overflows for S_Q sizes }
                if Result then
                  begin
                    { A negative combined constant means the net effect is an
                      addition, so emit ADD of the negated value (fixed: this
                      previously set A_SUB, inverting the operation for S_Q);
                      -128/-32768/-2147483648 cannot be negated within their
                      operand size and are left alone }
                    if (taicpu(p).oper[0]^.val < 0) and
                      (
                        ((taicpu(p).opsize = S_B) and (taicpu(p).oper[0]^.val <> -128)) or
                        ((taicpu(p).opsize = S_W) and (taicpu(p).oper[0]^.val <> -32768)) or
                        ((taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and (taicpu(p).oper[0]^.val <> -2147483648))
                      ) then
                      begin
                        DebugMsg(SPeepholeOptimization + 'SUB; ADD/SUB -> ADD',p);
                        taicpu(p).opcode := A_ADD;
                        taicpu(p).oper[0]^.val := -taicpu(p).oper[0]^.val;
                      end
                    else
                      DebugMsg(SPeepholeOptimization + 'SUB; ADD/SUB -> SUB',p);
                    RemoveInstruction(hp1);
                  end;
              end
            else
              begin
                { Make doubly sure the flags aren't in use because the order of subtractions may affect them }
                TransferUsedRegs(TmpUsedRegs);
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                hp2 := p;
                { At -O3 and above, hp1 may not be the immediate successor of p,
                  so walk the intermediate instructions to keep the register
                  tracking accurate }
                while not (cs_opt_level3 in current_settings.optimizerswitches) and
                  GetNextInstruction(hp2, hp2) and (hp2 <> hp1) do
                  UpdateUsedRegs(TmpUsedRegs, tai(hp2.next));
                if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
                  begin
                    { Move the constant subtraction to after the reg/ref addition to improve optimisation }
                    DebugMsg(SPeepholeOptimization + 'Add/sub swap 1b done',p);
                    Asml.Remove(p);
                    Asml.InsertAfter(p, hp1);
                    p := hp1;
                    Result := True;
                    Exit;
                  end;
              end;
          end;
        { * change "subl $2, %esp; pushw x" to "pushl x"}
        { * change "sub/add const1, reg" or "dec reg" followed by
            "sub const2, reg" to one "sub ..., reg" }
{$ifdef i386}
        if (taicpu(p).oper[0]^.val = 2) and
          (ActiveReg = NR_ESP) and
          { Don't do the sub/push optimization if the sub }
          { comes from setting up the stack frame (JM) }
          (not(GetLastInstruction(p,hp1)) or
           not(MatchInstruction(hp1,A_MOV,[S_L]) and
             MatchOperand(taicpu(hp1).oper[0]^,NR_ESP) and
             { Fixed: the destination operand (oper[1]) must be checked for
               %ebp - previously oper[0] was tested against both registers,
               which could never both match, so the guard never triggered }
             MatchOperand(taicpu(hp1).oper[1]^,NR_EBP))) then
          begin
            hp1 := tai(p.next);
            { Skip forward to the first instruction that touches %esp }
            while Assigned(hp1) and
              (tai(hp1).typ in [ait_instruction]+SkipInstr) and
              not RegReadByInstruction(NR_ESP,hp1) and
              not RegModifiedByInstruction(NR_ESP,hp1) do
              hp1 := tai(hp1.next);
            if Assigned(hp1) and
              MatchInstruction(hp1,A_PUSH,[S_W]) then
              begin
                { Widen the 16-bit push to 32 bits; it now provides the two
                  bytes the SUB used to reserve }
                taicpu(hp1).changeopsize(S_L);
                if taicpu(hp1).oper[0]^.typ=top_reg then
                  setsubreg(taicpu(hp1).oper[0]^.reg,R_SUBWHOLE);
                hp1 := tai(p.next);
                RemoveCurrentp(p, hp1);
                Result:=true;
                exit;
              end;
          end;
{$endif i386}
        if DoSubAddOpt(p) then
          Result:=true;
      end;
  end;
{ Pass-1 peephole handler for SHL/SAL.  All patterns require
  "shl/sal $const,%reg":
    1. ShlAddLeaSubIncDec2Lea: for shifts of 1..3 on 32/64-bit registers,
       absorb a following sequence of ADD/SUB/INC/DEC/LEA on the same
       register into a single LEA (scale = 1 shl const), provided the next
       instruction does not read the flags.
    2. (i386, pre-Pentium2) rewrite "shl $1" as "add %reg,%reg" and
       "shl $2/$3" as an LEA with scale 4/8 for pairing reasons.
    3. ShlAnd2Shl / ShlMovAnd2Shl: remove (or tighten) a following AND whose
       mask only clears bits the shift already zeroed.
    4. ShlOp2Op: fold a shift of 1..3 into the scale factor of a reference
       in a following MOV/LEA that indexes with the shifted register.
  Returns True when the instruction stream was changed. }
function TX86AsmOptimizer.OptPass1SHLSAL(var p : tai) : boolean;
  var
    TmpBool1,TmpBool2 : Boolean;
    tmpref : treference;
    hp1,hp2: tai;
    mask: tcgint;     { bit mask known to be zero after the shift }
  begin
    Result:=false;
    { All these optimisations work on "shl/sal const,%reg" }
    if not MatchOpType(taicpu(p),top_const,top_reg) then
      Exit;
    if (taicpu(p).opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
       (taicpu(p).oper[0]^.val <= 3) then
      { Changes "shl const, %reg32; add const/reg, %reg32" to one lea statement }
      begin
        { should we check the next instruction? }
        TmpBool1 := True;
        { have we found an add/sub which could be
          integrated in the lea? }
        TmpBool2 := False;
        reference_reset(tmpref,2,[]);
        TmpRef.index := taicpu(p).oper[1]^.reg;
        TmpRef.scalefactor := 1 shl taicpu(p).oper[0]^.val;
        while TmpBool1 and
              GetNextInstruction(p, hp1) and
              (tai(hp1).typ = ait_instruction) and
              ((((taicpu(hp1).opcode = A_ADD) or
                 (taicpu(hp1).opcode = A_SUB)) and
                (taicpu(hp1).oper[1]^.typ = Top_Reg) and
                (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg)) or
               (((taicpu(hp1).opcode = A_INC) or
                 (taicpu(hp1).opcode = A_DEC)) and
                (taicpu(hp1).oper[0]^.typ = Top_Reg) and
                (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[1]^.reg)) or
               ((taicpu(hp1).opcode = A_LEA) and
                (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
                (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg))) and
              (not GetNextInstruction(hp1,hp2) or
               not instrReadsFlags(hp2)) Do
          begin
            TmpBool1 := False;
            if taicpu(hp1).opcode=A_LEA then
              begin
                { Absorb the LEA's base/offset/scale, provided the combined
                  scale does not exceed the architectural maximum of 8 }
                if (TmpRef.base = NR_NO) and
                   (taicpu(hp1).oper[0]^.ref^.symbol=nil) and
                   (taicpu(hp1).oper[0]^.ref^.relsymbol=nil) and
                   (taicpu(hp1).oper[0]^.ref^.segment=NR_NO) and
                   ((taicpu(hp1).oper[0]^.ref^.scalefactor=0) or
                    (taicpu(hp1).oper[0]^.ref^.scalefactor*tmpref.scalefactor<=8)) then
                  begin
                    TmpBool1 := True;
                    TmpBool2 := True;
                    inc(TmpRef.offset, taicpu(hp1).oper[0]^.ref^.offset);
                    if taicpu(hp1).oper[0]^.ref^.scalefactor<>0 then
                      tmpref.scalefactor:=tmpref.scalefactor*taicpu(hp1).oper[0]^.ref^.scalefactor;
                    TmpRef.base := taicpu(hp1).oper[0]^.ref^.base;
                    RemoveInstruction(hp1);
                  end
              end
            else if (taicpu(hp1).oper[0]^.typ = Top_Const) then
              begin
                { ADD/SUB with a constant goes into the offset }
                TmpBool1 := True;
                TmpBool2 := True;
                case taicpu(hp1).opcode of
                  A_ADD:
                    inc(TmpRef.offset, longint(taicpu(hp1).oper[0]^.val));
                  A_SUB:
                    dec(TmpRef.offset, longint(taicpu(hp1).oper[0]^.val));
                  else
                    internalerror(2019050536);
                end;
                RemoveInstruction(hp1);
              end
            else
              { ADD with a register becomes the base (if free); INC/DEC
                adjust the offset }
              if (taicpu(hp1).oper[0]^.typ = Top_Reg) and
                 (((taicpu(hp1).opcode = A_ADD) and
                   (TmpRef.base = NR_NO)) or
                  (taicpu(hp1).opcode = A_INC) or
                  (taicpu(hp1).opcode = A_DEC)) then
                begin
                  TmpBool1 := True;
                  TmpBool2 := True;
                  case taicpu(hp1).opcode of
                    A_ADD:
                      TmpRef.base := taicpu(hp1).oper[0]^.reg;
                    A_INC:
                      inc(TmpRef.offset);
                    A_DEC:
                      dec(TmpRef.offset);
                    else
                      internalerror(2019050535);
                  end;
                  RemoveInstruction(hp1);
                end;
          end;
        if TmpBool2
{$ifndef x86_64}
           or
           ((current_settings.optimizecputype < cpu_Pentium2) and
            (taicpu(p).oper[0]^.val <= 3) and
            not(cs_opt_size in current_settings.optimizerswitches))
{$endif x86_64}
          then
          begin
            if not(TmpBool2) and
               (taicpu(p).oper[0]^.val=1) then
              begin
                { Nothing was absorbed and the shift is by 1: a plain
                  "add %reg,%reg" is cheaper than an LEA }
                hp1:=taicpu.Op_reg_reg(A_ADD,taicpu(p).opsize,
                  taicpu(p).oper[1]^.reg, taicpu(p).oper[1]^.reg)
              end
            else
              hp1:=taicpu.op_ref_reg(A_LEA, taicpu(p).opsize, TmpRef,
                taicpu(p).oper[1]^.reg);
            DebugMsg(SPeepholeOptimization + 'ShlAddLeaSubIncDec2Lea',p);
            InsertLLItem(p.previous, p.next, hp1);
            p.free;
            p := hp1;
          end;
      end
{$ifndef x86_64}
    else if (current_settings.optimizecputype < cpu_Pentium2) then
      begin
        { changes "shl $1, %reg" to "add %reg, %reg", which is the same on a 386,
          but faster on a 486, and Tairable in both U and V pipes on the Pentium
          (unlike shl, which is only Tairable in the U pipe) }
        if taicpu(p).oper[0]^.val=1 then
          begin
            hp1 := taicpu.Op_reg_reg(A_ADD,taicpu(p).opsize,
              taicpu(p).oper[1]^.reg, taicpu(p).oper[1]^.reg);
            InsertLLItem(p.previous, p.next, hp1);
            p.free;
            p := hp1;
          end
        { changes "shl $2, %reg" to "lea (,%reg,4), %reg"
                  "shl $3, %reg" to "lea (,%reg,8), %reg }
        else if (taicpu(p).opsize = S_L) and
                (taicpu(p).oper[0]^.val<= 3) then
          begin
            reference_reset(tmpref,2,[]);
            TmpRef.index := taicpu(p).oper[1]^.reg;
            TmpRef.scalefactor := 1 shl taicpu(p).oper[0]^.val;
            hp1 := taicpu.Op_ref_reg(A_LEA,S_L,TmpRef, taicpu(p).oper[1]^.reg);
            InsertLLItem(p.previous, p.next, hp1);
            p.free;
            p := hp1;
          end;
      end
{$endif x86_64}
    else if
      GetNextInstruction(p, hp1) and (hp1.typ = ait_instruction) and MatchOpType(taicpu(hp1), top_const, top_reg) and
      (
        (
          MatchInstruction(hp1, A_AND, [taicpu(p).opsize]) and
          SetAndTest(hp1, hp2)
{$ifdef x86_64}
        ) or
        (
          MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
          GetNextInstruction(hp1, hp2) and
          MatchInstruction(hp2, A_AND, [taicpu(p).opsize]) and
          MatchOpType(taicpu(hp2), top_reg, top_reg) and
          (taicpu(hp1).oper[1]^.reg = taicpu(hp2).oper[0]^.reg)
{$endif x86_64}
        )
      ) and
      (taicpu(p).oper[1]^.reg = taicpu(hp2).oper[1]^.reg) then
      begin
        { Change:
            shl x, %reg1
            mov -(1<<x), %reg2
            and %reg2, %reg1
          Or:
            shl x, %reg1
            and -(1<<x), %reg1
          To just:
            shl x, %reg1
          Since the and operation only zeroes bits that are already zero from the shl operation
        }
        case taicpu(p).oper[0]^.val of
          8:
            mask:=$FFFFFFFFFFFFFF00;
          16:
            mask:=$FFFFFFFFFFFF0000;
          32:
            mask:=$FFFFFFFF00000000;
          63:
            { Constant pre-calculated to prevent overflow errors with Int64 }
            mask:=$8000000000000000;
          else
            begin
              if taicpu(p).oper[0]^.val >= 64 then
                { Shouldn't happen realistically, since the register
                  is guaranteed to be set to zero at this point }
                mask := 0
              else
                mask := -(Int64(1 shl taicpu(p).oper[0]^.val));
            end;
        end;
        if taicpu(hp1).oper[0]^.val = mask then
          begin
            { Everything checks out, perform the optimisation, as long as
              the FLAGS register isn't being used}
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
{$ifdef x86_64}
            if (hp1 <> hp2) then
              begin
                { "shl/mov/and" version }
                UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
                { Don't do the optimisation if the FLAGS register is in use }
                if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp2, TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'ShlMovAnd2Shl', p);
                    { Don't remove the 'mov' instruction if its register is used elsewhere }
                    if not(RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, hp2, TmpUsedRegs)) then
                      begin
                        RemoveInstruction(hp1);
                        Result := True;
                      end;
                    { Only set Result to True if the 'mov' instruction was removed }
                    RemoveInstruction(hp2);
                  end;
              end
            else
{$endif x86_64}
              begin
                { "shl/and" version }
                { Don't do the optimisation if the FLAGS register is in use }
                if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)) then
                  begin
                    DebugMsg(SPeepholeOptimization + 'ShlAnd2Shl', p);
                    RemoveInstruction(hp1);
                    Result := True;
                  end;
              end;
            Exit;
          end
        else {$ifdef x86_64}if (hp1 = hp2) then{$endif x86_64}
          begin
            { Even if the mask doesn't allow for its removal, we might be
              able to optimise the mask for the "shl/and" version, which
              may permit other peephole optimisations }
{$ifdef DEBUG_AOPTCPU}
            mask := taicpu(hp1).oper[0]^.val and mask;
            if taicpu(hp1).oper[0]^.val <> mask then
              begin
                { Fixed: a separating space was missing before "based on",
                  garbling the debug message }
                DebugMsg(
                  SPeepholeOptimization +
                  'Changed mask from $' + debug_tostr(taicpu(hp1).oper[0]^.val) +
                  ' to $' + debug_tostr(mask) +
                  ' based on previous instruction (ShlAnd2ShlAnd)', hp1);
                taicpu(hp1).oper[0]^.val := mask;
              end;
{$else DEBUG_AOPTCPU}
            { If debugging is off, just set the operand even if it's the same }
            taicpu(hp1).oper[0]^.val := taicpu(hp1).oper[0]^.val and mask;
{$endif DEBUG_AOPTCPU}
          end;
      end;
    {
      change
        shl/sal const,reg
        <op> ...(...,reg,1),...
      into
        <op> ...(...,reg,1 shl const),...
      if const in 1..3
    }
    if MatchOpType(taicpu(p), top_const, top_reg) and
      (taicpu(p).oper[0]^.val in [1..3]) and
      GetNextInstruction(p, hp1) and
      MatchInstruction(hp1,A_MOV,A_LEA,[]) and
      MatchOpType(taicpu(hp1), top_ref, top_reg) and
      (taicpu(p).oper[1]^.reg=taicpu(hp1).oper[0]^.ref^.index) and
      (taicpu(p).oper[1]^.reg<>taicpu(hp1).oper[0]^.ref^.base) and
      (taicpu(hp1).oper[0]^.ref^.scalefactor in [0,1]) then
      begin
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
        if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)) then
          begin
            taicpu(hp1).oper[0]^.ref^.scalefactor:=1 shl taicpu(p).oper[0]^.val;
            DebugMsg(SPeepholeOptimization + 'ShlOp2Op', p);
            RemoveCurrentP(p);
            Result:=true;
          end;
      end;
  end;
{ Attempts to merge a run of byte-sized memory writes into a single 32-bit
  write.  first_mov writes a byte (register or constant) to a 4-byte-aligned
  location; second_mov and the following MOVs must write constant zero to the
  successive byte offsets.  Recognised shapes:
    movb x,(ref); movb $0,1(ref); movb $0,2(ref); movb $0,3(ref)
    movb x,(ref); movb $0,1(ref); movw $0,2(ref)
  which become "movzbl x,%reg / movl %reg,(ref)" when x is a register, or a
  single "movl" when x is a constant.  Returns True if the rewrite was made.
  NOTE(review): only handles a zero upper 24 bits; non-zero constants in the
  trailing MOVs are deliberately not merged. }
function TX86AsmOptimizer.CheckMemoryWrite(var first_mov, second_mov: taicpu): Boolean;
var
  CurrentRef: TReference;       { Rolling copy of the target reference, offset bumped per byte }
  FullReg: TRegister;           { 32-bit super-register of first_mov's source register }
  hp1, hp2: tai;
begin
  Result := False;
  { Both leading writes must be single bytes }
  if (first_mov.opsize <> S_B) or (second_mov.opsize <> S_B) then
    Exit;
  { We assume you've checked if the operand is actually a reference by
    this point. If it isn't, you'll most likely get an access violation }
  CurrentRef := first_mov.oper[1]^.ref^;
  { Memory must be aligned }
  if (CurrentRef.offset mod 4) <> 0 then
    Exit;
  Inc(CurrentRef.offset);
  CurrentRef.alignment := 1; { Otherwise references_equal will return False }
  { second_mov must write zero to offset+1 of the same location }
  if MatchOperand(second_mov.oper[0]^, 0) and
    references_equal(second_mov.oper[1]^.ref^, CurrentRef) and
    GetNextInstruction(second_mov, hp1) and
    (hp1.typ = ait_instruction) and
    (taicpu(hp1).opcode = A_MOV) and
    MatchOpType(taicpu(hp1), top_const, top_ref) and
    (taicpu(hp1).oper[0]^.val = 0) then
    begin
      Inc(CurrentRef.offset);
      CurrentRef.alignment := taicpu(hp1).oper[1]^.ref^.alignment; { Otherwise references_equal might return False }
      FullReg := newreg(R_INTREGISTER,getsupreg(first_mov.oper[0]^.reg), R_SUBD);
      { hp1 must write zero to offset+2 }
      if references_equal(taicpu(hp1).oper[1]^.ref^, CurrentRef) then
        begin
          case taicpu(hp1).opsize of
            S_B:
              { Byte write at offset+2: need one more byte write of zero at offset+3 }
              if GetNextInstruction(hp1, hp2) and
                MatchInstruction(taicpu(hp2), A_MOV, [S_B]) and
                MatchOpType(taicpu(hp2), top_const, top_ref) and
                (taicpu(hp2).oper[0]^.val = 0) then
                begin
                  Inc(CurrentRef.offset);
                  CurrentRef.alignment := 1; { Otherwise references_equal will return False }
                  if references_equal(taicpu(hp2).oper[1]^.ref^, CurrentRef) and
                    (taicpu(hp2).opsize = S_B) then
                    begin
                      { All four bytes covered: collapse to one dword write }
                      RemoveInstruction(hp1);
                      RemoveInstruction(hp2);
                      first_mov.opsize := S_L;
                      if first_mov.oper[0]^.typ = top_reg then
                        begin
                          DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVb/MOVb -> MOVZX/MOVl', first_mov);
                          { Reuse second_mov as a MOVZX instruction }
                          second_mov.opcode := A_MOVZX;
                          second_mov.opsize := S_BL;
                          second_mov.loadreg(0, first_mov.oper[0]^.reg);
                          second_mov.loadreg(1, FullReg);
                          first_mov.oper[0]^.reg := FullReg;
                          { MOVZX must run before the (now dword) store }
                          asml.Remove(second_mov);
                          asml.InsertBefore(second_mov, first_mov);
                        end
                      else
                        { It's a value }
                        begin
                          DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVb/MOVb -> MOVl', first_mov);
                          RemoveInstruction(second_mov);
                        end;
                      Result := True;
                      Exit;
                    end;
                end;
            S_W:
              { Word write of zero at offset+2 covers the remaining two bytes }
              begin
                RemoveInstruction(hp1);
                first_mov.opsize := S_L;
                if first_mov.oper[0]^.typ = top_reg then
                  begin
                    DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVw -> MOVZX/MOVl', first_mov);
                    { Reuse second_mov as a MOVZX instruction }
                    second_mov.opcode := A_MOVZX;
                    second_mov.opsize := S_BL;
                    second_mov.loadreg(0, first_mov.oper[0]^.reg);
                    second_mov.loadreg(1, FullReg);
                    first_mov.oper[0]^.reg := FullReg;
                    asml.Remove(second_mov);
                    asml.InsertBefore(second_mov, first_mov);
                  end
                else
                  { It's a value }
                  begin
                    DebugMsg(SPeepholeOptimization + 'MOVb/MOVb/MOVw -> MOVl', first_mov);
                    RemoveInstruction(second_mov);
                  end;
                Result := True;
                Exit;
              end;
            else
              ;
          end;
        end;
    end;
end;
{ Optimises fstp/fld (and fistp/fild) pairs that store to and reload the
  exact same memory location with the same operand size:
    - if the value is dead at function exit, both instructions are removed;
    - otherwise the pair is collapsed to a non-popping fst/fist where that
      is numerically safe. }
function TX86AsmOptimizer.OptPass1FSTP(var p: tai): boolean;
{ returns true if a "continue" should be done after this optimization }
var
  hp1, hp2: tai;
begin
  Result := false;
  { p and its successor must form a store/reload pair on an identical reference }
  if MatchOpType(taicpu(p),top_ref) and
    GetNextInstruction(p, hp1) and
    (hp1.typ = ait_instruction) and
    (((taicpu(hp1).opcode = A_FLD) and
      (taicpu(p).opcode = A_FSTP)) or
     ((taicpu(p).opcode = A_FISTP) and
      (taicpu(hp1).opcode = A_FILD))) and
    MatchOpType(taicpu(hp1),top_ref) and
    (taicpu(hp1).opsize = taicpu(p).opsize) and
    RefsEqual(taicpu(p).oper[0]^.ref^, taicpu(hp1).oper[0]^.ref^) then
    begin
      { replacing fstp f;fld f by fst f is only valid for extended because of rounding or if fastmath is on }
      if ((taicpu(p).opsize=S_FX) or (cs_opt_fastmath in current_settings.optimizerswitches)) and
        GetNextInstruction(hp1, hp2) and
        (hp2.typ = ait_instruction) and
        IsExitCode(hp2) and
        { The store must go to a local temp in the current stack frame and
          must not overlap the function result location }
        (taicpu(p).oper[0]^.ref^.base = current_procinfo.FramePointer) and
        not(assigned(current_procinfo.procdef.funcretsym) and
           (taicpu(p).oper[0]^.ref^.offset < tabstractnormalvarsym(current_procinfo.procdef.funcretsym).localloc.reference.offset)) and
        (taicpu(p).oper[0]^.ref^.index = NR_NO) then
        begin
          { Dead store right before the exit code: drop the whole round trip }
          RemoveInstruction(hp1);
          RemoveCurrentP(p, hp2);
          RemoveLastDeallocForFuncRes(p);
          Result := true;
        end
      else
        { we can do this only in fast math mode as fstp is rounding ...
          ... still disabled as it breaks the compiler and/or rtl }
        if ({ (cs_opt_fastmath in current_settings.optimizerswitches) or }
          { ... or if another fstp equal to the first one follows }
          (GetNextInstruction(hp1,hp2) and
           (hp2.typ = ait_instruction) and
           (taicpu(p).opcode=taicpu(hp2).opcode) and
           (taicpu(p).opsize=taicpu(hp2).opsize))
          ) and
          { fst can't store an extended/comp value }
          (taicpu(p).opsize <> S_FX) and
          (taicpu(p).opsize <> S_IQ) then
          begin
            { Keep the value on the FPU stack: convert the popping store to a
              plain store and delete the reload }
            if (taicpu(p).opcode = A_FSTP) then
              taicpu(p).opcode := A_FST
            else
              taicpu(p).opcode := A_FIST;
            DebugMsg(SPeepholeOptimization + 'FstpFld2Fst',p);
            RemoveInstruction(hp1);
          end;
    end;
end;
{ Optimises FLD followed by a popping two-operand FPU arithmetic instruction
  (fxxxp st,st1), either folding the load into the arithmetic instruction or
  reusing an identical preceding memory load.  Returns True if p was removed. }
function TX86AsmOptimizer.OptPass1FLD(var p : tai) : boolean;
var
  hp1, hp2: tai;
begin
  result:=false;
  if MatchOpType(taicpu(p),top_reg) and
    GetNextInstruction(p, hp1) and
    (hp1.typ = Ait_Instruction) and
    MatchOpType(taicpu(hp1),top_reg,top_reg) and
    (taicpu(hp1).oper[0]^.reg = NR_ST) and
    (taicpu(hp1).oper[1]^.reg = NR_ST1) then
    { change to
        fld      reg               fxxx reg,st
        fxxxp    st, st1 (hp1)
      Remark: non commutative operations must be reversed!
    }
    begin
      case taicpu(hp1).opcode Of
        A_FMULP,A_FADDP,
        A_FSUBP,A_FDIVP,A_FSUBRP,A_FDIVRP:
          begin
            { Swap SUB<->SUBR and DIV<->DIVR because the operand order of
              the folded form is reversed relative to the popping form }
            case taicpu(hp1).opcode Of
              A_FADDP: taicpu(hp1).opcode := A_FADD;
              A_FMULP: taicpu(hp1).opcode := A_FMUL;
              A_FSUBP: taicpu(hp1).opcode := A_FSUBR;
              A_FSUBRP: taicpu(hp1).opcode := A_FSUB;
              A_FDIVP: taicpu(hp1).opcode := A_FDIVR;
              A_FDIVRP: taicpu(hp1).opcode := A_FDIV;
            else
              internalerror(2019050534);
            end;
            taicpu(hp1).oper[0]^.reg := taicpu(p).oper[0]^.reg;
            taicpu(hp1).oper[1]^.reg := NR_ST;
            RemoveCurrentP(p, hp1);
            Result:=true;
            exit;
          end;
      else
        ;
      end;
    end
  else
    if MatchOpType(taicpu(p),top_ref) and
      GetNextInstruction(p, hp2) and
      (hp2.typ = Ait_Instruction) and
      MatchOpType(taicpu(hp2),top_reg,top_reg) and
      (taicpu(p).opsize in [S_FS, S_FL]) and
      (taicpu(hp2).oper[0]^.reg = NR_ST) and
      (taicpu(hp2).oper[1]^.reg = NR_ST1) then
      { p loads from memory; check if the previous instruction already put
        the same memory value on the FPU stack }
      if GetLastInstruction(p, hp1) and
        MatchInstruction(hp1,A_FLD,A_FST,[taicpu(p).opsize]) and
        MatchOpType(taicpu(hp1),top_ref) and
        RefsEqual(taicpu(p).oper[0]^.ref^, taicpu(hp1).oper[0]^.ref^) then
        if ((taicpu(hp2).opcode = A_FMULP) or
            (taicpu(hp2).opcode = A_FADDP)) then
          { change to
              fld/fst   mem1  (hp1)      fld/fst   mem1
              fld       mem1  (p)        fadd/
              faddp/                     fmul     st, st
              fmulp  st, st1 (hp2) }
          begin
            RemoveCurrentP(p, hp1);
            if (taicpu(hp2).opcode = A_FADDP) then
              taicpu(hp2).opcode := A_FADD
            else
              taicpu(hp2).opcode := A_FMUL;
            taicpu(hp2).oper[1]^.reg := NR_ST;
          end
        else
          { change to
              fld/fst  mem1 (hp1)   fld/fst  mem1
              fld      mem1 (p)     fld      st}
          begin
            taicpu(p).changeopsize(S_FL);
            taicpu(p).loadreg(0,NR_ST);
          end
      else
        begin
          case taicpu(hp2).opcode Of
            A_FMULP,A_FADDP,A_FSUBP,A_FDIVP,A_FSUBRP,A_FDIVRP:
              { change to
                  fld/fst mem1 (hp1)      fld/fst mem1
                  fld     mem2 (p)        fxxx    mem2
                  fxxxp   st, st1 (hp2)                      }
              begin
                { Non-commutative operations must be reversed (see above) }
                case taicpu(hp2).opcode Of
                  A_FADDP: taicpu(p).opcode := A_FADD;
                  A_FMULP: taicpu(p).opcode := A_FMUL;
                  A_FSUBP: taicpu(p).opcode := A_FSUBR;
                  A_FSUBRP: taicpu(p).opcode := A_FSUB;
                  A_FDIVP: taicpu(p).opcode := A_FDIVR;
                  A_FDIVRP: taicpu(p).opcode := A_FDIV;
                else
                  internalerror(2019050533);
                end;
                RemoveInstruction(hp2);
              end
          else
            ;
          end
        end
end;
{ Returns True if any state that satisfies cond1 also satisfies cond2
  when the flags were produced by an integer comparison }
function IsCmpSubset(cond1, cond2: TAsmCond): Boolean; inline;
begin
  if condition_in(cond1, cond2) then
    Result := True
  else
    { Not strictly subsets due to the actual flags checked, but because we're
      comparing integers, E is a subset of AE and GE and their aliases }
    Result := (cond1 in [C_E, C_Z]) and (cond2 in [C_AE, C_NB, C_NC, C_GE, C_NL]);
end;
{ Pass-1 optimisations rooted at a CMP instruction: redirects chained
  conditional jumps over identical comparisons, removes superfluous repeated
  comparisons, merges CMP/JE/CMP/SETE sequences into branchless SETcc/OR,
  rewrites "cmp $0,%reg" as TEST with condition simplification, converts
  "cmp $1/jl" to "cmp $0/jle", and turns "cmp $8000...,reg/je" into "neg/jo"
  when the register is dead.  Returns True if anything changed. }
function TX86AsmOptimizer.OptPass1Cmp(var p: tai): boolean;
var
  v: TCGInt;
  hp1, hp2, p_dist, p_jump, hp1_dist, p_label, hp1_label: tai;
  FirstMatch: Boolean;
  NewReg: TRegister;
  JumpLabel, JumpLabel_dist, JumpLabel_far: TAsmLabel;
begin
  Result:=false;
  { All these optimisations need a next instruction }
  if not GetNextInstruction(p, hp1) then
    Exit;
  { Search for:
      cmp   ###,###
      j(c1) @lbl1
      ...
    @lbl:
      cmp   ###.### (same comparison as above)
      j(c2) @lbl2
    If c1 is a subset of c2, change to:
      cmp   ###,###
      j(c2) @lbl2
    (@lbl1 may become a dead label as a result)
  }
  { Also handle cases where there are multiple jumps in a row }
  p_jump := hp1;
  while Assigned(p_jump) and MatchInstruction(p_jump, A_JCC, []) do
    begin
      if IsJumpToLabel(taicpu(p_jump)) then
        begin
          JumpLabel := TAsmLabel(taicpu(p_jump).oper[0]^.ref^.symbol);
          p_label := nil;
          if Assigned(JumpLabel) then
            p_label := getlabelwithsym(JumpLabel);
          { Look at what immediately follows the jump target }
          if Assigned(p_label) and
            GetNextInstruction(p_label, p_dist) and
            MatchInstruction(p_dist, A_CMP, []) and
            MatchOperand(taicpu(p_dist).oper[0]^, taicpu(p).oper[0]^) and
            MatchOperand(taicpu(p_dist).oper[1]^, taicpu(p).oper[1]^) and
            GetNextInstruction(p_dist, hp1_dist) and
            MatchInstruction(hp1_dist, A_JCC, []) then { This doesn't have to be an explicit label }
            begin
              JumpLabel_dist := TAsmLabel(taicpu(hp1_dist).oper[0]^.ref^.symbol);
              if JumpLabel = JumpLabel_dist then
                { This is an infinite loop }
                Exit;
              { Best optimisation when the first condition is a subset (or equal) of the second }
              if IsCmpSubset(taicpu(p_jump).condition, taicpu(hp1_dist).condition) then
                begin
                  { Any registers used here will already be allocated }
                  if Assigned(JumpLabel_dist) then
                    JumpLabel_dist.IncRefs;
                  if Assigned(JumpLabel) then
                    JumpLabel.DecRefs;
                  DebugMsg(SPeepholeOptimization + 'CMP/Jcc/@Lbl/CMP/Jcc -> CMP/Jcc, redirecting first jump', p_jump);
                  taicpu(p_jump).condition := taicpu(hp1_dist).condition;
                  taicpu(p_jump).loadref(0, taicpu(hp1_dist).oper[0]^.ref^);
                  Result := True;
                  { Don't exit yet. Since p and p_jump haven't actually been
                    removed, we can check for more on this iteration }
                end
              else if IsCmpSubset(taicpu(hp1_dist).condition, inverse_cond(taicpu(p_jump).condition)) and
                GetNextInstruction(hp1_dist, hp1_label) and
                SkipAligns(hp1_label, hp1_label) and
                (hp1_label.typ = ait_label) then
                begin
                  JumpLabel_far := tai_label(hp1_label).labsym;
                  if (JumpLabel_far = JumpLabel_dist) or (JumpLabel_far = JumpLabel) then
                    { This is an infinite loop }
                    Exit;
                  if Assigned(JumpLabel_far) then
                    begin
                      { In this situation, if the first jump branches, the second one will
                        never branch, so change the destination label to after the second jump }
                      DebugMsg(SPeepholeOptimization + 'CMP/Jcc/@Lbl/CMP/Jcc/@Lbl -> CMP/Jcc, redirecting first jump to 2nd label', p_jump);
                      if Assigned(JumpLabel) then
                        JumpLabel.DecRefs;
                      JumpLabel_far.IncRefs;
                      taicpu(p_jump).oper[0]^.ref^.symbol := JumpLabel_far;
                      Result := True;
                      { Don't exit yet. Since p and p_jump haven't actually been
                        removed, we can check for more on this iteration }
                      Continue;
                    end;
                end;
            end;
        end;
      { Search for:
          cmp   ###,###
          j(c1) @lbl1
          cmp   ###,### (same as first)
        Remove second cmp
      }
      if GetNextInstruction(p_jump, hp2) and
        (
          (
            MatchInstruction(hp2, A_CMP, [taicpu(p).opsize]) and
            (
              (
                MatchOpType(taicpu(p), top_const, top_reg) and
                MatchOpType(taicpu(hp2), top_const, top_reg) and
                (taicpu(hp2).oper[0]^.val = taicpu(p).oper[0]^.val) and
                Reg1WriteOverwritesReg2Entirely(taicpu(hp2).oper[1]^.reg, taicpu(p).oper[1]^.reg)
              ) or (
                MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^) and
                MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^)
              )
            )
          ) or (
            { Also match cmp $0,%reg; jcc @lbl; test %reg,%reg }
            MatchOperand(taicpu(p).oper[0]^, 0) and
            (taicpu(p).oper[1]^.typ = top_reg) and
            MatchInstruction(hp2, A_TEST, []) and
            MatchOpType(taicpu(hp2), top_reg, top_reg) and
            (taicpu(hp2).oper[0]^.reg = taicpu(hp2).oper[1]^.reg) and
            Reg1WriteOverwritesReg2Entirely(taicpu(hp2).oper[1]^.reg, taicpu(p).oper[1]^.reg)
          )
        ) then
        begin
          DebugMsg(SPeepholeOptimization + 'CMP/Jcc/CMP; removed superfluous CMP', hp2);
          RemoveInstruction(hp2);
          Result := True;
          { Continue the while loop in case "Jcc/CMP" follows the second CMP that was just removed }
        end;
      GetNextInstruction(p_jump, p_jump);
    end;
  {
    Try to optimise the following:
      cmp       $x,###  ($x and $y can be registers or constants)
      je        @lbl1   (only reference)
      cmp       $y,###  (### are identical)
    @Lbl:
      sete      %reg1
    Change to:
      cmp       $x,###
      sete      %reg2   (allocate new %reg2)
      cmp       $y,###
      sete      %reg1
      orb       %reg2,%reg1
      (dealloc %reg2)
    This adds an instruction (so don't perform under -Os), but it removes
    a conditional branch.
  }
  if not (cs_opt_size in current_settings.optimizerswitches) and
    (
      { hp1 may have gone stale if the while loop above advanced p_jump }
      (hp1 = p_jump) or
      GetNextInstruction(p, hp1)
    ) and
    MatchInstruction(hp1, A_Jcc, []) and
    IsJumpToLabel(taicpu(hp1)) and
    (taicpu(hp1).condition in [C_E, C_Z]) and
    GetNextInstruction(hp1, hp2) and
    MatchInstruction(hp2, A_CMP, A_TEST, [taicpu(p).opsize]) and
    MatchOperand(taicpu(p).oper[1]^, taicpu(hp2).oper[1]^) and
    { The first operand of CMP instructions can only be a register or
      immediate anyway, so no need to check }
    GetNextInstruction(hp2, p_label) and
    (p_label.typ = ait_label) and
    (tai_label(p_label).labsym.getrefs = 1) and
    (JumpTargetOp(taicpu(hp1))^.ref^.symbol = tai_label(p_label).labsym) and
    GetNextInstruction(p_label, p_dist) and
    MatchInstruction(p_dist, A_SETcc, []) and
    (taicpu(p_dist).condition in [C_E, C_Z]) and
    (taicpu(p_dist).oper[0]^.typ = top_reg) then
    begin
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
      UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
      UpdateUsedRegs(TmpUsedRegs, tai(p_label.Next));
      UpdateUsedRegs(TmpUsedRegs, tai(p_dist.Next));
      if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) and
        { Get the instruction after the SETcc instruction so we can
          allocate a new register over the entire range }
        GetNextInstruction(p_dist, hp1_dist) then
        begin
          { Register can appear in p if it's not used afterwards, so only
            allocate between hp1 and hp1_dist }
          NewReg := GetIntRegisterBetween(R_SUBL, TmpUsedRegs, hp1, hp1_dist);
          if NewReg <> NR_NO then
            begin
              DebugMsg(SPeepholeOptimization + 'CMP/JE/CMP/@Lbl/SETE -> CMP/SETE/CMP/SETE/OR, removing conditional branch', p);
              { Change the jump instruction into a SETcc instruction }
              taicpu(hp1).opcode := A_SETcc;
              taicpu(hp1).opsize := S_B;
              taicpu(hp1).loadreg(0, NewReg);
              { This is now a dead label }
              tai_label(p_label).labsym.decrefs;
              { Prefer adding before the next instruction so the FLAGS
                register is deallocated first }
              AsmL.InsertBefore(
                taicpu.op_reg_reg(A_OR, S_B, NewReg, taicpu(p_dist).oper[0]^.reg),
                hp1_dist
              );
              Result := True;
              { Don't exit yet, as p wasn't changed and hp1, while
                modified, is still intact and might be optimised by the
                SETcc optimisation below }
            end;
        end;
    end;
  if taicpu(p).oper[0]^.typ = top_const then
    begin
      if (taicpu(p).oper[0]^.val = 0) and
        (taicpu(p).oper[1]^.typ = top_reg) and
        MatchInstruction(hp1,A_Jcc,A_SETcc,[]) then
        begin
          hp2 := p;
          FirstMatch := True;
          { When dealing with "cmp $0,%reg", only ZF and SF contain
            anything meaningful once it's converted to "test %reg,%reg";
            additionally, some jumps will always (or never) branch, so
            evaluate every jump immediately following the
            comparison, optimising the conditions if possible.
            Similarly with SETcc... those that are always set to 0 or 1
            are changed to MOV instructions }
          while FirstMatch or { Saves calling GetNextInstruction unnecessarily }
            (
              GetNextInstruction(hp2, hp1) and
              MatchInstruction(hp1,A_Jcc,A_SETcc,[])
            ) do
            begin
              FirstMatch := False;
              case taicpu(hp1).condition of
                C_B, C_C, C_NAE, C_O:
                  { For B/NAE:
                      Will never branch since an unsigned integer can never be below zero
                    For C/O:
                      Result cannot overflow because 0 is being subtracted
                  }
                  begin
                    if taicpu(hp1).opcode = A_Jcc then
                      begin
                        DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition B/C/NAE/O --> Never (jump removed)', hp1);
                        TAsmLabel(taicpu(hp1).oper[0]^.ref^.symbol).decrefs;
                        RemoveInstruction(hp1);
                        { Since hp1 was deleted, hp2 must not be updated }
                        Continue;
                      end
                    else
                      begin
                        DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition B/C/NAE/O --> Never (set -> mov 0)', hp1);
                        { Convert "set(c) %reg" instruction to "movb 0,%reg" }
                        taicpu(hp1).opcode := A_MOV;
                        taicpu(hp1).ops := 2;
                        taicpu(hp1).condition := C_None;
                        taicpu(hp1).opsize := S_B;
                        taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
                        taicpu(hp1).loadconst(0, 0);
                      end;
                  end;
                C_BE, C_NA:
                  begin
                    { Will only branch if equal to zero }
                    DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition BE/NA --> E', hp1);
                    taicpu(hp1).condition := C_E;
                  end;
                C_A, C_NBE:
                  begin
                    { Will only branch if not equal to zero }
                    DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition A/NBE --> NE', hp1);
                    taicpu(hp1).condition := C_NE;
                  end;
                C_AE, C_NB, C_NC, C_NO:
                  begin
                    { Will always branch }
                    DebugMsg(SPeepholeOptimization + 'Cmpcc2Testcc - condition AE/NB/NC/NO --> Always', hp1);
                    if taicpu(hp1).opcode = A_Jcc then
                      begin
                        MakeUnconditional(taicpu(hp1));
                        { Any jumps/set that follow will now be dead code }
                        RemoveDeadCodeAfterJump(taicpu(hp1));
                        Break;
                      end
                    else
                      begin
                        { Convert "set(c) %reg" instruction to "movb 1,%reg" }
                        taicpu(hp1).opcode := A_MOV;
                        taicpu(hp1).ops := 2;
                        taicpu(hp1).condition := C_None;
                        taicpu(hp1).opsize := S_B;
                        taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
                        taicpu(hp1).loadconst(0, 1);
                      end;
                  end;
                C_None:
                  InternalError(2020012201);
                C_P, C_PE, C_NP, C_PO:
                  { We can't handle parity checks and they should never be generated
                    after a general-purpose CMP (it's used in some floating-point
                    comparisons that don't use CMP) }
                  InternalError(2020012202);
                else
                  { Zero/Equality, Sign, their complements and all of the
                    signed comparisons do not need to be converted };
              end;
              hp2 := hp1;
            end;
          { Convert the instruction to a TEST }
          taicpu(p).opcode := A_TEST;
          taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
          Result := True;
          Exit;
        end
      else if (taicpu(p).oper[0]^.val = 1) and
        MatchInstruction(hp1,A_Jcc,A_SETcc,[]) and
        (taicpu(hp1).condition in [C_L, C_NGE]) then
        begin
          { Convert;       To:
              cmp $1,r/m     cmp $0,r/m
              jl  @lbl       jle @lbl
          }
          DebugMsg(SPeepholeOptimization + 'Cmp1Jl2Cmp0Jle', p);
          taicpu(p).oper[0]^.val := 0;
          taicpu(hp1).condition := C_LE;
          { If the instruction is now "cmp $0,%reg", convert it to a
            TEST (and effectively do the work of the "cmp $0,%reg" in
            the block above)
            If it's a reference, we can get away with not setting
            Result to True because we haven't evaluated the jump
            in this pass yet.
          }
          if (taicpu(p).oper[1]^.typ = top_reg) then
            begin
              taicpu(p).opcode := A_TEST;
              taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
              Result := True;
            end;
          Exit;
        end
      else if (taicpu(p).oper[1]^.typ = top_reg)
        {$ifdef x86_64}
        and (taicpu(p).opsize <> S_Q) { S_Q will never happen: cmp with 64 bit constants is not possible }
        {$endif x86_64}
        then
        begin
          { cmp  register,$8000                neg     register
            je   target                 -->    jo      target

            .... only if register is deallocated before jump.}
          case Taicpu(p).opsize of
            S_B: v:=$80;
            S_W: v:=$8000;
            S_L: v:=qword($80000000);
            else
              internalerror(2013112905);
          end;
          if (taicpu(p).oper[0]^.val=v) and
            MatchInstruction(hp1,A_Jcc,A_SETcc,[]) and
            (Taicpu(hp1).condition in [C_E,C_NE]) then
            begin
              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegs(TmpUsedRegs,tai(p.next));
              if not(RegInUsedRegs(Taicpu(p).oper[1]^.reg, TmpUsedRegs)) then
                begin
                  DebugMsg(SPeepholeOptimization + 'CmpJe2NegJo done',p);
                  Taicpu(p).opcode:=A_NEG;
                  Taicpu(p).loadoper(0,Taicpu(p).oper[1]^);
                  Taicpu(p).clearop(1);
                  Taicpu(p).ops:=1;
                  if Taicpu(hp1).condition=C_E then
                    Taicpu(hp1).condition:=C_O
                  else
                    Taicpu(hp1).condition:=C_NO;
                  Result:=true;
                  exit;
                end;
            end;
        end;
    end;
  { Last resort: try hoisting a preceding MOV past the CMP }
  if TrySwapMovCmp(p, hp1) then
    begin
      Result := True;
      Exit;
    end;
end;
{ Optimises PXOR reg,reg (register zeroing): removes a redundant repeated
  self-PXOR of the same register, or folds the zeroing into a following
  register-to-register MOVAPD/MOVAPS when the source becomes dead. }
function TX86AsmOptimizer.OptPass1PXor(var p: tai): boolean;
var
  hp1: tai;
begin
  {
    remove the second (v)pxor from

      pxor reg,reg
      ...
      pxor reg,reg
  }
  Result:=false;
  if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
    MatchOpType(taicpu(p),top_reg,top_reg) and
    GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
    MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
    MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
    MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^) then
    begin
      DebugMsg(SPeepholeOptimization + 'PXorPXor2PXor done',hp1);
      RemoveInstruction(hp1);
      Result:=true;
      Exit;
    end
  {
    replace

      pxor reg1,reg1
      movapd/s reg1,reg2
      dealloc reg1

    by

      pxor reg2,reg2
  }
  else if GetNextInstruction(p,hp1) and
    { we mix single and double operations here because we assume that the compiler
      generates vmovapd only after double operations and vmovaps only after single operations }
    MatchInstruction(hp1,A_MOVAPD,A_MOVAPS,[S_NO]) and
    MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^) and
    MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
    (taicpu(p).oper[0]^.typ=top_reg) then
    begin
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(p.next));
      { Only valid if reg1 is not live after the MOV }
      if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
        begin
          taicpu(p).loadoper(0,taicpu(hp1).oper[1]^);
          taicpu(p).loadoper(1,taicpu(hp1).oper[1]^);
          DebugMsg(SPeepholeOptimization + 'PXorMovapd2PXor done',p);
          RemoveInstruction(hp1);
          result:=true;
        end;
    end;
end;
{ Optimises the three-operand VPXOR reg,reg,reg self-zeroing form: removes
  a redundant repeated self-VPXOR, otherwise falls back to the generic
  vector-operation pass (OptPass1VOP). }
function TX86AsmOptimizer.OptPass1VPXor(var p: tai): boolean;
var
  hp1: tai;
begin
  {
    remove the second (v)pxor from

      (v)pxor reg,reg
      ...
      (v)pxor reg,reg
  }
  Result:=false;
  if MatchOperand(taicpu(p).oper[0]^,taicpu(p).oper[1]^,taicpu(p).oper[2]^) and
    MatchOpType(taicpu(p),top_reg,top_reg,top_reg) and
    GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
    MatchInstruction(hp1,taicpu(p).opcode,[taicpu(p).opsize]) and
    MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[0]^) and
    MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^,taicpu(hp1).oper[2]^) then
    begin
      DebugMsg(SPeepholeOptimization + 'VPXorVPXor2PXor done',hp1);
      RemoveInstruction(hp1);
      Result:=true;
      Exit;
    end
  else
    Result:=OptPass1VOP(p);
end;
{ Folds a MOV of the three-operand IMUL result directly into the IMUL
  destination when the intermediate register dies at the MOV. }
function TX86AsmOptimizer.OptPass1Imul(var p: tai): boolean;
var
  hp1 : tai;
begin
  result:=false;
  { replace
      IMul   const,%mreg1,%mreg2
      Mov    %mreg2,%mreg3
      dealloc  %mreg2
    by
      Imul   const,%mreg1,%mreg3 }
  if (taicpu(p).ops=3) and
    GetNextInstruction(p,hp1) and
    MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
    MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
    (taicpu(hp1).oper[1]^.typ=top_reg) then
    begin
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(p.next));
      { %mreg2 (the MOV's source) must not be live after the MOV }
      if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
        begin
          taicpu(p).loadoper(2,taicpu(hp1).oper[1]^);
          DebugMsg(SPeepholeOptimization + 'ImulMov2Imul done',p);
          RemoveInstruction(hp1);
          result:=true;
        end;
    end;
end;
{ Folds a MOV of a three-operand shift result directly into the shift's
  destination when the intermediate register dies at the MOV.
  (Same transformation as OptPass1Imul, applied to the SHXX opcodes the
  dispatcher routes here.) }
function TX86AsmOptimizer.OptPass1SHXX(var p: tai): boolean;
var
  hp1 : tai;
begin
  result:=false;
  { replace
      ShXX   %reg0,%reg1,%reg2
      Mov    %reg2,%reg3
      dealloc  %reg2
    by
      ShXX   %reg0,%reg1,%reg3
    (the original comment was a stale copy-paste mentioning IMul) }
  if GetNextInstruction(p,hp1) and
    MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
    MatchOperand(taicpu(p).oper[2]^,taicpu(hp1).oper[0]^) and
    (taicpu(hp1).oper[1]^.typ=top_reg) then
    begin
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(p.next));
      { %reg2 (the MOV's source) must not be live after the MOV }
      if not(RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg,hp1,TmpUsedRegs)) then
        begin
          taicpu(p).loadoper(2,taicpu(hp1).oper[1]^);
          DebugMsg(SPeepholeOptimization + 'SHXXMov2SHXX done',p);
          RemoveInstruction(hp1);
          result:=true;
        end;
    end;
end;
{ Removes a pointless single->double->single conversion round trip:
  (v)cvtss2sd followed by the matching (v)cvtsd2ss.  If the final register
  is the original source, both instructions are deleted; otherwise the pair
  collapses to a single (V)MOVAPS register copy. }
function TX86AsmOptimizer.OptPass1_V_Cvtss2sd(var p: tai): boolean;
var
  hp1: tai;
begin
  Result:=false;
  { get rid of

      (v)cvtss2sd reg0,<reg1,>reg2
      (v)cvtss2sd reg2,<reg2,>reg0
  }
  if GetNextInstruction(p,hp1) and
    (((taicpu(p).opcode=A_CVTSS2SD) and MatchInstruction(hp1,A_CVTSD2SS,[taicpu(p).opsize]) and
      MatchOperand(taicpu(p).oper[0]^,taicpu(hp1).oper[1]^) and MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^)) or
     ((taicpu(p).opcode=A_VCVTSS2SD) and MatchInstruction(hp1,A_VCVTSD2SS,[taicpu(p).opsize]) and
      MatchOpType(taicpu(p),top_reg,top_reg,top_reg) and
      MatchOpType(taicpu(hp1),top_reg,top_reg,top_reg) and
      (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
      (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
      (getsupreg(taicpu(p).oper[2]^.reg)=getsupreg(taicpu(hp1).oper[0]^.reg))
     )
    ) then
    begin
      { If the round trip ends in the source register, both instructions cancel out }
      if ((taicpu(p).opcode=A_CVTSS2SD) and (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg))) or
        ((taicpu(p).opcode=A_VCVTSS2SD) and (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[2]^.reg))) then
        begin
          DebugMsg(SPeepholeOptimization + '(V)Cvtss2CvtSd(V)Cvtsd2ss2Nop done',p);
          RemoveCurrentP(p);
          RemoveInstruction(hp1);
        end
      else
        { Otherwise the pair is just a register copy }
        begin
          DebugMsg(SPeepholeOptimization + '(V)Cvtss2CvtSd(V)Cvtsd2ss2Vmovaps done',p);
          if taicpu(hp1).opcode=A_CVTSD2SS then
            begin
              taicpu(p).loadreg(1,taicpu(hp1).oper[1]^.reg);
              taicpu(p).opcode:=A_MOVAPS;
            end
          else
            begin
              taicpu(p).loadreg(1,taicpu(hp1).oper[2]^.reg);
              taicpu(p).opcode:=A_VMOVAPS;
            end;
          taicpu(p).ops:=2;
          RemoveInstruction(hp1);
        end;
      Result:=true;
      Exit;
    end;
end;
{ Converts a conditional-jump diamond that assigns 0/1 to the same
  destination into a branchless SETcc sequence (see the detailed comment
  inside).  Returns True if the transformation was performed. }
function TX86AsmOptimizer.OptPass1Jcc(var p : tai) : boolean;
var
  hp1, hp2, hp3, hp4, hp5: tai;
  ThisReg: TRegister;
begin
  Result := False;
  if not GetNextInstruction(p,hp1) or (hp1.typ <> ait_instruction) then
    Exit;
  {
    convert
      j<c>  .L1
      mov   1,reg
      jmp   .L2
    .L1
      mov   0,reg
    .L2
    into
      mov   0,reg
      set<not(c)> reg

    take care of alignment and that the mov 0,reg is not converted into a xor as this
    would destroy the flag contents

    Use MOVZX if size is preferred, since while mov 0,reg is bigger, it can be
    executed at the same time as a previous comparison.
      set<not(c)> reg
      movzx reg, reg
  }
  if MatchInstruction(hp1,A_MOV,[]) and
    (taicpu(hp1).oper[0]^.typ = top_const) and
    (
      (
        (taicpu(hp1).oper[1]^.typ = top_reg)
        {$ifdef i386}
        { Under i386, ESI, EDI, EBP and ESP
          don't have an 8-bit representation }
        and not (getsupreg(taicpu(hp1).oper[1]^.reg) in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
        {$endif i386}
      ) or (
        {$ifdef i386}
        (taicpu(hp1).oper[1]^.typ <> top_reg) and
        {$endif i386}
        (taicpu(hp1).opsize = S_B)
      )
    ) and
    GetNextInstruction(hp1,hp2) and
    MatchInstruction(hp2,A_JMP,[]) and (taicpu(hp2).oper[0]^.ref^.refaddr=addr_full) and
    GetNextInstruction(hp2,hp3) and
    SkipAligns(hp3, hp3) and
    (hp3.typ=ait_label) and
    (tasmlabel(taicpu(p).oper[0]^.ref^.symbol)=tai_label(hp3).labsym) and
    GetNextInstruction(hp3,hp4) and
    MatchInstruction(hp4,A_MOV,[taicpu(hp1).opsize]) and
    (taicpu(hp4).oper[0]^.typ = top_const) and
    { The two MOVs must assign complementary 0/1 constants }
    (
      ((taicpu(hp1).oper[0]^.val = 0) and (taicpu(hp4).oper[0]^.val = 1)) or
      ((taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0))
    ) and
    MatchOperand(taicpu(hp1).oper[1]^,taicpu(hp4).oper[1]^) and
    GetNextInstruction(hp4,hp5) and
    SkipAligns(hp5, hp5) and
    (hp5.typ=ait_label) and
    (tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol)=tai_label(hp5).labsym) then
    begin
      { Invert the condition when the "1" is on the fall-through path }
      if (taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0) then
        taicpu(p).condition := inverse_cond(taicpu(p).condition);
      tai_label(hp3).labsym.DecRefs;
      { If this isn't the only reference to the middle label, we can
        still make a saving - only that the first jump and everything
        that follows will remain. }
      if (tai_label(hp3).labsym.getrefs = 0) then
        begin
          if (taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0) then
            DebugMsg(SPeepholeOptimization + 'J(c)Mov1JmpMov0 -> Set(~c)',p)
          else
            DebugMsg(SPeepholeOptimization + 'J(c)Mov0JmpMov1 -> Set(c)',p);
          { remove jump, first label and second MOV (also catching any aligns) }
          repeat
            if not GetNextInstruction(hp2, hp3) then
              InternalError(2021040810);
            RemoveInstruction(hp2);
            hp2 := hp3;
          until hp2 = hp5;
          { Don't decrement reference count before the removal loop
            above, otherwise GetNextInstruction won't stop on the
            the label }
          tai_label(hp5).labsym.DecRefs;
        end
      else
        begin
          if (taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0) then
            DebugMsg(SPeepholeOptimization + 'J(c)Mov1JmpMov0 -> Set(~c) (partial)',p)
          else
            DebugMsg(SPeepholeOptimization + 'J(c)Mov0JmpMov1 -> Set(c) (partial)',p);
        end;
      { Turn the jump itself into the SETcc }
      taicpu(p).opcode:=A_SETcc;
      taicpu(p).opsize:=S_B;
      taicpu(p).is_jmp:=False;
      if taicpu(hp1).opsize=S_B then
        begin
          taicpu(p).loadoper(0, taicpu(hp1).oper[1]^);
          if taicpu(hp1).oper[1]^.typ = top_reg then
            AllocRegBetween(taicpu(hp1).oper[1]^.reg, p, hp2, UsedRegs);
          RemoveInstruction(hp1);
        end
      else
        begin
          { Will be a register because the size can't be S_B otherwise }
          ThisReg := newreg(R_INTREGISTER,getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBL);
          taicpu(p).loadreg(0, ThisReg);
          AllocRegBetween(ThisReg, p, hp2, UsedRegs);
          if (cs_opt_size in current_settings.optimizerswitches) and IsMOVZXAcceptable then
            begin
              case taicpu(hp1).opsize of
                S_W:
                  taicpu(hp1).opsize := S_BW;
                S_L:
                  taicpu(hp1).opsize := S_BL;
                {$ifdef x86_64}
                S_Q:
                  begin
                    taicpu(hp1).opsize := S_BL;
                    { Change the destination register to 32-bit }
                    taicpu(hp1).loadreg(1, newreg(R_INTREGISTER,getsupreg(ThisReg), R_SUBD));
                  end;
                {$endif x86_64}
                else
                  InternalError(2021040820);
              end;
              taicpu(hp1).opcode := A_MOVZX;
              taicpu(hp1).loadreg(0, ThisReg);
            end
          else
            begin
              AllocRegBetween(NR_FLAGS,p,hp1,UsedRegs);
              { hp1 is already a MOV instruction with the correct register }
              taicpu(hp1).loadconst(0, 0);
              { Inserting it right before p will guarantee that the flags are also tracked }
              asml.Remove(hp1);
              asml.InsertBefore(hp1, p);
            end;
        end;
      Result:=true;
      exit;
    end
end;
{ Merges a pair of adjacent 16-byte memory-to-memory block moves (done via
  an XMM register) into a single 32-byte move through a YMM register.
  Handles both ascending (x, x+16) and descending (x, x-16) access orders.
  Returns True if the merge was performed.

  Fix: the descending-order branch previously matched the final store with
  "MatchInstruction(hp3, A_MOV, [taicpu(p).opsize])".  An integer MOV can
  never carry the XMM register compared immediately afterwards, so that
  branch was dead code; it now matches A_VMOVDQA/A_VMOVDQU with S_XMM,
  mirroring the ascending-order branch. }
function TX86AsmOptimizer.OptPass1VMOVDQ(var p: tai): Boolean;
var
  hp1, hp2, hp3: tai;
  SourceRef, TargetRef: TReference;
  CurrentReg: TRegister;
begin
  { VMOVDQU/CMOVDQA shouldn't have even been generated }
  if not UseAVX then
    InternalError(2021100501);
  Result := False;
  { Look for the following to simplify:

      vmovdqa/u x(mem1), %xmmreg
      vmovdqa/u %xmmreg, y(mem2)
      vmovdqa/u x+16(mem1), %xmmreg
      vmovdqa/u %xmmreg, y+16(mem2)

    Change to:
      vmovdqa/u x(mem1), %ymmreg
      vmovdqa/u %ymmreg, y(mem2)
      vpxor     %ymmreg, %ymmreg, %ymmreg

    ( The VPXOR instruction is to zero the upper half, thus removing the
      need to call the potentially expensive VZEROUPPER instruction. Other
      peephole optimisations can remove VPXOR if it's unnecessary )
  }
  TransferUsedRegs(TmpUsedRegs);
  UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
  { NOTE: In the optimisations below, if the references dictate that an
    aligned move is possible (i.e. VMOVDQA), the existing instructions
    should already be VMOVDQA because if (x mod 32) = 0, then (x mod 16) = 0 }
  if (taicpu(p).opsize = S_XMM) and
    MatchOpType(taicpu(p), top_ref, top_reg) and
    GetNextInstruction(p, hp1) and
    MatchInstruction(hp1, A_VMOVDQA, A_VMOVDQU, [S_XMM]) and
    MatchOpType(taicpu(hp1), top_reg, top_ref) and
    not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) then
    begin
      SourceRef := taicpu(p).oper[0]^.ref^;
      TargetRef := taicpu(hp1).oper[1]^.ref^;
      if GetNextInstruction(hp1, hp2) and
        MatchInstruction(hp2, A_VMOVDQA, A_VMOVDQU, [S_XMM]) and
        MatchOpType(taicpu(hp2), top_ref, top_reg) then
        begin
          { Delay calling GetNextInstruction(hp2, hp3) for as long as possible }
          UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
          Inc(SourceRef.offset, 16);
          { Reuse the register in the first block move }
          CurrentReg := newreg(R_MMREGISTER, getsupreg(taicpu(p).oper[1]^.reg), R_SUBMMY);
          if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) then
            begin
              { Ascending order: second load is at x+16 }
              UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
              Inc(TargetRef.offset, 16);
              if GetNextInstruction(hp2, hp3) and
                MatchInstruction(hp3, A_VMOVDQA, A_VMOVDQU, [S_XMM]) and
                MatchOpType(taicpu(hp3), top_reg, top_ref) and
                (taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
                RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
                not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
                begin
                  { Update the register tracking to the new size }
                  AllocRegBetween(CurrentReg, p, hp2, UsedRegs);
                  { Remember that the offsets are 16 ahead }
                  { Switch to unaligned if the memory isn't on a 32-byte boundary }
                  if not (
                    ((SourceRef.offset mod 32) = 16) and
                    (SourceRef.alignment >= 32) and ((SourceRef.alignment mod 32) = 0)
                  ) then
                    taicpu(p).opcode := A_VMOVDQU;
                  taicpu(p).opsize := S_YMM;
                  taicpu(p).oper[1]^.reg := CurrentReg;
                  if not (
                    ((TargetRef.offset mod 32) = 16) and
                    (TargetRef.alignment >= 32) and ((TargetRef.alignment mod 32) = 0)
                  ) then
                    taicpu(hp1).opcode := A_VMOVDQU;
                  taicpu(hp1).opsize := S_YMM;
                  taicpu(hp1).oper[0]^.reg := CurrentReg;
                  DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (VmovdqxVmovdqxVmovdqxVmovdqx2VmovdqyVmovdqy 1)', p);
                  { If pi_uses_ymm is set, VZEROUPPER is present to do this for us }
                  if (pi_uses_ymm in current_procinfo.flags) then
                    RemoveInstruction(hp2)
                  else
                    begin
                      taicpu(hp2).opcode := A_VPXOR;
                      taicpu(hp2).opsize := S_YMM;
                      taicpu(hp2).loadreg(0, CurrentReg);
                      taicpu(hp2).loadreg(1, CurrentReg);
                      taicpu(hp2).loadreg(2, CurrentReg);
                      taicpu(hp2).ops := 3;
                    end;
                  RemoveInstruction(hp3);
                  Result := True;
                  Exit;
                end;
            end
          else
            begin
              { See if the next references are 16 less rather than 16 greater }
              Dec(SourceRef.offset, 32); { -16 the other way }
              if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) then
                begin
                  UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
                  Dec(TargetRef.offset, 16); { Only 16, not 32, as it wasn't incremented unlike SourceRef }
                  if GetNextInstruction(hp2, hp3) and
                    { Was "MatchInstruction(hp3, A_MOV, [taicpu(p).opsize])",
                      which never matched an XMM store - see header comment }
                    MatchInstruction(hp3, A_VMOVDQA, A_VMOVDQU, [S_XMM]) and
                    MatchOpType(taicpu(hp3), top_reg, top_ref) and
                    (taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
                    RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
                    not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
                    begin
                      { Update the register tracking to the new size }
                      AllocRegBetween(CurrentReg, hp2, hp3, UsedRegs);
                      { hp2 and hp3 are the starting offsets, so mod = 0 this time }
                      { Switch to unaligned if the memory isn't on a 32-byte boundary }
                      if not(
                        ((SourceRef.offset mod 32) = 0) and
                        (SourceRef.alignment >= 32) and ((SourceRef.alignment mod 32) = 0)
                      ) then
                        taicpu(hp2).opcode := A_VMOVDQU;
                      taicpu(hp2).opsize := S_YMM;
                      taicpu(hp2).oper[1]^.reg := CurrentReg;
                      if not (
                        ((TargetRef.offset mod 32) = 0) and
                        (TargetRef.alignment >= 32) and ((TargetRef.alignment mod 32) = 0)
                      ) then
                        taicpu(hp3).opcode := A_VMOVDQU;
                      taicpu(hp3).opsize := S_YMM;
                      taicpu(hp3).oper[0]^.reg := CurrentReg;
                      DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (VmovdqxVmovdqxVmovdqxVmovdqx2VmovdqyVmovdqy 2)', p);
                      { If pi_uses_ymm is set, VZEROUPPER is present to do this for us }
                      if (pi_uses_ymm in current_procinfo.flags) then
                        RemoveInstruction(hp1)
                      else
                        begin
                          taicpu(hp1).opcode := A_VPXOR;
                          taicpu(hp1).opsize := S_YMM;
                          taicpu(hp1).loadreg(0, CurrentReg);
                          taicpu(hp1).loadreg(1, CurrentReg);
                          taicpu(hp1).loadreg(2, CurrentReg);
                          taicpu(hp1).ops := 3;
                          Asml.Remove(hp1);
                          Asml.InsertAfter(hp1, hp3); { Register deallocations will be after hp3 }
                        end;
                      RemoveCurrentP(p, hp2);
                      Result := True;
                      Exit;
                    end;
                end;
            end;
        end;
    end;
end;
{ Attempts to optimise an unconditional jump whose destination label is
  followed by a short run of MOV-class assignments ending in another JMP
  or RET:

      jmp .L1                       mov ##,##   (assignments duplicated)
      ...                     -->   jmp/ret     (jump redirected / RET'ed)
    .L1:
      mov ##,##   (one or more)
      jmp/ret

  Parameters:
    p         - the JMP instruction under analysis; on success it is
                changed and then set to the first duplicated assignment so
                the new instructions can themselves be optimised.
    hp1       - the first instruction at the destination if the caller
                already knows it, or nil to have it looked up here.
    LoopCount - recursion depth; jumps found at the destination are
                analysed recursively, and a depth of 20 aborts the search
                (infinite-loop guard).
    Count     - out parameter: number of assignments duplicated.
  Returns True if the code was changed. }
function TX86AsmOptimizer.CheckJumpMovTransferOpt(var p: tai; hp1: tai; LoopCount: Integer; out Count: Integer): Boolean;
var
  hp2, hp3, first_assignment: tai;
  IncCount, OperIdx: Integer;
  OrigLabel: TAsmLabel;
begin
  Count := 0;
  Result := False;
  first_assignment := nil;

  if (LoopCount >= 20) then
    begin
      { Guard against infinite loops }
      Exit;
    end;

  { Only a plain "jmp label" qualifies: direct symbol reference with no
    base or index register }
  if (taicpu(p).oper[0]^.typ <> top_ref) or
    (taicpu(p).oper[0]^.ref^.refaddr <> addr_full) or
    (taicpu(p).oper[0]^.ref^.base <> NR_NO) or
    (taicpu(p).oper[0]^.ref^.index <> NR_NO) or
    not (taicpu(p).oper[0]^.ref^.symbol is TAsmLabel) then
    Exit;

  OrigLabel := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol);

  {
    change
            jmp .L1
            ...
        .L1:
            mov ##, ## ( multiple movs possible )
            jmp/ret
    into
            mov ##, ##
            jmp/ret
  }

  { Locate the destination if the caller didn't supply it }
  if not Assigned(hp1) then
    begin
      hp1 := GetLabelWithSym(OrigLabel);
      if not Assigned(hp1) or not SkipLabels(hp1, hp1) then
        Exit;
    end;

  { First pass: walk the instructions at the destination, counting the
    assignments and checking that the run terminates in a JMP or RET }
  hp2 := hp1;
  while Assigned(hp2) do
    begin
      if Assigned(hp2) and (hp2.typ in [ait_label, ait_align]) then
        SkipLabels(hp2,hp2);

      if not Assigned(hp2) or (hp2.typ <> ait_instruction) then
        Break;

      case taicpu(hp2).opcode of
        A_MOVSS:
          begin
            if taicpu(hp2).ops = 0 then
              { Wrong MOVSS - with no operands this is not the SSE scalar
                move (presumably the string instruction), so leave it }
              Break;

            Inc(Count);
            if Count >= 5 then
              { Too many to be worthwhile }
              Break;

            GetNextInstruction(hp2, hp2);
            Continue;
          end;
        A_MOV,
        A_MOVD,
        A_MOVQ,
        A_MOVSX,
{$ifdef x86_64}
        A_MOVSXD,
{$endif x86_64}
        A_MOVZX,
        A_MOVAPS,
        A_MOVUPS,
        A_MOVSD,
        A_MOVAPD,
        A_MOVUPD,
        A_MOVDQA,
        A_MOVDQU,
        A_VMOVSS,
        A_VMOVAPS,
        A_VMOVUPS,
        A_VMOVSD,
        A_VMOVAPD,
        A_VMOVUPD,
        A_VMOVDQA,
        A_VMOVDQU:
          begin
            Inc(Count);
            if Count >= 5 then
              { Too many to be worthwhile }
              Break;

            GetNextInstruction(hp2, hp2);
            Continue;
          end;
        A_JMP:
          begin
            { Guard against infinite loops }
            if taicpu(hp2).oper[0]^.ref^.symbol = OrigLabel then
              Exit;

            { Analyse this jump first in case it also duplicates assignments }
            if CheckJumpMovTransferOpt(hp2, nil, LoopCount + 1, IncCount) then
              begin
                { Something did change! }
                Result := True;

                Inc(Count, IncCount);
                if Count >= 5 then
                  begin
                    { Too many to be worthwhile }
                    Exit;
                  end;

                if MatchInstruction(hp2, [A_JMP, A_RET], []) then
                  Break;
              end;

            Result := True;
            Break;
          end;
        A_RET:
          begin
            Result := True;
            Break;
          end;
        else
          Break;
      end;
    end;

  if Result then
    begin
      { A count of zero can happen when CheckJumpMovTransferOpt is called recursively }
      if Count = 0 then
        begin
          Result := False;
          Exit;
        end;

      hp3 := p;

      DebugMsg(SPeepholeOptimization + 'Duplicated ' + debug_tostr(Count) + ' assignment(s) and redirected jump', p);

      { Second pass: duplicate each assignment to before p, then redirect
        the original jump (or convert it into a RET) }
      while True do
        begin
          if Assigned(hp1) and (hp1.typ in [ait_label, ait_align]) then
            SkipLabels(hp1,hp1);

          if (hp1.typ <> ait_instruction) then
            InternalError(2021040720);

          case taicpu(hp1).opcode of
            A_JMP:
              begin
                { Change the original jump to the new destination }
                OrigLabel.decrefs;
                taicpu(hp1).oper[0]^.ref^.symbol.increfs;
                taicpu(p).loadref(0, taicpu(hp1).oper[0]^.ref^);

                { Set p to the first duplicated assignment so it can get optimised if needs be }
                if not Assigned(first_assignment) then
                  InternalError(2021040810)
                else
                  p := first_assignment;

                Exit;
              end;
            A_RET:
              begin
                { Now change the jump into a RET instruction }
                ConvertJumpToRET(p, hp1);

                { Set p to the first duplicated assignment so it can get optimised if needs be }
                if not Assigned(first_assignment) then
                  InternalError(2021040811)
                else
                  p := first_assignment;

                Exit;
              end;
            else
              begin
                { Duplicate the MOV instruction }
                hp3:=tai(hp1.getcopy);
                if first_assignment = nil then
                  first_assignment := hp3;

                asml.InsertBefore(hp3, p);

                { Make sure the compiler knows about any final registers written here }
                for OperIdx := 0 to taicpu(hp3).ops - 1 do
                  with taicpu(hp3).oper[OperIdx]^ do
                    begin
                      case typ of
                        top_ref:
                          begin
                            if (ref^.base <> NR_NO) and
                              (getsupreg(ref^.base) <> RS_ESP) and
                              (getsupreg(ref^.base) <> RS_EBP)
                              {$ifdef x86_64} and (ref^.base <> NR_RIP) {$endif x86_64}
                              then
                              AllocRegBetween(ref^.base, hp3, tai(p.Next), UsedRegs);

                            if (ref^.index <> NR_NO) and
                              (getsupreg(ref^.index) <> RS_ESP) and
                              (getsupreg(ref^.index) <> RS_EBP)
                              {$ifdef x86_64} and (ref^.index <> NR_RIP) {$endif x86_64} and
                              (ref^.index <> ref^.base) then
                              AllocRegBetween(ref^.index, hp3, tai(p.Next), UsedRegs);
                          end;
                        top_reg:
                          AllocRegBetween(reg, hp3, tai(p.Next), UsedRegs);
                        else
                          ;
                      end;
                    end;
              end;
          end;

          if not GetNextInstruction(hp1, hp1) then
            { Should have dropped out earlier }
            InternalError(2021040710);
        end;
    end;
end;
{ Attempts to move the instruction at hp1 to before the CMP/TEST
  instruction at p, so hp1 can later be optimised without the FLAGS
  register being live across it.
  p   - the CMP/TEST instruction.
  hp1 - the instruction immediately following p that is the candidate to
        be moved.
  Returns True if hp1 was moved (it is re-inserted before the instruction
  that precedes p, or directly before p if there is none). }
function TX86AsmOptimizer.TrySwapMovCmp(var p, hp1: tai): Boolean;
var
  hp2: tai;
  X: Integer;
const
  { TInsChange flags indicating a write to operand 1..4 respectively }
  WriteOp: array[0..3] of set of TInsChange = (
    [Ch_Wop1, Ch_RWop1, Ch_Mop1],
    [Ch_Wop2, Ch_RWop2, Ch_Mop2],
    [Ch_Wop3, Ch_RWop3, Ch_Mop3],
    [Ch_Wop4, Ch_RWop4, Ch_Mop4]);

  { TInsChange flags indicating an implicit write to a specific register }
  RegWriteFlags: array[0..7] of set of TInsChange = (
    { The order is important: EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP }
    [Ch_WEAX, Ch_RWEAX, Ch_MEAX{$ifdef x86_64}, Ch_WRAX, Ch_RWRAX, Ch_MRAX{$endif x86_64}],
    [Ch_WECX, Ch_RWECX, Ch_MECX{$ifdef x86_64}, Ch_WRCX, Ch_RWRCX, Ch_MRCX{$endif x86_64}],
    [Ch_WEDX, Ch_RWEDX, Ch_MEDX{$ifdef x86_64}, Ch_WRDX, Ch_RWRDX, Ch_MRDX{$endif x86_64}],
    [Ch_WEBX, Ch_RWEBX, Ch_MEBX{$ifdef x86_64}, Ch_WRBX, Ch_RWRBX, Ch_MRBX{$endif x86_64}],
    [Ch_WESI, Ch_RWESI, Ch_MESI{$ifdef x86_64}, Ch_WRSI, Ch_RWRSI, Ch_MRSI{$endif x86_64}],
    [Ch_WEDI, Ch_RWEDI, Ch_MEDI{$ifdef x86_64}, Ch_WRDI, Ch_RWRDI, Ch_MRDI{$endif x86_64}],
    [Ch_WEBP, Ch_RWEBP, Ch_MEBP{$ifdef x86_64}, Ch_WRBP, Ch_RWRBP, Ch_MRBP{$endif x86_64}],
    [Ch_WESP, Ch_RWESP, Ch_MESP{$ifdef x86_64}, Ch_WRSP, Ch_RWRSP, Ch_MRSP{$endif x86_64}]);
begin
  { If we have something like:
      cmp ###,%reg1
      mov 0,%reg2
    And no modified registers are shared, move the instruction to before
    the comparison as this means it can be optimised without worrying
    about the FLAGS register. (CMP/MOV is generated by
    "J(c)Mov1JmpMov0 -> Set(~c)", among other things).

    As long as the second instruction doesn't use the flags or one of the
    registers used by CMP or TEST (also check any references that use the
    registers), then it can be moved prior to the comparison.
  }
  Result := False;

  if (hp1.typ <> ait_instruction) or
    taicpu(hp1).is_jmp or
    RegInInstruction(NR_DEFAULTFLAGS, hp1) then
    Exit;

  { NOP is a pipeline fence, likely marking the beginning of the function
    epilogue, so drop out.  Similarly, drop out if POP or RET are
    encountered }
  if MatchInstruction(hp1, A_NOP, A_POP, []) then
    Exit;

  if (taicpu(hp1).opcode = A_MOVSS) and
    (taicpu(hp1).ops = 0) then
    { Wrong MOVSS }
    Exit;

  { Check for writes to specific registers first }
  { EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP in that order }
  for X := 0 to 7 do
    if (RegWriteFlags[X] * InsProp[taicpu(hp1).opcode].Ch <> [])
      and RegInInstruction(newreg(R_INTREGISTER, TSuperRegister(X), R_SUBWHOLE), p) then
      Exit;

  for X := 0 to taicpu(hp1).ops - 1 do
    begin
      { Check to see if this operand writes to something }
      if ((WriteOp[X] * InsProp[taicpu(hp1).opcode].Ch) <> []) and
        { And matches something in the CMP/TEST instruction }
        (
          MatchOperand(taicpu(hp1).oper[X]^, taicpu(p).oper[0]^) or
          MatchOperand(taicpu(hp1).oper[X]^, taicpu(p).oper[1]^) or
          (
            { If it's a register, make sure the register written to doesn't
              appear in the cmp instruction as part of a reference }
            (taicpu(hp1).oper[X]^.typ = top_reg) and
            RegInInstruction(taicpu(hp1).oper[X]^.reg, p)
          )
        ) then
        Exit;
    end;

  { The instruction can be safely moved }
  asml.Remove(hp1);

  { Try to insert after the last instructions where the FLAGS register is not yet in use }
  if not GetLastInstruction(p, hp2) then
    asml.InsertBefore(hp1, p)
  else
    asml.InsertAfter(hp1, hp2);

  DebugMsg(SPeepholeOptimization + 'Swapped ' + debug_op2str(taicpu(p).opcode) + ' and ' + debug_op2str(taicpu(hp1).opcode) + ' instructions to improve optimisation potential', hp1);

  { Keep the moved instruction's registers allocated across the swapped range }
  for X := 0 to taicpu(hp1).ops - 1 do
    case taicpu(hp1).oper[X]^.typ of
      top_reg:
        AllocRegBetween(taicpu(hp1).oper[X]^.reg, hp1, p, UsedRegs);
      top_ref:
        begin
          if taicpu(hp1).oper[X]^.ref^.base <> NR_NO then
            AllocRegBetween(taicpu(hp1).oper[X]^.ref^.base, hp1, p, UsedRegs);
          if taicpu(hp1).oper[X]^.ref^.index <> NR_NO then
            AllocRegBetween(taicpu(hp1).oper[X]^.ref^.index, hp1, p, UsedRegs);
        end;
      else
        ;
    end;

  if taicpu(hp1).opcode = A_LEA then
    { The flags will be overwritten by the CMP/TEST instruction }
    ConvertLEA(taicpu(hp1));

  Result := True;
end;
{ Pass 2 optimisations for MOV instructions: threading jumps after
  "mov $0,%reg", merging MOV + ADD/SUB into LEA, collapsing three MOVs
  into XCHG, converting MOV+SAR sign-extension sequences into CDQ/CQO,
  the 32-bit abs() intrinsic pattern, and (x86_64) the "(x + y) shr 1"
  cardinal-average pattern.  Returns True if anything was changed; p may
  be replaced by a different instruction.

  Fix: in the abs() intrinsic optimisation, the top_ref arm of the
  operand-rewrite case statement previously tested the operand's "reg"
  field, which is not valid when typ = top_ref; it now tests (and
  rewrites) ref^.base and ref^.index as intended. }
function TX86AsmOptimizer.OptPass2MOV(var p : tai) : boolean;

  function IsXCHGAcceptable: Boolean; inline;
    begin
      { Always accept if optimising for size }
      Result := (cs_opt_size in current_settings.optimizerswitches) or
        (
{$ifdef x86_64}
          { XCHG takes 3 cycles on AMD Athlon64 }
          (current_settings.optimizecputype >= cpu_core_i)
{$else x86_64}
          { From the Pentium M onwards, XCHG only has a latency of 2 rather
            than 3, so it becomes a saving compared to three MOVs with two of
            them able to execute simultaneously. [Kit] }
          (current_settings.optimizecputype >= cpu_PentiumM)
{$endif x86_64}
        );
    end;

  var
    NewRef: TReference;
    hp1, hp2, hp3, hp4: Tai;
{$ifndef x86_64}
    OperIdx: Integer;
{$endif x86_64}
    NewInstr : Taicpu;
    DestLabel: TAsmLabel;

  { Converts "mov %reg1,%reg2; add/sub $x,%reg2" into "lea x(%reg1),%reg2"
    (or -x for SUB), provided the FLAGS result of the arithmetic is unused }
  function TryMovArith2Lea(InputInstr: tai): Boolean;
    var
      NextInstr: tai;
    begin
      Result := False;
      UpdateUsedRegs(TmpUsedRegs, tai(InputInstr.Next));

      if not GetNextInstruction(InputInstr, NextInstr) or
        (
          { The FLAGS register isn't always tracked properly, so do not
            perform this optimisation if a conditional statement follows }
          not RegReadByInstruction(NR_DEFAULTFLAGS, NextInstr) and
          not RegUsedAfterInstruction(NR_DEFAULTFLAGS, NextInstr, TmpUsedRegs)
        ) then
        begin
          reference_reset(NewRef, 1, []);
          NewRef.base := taicpu(p).oper[0]^.reg;
          NewRef.scalefactor := 1;

          if taicpu(InputInstr).opcode = A_ADD then
            begin
              DebugMsg(SPeepholeOptimization + 'MovAdd2Lea', p);
              NewRef.offset := taicpu(InputInstr).oper[0]^.val;
            end
          else
            begin
              DebugMsg(SPeepholeOptimization + 'MovSub2Lea', p);
              NewRef.offset := -taicpu(InputInstr).oper[0]^.val;
            end;

          taicpu(p).opcode := A_LEA;
          taicpu(p).loadref(0, NewRef);

          RemoveInstruction(InputInstr);

          Result := True;
        end;
    end;

begin
  Result:=false;

  { This optimisation adds an instruction, so only do it for speed }
  if not (cs_opt_size in current_settings.optimizerswitches) and
    MatchOpType(taicpu(p), top_const, top_reg) and
    (taicpu(p).oper[0]^.val = 0) then
    begin
      { To avoid compiler warning }
      DestLabel := nil;

      if (p.typ <> ait_instruction) or (taicpu(p).oper[1]^.typ <> top_reg) then
        InternalError(2021040750);

      if not GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.reg) then
        Exit;

      case hp1.typ of
        ait_label:
          begin
            { Change:
                mov  $0,%reg                    mov  $0,%reg
               @Lbl1:                          @Lbl1:
                test %reg,%reg / cmp $0,%reg    test %reg,%reg / mov $0,%reg
                je   @Lbl2                      jne  @Lbl2

              To:                             To:
                mov  $0,%reg                    mov  $0,%reg
                jmp  @Lbl2                      jmp  @Lbl3
                (align)                         (align)
               @Lbl1:                          @Lbl1:
                test %reg,%reg / cmp $0,%reg    test %reg,%reg / cmp $0,%reg
                je   @Lbl2                      je   @Lbl2
                                               @Lbl3: <-- Only if label exists
              (Not if it's optimised for size)
            }
            if not GetNextInstruction(hp1, hp2) then
              Exit;

            if not (cs_opt_size in current_settings.optimizerswitches) and
              (hp2.typ = ait_instruction) and
              (
                { Register sizes must exactly match }
                (
                  (taicpu(hp2).opcode = A_CMP) and
                  MatchOperand(taicpu(hp2).oper[0]^, 0) and
                  MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^.reg)
                ) or (
                  (taicpu(hp2).opcode = A_TEST) and
                  MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[1]^.reg) and
                  MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^.reg)
                )
              ) and GetNextInstruction(hp2, hp3) and
              (hp3.typ = ait_instruction) and
              (taicpu(hp3).opcode = A_JCC) and
              (taicpu(hp3).oper[0]^.typ=top_ref) and (taicpu(hp3).oper[0]^.ref^.refaddr=addr_full) and (taicpu(hp3).oper[0]^.ref^.base=NR_NO) and
              (taicpu(hp3).oper[0]^.ref^.index=NR_NO) and (taicpu(hp3).oper[0]^.ref^.symbol is tasmlabel) then
              begin
                { Check condition of jump }

                { Always true? }
                if condition_in(C_E, taicpu(hp3).condition) then
                  begin
                    { Copy label symbol and obtain matching label entry for the
                      conditional jump, as this will be our destination}
                    DestLabel := tasmlabel(taicpu(hp3).oper[0]^.ref^.symbol);
                    DebugMsg(SPeepholeOptimization + 'Mov0LblCmp0Je -> Mov0JmpLblCmp0Je', p);
                    Result := True;
                  end
                { Always false? }
                else if condition_in(C_NE, taicpu(hp3).condition) and GetNextInstruction(hp3, hp2) then
                  begin
                    { This is only worth it if there's a jump to take }
                    case hp2.typ of
                      ait_instruction:
                        begin
                          if taicpu(hp2).opcode = A_JMP then
                            begin
                              DestLabel := tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol);
                              { An unconditional jump follows the conditional jump which will always be false,
                                so use this jump's destination for the new jump }
                              DebugMsg(SPeepholeOptimization + 'Mov0LblCmp0Jne -> Mov0JmpLblCmp0Jne (with JMP)', p);
                              Result := True;
                            end
                          else if taicpu(hp2).opcode = A_JCC then
                            begin
                              DestLabel := tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol);
                              if condition_in(C_E, taicpu(hp2).condition) then
                                begin
                                  { A second conditional jump follows the conditional jump which will always be false,
                                    while the second jump is always True, so use this jump's destination for the new jump }
                                  DebugMsg(SPeepholeOptimization + 'Mov0LblCmp0Jne -> Mov0JmpLblCmp0Jne (with second Jcc)', p);
                                  Result := True;
                                end;
                              { Don't risk it if the jump isn't always true (Result remains False) }
                            end;
                        end;
                      else
                        { If anything else don't optimise };
                    end;
                  end;

                if Result then
                  begin
                    { Just so we have something to insert as a parameter}
                    reference_reset(NewRef, 1, []);
                    NewInstr := taicpu.op_ref(A_JMP, S_NO, NewRef);

                    { Now actually load the correct parameter }
                    NewInstr.loadsymbol(0, DestLabel, 0);

                    { Get instruction before original label (may not be p under -O3) }
                    if not GetLastInstruction(hp1, hp2) then
                      { Shouldn't fail here }
                      InternalError(2021040701);

                    DestLabel.increfs;

                    AsmL.InsertAfter(NewInstr, hp2);

                    { Add new alignment field }
                    (* AsmL.InsertAfter(
                      cai_align.create_max(
                        current_settings.alignment.jumpalign,
                        current_settings.alignment.jumpalignskipmax
                      ),
                      NewInstr
                    ); *)
                  end;

                Exit;
              end;
          end;
        else
          ;
      end;
    end;

  if not GetNextInstruction(p, hp1) then
    Exit;

  if MatchInstruction(hp1, A_CMP, A_TEST, [taicpu(p).opsize])
    and DoMovCmpMemOpt(p, hp1, True) then
    begin
      Result := True;
      Exit;
    end
  else if MatchInstruction(hp1, A_JMP, [S_NO]) then
    begin
      { Sometimes the MOVs that OptPass2JMP produces can be improved
        further, but we can't just put this jump optimisation in pass 1
        because it tends to perform worse when conditional jumps are
        nearby (e.g. when converting CMOV instructions). [Kit] }
      if OptPass2JMP(hp1) then
        { call OptPass1MOV once to potentially merge any MOVs that were created }
        Result := OptPass1MOV(p)
      { OptPass2MOV will now exit but will be called again if OptPass1MOV
        returned True and the instruction is still a MOV, thus checking
        the optimisations below }

      { If OptPass2JMP returned False, no optimisations were done to
        the jump and there are no further optimisations that can be done
        to the MOV instruction on this pass }
    end
  else if MatchOpType(taicpu(p),top_reg,top_reg) and
    (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
    MatchInstruction(hp1,A_ADD,A_SUB,[taicpu(p).opsize]) and
    (taicpu(hp1).oper[1]^.typ = top_reg) and
    (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
    begin
      { Change:
          movl/q %reg1,%reg2      movl/q %reg1,%reg2
          addl/q $x,%reg2         subl/q $x,%reg2
        To:
          leal/q x(%reg1),%reg2   leal/q -x(%reg1),%reg2
      }
      if (taicpu(hp1).oper[0]^.typ = top_const) and
        { be lazy, checking separately for sub would be slightly better }
        (abs(taicpu(hp1).oper[0]^.val)<=$7fffffff) then
        begin
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
          if TryMovArith2Lea(hp1) then
            begin
              Result := True;
              Exit;
            end
        end
      else if not RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) and
        GetNextInstructionUsingReg(hp1, hp2, taicpu(p).oper[1]^.reg) and
        { Same as above, but also adds or subtracts to %reg2 in between.
          It's still valid as long as the flags aren't in use }
        MatchInstruction(hp2,A_ADD,A_SUB,[taicpu(p).opsize]) and
        MatchOpType(taicpu(hp2), top_const, top_reg) and
        (taicpu(hp2).oper[1]^.reg = taicpu(p).oper[1]^.reg) and
        { be lazy, checking separately for sub would be slightly better }
        (abs(taicpu(hp2).oper[0]^.val)<=$7fffffff) then
        begin
          TransferUsedRegs(TmpUsedRegs);
          UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
          UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
          if TryMovArith2Lea(hp2) then
            begin
              Result := True;
              Exit;
            end;
        end;
    end
  else if MatchOpType(taicpu(p),top_reg,top_reg) and
{$ifdef x86_64}
    MatchInstruction(hp1,A_MOVZX,A_MOVSX,A_MOVSXD,[]) and
{$else x86_64}
    MatchInstruction(hp1,A_MOVZX,A_MOVSX,[]) and
{$endif x86_64}
    MatchOpType(taicpu(hp1),top_reg,top_reg) and
    (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[1]^.reg) then
    { mov reg1, reg2                mov reg1, reg2
      movzx/sx reg2, reg3      to   movzx/sx reg1, reg3}
    begin
      taicpu(hp1).oper[0]^.reg := taicpu(p).oper[0]^.reg;
      DebugMsg(SPeepholeOptimization + 'mov %reg1,%reg2; movzx/sx %reg2,%reg3 -> mov %reg1,%reg2;movzx/sx %reg1,%reg3',p);

      { Don't remove the MOV command without first checking that reg2 isn't used afterwards,
        or unless supreg(reg3) = supreg(reg2)). [Kit] }
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(p.next));

      if (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) or
        not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)
        then
        begin
          RemoveCurrentP(p, hp1);
          Result:=true;
        end;

      exit;
    end
  else if MatchOpType(taicpu(p),top_reg,top_reg) and
    IsXCHGAcceptable and
    { XCHG doesn't support 8-byte registers }
    (taicpu(p).opsize <> S_B) and
    MatchInstruction(hp1, A_MOV, []) and
    MatchOpType(taicpu(hp1),top_reg,top_reg) and
    (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[0]^.reg) and
    GetNextInstruction(hp1, hp2) and
    MatchInstruction(hp2, A_MOV, []) and
    { Don't need to call MatchOpType for hp2 because the operand matches below cover for it }
    MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[1]^.reg) and
    MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[0]^.reg) then
    begin
      { mov %reg1,%reg2
        mov %reg3,%reg1        ->  xchg %reg3,%reg1
        mov %reg2,%reg3
        (%reg2 not used afterwards)

        Note that xchg takes 3 cycles to execute, and generally mov's take
        only one cycle apiece, but the first two mov's can be executed in
        parallel, only taking 2 cycles overall.  Older processors should
        therefore only optimise for size. [Kit]
      }
      TransferUsedRegs(TmpUsedRegs);
      UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
      UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));

      if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp2, TmpUsedRegs) then
        begin
          DebugMsg(SPeepholeOptimization + 'MovMovMov2XChg', p);

          AllocRegBetween(taicpu(hp2).oper[1]^.reg, p, hp1, UsedRegs);
          taicpu(hp1).opcode := A_XCHG;

          RemoveCurrentP(p, hp1);
          RemoveInstruction(hp2);

          Result := True;
          Exit;
        end;
    end
  else if MatchOpType(taicpu(p),top_reg,top_reg) and
    MatchInstruction(hp1, A_SAR, []) then
    begin
      if MatchOperand(taicpu(hp1).oper[0]^, 31) then
        begin
          { the use of %edx also covers the opsize being S_L }
          if MatchOperand(taicpu(hp1).oper[1]^, NR_EDX) then
            begin
              { Note it has to be specifically "movl %eax,%edx", and those specific sub-registers }
              if (taicpu(p).oper[0]^.reg = NR_EAX) and
                (taicpu(p).oper[1]^.reg = NR_EDX) then
                begin
                  { Change:
                      movl %eax,%edx
                      sarl $31,%edx
                    To:
                      cltd
                  }
                  DebugMsg(SPeepholeOptimization + 'MovSar2Cltd', p);
                  RemoveInstruction(hp1);
                  taicpu(p).opcode := A_CDQ;
                  taicpu(p).opsize := S_NO;
                  taicpu(p).clearop(1);
                  taicpu(p).clearop(0);
                  taicpu(p).ops:=0;
                  Result := True;
                end
              else if (cs_opt_size in current_settings.optimizerswitches) and
                (taicpu(p).oper[0]^.reg = NR_EDX) and
                (taicpu(p).oper[1]^.reg = NR_EAX) then
                begin
                  { Change:
                      movl %edx,%eax
                      sarl $31,%edx
                    To:
                      movl %edx,%eax
                      cltd

                    Note that this creates a dependency between the two instructions,
                      so only perform if optimising for size.
                  }
                  DebugMsg(SPeepholeOptimization + 'MovSar2MovCltd', p);
                  taicpu(hp1).opcode := A_CDQ;
                  taicpu(hp1).opsize := S_NO;
                  taicpu(hp1).clearop(1);
                  taicpu(hp1).clearop(0);
                  taicpu(hp1).ops:=0;
                end;
{$ifndef x86_64}
            end
          { Don't bother if CMOV is supported, because a more optimal
            sequence would have been generated for the Abs() intrinsic }
          else if not(CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype]) and
            { the use of %eax also covers the opsize being S_L }
            MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) and
            (taicpu(p).oper[0]^.reg = NR_EAX) and
            (taicpu(p).oper[1]^.reg = NR_EDX) and
            GetNextInstruction(hp1, hp2) and
            MatchInstruction(hp2, A_XOR, [S_L]) and
            MatchOperand(taicpu(hp2).oper[0]^, NR_EAX) and
            MatchOperand(taicpu(hp2).oper[1]^, NR_EDX) and
            GetNextInstruction(hp2, hp3) and
            MatchInstruction(hp3, A_SUB, [S_L]) and
            MatchOperand(taicpu(hp3).oper[0]^, NR_EAX) and
            MatchOperand(taicpu(hp3).oper[1]^, NR_EDX) then
            begin
              { Change:
                  movl %eax,%edx
                  sarl $31,%eax
                  xorl %eax,%edx
                  subl %eax,%edx
                  (Instruction that uses %edx)
                  (%eax deallocated)
                  (%edx deallocated)
                To:
                  cltd
                  xorl %edx,%eax  <-- Note the registers have swapped
                  subl %edx,%eax
                  (Instruction that uses %eax) <-- %eax rather than %edx
              }
              TransferUsedRegs(TmpUsedRegs);
              UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
              UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
              UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));

              if not RegUsedAfterInstruction(NR_EAX, hp3, TmpUsedRegs) then
                begin
                  if GetNextInstruction(hp3, hp4) and
                    not RegModifiedByInstruction(NR_EDX, hp4) and
                    not RegUsedAfterInstruction(NR_EDX, hp4, TmpUsedRegs) then
                    begin
                      DebugMsg(SPeepholeOptimization + 'abs() intrinsic optimisation', p);

                      taicpu(p).opcode := A_CDQ;
                      taicpu(p).clearop(1);
                      taicpu(p).clearop(0);
                      taicpu(p).ops:=0;

                      RemoveInstruction(hp1);

                      taicpu(hp2).loadreg(0, NR_EDX);
                      taicpu(hp2).loadreg(1, NR_EAX);
                      taicpu(hp3).loadreg(0, NR_EDX);
                      taicpu(hp3).loadreg(1, NR_EAX);

                      AllocRegBetween(NR_EAX, hp3, hp4, TmpUsedRegs);

                      { Convert references in the following instruction (hp4) from %edx to %eax }
                      for OperIdx := 0 to taicpu(hp4).ops - 1 do
                        with taicpu(hp4).oper[OperIdx]^ do
                          case typ of
                            top_reg:
                              if getsupreg(reg) = RS_EDX then
                                reg := newreg(R_INTREGISTER,RS_EAX,getsubreg(reg));
                            top_ref:
                              begin
                                { Check the reference's own base/index fields
                                  here; the operand's "reg" field is invalid
                                  when typ = top_ref }
                                if getsupreg(ref^.base) = RS_EDX then
                                  ref^.base := newreg(R_INTREGISTER,RS_EAX,getsubreg(ref^.base));
                                if getsupreg(ref^.index) = RS_EDX then
                                  ref^.index := newreg(R_INTREGISTER,RS_EAX,getsubreg(ref^.index));
                              end;
                            else
                              ;
                          end;
                    end;
                end;
{$else x86_64}
            end;
        end
      else if MatchOperand(taicpu(hp1).oper[0]^, 63) and
        { the use of %rdx also covers the opsize being S_Q }
        MatchOperand(taicpu(hp1).oper[1]^, NR_RDX) then
        begin
          { Note it has to be specifically "movq %rax,%rdx", and those specific sub-registers }
          if (taicpu(p).oper[0]^.reg = NR_RAX) and
            (taicpu(p).oper[1]^.reg = NR_RDX) then
            begin
              { Change:
                  movq %rax,%rdx
                  sarq $63,%rdx
                To:
                  cqto
              }
              DebugMsg(SPeepholeOptimization + 'MovSar2Cqto', p);
              RemoveInstruction(hp1);
              taicpu(p).opcode := A_CQO;
              taicpu(p).opsize := S_NO;
              taicpu(p).clearop(1);
              taicpu(p).clearop(0);
              taicpu(p).ops:=0;
              Result := True;
            end
          else if (cs_opt_size in current_settings.optimizerswitches) and
            (taicpu(p).oper[0]^.reg = NR_RDX) and
            (taicpu(p).oper[1]^.reg = NR_RAX) then
            begin
              { Change:
                  movq %rdx,%rax
                  sarq $63,%rdx
                To:
                  movq %rdx,%rax
                  cqto

                Note that this creates a dependency between the two instructions,
                  so only perform if optimising for size.
              }
              DebugMsg(SPeepholeOptimization + 'MovSar2MovCqto', p);
              taicpu(hp1).opcode := A_CQO;
              taicpu(hp1).opsize := S_NO;
              taicpu(hp1).clearop(1);
              taicpu(hp1).clearop(0);
              taicpu(hp1).ops:=0;
{$endif x86_64}
            end;
        end;
    end
  else if MatchInstruction(hp1, A_MOV, []) and
    (taicpu(hp1).oper[1]^.typ = top_reg) then
    { Though "GetNextInstruction" could be factored out, along with
      the instructions that depend on hp2, it is an expensive call that
      should be delayed for as long as possible, hence we do cheaper
      checks first that are likely to be False. [Kit] }
    begin
      if (
          (
            MatchOperand(taicpu(p).oper[1]^, NR_EDX) and
            (taicpu(hp1).oper[1]^.reg = NR_EAX) and
            (
              MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
              MatchOperand(taicpu(hp1).oper[0]^, NR_EDX)
            )
          ) or
          (
            MatchOperand(taicpu(p).oper[1]^, NR_EAX) and
            (taicpu(hp1).oper[1]^.reg = NR_EDX) and
            (
              MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
              MatchOperand(taicpu(hp1).oper[0]^, NR_EAX)
            )
          )
        ) and
        GetNextInstruction(hp1, hp2) and
        MatchInstruction(hp2, A_SAR, []) and
        MatchOperand(taicpu(hp2).oper[0]^, 31) then
        begin
          if MatchOperand(taicpu(hp2).oper[1]^, NR_EDX) then
            begin
              { Change:
                  movl r/m,%edx         movl r/m,%eax         movl r/m,%edx         movl r/m,%eax
                  movl %edx,%eax   or   movl %eax,%edx   or   movl r/m,%eax    or   movl r/m,%edx
                  sarl $31,%edx         sarl $31,%edx         sarl $31,%edx         sarl $31,%edx
                To:
                  movl r/m,%eax    <- Note the change in register
                  cltd
              }
              DebugMsg(SPeepholeOptimization + 'MovMovSar2MovCltd', p);
              AllocRegBetween(NR_EAX, p, hp1, UsedRegs);
              taicpu(p).loadreg(1, NR_EAX);
              taicpu(hp1).opcode := A_CDQ;
              taicpu(hp1).clearop(1);
              taicpu(hp1).clearop(0);
              taicpu(hp1).ops:=0;
              RemoveInstruction(hp2);
(*
{$ifdef x86_64}
            end
          else if MatchOperand(taicpu(hp2).oper[1]^, NR_RDX) and
            { This code sequence does not get generated - however it might become useful
              if and when 128-bit signed integer types make an appearance, so the code
              is kept here for when it is eventually needed. [Kit] }
            (
              (
                (taicpu(hp1).oper[1]^.reg = NR_RAX) and
                (
                  MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                  MatchOperand(taicpu(hp1).oper[0]^, NR_RDX)
                )
              ) or
              (
                (taicpu(hp1).oper[1]^.reg = NR_RDX) and
                (
                  MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^) or
                  MatchOperand(taicpu(hp1).oper[0]^, NR_RAX)
                )
              )
            ) and
            GetNextInstruction(hp1, hp2) and
            MatchInstruction(hp2, A_SAR, [S_Q]) and
            MatchOperand(taicpu(hp2).oper[0]^, 63) and
            MatchOperand(taicpu(hp2).oper[1]^, NR_RDX) then
            begin
              { Change:
                  movq r/m,%rdx         movq r/m,%rax         movq r/m,%rdx         movq r/m,%rax
                  movq %rdx,%rax   or   movq %rax,%rdx   or   movq r/m,%rax    or   movq r/m,%rdx
                  sarq $63,%rdx         sarq $63,%rdx         sarq $63,%rdx         sarq $63,%rdx
                To:
                  movq r/m,%rax    <- Note the change in register
                  cqto
              }
              DebugMsg(SPeepholeOptimization + 'MovMovSar2MovCqto', p);
              AllocRegBetween(NR_RAX, p, hp1, UsedRegs);
              taicpu(p).loadreg(1, NR_RAX);
              taicpu(hp1).opcode := A_CQO;
              taicpu(hp1).clearop(1);
              taicpu(hp1).clearop(0);
              taicpu(hp1).ops:=0;
              RemoveInstruction(hp2);
{$endif x86_64}
*)
            end;
        end;
{$ifdef x86_64}
    end
  else if (taicpu(p).opsize = S_L) and
    (taicpu(p).oper[1]^.typ = top_reg) and
    (
      MatchInstruction(hp1, A_MOV,[]) and
      (taicpu(hp1).opsize = S_L) and
      (taicpu(hp1).oper[1]^.typ = top_reg)
    ) and (
      GetNextInstruction(hp1, hp2) and
      (tai(hp2).typ=ait_instruction) and
      (taicpu(hp2).opsize = S_Q) and
      (
        (
          MatchInstruction(hp2, A_ADD,[]) and
          (taicpu(hp2).opsize = S_Q) and
          (taicpu(hp2).oper[0]^.typ = top_reg) and (taicpu(hp2).oper[1]^.typ = top_reg) and
          (
            (
              (getsupreg(taicpu(hp2).oper[0]^.reg) = getsupreg(taicpu(p).oper[1]^.reg)) and
              (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
            ) or (
              (getsupreg(taicpu(hp2).oper[0]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
              (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(p).oper[1]^.reg))
            )
          )
        ) or (
          MatchInstruction(hp2, A_LEA,[]) and
          (taicpu(hp2).oper[0]^.ref^.offset = 0) and
          (taicpu(hp2).oper[0]^.ref^.scalefactor <= 1) and
          (
            (
              (getsupreg(taicpu(hp2).oper[0]^.ref^.base) = getsupreg(taicpu(p).oper[1]^.reg)) and
              (getsupreg(taicpu(hp2).oper[0]^.ref^.index) = getsupreg(taicpu(hp1).oper[1]^.reg))
            ) or (
              (getsupreg(taicpu(hp2).oper[0]^.ref^.base) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
              (getsupreg(taicpu(hp2).oper[0]^.ref^.index) = getsupreg(taicpu(p).oper[1]^.reg))
            )
          ) and (
            (
              (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg))
            ) or (
              (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(p).oper[1]^.reg))
            )
          )
        )
      )
    ) and (
      GetNextInstruction(hp2, hp3) and
      MatchInstruction(hp3, A_SHR,[]) and
      (taicpu(hp3).opsize = S_Q) and
      (taicpu(hp3).oper[0]^.typ = top_const) and (taicpu(hp2).oper[1]^.typ = top_reg) and
      (taicpu(hp3).oper[0]^.val = 1) and
      (taicpu(hp3).oper[1]^.reg = taicpu(hp2).oper[1]^.reg)
    ) then
    begin
      { Change   movl x,    reg1d         movl x,    reg1d
                 movl y,    reg2d         movl y,    reg2d
                 addq reg2q,reg1q   or    leaq (reg1q,reg2q),reg1q
                 shrq $1,   reg1q         shrq $1,   reg1q

        ( reg1d and reg2d can be switched around in the first two instructions )

        To       movl x,    reg1d
                 addl y,    reg1d
                 rcrl $1,   reg1d

        This corresponds to the common expression (x + y) shr 1, where
        x and y are Cardinals (replacing "shr 1" with "div 2" produces
        smaller code, but won't account for x + y causing an overflow). [Kit]
      }
      if (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) then
        { Change first MOV command to have the same register as the final output }
        taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg
      else
        taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;

      { Change second MOV command to an ADD command. This is easier than
        converting the existing command because it means we don't have to
        touch 'y', which might be a complicated reference, and also the
        fact that the third command might either be ADD or LEA. [Kit] }
      taicpu(hp1).opcode := A_ADD;

      { Delete old ADD/LEA instruction }
      RemoveInstruction(hp2);

      { Convert "shrq $1, reg1q" to "rcr $1, reg1d" }
      taicpu(hp3).opcode := A_RCR;
      taicpu(hp3).changeopsize(S_L);
      setsubreg(taicpu(hp3).oper[1]^.reg, R_SUBD);
{$endif x86_64}
    end;
end;
{$push}
{$q-}{$r-}
function TX86AsmOptimizer.OptPass2Movx(var p : tai) : boolean;
var
ThisReg: TRegister;
MinSize, MaxSize, TryShiftDown, TargetSize: TOpSize;
TargetSubReg: TSubRegister;
hp1, hp2: tai;
RegInUse, RegChanged, p_removed: Boolean;
{ Store list of found instructions so we don't have to call
GetNextInstructionUsingReg multiple times }
InstrList: array of taicpu;
InstrMax, Index: Integer;
UpperLimit, SignedUpperLimit, SignedUpperLimitBottom,
LowerLimit, SignedLowerLimit, SignedLowerLimitBottom,
TryShiftDownLimit, TryShiftDownSignedLimit, TryShiftDownSignedLimitLower,
WorkingValue: TCgInt;
PreMessage: string;
{ Data flow analysis }
TestValMin, TestValMax, TestValSignedMax: TCgInt;
BitwiseOnly, OrXorUsed,
ShiftDownOverflow, UpperSignedOverflow, UpperUnsignedOverflow, LowerSignedOverflow, LowerUnsignedOverflow: Boolean;
function CheckOverflowConditions: Boolean;
begin
Result := True;
if (TestValSignedMax > SignedUpperLimit) then
UpperSignedOverflow := True;
if (TestValSignedMax > SignedLowerLimit) or (TestValSignedMax < SignedLowerLimitBottom) then
LowerSignedOverflow := True;
if (TestValMin > LowerLimit) or (TestValMax > LowerLimit) then
LowerUnsignedOverflow := True;
if (TestValMin > UpperLimit) or (TestValMax > UpperLimit) or (TestValSignedMax > UpperLimit) or
(TestValMin < SignedUpperLimitBottom) or (TestValMax < SignedUpperLimitBottom) or (TestValSignedMax < SignedUpperLimitBottom) then
begin
{ Absolute overflow }
Result := False;
Exit;
end;
if not ShiftDownOverflow and (TryShiftDown <> S_NO) and
((TestValMin > TryShiftDownLimit) or (TestValMax > TryShiftDownLimit)) then
ShiftDownOverflow := True;
if (TestValMin < 0) or (TestValMax < 0) then
begin
LowerUnsignedOverflow := True;
UpperUnsignedOverflow := True;
end;
end;
procedure AdjustFinalLoad;
begin
if ((TargetSize = S_L) and (taicpu(hp1).opsize in [S_L, S_BL, S_WL])) or
((TargetSize = S_W) and (taicpu(hp1).opsize in [S_W, S_BW])) then
begin
{ Convert the output MOVZX to a MOV }
if SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) then
begin
{ Or remove it completely! }
DebugMsg(SPeepholeOptimization + 'Movzx2Nop 2', hp1);
{ Be careful; if p = hp1 and p was also removed, p
will become a dangling pointer }
if p = hp1 then
begin
RemoveCurrentp(p); { p = hp1 and will then become the next instruction }
p_removed := True;
end
else
RemoveInstruction(hp1);
end
else
begin
DebugMsg(SPeepholeOptimization + 'Movzx2Mov 2', hp1);
taicpu(hp1).opcode := A_MOV;
taicpu(hp1).oper[0]^.reg := ThisReg;
taicpu(hp1).opsize := TargetSize;
end;
end
else if (TargetSize = S_B) and (MaxSize = S_W) and (taicpu(hp1).opsize = S_WL) then
begin
{ Need to change the size of the output }
DebugMsg(SPeepholeOptimization + 'movzwl2movzbl 2', hp1);
taicpu(hp1).oper[0]^.reg := ThisReg;
taicpu(hp1).opsize := S_BL;
end;
end;
{ Nested helper: after the instruction chain between the two extension
  instructions (p and hp1) has been analysed, tries to shrink the operation
  size of the whole chain and remove one of the MOV/Z instructions.
  Returns True if any instruction was changed or removed; may also set
  p_removed and RegChanged as side-effects. }
function CompressInstructions: Boolean;
  var
    LocalIndex: Integer;
  begin
    Result := False;
    { The objective here is to try to find a combination that
      removes one of the MOV/Z instructions. }
    if (
      (taicpu(p).oper[0]^.typ <> top_reg) or
      not SuperRegistersEqual(taicpu(p).oper[0]^.reg, ThisReg)
    ) and
      (taicpu(hp1).oper[1]^.typ = top_reg) and
      SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) then
      begin
        { Make a preference to remove the second MOVZX instruction }
        case taicpu(hp1).opsize of
          S_BL, S_WL:
            begin
              TargetSize := S_L;
              TargetSubReg := R_SUBD;
            end;
          S_BW:
            begin
              TargetSize := S_W;
              TargetSubReg := R_SUBW;
            end;
          else
            InternalError(2020112302);
        end;
      end
    else
      begin
        if LowerUnsignedOverflow and not UpperUnsignedOverflow then
          begin
            { Exceeded lower bound but not upper bound }
            TargetSize := MaxSize;
          end
        else if not LowerUnsignedOverflow then
          begin
            { Size didn't exceed lower bound }
            TargetSize := MinSize;
          end
        else
          Exit;
      end;
    { Map the chosen operation size onto the matching subregister kind }
    case TargetSize of
      S_B:
        TargetSubReg := R_SUBL;
      S_W:
        TargetSubReg := R_SUBW;
      S_L:
        TargetSubReg := R_SUBD;
      else
        InternalError(2020112350);
    end;
    { Update the register to its new size }
    setsubreg(ThisReg, TargetSubReg);
    if not SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) then
      begin
        { Check to see if the active register is used afterwards;
          if not, we can change it and make a saving. }
        RegInUse := False;
        TransferUsedRegs(TmpUsedRegs);
        { The target register may be marked as in use to cross
          a jump to a distant label, so exclude it }
        ExcludeRegFromUsedRegs(taicpu(hp1).oper[1]^.reg, TmpUsedRegs);
        hp2 := p;
        repeat
          { Explicitly check for the excluded register (don't include the first
            instruction as it may be reading from here }
          if ((p <> hp2) and (RegInInstruction(taicpu(hp1).oper[1]^.reg, hp2))) or
            RegInUsedRegs(taicpu(hp1).oper[1]^.reg, TmpUsedRegs) then
            begin
              RegInUse := True;
              Break;
            end;
          UpdateUsedRegs(TmpUsedRegs, tai(hp2.next));
          if not GetNextInstruction(hp2, hp2) then
            InternalError(2020112340);
        until (hp2 = hp1);
        { Even if the register appears used after hp1, it is still free if
          the very next instruction that touches it overwrites it entirely }
        if not RegInUse and RegUsedAfterInstruction(ThisReg, hp1, TmpUsedRegs) then
          { We might still be able to get away with this }
          RegInUse := not
            (
              GetNextInstructionUsingReg(hp1, hp2, ThisReg) and
              (hp2.typ = ait_instruction) and
              (
                { Under -O1 and -O2, GetNextInstructionUsingReg may return an
                  instruction that doesn't actually contain ThisReg }
                (cs_opt_level3 in current_settings.optimizerswitches) or
                RegInInstruction(ThisReg, hp2)
              ) and
              RegLoadedWithNewValue(ThisReg, hp2)
            );
        if not RegInUse then
          begin
            { Force the register size to the same as this instruction so it can be removed}
            if (taicpu(hp1).opsize in [S_L, S_BL, S_WL]) then
              begin
                TargetSize := S_L;
                TargetSubReg := R_SUBD;
              end
            else if (taicpu(hp1).opsize in [S_W, S_BW]) then
              begin
                TargetSize := S_W;
                TargetSubReg := R_SUBW;
              end;
            { Retarget the whole chain onto hp1's destination register }
            ThisReg := taicpu(hp1).oper[1]^.reg;
            setsubreg(ThisReg, TargetSubReg);
            RegChanged := True;
            DebugMsg(SPeepholeOptimization + 'Simplified register usage so ' + debug_regname(ThisReg) + ' = ' + debug_regname(taicpu(p).oper[1]^.reg), p);
            TransferUsedRegs(TmpUsedRegs);
            AllocRegBetween(ThisReg, p, hp1, TmpUsedRegs);
            DebugMsg(SPeepholeOptimization + 'Movzx2Nop 3', hp1);
            { Be careful: if p = hp1 and p were removed directly, p would
              become a dangling pointer }
            if p = hp1 then
              begin
                RemoveCurrentp(p); { p = hp1 and will then become the next instruction }
                p_removed := True;
              end
            else
              RemoveInstruction(hp1);
            { Instruction will become "mov %reg,%reg" }
            if not p_removed and (taicpu(p).opcode = A_MOV) and
              MatchOperand(taicpu(p).oper[0]^, ThisReg) then
              begin
                DebugMsg(SPeepholeOptimization + 'Movzx2Nop 6', p);
                RemoveCurrentP(p);
                p_removed := True;
              end
            else
              taicpu(p).oper[1]^.reg := ThisReg;
            Result := True;
          end
        else
          begin
            if TargetSize <> MaxSize then
              begin
                { Since the register is in use, we have to force it to
                  MaxSize otherwise part of it may become undefined later on }
                TargetSize := MaxSize;
                case TargetSize of
                  S_B:
                    TargetSubReg := R_SUBL;
                  S_W:
                    TargetSubReg := R_SUBW;
                  S_L:
                    TargetSubReg := R_SUBD;
                  else
                    InternalError(2020112351);
                end;
                setsubreg(ThisReg, TargetSubReg);
              end;
            AdjustFinalLoad;
          end;
      end
    else
      AdjustFinalLoad;
    if not p_removed then
      begin
        if TargetSize = MinSize then
          begin
            { Convert the input MOVZX to a MOV }
            if (taicpu(p).oper[0]^.typ = top_reg) and
              SuperRegistersEqual(taicpu(p).oper[0]^.reg, ThisReg) then
              begin
                { Or remove it completely! }
                DebugMsg(SPeepholeOptimization + 'Movzx2Nop 1', p);
                DebugMsg(SPeepholeOptimization + tostr(InstrMax), p);
                RemoveCurrentP(p);
                p_removed := True;
              end
            else
              begin
                DebugMsg(SPeepholeOptimization + 'Movzx2Mov 1', p);
                taicpu(p).opcode := A_MOV;
                taicpu(p).oper[1]^.reg := ThisReg;
                taicpu(p).opsize := TargetSize;
              end;
            Result := True;
          end
        else if TargetSize <> MaxSize then
          begin
            { Resize the input extension to match the new target size }
            case MaxSize of
              S_L:
                if TargetSize = S_W then
                  begin
                    DebugMsg(SPeepholeOptimization + 'movzbl2movzbw', p);
                    taicpu(p).opsize := S_BW;
                    taicpu(p).oper[1]^.reg := ThisReg;
                    Result := True;
                  end
                else
                  InternalError(2020112341);
              S_W:
                if TargetSize = S_L then
                  begin
                    DebugMsg(SPeepholeOptimization + 'movzbw2movzbl', p);
                    taicpu(p).opsize := S_BL;
                    taicpu(p).oper[1]^.reg := ThisReg;
                    Result := True;
                  end
                else
                  InternalError(2020112342);
              else
                ;
            end;
          end;
      end;
    { Now go through every instruction we found and change the
      size. If TargetSize = MaxSize, then almost no changes are
      needed and Result can remain False if it hasn't been set
      yet.
      If RegChanged is True, then the register requires changing
      and so the point about TargetSize = MaxSize doesn't apply. }
    if ((TargetSize <> MaxSize) or RegChanged) and (InstrMax >= 0) then
      begin
        for LocalIndex := 0 to InstrMax do
          begin
            { If p_removed is true, then the original MOV/Z was removed
              and removing the AND instruction may not be safe if it
              appears first }
            if (InstrList[LocalIndex].oper[InstrList[LocalIndex].ops - 1]^.typ <> top_reg) then
              InternalError(2020112310);
            if InstrList[LocalIndex].oper[0]^.typ = top_reg then
              InstrList[LocalIndex].oper[0]^.reg := ThisReg;
            InstrList[LocalIndex].oper[InstrList[LocalIndex].ops - 1]^.reg := ThisReg;
            InstrList[LocalIndex].opsize := TargetSize;
          end;
        Result := True;
      end;
  end;
begin
Result := False;
p_removed := False;
ThisReg := taicpu(p).oper[1]^.reg;
{ Check for:
movs/z ###,%ecx (or %cx or %rcx)
...
shl/shr/sar/rcl/rcr/ror/rol %cl,###
(dealloc %ecx)
Change to:
mov ###,%cl (if ### = %cl, then remove completely)
...
shl/shr/sar/rcl/rcr/ror/rol %cl,###
}
if (getsupreg(ThisReg) = RS_ECX) and
GetNextInstructionUsingReg(p, hp1, NR_ECX) and
(hp1.typ = ait_instruction) and
(
{ Under -O1 and -O2, GetNextInstructionUsingReg may return an
instruction that doesn't actually contain ECX }
(cs_opt_level3 in current_settings.optimizerswitches) or
RegInInstruction(NR_ECX, hp1) or
(
{ It's common for the shift/rotate's read/write register to be
initialised in between, so under -O2 and under, search ahead
one more instruction
}
GetNextInstruction(hp1, hp1) and
(hp1.typ = ait_instruction) and
RegInInstruction(NR_ECX, hp1)
)
) and
MatchInstruction(hp1, [A_SHL, A_SHR, A_SAR, A_ROR, A_ROL, A_RCR, A_RCL], []) and
(taicpu(hp1).oper[0]^.typ = top_reg) { This is enough to determine that it's %cl } then
begin
TransferUsedRegs(TmpUsedRegs);
hp2 := p;
repeat
UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
if not RegUsedAfterInstruction(NR_CL, hp1, TmpUsedRegs) then
begin
case taicpu(p).opsize of
S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
if MatchOperand(taicpu(p).oper[0]^, NR_CL) then
begin
DebugMsg(SPeepholeOptimization + 'MovxOp2Op 3a', p);
RemoveCurrentP(p);
end
else
begin
taicpu(p).opcode := A_MOV;
taicpu(p).opsize := S_B;
taicpu(p).oper[1]^.reg := NR_CL;
DebugMsg(SPeepholeOptimization + 'MovxOp2MovOp 1', p);
end;
S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
if MatchOperand(taicpu(p).oper[0]^, NR_CX) then
begin
DebugMsg(SPeepholeOptimization + 'MovxOp2Op 3b', p);
RemoveCurrentP(p);
end
else
begin
taicpu(p).opcode := A_MOV;
taicpu(p).opsize := S_W;
taicpu(p).oper[1]^.reg := NR_CX;
DebugMsg(SPeepholeOptimization + 'MovxOp2MovOp 2', p);
end;
{$ifdef x86_64}
S_LQ:
if MatchOperand(taicpu(p).oper[0]^, NR_ECX) then
begin
DebugMsg(SPeepholeOptimization + 'MovxOp2Op 3c', p);
RemoveCurrentP(p);
end
else
begin
taicpu(p).opcode := A_MOV;
taicpu(p).opsize := S_L;
taicpu(p).oper[1]^.reg := NR_ECX;
DebugMsg(SPeepholeOptimization + 'MovxOp2MovOp 3', p);
end;
{$endif x86_64}
else
InternalError(2021120401);
end;
Result := True;
Exit;
end;
end;
{ This is anything but quick! }
if not(cs_opt_level2 in current_settings.optimizerswitches) then
Exit;
SetLength(InstrList, 0);
InstrMax := -1;
case taicpu(p).opsize of
S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
begin
{$if defined(i386) or defined(i8086)}
{ If the target size is 8-bit, make sure we can actually encode it }
if not (GetSupReg(ThisReg) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX]) then
Exit;
{$endif i386 or i8086}
LowerLimit := $FF;
SignedLowerLimit := $7F;
SignedLowerLimitBottom := -128;
MinSize := S_B;
if taicpu(p).opsize = S_BW then
begin
MaxSize := S_W;
UpperLimit := $FFFF;
SignedUpperLimit := $7FFF;
SignedUpperLimitBottom := -32768;
end
else
begin
{ Keep at a 32-bit limit for BQ as well since one can't really optimise otherwise }
MaxSize := S_L;
UpperLimit := $FFFFFFFF;
SignedUpperLimit := $7FFFFFFF;
SignedUpperLimitBottom := -2147483648;
end;
end;
S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
begin
{ Keep at a 32-bit limit for WQ as well since one can't really optimise otherwise }
LowerLimit := $FFFF;
SignedLowerLimit := $7FFF;
SignedLowerLimitBottom := -32768;
UpperLimit := $FFFFFFFF;
SignedUpperLimit := $7FFFFFFF;
SignedUpperLimitBottom := -2147483648;
MinSize := S_W;
MaxSize := S_L;
end;
{$ifdef x86_64}
S_LQ:
begin
{ Both the lower and upper limits are set to 32-bit. If a limit
is breached, then optimisation is impossible }
LowerLimit := $FFFFFFFF;
SignedLowerLimit := $7FFFFFFF;
SignedLowerLimitBottom := -2147483648;
UpperLimit := $FFFFFFFF;
SignedUpperLimit := $7FFFFFFF;
SignedUpperLimitBottom := -2147483648;
MinSize := S_L;
MaxSize := S_L;
end;
{$endif x86_64}
else
InternalError(2020112301);
end;
TestValMin := 0;
TestValMax := LowerLimit;
TestValSignedMax := SignedLowerLimit;
TryShiftDownLimit := LowerLimit;
TryShiftDown := S_NO;
ShiftDownOverflow := False;
RegChanged := False;
BitwiseOnly := True;
OrXorUsed := False;
UpperSignedOverflow := False;
LowerSignedOverflow := False;
UpperUnsignedOverflow := False;
LowerUnsignedOverflow := False;
hp1 := p;
while GetNextInstructionUsingReg(hp1, hp1, ThisReg) and
(hp1.typ = ait_instruction) and
(
{ Under -O1 and -O2, GetNextInstructionUsingReg may return an
instruction that doesn't actually contain ThisReg }
(cs_opt_level3 in current_settings.optimizerswitches) or
{ This allows this Movx optimisation to work through the SETcc instructions
inserted by the 'CMP/JE/CMP/@Lbl/SETE -> CMP/SETE/CMP/SETE/OR'
optimisation on -O1 and -O2 (on -O3, GetNextInstructionUsingReg will
skip over these SETcc instructions). }
(taicpu(hp1).opcode = A_SETcc) or
RegInInstruction(ThisReg, hp1)
) do
begin
case taicpu(hp1).opcode of
A_INC,A_DEC:
begin
{ Has to be an exact match on the register }
if not MatchOperand(taicpu(hp1).oper[0]^, ThisReg) then
Break;
if taicpu(hp1).opcode = A_INC then
begin
Inc(TestValMin);
Inc(TestValMax);
Inc(TestValSignedMax);
end
else
begin
Dec(TestValMin);
Dec(TestValMax);
Dec(TestValSignedMax);
end;
end;
A_TEST, A_CMP:
begin
if (
{ Too high a risk of non-linear behaviour that breaks DFA
here, unless it's cmp $0,%reg, which is equivalent to
test %reg,%reg }
OrXorUsed and
(taicpu(hp1).opcode = A_CMP) and
not Matchoperand(taicpu(hp1).oper[0]^, 0)
) or
(taicpu(hp1).oper[1]^.typ <> top_reg) or
{ Has to be an exact match on the register }
(taicpu(hp1).oper[1]^.reg <> ThisReg) or
(
{ Permit "test %reg,%reg" }
(taicpu(hp1).opcode = A_TEST) and
(taicpu(hp1).oper[0]^.typ = top_reg) and
(taicpu(hp1).oper[0]^.reg <> ThisReg)
) or
(taicpu(hp1).oper[0]^.typ <> top_const) or
{ Make sure the comparison value is not smaller than the
smallest allowed signed value for the minimum size (e.g.
-128 for 8-bit) }
not (
((taicpu(hp1).oper[0]^.val and LowerLimit) = taicpu(hp1).oper[0]^.val) or
{ Is it in the negative range? }
(
(taicpu(hp1).oper[0]^.val < 0) and
(taicpu(hp1).oper[0]^.val >= SignedLowerLimitBottom)
)
) then
Break;
{ Check to see if the active register is used afterwards }
TransferUsedRegs(TmpUsedRegs);
IncludeRegInUsedRegs(ThisReg, TmpUsedRegs);
if not RegUsedAfterInstruction(ThisReg, hp1, TmpUsedRegs) then
begin
{ Make sure the comparison or any previous instructions
hasn't pushed the test values outside of the range of
MinSize }
if LowerUnsignedOverflow and not UpperUnsignedOverflow then
begin
{ Exceeded lower bound but not upper bound }
TargetSize := MaxSize;
end
else if not LowerSignedOverflow or not LowerUnsignedOverflow then
begin
{ Size didn't exceed lower bound }
TargetSize := MinSize;
end
else
Break;
case TargetSize of
S_B:
TargetSubReg := R_SUBL;
S_W:
TargetSubReg := R_SUBW;
S_L:
TargetSubReg := R_SUBD;
else
InternalError(2021051002);
end;
{ Update the register to its new size }
setsubreg(ThisReg, TargetSubReg);
taicpu(hp1).oper[1]^.reg := ThisReg;
taicpu(hp1).opsize := MinSize;
{ Convert the input MOVZX to a MOV }
if (taicpu(p).oper[0]^.typ = top_reg) and
SuperRegistersEqual(taicpu(p).oper[0]^.reg, ThisReg) then
begin
{ Or remove it completely! }
DebugMsg(SPeepholeOptimization + 'Movzx2Nop 1a', p);
RemoveCurrentP(p);
p_removed := True;
end
else
begin
DebugMsg(SPeepholeOptimization + 'Movzx2Mov 1a', p);
taicpu(p).opcode := A_MOV;
taicpu(p).oper[1]^.reg := ThisReg;
taicpu(p).opsize := MinSize;
end;
if (InstrMax >= 0) then
begin
for Index := 0 to InstrMax do
begin
{ If p_removed is true, then the original MOV/Z was removed
and removing the AND instruction may not be safe if it
appears first }
if (InstrList[Index].oper[InstrList[Index].ops - 1]^.typ <> top_reg) then
InternalError(2020112311);
if InstrList[Index].oper[0]^.typ = top_reg then
InstrList[Index].oper[0]^.reg := ThisReg;
InstrList[Index].oper[InstrList[Index].ops - 1]^.reg := ThisReg;
InstrList[Index].opsize := MinSize;
end;
end;
Result := True;
Exit;
end;
end;
A_SETcc:
begin
{ This allows this Movx optimisation to work through the SETcc instructions
inserted by the 'CMP/JE/CMP/@Lbl/SETE -> CMP/SETE/CMP/SETE/OR'
optimisation on -O1 and -O2 (on -O3, GetNextInstructionUsingReg will
skip over these SETcc instructions). }
if (cs_opt_level3 in current_settings.optimizerswitches) or
{ Of course, break out if the current register is used }
RegInOp(ThisReg, taicpu(hp1).oper[0]^) then
Break
else
{ We must use Continue so the instruction doesn't get added
to InstrList }
Continue;
end;
A_ADD,A_SUB,A_AND,A_OR,A_XOR,A_SHL,A_SHR,A_SAR:
begin
if
(taicpu(hp1).oper[1]^.typ <> top_reg) or
{ Has to be an exact match on the register }
(taicpu(hp1).oper[1]^.reg <> ThisReg) or not
(
(
(taicpu(hp1).oper[0]^.typ = top_const) and
(
(
(taicpu(hp1).opcode = A_SHL) and
(
((MinSize = S_B) and (taicpu(hp1).oper[0]^.val < 8)) or
((MinSize = S_W) and (taicpu(hp1).oper[0]^.val < 16)) or
((MinSize = S_L) and (taicpu(hp1).oper[0]^.val < 32))
)
) or (
(taicpu(hp1).opcode <> A_SHL) and
(
((taicpu(hp1).oper[0]^.val and UpperLimit) = taicpu(hp1).oper[0]^.val) or
{ Is it in the negative range? }
(((not taicpu(hp1).oper[0]^.val) and (UpperLimit shr 1)) = (not taicpu(hp1).oper[0]^.val))
)
)
)
) or (
MatchOperand(taicpu(hp1).oper[0]^, taicpu(hp1).oper[1]^.reg) and
((taicpu(hp1).opcode = A_ADD) or (taicpu(hp1).opcode = A_AND) or (taicpu(hp1).opcode = A_SUB))
)
) then
Break;
{ Only process OR and XOR if there are only bitwise operations,
since otherwise they can too easily fool the data flow
analysis (they can cause non-linear behaviour) }
case taicpu(hp1).opcode of
A_ADD:
begin
if OrXorUsed then
{ Too high a risk of non-linear behaviour that breaks DFA here }
Break
else
BitwiseOnly := False;
if (taicpu(hp1).oper[0]^.typ = top_reg) then
begin
TestValMin := TestValMin * 2;
TestValMax := TestValMax * 2;
TestValSignedMax := TestValSignedMax * 2;
end
else
begin
WorkingValue := taicpu(hp1).oper[0]^.val;
TestValMin := TestValMin + WorkingValue;
TestValMax := TestValMax + WorkingValue;
TestValSignedMax := TestValSignedMax + WorkingValue;
end;
end;
A_SUB:
begin
if (taicpu(hp1).oper[0]^.typ = top_reg) then
begin
TestValMin := 0;
TestValMax := 0;
TestValSignedMax := 0;
end
else
begin
if OrXorUsed then
{ Too high a risk of non-linear behaviour that breaks DFA here }
Break
else
BitwiseOnly := False;
WorkingValue := taicpu(hp1).oper[0]^.val;
TestValMin := TestValMin - WorkingValue;
TestValMax := TestValMax - WorkingValue;
TestValSignedMax := TestValSignedMax - WorkingValue;
end;
end;
A_AND:
if (taicpu(hp1).oper[0]^.typ = top_const) then
begin
{ we might be able to go smaller if AND appears first }
if InstrMax = -1 then
case MinSize of
S_B:
;
S_W:
if ((taicpu(hp1).oper[0]^.val and $FF) = taicpu(hp1).oper[0]^.val) or
((not(taicpu(hp1).oper[0]^.val) and $7F) = (not taicpu(hp1).oper[0]^.val)) then
begin
TryShiftDown := S_B;
TryShiftDownLimit := $FF;
end;
S_L:
if ((taicpu(hp1).oper[0]^.val and $FF) = taicpu(hp1).oper[0]^.val) or
((not(taicpu(hp1).oper[0]^.val) and $7F) = (not taicpu(hp1).oper[0]^.val)) then
begin
TryShiftDown := S_B;
TryShiftDownLimit := $FF;
end
else if ((taicpu(hp1).oper[0]^.val and $FFFF) = taicpu(hp1).oper[0]^.val) or
((not(taicpu(hp1).oper[0]^.val) and $7FFF) = (not taicpu(hp1).oper[0]^.val)) then
begin
TryShiftDown := S_W;
TryShiftDownLimit := $FFFF;
end;
else
InternalError(2020112320);
end;
WorkingValue := taicpu(hp1).oper[0]^.val;
TestValMin := TestValMin and WorkingValue;
TestValMax := TestValMax and WorkingValue;
TestValSignedMax := TestValSignedMax and WorkingValue;
end;
A_OR:
begin
if not BitwiseOnly then
Break;
OrXorUsed := True;
WorkingValue := taicpu(hp1).oper[0]^.val;
TestValMin := TestValMin or WorkingValue;
TestValMax := TestValMax or WorkingValue;
TestValSignedMax := TestValSignedMax or WorkingValue;
end;
A_XOR:
begin
if (taicpu(hp1).oper[0]^.typ = top_reg) then
begin
TestValMin := 0;
TestValMax := 0;
TestValSignedMax := 0;
end
else
begin
if not BitwiseOnly then
Break;
OrXorUsed := True;
WorkingValue := taicpu(hp1).oper[0]^.val;
TestValMin := TestValMin xor WorkingValue;
TestValMax := TestValMax xor WorkingValue;
TestValSignedMax := TestValSignedMax xor WorkingValue;
end;
end;
A_SHL:
begin
BitwiseOnly := False;
WorkingValue := taicpu(hp1).oper[0]^.val;
TestValMin := TestValMin shl WorkingValue;
TestValMax := TestValMax shl WorkingValue;
TestValSignedMax := TestValSignedMax shl WorkingValue;
end;
A_SHR,
{ The first instruction was MOVZX, so the value won't be negative }
A_SAR:
begin
if InstrMax <> -1 then
BitwiseOnly := False
else
{ we might be able to go smaller if SHR appears first }
case MinSize of
S_B:
;
S_W:
if (taicpu(hp1).oper[0]^.val >= 8) then
begin
TryShiftDown := S_B;
TryShiftDownLimit := $FF;
TryShiftDownSignedLimit := $7F;
TryShiftDownSignedLimitLower := -128;
end;
S_L:
if (taicpu(hp1).oper[0]^.val >= 24) then
begin
TryShiftDown := S_B;
TryShiftDownLimit := $FF;
TryShiftDownSignedLimit := $7F;
TryShiftDownSignedLimitLower := -128;
end
else if (taicpu(hp1).oper[0]^.val >= 16) then
begin
TryShiftDown := S_W;
TryShiftDownLimit := $FFFF;
TryShiftDownSignedLimit := $7FFF;
TryShiftDownSignedLimitLower := -32768;
end;
else
InternalError(2020112321);
end;
WorkingValue := taicpu(hp1).oper[0]^.val;
if taicpu(hp1).opcode = A_SAR then
begin
TestValMin := SarInt64(TestValMin, WorkingValue);
TestValMax := SarInt64(TestValMax, WorkingValue);
TestValSignedMax := SarInt64(TestValSignedMax, WorkingValue);
end
else
begin
TestValMin := TestValMin shr WorkingValue;
TestValMax := TestValMax shr WorkingValue;
TestValSignedMax := TestValSignedMax shr WorkingValue;
end;
end;
else
InternalError(2020112303);
end;
end;
(*
A_IMUL:
case taicpu(hp1).ops of
2:
begin
if not MatchOpType(hp1, top_reg, top_reg) or
{ Has to be an exact match on the register }
(taicpu(hp1).oper[0]^.reg <> ThisReg) or
(taicpu(hp1).oper[1]^.reg <> ThisReg) then
Break;
TestValMin := TestValMin * TestValMin;
TestValMax := TestValMax * TestValMax;
TestValSignedMax := TestValSignedMax * TestValMax;
end;
3:
begin
if not MatchOpType(hp1, top_const, top_reg, top_reg) or
{ Has to be an exact match on the register }
(taicpu(hp1).oper[1]^.reg <> ThisReg) or
(taicpu(hp1).oper[2]^.reg <> ThisReg) or
((taicpu(hp1).oper[0]^.val and UpperLimit) = taicpu(hp1).oper[0]^.val) or
{ Is it in the negative range? }
(((not taicpu(hp1).oper[0]^.val) and (UpperLimit shr 1)) = (not taicpu(hp1).oper[0]^.val)) then
Break;
TestValMin := TestValMin * taicpu(hp1).oper[0]^.val;
TestValMax := TestValMax * taicpu(hp1).oper[0]^.val;
TestValSignedMax := TestValSignedMax * taicpu(hp1).oper[0]^.val;
end;
else
Break;
end;
A_IDIV:
case taicpu(hp1).ops of
3:
begin
if not MatchOpType(hp1, top_const, top_reg, top_reg) or
{ Has to be an exact match on the register }
(taicpu(hp1).oper[1]^.reg <> ThisReg) or
(taicpu(hp1).oper[2]^.reg <> ThisReg) or
((taicpu(hp1).oper[0]^.val and UpperLimit) = taicpu(hp1).oper[0]^.val) or
{ Is it in the negative range? }
(((not taicpu(hp1).oper[0]^.val) and (UpperLimit shr 1)) = (not taicpu(hp1).oper[0]^.val)) then
Break;
TestValMin := TestValMin div taicpu(hp1).oper[0]^.val;
TestValMax := TestValMax div taicpu(hp1).oper[0]^.val;
TestValSignedMax := TestValSignedMax div taicpu(hp1).oper[0]^.val;
end;
else
Break;
end;
*)
A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
begin
{ If there are no instructions in between, then we might be able to make a saving }
if UpperSignedOverflow or (taicpu(hp1).oper[0]^.typ <> top_reg) or (taicpu(hp1).oper[0]^.reg <> ThisReg) then
Break;
{ We have something like:
movzbw %dl,%dx
...
movswl %dx,%edx
Change the latter to a zero-extension then enter the
A_MOVZX case branch.
}
{$ifdef x86_64}
if (taicpu(hp1).opsize = S_LQ) and SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) then
begin
{ this becomes a zero extension from 32-bit to 64-bit, but
the upper 32 bits are already zero, so just delete the
instruction }
DebugMsg(SPeepholeOptimization + 'MovzMovsxd2MovzNop', hp1);
RemoveInstruction(hp1);
Result := True;
Exit;
end
else
{$endif x86_64}
begin
DebugMsg(SPeepholeOptimization + 'MovzMovs2MovzMovz', hp1);
taicpu(hp1).opcode := A_MOVZX;
{$ifdef x86_64}
case taicpu(hp1).opsize of
S_BQ:
begin
taicpu(hp1).opsize := S_BL;
setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
end;
S_WQ:
begin
taicpu(hp1).opsize := S_WL;
setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
end;
S_LQ:
begin
taicpu(hp1).opcode := A_MOV;
taicpu(hp1).opsize := S_L;
setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
{ In this instance, we need to break out because the
instruction is no longer MOVZX or MOVSXD }
Result := True;
Exit;
end;
else
;
end;
{$endif x86_64}
Result := CompressInstructions;
Exit;
end;
end;
A_MOVZX:
begin
if UpperUnsignedOverflow or (taicpu(hp1).oper[0]^.typ <> top_reg) then
Break;
if not SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, ThisReg) then
begin
if (InstrMax = -1) and
{ Will return false if the second parameter isn't ThisReg
(can happen on -O2 and under) }
Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, ThisReg) then
begin
{ The two MOVZX instructions are adjacent, so remove the first one }
DebugMsg(SPeepholeOptimization + 'Movzx2Nop 5', p);
RemoveCurrentP(p);
Result := True;
Exit;
end;
Break;
end;
Result := CompressInstructions;
Exit;
end;
else
{ This includes ADC, SBB and IDIV }
Break;
end;
if not CheckOverflowConditions then
Break;
{ Contains highest index (so instruction count - 1) }
Inc(InstrMax);
if InstrMax > High(InstrList) then
SetLength(InstrList, InstrMax + LIST_STEP_SIZE);
InstrList[InstrMax] := taicpu(hp1);
end;
end;
{$pop}
{ Folds a preceding register-to-register MOV into an IMUL:
    mov  reg1,reg2
    imul y,reg2        ->  imul y,reg1,reg2
  where y is a constant or a full-address reference.  Returns True if
  the MOV was removed. }
function TX86AsmOptimizer.OptPass2Imul(var p : tai) : boolean;
  var
    hp1 : tai;
  begin
    Result := False;
    { IMUL must read a constant (or full-address reference) and write a
      register }
    if (taicpu(p).ops < 2) or
      not (
        (taicpu(p).oper[0]^.typ = top_const) or
        ((taicpu(p).oper[0]^.typ = top_ref) and (taicpu(p).oper[0]^.ref^.refaddr = addr_full))
      ) or
      (taicpu(p).oper[1]^.typ <> top_reg) then
      Exit;
    { Either the two-operand form, or a three-operand form whose source
      and destination registers already coincide }
    if (taicpu(p).ops <> 2) and
      ((taicpu(p).oper[2]^.typ <> top_reg) or
      (taicpu(p).oper[2]^.reg <> taicpu(p).oper[1]^.reg)) then
      Exit;
    { The previous instruction must be "mov reg1,reg2" writing the same
      register that the IMUL operates on }
    if not GetLastInstruction(p,hp1) or
      not MatchInstruction(hp1,A_MOV,[]) or
      not MatchOpType(taicpu(hp1),top_reg,top_reg) or
      (taicpu(hp1).oper[1]^.reg <> taicpu(p).oper[1]^.reg) then
      Exit;
    TransferUsedRegs(TmpUsedRegs);
    if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,p,TmpUsedRegs)) or
      ((taicpu(p).ops = 3) and (taicpu(p).oper[1]^.reg=taicpu(p).oper[2]^.reg)) then
      begin
        { change
            mov reg1,reg2
            imul y,reg2 to imul y,reg1,reg2 }
        taicpu(p).ops := 3;
        taicpu(p).loadreg(2,taicpu(p).oper[1]^.reg);
        taicpu(p).loadreg(1,taicpu(hp1).oper[0]^.reg);
        DebugMsg(SPeepholeOptimization + 'MovImul2Imul done',p);
        RemoveInstruction(hp1);
        Result := True;
      end;
  end;
{ Rewrites the jump instruction p in place as a copy of the RET
  instruction ret_p (used when a JMP targets a label that is immediately
  followed by a RET).  The jump target's reference count is decremented,
  and if the label becomes unused, the now-dead code after p is removed. }
procedure TX86AsmOptimizer.ConvertJumpToRET(const p: tai; const ret_p: tai);
  var
    TargetLabel: TAsmLabel;
  begin
    TargetLabel := tasmlabel(taicpu(p).oper[0]^.ref^.symbol);
    TargetLabel.decrefs;
    { Turn the jump into a return, duplicating ret_p's operands }
    taicpu(p).opcode := A_RET;
    taicpu(p).is_jmp := false;
    taicpu(p).ops := taicpu(ret_p).ops;
    if taicpu(ret_p).ops = 0 then
      taicpu(p).clearop(0)
    else if taicpu(ret_p).ops = 1 then
      taicpu(p).loadconst(0,taicpu(ret_p).oper[0]^.val)
    else
      internalerror(2016041301);
    { If the original label is now dead, it might turn out that the label
      immediately follows p. As a result, everything beyond it, which will
      be just some final register configuration and a RET instruction, is
      now dead code. [Kit] }
    { NOTE: This is much faster than introducing a OptPass2RET routine and
      running RemoveDeadCodeAfterJump for each RET instruction, because
      this optimisation rarely happens and most RETs appear at the end of
      routines where there is nothing that can be stripped. [Kit] }
    if not TargetLabel.is_used then
      RemoveDeadCodeAfterJump(p);
  end;
{ Optimises sequences that start with a SETcc instruction:
    * SETcc / TEST / Jcc   -> Jcc with the copied or inverted condition
      (also SETcc / TEST / SETcc), removing the TEST and, if the register
      is no longer used, the initial SETcc as well;
    * SETcc / MOVb         -> SETcc writing directly to the MOV's
      destination (or a second SETcc if the register is still live).
  Under -O3, ADD/SUB/SHL/SAL/IMUL instructions between the SETcc and the
  TEST may be rewritten as LEAs so the FLAGS register survives across
  them.  Returns True if any instruction was changed or removed.

  Fix: the A_IMUL admissibility test previously read oper[2]^.val (the
  destination *register* operand of the 3-operand form, not the constant)
  and used inverted polarity, so it rejected exactly the convertible
  multipliers and admitted arbitrary constants, which the later IMUL->LEA
  conversion would turn into invalid scale factors.  It now requires a
  constant multiplier in [2,3,4,5,8,9]. }
function TX86AsmOptimizer.OptPass2SETcc(var p: tai): boolean;
  var
    hp1,hp2,next: tai; SetC, JumpC: TAsmCond;
    Unconditional, PotentialModified: Boolean;
    OperPtr: POper;
    NewRef: TReference;
    InstrList: array of taicpu;
    InstrMax, Index: Integer;
  const
{$ifdef DEBUG_AOPTCPU}
    SNoFlags: shortstring = ' so the flags aren''t modified';
{$else DEBUG_AOPTCPU}
    SNoFlags = '';
{$endif DEBUG_AOPTCPU}
  begin
    Result:=false;
    if MatchOpType(taicpu(p),top_reg) and GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
      begin
        if MatchInstruction(hp1, A_TEST, [S_B]) and
          MatchOpType(taicpu(hp1),top_reg,top_reg) and
          (taicpu(hp1).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and
          (taicpu(p).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and
          GetNextInstruction(hp1, hp2) and
          MatchInstruction(hp2, A_Jcc, A_SETcc, []) then
          { Change from:             To:
            set(C) %reg              j(~C) label
            test   %reg,%reg/cmp $0,%reg
            je     label
            set(C) %reg              j(C)  label
            test   %reg,%reg/cmp $0,%reg
            jne    label
            (Also do something similar with sete/setne instead of je/jne)
          }
          begin
            { Before we do anything else, we need to check the instructions
              in between SETcc and TEST to make sure they don't modify the
              FLAGS register - if -O2 or under, there won't be any
              instructions between SET and TEST }
            TransferUsedRegs(TmpUsedRegs);
            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
            if (cs_opt_level3 in current_settings.optimizerswitches) then
              begin
                next := p;
                SetLength(InstrList, 0);
                InstrMax := -1;
                PotentialModified := False;
                { Make a note of every instruction that modifies the FLAGS
                  register }
                while GetNextInstruction(next, next) and (next <> hp1) do
                  begin
                    if next.typ <> ait_instruction then
                      { GetNextInstructionUsingReg should have returned False }
                      InternalError(2021051701);
                    if RegModifiedByInstruction(NR_DEFAULTFLAGS, next) then
                      begin
                        case taicpu(next).opcode of
                          A_SETcc,
                          A_CMOVcc,
                          A_Jcc:
                            begin
                              if PotentialModified then
                                { Not safe because the flags were modified earlier }
                                Exit
                              else
                                { Condition is the same as the initial SETcc, so this is safe
                                  (don't add to instruction list though) }
                                Continue;
                            end;
                          A_ADD:
                            begin
                              if (taicpu(next).opsize = S_B) or
                                { LEA doesn't support 8-bit operands }
                                (taicpu(next).oper[1]^.typ <> top_reg) or
                                { Must write to a register }
                                (taicpu(next).oper[0]^.typ = top_ref) then
                                { Require a constant or a register }
                                Exit;
                              PotentialModified := True;
                            end;
                          A_SUB:
                            begin
                              if (taicpu(next).opsize = S_B) or
                                { LEA doesn't support 8-bit operands }
                                (taicpu(next).oper[1]^.typ <> top_reg) or
                                { Must write to a register }
                                (taicpu(next).oper[0]^.typ <> top_const) or
                                (taicpu(next).oper[0]^.val = $80000000) then
                                { Can't subtract a register with LEA - also
                                  check that the value isn't -2^31, as this
                                  can't be negated }
                                Exit;
                              PotentialModified := True;
                            end;
                          A_SAL,
                          A_SHL:
                            begin
                              if (taicpu(next).opsize = S_B) or
                                { LEA doesn't support 8-bit operands }
                                (taicpu(next).oper[1]^.typ <> top_reg) or
                                { Must write to a register }
                                (taicpu(next).oper[0]^.typ <> top_const) or
                                (taicpu(next).oper[0]^.val < 0) or
                                (taicpu(next).oper[0]^.val > 3) then
                                Exit;
                              PotentialModified := True;
                            end;
                          A_IMUL:
                            begin
                              { We can convert "imul x,%reg1,%reg2" (where x = 2, 4 or 8)
                                to "lea (%reg1,x),%reg2". If x = 3, 5 or 9, we can
                                change this to "lea (%reg1,%reg1,(x-1)),%reg2" }
                              if (taicpu(next).ops <> 3) or
                                (taicpu(next).oper[1]^.typ <> top_reg) or
                                { The multiplier must be a constant... }
                                (taicpu(next).oper[0]^.typ <> top_const) or
                                { ...that maps onto a valid LEA scale factor }
                                not (taicpu(next).oper[0]^.val in [2,3,4,5,8,9]) then
                                Exit
                              else
                                PotentialModified := True;
                            end;
                          else
                            { Don't know how to change this, so abort }
                            Exit;
                        end;
                        { Contains highest index (so instruction count - 1) }
                        Inc(InstrMax);
                        if InstrMax > High(InstrList) then
                          SetLength(InstrList, InstrMax + LIST_STEP_SIZE);
                        InstrList[InstrMax] := taicpu(next);
                      end;
                    UpdateUsedRegs(TmpUsedRegs, tai(next.next));
                  end;
                if not Assigned(next) or (next <> hp1) then
                  { It should be equal to hp1 }
                  InternalError(2021051702);
                { Cycle through each instruction and check to see if we can
                  change them to versions that don't modify the flags }
                if (InstrMax >= 0) then
                  begin
                    for Index := 0 to InstrMax do
                      case InstrList[Index].opcode of
                        A_ADD:
                          begin
                            DebugMsg(SPeepholeOptimization + 'ADD -> LEA' + SNoFlags, InstrList[Index]);
                            InstrList[Index].opcode := A_LEA;
                            reference_reset(NewRef, 1, []);
                            NewRef.base := InstrList[Index].oper[1]^.reg;
                            if InstrList[Index].oper[0]^.typ = top_reg then
                              begin
                                NewRef.index := InstrList[Index].oper[0]^.reg;
                                NewRef.scalefactor := 1;
                              end
                            else
                              NewRef.offset := InstrList[Index].oper[0]^.val;
                            InstrList[Index].loadref(0, NewRef);
                          end;
                        A_SUB:
                          begin
                            DebugMsg(SPeepholeOptimization + 'SUB -> LEA' + SNoFlags, InstrList[Index]);
                            InstrList[Index].opcode := A_LEA;
                            reference_reset(NewRef, 1, []);
                            NewRef.base := InstrList[Index].oper[1]^.reg;
                            NewRef.offset := -InstrList[Index].oper[0]^.val;
                            InstrList[Index].loadref(0, NewRef);
                          end;
                        A_SHL,
                        A_SAL:
                          begin
                            DebugMsg(SPeepholeOptimization + 'SHL -> LEA' + SNoFlags, InstrList[Index]);
                            InstrList[Index].opcode := A_LEA;
                            reference_reset(NewRef, 1, []);
                            NewRef.index := InstrList[Index].oper[1]^.reg;
                            NewRef.scalefactor := 1 shl (InstrList[Index].oper[0]^.val);
                            InstrList[Index].loadref(0, NewRef);
                          end;
                        A_IMUL:
                          begin
                            DebugMsg(SPeepholeOptimization + 'IMUL -> LEA' + SNoFlags, InstrList[Index]);
                            InstrList[Index].opcode := A_LEA;
                            reference_reset(NewRef, 1, []);
                            NewRef.index := InstrList[Index].oper[1]^.reg;
                            case InstrList[Index].oper[0]^.val of
                              2, 4, 8:
                                NewRef.scalefactor := InstrList[Index].oper[0]^.val;
                              else {3, 5 and 9}
                                begin
                                  NewRef.scalefactor := InstrList[Index].oper[0]^.val - 1;
                                  NewRef.base := InstrList[Index].oper[1]^.reg;
                                end;
                            end;
                            InstrList[Index].loadref(0, NewRef);
                          end;
                        else
                          InternalError(2021051710);
                      end;
                  end;
                { Mark the FLAGS register as used across this whole block }
                AllocRegBetween(NR_DEFAULTFLAGS, p, hp1, UsedRegs);
              end;
            UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
            JumpC := taicpu(hp2).condition;
            Unconditional := False;
            if conditions_equal(JumpC, C_E) then
              SetC := inverse_cond(taicpu(p).condition)
            else if conditions_equal(JumpC, C_NE) then
              SetC := taicpu(p).condition
            else
              { We've got something weird here (and inefficent) }
              begin
                DebugMsg('DEBUG: Inefficient jump - check code generation', p);
                SetC := C_NONE;
                { JAE/JNB will always branch (use 'condition_in', since C_AE <> C_NB normally) }
                if condition_in(C_AE, JumpC) then
                  Unconditional := True
                else
                  { Not sure what to do with this jump - drop out }
                  Exit;
              end;
            RemoveInstruction(hp1);
            if Unconditional then
              MakeUnconditional(taicpu(hp2))
            else
              begin
                if SetC = C_NONE then
                  InternalError(2018061402);
                taicpu(hp2).SetCondition(SetC);
              end;
            { as hp2 is a jump, we cannot use RegUsedAfterInstruction but we have to check if it is included in
              TmpUsedRegs }
            if not TmpUsedRegs[getregtype(taicpu(p).oper[0]^.reg)].IsUsed(taicpu(p).oper[0]^.reg) then
              begin
                RemoveCurrentp(p, hp2);
                if taicpu(hp2).opcode = A_SETcc then
                  DebugMsg(SPeepholeOptimization + 'SETcc/TEST/SETcc -> SETcc',p)
                else
                  DebugMsg(SPeepholeOptimization + 'SETcc/TEST/Jcc -> Jcc',p);
              end
            else
              if taicpu(hp2).opcode = A_SETcc then
                DebugMsg(SPeepholeOptimization + 'SETcc/TEST/SETcc -> SETcc/SETcc',p)
              else
                DebugMsg(SPeepholeOptimization + 'SETcc/TEST/Jcc -> SETcc/Jcc',p);
            Result := True;
          end
        else if
          { Make sure the instructions are adjacent }
          (
            not (cs_opt_level3 in current_settings.optimizerswitches) or
            GetNextInstruction(p, hp1)
          ) and
          MatchInstruction(hp1, A_MOV, [S_B]) and
          { Writing to memory is allowed }
          MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[0]^.reg) then
          begin
            {
              Watch out for sequences such as:
              set(c)b %regb
              movb    %regb,(ref)
              movb    $0,1(ref)
              movb    $0,2(ref)
              movb    $0,3(ref)
              Much more efficient to turn it into:
                movl    $0,%regl
                set(c)b %regb
                movl    %regl,(ref)
              Or:
                set(c)b %regb
                movzbl  %regb,%regl
                movl    %regl,(ref)
            }
            if (taicpu(hp1).oper[1]^.typ = top_ref) and
              GetNextInstruction(hp1, hp2) and
              MatchInstruction(hp2, A_MOV, [S_B]) and
              (taicpu(hp2).oper[1]^.typ = top_ref) and
              CheckMemoryWrite(taicpu(hp1), taicpu(hp2)) then
              begin
                { Don't do anything else except set Result to True }
              end
            else
              begin
                if taicpu(p).oper[0]^.typ = top_reg then
                  begin
                    TransferUsedRegs(TmpUsedRegs);
                    UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                  end;
                { If it's not a register, it's a memory address }
                if (taicpu(p).oper[0]^.typ <> top_reg) or RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp1, TmpUsedRegs) then
                  begin
                    { Even if the register is still in use, we can minimise the
                      pipeline stall by changing the MOV into another SETcc. }
                    taicpu(hp1).opcode := A_SETcc;
                    taicpu(hp1).condition := taicpu(p).condition;
                    if taicpu(hp1).oper[1]^.typ = top_ref then
                      begin
                        { Swapping the operand pointers like this is probably a
                          bit naughty, but it is far faster than using loadoper
                          to transfer the reference from oper[1] to oper[0] if
                          you take into account the extra procedure calls and
                          the memory allocation and deallocation required }
                        OperPtr := taicpu(hp1).oper[1];
                        taicpu(hp1).oper[1] := taicpu(hp1).oper[0];
                        taicpu(hp1).oper[0] := OperPtr;
                      end
                    else
                      taicpu(hp1).oper[0]^.reg := taicpu(hp1).oper[1]^.reg;
                    taicpu(hp1).clearop(1);
                    taicpu(hp1).ops := 1;
                    DebugMsg(SPeepholeOptimization + 'SETcc/Mov -> SETcc/SETcc',p);
                  end
                else
                  begin
                    if taicpu(hp1).oper[1]^.typ = top_reg then
                      AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,UsedRegs);
                    taicpu(p).loadoper(0, taicpu(hp1).oper[1]^);
                    RemoveInstruction(hp1);
                    DebugMsg(SPeepholeOptimization + 'SETcc/Mov -> SETcc',p);
                  end
              end;
            Result := True;
          end;
      end;
  end;
{ Pass-2 optimisations for a JMP instruction whose operand is a plain local
  label (no base/index register):
    - removes unreachable code following the jump (RemoveDeadCodeAfterJump);
    - collapses jumps whose target is effectively zero instructions away
      (CollapseZeroDistJump);
    - "jmp .L1 ... .L1: ret" is converted into a direct RET (ConvertJumpToRET);
    - if the target starts with a direct-assignment instruction (MOV family,
      including SSE/AVX moves), CheckJumpMovTransferOpt is given a chance to
      transfer those instructions to before the jump.
  Returns True if the instruction stream was changed. }
function TX86AsmOptimizer.OptPass2Jmp(var p : tai) : boolean;
  var
    hp1: tai;
    Count: Integer;
    OrigLabel: TAsmLabel;
  begin
    result := False;
    { Sometimes, the optimisations below can permit this }
    RemoveDeadCodeAfterJump(p);
    { Only handle a direct jump to a local assembler label; jumps through
      registers or to external symbols are left alone }
    if (taicpu(p).oper[0]^.typ=top_ref) and (taicpu(p).oper[0]^.ref^.refaddr=addr_full) and (taicpu(p).oper[0]^.ref^.base=NR_NO) and
      (taicpu(p).oper[0]^.ref^.index=NR_NO) and (taicpu(p).oper[0]^.ref^.symbol is tasmlabel) then
      begin
        OrigLabel := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol);
        { Also a side-effect of optimisations }
        if CollapseZeroDistJump(p, OrigLabel) then
          begin
            Result := True;
            Exit;
          end;
        hp1 := GetLabelWithSym(OrigLabel);
        { Only for unconditional jumps: inspect the first real instruction
          at the jump target }
        if (taicpu(p).condition=C_None) and assigned(hp1) and SkipLabels(hp1,hp1) and (hp1.typ = ait_instruction) then
          begin
            case taicpu(hp1).opcode of
              A_RET:
                {
                  change
                         jmp .L1
                         ...
                     .L1:
                         ret
                  into
                         ret
                }
                begin
                  ConvertJumpToRET(p, hp1);
                  result:=true;
                end;
              { Check any kind of direct assignment instruction }
              A_MOV,
              A_MOVD,
              A_MOVQ,
              A_MOVSX,
{$ifdef x86_64}
              A_MOVSXD,
{$endif x86_64}
              A_MOVZX,
              A_MOVAPS,
              A_MOVUPS,
              A_MOVSD,
              A_MOVAPD,
              A_MOVUPD,
              A_MOVDQA,
              A_MOVDQU,
              A_VMOVSS,
              A_VMOVAPS,
              A_VMOVUPS,
              A_VMOVSD,
              A_VMOVAPD,
              A_VMOVUPD,
              A_VMOVDQA,
              A_VMOVDQU:
                { Skipped when optimising purely for size (-Os without -O3),
                  since duplicating instructions from the target can grow
                  the code }
                if ((current_settings.optimizerswitches * [cs_opt_level3, cs_opt_size]) <> [cs_opt_size]) and
                  CheckJumpMovTransferOpt(p, hp1, 0, Count) then
                  begin
                    Result := True;
                    Exit;
                  end;
              else
                ;
            end;
          end;
      end;
  end;
{ Returns True if the instruction p is a MOV that may safely be converted
  into a CMOVcc: it must write to a register and be at least word-sized
  (CMOV has no 8-bit form), and its source must be either a register or a
  reference that is known to be safe to dereference unconditionally. }
class function TX86AsmOptimizer.CanBeCMOV(p : tai) : boolean;
  begin
    Result:=False;
    { Must be a word/long/quad-sized MOV with a register destination }
    if not assigned(p) or
      not MatchInstruction(p,A_MOV,[S_W,S_L,S_Q]) or
      (taicpu(p).oper[1]^.typ<>top_reg) then
      exit;
    { We can't blindly use "cmov ref,reg" because ref could be nil and cmov
      still throws an exception if ref=nil even though the mov isn't done (FK).
      Hence only pure symbols or GOT-relative addressing with RIP as base are
      allowed as memory sources, as it is not expected that these can cause a
      seg. violation (checked by IsRefSafe). }
    case taicpu(p).oper[0]^.typ of
      top_reg:
        Result:=True;
      top_ref:
        Result:=IsRefSafe(taicpu(p).oper[0]^.ref);
      else
        Result:=False;
    end;
  end;
{ Pass-2 optimisations for a conditional jump (Jcc).  Handles:
    - Jcc over a single INC/DEC (or ADD/SUB of 1) followed directly by the
      target label: converted into a branchless ADC/SBB $0 (prefixed with
      CMC for the carry-set conditions), or, when optimising for speed,
      into SETcc on a spare volatile register plus ADD/SUB of that register;
    - "Jcc .L1; JMP .L2; ... .L1: RET" -> "J(inv_cc) .L2; RET";
    - the "mov $1,reg; jmp .L2; .L1: mov $0,reg; .L2:" diamond
      -> "mov $0,reg; SET(inv_cc) reg";
    - short runs of CMOV-convertible MOVs guarded by the jump (one- and
      two-sided forms) -> CMOVcc, on CPUs that support CMOV.
  Returns True if the instruction stream was changed. }
function TX86AsmOptimizer.OptPass2Jcc(var p : tai) : boolean;
  var
    hp1,hp2: tai;
{$ifndef i8086}
    hp3,hp4,hpmov2, hp5: tai;
    l : Longint;
    condition : TAsmCond;
{$endif i8086}
    carryadd_opcode : TAsmOp;
    symbol: TAsmSymbol;
    reg: tsuperregister;
    increg, tmpreg: TRegister;
  begin
    result:=false;
    if GetNextInstruction(p,hp1) and (hp1.typ=ait_instruction) then
      begin
        symbol := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol);
        { Pattern: Jcc skipping a single INC/DEC or ADD/SUB of constant 1,
          with the jump target label immediately after it }
        if (
            (
              ((Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB)) and
              MatchOptype(Taicpu(hp1),top_const,top_reg) and
              (Taicpu(hp1).oper[0]^.val=1)
            ) or
            ((Taicpu(hp1).opcode=A_INC) or (Taicpu(hp1).opcode=A_DEC))
           ) and
          GetNextInstruction(hp1,hp2) and
          SkipAligns(hp2, hp2) and
          (hp2.typ = ait_label) and
          (Tasmlabel(symbol) = Tai_label(hp2).labsym) then
          { jb @@1                    cmc
            inc/dec operand   -->     adc/sbb operand,0
            @@1:

            ... and ...

            jnb @@1
            inc/dec operand   -->     adc/sbb operand,0
            @@1: }
          begin
            if Taicpu(p).condition in [C_NAE,C_B,C_C] then
              begin
                case taicpu(hp1).opcode of
                  A_INC,
                  A_ADD:
                    carryadd_opcode:=A_ADC;
                  A_DEC,
                  A_SUB:
                    carryadd_opcode:=A_SBB;
                  else
                    InternalError(2021011001);
                end;
                { The jump is taken when the carry is set, so invert the
                  carry with CMC before absorbing it into ADC/SBB $0 }
                Taicpu(p).clearop(0);
                Taicpu(p).ops:=0;
                Taicpu(p).is_jmp:=false;
                Taicpu(p).opcode:=A_CMC;
                Taicpu(p).condition:=C_NONE;
                DebugMsg(SPeepholeOptimization+'JccAdd/Inc/Dec2CmcAdc/Sbb',p);
                Taicpu(hp1).ops:=2;
                if (Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB) then
                  Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[1]^)
                else
                  Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^);
                Taicpu(hp1).loadconst(0,0);
                Taicpu(hp1).opcode:=carryadd_opcode;
                result:=true;
                exit;
              end
            else if Taicpu(p).condition in [C_AE,C_NB,C_NC] then
              begin
                case taicpu(hp1).opcode of
                  A_INC,
                  A_ADD:
                    carryadd_opcode:=A_ADC;
                  A_DEC,
                  A_SUB:
                    carryadd_opcode:=A_SBB;
                  else
                    InternalError(2021011002);
                end;
                { The jump is taken when the carry is clear, so ADC/SBB $0
                  performs the operation exactly when the carry is set; the
                  jump itself is simply removed }
                Taicpu(hp1).ops:=2;
                DebugMsg(SPeepholeOptimization+'JccAdd/Inc/Dec2Adc/Sbb',p);
                if (Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB) then
                  Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[1]^)
                else
                  Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^);
                Taicpu(hp1).loadconst(0,0);
                Taicpu(hp1).opcode:=carryadd_opcode;
                RemoveCurrentP(p, hp1);
                result:=true;
                exit;
              end
            {
              jcc @@1                            setcc tmpreg
              inc/dec/add/sub operand    ->      (movzx tmpreg)
              @@1:                               add/sub tmpreg,operand

              While this increases code size slightly, it makes the code much faster if the
              jump is unpredictable
            }
            else if not(cs_opt_size in current_settings.optimizerswitches) then
              begin
                { search for an available register which is volatile }
                for reg in tcpuregisterset do
                  begin
                    if
{$if defined(i386) or defined(i8086)}
                      { Only use registers whose lowest 8-bits can be accessed }
                      (reg in [RS_EAX,RS_EBX,RS_ECX,RS_EDX]) and
{$endif i386 or i8086}
                      (reg in paramanager.get_volatile_registers_int(current_procinfo.procdef.proccalloption)) and
                      not(reg in UsedRegs[R_INTREGISTER].GetUsedRegs)
                      { We don't need to check if tmpreg is in hp1 or not, because
                        it will be marked as in use at p (if not, this is
                        indicative of a compiler bug). }
                    then
                      begin
                        TAsmLabel(symbol).decrefs;
                        increg := newreg(R_INTREGISTER,reg,R_SUBL);
                        { Turn the jump into a SETcc of the inverted condition
                          on the low byte of the scratch register }
                        Taicpu(p).clearop(0);
                        Taicpu(p).ops:=1;
                        Taicpu(p).is_jmp:=false;
                        Taicpu(p).opcode:=A_SETcc;
                        DebugMsg(SPeepholeOptimization+'JccAdd2SetccAdd',p);
                        Taicpu(p).condition:=inverse_cond(Taicpu(p).condition);
                        Taicpu(p).loadreg(0,increg);
                        { If the operand is wider than a byte, zero-extend the
                          SETcc result to the required width first }
                        if getsubreg(Taicpu(hp1).oper[1]^.reg)<>R_SUBL then
                          begin
                            case getsubreg(Taicpu(hp1).oper[1]^.reg) of
                              R_SUBW:
                                begin
                                  tmpreg := newreg(R_INTREGISTER,reg,R_SUBW);
                                  hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BW,increg,tmpreg);
                                end;
                              R_SUBD:
                                begin
                                  tmpreg := newreg(R_INTREGISTER,reg,R_SUBD);
                                  hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BL,increg,tmpreg);
                                end;
{$ifdef x86_64}
                              R_SUBQ:
                                begin
                                  { MOVZX doesn't have a 64-bit variant, because
                                    the 32-bit version implicitly zeroes the
                                    upper 32-bits of the destination register }
                                  hp2:=Taicpu.op_reg_reg(A_MOVZX,S_BL,increg,
                                    newreg(R_INTREGISTER,reg,R_SUBD));
                                  tmpreg := newreg(R_INTREGISTER,reg,R_SUBQ);
                                end;
{$endif x86_64}
                              else
                                Internalerror(2020030601);
                            end;
                            taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
                            asml.InsertAfter(hp2,p);
                          end
                        else
                          tmpreg := increg;
                        { INC/DEC become two-operand ADD/SUB of the scratch register }
                        if (Taicpu(hp1).opcode=A_INC) or (Taicpu(hp1).opcode=A_DEC) then
                          begin
                            Taicpu(hp1).ops:=2;
                            Taicpu(hp1).loadoper(1,Taicpu(hp1).oper[0]^)
                          end;
                        Taicpu(hp1).loadreg(0,tmpreg);
                        AllocRegBetween(tmpreg,p,hp1,UsedRegs);
                        Result := True;
                        { p is no longer a Jcc instruction, so exit }
                        Exit;
                      end;
                  end;
              end;
          end;
        { Detect the following:
            jmp<cond>     @Lbl1
            jmp           @Lbl2
            ...
          @Lbl1:
            ret

          Change to:

            jmp<inv_cond> @Lbl2
            ret
        }
        if MatchInstruction(hp1,A_JMP,[]) and (taicpu(hp1).oper[0]^.ref^.refaddr=addr_full) then
          begin
            hp2:=getlabelwithsym(TAsmLabel(symbol));
            if Assigned(hp2) and SkipLabels(hp2,hp2) and
              MatchInstruction(hp2,A_RET,[S_NO]) then
              begin
                taicpu(p).condition := inverse_cond(taicpu(p).condition);
                { Change label address to that of the unconditional jump }
                taicpu(p).loadoper(0, taicpu(hp1).oper[0]^);
                TAsmLabel(symbol).DecRefs;
                { Turn the unconditional jump into a copy of the RET,
                  including any stack-adjustment operand }
                taicpu(hp1).opcode := A_RET;
                taicpu(hp1).is_jmp := false;
                taicpu(hp1).ops := taicpu(hp2).ops;
                DebugMsg(SPeepholeOptimization+'JccJmpRet2J!ccRet',p);
                case taicpu(hp2).ops of
                  0:
                    taicpu(hp1).clearop(0);
                  1:
                    taicpu(hp1).loadconst(0,taicpu(hp2).oper[0]^.val);
                  else
                    internalerror(2016041302);
                end;
              end;
{$ifndef i8086}
          end
        {
          convert
          j<c>  .L1
          mov   1,reg
          jmp   .L2
        .L1
          mov   0,reg
        .L2

          into

          mov   0,reg
          set<not(c)> reg

          take care of alignment and that the mov 0,reg is not converted into a xor as this
          would destroy the flag contents
        }
        else if MatchInstruction(hp1,A_MOV,[]) and
          MatchOpType(taicpu(hp1),top_const,top_reg) and
{$ifdef i386}
          (
            { Under i386, ESI, EDI, EBP and ESP
              don't have an 8-bit representation }
            not (getsupreg(taicpu(hp1).oper[1]^.reg) in [RS_ESI, RS_EDI, RS_EBP, RS_ESP])
          ) and
{$endif i386}
          (taicpu(hp1).oper[0]^.val=1) and
          GetNextInstruction(hp1,hp2) and
          MatchInstruction(hp2,A_JMP,[]) and (taicpu(hp2).oper[0]^.ref^.refaddr=addr_full) and
          GetNextInstruction(hp2,hp3) and
          { skip align }
          ((hp3.typ<>ait_align) or GetNextInstruction(hp3,hp3)) and
          (hp3.typ=ait_label) and
          (tasmlabel(taicpu(p).oper[0]^.ref^.symbol)=tai_label(hp3).labsym) and
          (tai_label(hp3).labsym.getrefs=1) and
          GetNextInstruction(hp3,hp4) and
          MatchInstruction(hp4,A_MOV,[]) and
          MatchOpType(taicpu(hp4),top_const,top_reg) and
          (taicpu(hp4).oper[0]^.val=0) and
          MatchOperand(taicpu(hp1).oper[1]^,taicpu(hp4).oper[1]^) and
          GetNextInstruction(hp4,hp5) and
          (hp5.typ=ait_label) and
          (tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol)=tai_label(hp5).labsym) and
          (tai_label(hp5).labsym.getrefs=1) then
          begin
            { Keep the flags alive up to the SETcc so the MOV $0 cannot be
              turned into a flag-clobbering XOR }
            AllocRegBetween(NR_FLAGS,p,hp4,UsedRegs);
            DebugMsg(SPeepholeOptimization+'JccMovJmpMov2MovSetcc',p);
            { remove last label }
            RemoveInstruction(hp5);
            { remove second label }
            RemoveInstruction(hp3);
            { if align is present remove it }
            if GetNextInstruction(hp2,hp3) and (hp3.typ=ait_align) then
              RemoveInstruction(hp3);
            { remove jmp }
            RemoveInstruction(hp2);
            { If byte-sized, the SETcc alone supplies the value; otherwise
              keep the MOV but with constant 0 to pre-clear the register }
            if taicpu(hp1).opsize=S_B then
              RemoveInstruction(hp1)
            else
              taicpu(hp1).loadconst(0,0);
            taicpu(hp4).opcode:=A_SETcc;
            taicpu(hp4).opsize:=S_B;
            taicpu(hp4).condition:=inverse_cond(taicpu(p).condition);
            taicpu(hp4).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(hp4).oper[1]^.reg),R_SUBL));
            taicpu(hp4).opercnt:=1;
            taicpu(hp4).ops:=1;
            taicpu(hp4).freeop(1);
            RemoveCurrentP(p);
            Result:=true;
            exit;
          end
        else if CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype] then
          begin
            { check for
                jCC   xxx
                <several movs>
             xxx:
            }
            { Count the CMOV-convertible MOVs following the jump }
            l:=0;
            while assigned(hp1) and
              CanBeCMOV(hp1) and
              { stop on labels }
              not(hp1.typ=ait_label) do
              begin
                inc(l);
                GetNextInstruction(hp1,hp1);
              end;
            if assigned(hp1) then
              begin
                if FindLabel(tasmlabel(symbol),hp1) then
                  begin
                    { One-sided form: up to 4 MOVs directly before the target label }
                    if (l<=4) and (l>0) then
                      begin
                        condition:=inverse_cond(taicpu(p).condition);
                        UpdateUsedRegs(tai(p.next));
                        GetNextInstruction(p,hp1);
                        repeat
                          if not Assigned(hp1) then
                            InternalError(2018062900);
                          taicpu(hp1).opcode:=A_CMOVcc;
                          taicpu(hp1).condition:=condition;
                          UpdateUsedRegs(tai(hp1.next));
                          GetNextInstruction(hp1,hp1);
                        until not(CanBeCMOV(hp1));
                        { Remember what hp1 is in case there's multiple aligns to get rid of }
                        hp2 := hp1;
                        repeat
                          if not Assigned(hp2) then
                            InternalError(2018062910);
                          case hp2.typ of
                            ait_label:
                              { What we expected - break out of the loop (it won't be a dead label at the top of
                                a cluster because that was optimised at an earlier stage) }
                              Break;
                            ait_align:
                              { Go to the next entry until a label is found (may be multiple aligns before it) }
                              begin
                                hp2 := tai(hp2.Next);
                                Continue;
                              end;
                            else
                              begin
                                { Might be a comment or temporary allocation entry }
                                if not (hp2.typ in SkipInstr) then
                                  InternalError(2018062911);
                                hp2 := tai(hp2.Next);
                                Continue;
                              end;
                          end;
                        until False;
                        { Now we can safely decrement the reference count }
                        tasmlabel(symbol).decrefs;
                        DebugMsg(SPeepholeOptimization+'JccMov2CMov',p);
                        { Remove the original jump }
                        RemoveInstruction(p); { Note, the choice to not use RemoveCurrentp is deliberate }
                        UpdateUsedRegs(tai(hp2.next));
                        GetNextInstruction(hp2, p); { Instruction after the label }
                        { Remove the label if this is its final reference }
                        if (tasmlabel(symbol).getrefs=0) then
                          StripLabelFast(hp1);
                        if Assigned(p) then
                          result:=true;
                        exit;
                      end;
                  end
                else
                  begin
                    { check further for
                            jCC   xxx
                            <several movs 1>
                            jmp   yyy
                    xxx:
                            <several movs 2>
                    yyy:
                    }
                    { hp2 points to jmp yyy }
                    hp2:=hp1;
                    { skip hp1 to xxx (or an align right before it) }
                    GetNextInstruction(hp1, hp1);
                    if assigned(hp2) and
                      assigned(hp1) and
                      (l<=3) and
                      (hp2.typ=ait_instruction) and
                      (taicpu(hp2).is_jmp) and
                      (taicpu(hp2).condition=C_None) and
                      { real label and jump, no further references to the
                        label are allowed }
                      (tasmlabel(symbol).getrefs=1) and
                      FindLabel(tasmlabel(symbol),hp1) then
                      begin
                        l:=0;
                        { skip hp1 to <several moves 2> }
                        if (hp1.typ = ait_align) then
                          GetNextInstruction(hp1, hp1);
                        GetNextInstruction(hp1, hpmov2);
                        hp1 := hpmov2;
                        { Count the CMOV-convertible MOVs of the second branch }
                        while assigned(hp1) and
                          CanBeCMOV(hp1) do
                          begin
                            inc(l);
                            GetNextInstruction(hp1, hp1);
                          end;
                        { hp1 points to yyy (or an align right before it) }
                        hp3 := hp1;
                        if assigned(hp1) and
                          FindLabel(tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol),hp1) then
                          begin
                            { First branch gets the inverted condition... }
                            condition:=inverse_cond(taicpu(p).condition);
                            UpdateUsedRegs(tai(p.next));
                            GetNextInstruction(p,hp1);
                            repeat
                              taicpu(hp1).opcode:=A_CMOVcc;
                              taicpu(hp1).condition:=condition;
                              UpdateUsedRegs(tai(hp1.next));
                              GetNextInstruction(hp1,hp1);
                            until not(assigned(hp1)) or
                              not(CanBeCMOV(hp1));
                            { ...the second branch the original condition }
                            condition:=inverse_cond(condition);
                            if GetLastInstruction(hpmov2,hp1) then
                              UpdateUsedRegs(tai(hp1.next));
                            hp1 := hpmov2;
                            { hp1 is now at <several movs 2> }
                            while Assigned(hp1) and CanBeCMOV(hp1) do
                              begin
                                taicpu(hp1).opcode:=A_CMOVcc;
                                taicpu(hp1).condition:=condition;
                                UpdateUsedRegs(tai(hp1.next));
                                GetNextInstruction(hp1,hp1);
                              end;
                            hp1 := p;
                            { Get first instruction after label }
                            UpdateUsedRegs(tai(hp3.next));
                            GetNextInstruction(hp3, p);
                            if assigned(p) and (hp3.typ = ait_align) then
                              GetNextInstruction(p, p);
                            { Don't dereference yet, as doing so will cause
                              GetNextInstruction to skip the label and
                              optional align marker. [Kit] }
                            GetNextInstruction(hp2, hp4);
                            DebugMsg(SPeepholeOptimization+'JccMovJmpMov2CMovCMov',hp1);
                            { remove jCC }
                            RemoveInstruction(hp1);
                            { Now we can safely decrement it }
                            tasmlabel(symbol).decrefs;
                            { Remove label xxx (it will have a ref of zero due to the initial check) }
                            StripLabelFast(hp4);
                            { remove jmp }
                            symbol := taicpu(hp2).oper[0]^.ref^.symbol;
                            RemoveInstruction(hp2);
                            { As before, now we can safely decrement it }
                            tasmlabel(symbol).decrefs;
                            { Remove label yyy (and the optional alignment) if its reference falls to zero }
                            if tasmlabel(symbol).getrefs = 0 then
                              StripLabelFast(hp3);
                            if Assigned(p) then
                              result:=true;
                            exit;
                          end;
                      end;
                  end;
              end;
          end;
{$endif i8086}
      end;
  end;
function TX86AsmOptimizer.OptPass1Movx(var p : tai) : boolean;
var
hp1,hp2,hp3: tai;
reg_and_hp1_is_instr, RegUsed, AndTest: Boolean;
NewSize: TOpSize;
NewRegSize: TSubRegister;
Limit: TCgInt;
SwapOper: POper;
begin
result:=false;
reg_and_hp1_is_instr:=(taicpu(p).oper[1]^.typ = top_reg) and
GetNextInstruction(p,hp1) and
(hp1.typ = ait_instruction);
if reg_and_hp1_is_instr and
(
(taicpu(hp1).opcode <> A_LEA) or
{ If the LEA instruction can be converted into an arithmetic instruction,
it may be possible to then fold it. }
(
{ If the flags register is in use, don't change the instruction
to an ADD otherwise this will scramble the flags. [Kit] }
not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
ConvertLEA(taicpu(hp1))
)
) and
IsFoldableArithOp(taicpu(hp1),taicpu(p).oper[1]^.reg) and
GetNextInstruction(hp1,hp2) and
MatchInstruction(hp2,A_MOV,[]) and
(taicpu(hp2).oper[0]^.typ = top_reg) and
OpsEqual(taicpu(hp2).oper[1]^,taicpu(p).oper[0]^) and
((taicpu(p).opsize in [S_BW,S_BL]) and (taicpu(hp2).opsize=S_B) or
(taicpu(p).opsize in [S_WL]) and (taicpu(hp2).opsize=S_W)) and
{$ifdef i386}
{ not all registers have byte size sub registers on i386 }
((taicpu(hp2).opsize<>S_B) or (getsupreg(taicpu(hp1).oper[0]^.reg) in [RS_EAX, RS_EBX, RS_ECX, RS_EDX])) and
{$endif i386}
(((taicpu(hp1).ops=2) and
(getsupreg(taicpu(hp2).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg))) or
((taicpu(hp1).ops=1) and
(getsupreg(taicpu(hp2).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[0]^.reg)))) and
not(RegUsedAfterInstruction(taicpu(hp2).oper[0]^.reg,hp2,UsedRegs)) then
begin
{ change movsX/movzX reg/ref, reg2
add/sub/or/... reg3/$const, reg2
mov reg2 reg/ref
to add/sub/or/... reg3/$const, reg/ref }
{ by example:
movswl %si,%eax movswl %si,%eax p
decl %eax addl %edx,%eax hp1
movw %ax,%si movw %ax,%si hp2
->
movswl %si,%eax movswl %si,%eax p
decw %eax addw %edx,%eax hp1
movw %ax,%si movw %ax,%si hp2
}
taicpu(hp1).changeopsize(taicpu(hp2).opsize);
{
->
movswl %si,%eax movswl %si,%eax p
decw %si addw %dx,%si hp1
movw %ax,%si movw %ax,%si hp2
}
case taicpu(hp1).ops of
1:
taicpu(hp1).loadoper(0,taicpu(hp2).oper[1]^);
2:
begin
taicpu(hp1).loadoper(1,taicpu(hp2).oper[1]^);
if (taicpu(hp1).oper[0]^.typ = top_reg) then
setsubreg(taicpu(hp1).oper[0]^.reg,getsubreg(taicpu(hp2).oper[0]^.reg));
end;
else
internalerror(2008042702);
end;
{
->
decw %si addw %dx,%si p
}
DebugMsg(SPeepholeOptimization + 'var3',p);
RemoveCurrentP(p, hp1);
RemoveInstruction(hp2);
Result := True;
Exit;
end;
if reg_and_hp1_is_instr and
(taicpu(hp1).opcode = A_MOV) and
MatchOpType(taicpu(hp1),top_reg,top_reg) and
(MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^)
{$ifdef x86_64}
{ check for implicit extension to 64 bit }
or
((taicpu(p).opsize in [S_BL,S_WL]) and
(taicpu(hp1).opsize=S_Q) and
SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.reg)
)
{$endif x86_64}
)
then
begin
{ change
movx %reg1,%reg2
mov %reg2,%reg3
dealloc %reg2
into
movx %reg,%reg3
}
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.next));
if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
begin
DebugMsg(SPeepholeOptimization + 'MovxMov2Movx',p);
{$ifdef x86_64}
if (taicpu(p).opsize in [S_BL,S_WL]) and
(taicpu(hp1).opsize=S_Q) then
taicpu(p).loadreg(1,newreg(R_INTREGISTER,getsupreg(taicpu(hp1).oper[1]^.reg),R_SUBD))
else
{$endif x86_64}
taicpu(p).loadreg(1,taicpu(hp1).oper[1]^.reg);
RemoveInstruction(hp1);
Result := True;
Exit;
end;
end;
if reg_and_hp1_is_instr and
((taicpu(hp1).opcode=A_MOV) or
(taicpu(hp1).opcode=A_ADD) or
(taicpu(hp1).opcode=A_SUB) or
(taicpu(hp1).opcode=A_CMP) or
(taicpu(hp1).opcode=A_OR) or
(taicpu(hp1).opcode=A_XOR) or
(taicpu(hp1).opcode=A_AND)
) and
(taicpu(hp1).oper[1]^.typ = top_reg) then
begin
AndTest := (taicpu(hp1).opcode=A_AND) and
GetNextInstruction(hp1, hp2) and
(hp2.typ = ait_instruction) and
(
(
(taicpu(hp2).opcode=A_TEST) and
(
MatchOperand(taicpu(hp2).oper[0]^, taicpu(hp1).oper[1]^.reg) or
MatchOperand(taicpu(hp2).oper[0]^, -1) or
(
{ If the AND and TEST instructions share a constant, this is also valid }
(taicpu(hp1).oper[0]^.typ = top_const) and
MatchOperand(taicpu(hp2).oper[0]^, taicpu(hp1).oper[0]^.val)
)
) and
MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[1]^.reg)
) or
(
(taicpu(hp2).opcode=A_CMP) and
MatchOperand(taicpu(hp2).oper[0]^, 0) and
MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[1]^.reg)
)
);
{ change
movx (oper),%reg2
and $x,%reg2
test %reg2,%reg2
dealloc %reg2
into
op %reg1,%reg3
if the second op accesses only the bits stored in reg1
}
if ((taicpu(p).oper[0]^.typ=top_reg) or
((taicpu(p).oper[0]^.typ=top_ref) and (taicpu(p).oper[0]^.ref^.refaddr<>addr_full))) and
(taicpu(hp1).oper[0]^.typ = top_const) and
(taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) and
AndTest then
begin
{ Check if the AND constant is in range }
case taicpu(p).opsize of
S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
begin
NewSize := S_B;
Limit := $FF;
end;
S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
begin
NewSize := S_W;
Limit := $FFFF;
end;
{$ifdef x86_64}
S_LQ:
begin
NewSize := S_L;
Limit := $FFFFFFFF;
end;
{$endif x86_64}
else
InternalError(2021120303);
end;
if (
((taicpu(hp1).oper[0]^.val and Limit) = taicpu(hp1).oper[0]^.val) or
{ Check for negative operands }
(((not taicpu(hp1).oper[0]^.val) and Limit) = (not taicpu(hp1).oper[0]^.val))
) and
GetNextInstruction(hp2,hp3) and
MatchInstruction(hp3,A_Jcc,A_Setcc,A_CMOVcc,[]) and
(taicpu(hp3).condition in [C_E,C_NE]) then
begin
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
if not(RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp2, TmpUsedRegs)) then
begin
DebugMsg(SPeepholeOptimization + 'MovxAndTest2Test done',p);
taicpu(hp1).loadoper(1, taicpu(p).oper[0]^);
taicpu(hp1).opcode := A_TEST;
taicpu(hp1).opsize := NewSize;
RemoveInstruction(hp2);
RemoveCurrentP(p, hp1);
Result:=true;
exit;
end;
end;
end;
if (taicpu(hp1).oper[0]^.typ = top_reg) and
(((taicpu(p).opsize in [S_BW,S_BL,S_WL{$ifdef x86_64},S_BQ,S_WQ,S_LQ{$endif x86_64}]) and
(taicpu(hp1).opsize=S_B)) or
((taicpu(p).opsize in [S_WL{$ifdef x86_64},S_WQ,S_LQ{$endif x86_64}]) and
(taicpu(hp1).opsize=S_W))
{$ifdef x86_64}
or ((taicpu(p).opsize=S_LQ) and
(taicpu(hp1).opsize=S_L))
{$endif x86_64}
) and
SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.reg) then
begin
{ change
movx %reg1,%reg2
op %reg2,%reg3
dealloc %reg2
into
op %reg1,%reg3
if the second op accesses only the bits stored in reg1
}
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.next));
if AndTest then
begin
UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
RegUsed := RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs);
end
else
RegUsed := RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs);
if not RegUsed then
begin
DebugMsg(SPeepholeOptimization + 'MovxOp2Op 1',p);
if taicpu(p).oper[0]^.typ=top_reg then
begin
case taicpu(hp1).opsize of
S_B:
taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBL));
S_W:
taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBW));
S_L:
taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBD));
else
Internalerror(2020102301);
end;
AllocRegBetween(taicpu(hp1).oper[0]^.reg,p,hp1,UsedRegs);
end
else
taicpu(hp1).loadref(0,taicpu(p).oper[0]^.ref^);
RemoveCurrentP(p);
if AndTest then
RemoveInstruction(hp2);
result:=true;
exit;
end;
end
else if (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
(
{ Bitwise operations only }
(taicpu(hp1).opcode=A_AND) or
(taicpu(hp1).opcode=A_TEST) or
(
(taicpu(hp1).oper[0]^.typ = top_const) and
(
(taicpu(hp1).opcode=A_OR) or
(taicpu(hp1).opcode=A_XOR)
)
)
) and
(
(taicpu(hp1).oper[0]^.typ = top_const) or
MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) or
not RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^)
) then
begin
{ change
movx %reg2,%reg2
op const,%reg2
into
op const,%reg2 (smaller version)
movx %reg2,%reg2
also change
movx %reg1,%reg2
and/test (oper),%reg2
dealloc %reg2
into
and/test (oper),%reg1
}
case taicpu(p).opsize of
S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
begin
NewSize := S_B;
NewRegSize := R_SUBL;
Limit := $FF;
end;
S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
begin
NewSize := S_W;
NewRegSize := R_SUBW;
Limit := $FFFF;
end;
{$ifdef x86_64}
S_LQ:
begin
NewSize := S_L;
NewRegSize := R_SUBD;
Limit := $FFFFFFFF;
end;
{$endif x86_64}
else
Internalerror(2021120302);
end;
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.next));
if AndTest then
begin
UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
RegUsed := RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs);
end
else
RegUsed := RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs);
if
(
(taicpu(p).opcode = A_MOVZX) and
(
(taicpu(hp1).opcode=A_AND) or
(taicpu(hp1).opcode=A_TEST)
) and
not (
{ If both are references, then the final instruction will have
both operands as references, which is not allowed }
(taicpu(p).oper[0]^.typ = top_ref) and
(taicpu(hp1).oper[0]^.typ = top_ref)
) and
not RegUsed
) or
(
(
SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) or
not RegUsed
) and
(taicpu(p).oper[0]^.typ = top_reg) and
SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
(taicpu(hp1).oper[0]^.typ = top_const) and
((taicpu(hp1).oper[0]^.val and Limit) = taicpu(hp1).oper[0]^.val)
) then
begin
{$if defined(i386) or defined(i8086)}
{ If the target size is 8-bit, make sure we can actually encode it }
if (NewRegSize = R_SUBL) and (taicpu(hp1).oper[0]^.typ = top_reg) and not (GetSupReg(taicpu(hp1).oper[0]^.reg) in [RS_EAX,RS_EBX,RS_ECX,RS_EDX]) then
Exit;
{$endif i386 or i8086}
DebugMsg(SPeepholeOptimization + 'MovxOp2Op 2',p);
taicpu(hp1).opsize := NewSize;
taicpu(hp1).loadoper(1, taicpu(p).oper[0]^);
if AndTest then
begin
RemoveInstruction(hp2);
if not RegUsed then
begin
taicpu(hp1).opcode := A_TEST;
if (taicpu(hp1).oper[0]^.typ = top_ref) then
begin
{ Make sure the reference is the second operand }
SwapOper := taicpu(hp1).oper[0];
taicpu(hp1).oper[0] := taicpu(hp1).oper[1];
taicpu(hp1).oper[1] := SwapOper;
end;
end;
end;
case taicpu(hp1).oper[0]^.typ of
top_reg:
setsubreg(taicpu(hp1).oper[0]^.reg, NewRegSize);
top_const:
{ For the AND/TEST case }
taicpu(hp1).oper[0]^.val := taicpu(hp1).oper[0]^.val and Limit;
else
;
end;
if RegUsed then
begin
AsmL.Remove(p);
AsmL.InsertAfter(p, hp1);
p := hp1;
end
else
RemoveCurrentP(p, hp1);
result:=true;
exit;
end;
end;
end;
if reg_and_hp1_is_instr and
(taicpu(p).oper[0]^.typ = top_reg) and
(
(taicpu(hp1).opcode = A_SHL) or (taicpu(hp1).opcode = A_SAL)
) and
(taicpu(hp1).oper[0]^.typ = top_const) and
SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
{ Minimum shift value allowed is the bit difference between the sizes }
(taicpu(hp1).oper[0]^.val >=
{ Multiply by 8 because tcgsize2size returns bytes, not bits }
8 * (
tcgsize2size[reg_cgsize(taicpu(p).oper[1]^.reg)] -
tcgsize2size[reg_cgsize(taicpu(p).oper[0]^.reg)]
)
) then
begin
{ For:
movsx/movzx %reg1,%reg1 (same register, just different sizes)
shl/sal ##, %reg1
Remove the movsx/movzx instruction if the shift overwrites the
extended bits of the register (e.g. movslq %eax,%rax; shlq $32,%rax
}
DebugMsg(SPeepholeOptimization + 'MovxShl2Shl',p);
RemoveCurrentP(p, hp1);
Result := True;
Exit;
end
else if reg_and_hp1_is_instr and
(taicpu(p).oper[0]^.typ = top_reg) and
(
((taicpu(hp1).opcode = A_SHR) and (taicpu(p).opcode = A_MOVZX)) or
((taicpu(hp1).opcode = A_SAR) and (taicpu(p).opcode <> A_MOVZX))
) and
(taicpu(hp1).oper[0]^.typ = top_const) and
SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
{ Minimum shift value allowed is the bit size of the smallest register - 1 }
(taicpu(hp1).oper[0]^.val <
{ Multiply by 8 because tcgsize2size returns bytes, not bits }
8 * (
tcgsize2size[reg_cgsize(taicpu(p).oper[0]^.reg)]
)
) then
begin
{ For:
movsx %reg1,%reg1 movzx %reg1,%reg1 (same register, just different sizes)
sar ##, %reg1 shr ##, %reg1
Move the shift to before the movx instruction if the shift value
is not too large.
}
asml.Remove(hp1);
asml.InsertBefore(hp1, p);
taicpu(hp1).oper[1]^.reg := taicpu(p).oper[0]^.reg;
case taicpu(p).opsize of
s_BW, S_BL{$ifdef x86_64}, S_BQ{$endif}:
taicpu(hp1).opsize := S_B;
S_WL{$ifdef x86_64}, S_WQ{$endif}:
taicpu(hp1).opsize := S_W;
{$ifdef x86_64}
S_LQ:
taicpu(hp1).opsize := S_L;
{$endif}
else
InternalError(2020112401);
end;
if (taicpu(hp1).opcode = A_SHR) then
DebugMsg(SPeepholeOptimization + 'MovzShr2ShrMovz', hp1)
else
DebugMsg(SPeepholeOptimization + 'MovsSar2SarMovs', hp1);
Result := True;
end;
if reg_and_hp1_is_instr and
(taicpu(p).oper[0]^.typ = top_reg) and
SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
(
(taicpu(hp1).opcode = taicpu(p).opcode)
or ((taicpu(p).opcode = A_MOVZX) and ((taicpu(hp1).opcode = A_MOVSX){$ifdef x86_64} or (taicpu(hp1).opcode = A_MOVSXD){$endif x86_64}))
{$ifdef x86_64}
or ((taicpu(p).opcode = A_MOVSX) and (taicpu(hp1).opcode = A_MOVSXD))
{$endif x86_64}
) then
begin
if MatchOpType(taicpu(hp1), top_reg, top_reg) and
(taicpu(p).oper[1]^.reg = taicpu(hp1).oper[0]^.reg) and
SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
begin
{
For example:
movzbw %al,%ax
movzwl %ax,%eax
Compress into:
movzbl %al,%eax
}
RegUsed := False;
case taicpu(p).opsize of
S_BW:
case taicpu(hp1).opsize of
S_WL:
begin
taicpu(p).opsize := S_BL;
RegUsed := True;
end;
{$ifdef x86_64}
S_WQ:
begin
if taicpu(p).opcode = A_MOVZX then
begin
taicpu(p).opsize := S_BL;
{ 64-bit zero extension is implicit, so change to the 32-bit register }
setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
end
else
taicpu(p).opsize := S_BQ;
RegUsed := True;
end;
{$endif x86_64}
else
;
end;
{$ifdef x86_64}
S_BL:
case taicpu(hp1).opsize of
S_LQ:
begin
if taicpu(p).opcode = A_MOVZX then
begin
taicpu(p).opsize := S_BL;
{ 64-bit zero extension is implicit, so change to the 32-bit register }
setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
end
else
taicpu(p).opsize := S_BQ;
RegUsed := True;
end;
else
;
end;
S_WL:
case taicpu(hp1).opsize of
S_LQ:
begin
if taicpu(p).opcode = A_MOVZX then
begin
taicpu(p).opsize := S_WL;
{ 64-bit zero extension is implicit, so change to the 32-bit register }
setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
end
else
taicpu(p).opsize := S_WQ;
RegUsed := True;
end;
else
;
end;
{$endif x86_64}
else
;
end;
if RegUsed then
begin
DebugMsg(SPeepholeOptimization + 'MovxMovx2Movx', p);
taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg;
RemoveInstruction(hp1);
Result := True;
Exit;
end;
end;
if (taicpu(hp1).opsize = taicpu(p).opsize) and
not RegInInstruction(taicpu(p).oper[1]^.reg, hp1) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2, [A_AND, A_OR, A_XOR, A_TEST], []) and
(
((taicpu(hp2).opsize = S_W) and (taicpu(p).opsize = S_BW)) or
((taicpu(hp2).opsize = S_L) and (taicpu(p).opsize in [S_BL, S_WL]))
{$ifdef x86_64}
or ((taicpu(hp2).opsize = S_Q) and (taicpu(p).opsize in [S_BL, S_BQ, S_WL, S_WQ, S_LQ]))
{$endif x86_64}
) and
MatchOpType(taicpu(hp2), top_reg, top_reg) and
(
(
(taicpu(hp2).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and
(taicpu(hp2).oper[1]^.reg = taicpu(p).oper[1]^.reg)
) or
(
{ Only allow the operands in reverse order for TEST instructions }
(taicpu(hp2).opcode = A_TEST) and
(taicpu(hp2).oper[0]^.reg = taicpu(p).oper[1]^.reg) and
(taicpu(hp2).oper[1]^.reg = taicpu(hp1).oper[1]^.reg)
)
) then
begin
{
For example:
movzbl %al,%eax
movzbl (ref),%edx
andl %edx,%eax
(%edx deallocated)
Change to:
andb (ref),%al
movzbl %al,%eax
Rules are:
- First two instructions have the same opcode and opsize
- First instruction's operands are the same super-register
- Second instruction operates on a different register
- Third instruction is AND, OR, XOR or TEST
- Third instruction's operands are the destination registers of the first two instructions
- Third instruction writes to the destination register of the first instruction (except with TEST)
- Second instruction's destination register is deallocated afterwards
}
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
if not RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, hp2, TmpUsedRegs) then
begin
case taicpu(p).opsize of
S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
NewSize := S_B;
S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
NewSize := S_W;
{$ifdef x86_64}
S_LQ:
NewSize := S_L;
{$endif x86_64}
else
InternalError(2021120301);
end;
taicpu(hp2).loadoper(0, taicpu(hp1).oper[0]^);
taicpu(hp2).loadreg(1, taicpu(p).oper[0]^.reg);
taicpu(hp2).opsize := NewSize;
RemoveInstruction(hp1);
{ With TEST, it's best to keep the MOVX instruction at the top }
if (taicpu(hp2).opcode <> A_TEST) then
begin
DebugMsg(SPeepholeOptimization + 'MovxMovxTest2MovxTest', p);
asml.Remove(p);
{ If the third instruction uses the flags, the MOVX instruction won't modify then }
asml.InsertAfter(p, hp2);
p := hp2;
end
else
DebugMsg(SPeepholeOptimization + 'MovxMovxOp2OpMovx', p);
Result := True;
Exit;
end;
end;
end;
if taicpu(p).opcode=A_MOVZX then
begin
{ removes superfluous And's after movzx's }
if reg_and_hp1_is_instr and
(taicpu(hp1).opcode = A_AND) and
MatchOpType(taicpu(hp1),top_const,top_reg) and
((taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg)
{$ifdef x86_64}
{ check for implicit extension to 64 bit }
or
((taicpu(p).opsize in [S_BL,S_WL]) and
(taicpu(hp1).opsize=S_Q) and
SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg)
)
{$endif x86_64}
)
then
begin
case taicpu(p).opsize Of
S_BL, S_BW{$ifdef x86_64}, S_BQ{$endif x86_64}:
if (taicpu(hp1).oper[0]^.val = $ff) then
begin
DebugMsg(SPeepholeOptimization + 'MovzAnd2Movz1',p);
RemoveInstruction(hp1);
Result:=true;
exit;
end;
S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
if (taicpu(hp1).oper[0]^.val = $ffff) then
begin
DebugMsg(SPeepholeOptimization + 'MovzAnd2Movz2',p);
RemoveInstruction(hp1);
Result:=true;
exit;
end;
{$ifdef x86_64}
S_LQ:
if (taicpu(hp1).oper[0]^.val = $ffffffff) then
begin
DebugMsg(SPeepholeOptimization + 'MovzAnd2Movz3',p);
RemoveInstruction(hp1);
Result:=true;
exit;
end;
{$endif x86_64}
else
;
end;
{ we cannot get rid of the and, but can we get rid of the movz ?}
if SuperRegistersEqual(taicpu(p).oper[0]^.reg,taicpu(p).oper[1]^.reg) then
begin
case taicpu(p).opsize Of
S_BL, S_BW{$ifdef x86_64}, S_BQ{$endif x86_64}:
if (taicpu(hp1).oper[0]^.val and $ff)=taicpu(hp1).oper[0]^.val then
begin
DebugMsg(SPeepholeOptimization + 'MovzAnd2And1',p);
RemoveCurrentP(p,hp1);
Result:=true;
exit;
end;
S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
if (taicpu(hp1).oper[0]^.val and $ffff)=taicpu(hp1).oper[0]^.val then
begin
DebugMsg(SPeepholeOptimization + 'MovzAnd2And2',p);
RemoveCurrentP(p,hp1);
Result:=true;
exit;
end;
{$ifdef x86_64}
S_LQ:
if (taicpu(hp1).oper[0]^.val and $ffffffff)=taicpu(hp1).oper[0]^.val then
begin
DebugMsg(SPeepholeOptimization + 'MovzAnd2And3',p);
RemoveCurrentP(p,hp1);
Result:=true;
exit;
end;
{$endif x86_64}
else
;
end;
end;
end;
{ changes some movzx constructs to faster synonyms (all examples
are given with eax/ax, but are also valid for other registers)}
if MatchOpType(taicpu(p),top_reg,top_reg) then
begin
case taicpu(p).opsize of
{ Technically, movzbw %al,%ax cannot be encoded in 32/64-bit mode
(the machine code is equivalent to movzbl %al,%eax), but the
code generator still generates that assembler instruction and
it is silently converted. This should probably be checked.
[Kit] }
S_BW:
begin
if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
(
not IsMOVZXAcceptable
{ and $0xff,%ax has a smaller encoding but risks a partial write penalty }
or (
(cs_opt_size in current_settings.optimizerswitches) and
(taicpu(p).oper[1]^.reg = NR_AX)
)
) then
{Change "movzbw %al, %ax" to "andw $0x0ffh, %ax"}
begin
DebugMsg(SPeepholeOptimization + 'var7',p);
taicpu(p).opcode := A_AND;
taicpu(p).changeopsize(S_W);
taicpu(p).loadConst(0,$ff);
Result := True;
end
else if not IsMOVZXAcceptable and
GetNextInstruction(p, hp1) and
(tai(hp1).typ = ait_instruction) and
(taicpu(hp1).opcode = A_AND) and
MatchOpType(taicpu(hp1),top_const,top_reg) and
(taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
{ Change "movzbw %reg1, %reg2; andw $const, %reg2"
to "movw %reg1, reg2; andw $(const1 and $ff), %reg2"}
begin
DebugMsg(SPeepholeOptimization + 'var8',p);
taicpu(p).opcode := A_MOV;
taicpu(p).changeopsize(S_W);
setsubreg(taicpu(p).oper[0]^.reg,R_SUBW);
taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
Result := True;
end;
end;
{$ifndef i8086} { movzbl %al,%eax cannot be encoded in 16-bit mode (the machine code is equivalent to movzbw %al,%ax }
S_BL:
begin
if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) and
(
not IsMOVZXAcceptable
{ and $0xff,%eax has a smaller encoding but risks a partial write penalty }
or (
(cs_opt_size in current_settings.optimizerswitches) and
(taicpu(p).oper[1]^.reg = NR_EAX)
)
) then
{ Change "movzbl %al, %eax" to "andl $0x0ffh, %eax" }
begin
DebugMsg(SPeepholeOptimization + 'var9',p);
taicpu(p).opcode := A_AND;
taicpu(p).changeopsize(S_L);
taicpu(p).loadConst(0,$ff);
Result := True;
end
else if not IsMOVZXAcceptable and
GetNextInstruction(p, hp1) and
(tai(hp1).typ = ait_instruction) and
(taicpu(hp1).opcode = A_AND) and
MatchOpType(taicpu(hp1),top_const,top_reg) and
(taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
{ Change "movzbl %reg1, %reg2; andl $const, %reg2"
to "movl %reg1, reg2; andl $(const1 and $ff), %reg2"}
begin
DebugMsg(SPeepholeOptimization + 'var10',p);
taicpu(p).opcode := A_MOV;
taicpu(p).changeopsize(S_L);
{ do not use R_SUBWHOLE
as movl %rdx,%eax
is invalid in assembler PM }
setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
Result := True;
end;
end;
{$endif i8086}
S_WL:
if not IsMOVZXAcceptable then
begin
if (getsupreg(taicpu(p).oper[0]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)) then
{ Change "movzwl %ax, %eax" to "andl $0x0ffffh, %eax" }
begin
DebugMsg(SPeepholeOptimization + 'var11',p);
taicpu(p).opcode := A_AND;
taicpu(p).changeopsize(S_L);
taicpu(p).loadConst(0,$ffff);
Result := True;
end
else if GetNextInstruction(p, hp1) and
(tai(hp1).typ = ait_instruction) and
(taicpu(hp1).opcode = A_AND) and
(taicpu(hp1).oper[0]^.typ = top_const) and
(taicpu(hp1).oper[1]^.typ = top_reg) and
(taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
{ Change "movzwl %reg1, %reg2; andl $const, %reg2"
to "movl %reg1, reg2; andl $(const1 and $ffff), %reg2"}
begin
DebugMsg(SPeepholeOptimization + 'var12',p);
taicpu(p).opcode := A_MOV;
taicpu(p).changeopsize(S_L);
{ do not use R_SUBWHOLE
as movl %rdx,%eax
is invalid in assembler PM }
setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ffff);
Result := True;
end;
end;
else
InternalError(2017050705);
end;
end
else if not IsMOVZXAcceptable and (taicpu(p).oper[0]^.typ = top_ref) then
begin
if GetNextInstruction(p, hp1) and
(tai(hp1).typ = ait_instruction) and
(taicpu(hp1).opcode = A_AND) and
MatchOpType(taicpu(hp1),top_const,top_reg) and
(taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
begin
//taicpu(p).opcode := A_MOV;
case taicpu(p).opsize Of
S_BL:
begin
DebugMsg(SPeepholeOptimization + 'var13',p);
taicpu(hp1).changeopsize(S_L);
taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
end;
S_WL:
begin
DebugMsg(SPeepholeOptimization + 'var14',p);
taicpu(hp1).changeopsize(S_L);
taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ffff);
end;
S_BW:
begin
DebugMsg(SPeepholeOptimization + 'var15',p);
taicpu(hp1).changeopsize(S_W);
taicpu(hp1).loadConst(0,taicpu(hp1).oper[0]^.val and $ff);
end;
else
Internalerror(2017050704)
end;
Result := True;
end;
end;
end;
end;
{ Pass-1 peephole optimisations for AND instructions.
  Only register destinations are handled.  The routine scans forward and
  either merges the AND with a following instruction, removes a redundant
  sign/zero-extension, converts AND to TEST before a conditional jump, or
  removes the AND entirely when it is provably a null operation.
  Returns True when the instruction list was changed and p may need to be
  re-examined. }
function TX86AsmOptimizer.OptPass1AND(var p : tai) : boolean;
var
hp1, hp2 : tai;
MaskLength : Cardinal; { bit length of the AND mask (used in the A_SHL merge) }
MaskedBits : TCgInt; { mask OR shifted-out low bits (used in the A_SHR merge) }
ActiveReg : TRegister; { cached copy of the AND's destination register }
begin
Result:=false;
{ There are no optimisations for reference targets }
if (taicpu(p).oper[1]^.typ <> top_reg) then
Exit;
{ Scan forward; a successful merge that might enable further merges does
  "Continue" to re-run the loop body with the next instruction }
while GetNextInstruction(p, hp1) and
(hp1.typ = ait_instruction) do
begin
if (taicpu(p).oper[0]^.typ = top_const) then
begin
case taicpu(hp1).opcode of
A_AND:
if MatchOpType(taicpu(hp1),top_const,top_reg) and
(getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
{ the second register must contain the first one, so compare their subreg types }
(getsubreg(taicpu(p).oper[1]^.reg)<=getsubreg(taicpu(hp1).oper[1]^.reg)) and
(abs(taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val)<$80000000) then
{ change
and const1, reg
and const2, reg
to
and (const1 and const2), reg
}
begin
taicpu(hp1).loadConst(0, taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val);
DebugMsg(SPeepholeOptimization + 'AndAnd2And done',hp1);
RemoveCurrentP(p, hp1);
Result:=true;
exit;
end;
A_CMP:
if (PopCnt(DWord(taicpu(p).oper[0]^.val)) = 1) and { Only 1 bit set }
MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.val) and
MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
{ Just check that the condition on the next instruction is compatible }
GetNextInstruction(hp1, hp2) and
(hp2.typ = ait_instruction) and
(taicpu(hp2).condition in [C_Z, C_E, C_NZ, C_NE])
then
{ change
and 2^n, reg
cmp 2^n, reg
j(c) / set(c) / cmov(c) (c is equal or not equal)
to
and 2^n, reg
test reg, reg
j(~c) / set(~c) / cmov(~c)
}
begin
{ Keep TEST instruction in, rather than remove it, because
it may trigger other optimisations such as MovAndTest2Test }
taicpu(hp1).loadreg(0, taicpu(hp1).oper[1]^.reg);
taicpu(hp1).opcode := A_TEST;
DebugMsg(SPeepholeOptimization + 'AND/CMP/J(c) -> AND/J(~c) with power of 2 constant', p);
taicpu(hp2).condition := inverse_cond(taicpu(hp2).condition);
Result := True;
Exit;
end;
{ and const,reg; movzx reg,reg -> drop the movzx when the mask already
  fits entirely within the movzx source size }
A_MOVZX:
if MatchOpType(taicpu(hp1),top_reg,top_reg) and
SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg) and
(getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
(
(
(taicpu(p).opsize=S_W) and
(taicpu(hp1).opsize=S_BW)
) or
(
(taicpu(p).opsize=S_L) and
(taicpu(hp1).opsize in [S_WL,S_BL{$ifdef x86_64},S_BQ,S_WQ{$endif x86_64}])
)
{$ifdef x86_64}
or
(
(taicpu(p).opsize=S_Q) and
(taicpu(hp1).opsize in [S_BQ,S_WQ,S_BL,S_WL])
)
{$endif x86_64}
) then
begin
if (((taicpu(hp1).opsize) in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
((taicpu(p).oper[0]^.val and $ff)=taicpu(p).oper[0]^.val)
) or
(((taicpu(hp1).opsize) in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
((taicpu(p).oper[0]^.val and $ffff)=taicpu(p).oper[0]^.val))
then
begin
{ Unlike MOVSX, MOVZX doesn't actually have a version that zero-extends a
32-bit register to a 64-bit register, or even a version called MOVZXD, so
code that tests for the presence of AND 0xffffffff followed by MOVZX is
wasted, and is indictive of a compiler bug if it were triggered. [Kit]
NOTE: To zero-extend from 32 bits to 64 bits, simply use the standard MOV.
}
DebugMsg(SPeepholeOptimization + 'AndMovzToAnd done',p);
RemoveInstruction(hp1);
{ See if there are other optimisations possible }
Continue;
end;
end;
{ and mask,reg; shl count,reg -> shl count,reg when every bit cleared by
  the mask is shifted out anyway }
A_SHL:
if MatchOpType(taicpu(hp1),top_const,top_reg) and
(getsupreg(taicpu(p).oper[1]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) then
begin
{$ifopt R+}
{$define RANGE_WAS_ON}
{$R-}
{$endif}
{ get length of potential and mask }
MaskLength:=SizeOf(taicpu(p).oper[0]^.val)*8-BsrQWord(taicpu(p).oper[0]^.val)-1;
{ really a mask? }
{$ifdef RANGE_WAS_ON}
{$R+}
{$endif}
if (((QWord(1) shl MaskLength)-1)=taicpu(p).oper[0]^.val) and
{ unmasked part shifted out? }
((MaskLength+taicpu(hp1).oper[0]^.val)>=topsize2memsize[taicpu(hp1).opsize]) then
begin
DebugMsg(SPeepholeOptimization + 'AndShlToShl done',p);
RemoveCurrentP(p, hp1);
Result:=true;
exit;
end;
end;
A_SHR:
if MatchOpType(taicpu(hp1),top_const,top_reg) and
(taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
{ keep the "1 shl val" below from overflowing a 64-bit value }
(taicpu(hp1).oper[0]^.val <= 63) then
begin
{ Does SHR combined with the AND cover all the bits?
e.g. for "andb $252,%reg; shrb $2,%reg" - the "and" can be removed }
MaskedBits := taicpu(p).oper[0]^.val or ((TCgInt(1) shl taicpu(hp1).oper[0]^.val) - 1);
if ((taicpu(p).opsize = S_B) and ((MaskedBits and $FF) = $FF)) or
((taicpu(p).opsize = S_W) and ((MaskedBits and $FFFF) = $FFFF)) or
((taicpu(p).opsize = S_L) and ((MaskedBits and $FFFFFFFF) = $FFFFFFFF)) then
begin
DebugMsg(SPeepholeOptimization + 'AndShrToShr done', p);
RemoveCurrentP(p, hp1);
Result := True;
Exit;
end;
end;
{ and const,reg; movsx -> drop the movsx (or weaken it to movzx) when the
  mask guarantees the sign bit of the source size is clear, making sign-
  and zero-extension equivalent }
A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
if (taicpu(hp1).oper[0]^.typ = top_reg) and
SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
begin
if SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) and
(
(
(taicpu(hp1).opsize in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
((taicpu(p).oper[0]^.val and $7F) = taicpu(p).oper[0]^.val)
) or (
(taicpu(hp1).opsize in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
((taicpu(p).oper[0]^.val and $7FFF) = taicpu(p).oper[0]^.val)
{$ifdef x86_64}
) or (
(taicpu(hp1).opsize = S_LQ) and
((taicpu(p).oper[0]^.val and $7fffffff) = taicpu(p).oper[0]^.val)
{$endif x86_64}
)
) then
begin
if (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg){$ifdef x86_64} or (taicpu(hp1).opsize = S_LQ){$endif x86_64} then
begin
DebugMsg(SPeepholeOptimization + 'AndMovsxToAnd',p);
RemoveInstruction(hp1);
{ See if there are other optimisations possible }
Continue;
end;
{ The super-registers are the same though.
Note that this change by itself doesn't improve
code speed, but it opens up other optimisations. }
{$ifdef x86_64}
{ Convert 64-bit register to 32-bit }
case taicpu(hp1).opsize of
S_BQ:
begin
taicpu(hp1).opsize := S_BL;
taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
end;
S_WQ:
begin
taicpu(hp1).opsize := S_WL;
taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
end
else
;
end;
{$endif x86_64}
DebugMsg(SPeepholeOptimization + 'AndMovsxToAndMovzx', hp1);
taicpu(hp1).opcode := A_MOVZX;
{ See if there are other optimisations possible }
Continue;
end;
end;
else
;
end;
end
{ and %reg,%reg with dead flags: check whether the following instruction
  overwrites or supersedes the AND, making it removable }
else if MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^.reg) and
not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
begin
{$ifdef x86_64}
if (taicpu(p).opsize = S_Q) then
begin
{ Never necessary }
DebugMsg(SPeepholeOptimization + 'Andq2Nop', p);
RemoveCurrentP(p, hp1);
Result := True;
Exit;
end;
{$endif x86_64}
{ Forward check to determine necessity of and %reg,%reg }
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
{ Saves on a bunch of dereferences }
ActiveReg := taicpu(p).oper[1]^.reg;
case taicpu(hp1).opcode of
A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
if (
(taicpu(hp1).oper[0]^.typ <> top_ref) or
not RegInRef(ActiveReg, taicpu(hp1).oper[0]^.ref^)
) and
(
(taicpu(hp1).opcode <> A_MOV) or
(taicpu(hp1).oper[1]^.typ <> top_ref) or
not RegInRef(ActiveReg, taicpu(hp1).oper[1]^.ref^)
) and
not (
{ If mov %reg,%reg is present, remove that instruction instead in OptPass1MOV }
(taicpu(hp1).opcode = A_MOV) and
MatchOperand(taicpu(hp1).oper[0]^, ActiveReg) and
MatchOperand(taicpu(hp1).oper[1]^, ActiveReg)
) and
(
(
(taicpu(hp1).oper[0]^.typ = top_reg) and
(taicpu(hp1).oper[0]^.reg = ActiveReg) and
SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg)
) or
(
{$ifdef x86_64}
(
{ If we read from the register, make sure it's not dependent on the upper 32 bits }
(taicpu(hp1).oper[0]^.typ <> top_reg) or
not SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, ActiveReg) or
(GetSubReg(taicpu(hp1).oper[0]^.reg) <> R_SUBQ)
) and
{$endif x86_64}
not RegUsedAfterInstruction(ActiveReg, hp1, TmpUsedRegs)
)
) then
begin
DebugMsg(SPeepholeOptimization + 'AndMovx2Movx', p);
RemoveCurrentP(p, hp1);
Result := True;
Exit;
end;
A_ADD,
A_AND,
A_BSF,
A_BSR,
A_BTC,
A_BTR,
A_BTS,
A_OR,
A_SUB,
A_XOR:
{ Register is written to, so this will clear the upper 32 bits (2-operand instructions) }
if (
(taicpu(hp1).oper[0]^.typ <> top_ref) or
not RegInRef(ActiveReg, taicpu(hp1).oper[0]^.ref^)
) and
MatchOperand(taicpu(hp1).oper[1]^, ActiveReg) then
begin
DebugMsg(SPeepholeOptimization + 'AndOp2Op 2', p);
RemoveCurrentP(p, hp1);
Result := True;
Exit;
end;
A_CMP,
A_TEST:
if (
(taicpu(hp1).oper[0]^.typ <> top_ref) or
not RegInRef(ActiveReg, taicpu(hp1).oper[0]^.ref^)
) and
MatchOperand(taicpu(hp1).oper[1]^, ActiveReg) and
not RegUsedAfterInstruction(ActiveReg, hp1, TmpUsedRegs) then
begin
DebugMsg(SPeepholeOptimization + 'AND; CMP/TEST -> CMP/TEST', p);
RemoveCurrentP(p, hp1);
Result := True;
Exit;
end;
A_BSWAP,
A_NEG,
A_NOT:
{ Register is written to, so this will clear the upper 32 bits (1-operand instructions) }
if MatchOperand(taicpu(hp1).oper[0]^, ActiveReg) then
begin
DebugMsg(SPeepholeOptimization + 'AndOp2Op 1', p);
RemoveCurrentP(p, hp1);
Result := True;
Exit;
end;
else
;
end;
end;
if (taicpu(hp1).is_jmp) and
(taicpu(hp1).opcode<>A_JMP) and
not(RegInUsedRegs(taicpu(p).oper[1]^.reg,UsedRegs)) then
begin
{ change
and x, reg
jxx
to
test x, reg
jxx
if reg is deallocated before the
jump, but only if it's a conditional jump (PFV)
}
taicpu(p).opcode := A_TEST;
{ NOTE(review): Result stays False here even though p's opcode was
  changed - confirm this is intentional (the operands are unchanged,
  but other passes usually report modifications). }
Exit;
end;
Break;
end;
{ Lone AND tests }
if (taicpu(p).oper[0]^.typ = top_const) then
begin
{
- Convert and $0xFF,reg to and reg,reg if reg is 8-bit
- Convert and $0xFFFF,reg to and reg,reg if reg is 16-bit
- Convert and $0xFFFFFFFF,reg to and reg,reg if reg is 32-bit
}
if ((taicpu(p).oper[0]^.val = $FF) and (taicpu(p).opsize = S_B)) or
((taicpu(p).oper[0]^.val = $FFFF) and (taicpu(p).opsize = S_W)) or
((taicpu(p).oper[0]^.val = $FFFFFFFF) and (taicpu(p).opsize = S_L)) then
begin
taicpu(p).loadreg(0, taicpu(p).oper[1]^.reg);
if taicpu(p).opsize = S_L then
begin
{ Flag the MovAnd2Mov_3 optimisation as worth checking in a later pass }
Include(OptsToCheck,aoc_MovAnd2Mov_3);
Result := True;
end;
end;
end;
{ Backward check to determine necessity of and %reg,%reg }
if (taicpu(p).oper[0]^.typ = top_reg) and
(taicpu(p).oper[0]^.reg = taicpu(p).oper[1]^.reg) and
not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
GetLastInstruction(p, hp2) and
RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp2) and
{ Check size of adjacent instruction to determine if the AND is
effectively a null operation }
(
(taicpu(p).opsize = taicpu(hp2).opsize) or
{ Note: Don't include S_Q }
((taicpu(p).opsize = S_L) and (taicpu(hp2).opsize in [S_BL, S_WL])) or
((taicpu(p).opsize = S_W) and (taicpu(hp2).opsize in [S_BW, S_BL, S_WL, S_L])) or
((taicpu(p).opsize = S_B) and (taicpu(hp2).opsize in [S_BW, S_BL, S_WL, S_W, S_L]))
) then
begin
DebugMsg(SPeepholeOptimization + 'And2Nop', p);
{ If GetNextInstruction returned False, hp1 will be nil }
RemoveCurrentP(p, hp1);
Result := True;
Exit;
end;
end;
{ Pass-2 peephole optimisations for ADD instructions:
  - AddMov2Mov: fold "add %reg2,%reg1; mov/s/z #(%reg1),%reg1" into a
    single load with a base+index reference.
  - AddMov2Lea(Add): turn "add $x,%reg1; mov %reg1,%reg2" into a LEA to
    break the dependency chain, optionally keeping the ADD.
  Returns True when the instruction list was changed. }
function TX86AsmOptimizer.OptPass2ADD(var p : tai) : boolean;
var
hp1: tai; NewRef: TReference;
{ This entire nested function is used in an if-statement below, but we
want to avoid all the used reg transfers and GetNextInstruction calls
until we really have to check }
{ Returns True when the ADD's destination register is no longer live
  after hp1; walks the liveness state forward from p to hp1 first }
function MemRegisterNotUsedLater: Boolean; inline;
var
hp2: tai;
begin
TransferUsedRegs(TmpUsedRegs);
hp2 := p;
repeat
UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
Result := not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs);
end;
begin
Result := False;
if not GetNextInstruction(p, hp1) or (hp1.typ <> ait_instruction) then
Exit;
if (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif}]) then
begin
{ Change:
add %reg2,%reg1
mov/s/z #(%reg1),%reg1 (%reg1 superregisters must be the same)
To:
mov/s/z #(%reg1,%reg2),%reg1
}
if MatchOpType(taicpu(p), top_reg, top_reg) and
MatchInstruction(hp1, [A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif}], []) and
MatchOpType(taicpu(hp1), top_ref, top_reg) and
(taicpu(hp1).oper[0]^.ref^.scalefactor <= 1) and
(
(
(taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
(taicpu(hp1).oper[0]^.ref^.index = NR_NO) and
{ r/esp cannot be an index }
(taicpu(p).oper[0]^.reg<>NR_STACK_POINTER_REG)
) or (
(taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
(taicpu(hp1).oper[0]^.ref^.base = NR_NO)
)
) and (
Reg1WriteOverwritesReg2Entirely(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) or
(
{ If the super registers ARE equal, then this MOV/S/Z does a partial write }
not SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) and
MemRegisterNotUsedLater
)
) then
begin
taicpu(hp1).oper[0]^.ref^.base := taicpu(p).oper[1]^.reg;
taicpu(hp1).oper[0]^.ref^.index := taicpu(p).oper[0]^.reg;
DebugMsg(SPeepholeOptimization + 'AddMov2Mov done', p);
RemoveCurrentp(p, hp1);
Result := True;
Exit;
end;
{ Change:
addl/q $x,%reg1
movl/q %reg1,%reg2
To:
leal/q $x(%reg1),%reg2
addl/q $x,%reg1 (can be removed if %reg1 or the flags are not used afterwards)
Breaks the dependency chain.
}
if MatchOpType(taicpu(p),top_const,top_reg) and
MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
(taicpu(hp1).oper[1]^.typ = top_reg) and
MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) and
(
{ Don't do AddMov2LeaAdd under -Os, but do allow AddMov2Lea }
{ NOTE(review): TmpUsedRegs is read here but TransferUsedRegs is only
  called further below - it still holds state from an earlier
  optimisation at this point; confirm this is intended.  Also confirm
  the RegUsedAfterInstruction(NR_DEFAULTFLAGS,...) term should not be
  negated, given the stated -Os intent. }
not (cs_opt_size in current_settings.optimizerswitches) or
(
not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) and
RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
)
) then
begin
{ Change the MOV instruction to a LEA instruction, and update the
first operand }
reference_reset(NewRef, 1, []);
NewRef.base := taicpu(p).oper[1]^.reg;
NewRef.scalefactor := 1;
NewRef.offset := asizeint(taicpu(p).oper[0]^.val);
taicpu(hp1).opcode := A_LEA;
taicpu(hp1).loadref(0, NewRef);
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
if RegUsedAfterInstruction(NewRef.base, hp1, TmpUsedRegs) or
RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
begin
{ Move what is now the LEA instruction to before the SUB instruction }
Asml.Remove(hp1);
Asml.InsertBefore(hp1, p);
AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, p, UsedRegs);
DebugMsg(SPeepholeOptimization + 'AddMov2LeaAdd', p);
p := hp1;
end
else
begin
{ Since %reg1 or the flags aren't used afterwards, we can delete p completely }
RemoveCurrentP(p, hp1);
DebugMsg(SPeepholeOptimization + 'AddMov2Lea', p);
end;
Result := True;
end;
end;
end;
{ Rewrites simple LEA instructions into cheaper equivalent forms:
    lea (%base,%idx),%base   -> add %idx,%base    (Lea2AddBase)
    lea (%base,%idx),%idx    -> add %base,%idx    (Lea2AddIndex)
    lea (,%idx,2^x),%idx     -> shl $x,%idx       (Lea2Shl)
  Only performed while the flags are not live, because ADD and SHL
  write the flags whereas LEA does not. }
function TX86AsmOptimizer.OptPass2Lea(var p : tai) : Boolean;
  var
    DestSubReg: TSubRegister;
    NewSource: TRegister;
  begin
    Result := False;
    if RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
      Exit;
    DestSubReg := getsubreg(taicpu(p).oper[1]^.reg);
    with taicpu(p).oper[0]^.ref^ do
      begin
        { Only plain "base + index*scale" references qualify: no
          displacement, no symbols, and an index register present }
        if (offset <> 0) or Assigned(symbol) or Assigned(relsymbol) or (index = NR_NO) then
          Exit;
        if (scalefactor <= 1) and SuperRegistersEqual(base, taicpu(p).oper[1]^.reg) then
          begin
            { Destination equals the base: add the index onto it }
            NewSource := newreg(R_INTREGISTER, getsupreg(index), DestSubReg);
            taicpu(p).loadreg(0, NewSource);
            taicpu(p).opcode := A_ADD;
            DebugMsg(SPeepholeOptimization + 'Lea2AddBase done',p);
            Result := True;
          end
        else if SuperRegistersEqual(index, taicpu(p).oper[1]^.reg) then
          begin
            if base <> NR_NO then
              begin
                if scalefactor <= 1 then
                  begin
                    { Destination equals the index: add the base onto it }
                    NewSource := newreg(R_INTREGISTER, getsupreg(base), DestSubReg);
                    taicpu(p).loadreg(0, NewSource);
                    taicpu(p).opcode := A_ADD;
                    DebugMsg(SPeepholeOptimization + 'Lea2AddIndex done',p);
                    Result := True;
                  end;
              end
            { Convert lea (%reg,2^x),%reg to shl x,%reg }
            else if scalefactor in [2, 4, 8] then
              begin
                { BsrByte is, in essence, the base-2 logarithm of the scale factor }
                taicpu(p).loadconst(0, BsrByte(scalefactor));
                taicpu(p).opcode := A_SHL;
                DebugMsg(SPeepholeOptimization + 'Lea2Shl done',p);
                Result := True;
              end;
          end;
      end;
  end;
{ Pass-2 peephole optimisation for SUB: SubMov2Lea(Sub).
  Mirrors the AddMov2Lea transformation in OptPass2ADD; keep the two
  routines consistent when modifying either.
  Returns True when the instruction list was changed. }
function TX86AsmOptimizer.OptPass2SUB(var p: tai): Boolean;
var
hp1: tai; NewRef: TReference;
begin
{ Change:
subl/q $x,%reg1
movl/q %reg1,%reg2
To:
leal/q $-x(%reg1),%reg2
subl/q $x,%reg1 (can be removed if %reg1 or the flags are not used afterwards)
Breaks the dependency chain and potentially permits the removal of
a CMP instruction if one follows.
}
Result := False;
if (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
MatchOpType(taicpu(p),top_const,top_reg) and
GetNextInstruction(p, hp1) and
MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
(taicpu(hp1).oper[1]^.typ = top_reg) and
MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) and
(
{ Don't do SubMov2LeaSub under -Os, but do allow SubMov2Lea }
{ NOTE(review): TmpUsedRegs is read here but TransferUsedRegs is only
  called further below - it still holds state from an earlier
  optimisation at this point; confirm this is intended (same pattern
  as OptPass2ADD). }
not (cs_opt_size in current_settings.optimizerswitches) or
(
not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) and
RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
)
) then
begin
{ Change the MOV instruction to a LEA instruction, and update the
first operand }
reference_reset(NewRef, 1, []);
NewRef.base := taicpu(p).oper[1]^.reg;
NewRef.scalefactor := 1;
{ The LEA offset is negated because LEA adds while SUB subtracts }
NewRef.offset := -taicpu(p).oper[0]^.val;
taicpu(hp1).opcode := A_LEA;
taicpu(hp1).loadref(0, NewRef);
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
if RegUsedAfterInstruction(NewRef.base, hp1, TmpUsedRegs) or
RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
begin
{ Move what is now the LEA instruction to before the SUB instruction }
Asml.Remove(hp1);
Asml.InsertBefore(hp1, p);
AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, p, UsedRegs);
DebugMsg(SPeepholeOptimization + 'SubMov2LeaSub', p);
p := hp1;
end
else
begin
{ Since %reg1 or the flags aren't used afterwards, we can delete p completely }
RemoveCurrentP(p, hp1);
DebugMsg(SPeepholeOptimization + 'SubMov2Lea', p);
end;
Result := True;
end;
end;
{ Advances hp1 past every instruction that does not involve the stack
  pointer; such "simple" instructions cannot disturb the stack-frame
  sequences that the callers (PostPeepholeOptLea / PostPeepholeOptPush)
  are trying to match.  Stricter instruction/operand whitelists were
  considered historically but the stack-pointer test alone suffices.
  Returns True when a candidate instruction was found before the end of
  the instruction list. }
function TX86AsmOptimizer.SkipSimpleInstructions(var hp1 : tai) : Boolean;
  begin
    while Assigned(hp1) and not RegInInstruction(NR_STACK_POINTER_REG, hp1) do
      GetNextInstruction(hp1, hp1);
    Result := Assigned(hp1);
  end;
{ Post-peephole optimisation for LEA: turns a stack-adjusting
  lea/call/lea/ret epilogue into a direct tail jump (LeaCallLeaRet2Jmp).
  hp4 retains the original successor of p for RemoveCurrentP; hp5 holds
  an optional VZEROUPPER found before the RET, which is preserved by
  moving it in front of the jump.  Only enabled at -O4 because the
  transformation destroys stack back traces. }
function TX86AsmOptimizer.PostPeepholeOptLea(var p : tai) : Boolean;
var
hp1, hp2, hp3, hp4, hp5: tai;
begin
Result:=false;
hp5:=nil;
{ replace
leal(q) x(<stackpointer>),<stackpointer>
call procname
leal(q) -x(<stackpointer>),<stackpointer>
ret
by
jmp procname
but do it only on level 4 because it destroys stack back traces
}
if (cs_opt_level4 in current_settings.optimizerswitches) and
MatchOpType(taicpu(p),top_ref,top_reg) and
(taicpu(p).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
(taicpu(p).oper[0]^.ref^.index=NR_NO) and
{ the -8 or -24 are not required, but bail out early if possible,
higher values are unlikely }
((taicpu(p).oper[0]^.ref^.offset=-8) or
(taicpu(p).oper[0]^.ref^.offset=-24)) and
(taicpu(p).oper[0]^.ref^.symbol=nil) and
(taicpu(p).oper[0]^.ref^.relsymbol=nil) and
(taicpu(p).oper[0]^.ref^.segment=NR_NO) and
(taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG) and
GetNextInstruction(p, hp1) and
{ Take a copy of hp1 }
SetAndTest(hp1, hp4) and
{ trick to skip label }
((hp1.typ=ait_instruction) or GetNextInstruction(hp1, hp1)) and
SkipSimpleInstructions(hp1) and
MatchInstruction(hp1,A_CALL,[S_NO]) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2,A_LEA,[taicpu(p).opsize]) and
MatchOpType(taicpu(hp2),top_ref,top_reg) and
{ the second LEA must undo exactly the adjustment made by the first }
(taicpu(hp2).oper[0]^.ref^.offset=-taicpu(p).oper[0]^.ref^.offset) and
(taicpu(hp2).oper[0]^.ref^.base=NR_STACK_POINTER_REG) and
(taicpu(hp2).oper[0]^.ref^.index=NR_NO) and
(taicpu(hp2).oper[0]^.ref^.symbol=nil) and
(taicpu(hp2).oper[0]^.ref^.relsymbol=nil) and
(taicpu(hp2).oper[0]^.ref^.segment=NR_NO) and
(taicpu(hp2).oper[1]^.reg=NR_STACK_POINTER_REG) and
GetNextInstruction(hp2, hp3) and
{ trick to skip label }
((hp3.typ=ait_instruction) or GetNextInstruction(hp3, hp3)) and
(MatchInstruction(hp3,A_RET,[S_NO]) or
(MatchInstruction(hp3,A_VZEROUPPER,[S_NO]) and
SetAndTest(hp3,hp5) and
GetNextInstruction(hp3,hp3) and
MatchInstruction(hp3,A_RET,[S_NO])
)
) and
(taicpu(hp3).ops=0) then
begin
{ Turn the CALL into a JMP (tail call) and drop the stack adjustments }
taicpu(hp1).opcode := A_JMP;
taicpu(hp1).is_jmp := true;
DebugMsg(SPeepholeOptimization + 'LeaCallLeaRet2Jmp done',p);
RemoveCurrentP(p, hp4);
RemoveInstruction(hp2);
RemoveInstruction(hp3);
if Assigned(hp5) then
begin
{ Keep the VZEROUPPER, but execute it before the jump }
AsmL.Remove(hp5);
ASmL.InsertBefore(hp5,hp1)
end;
Result:=true;
end;
end;
{ Post-peephole optimisation for PUSH (x86_64 only): turns the
  stack-aligning push %rax/call/pop %rcx/ret epilogue into a direct tail
  jump (PushCallPushRet2Jmp).  hp4 retains the original successor of p
  for RemoveCurrentP; hp5 holds an optional VZEROUPPER before the RET,
  which is preserved by moving it in front of the jump.  Only enabled at
  -O4 because the transformation destroys stack back traces. }
function TX86AsmOptimizer.PostPeepholeOptPush(var p : tai) : Boolean;
{$ifdef x86_64}
var
hp1, hp2, hp3, hp4, hp5: tai;
{$endif x86_64}
begin
Result:=false;
{$ifdef x86_64}
hp5:=nil;
{ replace
push %rax
call procname
pop %rcx
ret
by
jmp procname
but do it only on level 4 because it destroys stack back traces
It depends on the fact, that the sequence push rax/pop rcx is used for stack alignment as rcx is volatile
for all supported calling conventions
}
if (cs_opt_level4 in current_settings.optimizerswitches) and
MatchOpType(taicpu(p),top_reg) and
(taicpu(p).oper[0]^.reg=NR_RAX) and
GetNextInstruction(p, hp1) and
{ Take a copy of hp1 }
SetAndTest(hp1, hp4) and
{ trick to skip label }
((hp1.typ=ait_instruction) or GetNextInstruction(hp1, hp1)) and
SkipSimpleInstructions(hp1) and
MatchInstruction(hp1,A_CALL,[S_NO]) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2,A_POP,[taicpu(p).opsize]) and
MatchOpType(taicpu(hp2),top_reg) and
(taicpu(hp2).oper[0]^.reg=NR_RCX) and
GetNextInstruction(hp2, hp3) and
{ trick to skip label }
((hp3.typ=ait_instruction) or GetNextInstruction(hp3, hp3)) and
(MatchInstruction(hp3,A_RET,[S_NO]) or
(MatchInstruction(hp3,A_VZEROUPPER,[S_NO]) and
SetAndTest(hp3,hp5) and
GetNextInstruction(hp3,hp3) and
MatchInstruction(hp3,A_RET,[S_NO])
)
) and
(taicpu(hp3).ops=0) then
begin
{ Turn the CALL into a JMP (tail call) and drop push/pop }
taicpu(hp1).opcode := A_JMP;
taicpu(hp1).is_jmp := true;
DebugMsg(SPeepholeOptimization + 'PushCallPushRet2Jmp done',p);
RemoveCurrentP(p, hp4);
RemoveInstruction(hp2);
RemoveInstruction(hp3);
if Assigned(hp5) then
begin
{ Keep the VZEROUPPER, but execute it before the jump }
AsmL.Remove(hp5);
ASmL.InsertBefore(hp5,hp1)
end;
Result:=true;
end;
{$endif x86_64}
end;
{ Post-peephole optimisations for MOV with a constant source:
  - mov $0,%reg   -> xor %reg,%reg (smaller encoding; flags must be dead)
  - movq $imm,%reg64 -> movl $imm,%reg32 on x86_64 when the immediate
    fits in 32 bits (writes to the 32-bit register implicitly zero the
    upper half)
  - mov $-1,%reg  -> or $-1,%reg under -Os only (smaller encoding, but
    creates a false dependency on the register)
  Note: Value and RegName are only used in the x86_64-only branches. }
function TX86AsmOptimizer.PostPeepholeOptMov(var p : tai) : Boolean;
var
Value, RegName: string;
begin
Result:=false;
if (taicpu(p).oper[1]^.typ = top_reg) and (taicpu(p).oper[0]^.typ = top_const) then
begin
case taicpu(p).oper[0]^.val of
0:
{ Don't make this optimisation if the CPU flags are required, since XOR scrambles them }
if not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
begin
{ change "mov $0,%reg" into "xor %reg,%reg" }
taicpu(p).opcode := A_XOR;
taicpu(p).loadReg(0,taicpu(p).oper[1]^.reg);
Result := True;
{ Careful: when x86_64 is NOT defined, the conditional block below
  disappears together with the $1..$FFFFFFFF case label, and the
  "end;" just before the -1 label closes the "begin" above }
{$ifdef x86_64}
end
else if (taicpu(p).opsize = S_Q) then
begin
RegName := debug_regname(taicpu(p).oper[1]^.reg); { 64-bit register name }
{ The actual optimization }
setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
taicpu(p).changeopsize(S_L);
DebugMsg(SPeepholeOptimization + 'movq $0,' + RegName + ' -> movl $0,' + debug_regname(taicpu(p).oper[1]^.reg) + ' (immediate can be represented with just 32 bits)', p);
Result := True;
end;
$1..$FFFFFFFF:
begin
{ Code size reduction by J. Gareth "Kit" Moreton }
{ change 64-bit register to 32-bit register to reduce code size (upper 32 bits will be set to zero) }
case taicpu(p).opsize of
S_Q:
begin
RegName := debug_regname(taicpu(p).oper[1]^.reg); { 64-bit register name }
Value := debug_tostr(taicpu(p).oper[0]^.val);
{ The actual optimization }
setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
taicpu(p).changeopsize(S_L);
DebugMsg(SPeepholeOptimization + 'movq $' + Value + ',' + RegName + ' -> movl $' + Value + ',' + debug_regname(taicpu(p).oper[1]^.reg) + ' (immediate can be represented with just 32 bits)', p);
Result := True;
end;
else
{ Do nothing };
end;
{$endif x86_64}
end;
-1:
{ Don't make this optimisation if the CPU flags are required, since OR scrambles them }
if (cs_opt_size in current_settings.optimizerswitches) and
(taicpu(p).opsize <> S_B) and
not (RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
begin
{ change "mov $-1,%reg" into "or $-1,%reg" }
{ NOTES:
- No size saving is made when changing a Word-sized assignment unless the register is AX (smaller encoding)
- This operation creates a false dependency on the register, so only do it when optimising for size
- It is possible to set memory operands using this method, but this creates an even greater false dependency, so don't do this at all
}
taicpu(p).opcode := A_OR;
Result := True;
end;
else
{ Do nothing };
end;
end;
end;
{ Post-peephole optimisation for AND: AndMovz2AndCwtl.
  Detect:
      andw x, %ax (0 <= x < $8000)
      ...
      movzwl %ax,%eax
  Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax).
  The AND guarantees bit 15 of %ax is clear, so zero- and sign-extension
  produce the same result here.
  Mirrors PostPeepholeOptShr; keep the two routines consistent. }
function TX86AsmOptimizer.PostPeepholeOptAnd(var p : tai) : boolean;
  var
    hp1: tai;
  begin
    { Fix: "Result := False" was previously fused onto the same line as
      the if-statement, unlike the identical layout in PostPeepholeOptShr }
    Result := False;
    if MatchOpType(taicpu(p), top_const, top_reg) and
      (taicpu(p).oper[1]^.reg = NR_AX) and { This is also enough to determine that opsize = S_W }
      ((taicpu(p).oper[0]^.val and $7FFF) = taicpu(p).oper[0]^.val) and
      GetNextInstructionUsingReg(p, hp1, NR_EAX) and
      MatchInstruction(hp1, A_MOVZX, [S_WL]) and
      MatchOperand(taicpu(hp1).oper[0]^, NR_AX) and
      MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) then
      begin
        DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via AndMovz2AndCwtl)', hp1);
        { CWDE takes no operands; clear them out }
        taicpu(hp1).opcode := A_CWDE;
        taicpu(hp1).clearop(0);
        taicpu(hp1).clearop(1);
        taicpu(hp1).ops := 0;
        { A change was made, but not with p, so move forward 1 }
        p := tai(p.Next);
        Result := True;
      end;
  end;
{ Post-peephole optimisation for MOVSX/MOVSXD: replaces the accumulator
  sign-extension forms with their dedicated one-byte(-shorter) opcodes:
      movswl %ax,%eax   -> cwtl (CWDE)
      movslq %eax,%rax  -> cdqe (CDQE, x86_64 only)
  NOTE: Don't convert movsbw %al,%ax to cbw, because cbw and cwde
  refer to the same opcode and depend only on the assembler's
  current operand-size attribute. [Kit] }
function TX86AsmOptimizer.PostPeepholeOptMOVSX(var p : tai) : boolean;
  begin
    Result := False;
    if not MatchOpType(taicpu(p), top_reg, top_reg) then
      Exit;
    case taicpu(p).opsize of
      S_WL:
        if (taicpu(p).oper[0]^.reg = NR_AX) and
           (taicpu(p).oper[1]^.reg = NR_EAX) then
          begin
            DebugMsg(SPeepholeOptimization + 'Converted movswl %ax,%eax to cwtl', p);
            { CWDE takes no operands; clear them out }
            taicpu(p).opcode := A_CWDE;
            taicpu(p).clearop(0);
            taicpu(p).clearop(1);
            taicpu(p).ops := 0;
            Result := True;
          end;
{$ifdef x86_64}
      S_LQ:
        if (taicpu(p).oper[0]^.reg = NR_EAX) and
           (taicpu(p).oper[1]^.reg = NR_RAX) then
          begin
            DebugMsg(SPeepholeOptimization + 'Converted movslq %eax,%rax to cltq', p);
            { CDQE takes no operands; clear them out }
            taicpu(p).opcode := A_CDQE;
            taicpu(p).clearop(0);
            taicpu(p).clearop(1);
            taicpu(p).ops := 0;
            Result := True;
          end;
{$endif x86_64}
      else
        ;
    end;
  end;
function TX86AsmOptimizer.PostPeepholeOptShr(var p : tai) : boolean;
  var
    hp1: tai;
  begin
    { Detect:
        shr x, %ax  (x > 0)
        ...
        movzwl %ax,%eax

      Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax).
      A logical right shift by at least 1 clears bit 15 of %ax, so the sign
      extension performed by cwtl matches the original zero extension.
    }
    Result := False;

    { The SHR must shift %ax by a non-zero constant (targeting %ax is also
      enough to determine that the operand size is S_W) }
    if not MatchOpType(taicpu(p), top_const, top_reg) or
      (taicpu(p).oper[1]^.reg <> NR_AX) or
      (taicpu(p).oper[0]^.val <= 0) then
      Exit;

    if GetNextInstructionUsingReg(p, hp1, NR_EAX) and
      MatchInstruction(hp1, A_MOVZX, [S_WL]) and
      MatchOperand(taicpu(hp1).oper[0]^, NR_AX) and
      MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) then
      begin
        DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1);
        taicpu(hp1).opcode := A_CWDE;
        taicpu(hp1).clearop(0);
        taicpu(hp1).clearop(1);
        taicpu(hp1).ops := 0;
        { A change was made, but not with p, so move forward 1 }
        p := tai(p.Next);
        Result := True;
      end;
  end;
{ Post-peephole pass for ADD and SUB:
  - Rewrites "add/sub $128,dest" as "sub/add $-128,dest" for a shorter
    encoding; if the flags are live afterwards, every reachable flag
    consumer is first checked and then patched (unsigned conditions are
    flipped and ADC/SBB constants bit-inverted) to account for the
    inverted carry flag.
  - Swaps an add/sub-by-register with a following add/sub-by-constant on
    the same destination to minimise a pipeline stall (reverses the pass-1
    "Add swap"/"Sub swap" optimisations when they led nowhere). }
function TX86AsmOptimizer.PostPeepholeOptADDSUB(var p : tai) : boolean;
  var
    hp1, hp2: tai;
    Opposite, SecondOpposite: TAsmOp;
    NewCond: TAsmCond;
  begin
    Result := False;

    { Change:
        add/sub 128,(dest)

      To:
        sub/add -128,(dest)

      This generally takes fewer bytes to encode because -128 can be stored
      in a signed byte, whereas +128 cannot.
    }
    if (taicpu(p).opsize <> S_B) and MatchOperand(taicpu(p).oper[0]^, 128) then
      begin
        if taicpu(p).opcode = A_ADD then
          Opposite := A_SUB
        else
          Opposite := A_ADD;

        { Be careful if the flags are in use, because the CF flag inverts
          when changing from ADD to SUB and vice versa }
        if RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) and
          GetNextInstruction(p, hp1) then
          begin
            TransferUsedRegs(TmpUsedRegs);
            TmpUsedRegs[R_SPECIALREGISTER].Update(tai(p.Next), True);

            { Remember where the flag consumers start so the second pass
              below can revisit exactly the same instructions }
            hp2 := hp1;

            { First pass: scan ahead to check if everything's safe, i.e.
              every instruction that reads the flags while they are live is
              either condition-flippable or an ADC/SBB with a constant }
            while Assigned(hp1) and RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) do
              begin
                if (hp1.typ <> ait_instruction) then
                  { Probably unsafe since the flags are still in use }
                  Exit;

                if MatchInstruction(hp1, A_CALL, A_JMP, A_RET, []) then
                  { Stop searching at an unconditional jump }
                  Break;

                if not
                  (
                    MatchInstruction(hp1, A_ADC, A_SBB, []) and
                    (taicpu(hp1).oper[0]^.typ = top_const) { We need to be able to invert a constant }
                  ) and
                  (taicpu(hp1).condition = C_None) and RegInInstruction(NR_DEFAULTFLAGS, hp1) then
                  { Instruction depends on FLAGS (and is not ADC or SBB); break out }
                  Exit;

                UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
                TmpUsedRegs[R_SPECIALREGISTER].Update(tai(hp1.Next), True);

                { Move to the next instruction }
                GetNextInstruction(hp1, hp1);
              end;

            { Second pass: everything is safe, so actually flip the
              carry-dependent conditions and invert ADC/SBB constants over
              the range validated above (hp2 .. just before hp1) }
            while Assigned(hp2) and (hp2 <> hp1) do
              begin
                NewCond := C_None;
                case taicpu(hp2).condition of
                  C_A, C_NBE:
                    NewCond := C_BE;
                  C_B, C_C, C_NAE:
                    NewCond := C_AE;
                  C_AE, C_NB, C_NC:
                    NewCond := C_B;
                  C_BE, C_NA:
                    NewCond := C_A;
                  else
                    { No change needed };
                end;

                if NewCond <> C_None then
                  begin
                    DebugMsg(SPeepholeOptimization + 'Condition changed from ' + cond2str[taicpu(hp2).condition] + ' to ' + cond2str[NewCond] +
                      ' to accommodate ' + debug_op2str(taicpu(p).opcode) + ' -> ' + debug_op2str(opposite) + ' above', hp2);

                    taicpu(hp2).condition := NewCond;
                  end
                else
                  if MatchInstruction(hp2, A_ADC, A_SBB, []) then
                    begin
                      { Because of the flipping of the carry bit, to ensure
                        the operation remains equivalent, ADC becomes SBB
                        and vice versa, and the constant is not-inverted.

                        If multiple ADCs or SBBs appear in a row, each one
                        changed causes the carry bit to invert, so they all
                        need to be flipped }
                      if taicpu(hp2).opcode = A_ADC then
                        SecondOpposite := A_SBB
                      else
                        SecondOpposite := A_ADC;

                      if taicpu(hp2).oper[0]^.typ <> top_const then
                        { Should have broken out of this optimisation already }
                        InternalError(2021112901);

                      DebugMsg(SPeepholeOptimization + debug_op2str(taicpu(hp2).opcode) + debug_opsize2str(taicpu(hp2).opsize) + ' $' + debug_tostr(taicpu(hp2).oper[0]^.val) + ',' + debug_operstr(taicpu(hp2).oper[1]^) + ' -> ' +
                        debug_op2str(SecondOpposite) + debug_opsize2str(taicpu(hp2).opsize) + ' $' + debug_tostr(not taicpu(hp2).oper[0]^.val) + ',' + debug_operstr(taicpu(hp2).oper[1]^) + ' to accommodate inverted carry bit', hp2);

                      { Bit-invert the constant (effectively equivalent to "-1 - val") }
                      taicpu(hp2).opcode := SecondOpposite;
                      taicpu(hp2).oper[0]^.val := not taicpu(hp2).oper[0]^.val;
                    end;

                { Move to the next instruction }
                GetNextInstruction(hp2, hp2);
              end;

            { The second pass must cover exactly the instructions the first
              pass validated }
            if (hp2 <> hp1) then
              InternalError(2021111501);
          end;

        DebugMsg(SPeepholeOptimization + debug_op2str(taicpu(p).opcode) + debug_opsize2str(taicpu(p).opsize) + ' $128,' + debug_operstr(taicpu(p).oper[1]^) + ' changed to ' +
          debug_op2str(opposite) + debug_opsize2str(taicpu(p).opsize) + ' $-128,' + debug_operstr(taicpu(p).oper[1]^) + ' to reduce instruction size', p);

        taicpu(p).opcode := Opposite;
        taicpu(p).oper[0]^.val := -128;

        { No further optimisations can be made on this instruction, so move
          onto the next one to save time }
        p := tai(p.Next);
        UpdateUsedRegs(p);
        Result := True;
        Exit;
      end;

    { Detect:
        add/sub %reg2,(dest)
        add/sub x,    (dest)

      (dest can be a register or a reference)

      Swap the instructions to minimise a pipeline stall. This reverses the
      "Add swap" and "Sub swap" optimisations done in pass 1 if no new
      optimisations could be made.
    }
    if (taicpu(p).oper[0]^.typ = top_reg) and
      not RegInOp(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^) and
      (
        (
          (taicpu(p).oper[1]^.typ = top_reg) and
          { We can try searching further ahead if we're writing to a register }
          GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.reg)
        ) or
        (
          (taicpu(p).oper[1]^.typ = top_ref) and
          GetNextInstruction(p, hp1)
        )
      ) and
      MatchInstruction(hp1, A_ADD, A_SUB, [taicpu(p).opsize]) and
      (taicpu(hp1).oper[0]^.typ = top_const) and
      MatchOperand(taicpu(p).oper[1]^, taicpu(hp1).oper[1]^) then
      begin
        { Make doubly sure the flags aren't in use because the order of additions may affect them }
        TransferUsedRegs(TmpUsedRegs);
        UpdateUsedRegs(TmpUsedRegs, tai(p.next));

        hp2 := p;
        { Below -O3, hp1 immediately follows p; otherwise track register
          usage across the gap between p and hp1 }
        while not (cs_opt_level3 in current_settings.optimizerswitches) and
          GetNextInstruction(hp2, hp2) and (hp2 <> hp1) do
          UpdateUsedRegs(TmpUsedRegs, tai(hp2.next));

        if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then
          begin
            asml.remove(hp1);
            asml.InsertBefore(hp1, p);
            DebugMsg(SPeepholeOptimization + 'Add/Sub swap 2 done', hp1);
            Result := True;
          end;
      end;
  end;
function TX86AsmOptimizer.PostPeepholeOptCmp(var p : tai) : Boolean;
  begin
    { Change "cmp $0, %reg" to "test %reg, %reg", which has a shorter
      encoding while setting the flags the same way for a zero comparison }
    Result := False;

    if not MatchOpType(taicpu(p), top_const, top_reg) or
      (taicpu(p).oper[0]^.val <> 0) then
      Exit;

    taicpu(p).opcode := A_TEST;
    taicpu(p).loadreg(0, taicpu(p).oper[1]^.reg);
    Result := True;
  end;
{ Post-peephole pass for TEST and OR:
  - Removes a redundant "test/or %y,%y" (or "test $-1,%y") when the
    preceding instruction already set the flags appropriately for the
    following SETcc/Jcc/CMOVcc.
  - Normalises "test $-1,%reg" to "test %reg,%reg", and rewrites
    "or %reg,%reg" as "test %reg,%reg" to avoid a false dependency. }
function TX86AsmOptimizer.PostPeepholeOptTestOr(var p : tai) : Boolean;
  var
    IsTestConstX : Boolean;
    hp1,hp2 : tai;
  begin
    Result:=false;
    { removes the line marked with (x) from the sequence
      and/or/xor/add/sub/... $x, %y
      test/or %y, %y  |  test $-1, %y    (x)
      j(n)z _Label
      as the first instruction already adjusts the ZF
      %y operand may also be a reference }
    IsTestConstX:=(taicpu(p).opcode=A_TEST) and
      MatchOperand(taicpu(p).oper[0]^,-1);
    if (OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) or IsTestConstX) and
      GetLastInstruction(p, hp1) and
      (tai(hp1).typ = ait_instruction) and
      GetNextInstruction(p,hp2) and
      MatchInstruction(hp2,A_SETcc,A_Jcc,A_CMOVcc,[]) then
      { Which flags hp1 leaves valid depends on its opcode, hence the case
        analysis below }
      case taicpu(hp1).opcode Of
        A_ADD, A_SUB, A_OR, A_XOR, A_AND,
        { These two instructions set the zero flag if the result is zero }
        A_POPCNT, A_LZCNT:
          begin
            if (
              { With POPCNT, an input of zero will set the zero flag
                because the population count of zero is zero }
              (taicpu(hp1).opcode = A_POPCNT) and
              (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) and
              (
                OpsEqual(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^) or
                { Faster than going through the second half of the 'or'
                  condition below }
                OpsEqual(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^)
              )
            ) or (
              OpsEqual(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^) and
              { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
              { and in case of carry for A(E)/B(E)/C/NC }
              (
                (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) or
                (
                  (taicpu(hp1).opcode <> A_ADD) and
                  (taicpu(hp1).opcode <> A_SUB) and
                  (taicpu(hp1).opcode <> A_LZCNT)
                )
              )
            ) then
              begin
                { The test/or is redundant - remove it }
                RemoveCurrentP(p, hp2);
                Result:=true;
                Exit;
              end;
          end;
        A_SHL, A_SAL, A_SHR, A_SAR:
          begin
            if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) and
              { SHL/SAL/SHR/SAR with a value of 0 do not change the flags }
              { therefore, it's only safe to do this optimization for     }
              { shifts by a (nonzero) constant                            }
              (taicpu(hp1).oper[0]^.typ = top_const) and
              (taicpu(hp1).oper[0]^.val <> 0) and
              { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
              { and in case of carry for A(E)/B(E)/C/NC }
              (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
              begin
                RemoveCurrentP(p, hp2);
                Result:=true;
                Exit;
              end;
          end;
        A_DEC, A_INC, A_NEG:
          begin
            if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) and
              { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
              { and in case of carry for A(E)/B(E)/C/NC }
              (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
              begin
                RemoveCurrentP(p, hp2);
                Result:=true;
                Exit;
              end;
          end
        else
          ;
      end; { case }
    { change "test  $-1,%reg" into "test %reg,%reg" }
    if IsTestConstX and (taicpu(p).oper[1]^.typ=top_reg) then
      taicpu(p).loadoper(0,taicpu(p).oper[1]^);
    { Change "or %reg,%reg" to "test %reg,%reg" as OR generates a false dependency }
    if MatchInstruction(p, A_OR, []) and
      { Can only match if they're both registers }
      MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) then
      begin
        DebugMsg(SPeepholeOptimization + 'or %reg,%reg -> test %reg,%reg to remove false dependency (Or2Test)', p);
        taicpu(p).opcode := A_TEST;
        { No need to set Result to True, as we've done all the optimisations we can }
      end;
  end;
{ Post-peephole pass for CALL:
  - (i386 only, pre-Pentium2, non-PIC) turns "call x; jmp y" into
    "push y; jmp x", so the callee's RET transfers straight to y.
  - Turns "call x" followed by RET (optionally with a VZEROUPPER in
    between) into "jmp x", but only at -O4 because it destroys stack back
    traces; if the callee is marked noreturn, only the RET is removed. }
function TX86AsmOptimizer.PostPeepholeOptCall(var p : tai) : Boolean;
  var
    hp1,hp3 : tai;
{$ifndef x86_64}
    hp2 : taicpu;
{$endif x86_64}
  begin
    Result:=false;
    hp3:=nil;
{$ifndef x86_64}
    { don't do this on modern CPUs, this really hurts them due to
      broken call/ret pairing }
    if (current_settings.optimizecputype < cpu_Pentium2) and
      not(cs_create_pic in current_settings.moduleswitches) and
      GetNextInstruction(p, hp1) and
      MatchInstruction(hp1,A_JMP,[S_NO]) and
      MatchOpType(taicpu(hp1),top_ref) and
      (taicpu(hp1).oper[0]^.ref^.refaddr=addr_full) then
      begin
        { Push the jump target so the callee's RET lands on it }
        hp2 := taicpu.Op_sym(A_PUSH,S_L,taicpu(hp1).oper[0]^.ref^.symbol);
        InsertLLItem(p.previous, p, hp2);
        taicpu(p).opcode := A_JMP;
        taicpu(p).is_jmp := true;
        RemoveInstruction(hp1);
        Result:=true;
      end
    else
{$endif x86_64}
    { replace
        call procname
        ret
      by
        jmp procname
      but do it only on level 4 because it destroys stack back traces
      else if the subroutine is marked as no return, remove the ret
    }
    if ((cs_opt_level4 in current_settings.optimizerswitches) or
      (po_noreturn in current_procinfo.procdef.procoptions)) and
      GetNextInstruction(p, hp1) and
      (MatchInstruction(hp1,A_RET,[S_NO]) or
       (MatchInstruction(hp1,A_VZEROUPPER,[S_NO]) and
        SetAndTest(hp1,hp3) and
        GetNextInstruction(hp1,hp1) and
        MatchInstruction(hp1,A_RET,[S_NO])
       )
      ) and
      (taicpu(hp1).ops=0) then
      begin
        if (cs_opt_level4 in current_settings.optimizerswitches) and
          { we might destroy stack alignment here if we do not do a call }
          (target_info.stackalign<=sizeof(SizeUInt)) then
          begin
            taicpu(p).opcode := A_JMP;
            taicpu(p).is_jmp := true;
            DebugMsg(SPeepholeOptimization + 'CallRet2Jmp done',p);
          end
        else
          DebugMsg(SPeepholeOptimization + 'CallRet2Call done',p);
        { The RET is now redundant (or unreachable for noreturn callees) }
        RemoveInstruction(hp1);
        { If a VZEROUPPER sat between the CALL and RET, move it before p
          so it is still executed }
        if Assigned(hp3) then
          begin
            AsmL.Remove(hp3);
            AsmL.InsertBefore(hp3,p)
          end;
        Result:=true;
      end;
  end;
{ Post-peephole pass for MOVZX whose source and destination share the same
  super-register (e.g. movzbl %al,%eax). Moves a following CMP/TEST, or a
  following narrow MOV of the source register, to before the MOVZX to
  avoid a false dependency, and deletes the MOVZX entirely when the
  extended register is not used afterwards. }
function TX86AsmOptimizer.PostPeepholeOptMovzx(var p : tai) : Boolean;

  { Returns True if Val fits within the source size of the MOVZX, so a
    comparison against it can safely be performed at the smaller size }
  function ConstInRange(const Val: TCGInt; const OpSize: TOpSize): Boolean;
    begin
      case OpSize of
        S_B, S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
          Result := (Val <= $FF) and (Val >= -128);
        S_W, S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
          Result := (Val <= $FFFF) and (Val >= -32768);
        S_L{$ifdef x86_64}, S_LQ{$endif x86_64}:
          Result := (Val <= $FFFFFFFF) and (Val >= -2147483648);
        else
          Result := True;
      end;
    end;

  var
    hp1, hp2 : tai;
    SizeChange: Boolean;
    PreMessage: string;
  begin
    Result := False;

    if (taicpu(p).oper[0]^.typ = top_reg) and
      SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
      GetNextInstruction(p, hp1) and (hp1.typ = ait_instruction) then
      begin
        { Change (using movzbl %al,%eax as an example):

            movzbl %al, %eax     movzbl %al, %eax
            cmpl   x,   %eax     testl  %eax,%eax

          To:
            cmpb   x,   %al      testb  %al,  %al  (Move one back to avoid a false dependency)
            movzbl %al, %eax     movzbl %al, %eax

          Smaller instruction and minimises pipeline stall as the CPU
          doesn't have to wait for the register to get zero-extended. [Kit]

          Also allow if the smaller of the two registers is being checked,
          as this still removes the false dependency.
        }
        if
          (
            (
              (taicpu(hp1).opcode = A_CMP) and MatchOpType(taicpu(hp1), top_const, top_reg) and
              ConstInRange(taicpu(hp1).oper[0]^.val, taicpu(p).opsize)
            ) or (
              { If MatchOperand returns True, they must both be registers }
              (taicpu(hp1).opcode = A_TEST) and MatchOperand(taicpu(hp1).oper[0]^, taicpu(hp1).oper[1]^)
            )
          ) and
          (reg2opsize(taicpu(hp1).oper[1]^.reg) <= reg2opsize(taicpu(p).oper[1]^.reg)) and
          SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) then
          begin
            PreMessage := debug_op2str(taicpu(hp1).opcode) + debug_opsize2str(taicpu(hp1).opsize) + ' ' + debug_operstr(taicpu(hp1).oper[0]^) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' -> ' + debug_op2str(taicpu(hp1).opcode);

            { Move the comparison to before the MOVZX }
            asml.Remove(hp1);
            asml.InsertBefore(hp1, p);

            { Swap instructions in the case of cmp 0,%reg or test %reg,%reg }
            if (taicpu(hp1).opcode = A_TEST) or (taicpu(hp1).oper[0]^.val = 0) then
              begin
                taicpu(hp1).opcode := A_TEST;
                taicpu(hp1).loadreg(0, taicpu(p).oper[0]^.reg);
              end;

            taicpu(hp1).oper[1]^.reg := taicpu(p).oper[0]^.reg;

            { Shrink the comparison to the MOVZX's source size }
            case taicpu(p).opsize of
              S_BW, S_BL:
                begin
                  SizeChange := taicpu(hp1).opsize <> S_B;
                  taicpu(hp1).changeopsize(S_B);
                end;
              S_WL:
                begin
                  SizeChange := taicpu(hp1).opsize <> S_W;
                  taicpu(hp1).changeopsize(S_W);
                end
              else
                InternalError(2020112701);
            end;

            UpdateUsedRegs(tai(p.Next));

            { Check if the register is used afterwards - if not, we can
              remove the movzx instruction completely }
            if not RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, p, UsedRegs) then
              begin
                { Hp1 is a better position than p for debugging purposes }
                DebugMsg(SPeepholeOptimization + 'Movzx2Nop 4a', hp1);
                RemoveCurrentp(p, hp1);
                Result := True;
              end;

            if SizeChange then
              DebugMsg(SPeepholeOptimization + PreMessage +
                debug_opsize2str(taicpu(hp1).opsize) + ' ' + debug_operstr(taicpu(hp1).oper[0]^) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' (smaller and minimises pipeline stall - MovzxCmp2CmpMovzx)', hp1)
            else
              DebugMsg(SPeepholeOptimization + 'MovzxCmp2CmpMovzx', hp1);

            Exit;
          end;

        { Change (using movzwl %ax,%eax as an example):

            movzwl %ax, %eax
            movb   %al, (dest)  (Register is smaller than read register in movz)

          To:
            movb   %al, (dest)  (Move one back to avoid a false dependency)
            movzwl %ax, %eax
        }
        if (taicpu(hp1).opcode = A_MOV) and
          (taicpu(hp1).oper[0]^.typ = top_reg) and
          not RegInOp(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^) and
          SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[0]^.reg) and
          (reg2opsize(taicpu(hp1).oper[0]^.reg) <= reg2opsize(taicpu(p).oper[0]^.reg)) then
          begin
            DebugMsg(SPeepholeOptimization + 'MovzxMov2MovMovzx', hp1);

            hp2 := tai(hp1.Previous); { Effectively the old position of hp1 }

            asml.Remove(hp1);
            asml.InsertBefore(hp1, p);

            { Extend the destination register's live range over the gap }
            if taicpu(hp1).oper[1]^.typ = top_reg then
              AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, hp2, UsedRegs);

            { Check if the register is used afterwards - if not, we can
              remove the movzx instruction completely }
            if not RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg, p, UsedRegs) then
              begin
                { Hp1 is a better position than p for debugging purposes }
                DebugMsg(SPeepholeOptimization + 'Movzx2Nop 4b', hp1);
                RemoveCurrentp(p, hp1);
                Result := True;
              end;

            Exit;
          end;
      end;
  end;
{$ifdef x86_64}
{ Drops the operand size of a self-XOR from 64 to 32 bit: zeroing the
  32-bit register also zeroes the full 64-bit register, and the 32-bit
  form avoids the REX.W prefix, saving a byte. }
function TX86AsmOptimizer.PostPeepholeOptXor(var p : tai) : Boolean;
  var
    PreMessage, RegName: string;
  begin
    { Code size reduction by J. Gareth "Kit" Moreton }
    { change "xorq %reg,%reg" to "xorl %reg,%reg" for %rax, %rcx, %rdx, %rbx, %rsi, %rdi, %rbp and %rsp,
      as this removes the REX prefix }

    Result := False;

    if not OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
      Exit;

    if taicpu(p).oper[0]^.typ <> top_reg then
      { Should be impossible if both operands were equal, since one of XOR's operands must be a register }
      InternalError(2018011500);

    case taicpu(p).opsize of
      S_Q:
        begin
          RegName := debug_regname(taicpu(p).oper[0]^.reg); { 64-bit register name }
          PreMessage := 'xorq ' + RegName + ',' + RegName + ' -> xorl ';

          { The actual optimization }
          setsubreg(taicpu(p).oper[0]^.reg, R_SUBD);
          setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
          taicpu(p).changeopsize(S_L);

          RegName := debug_regname(taicpu(p).oper[0]^.reg); { 32-bit register name }

          DebugMsg(SPeepholeOptimization + PreMessage + RegName + ',' + RegName + ' (32-bit register recommended when zeroing 64-bit counterpart)', p);
        end;
      else
        ;
    end;
  end;
{$endif}
function TX86AsmOptimizer.PostPeepholeOptVPXOR(var p : tai) : Boolean;
  var
    SrcReg: TRegister;
  begin
    { Turn "vpxor %ymmreg2,%ymmreg2,%ymmreg1" into "vpxor %xmmreg2,%xmmreg2,%xmmreg1"
      Smaller encoding and slightly faster on some platforms (also works for
      ZMM-sized registers) }
    Result := False;

    if not (taicpu(p).opsize in [S_YMM, S_ZMM]) or
      not MatchOpType(taicpu(p), top_reg, top_reg, top_reg) then
      Exit;

    { Both source operands must be the same register for this to be a
      zero-setting idiom }
    SrcReg := taicpu(p).oper[0]^.reg;
    if taicpu(p).oper[1]^.reg <> SrcReg then
      Exit;

    taicpu(p).changeopsize(S_XMM);
    setsubreg(taicpu(p).oper[2]^.reg, R_SUBMMX);

    if cs_opt_size in current_settings.optimizerswitches then
      begin
        { Change input registers to %xmm0 to reduce size. Note that
          there's a risk of a false dependency doing this, so only
          optimise for size here }
        SrcReg := NR_XMM0;
        DebugMsg(SPeepholeOptimization + 'Changed zero-setting vpxor from Y/ZMM to XMM and changed input registers to %xmm0 to reduce size', p);
      end
    else
      begin
        setsubreg(SrcReg, R_SUBMMX);
        DebugMsg(SPeepholeOptimization + 'Changed zero-setting vpxor from Y/ZMM to XMM to reduce size and increase efficiency', p);
      end;

    taicpu(p).oper[0]^.reg := SrcReg;
    taicpu(p).oper[1]^.reg := SrcReg;
    Result := True;
  end;
class procedure TX86AsmOptimizer.OptimizeRefs(var p: taicpu);
  var
    i: Integer;
  begin
    { Run optimize_ref (non-aggressive mode) over every memory-reference
      operand of the instruction }
    for i := 0 to p.ops - 1 do
      if p.oper[i]^.typ = top_ref then
        optimize_ref(p.oper[i]^.ref^, False);
  end;
end.