From a95d54b2bbbfdd420d08f28e8143e526dd7bba0f Mon Sep 17 00:00:00 2001 From: "J. Gareth \"Curious Kit\" Moreton" Date: Thu, 17 Feb 2022 19:07:27 +0000 Subject: [PATCH 1/9] * x86: New sliding window-based CSE peephole optimisation to reduce repeated pointer deallocations. --- compiler/aoptbase.pas | 16 + compiler/aoptobj.pas | 92 +++++ compiler/globtype.pas | 8 +- compiler/i386/aoptcpu.pas | 50 ++- compiler/utils/ppuutils/ppudump.pp | 3 +- compiler/x86/aoptx86.pas | 638 ++++++++++++++++++++++++++++- compiler/x86_64/aoptcpu.pas | 25 ++ 7 files changed, 824 insertions(+), 8 deletions(-) diff --git a/compiler/aoptbase.pas b/compiler/aoptbase.pas index e290766e9e..c3b65422f5 100644 --- a/compiler/aoptbase.pas +++ b/compiler/aoptbase.pas @@ -99,6 +99,9 @@ unit aoptbase; { returns true if reg is modified by any instruction between p1 and p2 } function RegModifiedBetween(reg: TRegister; p1, p2: tai): Boolean; + { returns true if reg1 or reg2 is modified by any instruction between p1 and p2 } + function RegPairModifiedBetween(reg1,reg2: TRegister; p1, p2: tai): Boolean; + { returns true if reg is loaded with a new value by hp } function RegLoadedWithNewValue(reg: tregister; hp: tai): boolean; Virtual; @@ -318,6 +321,19 @@ unit aoptbase; end; + Function TAOptBase.RegPairModifiedBetween(reg1,reg2 : TRegister;p1,p2 : tai) : Boolean; + Begin + Result:=false; + while assigned(p1) and assigned(p2) and GetNextInstruction(p1,p1) and (p1<>p2) do + if ((reg1<>NR_NO) and RegModifiedByInstruction(reg1,p1)) or + ((reg2<>NR_NO) and RegModifiedByInstruction(reg2,p1)) then + begin + Result:=true; + exit; + end; + end; + + function TAoptBase.RegLoadedWithNewValue(reg : tregister; hp : tai) : boolean; begin result:=false; diff --git a/compiler/aoptobj.pas b/compiler/aoptobj.pas index 1d21f60c34..9050f342cb 100644 --- a/compiler/aoptobj.pas +++ b/compiler/aoptobj.pas @@ -273,6 +273,7 @@ Unit AoptObj; Procedure ClearUsedRegs; Procedure UpdateUsedRegs(p : Tai); {$ifdef USEINLINE}inline;{$endif 
USEINLINE} class procedure UpdateUsedRegs(var Regs: TAllUsedRegs; p: Tai); static; + class procedure UpdateUsedRegsIgnoreNew(var Regs: TAllUsedRegs; p: Tai); static; { UpdateUsedRegsBetween updates the given TUsedRegs from p1 to p2 exclusive, calling GetNextInstruction to move between instructions and sending p1.Next to UpdateUsedRegs } @@ -284,8 +285,14 @@ Unit AoptObj; function UpdateUsedRegsAndOptimize(p : Tai): Tai; Function CopyUsedRegs(var dest : TAllUsedRegs) : boolean; + class function CopyUsedRegs(var source: TAllUsedRegs; var dest : TAllUsedRegs) : boolean; static; + + { Merges the registers marked as used into UsedRegs } + procedure MergeUsedRegs(const Regs : TAllUsedRegs); + procedure RestoreUsedRegs(const Regs : TAllUsedRegs); procedure TransferUsedRegs(var dest: TAllUsedRegs); + class procedure TransferUsedRegs(var source: TAllUsedRegs; var dest: TAllUsedRegs); static; class procedure ReleaseUsedRegs(const regs : TAllUsedRegs); static; class function RegInUsedRegs(reg : TRegister;var regs : TAllUsedRegs) : boolean; static; class procedure IncludeRegInUsedRegs(reg : TRegister;var regs : TAllUsedRegs); static; {$ifdef USEINLINE}inline;{$endif USEINLINE} @@ -441,6 +448,14 @@ Unit AoptObj; { Actually updates a used register } class procedure UpdateReg(var Regs : TAllUsedRegs; p: tai_regalloc); static; {$ifdef USEINLINE}inline;{$endif USEINLINE} + + { Called whenever a new iteration of pass 1 starts. Override for + platform-specific behaviour } + procedure Pass1Initialize; virtual; + + { Called whenever a new iteration of pass 2 starts. 
Override for + platform-specific behaviour } + procedure Pass2Initialize; virtual; private procedure DebugMsg(const s: string; p: tai); @@ -1151,6 +1166,37 @@ Unit AoptObj; end; + class procedure TAOptObj.UpdateUsedRegsIgnoreNew(var Regs: TAllUsedRegs; p: Tai); + begin + { this code is based on TUsedRegs.Update to avoid multiple passes through the asmlist, + the code is duplicated here } + repeat + while assigned(p) and + ((p.typ in (SkipInstr - [ait_RegAlloc])) or + ((p.typ = ait_label) and + labelCanBeSkipped(tai_label(p))) or + ((p.typ = ait_marker) and + (tai_Marker(p).Kind in [mark_AsmBlockEnd,mark_NoLineInfoStart,mark_NoLineInfoEnd]))) do + p := tai(p.next); + while assigned(p) and + (p.typ=ait_RegAlloc) Do + begin + prefetch(pointer(p.Next)^); + case tai_regalloc(p).ratype of + ra_dealloc : + Exclude(Regs[getregtype(tai_regalloc(p).reg)].UsedRegs, getsupreg(tai_regalloc(p).reg)); + else + ; + end; + p := tai(p.next); + end; + until not(assigned(p)) or + (not(p.typ in SkipInstr) and + not((p.typ = ait_label) and + labelCanBeSkipped(tai_label(p)))); + end; + + class procedure TAOptObj.UpdateUsedRegsBetween(var Regs: TAllUsedRegs; p1, p2: Tai); begin { this code is based on TUsedRegs.Update to avoid multiple passes through the asmlist, @@ -1192,6 +1238,26 @@ Unit AoptObj; end; + class function TAOptObj.CopyUsedRegs(var source: TAllUsedRegs; var dest: TAllUsedRegs): boolean; + var + i : TRegisterType; + begin + Result:=true; + for i:=low(TRegisterType) to high(TRegisterType) do + dest[i]:=TUsedRegs.Create_Regset(i,source[i].GetUsedRegs); + end; + + + { Merges the registers marked as used into UsedRegs } + procedure TAOptObj.MergeUsedRegs(const Regs : TAllUsedRegs); + var + i : TRegisterType; + begin + for i:=low(TRegisterType) to high(TRegisterType) do + UsedRegs[i].UsedRegs := UsedRegs[i].UsedRegs + Regs[i].UsedRegs; + end; + + procedure TAOptObj.RestoreUsedRegs(const Regs: TAllUsedRegs); var i : TRegisterType; @@ -1216,6 +1282,18 @@ Unit AoptObj; end; + class 
procedure TAOptObj.TransferUsedRegs(var source: TAllUsedRegs; var dest: TAllUsedRegs); static; + var + i : TRegisterType; + begin + { Note that the constructor Create_Regset is being called as a regular + method - it is not instantiating a new object. This is because it is + the only published means to modify the internal state en-masse. [Kit] } + for i:=low(TRegisterType) to high(TRegisterType) do + dest[i].Create_Regset(i, source[i].GetUsedRegs); + end; + + class procedure TAOptObj.ReleaseUsedRegs(const regs: TAllUsedRegs); var i : TRegisterType; @@ -2695,6 +2773,7 @@ Unit AoptObj; p := StartPoint; FirstInstruction := True; ClearUsedRegs; + Pass1Initialize; while Assigned(p) and (p <> BlockEnd) Do begin @@ -2768,6 +2847,7 @@ Unit AoptObj; stoploop := True; p := BlockStart; ClearUsedRegs; + Pass2Initialize; while (p <> BlockEnd) Do begin prefetch(pointer(p.Next)^); @@ -2830,6 +2910,18 @@ Unit AoptObj; end; + procedure TAOptObj.Pass1Initialize; + begin + { Do nothing by default } + end; + + + procedure TAOptObj.Pass2Initialize; + begin + { Do nothing by default } + end; + + procedure TAOptObj.Debug_InsertInstrRegisterDependencyInfo; var p: tai; diff --git a/compiler/globtype.pas b/compiler/globtype.pas index 32a2e1ce93..ae7e79cbf8 100644 --- a/compiler/globtype.pas +++ b/compiler/globtype.pas @@ -370,7 +370,8 @@ interface cs_opt_use_load_modify_store, cs_opt_unused_para, cs_opt_consts, - cs_opt_forloop + cs_opt_forloop, + cs_opt_asmcse ); toptimizerswitches = set of toptimizerswitch; @@ -444,7 +445,8 @@ interface 'ORDERFIELDS','FASTMATH','DEADVALUES','REMOVEEMPTYPROCS', 'CONSTPROP', 'DEADSTORE','FORCENOSTACKFRAME','USELOADMODIFYSTORE', - 'UNUSEDPARA','CONSTS','FORLOOP' + 'UNUSEDPARA','CONSTS','FORLOOP', + 'ASMCSE' ); WPOptimizerSwitchStr : array [twpoptimizerswitch] of string[14] = ( 'DEVIRTCALLS','OPTVMTS','SYMBOLLIVENESS' @@ -477,7 +479,7 @@ interface genericlevel2optimizerswitches = [cs_opt_level2,cs_opt_remove_empty_proc,cs_opt_unused_para]; 
genericlevel3optimizerswitches = [cs_opt_level3,cs_opt_constant_propagate,cs_opt_nodedfa,cs_opt_loopstrength {$ifndef llvm},cs_opt_use_load_modify_store{$endif}, - cs_opt_loopunroll,cs_opt_forloop]; + cs_opt_loopunroll,cs_opt_forloop,cs_opt_asmcse]; genericlevel4optimizerswitches = [cs_opt_level4,cs_opt_reorder_fields,cs_opt_dead_values,cs_opt_fastmath]; { whole program optimizations whose information generation requires diff --git a/compiler/i386/aoptcpu.pas b/compiler/i386/aoptcpu.pas index bb216d68b2..58a28b9d34 100644 --- a/compiler/i386/aoptcpu.pas +++ b/compiler/i386/aoptcpu.pas @@ -26,6 +26,7 @@ unit aoptcpu; {$ifdef EXTDEBUG} {$define DEBUG_AOPTCPU} +{$define DEBUG_AOPTCSE} {$endif EXTDEBUG} Interface @@ -37,6 +38,7 @@ unit aoptcpu; Type TCpuAsmOptimizer = class(TX86AsmOptimizer) + procedure DebugSWMsg(const s : string; p : tai);inline; function PrePeepHoleOptsCpu(var p: tai): boolean; override; function PeepHoleOptPass1Cpu(var p: tai): boolean; override; function PeepHoleOptPass2Cpu(var p: tai): boolean; override; @@ -49,10 +51,10 @@ unit aoptcpu; Implementation uses - verbose,globtype,globals, + verbose,globtype,globals,cutils, cpuinfo, aasmcpu, - aoptutils, + aoptbase,aoptutils, aasmcfi, procinfo, cgutils, @@ -60,6 +62,26 @@ unit aoptcpu; symsym,symconst; +{$ifdef DEBUG_AOPTCSE} + const + SSlidingWindow: shortstring = 'Assembly CSE: '; + + procedure TCpuAsmOptimizer.DebugSWMsg(const s: string;p : tai); + begin + asml.insertbefore(tai_comment.Create(strpnew(s)), p); + end; +{$else DEBUG_AOPTCSE} + { Empty strings help the optimizer to remove string concatenations that won't + ever appear to the user on release builds. 
[Kit] } + const + SSlidingWindow = ''; + + procedure TCpuAsmOptimizer.DebugSWMsg(const s: string;p : tai);inline; + begin + end; +{$endif DEBUG_AOPTCSE} + + { Checks if the register is a 32 bit general purpose register } function isgp32reg(reg: TRegister): boolean; begin @@ -260,10 +282,34 @@ unit aoptcpu; A_CLC, A_STC: Result:=OptPass1STCCLC(p); + A_CALL: + if (cs_opt_asmcse in current_settings.optimizerswitches) then + begin + DebugSWMsg(SSlidingWindow + 'Reset sliding window upon CALL', p); + ResetSW; + end; else ; end; + + { If an unsafe reference is found, clear the sliding window } + if not Result and + (cs_opt_asmcse in current_settings.optimizerswitches) and + { Saves doing it twice } + (taicpu(p).opcode <> A_CALL) and + IsWriteToMemory(taicpu(p)) then + begin + DebugSWMsg(SSlidingWindow + 'Reset sliding window upon memory write', p); + ResetSW; + end; end; + ait_label: + if (cs_opt_asmcse in current_settings.optimizerswitches) and + not labelCanBeSkipped(tai_label(p)) then + begin + DebugSWMsg(SSlidingWindow + 'Reset sliding window upon finding label', p); + ResetSW; + end; else ; end; diff --git a/compiler/utils/ppuutils/ppudump.pp b/compiler/utils/ppuutils/ppudump.pp index cefc213eba..02ec298572 100644 --- a/compiler/utils/ppuutils/ppudump.pp +++ b/compiler/utils/ppuutils/ppudump.pp @@ -2521,7 +2521,8 @@ const 'cs_opt_use_load_modify_store', 'cs_opt_unused_para', 'cs_opt_consts', - 'cs_opt_forloop' + 'cs_opt_forloop', + 'cs_opt_asmcse' ); var globalswitch : tglobalswitch; diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas index cff6a94300..cdf7d233d5 100644 --- a/compiler/x86/aoptx86.pas +++ b/compiler/x86/aoptx86.pas @@ -24,9 +24,11 @@ unit aoptx86; {$i fpcdefs.inc} { $define DEBUG_AOPTCPU} +{ $define DEBUG_AOPTCSE} {$ifdef EXTDEBUG} {$define DEBUG_AOPTCPU} +{$define DEBUG_AOPTCSE} {$endif EXTDEBUG} interface @@ -34,10 +36,13 @@ unit aoptx86; uses globtype,cclasses, cpubase, - aasmtai,aasmcpu, + aasmtai,aasmcpu,aasmdata, cgbase,cgutils, 
aopt,aoptobj; + const + SLIDING_WINDOW_SIZE = 32; + type TOptsToCheck = ( aoc_MovAnd2Mov_3, @@ -45,11 +50,18 @@ unit aoptx86; aoc_DoPass2JccOpts ); + TSlidingWindowEntry = record + ai: tai; + RegState: TAllUsedRegs; + end; + TX86AsmOptimizer = class(TAsmOptimizer) { some optimizations are very expensive to check, so the pre opt pass can be used to set some flags, depending on the found instructions if it is worth to check a certain optimization } OptsToCheck : set of TOptsToCheck; + constructor create(_AsmL: TAsmList); override; + destructor Destroy; override; function RegLoadedWithNewValue(reg : tregister; hp : tai) : boolean; override; function InstructionLoadsFromReg(const reg : TRegister; const hp : tai) : boolean; override; class function RegReadByInstruction(reg : TRegister; hp : tai) : boolean; static; @@ -97,9 +109,15 @@ unit aoptx86; reference } function RefModifiedBetween(Ref: TReference; RefSize: ASizeInt; p1, p2: tai): Boolean; private + SlidingWindow: array[0..SLIDING_WINDOW_SIZE-1] of TSlidingWindowEntry; + WindowTop: Cardinal; + function SkipSimpleInstructions(var hp1: tai): Boolean; protected + DisableSW: Boolean; + + { Returns true if the target compiler and its settings favour MOVZX over other instruction combinations } class function IsMOVZXAcceptable: Boolean; static; inline; function CheckMovMov2MovMov2(const p, hp1: tai): Boolean; @@ -135,6 +153,8 @@ unit aoptx86; or writes to a global symbol } class function IsRefSafe(const ref: PReference): Boolean; static; + { Returns true if the instruction writes to memory } + class function IsWriteToMemory(const p: taicpu): Boolean; static; { Returns true if the given MOV instruction can be safely converted to CMOV } class function CanBeCMOV(p, cond_p: tai; var RefModified: Boolean) : boolean; static; @@ -155,6 +175,7 @@ unit aoptx86; function FuncMov2Func(var p: tai; const hp1: tai): Boolean; procedure DebugMsg(const s : string; p : tai);inline; + procedure DebugSWMsg(const s : string; p : tai);inline; 
class function IsExitCode(p : tai) : boolean; static; class function isFoldableArithOp(hp1 : taicpu; reg : tregister) : boolean; static; @@ -235,6 +256,32 @@ unit aoptx86; { Processor-dependent reference optimisation } class procedure OptimizeRefs(var p: taicpu); static; + + { Sliding window routines } + procedure ResetSW; + procedure AddToSW(var p: tai); + function FindSWMatch(const p: tai; out RegState: TAllUsedRegs): tai; + + { Allocate all registers marked as used in TrackedRegs between p1 and p2 + (InitialUsedRegs is updated and passed on to AllocRegBetween) } + procedure AllocAllUsedRegsBetween(p1, p2: tai; var TrackedRegs: TAllUsedRegs; var InitialUsedRegs: TAllUsedRegs); + + { Checks the instructions between p1 and p2 to see whether the + registers marked as in use in ForwardTrackedRegs keep their values. + True if none of them is modified in between, False otherwise. } + function CheckSWRegisters(p1, p2: tai; var ForwardTrackedRegs: TAllUsedRegs): Boolean; + + { Step through a sliding-window match with the current instruction and + see how much of a chain can be removed } + function TraceRLE(var p: tai; rle_pointer: tai; var RLERegState: TAllUsedRegs): Boolean; + + { Called whenever a new iteration of pass 1 starts. Override for + platform-specific behaviour } + procedure Pass1Initialize; override; + + { Called whenever a new iteration of pass 2 starts. Override for + platform-specific behaviour } + procedure Pass2Initialize; override; end; function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean; @@ -265,6 +312,17 @@ unit aoptx86; and having an offset } function MatchReferenceWithOffset(const ref : treference;base,index : TRegister) : Boolean; + +{$ifdef DEBUG_AOPTCSE} + const + SSlidingWindow: shortstring = 'Assembly CSE: '; +{$else DEBUG_AOPTCSE} + { Empty strings help the optimizer to remove string concatenations that won't + ever appear to the user on release builds.
[Kit] } + const + SSlidingWindow = ''; +{$endif DEBUG_AOPTCSE} + implementation uses @@ -569,6 +627,29 @@ unit aoptx86; end; + constructor TX86AsmOptimizer.create(_AsmL: TAsmList); + var + X: Integer; + begin + inherited create(_AsmL); + for X := 0 to SLIDING_WINDOW_SIZE - 1 do + begin + SlidingWindow[X].ai := nil; + CreateUsedRegs(SlidingWindow[X].RegState); + end; + + WindowTop := SLIDING_WINDOW_SIZE - 1; + end; + + + destructor TX86AsmOptimizer.Destroy; + begin + { Release any copied register states in the sliding window } + ResetSW; + inherited Destroy; + end; + + function TX86AsmOptimizer.GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean; begin Next:=Current; @@ -1507,6 +1588,17 @@ unit aoptx86; end; {$endif DEBUG_AOPTCPU} +{$ifdef DEBUG_AOPTCSE} + procedure TX86AsmOptimizer.DebugSWMsg(const s: string;p : tai); + begin + asml.insertbefore(tai_comment.Create(strpnew(s)), p); + end; +{$else DEBUG_AOPTCSE} + procedure TX86AsmOptimizer.DebugSWMsg(const s: string;p : tai);inline; + begin + end; +{$endif DEBUG_AOPTCSE} + class function TX86AsmOptimizer.IsMOVZXAcceptable: Boolean; inline; begin {$ifdef x86_64} @@ -2847,6 +2939,32 @@ unit aoptx86; end; + class function TX86AsmOptimizer.IsWriteToMemory(const p: taicpu): Boolean; + var + X: Integer; + begin + Result := True; + if ([ + Ch_WMemEDI, Ch_All + ] * InsProp[p.opcode].Ch) <> [] then + { Implicit memory writes or modifications to the stack pointer are unsafe } + Exit; + + for X := 0 to p.ops - 1 do + if (p.oper[X]^.typ = top_ref) and + ( + ((X = 0) and (([Ch_WOp1, Ch_RWOp1, Ch_MOp1] * InsProp[p.opcode].Ch) <> [])) or + ((X = 1) and (([Ch_WOp2, Ch_RWOp2, Ch_MOp2] * InsProp[p.opcode].Ch) <> [])) or + ((X = 2) and (([Ch_WOp3, Ch_RWOp3, Ch_MOp3] * InsProp[p.opcode].Ch) <> [])) or + ((X = 3) and (([Ch_WOp4, Ch_RWOp4, Ch_MOp4] * InsProp[p.opcode].Ch) <> [])) + ) then + { Unsafe write } + Exit; + + Result := False; + end; + + function TX86AsmOptimizer.ConvertLEA(const p: taicpu): Boolean; 
var l: asizeint; @@ -3112,7 +3230,7 @@ unit aoptx86; function TX86AsmOptimizer.OptPass1MOV(var p : tai) : boolean; var - hp1, hp2, hp3, hp4: tai; + hp1, hp2, hp3, hp4, rle_start, rle_last: tai; DoOptimisation, TempBool: Boolean; {$ifdef x86_64} NewConst: TCGInt; @@ -3222,6 +3340,7 @@ unit aoptx86; MovAligned, MovUnaligned: TAsmOp; ThisRef: TReference; JumpTracking: TLinkedList; + SWRegState: TAllUsedRegs; begin Result:=false; @@ -5480,6 +5599,28 @@ unit aoptx86; Result := True; Exit; end; + + if not Result and + (cs_opt_asmcse in current_settings.optimizerswitches) and + (taicpu(p).oper[1]^.typ = top_reg) then + begin + hp1 := FindSWMatch(taicpu(p), SWRegState); + if Assigned(hp1) and + not ( + (taicpu(p).oper[0]^.typ = top_ref) and + { Make sure the registers that make up the reference haven't changed } + (taicpu(p).oper[0]^.ref^.refaddr = addr_no) and + RegPairModifiedBetween(taicpu(p).oper[0]^.ref^.base, taicpu(p).oper[0]^.ref^.index, hp1, p) + ) and + TraceRLE(p, hp1, SWRegState) then + begin + Result := True; + Exit; + end; + + if not Result then + AddToSW(p); + end; end; @@ -6204,6 +6345,7 @@ unit aoptx86; TempReg: TRegister; Multiple: TCGInt; Adjacent, IntermediateRegDiscarded: Boolean; + SWRegState: TAllUsedRegs; begin Result:=false; @@ -6691,6 +6833,25 @@ unit aoptx86; end; end; end; + if not Result and + (cs_opt_asmcse in current_settings.optimizerswitches) then + begin + hp1 := FindSWMatch(taicpu(p), SWRegState); + if Assigned(hp1) and + not ( + { Make sure the registers that make up the reference haven't changed } + (taicpu(p).oper[0]^.ref^.refaddr = addr_no) and + RegPairModifiedBetween(taicpu(p).oper[0]^.ref^.base, taicpu(p).oper[0]^.ref^.index, hp1, p) + ) and + TraceRLE(p, hp1, SWRegState) then + begin + Result := True; + Exit; + end; + + if not Result then + AddToSW(p); + end; end; @@ -17562,5 +17723,478 @@ unit aoptx86; optimize_ref(p.oper[OperIdx]^.ref^, False); end; + + procedure TX86AsmOptimizer.ResetSW; + var + X: Integer; + begin + if 
DisableSW then + Exit; + + for X := 0 to SLIDING_WINDOW_SIZE - 1 do + SlidingWindow[X].ai := nil; + + WindowTop := SLIDING_WINDOW_SIZE - 1; + end; + + + procedure TX86AsmOptimizer.AddToSW(var p: tai); + begin + if DisableSW then + Exit; + + Inc(WindowTop); + if (WindowTop = SLIDING_WINDOW_SIZE) then + WindowTop := 0; + + SlidingWindow[WindowTop].ai := p; + TransferUsedRegs(UsedRegs, SlidingWindow[WindowTop].RegState); + end; + + + function TX86AsmOptimizer.FindSWMatch(const p: tai; out RegState: TAllUsedRegs): tai; + var + Index: Cardinal; + X: Integer; + p_sw: taicpu; + hp: taicpu absolute p; { Implicit typecast } + Mismatch: Boolean; + begin + Result := nil; + if DisableSW then + Exit; + + Index := WindowTop; + repeat + p_sw := taicpu(SlidingWindow[Index].ai); + if not Assigned(p_sw) then + { Sliding window hasn't been completely filled; this is as far as + we can search } + Exit; + + if (p_sw.opcode = hp.opcode) and + (p_sw.opsize = hp.opsize) and + (p_sw.ops = hp.ops) then + begin + Mismatch := False; + + { Check to see if all the parameters match } + for X := 0 to p_sw.ops - 1 do + begin + if not MatchOperand(p_sw.oper[X]^, hp.oper[X]^) then + begin + Mismatch := True; + Break; + end; + end; + + { If Mismatch is still false, then we have a match! 
} + if not Mismatch then + begin + DebugSWMsg(SSlidingWindow + 'Found match in sliding window (ref = ' + hexstr(p) + ')', p); + DebugSWMsg(SSlidingWindow + 'Reference for match found below (ref = ' + hexstr(p) + ')', p_sw); + Result := p_sw; + RegState := SlidingWindow[Index].RegState; { Borrowed from the window, not copied: no caller releases RegState, so a fresh copy made here was leaked. Callers must not release it } + Exit; + end; + end; + + if Index = 0 then + Index := SLIDING_WINDOW_SIZE - 1 + else + Dec(Index); + + { Drop out if Index has made a complete loop } + until Index = WindowTop; + + end; + + + function TX86AsmOptimizer.CheckSWRegisters(p1, p2: tai; var ForwardTrackedRegs: TAllUsedRegs): Boolean; + var + RegIndex: TSuperRegister; + CurrentReg, SearchReg: TRegister; + begin + Result := False; + SearchReg := NR_NO; +{$ifdef x86_64} + for RegIndex := RS_RAX to RS_R15 do +{$else x86_64} + for RegIndex := RS_EAX to RS_RSP do +{$endif x86_64} + begin + CurrentReg := newreg(R_INTREGISTER, RegIndex, R_SUBWHOLE); + { If the register is not in ForwardTrackedRegs, then it was temporary within the RLE } + if ForwardTrackedRegs[R_INTREGISTER].IsUsed(CurrentReg) then + begin + if SearchReg = NR_NO then + SearchReg := CurrentReg + else + begin + { Search two registers at once to save time } + if RegPairModifiedBetween(SearchReg, CurrentReg, p1, p2) then + Exit; + SearchReg := NR_NO; + end; + end; + end; + +{$ifdef x86_64} + for RegIndex := RS_XMM0 to RS_XMM31 do +{$else x86_64} + for RegIndex := RS_XMM0 to RS_XMM7 do +{$endif x86_64} + begin + CurrentReg := newreg(R_MMREGISTER, RegIndex, R_SUBMMWHOLE); + if ForwardTrackedRegs[R_MMREGISTER].IsUsed(CurrentReg) then + begin + if SearchReg = NR_NO then + SearchReg := CurrentReg + else + begin + { Search two registers at once to save time } + if RegPairModifiedBetween(SearchReg, CurrentReg, p1, p2) then + Exit; + SearchReg := NR_NO; + end; + end; + end; + + { Check leftover loose register } + if (SearchReg <> NR_NO) and RegModifiedBetween(SearchReg, p1, p2) then + Exit; + + Result := True; + end; + + + procedure
TX86AsmOptimizer.AllocAllUsedRegsBetween(p1, p2: tai; var TrackedRegs: TAllUsedRegs; var InitialUsedRegs: TAllUsedRegs); + var + RegIndex: TSuperRegister; + CurrentReg: TRegister; + begin +{$ifdef x86_64} + for RegIndex := RS_RAX to RS_R15 do +{$else x86_64} + for RegIndex := RS_EAX to RS_RSP do +{$endif x86_64} + begin + CurrentReg := newreg(R_INTREGISTER, RegIndex, R_SUBWHOLE); + { If the register is not in InitialUsedRegs, then it was temporary within the RLE } + if TrackedRegs[R_INTREGISTER].IsUsed(CurrentReg) then + begin + TmpUsedRegs[R_INTREGISTER].Clear; + IncludeRegInUsedRegs(CurrentReg, InitialUsedRegs); + AllocRegBetween(CurrentReg, p1, p2, InitialUsedRegs); + end; + end; + +{$ifdef x86_64} + for RegIndex := RS_XMM0 to RS_XMM31 do +{$else x86_64} + for RegIndex := RS_XMM0 to RS_XMM7 do +{$endif x86_64} + begin + CurrentReg := newreg(R_MMREGISTER, RegIndex, R_SUBMMWHOLE); + if TrackedRegs[R_MMREGISTER].IsUsed(CurrentReg) then + begin + TmpUsedRegs[R_MMREGISTER].Clear; + IncludeRegInUsedRegs(CurrentReg, InitialUsedRegs); + AllocRegBetween(CurrentReg, p1, p2, InitialUsedRegs); + end; + end; + end; + + + function TX86AsmOptimizer.TraceRLE(var p: tai; rle_pointer: tai; var RLERegState: TAllUsedRegs): Boolean; + var + forward_pointer, forward_last, rle_last, hp1: tai; + RLETrackedRegs, ForwardTrackedRegs: TAllUsedRegs; + + function VerifyRLE: Boolean; + begin + Result := False; + if CheckSWRegisters(rle_last, p, ForwardTrackedRegs) then + begin + AllocAllUsedRegsBetween(rle_last, forward_last, ForwardTrackedRegs, RLETrackedRegs); + + { Make sure UsedRegs knows about the newly allocated registers } + MergeUsedRegs(ForwardTrackedRegs); + + repeat + DebugSWMsg(SSlidingWindow + 'Removed common subexpression', p); + RemoveCurrentP(p); + until p = forward_pointer; { One after forward_last } + + Result := True; + end; + end; + + function CheckRegister(Reg: TRegister): Boolean; + begin + Result := True; + if not RegInUsedRegs(Reg, ForwardTrackedRegs) then + begin + { 
Reading from outside register, so it must not change value + between rle_pointer and p; also check rle_pointer itself } + if RegModifiedByInstruction(Reg, rle_pointer) or + RegModifiedBetween(Reg, rle_pointer, p) then + begin + Result := False; + Exit; + end; + + IncludeRegInUsedRegs(Reg, ForwardTrackedRegs); + end; + end; + + function CheckInput(Oper: TOper): Boolean; + begin + Result := True; + case Oper.typ of + top_const: + { Consts need no special handling }; + top_reg: + if not CheckRegister(Oper.reg) then + begin + Result := False; + Exit; + end; + + top_ref: + if Oper.ref^.refaddr = addr_no then + begin + { Add registers to tracking lists } + + { Reference base relies on an outside register, so it + must not change value between rle_pointer and p; also + check rle_pointer itself } + if (Oper.ref^.base <> NR_NO) and not CheckRegister(Oper.ref^.base) then + begin + Result := False; + Exit; + end; + + { Reference index relies on an outside register, so it + must not change value between rle_pointer and p; also + check rle_pointer itself } + if (Oper.ref^.index <> NR_NO) and not CheckRegister(Oper.ref^.index) then + begin + Result := False; + Exit; + end; + end; + else + InternalError(2022021810); + end; + end; + + begin + Result := False; + + forward_pointer := p; + forward_last := p; + rle_last := rle_pointer; + + CopyUsedRegs(RLERegState, RLETrackedRegs); + CreateUsedRegs(ForwardTrackedRegs); + + { Analyse initial input } + case taicpu(p).opcode of + A_LEA, A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif}: + begin + if not CheckInput(taicpu(p).oper[0]^) then + begin + ReleaseUsedRegs(RLETrackedRegs); + ReleaseUsedRegs(ForwardTrackedRegs); + Exit; + end; + + if taicpu(p).oper[1]^.typ = top_reg then + begin + IncludeRegInUsedRegs(taicpu(p).oper[1]^.reg, ForwardTrackedRegs); + end; + end; + else + begin + { Don't know how to handle this instruction } + ReleaseUsedRegs(RLETrackedRegs); + ReleaseUsedRegs(ForwardTrackedRegs); + Exit; + end; + end; + + 
TransferUsedRegs(TmpUsedRegs); + + { Now start scanning ahead one instruction at a time until a mismatch is found } + while GetNextInstruction(forward_pointer, forward_pointer) and GetNextInstruction(rle_pointer, rle_pointer) do + begin + { NOTE: forward_pointer = forward ahead of p; rle_pointer = reference somewhere behind p } + + if (rle_pointer = p) then + { We hit the current instruction - don't optimise this } + Break; + + { Use this to look ahead of the forward pointer more accurately to see what's in use } + UpdateUsedRegs(TmpUsedRegs, tai(forward_last.Next)); + + if RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) then + { No longer safe to remove subexpressions } + Break; + + { Only accept instructions of the form "mov (ref),%reg" or "lea (ref),%reg" } + if (forward_pointer.typ <> ait_instruction) or (taicpu(rle_pointer).typ <> ait_instruction) or + (taicpu(forward_pointer).opcode <> taicpu(rle_pointer).opcode) or + (taicpu(forward_pointer).opsize <> taicpu(rle_pointer).opsize) or + (taicpu(forward_pointer).ops <> taicpu(rle_pointer).ops) then + Break; + + case taicpu(forward_pointer).opcode of + A_LEA, A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif}: + begin + { Not allowed writes to references } + if taicpu(forward_pointer).oper[1]^.typ = top_ref then + Break; + + if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(rle_pointer).oper[0]^) or + (taicpu(forward_pointer).oper[1]^.typ <> taicpu(rle_pointer).oper[1]^.typ) then + Break; + + { Check that the input hasn't changed value between the RLE and p } + if not CheckInput(taicpu(forward_pointer).oper[0]^) then + Break; + + { This is a special kind of mismatch - the final MOVx/LEA writes to a different register. 
+ If the register in the first chain hasn't been modified, then the entire second chain + can be replaced with a single MOV instruction to write it to the new register } + if (taicpu(forward_pointer).oper[1]^.typ = top_reg) then + begin + if (taicpu(forward_pointer).oper[1]^.reg <> taicpu(rle_pointer).oper[1]^.reg) then + begin + { Here is the reason why TmpUsedRegs was being used, so it can accurately + detect whether the destination register is in use up to this point, plus + TmpUsedRegs gets modified after a call to RegUsedAfterInstruction } + if not ( + { Is the value of rle_pointer's register still in use? } + RegUsedAfterInstruction(taicpu(rle_pointer).oper[1]^.reg, forward_pointer, TmpUsedRegs) and + { If so, it must preserve its value through the sequence of instructions p to forward_pointer } + ( + RegModifiedByInstruction(taicpu(rle_pointer).oper[1]^.reg, p) or + RegModifiedBetween(taicpu(rle_pointer).oper[1]^.reg, p, forward_pointer) + ) + ) then + begin + UpdateUsedRegs(RLETrackedRegs, tai(rle_last.Next)); + + { Be a little hacky and don't include the assignment of the different target register... } + UpdateUsedRegsIgnoreNew(ForwardTrackedRegs, tai(forward_last.Next)); + { ...
but do include the RLE register so it's monitored } + IncludeRegInUsedRegs(taicpu(rle_pointer).oper[1]^.reg, ForwardTrackedRegs); + + { Before doing the last instruction, see if we can optimise what's + currently present in case the last line fails } + if VerifyRLE then + Result := True; + + { Look beyond the final instruction that we're replacing to deallocate any + temporary registers that are being used } + UpdateUsedRegsIgnoreNew(RLETrackedRegs, tai(rle_pointer.Next)); + UpdateUsedRegsIgnoreNew(ForwardTrackedRegs, tai(forward_pointer.Next)); + + { Add the RLE register to the forward tracking array so it's not ignored } + IncludeRegInUsedRegs(taicpu(rle_pointer).oper[1]^.reg, ForwardTrackedRegs); + + if CheckSWRegisters(rle_pointer, p, ForwardTrackedRegs) then + begin + { This is a valid dereference chain that can be replaced + with the result of the previous one } + DebugSWMsg(SSlidingWindow + 'Removed common subexpression (different ending register)', forward_pointer); + taicpu(forward_pointer).opcode := A_MOV; + taicpu(forward_pointer).opsize := reg2opsize(taicpu(rle_pointer).oper[1]^.reg); + taicpu(forward_pointer).loadreg(0, taicpu(rle_pointer).oper[1]^.reg); + + { Remove all remaining instructions between p and forward_pointer } + while p <> forward_pointer do + begin + { Use RemoveCurrentP so UsedRegs is updated } + DebugSWMsg(SSlidingWindow + 'Removed common subexpression', p); + if not RemoveCurrentP(p) then + InternalError(2022021701); + end; + + { Make sure the RLE registers are tracked all the way through } + AllocAllUsedRegsBetween(rle_pointer, forward_pointer, ForwardTrackedRegs, RLETrackedRegs); + + { Make sure UsedRegs knows about the newly allocated registers } + MergeUsedRegs(ForwardTrackedRegs); + + Result := True; + end; + + ReleaseUsedRegs(RLETrackedRegs); + ReleaseUsedRegs(ForwardTrackedRegs); + Exit; + end; + + { Any other kind of mismatch, break out } + Break; + end; + end + else if not MatchOperand(taicpu(forward_pointer).oper[1]^, 
taicpu(rle_pointer).oper[1]^) then + Break; + end; + else + { Don't know how to handle this instruction } + Break; + end; + + { RLE chain is okay so far } + + UpdateUsedRegs(RLETrackedRegs, tai(rle_last.Next)); + UpdateUsedRegs(ForwardTrackedRegs, tai(forward_last.Next)); + + { Try verifying and removing what we have so far against the RLE + (except the current matching instruction) } + if VerifyRLE then + begin + { VerifyRLE calls AllocAllUsedRegsBetween, which clears entries of + TmpUsedRegs, so we need to reinitialise it } + TransferUsedRegs(TmpUsedRegs); + Result := True; + end; + + rle_last := rle_pointer; + forward_last := forward_pointer; + end; + + { Scan ahead of the final RLE instruction to deallocate temporary registers } + UpdateUsedRegsIgnoreNew(RLETrackedRegs, tai(rle_last.Next)); + UpdateUsedRegsIgnoreNew(ForwardTrackedRegs, tai(forward_last.Next)); + + { Now that we've found a mismatch, attempt to verify the RLE one last time } + Result := VerifyRLE or Result; + + ReleaseUsedRegs(RLETrackedRegs); + ReleaseUsedRegs(ForwardTrackedRegs); + end; + + + procedure TX86AsmOptimizer.Pass1Initialize; + begin + if (cs_opt_asmcse in current_settings.optimizerswitches) then + begin + DisableSW := False; + ResetSW; + end; + end; + + procedure TX86AsmOptimizer.Pass2Initialize; + begin + if (cs_opt_asmcse in current_settings.optimizerswitches) then + { Some pass 2 optimisations call pass 1 methods... make sure the + sliding window doesn't get used in these instances } + DisableSW := True; + end; + end.
diff --git a/compiler/x86_64/aoptcpu.pas b/compiler/x86_64/aoptcpu.pas index ef6b055fdd..1c46f4a366 100644 --- a/compiler/x86_64/aoptcpu.pas +++ b/compiler/x86_64/aoptcpu.pas @@ -41,6 +41,7 @@ implementation uses globals, + aoptbase,cutils, globtype, aasmcpu; @@ -183,10 +184,34 @@ uses A_CLC, A_STC: Result:=OptPass1STCCLC(p); + A_CALL: + if (cs_opt_asmcse in current_settings.optimizerswitches) then + begin + DebugSWMsg(SSlidingWindow + 'Reset sliding window upon CALL', p); + ResetSW; + end; else ; end; + + { If an unsafe reference is found, clear the sliding window } + if not Result and + (cs_opt_asmcse in current_settings.optimizerswitches) and + { Saves doing it twice } + (taicpu(p).opcode <> A_CALL) and + IsWriteToMemory(taicpu(p)) then + begin + DebugSWMsg(SSlidingWindow + 'Reset sliding window upon memory write', p); + ResetSW; + end; end; + ait_label: + if (cs_opt_asmcse in current_settings.optimizerswitches) and + not labelCanBeSkipped(tai_label(p)) then + begin + DebugSWMsg(SSlidingWindow + 'Reset sliding window upon finding label', p); + ResetSW; + end; else ; end; From c9b8c859fb0b90c93085c8ec7c11f5bb7493db41 Mon Sep 17 00:00:00 2001 From: "J. Gareth \"Curious Kit\" Moreton" Date: Thu, 24 Feb 2022 23:05:09 +0000 Subject: [PATCH 2/9] * x86: Added support for other MOV-like instructions to sliding window-based CSE optimisation. 
--- compiler/x86/aoptx86.pas | 78 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 4 deletions(-) diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas index cdf7d233d5..999cbc87e4 100644 --- a/compiler/x86/aoptx86.pas +++ b/compiler/x86/aoptx86.pas @@ -5627,6 +5627,7 @@ unit aoptx86; function TX86AsmOptimizer.OptPass1MOVXX(var p : tai) : boolean; var hp1 : tai; + SWRegState: TAllUsedRegs; begin Result:=false; if taicpu(p).ops <> 2 then @@ -5680,6 +5681,28 @@ unit aoptx86; end; end; end; + + if not Result and + (cs_opt_asmcse in current_settings.optimizerswitches) and + (taicpu(p).oper[1]^.typ = top_reg) then + begin + hp1 := FindSWMatch(taicpu(p), SWRegState); + if Assigned(hp1) and + not ( + (taicpu(p).oper[0]^.typ = top_ref) and + { Make sure the registers that make up the reference haven't changed } + (taicpu(p).oper[0]^.ref^.refaddr = addr_no) and + RegPairModifiedBetween(taicpu(p).oper[0]^.ref^.base, taicpu(p).oper[0]^.ref^.index, hp1, p) + ) and + TraceRLE(p, hp1, SWRegState) then + begin + Result := True; + Exit; + end; + + if not Result then + AddToSW(p); + end; end; @@ -14321,6 +14344,7 @@ unit aoptx86; NewRegSize: TSubRegister; Limit: TCgInt; SwapOper: POper; + SWRegState: TAllUsedRegs; begin result:=false; reg_and_hp1_is_instr:=(taicpu(p).oper[1]^.typ = top_reg) and @@ -15258,6 +15282,28 @@ unit aoptx86; end; end; end; + + if not Result and + (cs_opt_asmcse in current_settings.optimizerswitches) and + (taicpu(p).oper[1]^.typ = top_reg) then + begin + hp1 := FindSWMatch(taicpu(p), SWRegState); + if Assigned(hp1) and + not ( + (taicpu(p).oper[0]^.typ = top_ref) and + { Make sure the registers that make up the reference haven't changed } + (taicpu(p).oper[0]^.ref^.refaddr = addr_no) and + RegPairModifiedBetween(taicpu(p).oper[0]^.ref^.base, taicpu(p).oper[0]^.ref^.index, hp1, p) + ) and + TraceRLE(p, hp1, SWRegState) then + begin + Result := True; + Exit; + end; + + if not Result then + AddToSW(p); + end; end; @@ 
-17889,6 +17935,17 @@ unit aoptx86; end; end; + for RegIndex := RS_XMM0 to RS_XMM7 do + begin + CurrentReg := newreg(R_MMXREGISTER, RegIndex, R_SUBMMWHOLE); + if TrackedRegs[R_MMXREGISTER].IsUsed(CurrentReg) then + begin + TmpUsedRegs[R_MMXREGISTER].Clear; + IncludeRegInUsedRegs(CurrentReg, InitialUsedRegs); + AllocRegBetween(CurrentReg, p1, p2, InitialUsedRegs); + end; + end; + {$ifdef x86_64} for RegIndex := RS_XMM0 to RS_XMM31 do {$else x86_64} @@ -18001,9 +18058,13 @@ unit aoptx86; { Analyse initial input } case taicpu(p).opcode of - A_LEA, A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif}: + A_LEA, A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif}, + A_MOVSS, A_MOVSD, A_MOVAPS, A_MOVUPS, A_MOVAPD, A_MOVUPD, A_MOVDQA, A_MOVDQU, A_MOVD, A_MOVQ, + A_MOVNTDQ, A_MOVNTDQA, A_MOVNTPD, A_MOVNTPS, + A_VMOVSS, A_VMOVSD, A_VMOVAPS, A_VMOVUPS, A_VMOVAPD, A_VMOVUPD, A_VMOVDQA, A_VMOVDQU, A_VMOVD, A_VMOVQ, + A_VMOVNTDQ, A_VMOVNTDQA, A_VMOVNTPD, A_VMOVNTPS: begin - if not CheckInput(taicpu(p).oper[0]^) then + if (taicpu(p).ops <> 2) { Wrong MOVSS } or not CheckInput(taicpu(p).oper[0]^) then begin ReleaseUsedRegs(RLETrackedRegs); ReleaseUsedRegs(ForwardTrackedRegs); @@ -18050,8 +18111,16 @@ unit aoptx86; Break; case taicpu(forward_pointer).opcode of - A_LEA, A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif}: + A_LEA, A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif}, + A_MOVSS, A_MOVSD, A_MOVAPS, A_MOVUPS, A_MOVAPD, A_MOVUPD, A_MOVDQA, A_MOVDQU, A_MOVD, A_MOVQ, + A_MOVNTDQ, A_MOVNTDQA, A_MOVNTPD, A_MOVNTPS, + A_VMOVSS, A_VMOVSD, A_VMOVAPS, A_VMOVUPS, A_VMOVAPD, A_VMOVUPD, A_VMOVDQA, A_VMOVDQU, A_VMOVD, A_VMOVQ, + A_VMOVNTDQ, A_VMOVNTDQA, A_VMOVNTPD, A_VMOVNTPS: begin + if taicpu(forward_pointer).ops <> 2 then + { Wrong MOVSS } + Break; + { Not allowed writes to references } if taicpu(forward_pointer).oper[1]^.typ = top_ref then Break; @@ -18074,7 +18143,8 @@ unit aoptx86; { Here is the reason why TmpUsedRegs was being used, so it can accurately 
detect whether the destination register is in use up to this point, plus TmpUsedRegs gets modified after a call to RegUsedAfterInstruction } - if not ( + if MatchInstruction(forward_pointer, [A_LEA, A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif}], []) and + not ( { Is the value of rle_pointer's register still in use? } RegUsedAfterInstruction(taicpu(rle_pointer).oper[1]^.reg, forward_pointer, TmpUsedRegs) and { If so, it must preserve its value through the sdequence of instructions p to forward_pointer } From ca98df604cafe9233eb225240a05b86b7076be1f Mon Sep 17 00:00:00 2001 From: "J. Gareth \"Curious Kit\" Moreton" Date: Mon, 21 Feb 2022 18:57:53 +0000 Subject: [PATCH 3/9] * x86: Addition of SHL/SHR/SAR/ROR/ROL to RLE chain handling --- compiler/x86/aoptx86.pas | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas index 999cbc87e4..f9664f6ede 100644 --- a/compiler/x86/aoptx86.pas +++ b/compiler/x86/aoptx86.pas @@ -18111,6 +18111,20 @@ unit aoptx86; Break; case taicpu(forward_pointer).opcode of + A_SHL, A_SHR, A_SAR, A_ROR, A_ROL: + begin + if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(rle_pointer).oper[0]^) or + not MatchOperand(taicpu(forward_pointer).oper[1]^, taicpu(rle_pointer).oper[1]^) then + Break; + + { Not allowed writes to references } + if taicpu(forward_pointer).oper[1]^.typ = top_ref then + Break; + + if not CheckInput(taicpu(forward_pointer).oper[0]^) or + not CheckInput(taicpu(forward_pointer).oper[1]^) then + Break; + end; A_LEA, A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif}, A_MOVSS, A_MOVSD, A_MOVAPS, A_MOVUPS, A_MOVAPD, A_MOVUPD, A_MOVDQA, A_MOVDQU, A_MOVD, A_MOVQ, A_MOVNTDQ, A_MOVNTDQA, A_MOVNTPD, A_MOVNTPS, From 00a92095f6470f57268674414390a021731c9fad Mon Sep 17 00:00:00 2001 From: "J. 
Gareth \"Curious Kit\" Moreton" Date: Thu, 24 Feb 2022 22:58:49 +0000 Subject: [PATCH 4/9] * x86: Added support for bitwise and arithmetic instructions to sliding window-based CSE optimisation. --- compiler/x86/aoptx86.pas | 80 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas index f9664f6ede..a5eaaf6f78 100644 --- a/compiler/x86/aoptx86.pas +++ b/compiler/x86/aoptx86.pas @@ -18111,7 +18111,16 @@ unit aoptx86; Break; case taicpu(forward_pointer).opcode of - A_SHL, A_SHR, A_SAR, A_ROR, A_ROL: + A_NOT, A_NEG: + begin + { Not allowed writes to references } + if taicpu(forward_pointer).oper[0]^.typ = top_ref then + Break; + + if not CheckInput(taicpu(forward_pointer).oper[0]^) then + Break; + end; + A_AND, A_OR, A_XOR: begin if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(rle_pointer).oper[0]^) or not MatchOperand(taicpu(forward_pointer).oper[1]^, taicpu(rle_pointer).oper[1]^) then @@ -18125,6 +18134,75 @@ unit aoptx86; not CheckInput(taicpu(forward_pointer).oper[1]^) then Break; end; + A_SHL, A_SHR, A_SAR, A_ROR, A_ROL, A_ADD, A_SUB: + begin + if not MatchOperand(taicpu(forward_pointer).oper[1]^, taicpu(rle_pointer).oper[1]^) then + Break; + + { Not allowed writes to references } + if taicpu(forward_pointer).oper[1]^.typ = top_ref then + Break; + + if not CheckInput(taicpu(forward_pointer).oper[1]^) then + Break; + + if (taicpu(forward_pointer).oper[0]^.typ = top_const) then + begin + { A special kind of mismatch - the shift constant is different, but it + might be possible to make a saving if the register isn't modified + between the first and second chains.} + if (taicpu(forward_pointer).oper[0]^.val <> taicpu(rle_pointer).oper[0]^.val) then + begin + UpdateUsedRegs(RLETrackedRegs, tai(rle_last.Next)); + UpdateUsedRegs(ForwardTrackedRegs, tai(forward_last.Next)); + + { Before doing the last instruction, see if we can optimise what's + currently 
present in case the last line fails } + if VerifyRLE then + Result := True; + + { Look beyond the final instruction that we're replacing to deallocate any + temporary registers that are being used } + UpdateUsedRegsIgnoreNew(RLETrackedRegs, tai(rle_pointer.Next)); + UpdateUsedRegsIgnoreNew(ForwardTrackedRegs, tai(forward_pointer.Next)); + + if CheckSWRegisters(rle_pointer, p, ForwardTrackedRegs) then + begin + if (taicpu(forward_pointer).oper[0]^.val > taicpu(rle_pointer).oper[0]^.val) then + begin + { If the second chain has a larger value, this is easy to accommodate as all + we have to do is keep the second shift/rotate but change the value to be + equal to the difference between the two original values. } + + DebugSWMsg(SSlidingWindow + 'Removed common subexpression (larger immediate)', forward_pointer); + Dec(taicpu(forward_pointer).oper[0]^.val, taicpu(rle_pointer).oper[0]^.val); + + { Remove all remaining instructions between p and forward_pointer } + while p <> forward_pointer do + begin + { Use RemoveCurrentP so UsedRegs is updated } + DebugSWMsg(SSlidingWindow + 'Removed common subexpression', p); + if not RemoveCurrentP(p) then + InternalError(2022021702); + end; + Result := True; + + { Make sure the RLE registers are tracked all the way through } + AllocAllUsedRegsBetween(rle_pointer, forward_pointer, ForwardTrackedRegs, RLETrackedRegs); + + { Make sure UsedRegs knows about the newly allocated registers } + MergeUsedRegs(ForwardTrackedRegs); + end; + end; + ReleaseUsedRegs(RLETrackedRegs); + ReleaseUsedRegs(ForwardTrackedRegs); + Exit; + end; + end + else if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(rle_pointer).oper[0]^) or + not CheckInput(taicpu(forward_pointer).oper[0]^) then + Break; + end; A_LEA, A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif}, A_MOVSS, A_MOVSD, A_MOVAPS, A_MOVUPS, A_MOVAPD, A_MOVUPD, A_MOVDQA, A_MOVDQU, A_MOVD, A_MOVQ, A_MOVNTDQ, A_MOVNTDQA, A_MOVNTPD, A_MOVNTPS, From 
6ddc111697b14ce029da3473ed5d8cc964ab9312 Mon Sep 17 00:00:00 2001 From: "J. Gareth \"Curious Kit\" Moreton" Date: Thu, 24 Feb 2022 23:49:28 +0000 Subject: [PATCH 5/9] * x86: Added support for multiplication, division and extension instructions to sliding window-based CSE optimisation. --- compiler/x86/aoptx86.pas | 157 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 156 insertions(+), 1 deletion(-) diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas index a5eaaf6f78..455e92bb66 100644 --- a/compiler/x86/aoptx86.pas +++ b/compiler/x86/aoptx86.pas @@ -8971,6 +8971,7 @@ unit aoptx86; function TX86AsmOptimizer.OptPass1Imul(var p: tai): boolean; var hp1 : tai; + SWRegState: TAllUsedRegs; begin result:=false; { replace @@ -8997,6 +8998,38 @@ unit aoptx86; result:=true; end; end; + + if not Result and + (cs_opt_asmcse in current_settings.optimizerswitches) and + { Only the 3-operand version is a true write; don't store the 1- + and 2-operand versions in the sliding window } + (taicpu(p).ops = 3) and + { If IMUL reads and writes to the same register, it's impossible for it + to be a valid starting point for a repeating sequence since the input + won't be the same } + not MatchOperand(taicpu(p).oper[1]^, taicpu(p).oper[2]^.reg) then + begin + hp1 := FindSWMatch(taicpu(p), SWRegState); + if Assigned(hp1) and + not ( + (taicpu(hp1).oper[1]^.typ = top_reg) and + { Call SuperRegistersEqual to save calling RegModifiedBetween unnecessarily } + RegModifiedBetween(taicpu(hp1).oper[1]^.reg, hp1, p) + ) and + not ( + (taicpu(hp1).oper[1]^.typ = top_ref) and + { Make sure the registers that make up the reference haven't changed } + (taicpu(hp1).oper[1]^.ref^.refaddr = addr_no) and + RegPairModifiedBetween(taicpu(hp1).oper[1]^.ref^.base, taicpu(hp1).oper[1]^.ref^.index, hp1, p) + ) and + TraceRLE(p, hp1, SWRegState) then + begin + Result := True; + Exit; + end; + + AddToSW(p); + end; end; @@ -18076,6 +18109,20 @@ unit aoptx86; 
IncludeRegInUsedRegs(taicpu(p).oper[1]^.reg, ForwardTrackedRegs); end; end; + A_IMUL: + begin + if taicpu(p).ops <> 3 then + InternalError(2022022202); + + if not CheckInput(taicpu(p).oper[1]^) then + begin + ReleaseUsedRegs(RLETrackedRegs); + ReleaseUsedRegs(ForwardTrackedRegs); + Exit; + end; + + IncludeRegInUsedRegs(taicpu(p).oper[taicpu(p).ops-1]^.reg, ForwardTrackedRegs); + end; else begin { Don't know how to handle this instruction } @@ -18111,6 +18158,114 @@ unit aoptx86; Break; case taicpu(forward_pointer).opcode of + A_CBW, A_CWDE, A_CWD, A_CDQ{$ifdef x86_64}, A_CDQE, A_CQO{$endif x86_64}: + { Zero-operand instructions that are fine as is }; + A_IMUL: + begin + case taicpu(forward_pointer).ops of + 1: + if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(rle_pointer).oper[0]^) or + not CheckInput(taicpu(forward_pointer).oper[0]^) and + { Implicitly reads one of the operands from EAX } + not CheckRegister(NR_EAX) then + Break; + 2: + begin + if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(rle_pointer).oper[0]^) or + not CheckInput(taicpu(forward_pointer).oper[0]^) or + (taicpu(forward_pointer).oper[1]^.reg <> taicpu(rle_pointer).oper[1]^.reg) or + not CheckRegister(taicpu(forward_pointer).oper[1]^.reg) then + Break; + end; + 3: + begin + if (taicpu(forward_pointer).oper[0]^.val <> taicpu(rle_pointer).oper[0]^.val) or + not MatchOperand(taicpu(forward_pointer).oper[1]^, taicpu(rle_pointer).oper[1]^) or + not CheckInput(taicpu(forward_pointer).oper[1]^) then + Break; + + { This is a special kind of mismatch - the final MOVx/LEA writes to a different register. 
+ If the register in the first chain hasn't been modified, then the entire second chain + can be replaced with a single MOV instruction to write it to the new register } + if (taicpu(forward_pointer).oper[2]^.reg <> taicpu(rle_pointer).oper[2]^.reg) then + begin + UpdateUsedRegs(RLETrackedRegs, tai(rle_last.Next)); + + { Be a little hacky and don't include the assignment of the different target register... } + UpdateUsedRegsIgnoreNew(ForwardTrackedRegs, tai(forward_last.Next)); + { ... but do include the RLE register so it's monitored } + IncludeRegInUsedRegs(taicpu(rle_pointer).oper[2]^.reg, ForwardTrackedRegs); + + { Before doing the last instruction, see if we can optimise what's + currently present in case the last line fails } + if VerifyRLE then + Result := True; + + { Look beyond the final instruction that we're replacing to deallocate any + temporary registers that are being used } + UpdateUsedRegsIgnoreNew(RLETrackedRegs, tai(rle_pointer.Next)); + UpdateUsedRegsIgnoreNew(ForwardTrackedRegs, tai(forward_pointer.Next)); + + { Add the RLE register to the forward tracking array so it's not ignored } + IncludeRegInUsedRegs(taicpu(rle_pointer).oper[2]^.reg, ForwardTrackedRegs); + + if CheckSWRegisters(rle_pointer, p, ForwardTrackedRegs) then + begin + { This is a valid dereference chain that can be replaced + with the result of the previous one } + DebugSWMsg(SSlidingWindow + 'Removed common subexpression (different ending register via IMUL)', forward_pointer); + taicpu(forward_pointer).opcode := A_MOV; + taicpu(forward_pointer).loadreg(0, taicpu(rle_pointer).oper[2]^.reg); + taicpu(forward_pointer).loadreg(1, taicpu(forward_pointer).oper[2]^.reg); + taicpu(forward_pointer).clearop(2); + taicpu(forward_pointer).ops := 2; + + { Remove all remaining instructions between p and forward_pointer } + while p <> forward_pointer do + begin + { Use RemoveCurrentP so UsedRegs is updated } + DebugSWMsg(SSlidingWindow + 'Removed common subexpression', p); + if not 
RemoveCurrentP(p) then + InternalError(2022021701); + end; + + { Make sure the RLE registers are tracked all the way through } + AllocAllUsedRegsBetween(rle_pointer, forward_pointer, ForwardTrackedRegs, RLETrackedRegs); + + { Make sure UsedRegs knows about the newly allocated registers } + MergeUsedRegs(ForwardTrackedRegs); + + Result := True; + end; + + ReleaseUsedRegs(RLETrackedRegs); + ReleaseUsedRegs(ForwardTrackedRegs); + Exit; + end; + end; + else + InternalError(2022022201); + end; + end; + A_MUL: + begin + if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(rle_pointer).oper[0]^) or + not CheckInput(taicpu(forward_pointer).oper[0]^) or + { Implicitly reads one of the operands from EAX } + not CheckRegister(NR_EAX) then + Break; + end; + A_DIV, A_IDIV: + begin + if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(rle_pointer).oper[0]^) or + not CheckInput(taicpu(forward_pointer).oper[0]^) or + not CheckRegister(NR_EAX) then + Break; + + { Word, DWord and QWord versions store part of the numerator in EDX } + if (taicpu(forward_pointer).opsize > S_B) and not CheckRegister(NR_EDX) then + Break; + end; A_NOT, A_NEG: begin { Not allowed writes to references } @@ -18119,7 +18274,7 @@ unit aoptx86; if not CheckInput(taicpu(forward_pointer).oper[0]^) then Break; - end; + end; A_AND, A_OR, A_XOR: begin if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(rle_pointer).oper[0]^) or From d3cf312c3f0133af264c5f398fd4607abb24e51d Mon Sep 17 00:00:00 2001 From: "J. Gareth \"Curious Kit\" Moreton" Date: Fri, 25 Feb 2022 01:49:00 +0000 Subject: [PATCH 6/9] * x86: Refactoring sliding-window entry code and unnecessary reference checks. 
--- compiler/x86/aoptx86.pas | 158 +++++++++++++-------------------------- 1 file changed, 51 insertions(+), 107 deletions(-) diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas index 455e92bb66..bab9de5086 100644 --- a/compiler/x86/aoptx86.pas +++ b/compiler/x86/aoptx86.pas @@ -258,6 +258,7 @@ unit aoptx86; class procedure OptimizeRefs(var p: taicpu); static; { Sliding window routines } + function HandleSimple2OpSWInstruction(var p: tai): Boolean; procedure ResetSW; procedure AddToSW(var p: tai); function FindSWMatch(const p: tai; out RegState: TAllUsedRegs): tai; @@ -927,10 +928,13 @@ unit aoptx86; ((getregtype(reg)=R_INTREGISTER) and (getsupreg(reg)=RS_EAX)) and ((getsubreg(reg)<>R_SUBH) or (p.opsize<>S_B)) ); - 2,3: + 2: regReadByInstruction := reginop(reg,p.oper[0]^) or reginop(reg,p.oper[1]^); + 3: + regReadByInstruction := + reginop(reg,p.oper[1]^); else InternalError(2019112801); end; @@ -3340,7 +3344,6 @@ unit aoptx86; MovAligned, MovUnaligned: TAsmOp; ThisRef: TReference; JumpTracking: TLinkedList; - SWRegState: TAllUsedRegs; begin Result:=false; @@ -5600,34 +5603,15 @@ unit aoptx86; Exit; end; - if not Result and - (cs_opt_asmcse in current_settings.optimizerswitches) and - (taicpu(p).oper[1]^.typ = top_reg) then - begin - hp1 := FindSWMatch(taicpu(p), SWRegState); - if Assigned(hp1) and - not ( - (taicpu(p).oper[0]^.typ = top_ref) and - { Make sure the registers that make up the reference haven't changed } - (taicpu(p).oper[0]^.ref^.refaddr = addr_no) and - RegPairModifiedBetween(taicpu(p).oper[0]^.ref^.base, taicpu(p).oper[0]^.ref^.index, hp1, p) - ) and - TraceRLE(p, hp1, SWRegState) then - begin - Result := True; - Exit; - end; - - if not Result then - AddToSW(p); - end; + { Don't call if Result is already True } + if not Result then + Result := HandleSimple2OpSWInstruction(p); end; function TX86AsmOptimizer.OptPass1MOVXX(var p : tai) : boolean; var hp1 : tai; - SWRegState: TAllUsedRegs; begin Result:=false; if taicpu(p).ops <> 2 then @@ 
-5682,27 +5666,9 @@ unit aoptx86; end; end; - if not Result and - (cs_opt_asmcse in current_settings.optimizerswitches) and - (taicpu(p).oper[1]^.typ = top_reg) then - begin - hp1 := FindSWMatch(taicpu(p), SWRegState); - if Assigned(hp1) and - not ( - (taicpu(p).oper[0]^.typ = top_ref) and - { Make sure the registers that make up the reference haven't changed } - (taicpu(p).oper[0]^.ref^.refaddr = addr_no) and - RegPairModifiedBetween(taicpu(p).oper[0]^.ref^.base, taicpu(p).oper[0]^.ref^.index, hp1, p) - ) and - TraceRLE(p, hp1, SWRegState) then - begin - Result := True; - Exit; - end; - - if not Result then - AddToSW(p); - end; + { Don't call if Result is already True } + if not Result then + Result := HandleSimple2OpSWInstruction(p); end; @@ -6368,7 +6334,6 @@ unit aoptx86; TempReg: TRegister; Multiple: TCGInt; Adjacent, IntermediateRegDiscarded: Boolean; - SWRegState: TAllUsedRegs; begin Result:=false; @@ -6856,25 +6821,9 @@ unit aoptx86; end; end; end; - if not Result and - (cs_opt_asmcse in current_settings.optimizerswitches) then - begin - hp1 := FindSWMatch(taicpu(p), SWRegState); - if Assigned(hp1) and - not ( - { Make sure the registers that make up the reference haven't changed } - (taicpu(p).oper[0]^.ref^.refaddr = addr_no) and - RegPairModifiedBetween(taicpu(p).oper[0]^.ref^.base, taicpu(p).oper[0]^.ref^.index, hp1, p) - ) and - TraceRLE(p, hp1, SWRegState) then - begin - Result := True; - Exit; - end; - - if not Result then - AddToSW(p); - end; + { Don't call if Result is already True } + if not Result then + Result := HandleSimple2OpSWInstruction(p); end; @@ -14377,7 +14326,6 @@ unit aoptx86; NewRegSize: TSubRegister; Limit: TCgInt; SwapOper: POper; - SWRegState: TAllUsedRegs; begin result:=false; reg_and_hp1_is_instr:=(taicpu(p).oper[1]^.typ = top_reg) and @@ -15316,27 +15264,9 @@ unit aoptx86; end; end; - if not Result and - (cs_opt_asmcse in current_settings.optimizerswitches) and - (taicpu(p).oper[1]^.typ = top_reg) then - begin - hp1 := 
FindSWMatch(taicpu(p), SWRegState); - if Assigned(hp1) and - not ( - (taicpu(p).oper[0]^.typ = top_ref) and - { Make sure the registers that make up the reference haven't changed } - (taicpu(p).oper[0]^.ref^.refaddr = addr_no) and - RegPairModifiedBetween(taicpu(p).oper[0]^.ref^.base, taicpu(p).oper[0]^.ref^.index, hp1, p) - ) and - TraceRLE(p, hp1, SWRegState) then - begin - Result := True; - Exit; - end; - - if not Result then - AddToSW(p); - end; + { Don't call if Result is already True } + if not Result then + Result := HandleSimple2OpSWInstruction(p); end; @@ -17803,6 +17733,39 @@ unit aoptx86; end; + function TX86AsmOptimizer.HandleSimple2OpSWInstruction(var p: tai): Boolean; + var + SWRegState: TAllUsedRegs; + hp1: tai; + begin + Result := False; + if (cs_opt_asmcse in current_settings.optimizerswitches) and + (taicpu(p).oper[1]^.typ = top_reg) then + begin + hp1 := FindSWMatch(taicpu(p), SWRegState); + if Assigned(hp1) and + not ( + (taicpu(hp1).oper[0]^.typ = top_reg) and + { Call SuperRegistersEqual to save calling RegModifiedBetween unnecessarily } + RegModifiedBetween(taicpu(hp1).oper[0]^.reg, hp1, p) + ) and + not ( + (taicpu(hp1).oper[0]^.typ = top_ref) and + { Make sure the registers that make up the reference haven't changed } + (taicpu(hp1).oper[0]^.ref^.refaddr = addr_no) and + RegPairModifiedBetween(taicpu(hp1).oper[0]^.ref^.base, taicpu(hp1).oper[0]^.ref^.index, hp1, p) + ) and + TraceRLE(p, hp1, SWRegState) then + begin + Result := True; + Exit; + end; + + AddToSW(p); + end; + end; + + procedure TX86AsmOptimizer.ResetSW; var X: Integer; @@ -18104,10 +18067,7 @@ unit aoptx86; Exit; end; - if taicpu(p).oper[1]^.typ = top_reg then - begin - IncludeRegInUsedRegs(taicpu(p).oper[1]^.reg, ForwardTrackedRegs); - end; + IncludeRegInUsedRegs(taicpu(p).oper[1]^.reg, ForwardTrackedRegs); end; A_IMUL: begin @@ -18268,10 +18228,6 @@ unit aoptx86; end; A_NOT, A_NEG: begin - { Not allowed writes to references } - if taicpu(forward_pointer).oper[0]^.typ = 
top_ref then - Break; - if not CheckInput(taicpu(forward_pointer).oper[0]^) then Break; end; @@ -18281,10 +18237,6 @@ unit aoptx86; not MatchOperand(taicpu(forward_pointer).oper[1]^, taicpu(rle_pointer).oper[1]^) then Break; - { Not allowed writes to references } - if taicpu(forward_pointer).oper[1]^.typ = top_ref then - Break; - if not CheckInput(taicpu(forward_pointer).oper[0]^) or not CheckInput(taicpu(forward_pointer).oper[1]^) then Break; @@ -18294,10 +18246,6 @@ unit aoptx86; if not MatchOperand(taicpu(forward_pointer).oper[1]^, taicpu(rle_pointer).oper[1]^) then Break; - { Not allowed writes to references } - if taicpu(forward_pointer).oper[1]^.typ = top_ref then - Break; - if not CheckInput(taicpu(forward_pointer).oper[1]^) then Break; @@ -18368,10 +18316,6 @@ unit aoptx86; { Wrong MOVSS } Break; - { Not allowed writes to references } - if taicpu(forward_pointer).oper[1]^.typ = top_ref then - Break; - if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(rle_pointer).oper[0]^) or (taicpu(forward_pointer).oper[1]^.typ <> taicpu(rle_pointer).oper[1]^.typ) then Break; From 939fdb2b9069cf06786a67b38526c33461036969 Mon Sep 17 00:00:00 2001 From: "J. Gareth \"Curious Kit\" Moreton" Date: Sat, 26 Feb 2022 02:03:17 +0000 Subject: [PATCH 7/9] * x86: Added support for partial matches in the sliding window. 
--- compiler/x86/aoptx86.pas | 131 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 130 insertions(+), 1 deletion(-) diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas index bab9de5086..759cd56cea 100644 --- a/compiler/x86/aoptx86.pas +++ b/compiler/x86/aoptx86.pas @@ -263,6 +263,10 @@ unit aoptx86; procedure AddToSW(var p: tai); function FindSWMatch(const p: tai; out RegState: TAllUsedRegs): tai; + { Locates a match where all but the last operand are equal (although the + last operand must still be a register) } + function FindPartialSWMatch(const p: tai; out RegState: TAllUsedRegs): tai; + { Allocate all registers marked as used in TrackedRegs between p1 and p2 (uses TmpUsedRegs as the initialusedregs in calls to AllocRegBetween } procedure AllocAllUsedRegsBetween(p1, p2: tai; var TrackedRegs: TAllUsedRegs; var InitialUsedRegs: TAllUsedRegs); @@ -17736,14 +17740,27 @@ unit aoptx86; function TX86AsmOptimizer.HandleSimple2OpSWInstruction(var p: tai): Boolean; var SWRegState: TAllUsedRegs; - hp1: tai; + hp1, hp2: tai; begin Result := False; if (cs_opt_asmcse in current_settings.optimizerswitches) and (taicpu(p).oper[1]^.typ = top_reg) then begin + { Instruction must explicitly write to the last operand (not modify or read-write) } + if not (Ch_WOp2 in InsProp[taicpu(p).opcode].Ch) and + { The only exceptional case } + not ( + (taicpu(p).opcode = A_MOVSD) and + (taicpu(p).ops = 2) + ) then + InternalError(2022022501); + + GetLastInstruction(p, hp2); + hp1 := FindSWMatch(taicpu(p), SWRegState); if Assigned(hp1) and + { Can't be adjacent } + (hp1 <> hp2) and not ( (taicpu(hp1).oper[0]^.typ = top_reg) and { Call SuperRegistersEqual to save calling RegModifiedBetween unnecessarily } @@ -17759,6 +17776,56 @@ unit aoptx86; begin Result := True; Exit; + end + else if (getregtype(taicpu(p).oper[1]^.reg) = R_INTREGISTER) and + { Don't do partial matches for register-to-register moves, as these + are already as efficient as they can be (and will cause an infinite + loop in the 
peephole optimizer in some cases). } + not ( + (taicpu(p).opcode = A_MOV) and + (taicpu(p).oper[0]^.typ = top_reg) + ) then + begin + { If not an exact match, maybe a partial match? } + hp1 := FindPartialSWMatch(taicpu(p), SWRegState); + { oper[1] of hp1 will be a register, otherwise FindPartialSWMatch would have returned nil } + if Assigned(hp1) and + { Can't be adjacent } + (hp1 <> hp2) and + not RegModifiedBetween(taicpu(hp1).oper[1]^.reg, hp1, p) and + not ( + (taicpu(hp1).oper[0]^.typ = top_reg) and + { Call SuperRegistersEqual to save calling RegModifiedBetween unnecessarily } + not SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) and + RegModifiedBetween(taicpu(hp1).oper[0]^.reg, hp1, p) + ) and + not ( + (taicpu(hp1).oper[0]^.typ = top_ref) and + ( + { If True, this means the register changes value, but is + missed by RegPairModifiedBetween } + RegInRef(taicpu(hp1).oper[1]^.reg, taicpu(hp1).oper[0]^.ref^) or + ( + { Make sure the registers that make up the reference haven't changed } + (taicpu(hp1).oper[0]^.ref^.refaddr = addr_no) and + RegPairModifiedBetween(taicpu(hp1).oper[0]^.ref^.base, taicpu(hp1).oper[0]^.ref^.index, hp1, p) + ) + ) + ) then + begin + DebugSWMsg(SSlidingWindow + 'Successfully converted partial match to register-to-register MOV', p); + taicpu(p).opcode := A_MOV; + taicpu(p).opsize := reg2opsize(taicpu(hp1).oper[1]^.reg); + taicpu(p).loadreg(0, taicpu(hp1).oper[1]^.reg); + + AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, p, SWRegState); + + { Make sure UsedRegs knows about the newly allocated register } + IncludeRegInUsedRegs(taicpu(hp1).oper[1]^.reg, UsedRegs); + + Result := True; + Exit; + end; end; AddToSW(p); @@ -17852,6 +17919,68 @@ unit aoptx86; end; + function TX86AsmOptimizer.FindPartialSWMatch(const p: tai; out RegState: TAllUsedRegs): tai; + var + Index: Cardinal; + X: Integer; + p_sw: taicpu; + hp: taicpu absolute p; { Implicit typecast } + Mismatch: Boolean; + begin + Result := nil; + if DisableSW then 
+ Exit; + + if (hp.oper[hp.ops-1]^.typ <> top_reg) then + Exit; + + Index := WindowTop; + repeat + p_sw := taicpu(SlidingWindow[Index].ai); + if not Assigned(p_sw) then + { Sliding window hasn't been completely filled; this is as far as + we can search } + Exit; + + if (p_sw.opcode = hp.opcode) and + (p_sw.opsize = hp.opsize) and + (p_sw.ops = hp.ops) then + begin + Mismatch := False; + + { Check to see if all but the final parameters match } + for X := 0 to p_sw.ops - 2 do + begin + if not MatchOperand(p_sw.oper[X]^, hp.oper[X]^) then + begin + Mismatch := True; + Break; + end; + end; + + { If Mismatch is still false, then we have a match! } + if not Mismatch and + (p_sw.oper[p_sw.ops-1]^.typ = top_reg) then + begin + DebugSWMsg(SSlidingWindow + 'Found partial match in sliding window (ref = ' + hexstr(p) + ')', p); + DebugSWMsg(SSlidingWindow + 'Reference for match found below (ref = ' + hexstr(p) + ')', p_sw); + Result := p_sw; + CopyUsedRegs(SlidingWindow[Index].RegState, RegState); + Exit; + end; + end; + + if Index = 0 then + Index := SLIDING_WINDOW_SIZE - 1 + else + Dec(Index); + + { Drop out if Index has made a complete loop } + until Index = WindowTop; + + end; + + function TX86AsmOptimizer.CheckSWRegisters(p1, p2: tai; var ForwardTrackedRegs: TAllUsedRegs): Boolean; var RegIndex: TSuperRegister; From 1dc5f3b5a033e36cd96bba48b8ac65d0b2fb6310 Mon Sep 17 00:00:00 2001 From: "J. Gareth \"Curious Kit\" Moreton" Date: Sun, 27 Mar 2022 20:56:34 +0100 Subject: [PATCH 8/9] * x86: Renamed references to RLE to "Sequence" and "Sliding Window" for clarity. 
--- compiler/x86/aoptx86.pas | 198 +++++++++++++++++++-------------------- 1 file changed, 99 insertions(+), 99 deletions(-) diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas index 759cd56cea..16f039fb2d 100644 --- a/compiler/x86/aoptx86.pas +++ b/compiler/x86/aoptx86.pas @@ -278,7 +278,7 @@ unit aoptx86; { Step thorugh a sliding-window match with the current instruction and see how much of a chain can be removed } - function TraceRLE(var p: tai; rle_pointer: tai; var RLERegState: TAllUsedRegs): Boolean; + function TraceSWSequence(var p: tai; sw_pointer: tai; var SWRegState: TAllUsedRegs): Boolean; { Called whenever a new iteration of pass 1 starts. Override for platform-specific behaviour } @@ -3238,7 +3238,7 @@ unit aoptx86; function TX86AsmOptimizer.OptPass1MOV(var p : tai) : boolean; var - hp1, hp2, hp3, hp4, rle_start, rle_last: tai; + hp1, hp2, hp3, hp4, sw_start, sw_last: tai; DoOptimisation, TempBool: Boolean; {$ifdef x86_64} NewConst: TCGInt; @@ -8975,7 +8975,7 @@ unit aoptx86; (taicpu(hp1).oper[1]^.ref^.refaddr = addr_no) and RegPairModifiedBetween(taicpu(hp1).oper[1]^.ref^.base, taicpu(hp1).oper[1]^.ref^.index, hp1, p) ) and - TraceRLE(p, hp1, SWRegState) then + TraceSWSequence(p, hp1, SWRegState) then begin Result := True; Exit; @@ -17772,7 +17772,7 @@ unit aoptx86; (taicpu(hp1).oper[0]^.ref^.refaddr = addr_no) and RegPairModifiedBetween(taicpu(hp1).oper[0]^.ref^.base, taicpu(hp1).oper[0]^.ref^.index, hp1, p) ) and - TraceRLE(p, hp1, SWRegState) then + TraceSWSequence(p, hp1, SWRegState) then begin Result := True; Exit; @@ -17995,7 +17995,7 @@ unit aoptx86; {$endif x86_64} begin CurrentReg := newreg(R_INTREGISTER, RegIndex, R_SUBWHOLE); - { If the register is not in AllUsedRegs, then it was temporary within the RLE } + { If the register is not in AllUsedRegs, then it was temporary within the Sliding Window sequence } if ForwardTrackedRegs[R_INTREGISTER].IsUsed(CurrentReg) then begin if SearchReg = NR_NO then @@ -18051,7 +18051,7 @@ unit 
aoptx86; {$endif x86_64} begin CurrentReg := newreg(R_INTREGISTER, RegIndex, R_SUBWHOLE); - { If the register is not in InitialUsedRegs, then it was temporary within the RLE } + { If the register is not in InitialUsedRegs, then it was temporary within the Sliding Window sequence } if TrackedRegs[R_INTREGISTER].IsUsed(CurrentReg) then begin TmpUsedRegs[R_INTREGISTER].Clear; @@ -18088,17 +18088,17 @@ unit aoptx86; end; - function TX86AsmOptimizer.TraceRLE(var p: tai; rle_pointer: tai; var RLERegState: TAllUsedRegs): Boolean; + function TX86AsmOptimizer.TraceSWSequence(var p: tai; sw_pointer: tai; var SWRegState: TAllUsedRegs): Boolean; var - forward_pointer, forward_last, rle_last, hp1: tai; - RLETrackedRegs, ForwardTrackedRegs: TAllUsedRegs; + forward_pointer, forward_last, sw_last, hp1: tai; + SWTrackedRegs, ForwardTrackedRegs: TAllUsedRegs; - function VerifyRLE: Boolean; + function VerifySequence: Boolean; begin Result := False; - if CheckSWRegisters(rle_last, p, ForwardTrackedRegs) then + if CheckSWRegisters(sw_last, p, ForwardTrackedRegs) then begin - AllocAllUsedRegsBetween(rle_last, forward_last, ForwardTrackedRegs, RLETrackedRegs); + AllocAllUsedRegsBetween(sw_last, forward_last, ForwardTrackedRegs, SWTrackedRegs); { Make sure UsedRegs knows about the newly allocated registers } MergeUsedRegs(ForwardTrackedRegs); @@ -18118,9 +18118,9 @@ unit aoptx86; if not RegInUsedRegs(Reg, ForwardTrackedRegs) then begin { Reading from outside register, so it must not change value - between rle_pointer and p; also check rle_pointer itself } - if RegModifiedByInstruction(Reg, rle_pointer) or - RegModifiedBetween(Reg, rle_pointer, p) then + between sw_pointer and p; also check sw_pointer itself } + if RegModifiedByInstruction(Reg, sw_pointer) or + RegModifiedBetween(Reg, sw_pointer, p) then begin Result := False; Exit; @@ -18149,8 +18149,8 @@ unit aoptx86; { Add registers to tracking lists } { Reference base relies on an outside register, so it - must not change value between 
rle_pointer and p; also - check rle_pointer itself } + must not change value between sw_pointer and p; also + check sw_pointer itself } if (Oper.ref^.base <> NR_NO) and not CheckRegister(Oper.ref^.base) then begin Result := False; @@ -18158,8 +18158,8 @@ unit aoptx86; end; { Reference index relies on an outside register, so it - must not change value between rle_pointer and p; also - check rle_pointer itself } + must not change value between sw_pointer and p; also + check sw_pointer itself } if (Oper.ref^.index <> NR_NO) and not CheckRegister(Oper.ref^.index) then begin Result := False; @@ -18176,9 +18176,9 @@ unit aoptx86; forward_pointer := p; forward_last := p; - rle_last := rle_pointer; + sw_last := sw_pointer; - CopyUsedRegs(RLERegState, RLETrackedRegs); + CopyUsedRegs(SWRegState, SWTrackedRegs); CreateUsedRegs(ForwardTrackedRegs); { Analyse initial input } @@ -18191,7 +18191,7 @@ unit aoptx86; begin if (taicpu(p).ops <> 2) { Wrong MOVSS } or not CheckInput(taicpu(p).oper[0]^) then begin - ReleaseUsedRegs(RLETrackedRegs); + ReleaseUsedRegs(SWTrackedRegs); ReleaseUsedRegs(ForwardTrackedRegs); Exit; end; @@ -18205,7 +18205,7 @@ unit aoptx86; if not CheckInput(taicpu(p).oper[1]^) then begin - ReleaseUsedRegs(RLETrackedRegs); + ReleaseUsedRegs(SWTrackedRegs); ReleaseUsedRegs(ForwardTrackedRegs); Exit; end; @@ -18215,7 +18215,7 @@ unit aoptx86; else begin { Don't know how to handle this instruction } - ReleaseUsedRegs(RLETrackedRegs); + ReleaseUsedRegs(SWTrackedRegs); ReleaseUsedRegs(ForwardTrackedRegs); Exit; end; @@ -18224,11 +18224,11 @@ unit aoptx86; TransferUsedRegs(TmpUsedRegs); { Now start scanning ahead one instruction at a time until a mismatch is found } - while GetNextInstruction(forward_pointer, forward_pointer) and GetNextInstruction(rle_pointer, rle_pointer) do + while GetNextInstruction(forward_pointer, forward_pointer) and GetNextInstruction(sw_pointer, sw_pointer) do begin - { NOTE: forward_pointer = forward ahead of p; rle_pointer = reference 
somewhere behind p } + { NOTE: forward_pointer = forward ahead of p; sw_pointer = reference somewhere behind p } - if (rle_pointer = p) then + if (sw_pointer = p) then { We hit the current instruction - don't optimise this } Break; @@ -18240,10 +18240,10 @@ unit aoptx86; Break; { Only accept instructions of the form "mov (ref),%reg" or "lea (ref),%reg" } - if (forward_pointer.typ <> ait_instruction) or (taicpu(rle_pointer).typ <> ait_instruction) or - (taicpu(forward_pointer).opcode <> taicpu(rle_pointer).opcode) or - (taicpu(forward_pointer).opsize <> taicpu(rle_pointer).opsize) or - (taicpu(forward_pointer).ops <> taicpu(rle_pointer).ops) then + if (forward_pointer.typ <> ait_instruction) or (taicpu(sw_pointer).typ <> ait_instruction) or + (taicpu(forward_pointer).opcode <> taicpu(sw_pointer).opcode) or + (taicpu(forward_pointer).opsize <> taicpu(sw_pointer).opsize) or + (taicpu(forward_pointer).ops <> taicpu(sw_pointer).ops) then Break; case taicpu(forward_pointer).opcode of @@ -18253,58 +18253,58 @@ unit aoptx86; begin case taicpu(forward_pointer).ops of 1: - if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(rle_pointer).oper[0]^) or + if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(sw_pointer).oper[0]^) or not CheckInput(taicpu(forward_pointer).oper[0]^) and { Implicitly reads one of the operands from EAX } not CheckRegister(NR_EAX) then Break; 2: begin - if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(rle_pointer).oper[0]^) or + if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(sw_pointer).oper[0]^) or not CheckInput(taicpu(forward_pointer).oper[0]^) or - (taicpu(forward_pointer).oper[1]^.reg <> taicpu(rle_pointer).oper[1]^.reg) or + (taicpu(forward_pointer).oper[1]^.reg <> taicpu(sw_pointer).oper[1]^.reg) or not CheckRegister(taicpu(forward_pointer).oper[1]^.reg) then Break; end; 3: begin - if (taicpu(forward_pointer).oper[0]^.val <> taicpu(rle_pointer).oper[0]^.val) or - not 
MatchOperand(taicpu(forward_pointer).oper[1]^, taicpu(rle_pointer).oper[1]^) or + if (taicpu(forward_pointer).oper[0]^.val <> taicpu(sw_pointer).oper[0]^.val) or + not MatchOperand(taicpu(forward_pointer).oper[1]^, taicpu(sw_pointer).oper[1]^) or not CheckInput(taicpu(forward_pointer).oper[1]^) then Break; { This is a special kind of mismatch - the final MOVx/LEA writes to a different register. If the register in the first chain hasn't been modified, then the entire second chain can be replaced with a single MOV instruction to write it to the new register } - if (taicpu(forward_pointer).oper[2]^.reg <> taicpu(rle_pointer).oper[2]^.reg) then + if (taicpu(forward_pointer).oper[2]^.reg <> taicpu(sw_pointer).oper[2]^.reg) then begin - UpdateUsedRegs(RLETrackedRegs, tai(rle_last.Next)); + UpdateUsedRegs(SWTrackedRegs, tai(sw_last.Next)); { Be a little hacky and don't include the assignment of the different target register... } UpdateUsedRegsIgnoreNew(ForwardTrackedRegs, tai(forward_last.Next)); - { ... but do include the RLE register so it's monitored } - IncludeRegInUsedRegs(taicpu(rle_pointer).oper[2]^.reg, ForwardTrackedRegs); + { ... 
but do include the sequence's output register so it's monitored } + IncludeRegInUsedRegs(taicpu(sw_pointer).oper[2]^.reg, ForwardTrackedRegs); { Before doing the last instruction, see if we can optimise what's currently present in case the last line fails } - if VerifyRLE then + if VerifySequence then Result := True; { Look beyond the final instruction that we're replacing to deallocate any temporary registers that are being used } - UpdateUsedRegsIgnoreNew(RLETrackedRegs, tai(rle_pointer.Next)); + UpdateUsedRegsIgnoreNew(SWTrackedRegs, tai(sw_pointer.Next)); UpdateUsedRegsIgnoreNew(ForwardTrackedRegs, tai(forward_pointer.Next)); - { Add the RLE register to the forward tracking array so it's not ignored } - IncludeRegInUsedRegs(taicpu(rle_pointer).oper[2]^.reg, ForwardTrackedRegs); + { Add the sequence's output register to the forward tracking array so it's not ignored } + IncludeRegInUsedRegs(taicpu(sw_pointer).oper[2]^.reg, ForwardTrackedRegs); - if CheckSWRegisters(rle_pointer, p, ForwardTrackedRegs) then + if CheckSWRegisters(sw_pointer, p, ForwardTrackedRegs) then begin { This is a valid dereference chain that can be replaced with the result of the previous one } DebugSWMsg(SSlidingWindow + 'Removed common subexpression (different ending register via IMUL)', forward_pointer); taicpu(forward_pointer).opcode := A_MOV; - taicpu(forward_pointer).loadreg(0, taicpu(rle_pointer).oper[2]^.reg); + taicpu(forward_pointer).loadreg(0, taicpu(sw_pointer).oper[2]^.reg); taicpu(forward_pointer).loadreg(1, taicpu(forward_pointer).oper[2]^.reg); taicpu(forward_pointer).clearop(2); taicpu(forward_pointer).ops := 2; @@ -18318,8 +18318,8 @@ unit aoptx86; InternalError(2022021701); end; - { Make sure the RLE registers are tracked all the way through } - AllocAllUsedRegsBetween(rle_pointer, forward_pointer, ForwardTrackedRegs, RLETrackedRegs); + { Make sure the sequence's registers are tracked all the way through } + AllocAllUsedRegsBetween(sw_pointer, forward_pointer, 
ForwardTrackedRegs, SWTrackedRegs); { Make sure UsedRegs knows about the newly allocated registers } MergeUsedRegs(ForwardTrackedRegs); @@ -18327,7 +18327,7 @@ unit aoptx86; Result := True; end; - ReleaseUsedRegs(RLETrackedRegs); + ReleaseUsedRegs(SWTrackedRegs); ReleaseUsedRegs(ForwardTrackedRegs); Exit; end; @@ -18338,7 +18338,7 @@ unit aoptx86; end; A_MUL: begin - if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(rle_pointer).oper[0]^) or + if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(sw_pointer).oper[0]^) or not CheckInput(taicpu(forward_pointer).oper[0]^) or { Implicitly reads one of the operands from EAX } not CheckRegister(NR_EAX) then @@ -18346,7 +18346,7 @@ unit aoptx86; end; A_DIV, A_IDIV: begin - if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(rle_pointer).oper[0]^) or + if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(sw_pointer).oper[0]^) or not CheckInput(taicpu(forward_pointer).oper[0]^) or not CheckRegister(NR_EAX) then Break; @@ -18362,8 +18362,8 @@ unit aoptx86; end; A_AND, A_OR, A_XOR: begin - if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(rle_pointer).oper[0]^) or - not MatchOperand(taicpu(forward_pointer).oper[1]^, taicpu(rle_pointer).oper[1]^) then + if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(sw_pointer).oper[0]^) or + not MatchOperand(taicpu(forward_pointer).oper[1]^, taicpu(sw_pointer).oper[1]^) then Break; if not CheckInput(taicpu(forward_pointer).oper[0]^) or @@ -18372,7 +18372,7 @@ unit aoptx86; end; A_SHL, A_SHR, A_SAR, A_ROR, A_ROL, A_ADD, A_SUB: begin - if not MatchOperand(taicpu(forward_pointer).oper[1]^, taicpu(rle_pointer).oper[1]^) then + if not MatchOperand(taicpu(forward_pointer).oper[1]^, taicpu(sw_pointer).oper[1]^) then Break; if not CheckInput(taicpu(forward_pointer).oper[1]^) then @@ -18383,31 +18383,31 @@ unit aoptx86; { A special kind of mismatch - the shift constant is different, but it might be possible to make a saving if the register 
isn't modified between the first and second chains.} - if (taicpu(forward_pointer).oper[0]^.val <> taicpu(rle_pointer).oper[0]^.val) then + if (taicpu(forward_pointer).oper[0]^.val <> taicpu(sw_pointer).oper[0]^.val) then begin - UpdateUsedRegs(RLETrackedRegs, tai(rle_last.Next)); + UpdateUsedRegs(SWTrackedRegs, tai(sw_last.Next)); UpdateUsedRegs(ForwardTrackedRegs, tai(forward_last.Next)); { Before doing the last instruction, see if we can optimise what's currently present in case the last line fails } - if VerifyRLE then + if VerifySequence then Result := True; { Look beyond the final instruction that we're replacing to deallocate any temporary registers that are being used } - UpdateUsedRegsIgnoreNew(RLETrackedRegs, tai(rle_pointer.Next)); + UpdateUsedRegsIgnoreNew(SWTrackedRegs, tai(sw_pointer.Next)); UpdateUsedRegsIgnoreNew(ForwardTrackedRegs, tai(forward_pointer.Next)); - if CheckSWRegisters(rle_pointer, p, ForwardTrackedRegs) then + if CheckSWRegisters(sw_pointer, p, ForwardTrackedRegs) then begin - if (taicpu(forward_pointer).oper[0]^.val > taicpu(rle_pointer).oper[0]^.val) then + if (taicpu(forward_pointer).oper[0]^.val > taicpu(sw_pointer).oper[0]^.val) then begin { If the second chain has a larger value, this is easy to accommodate as all we have to do is keep the second shift/rotate but change the value to be equal to the difference between the two original values. 
} DebugSWMsg(SSlidingWindow + 'Removed common subexpression (larger immediate)', forward_pointer); - Dec(taicpu(forward_pointer).oper[0]^.val, taicpu(rle_pointer).oper[0]^.val); + Dec(taicpu(forward_pointer).oper[0]^.val, taicpu(sw_pointer).oper[0]^.val); { Remove all remaining instructions between p and forward_pointer } while p <> forward_pointer do @@ -18419,19 +18419,19 @@ unit aoptx86; end; Result := True; - { Make sure the RLE registers are tracked all the way through } - AllocAllUsedRegsBetween(rle_pointer, forward_pointer, ForwardTrackedRegs, RLETrackedRegs); + { Make sure the sequence's registers are tracked all the way through } + AllocAllUsedRegsBetween(sw_pointer, forward_pointer, ForwardTrackedRegs, SWTrackedRegs); { Make sure UsedRegs knows about the newly allocated registers } MergeUsedRegs(ForwardTrackedRegs); end; end; - ReleaseUsedRegs(RLETrackedRegs); + ReleaseUsedRegs(SWTrackedRegs); ReleaseUsedRegs(ForwardTrackedRegs); Exit; end; end - else if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(rle_pointer).oper[0]^) or + else if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(sw_pointer).oper[0]^) or not CheckInput(taicpu(forward_pointer).oper[0]^) then Break; end; @@ -18445,11 +18445,11 @@ unit aoptx86; { Wrong MOVSS } Break; - if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(rle_pointer).oper[0]^) or - (taicpu(forward_pointer).oper[1]^.typ <> taicpu(rle_pointer).oper[1]^.typ) then + if not MatchOperand(taicpu(forward_pointer).oper[0]^, taicpu(sw_pointer).oper[0]^) or + (taicpu(forward_pointer).oper[1]^.typ <> taicpu(sw_pointer).oper[1]^.typ) then Break; - { Check that the input hasn't changed value between the RLE and p } + { Check that the input hasn't changed value between the sequence and p } if not CheckInput(taicpu(forward_pointer).oper[0]^) then Break; @@ -18458,50 +18458,50 @@ unit aoptx86; can be replaced with a single MOV instruction to write it to the new register } if 
(taicpu(forward_pointer).oper[1]^.typ = top_reg) then begin - if (taicpu(forward_pointer).oper[1]^.reg <> taicpu(rle_pointer).oper[1]^.reg) then + if (taicpu(forward_pointer).oper[1]^.reg <> taicpu(sw_pointer).oper[1]^.reg) then begin { Here is the reason why TmpUsedRegs was being used, so it can accurately detect whether the destination register is in use up to this point, plus TmpUsedRegs gets modified after a call to RegUsedAfterInstruction } if MatchInstruction(forward_pointer, [A_LEA, A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif}], []) and not ( - { Is the value of rle_pointer's register still in use? } - RegUsedAfterInstruction(taicpu(rle_pointer).oper[1]^.reg, forward_pointer, TmpUsedRegs) and + { Is the value of sw_pointer's register still in use? } + RegUsedAfterInstruction(taicpu(sw_pointer).oper[1]^.reg, forward_pointer, TmpUsedRegs) and { If so, it must preserve its value through the sdequence of instructions p to forward_pointer } ( - RegModifiedByInstruction(taicpu(rle_pointer).oper[1]^.reg, p) or - RegModifiedBetween(taicpu(rle_pointer).oper[1]^.reg, p, forward_pointer) + RegModifiedByInstruction(taicpu(sw_pointer).oper[1]^.reg, p) or + RegModifiedBetween(taicpu(sw_pointer).oper[1]^.reg, p, forward_pointer) ) ) then begin - UpdateUsedRegs(RLETrackedRegs, tai(rle_last.Next)); + UpdateUsedRegs(SWTrackedRegs, tai(sw_last.Next)); { Be a little hacky and don't include the assignment of the different target register... } UpdateUsedRegsIgnoreNew(ForwardTrackedRegs, tai(forward_last.Next)); - { ... but do include the RLE register so it's monitored } - IncludeRegInUsedRegs(taicpu(rle_pointer).oper[1]^.reg, ForwardTrackedRegs); + { ... 
but do include the sequence's output register so it's monitored } + IncludeRegInUsedRegs(taicpu(sw_pointer).oper[1]^.reg, ForwardTrackedRegs); { Before doing the last instruction, see if we can optimise what's currently present in case the last line fails } - if VerifyRLE then + if VerifySequence then Result := True; { Look beyond the final instruction that we're replacing to deallocate any temporary registers that are being used } - UpdateUsedRegsIgnoreNew(RLETrackedRegs, tai(rle_pointer.Next)); + UpdateUsedRegsIgnoreNew(SWTrackedRegs, tai(sw_pointer.Next)); UpdateUsedRegsIgnoreNew(ForwardTrackedRegs, tai(forward_pointer.Next)); - { Add the RLE register to the forward tracking array so it's not ignored } - IncludeRegInUsedRegs(taicpu(rle_pointer).oper[1]^.reg, ForwardTrackedRegs); + { Add the sequence's output register to the forward tracking array so it's not ignored } + IncludeRegInUsedRegs(taicpu(sw_pointer).oper[1]^.reg, ForwardTrackedRegs); - if CheckSWRegisters(rle_pointer, p, ForwardTrackedRegs) then + if CheckSWRegisters(sw_pointer, p, ForwardTrackedRegs) then begin { This is a valid dereference chain that can be replaced with the result of the previous one } DebugSWMsg(SSlidingWindow + 'Removed common subexpression (different ending register)', forward_pointer); taicpu(forward_pointer).opcode := A_MOV; - taicpu(forward_pointer).opsize := reg2opsize(taicpu(rle_pointer).oper[1]^.reg); - taicpu(forward_pointer).loadreg(0, taicpu(rle_pointer).oper[1]^.reg); + taicpu(forward_pointer).opsize := reg2opsize(taicpu(sw_pointer).oper[1]^.reg); + taicpu(forward_pointer).loadreg(0, taicpu(sw_pointer).oper[1]^.reg); { Remove all remaining instructions between p and forward_pointer } while p <> forward_pointer do @@ -18512,8 +18512,8 @@ unit aoptx86; InternalError(2022021701); end; - { Make sure the RLE registers are tracked all the way through } - AllocAllUsedRegsBetween(rle_pointer, forward_pointer, ForwardTrackedRegs, RLETrackedRegs); + { Make sure the sequence's 
registers are tracked all the way through } + AllocAllUsedRegsBetween(sw_pointer, forward_pointer, ForwardTrackedRegs, SWTrackedRegs); { Make sure UsedRegs knows about the newly allocated registers } MergeUsedRegs(ForwardTrackedRegs); @@ -18521,7 +18521,7 @@ unit aoptx86; Result := True; end; - ReleaseUsedRegs(RLETrackedRegs); + ReleaseUsedRegs(SWTrackedRegs); ReleaseUsedRegs(ForwardTrackedRegs); Exit; end; @@ -18530,7 +18530,7 @@ unit aoptx86; Break; end; end - else if not MatchOperand(taicpu(forward_pointer).oper[1]^, taicpu(rle_pointer).oper[1]^) then + else if not MatchOperand(taicpu(forward_pointer).oper[1]^, taicpu(sw_pointer).oper[1]^) then Break; end; else @@ -18538,33 +18538,33 @@ unit aoptx86; Break; end; - { RLE chain is okay so far } + { Sequence chain is okay so far } - UpdateUsedRegs(RLETrackedRegs, tai(rle_last.Next)); + UpdateUsedRegs(SWTrackedRegs, tai(sw_last.Next)); UpdateUsedRegs(ForwardTrackedRegs, tai(forward_last.Next)); - { Try verifying and removing what we have so far against the RLE - (except the current matching instruction) } - if VerifyRLE then + { Try verifying and removing what we have so far against the Sliding + Window sequence (except the current matching instruction) } + if VerifySequence then begin - { VerifyRLE calls AllocAllUsedRegsBetween with TmpUsedRegs as a + { VerifySequence calls AllocAllUsedRegsBetween with TmpUsedRegs as a parameter, so we need to reinitialise it } TransferUsedRegs(TmpUsedRegs); Result := True; end; - rle_last := rle_pointer; + sw_last := sw_pointer; forward_last := forward_pointer; end; - { Scan ahead of the final RLE instruction to deallocate temporary registers } - UpdateUsedRegsIgnoreNew(RLETrackedRegs, tai(rle_last.Next)); + { Scan ahead of the sequence's final instruction to deallocate temporary registers } + UpdateUsedRegsIgnoreNew(SWTrackedRegs, tai(sw_last.Next)); UpdateUsedRegsIgnoreNew(ForwardTrackedRegs, tai(forward_last.Next)); - { Now that we've found a mismatch, attempt to verify the RLE 
one last time } - Result := VerifyRLE or Result; + { Now that we've found a mismatch, attempt to verify the sequence one last time } + Result := VerifySequence or Result; - ReleaseUsedRegs(RLETrackedRegs); + ReleaseUsedRegs(SWTrackedRegs); ReleaseUsedRegs(ForwardTrackedRegs); end; From 7eec0e1672bde00a6ec185e3899beabcb32cca33 Mon Sep 17 00:00:00 2001 From: "J. Gareth \"Curious Kit\" Moreton" Date: Mon, 13 Mar 2023 07:34:22 +0000 Subject: [PATCH 9/9] * Partial matches of the form "mov const,%reg" are no longer converted. --- compiler/x86/aoptx86.pas | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas index 16f039fb2d..51a1ddfe42 100644 --- a/compiler/x86/aoptx86.pas +++ b/compiler/x86/aoptx86.pas @@ -17811,6 +17811,11 @@ unit aoptx86; RegPairModifiedBetween(taicpu(hp1).oper[0]^.ref^.base, taicpu(hp1).oper[0]^.ref^.index, hp1, p) ) ) + ) and + not ( + { Don't convert "mov const,%reg1; ...; mov const,%reg2" into "mov const,%reg1; ...; mov %reg1,%reg2" } + (taicpu(p).opcode = A_MOV) and + (taicpu(p).oper[0]^.typ = top_const) ) then begin DebugSWMsg(SSlidingWindow + 'Successfully converted partial match to register-to-register MOV', p);