From dd2d1bf68b5c85c66a1d59b2f1d2319cb3b726af Mon Sep 17 00:00:00 2001
From: florian
Date: Sat, 5 Oct 2019 20:48:33 +0000
Subject: [PATCH] * (modified/re-formatted) patch by Christo Crause: AVR:
 Optimizing code generation for shift with compile time constant

git-svn-id: trunk@43136 -
---
 compiler/avr/cgcpu.pas | 109 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 105 insertions(+), 4 deletions(-)

diff --git a/compiler/avr/cgcpu.pas b/compiler/avr/cgcpu.pas
index 661c300a9d..dfd3eeecf6 100644
--- a/compiler/avr/cgcpu.pas
+++ b/compiler/avr/cgcpu.pas
@@ -438,6 +438,11 @@ unit cgcpu;
 
 
     procedure tcgavr.a_op_const_reg_reg(list: TAsmList; op: TOpCg; size: tcgsize;
      a: tcgint; src, dst: tregister);
+      var
+        tmpSrc, tmpDst, countreg: TRegister;
+        b, b2, i, j: byte;
+        s1, s2, t1: integer;
+        l1: TAsmLabel;
      begin
        if (op in [OP_MUL,OP_IMUL]) and (size in [OS_16,OS_S16]) and (a in [2,4,8]) then
          begin
@@ -451,6 +456,102 @@ unit cgcpu;
                a:=a shr 1;
              end;
          end
+
+        else if (op in [OP_SHL,OP_SHR]) and
+          { a=0 get eliminated later by tcg.optimize_op_const }
+          (a>0) then
+          begin
+            { number of bytes to shift }
+            b:=a div 8;
+
+            { Ensure that b is never larger than base type }
+            if b>tcgsize2size[size] then
+              begin
+                b:=tcgsize2size[size];
+                b2:=0;
+              end
+            else
+              b2:=a mod 8;
+
+            if b < tcgsize2size[size] then
+              { copy from src to dst accounting for shift offset }
+              for i:=0 to (tcgsize2size[size]-b-1) do
+                if op=OP_SHL then
+                  a_load_reg_reg(list,OS_8,OS_8,
+                    GetOffsetReg64(src,NR_NO,i),
+                    GetOffsetReg64(dst,NR_NO,i+b))
+                else
+                  a_load_reg_reg(list,OS_8,OS_8,
+                    GetOffsetReg64(src,NR_NO,i+b),
+                    GetOffsetReg64(dst,NR_NO,i));
+
+            { remaining bit shifts }
+            if b2 > 0 then
+              begin
+                { Cost of loop }
+                s1:=3+tcgsize2size[size]-b;
+                t1:=b2*(tcgsize2size[size]-b+3);
+                { Cost of loop unrolling,t2=s2 }
+                s2:=b2*(tcgsize2size[size]-b);
+
+                if ((cs_opt_size in current_settings.optimizerswitches) and (s1<s2)) or
+                   (s2-t1>0) then
+                  begin
+                    { Shift non-moved bytes in loop }
+                    current_asmdata.getjumplabel(l1);
+                    countreg:=getintregister(list,OS_8);
+                    a_load_const_reg(list,OS_8,b2,countreg);
+                    cg.a_label(list,l1);
+                    if op=OP_SHL then
+                      list.concat(taicpu.op_reg(A_LSL,GetOffsetReg64(dst,NR_NO,b)))
+                    else
+                      list.concat(taicpu.op_reg(A_LSR,GetOffsetReg64(dst,NR_NO,tcgsize2size[size]-1-b)));
+
+                    if size in [OS_S16,OS_16,OS_S32,OS_32,OS_S64,OS_64] then
+                      begin
+                        for i:=2+b to tcgsize2size[size] do
+                          if op=OP_SHL then
+                            list.concat(taicpu.op_reg(A_ROL,GetOffsetReg64(dst,NR_NO,i-1)))
+                          else
+                            list.concat(taicpu.op_reg(A_ROR,GetOffsetReg64(dst,NR_NO,tcgsize2size[size]-i-b)));
+                      end;
+                    list.concat(taicpu.op_reg(A_DEC,countreg));
+                    a_jmp_flags(list,F_NE,l1);
+                    { keep registers alive }
+                    a_reg_sync(list,countreg);
+                  end
+                else
+                  begin
+                    { Unroll shift loop over non-moved bytes }
+                    for j:=1 to b2 do
+                      begin
+                        if op=OP_SHL then
+                          list.concat(taicpu.op_reg(A_LSL,
+                            GetOffsetReg64(dst,NR_NO,b)))
+                        else
+                          list.concat(taicpu.op_reg(A_LSR,
+                            GetOffsetReg64(dst,NR_NO,tcgsize2size[size]-b-1)));
+
+                        if not(size in [OS_8,OS_S8]) then
+                          for i:=2 to tcgsize2size[size]-b do
+                            if op=OP_SHL then
+                              list.concat(taicpu.op_reg(A_ROL,
+                                GetOffsetReg64(dst,NR_NO,b+i-1)))
+                            else
+                              list.concat(taicpu.op_reg(A_ROR,
+                                GetOffsetReg64(dst,NR_NO,tcgsize2size[size]-b-i)));
+                      end;
+                  end;
+              end;
+
+            { fill skipped destination registers with 0
+              Do last,then optimizer can optimize register moves }
+            for i:=1 to b do
+              if op=OP_SHL then
+                emit_mov(list,GetOffsetReg64(dst,NR_NO,i-1),NR_R1)
+              else
+                emit_mov(list,GetOffsetReg64(dst,NR_NO,tcgsize2size[size]-i),NR_R1);
+          end
        else
          inherited a_op_const_reg_reg(list,op,size,a,src,dst);
      end;
@@ -687,8 +788,8 @@ unit cgcpu;
 
         list.concat(taicpu.op_reg(A_DEC,countreg));
         a_jmp_flags(list,F_NE,l1);
-        // keep registers alive
-        list.concat(taicpu.op_reg_reg(A_MOV,countreg,countreg));
+        { keep registers alive }
+        a_reg_sync(list,countreg);
         cg.a_label(list,l2);
       end;
 
@@ -2417,8 +2518,8 @@ unit cgcpu;
       cg.ungetcpuregister(list,NR_R27);
       cg.ungetcpuregister(list,NR_R30);
       cg.ungetcpuregister(list,NR_R31);
-      // keep registers alive
-      list.concat(taicpu.op_reg_reg(A_MOV,countreg,countreg));
+      { keep registers alive }
+      a_reg_sync(list,countreg);
       end
     else
       begin