* (modified/re-formatted) patch by Christo Crause: AVR: Optimizing code generation for shift with compile time constant

git-svn-id: trunk@43136 -
2025-09-11 10:10:00 +02:00 · 2019-10-05 20:48:33 +00:00 · 2019-10-05 20:48:33 +00:00 · dd2d1bf68b
commit dd2d1bf68b
parent 35e11cd6d4
1 changed files with 105 additions and 4 deletions
--- a/compiler/avr/cgcpu.pas
+++ b/compiler/avr/cgcpu.pas
@ -438,6 +438,11 @@ unit cgcpu;
     procedure tcgavr.a_op_const_reg_reg(list: TAsmList; op: TOpCg; size: tcgsize; a: tcgint; src, dst: tregister);
       var
         tmpSrc, tmpDst, countreg: TRegister;
         b, b2, i, j: byte;
         s1, s2, t1: integer;
         l1: TAsmLabel;
       begin
         if (op in [OP_MUL,OP_IMUL]) and (size in [OS_16,OS_S16]) and (a in [2,4,8]) then
           begin
@ -451,6 +456,102 @@ unit cgcpu;
                 a:=a shr 1;
               end;
           end
         else if (op in [OP_SHL,OP_SHR]) and
           { a=0 get eliminated later by tcg.optimize_op_const }
           (a>0)  then
           begin
             { number of bytes to shift }
             b:=a div 8;
             { Ensure that b is never larger than base type }
             if b>tcgsize2size[size] then
               begin
                 b:=tcgsize2size[size];
                 b2:=0;
               end
             else
               b2:=a mod 8;
             if b < tcgsize2size[size] then
               { copy from src to dst accounting for shift offset }
               for i:=0 to (tcgsize2size[size]-b-1) do
                 if op=OP_SHL then
                   a_load_reg_reg(list,OS_8,OS_8,
                     GetOffsetReg64(src,NR_NO,i),
                     GetOffsetReg64(dst,NR_NO,i+b))
                 else
                   a_load_reg_reg(list,OS_8,OS_8,
                     GetOffsetReg64(src,NR_NO,i+b),
                     GetOffsetReg64(dst,NR_NO,i));
             { remaining bit shifts }
             if b2 > 0 then
               begin
                 { Cost of loop }
                 s1:=3+tcgsize2size[size]-b;
                 t1:=b2*(tcgsize2size[size]-b+3);
                 { Cost of loop unrolling,t2=s2 }
                 s2:=b2*(tcgsize2size[size]-b);
                 if ((cs_opt_size in current_settings.optimizerswitches) and (s1<s2)) or
                    (((s2-s1)-t1/s2)>0) then
                   begin
                     { Shift non-moved bytes in loop }
                     current_asmdata.getjumplabel(l1);
                     countreg:=getintregister(list,OS_8);
                     a_load_const_reg(list,OS_8,b2,countreg);
                     cg.a_label(list,l1);
                     if op=OP_SHL then
                       list.concat(taicpu.op_reg(A_LSL,GetOffsetReg64(dst,NR_NO,b)))
                     else
                       list.concat(taicpu.op_reg(A_LSR,GetOffsetReg64(dst,NR_NO,tcgsize2size[size]-1-b)));
                     if size in [OS_S16,OS_16,OS_S32,OS_32,OS_S64,OS_64] then
                       begin
                         for i:=2+b to tcgsize2size[size] do
                           if op=OP_SHL then
                             list.concat(taicpu.op_reg(A_ROL,GetOffsetReg64(dst,NR_NO,i-1)))
                           else
                             list.concat(taicpu.op_reg(A_ROR,GetOffsetReg64(dst,NR_NO,tcgsize2size[size]-i-b)));
                       end;
                     list.concat(taicpu.op_reg(A_DEC,countreg));
                     a_jmp_flags(list,F_NE,l1);
                     { keep registers alive }
                     a_reg_sync(list,countreg);
                   end
                 else
                   begin
                     { Unroll shift loop over non-moved bytes }
                     for j:=1 to b2 do
                     begin
                       if op=OP_SHL then
                         list.concat(taicpu.op_reg(A_LSL,
                         GetOffsetReg64(dst,NR_NO,b)))
                       else
                         list.concat(taicpu.op_reg(A_LSR,
                           GetOffsetReg64(dst,NR_NO,tcgsize2size[size]-b-1)));
                       if not(size in [OS_8,OS_S8]) then
                         for i:=2 to tcgsize2size[size]-b do
                           if op=OP_SHL then
                             list.concat(taicpu.op_reg(A_ROL,
                               GetOffsetReg64(dst,NR_NO,b+i-1)))
                           else
                             list.concat(taicpu.op_reg(A_ROR,
                               GetOffsetReg64(dst,NR_NO,tcgsize2size[size]-b-i)));
                     end;
                   end;
               end;
               { fill skipped destination registers with 0
                 Do last,then optimizer can optimize register moves }
               for i:=1 to b do
                 if op=OP_SHL then
                   emit_mov(list,GetOffsetReg64(dst,NR_NO,i-1),NR_R1)
                 else
                   emit_mov(list,GetOffsetReg64(dst,NR_NO,tcgsize2size[size]-i),NR_R1);
           end
         else
           inherited a_op_const_reg_reg(list,op,size,a,src,dst);
       end;
@ -687,8 +788,8 @@ unit cgcpu;
               list.concat(taicpu.op_reg(A_DEC,countreg));
               a_jmp_flags(list,F_NE,l1);
-               // keep registers alive
+               { keep registers alive }
-               list.concat(taicpu.op_reg_reg(A_MOV,countreg,countreg));
+               a_reg_sync(list,countreg);
               cg.a_label(list,l2);
             end;
@ -2417,8 +2518,8 @@ unit cgcpu;
            cg.ungetcpuregister(list,NR_R27);
            cg.ungetcpuregister(list,NR_R30);
            cg.ungetcpuregister(list,NR_R31);
-            // keep registers alive
+            { keep registers alive }
-            list.concat(taicpu.op_reg_reg(A_MOV,countreg,countreg));
+            a_reg_sync(list,countreg);
          end
        else
          begin