+ AArch64: optimize divisions by constant

git-svn-id: trunk@44204 -
This commit is contained in:
florian 2020-02-17 20:11:32 +00:00
parent f05b51d7d8
commit e50c4f6373
2 changed files with 211 additions and 46 deletions

View File

@@ -76,9 +76,58 @@ implementation
resultreg : tregister;
hl : tasmlabel;
overflowloc: tlocation;
power: longint;
{ Emit AArch64 code for a division whose right operand is an ordinal
  constant, handling the special values 0, 1, -1 and powers of two
  inline; anything else is delegated to the generic constant-division
  helper. Reads numerator/resultreg and the node fields from the
  enclosing method's scope. }
procedure genOrdConstNodeDiv;
var
helper1, helper2: TRegister;
so: tshifterop;
begin
if tordconstnode(right).value=0 then
{ a constant divisor of zero is rejected earlier in the compiler
  (see the divide-by-zero note below in pass_2); reaching this
  point indicates a compiler bug }
internalerror(2020021601)
else if tordconstnode(right).value=1 then
{ x div 1 = x: a plain register copy suffices }
cg.a_load_reg_reg(current_asmdata.CurrAsmList, OS_INT, OS_INT, numerator, resultreg)
else if (tordconstnode(right).value = int64(-1)) then
begin
// note: only in the signed case possible..., may overflow
{ x div -1 = -x; append the S postfix (NEGS) so the flags are set
  when overflow checking is enabled }
if cs_check_overflow in current_settings.localswitches then
cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_NEG,
resultreg,numerator),toppostfix(ord(cs_check_overflow in current_settings.localswitches)*ord(PF_S))));
end
else if ispowerof2(tordconstnode(right).value,power) then
begin
if (is_signed(right.resultdef)) then
begin
{ signed division by 2^power: a plain arithmetic shift would round
  towards minus infinity, so first add (2^power)-1 to negative
  numerators to get round-towards-zero semantics:
    helper1 := numerator sar 63   (all-ones iff numerator < 0)
    helper2 := numerator + (helper1 lsr (64-power))
    result  := helper2 sar power }
helper2:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
if power = 1 then
{ for power=1 only the sign bit is needed after the LSR by 63,
  so the numerator itself can serve as helper1 }
helper1:=numerator
else
begin
helper1:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,OS_INT,63,numerator,helper1);
end;
shifterop_reset(so);
so.shiftmode:=SM_LSR;
so.shiftimm:=64-power;
{ ADD with an LSR-shifted register operand folds the rounding
  correction into a single instruction }
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,helper2,numerator,helper1,so));
cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,OS_INT,power,helper2,resultreg);
end
else
{ unsigned division by 2^power is just a logical right shift }
cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SHR,OS_INT,power,numerator,resultreg)
end
else
{ Everything else is handled in the generic code }
cg.g_div_const_reg_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),
tordconstnode(right).value.svalue,numerator,resultreg);
end;
begin
secondpass(left);
secondpass(right);
{ avoid warning }
divider:=NR_NO;
{ set result location }
location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
@@ -89,16 +138,32 @@ implementation
hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
numerator:=left.location.register;
{ load divider in a register }
hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);
divider:=right.location.register;
{ start division }
if is_signed(left.resultdef) then
op:=A_SDIV
if (right.nodetype=ordconstn) and
((tordconstnode(right).value=1) or
(tordconstnode(right).value=int64(-1)) or
(tordconstnode(right).value=0) or
ispowerof2(tordconstnode(right).value,power)) then
begin
genOrdConstNodeDiv;
if nodetype=modn then
begin
divider:=cg.getintregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
cg.a_load_const_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),int64(tordconstnode(right).value),divider);
end;
end
else
op:=A_UDIV;
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,location.register,numerator,divider));
begin
{ load divider in a register }
hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);
divider:=right.location.register;
{ start division }
if is_signed(left.resultdef) then
op:=A_SDIV
else
op:=A_UDIV;
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,location.register,numerator,divider));
end;
{ no divide-by-zero detection available in hardware, emulate (if it's a
constant, this will have been detected earlier already) }

View File

@@ -1914,29 +1914,148 @@ unit aoptx86;
{ Depending on the DeepMOVOpt above, it may turn out that hp1 completely
overwrites the original destination register. e.g.
movl %reg1d,%reg2d
movslq %reg1d,%reg2q
movl ###,%reg2d
movslq ###,%reg2q (### doesn't have to be the same as the first one)
In this case, we can remove the MOV
In this case, we can remove the MOV (Go to "Mov2Nop 5" below)
}
if (taicpu(p).oper[1]^.typ = top_reg) and
MatchInstruction(hp1, [A_LEA, A_MOV, A_MOVSX, A_MOVZX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}], []) and
{ The RegInOp check makes sure that movb r/m,%reg1b; movzbl %reg1b,%reg1l"
and "movl r/m,%reg1; leal $1(%reg1,%reg2),%reg1" etc. are not incorrectly
optimised }
(taicpu(hp1).oper[1]^.typ = top_reg) and
not RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) and
Reg1WriteOverwritesReg2Entirely(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
begin
DebugMsg(SPeepholeOptimization + 'Mov2Nop 5 done',p);
{ take care of the register (de)allocs following p }
UpdateUsedRegs(tai(p.next));
asml.remove(p);
p.free;
p:=hp1;
Result := True;
Exit;
end;
begin
if RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^) then
begin
if (taicpu(hp1).oper[0]^.typ = top_reg) then
case taicpu(p).oper[0]^.typ of
top_const:
{ We have something like:
movb $x, %regb
movzbl %regb,%regd
Change to:
movl $x, %regd
}
begin
case taicpu(hp1).opsize of
S_BW:
if (taicpu(hp1).opcode <> A_MOVSX) or
(
(taicpu(p).oper[0]^.val >= 0) and
(taicpu(p).oper[0]^.val <= $7F)
) then
begin
setsubreg(taicpu(p).oper[1]^.reg, R_SUBW);
taicpu(p).opsize := S_W;
end;
S_BL:
if (taicpu(hp1).opcode <> A_MOVSX) or
(
(taicpu(p).oper[0]^.val >= 0) and
(taicpu(p).oper[0]^.val <= $7F)
) then
begin
setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
taicpu(p).opsize := S_L;
end;
S_WL:
if (taicpu(hp1).opcode <> A_MOVSX) or
(
(taicpu(p).oper[0]^.val >= 0) and
(taicpu(p).oper[0]^.val <= $7FFF)
) then
begin
setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
taicpu(p).opsize := S_L;
end;
{$ifdef x86_64}
S_BQ:
if (taicpu(hp1).opcode <> A_MOVSX) or
(
(taicpu(p).oper[0]^.val >= 0) and
(taicpu(p).oper[0]^.val <= $7F)
) then
begin
setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
taicpu(p).opsize := S_Q;
end;
S_WQ:
if (taicpu(hp1).opcode <> A_MOVSX) or
(
(taicpu(p).oper[0]^.val >= 0) and
(taicpu(p).oper[0]^.val <= $7FFF)
) then
begin
setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
taicpu(p).opsize := S_Q;
end;
S_LQ:
if (taicpu(hp1).opcode <> A_MOVSXD) or
(
(taicpu(p).oper[0]^.val >= 0) and
(taicpu(p).oper[0]^.val <= $7FFFFFFF)
) then
begin
setsubreg(taicpu(p).oper[1]^.reg, R_SUBQ);
taicpu(p).opsize := S_Q;
end;
{$endif x86_64}
else
{ If hp1 was a MOV instruction, it should have been
optimised already }
InternalError(2020021001);
end;
DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 2 done',p);
asml.Remove(hp1);
hp1.Free;
Result := True;
Exit;
end;
top_ref:
{ We have something like:
movb mem, %regb
movzbl %regb,%regd
Change to:
movzbl mem, %regd
}
if IsMOVZXAcceptable or (taicpu(hp1).opcode <> A_MOVZX) then
begin
DebugMsg(SPeepholeOptimization + 'MovMovXX2MovXX 1 done',p);
taicpu(hp1).loadref(0, taicpu(p).oper[0]^.ref^);
{ take care of the register (de)allocs following p }
UpdateUsedRegs(tai(p.next));
asml.remove(p);
p.free;
p:=hp1;
Result := True;
Exit;
end;
else
if (taicpu(hp1).opcode <> A_MOV) and (taicpu(hp1).opcode <> A_LEA) then
{ Just to make a saving, since there are no more optimisations with MOVZX and MOVSX/D }
Exit;
end;
end
{ The RegInOp check makes sure that movl r/m,%reg1l; movzbl (%reg1l),%reg1l"
and "movl r/m,%reg1; leal $1(%reg1,%reg2),%reg1" etc. are not incorrectly
optimised }
else
begin
DebugMsg(SPeepholeOptimization + 'Mov2Nop 5 done',p);
{ take care of the register (de)allocs following p }
UpdateUsedRegs(tai(p.next));
asml.remove(p);
p.free;
p:=hp1;
Result := True;
Exit;
end;
end;
if (taicpu(hp1).opcode = A_AND) and
(taicpu(p).oper[1]^.typ = top_reg) and
@@ -2339,27 +2458,8 @@ unit aoptx86;
Result:=true;
exit;
end;
{
mov* x,reg1
mov* y,reg1
to
mov* y,reg1
}
if (taicpu(p).oper[1]^.typ=top_reg) and
MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[1]^) and
not(RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^)) then
begin
DebugMsg(SPeepholeOptimization + 'MovMov2Mov 4 done',p);
{ take care of the register (de)allocs following p }
UpdateUsedRegs(tai(p.next));
asml.remove(p);
p.free;
p:=hp1;
Result:=true;
exit;
end;
{ mov x,reg1; mov y,reg1 -> mov y,reg1 is handled by the Mov2Nop 5 optimisation }
end;
{ search further than the next instruction for a mov }