* x86: New SHR-based optimisations

2025-09-23 14:49:13 +02:00 · 2022-08-15 06:56:00 +01:00 · 2022-08-15 06:56:00 +01:00 · 6757bf3832
commit 6757bf3832
parent e21186cac0
3 changed files with 328 additions and 25 deletions
--- a/compiler/i386/aoptcpu.pas
+++ b/compiler/i386/aoptcpu.pas
@ -191,6 +191,8 @@ unit aoptcpu;
                  end;
                A_SHL, A_SAL:
                  Result:=OptPass1SHLSAL(p);
+                A_SHR:
+                  Result:=OptPass1SHR(p);
                A_SUB:
                  Result:=OptPass1Sub(p);
                A_Jcc:
--- a/compiler/x86/aoptx86.pas
+++ b/compiler/x86/aoptx86.pas
@ -135,6 +135,7 @@ unit aoptx86;

        class function IsExitCode(p : tai) : boolean; static;
        class function isFoldableArithOp(hp1 : taicpu; reg : tregister) : boolean; static;
+        class function IsShrMovZFoldable(shr_size, movz_size: topsize; Shift: TCGInt): Boolean; static;
        procedure RemoveLastDeallocForFuncRes(p : tai);

        function DoSubAddOpt(var p : tai) : Boolean;
@ -157,6 +158,7 @@ unit aoptx86;
        function OptPass1LEA(var p : tai) : boolean;
        function OptPass1Sub(var p : tai) : boolean;
        function OptPass1SHLSAL(var p : tai) : boolean;
+        function OptPass1SHR(var p : tai) : boolean;
        function OptPass1FSTP(var p : tai) : boolean;
        function OptPass1FLD(var p : tai) : boolean;
        function OptPass1Cmp(var p : tai) : boolean;
@ -6448,6 +6450,146 @@ unit aoptx86;
      end;


+    class function TX86AsmOptimizer.IsShrMovZFoldable(shr_size, movz_size: topsize; Shift: TCGInt): Boolean;
+      begin
+        case shr_size of
+          S_B:
+            { No valid combinations }
+            Result := False;
+
+          S_W:
+            Result := (Shift >= 8) and (movz_size = S_BW);
+
+          S_L:
+            Result :=
+              (Shift >= 24) { Any opsize is valid for this shift } or
+              ((Shift >= 16) and (movz_size = S_WL));
+{$ifdef x86_64}
+          S_Q:
+            Result :=
+              (Shift >= 56) { Any opsize is valid for this shift } or
+              ((Shift >= 48) and (movz_size = S_WL));
+{$endif x86_64}
+          else
+            InternalError(2022081510);
+        end;
+      end;
+
+    function TX86AsmOptimizer.OptPass1SHR(var p : tai) : boolean;
+      var
+        hp1, hp2: tai;
+        Shift: TCGInt;
+        LimitSize: Topsize;
+        DoNotMerge: Boolean;
+      begin
+        Result := False;
+
+        { All these optimisations work on "shr const,%reg" }
+        if not MatchOpType(taicpu(p), top_const, top_reg) then
+          Exit;
+
+        DoNotMerge := False;
+        Shift := taicpu(p).oper[0]^.val;
+        LimitSize := taicpu(p).opsize;
+
+        hp1 := p;
+        repeat
+          if not GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[1]^.reg) or (hp1.typ <> ait_instruction) then
+            Exit;
+
+          case taicpu(hp1).opcode of
+            A_TEST, A_CMP, A_Jcc:
+              { Skip over conditional jumps and relevant comparisons }
+              Continue;
+
+            A_MOVZX:
+              if MatchOpType(taicpu(hp1), top_reg, top_reg) and
+                SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg) then
+                begin
+                  { Since the original register is being read as is, subsequent
+                    SHRs must not be merged at this point }
+                  DoNotMerge := True;
+
+                  if IsShrMovZFoldable(taicpu(p).opsize, taicpu(hp1).opsize, Shift) then
+                    begin
+                      if not SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then { Different register target }
+                        begin
+                          DebugMsg(SPeepholeOptimization + 'Converted MOVZX instruction to MOV since previous SHR makes zero-extension unnecessary (ShrMovz2ShrMov 1)', hp1);
+                          taicpu(hp1).opcode := A_MOV;
+                          setsubreg(taicpu(hp1).oper[0]^.reg, getsubreg(taicpu(hp1).oper[1]^.reg));
+                          case taicpu(hp1).opsize of
+                            S_BW:
+                              taicpu(hp1).opsize := S_W;
+                            S_BL, S_WL:
+                              taicpu(hp1).opsize := S_L;
+                            else
+                              InternalError(2022081503);
+                          end;
+
+                          { p itself hasn't changed, so no need to set Result to True }
+                          Include(OptsToCheck, aoc_ForceNewIteration);
+
+                          { See if there's anything afterwards that can be
+                            optimised, since the input register hasn't changed }
+                          Continue;
+                        end;
+
+                      { NOTE: If the MOVZX instruction reads and writes the same
+                        register, defer this to the post-peephole optimisation stage }
+                      Exit;
+                    end;
+                end;
+            A_SHL, A_SAL, A_SHR:
+              if (taicpu(hp1).opsize <= LimitSize) and
+                MatchOpType(taicpu(hp1), top_const, top_reg) and
+                SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
+                begin
+                  { Make sure the sizes don't exceed the register size limit
+                    (measured by the shift value falling below the limit) }
+
+                  if taicpu(hp1).opsize < LimitSize then
+                    LimitSize := taicpu(hp1).opsize;
+
+                  if taicpu(hp1).opcode = A_SHR then
+                    Inc(Shift, taicpu(hp1).oper[0]^.val)
+                  else
+                    begin
+                      Dec(Shift, taicpu(hp1).oper[0]^.val);
+                      DoNotMerge := True;
+                    end;
+
+                  if Shift < topsize2memsize[taicpu(p).opsize] - topsize2memsize[LimitSize] then
+                    Exit;
+
+                  { Since we've established that the combined shift is within
+                    limits, we can actually combine the adjacent SHR
+                    instructions even if they're different sizes }
+                  if not DoNotMerge and (taicpu(hp1).opcode = A_SHR) then
+                    begin
+                      hp2 := tai(hp1.Previous);
+                      DebugMsg(SPeepholeOptimization + 'ShrShr2Shr 1', p);
+                      Inc(taicpu(p).oper[0]^.val, taicpu(hp1).oper[0]^.val);
+                      RemoveInstruction(hp1);
+                      hp1 := hp2;
+
+                      { Though p has changed, only the constant has, and its
+                        effects can still be detected on the next iteration of
+                        the repeat..until loop }
+                      Include(OptsToCheck, aoc_ForceNewIteration);
+                    end;
+
+                  { Move onto the next instruction }
+                  Continue;
+                end;
+            else
+              ;
+          end;
+
+          Break;
+        until False;
+      end;
+
+
    function TX86AsmOptimizer.CheckMemoryWrite(var first_mov, second_mov: taicpu): Boolean;
      var
        CurrentRef: TReference;
@ -12930,36 +13072,193 @@ unit aoptx86;

    function TX86AsmOptimizer.PostPeepholeOptShr(var p : tai) : boolean;
      var
-        hp1: tai;
+        hp1, hp2: tai;
+        IdentityMask, Shift: TCGInt;
+        LimitSize: Topsize;
+        DoNotMerge: Boolean;
      begin
-        { Detect:
-            shr    x,  %ax (x > 0)
-            ...
-            movzwl %ax,%eax
-
-          Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
-        }
-
        Result := False;
-        if MatchOpType(taicpu(p), top_const, top_reg) and
-          (taicpu(p).oper[1]^.reg = NR_AX) and { This is also enough to determine that opsize = S_W }
-          (taicpu(p).oper[0]^.val > 0) and
-          GetNextInstructionUsingReg(p, hp1, NR_EAX) and
-          MatchInstruction(hp1, A_MOVZX, [S_WL]) and
-          MatchOperand(taicpu(hp1).oper[0]^, NR_AX) and
-          MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) then
-          begin
-            DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1);
-            taicpu(hp1).opcode := A_CWDE;
-            taicpu(hp1).clearop(0);
-            taicpu(hp1).clearop(1);
-            taicpu(hp1).ops := 0;

-            { A change was made, but not with p, so move forward 1 }
-            p := tai(p.Next);
-            Result := True;
+        { All these optimisations work on "shr const,%reg" }
+        if not MatchOpType(taicpu(p), top_const, top_reg) then
+          Exit;
+
+        DoNotMerge := False;
+        Shift := taicpu(p).oper[0]^.val;
+        LimitSize := taicpu(p).opsize;
+
+        hp1 := p;
+        repeat
+          if not GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[1]^.reg) or (hp1.typ <> ait_instruction) then
+            Exit;
+
+          { Detect:
+              shr x, %reg
+              and y, %reg
+
+            If and y, %reg doesn't actually change the value of %reg (e.g. with
+            "shrl $24,%reg; andl $255,%reg", remove the AND instruction.
+          }
+
+          case taicpu(hp1).opcode of
+            A_AND:
+              if (taicpu(hp1).opsize = taicpu(p).opsize) and
+                MatchOpType(taicpu(hp1), top_const, top_reg) and
+                (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
+                begin
+                  { Make sure the FLAGS register isn't in use }
+                  TransferUsedRegs(TmpUsedRegs);
+                  hp2 := p;
+                  repeat
+                    UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
+                  until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
+
+                  if not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
+                    begin
+                      { Generate the identity mask }
+                      case taicpu(p).opsize of
+                        S_B:
+                          IdentityMask := $FF shr Shift;
+                        S_W:
+                          IdentityMask := $FFFF shr Shift;
+                        S_L:
+                          IdentityMask := $FFFFFFFF shr Shift;
+{$ifdef x86_64}
+                        S_Q:
+                          { We need to force the operands to be unsigned 64-bit
+                            integers otherwise the wrong value is generated }
+                          IdentityMask := TCGInt(QWord($FFFFFFFFFFFFFFFF) shr QWord(Shift));
+{$endif x86_64}
+                        else
+                          InternalError(2022081501);
+                      end;
+
+                      if (taicpu(hp1).oper[0]^.val and IdentityMask) = IdentityMask then
+                        begin
+                          DebugMsg(SPeepholeOptimization + 'Removed AND instruction since previous SHR makes this an identity operation (ShrAnd2Shr)', hp1);
+                          { All the possible 1 bits are covered, so we can remove the AND }
+                          hp2 := tai(hp1.Previous);
+                          RemoveInstruction(hp1);
+
+                          { p wasn't actually changed, so don't set Result to True,
+                            but a change was nonetheless made elsewhere }
+                          Include(OptsToCheck, aoc_ForceNewIteration);
+
+                          { Do another pass in case other AND or MOVZX instructions
+                            follow }
+                          hp1 := hp2;
+                          Continue;
+                        end;
+
+                    end;
+                end;
+
+            A_TEST, A_CMP, A_Jcc:
+              { Skip over conditional jumps and relevant comparisons }
+              Continue;
+
+            A_MOVZX:
+              if MatchOpType(taicpu(hp1), top_reg, top_reg) and
+                SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg) then
+                begin
+                  { Since the original register is being read as is, subsequent
+                    SHRs must not be merged at this point }
+                  DoNotMerge := True;
+
+                  if IsShrMovZFoldable(taicpu(p).opsize, taicpu(hp1).opsize, Shift) then
+                    begin
+                      if SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
+                        begin
+                          DebugMsg(SPeepholeOptimization + 'Removed MOVZX instruction since previous SHR makes it unnecessary (ShrMovz2Shr)', hp1);
+                          { All the possible 1 bits are covered, so we can remove the AND }
+                          hp2 := tai(hp1.Previous);
+                          RemoveInstruction(hp1);
+
+                          hp1 := hp2;
+                        end
+                      else { Different register target }
+                        begin
+                          DebugMsg(SPeepholeOptimization + 'Converted MOVZX instruction to MOV since previous SHR makes zero-extension unnecessary (ShrMovz2ShrMov 2)', hp1);
+                          taicpu(hp1).opcode := A_MOV;
+                          setsubreg(taicpu(hp1).oper[0]^.reg, getsubreg(taicpu(hp1).oper[1]^.reg));
+                          case taicpu(hp1).opsize of
+                            S_BW:
+                              taicpu(hp1).opsize := S_W;
+                            S_BL, S_WL:
+                              taicpu(hp1).opsize := S_L;
+                            else
+                              InternalError(2022081503);
+                          end;
+                        end;
+                    end
+                  else if (Shift > 0) and
+                    (taicpu(p).opsize = S_W) and
+                    (taicpu(hp1).opsize = S_WL) and
+                    (taicpu(hp1).oper[0]^.reg = NR_AX) and
+                    (taicpu(hp1).oper[1]^.reg = NR_EAX) then
+                    begin
+                      { Detect:
+                          shr    x,  %ax (x > 0)
+                          ...
+                          movzwl %ax,%eax
+
+                        Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
+                      }
+                      DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1);
+                      taicpu(hp1).opcode := A_CWDE;
+                      taicpu(hp1).clearop(0);
+                      taicpu(hp1).clearop(1);
+                      taicpu(hp1).ops := 0;
+                    end;
+
+                  { Move onto the next instruction }
+                  Continue;
+                end;
+
+            A_SHL, A_SAL, A_SHR:
+              if (taicpu(hp1).opsize <= LimitSize) and
+                MatchOpType(taicpu(hp1), top_const, top_reg) and
+                SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
+                begin
+                  { Make sure the sizes don't exceed the register size limit
+                    (measured by the shift value falling below the limit) }
+
+                  if taicpu(hp1).opsize < LimitSize then
+                    LimitSize := taicpu(hp1).opsize;
+
+                  if taicpu(hp1).opcode = A_SHR then
+                    Inc(Shift, taicpu(hp1).oper[0]^.val)
+                  else
+                    begin
+                      Dec(Shift, taicpu(hp1).oper[0]^.val);
+                      DoNotMerge := True;
+                    end;
+
+                  if Shift < topsize2memsize[taicpu(p).opsize] - topsize2memsize[LimitSize] then
+                    Exit;
+
+                  { Since we've established that the combined shift is within
+                    limits, we can actually combine the adjacent SHR
+                    instructions even if they're different sizes }
+                  if not DoNotMerge and (taicpu(hp1).opcode = A_SHR) then
+                    begin
+                      hp2 := tai(hp1.Previous);
+                      DebugMsg(SPeepholeOptimization + 'ShrShr2Shr 2', p);
+                      Inc(taicpu(p).oper[0]^.val, taicpu(hp1).oper[0]^.val);
+                      RemoveInstruction(hp1);
+                      hp1 := hp2;
+                    end;
+
+                  { Move onto the next instruction }
+                  Continue;
+                end;
+            else
+              ;
          end;

+          Break;
+        until False;
+
      end;


--- a/compiler/x86_64/aoptcpu.pas
+++ b/compiler/x86_64/aoptcpu.pas
@ -130,6 +130,8 @@ uses
                  result:=OptPass1Sub(p);
                A_SHL,A_SAL:
                  result:=OptPass1SHLSAL(p);
+                A_SHR:
+                  result:=OptPass1SHR(p);
                A_FSTP,A_FISTP:
                  result:=OptPass1FSTP(p);
                A_FLD: