* x86: Introduced TrySwapMovOp method, and redesigned TrySwapMovCmp

to use it, while also trying to move the swapped instruction back past one more instruction
2025-04-20 19:09:23 +02:00 · 2022-04-17 05:40:40 +01:00 · 2022-04-17 05:40:40 +01:00 · 5f3749dc49
commit 5f3749dc49
parent 6af886c2b9
1 changed files with 129 additions and 37 deletions
--- a/compiler/x86/aoptx86.pas
+++ b/compiler/x86/aoptx86.pas
@ -211,6 +211,7 @@ unit aoptx86;
        procedure ConvertJumpToRET(const p: tai; const ret_p: tai);

        function CheckJumpMovTransferOpt(var p: tai; hp1: tai; LoopCount: Integer; out Count: Integer): Boolean;
+        function TrySwapMovOp(var p, hp1: tai): Boolean;
        function TrySwapMovCmp(var p, hp1: tai): Boolean;

        { Processor-dependent reference optimisation }
@ -8453,10 +8454,10 @@ unit aoptx86;
            Break;

          case taicpu(hp2).opcode of
-            A_MOVSS:
+            A_MOVSD:
              begin
                if taicpu(hp2).ops = 0 then
-                  { Wrong MOVSS }
+                  { Wrong MOVSD }
                  Break;
                Inc(Count);
                if Count >= 5 then
@ -8475,7 +8476,7 @@ unit aoptx86;
            A_MOVZX,
            A_MOVAPS,
            A_MOVUPS,
-            A_MOVSD,
+            A_MOVSS,
            A_MOVAPD,
            A_MOVUPD,
            A_MOVDQA,
@ -8626,41 +8627,38 @@ unit aoptx86;
    end;


-  function TX86AsmOptimizer.TrySwapMovCmp(var p, hp1: tai): Boolean;
+  const
+    WriteOp: array[0..3] of set of TInsChange = ( { entry N: change flags under which operand N (0-based) counts as being written to }
+      [Ch_Wop1, Ch_RWop1, Ch_Mop1],
+      [Ch_Wop2, Ch_RWop2, Ch_Mop2],
+      [Ch_Wop3, Ch_RWop3, Ch_Mop3],
+      [Ch_Wop4, Ch_RWop4, Ch_Mop4]);
+
+    RegWriteFlags: array[0..7] of set of TInsChange = ( { entry N: change flags under which one specific register counts as being written to }
+      { The order is important: EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP - the array index is cast to TSuperRegister to build the register that gets checked }
+      [Ch_WEAX, Ch_RWEAX, Ch_MEAX{$ifdef x86_64}, Ch_WRAX, Ch_RWRAX, Ch_MRAX{$endif x86_64}],
+      [Ch_WECX, Ch_RWECX, Ch_MECX{$ifdef x86_64}, Ch_WRCX, Ch_RWRCX, Ch_MRCX{$endif x86_64}],
+      [Ch_WEDX, Ch_RWEDX, Ch_MEDX{$ifdef x86_64}, Ch_WRDX, Ch_RWRDX, Ch_MRDX{$endif x86_64}],
+      [Ch_WEBX, Ch_RWEBX, Ch_MEBX{$ifdef x86_64}, Ch_WRBX, Ch_RWRBX, Ch_MRBX{$endif x86_64}],
+      [Ch_WESI, Ch_RWESI, Ch_MESI{$ifdef x86_64}, Ch_WRSI, Ch_RWRSI, Ch_MRSI{$endif x86_64}],
+      [Ch_WEDI, Ch_RWEDI, Ch_MEDI{$ifdef x86_64}, Ch_WRDI, Ch_RWRDI, Ch_MRDI{$endif x86_64}],
+      [Ch_WEBP, Ch_RWEBP, Ch_MEBP{$ifdef x86_64}, Ch_WRBP, Ch_RWRBP, Ch_MRBP{$endif x86_64}],
+      [Ch_WESP, Ch_RWESP, Ch_MESP{$ifdef x86_64}, Ch_WRSP, Ch_RWRSP, Ch_MRSP{$endif x86_64}]);
+
+
+  function TX86AsmOptimizer.TrySwapMovOp(var p, hp1: tai): Boolean;
    var
      hp2: tai;
      X: Integer;
-    const
-      WriteOp: array[0..3] of set of TInsChange = (
-        [Ch_Wop1, Ch_RWop1, Ch_Mop1],
-        [Ch_Wop2, Ch_RWop2, Ch_Mop2],
-        [Ch_Wop3, Ch_RWop3, Ch_Mop3],
-        [Ch_Wop4, Ch_RWop4, Ch_Mop4]);
-
-      RegWriteFlags: array[0..7] of set of TInsChange = (
-        { The order is important: EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP }
-        [Ch_WEAX, Ch_RWEAX, Ch_MEAX{$ifdef x86_64}, Ch_WRAX, Ch_RWRAX, Ch_MRAX{$endif x86_64}],
-        [Ch_WECX, Ch_RWECX, Ch_MECX{$ifdef x86_64}, Ch_WRCX, Ch_RWRCX, Ch_MRCX{$endif x86_64}],
-        [Ch_WEDX, Ch_RWEDX, Ch_MEDX{$ifdef x86_64}, Ch_WRDX, Ch_RWRDX, Ch_MRDX{$endif x86_64}],
-        [Ch_WEBX, Ch_RWEBX, Ch_MEBX{$ifdef x86_64}, Ch_WRBX, Ch_RWRBX, Ch_MRBX{$endif x86_64}],
-        [Ch_WESI, Ch_RWESI, Ch_MESI{$ifdef x86_64}, Ch_WRSI, Ch_RWRSI, Ch_MRSI{$endif x86_64}],
-        [Ch_WEDI, Ch_RWEDI, Ch_MEDI{$ifdef x86_64}, Ch_WRDI, Ch_RWRDI, Ch_MRDI{$endif x86_64}],
-        [Ch_WEBP, Ch_RWEBP, Ch_MEBP{$ifdef x86_64}, Ch_WRBP, Ch_RWRBP, Ch_MRBP{$endif x86_64}],
-        [Ch_WESP, Ch_RWESP, Ch_MESP{$ifdef x86_64}, Ch_WRSP, Ch_RWRSP, Ch_MRSP{$endif x86_64}]);
-
    begin
      { If we have something like:
-          cmp ###,%reg1
-          mov 0,%reg2
+          op  ###,###
+          mov ###,###

-        And no modified registers are shared, move the instruction to before
-        the comparison as this means it can be optimised without worrying
-        about the FLAGS register. (CMP/MOV is generated by
-        "J(c)Mov1JmpMov0 -> Set(~c)", among other things).
+        Try to move the MOV instruction to before OP as long as OP and MOV don't
+        interfere in regards to what they write to.

-        As long as the second instruction doesn't use the flags or one of the
-        registers used by CMP or TEST (also check any references that use the
-        registers), then it can be moved prior to the comparison.
+        NOTE: p must be a 2-operand instruction
      }

      Result := False;
@ -8672,12 +8670,12 @@ unit aoptx86;
      { NOP is a pipeline fence, likely marking the beginning of the function
        epilogue, so drop out.  Similarly, drop out if POP or RET are
        encountered }
-      if MatchInstruction(hp1, A_NOP, A_POP, []) then
+      if MatchInstruction(hp1, A_NOP, A_POP, A_RET, []) then
        Exit;

-      if (taicpu(hp1).opcode = A_MOVSS) and
+      if (taicpu(hp1).opcode = A_MOVSD) and
        (taicpu(hp1).ops = 0) then
-        { Wrong MOVSS }
+        { Wrong MOVSD }
        Exit;

      { Check for writes to specific registers first }
@ -8705,6 +8703,25 @@ unit aoptx86;
            Exit;
        end;

+      { Check p to make sure it doesn't write to something that affects hp1 }
+
+      { Check for writes to specific registers first }
+      { EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP in that order }
+      for X := 0 to 7 do
+        if (RegWriteFlags[X] * InsProp[taicpu(p).opcode].Ch <> [])
+          and RegInInstruction(newreg(R_INTREGISTER, TSuperRegister(X), R_SUBWHOLE), hp1) then
+          Exit;
+
+      for X := 0 to taicpu(p).ops - 1 do
+        begin
+          { Check to see if this operand writes to something }
+          if ((WriteOp[X] * InsProp[taicpu(p).opcode].Ch) <> []) and
+            { And matches something in hp1 }
+            (taicpu(p).oper[X]^.typ = top_reg) and
+            RegInInstruction(taicpu(p).oper[X]^.reg, hp1) then
+            Exit;
+        end;
+
      { The instruction can be safely moved }
      asml.Remove(hp1);

@ -8712,6 +8729,17 @@ unit aoptx86;
        can be optimised into "xor %reg,%reg" later }
      if SetAndTest(FindRegAllocBackward(NR_DEFAULTFLAGS, tai(p.Previous)), hp2) then
        asml.InsertBefore(hp1, hp2)
+
+        { Failing that, try to insert after the last instruction where the
+          FLAGS register is not yet in use }
+      else if GetLastInstruction(p, hp2) and
+        (
+          (hp2.typ <> ait_instruction) or
+          { Don't insert after an instruction that uses the flags when p doesn't use them }
+          RegInInstruction(NR_DEFAULTFLAGS, p) or
+          not RegInInstruction(NR_DEFAULTFLAGS, hp2)
+        ) then
+        asml.InsertAfter(hp1, hp2)
      else
        { Note, if p.Previous is nil (even if it should logically never be the
          case), FindRegAllocBackward immediately exits with False and so we
@ -8721,26 +8749,90 @@ unit aoptx86;

      DebugMsg(SPeepholeOptimization + 'Swapped ' + debug_op2str(taicpu(p).opcode) + ' and ' + debug_op2str(taicpu(hp1).opcode) + ' instructions to improve optimisation potential', hp1);

+      { We can't trust UsedRegs because we're looking backwards, although we
+        know the registers are allocated after p at the very least, so manually
+        create tai_regalloc objects if needed }
      for X := 0 to taicpu(hp1).ops - 1 do
        case taicpu(hp1).oper[X]^.typ of
          top_reg:
-            AllocRegBetween(taicpu(hp1).oper[X]^.reg, hp1, p, UsedRegs);
+            begin
+              asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.reg, nil), hp1);
+              IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.reg, UsedRegs);
+              AllocRegBetween(taicpu(hp1).oper[X]^.reg, hp1, p, UsedRegs);
+            end;
          top_ref:
            begin
              if taicpu(hp1).oper[X]^.ref^.base <> NR_NO then
-                AllocRegBetween(taicpu(hp1).oper[X]^.ref^.base, hp1, p, UsedRegs);
+                begin
+                  asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.ref^.base, nil), hp1);
+                  IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.ref^.base, UsedRegs);
+                  AllocRegBetween(taicpu(hp1).oper[X]^.ref^.base, hp1, p, UsedRegs);
+                end;
              if taicpu(hp1).oper[X]^.ref^.index <> NR_NO then
-                AllocRegBetween(taicpu(hp1).oper[X]^.ref^.index, hp1, p, UsedRegs);
+                begin
+                  asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.ref^.index, nil), hp1);
+                  IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.ref^.index, UsedRegs);
+                  AllocRegBetween(taicpu(hp1).oper[X]^.ref^.index, hp1, p, UsedRegs);
+                end;
            end;
          else
            ;
        end;

+      Result := True;
+    end;
+
+
+  function TX86AsmOptimizer.TrySwapMovCmp(var p, hp1: tai): Boolean;
+    var
+      hp2: tai;
+      X: Integer; { NOTE(review): X is not referenced anywhere in this routine - looks like a leftover, confirm and remove }
+    begin
+      { If we have something like:
+          cmp ###,%reg1
+          mov 0,%reg2
+
+        And no modified registers are shared, move the instruction to before
+        the comparison as this means it can be optimised without worrying
+        about the FLAGS register. (CMP/MOV is generated by
+        "J(c)Mov1JmpMov0 -> Set(~c)", among other things).
+
+        As long as the second instruction doesn't use the flags or one of the
+        registers used by CMP or TEST (also check any references that use the
+        registers), then it can be moved prior to the comparison.
+      }
+
+      Result := False;
+      if not TrySwapMovOp(p, hp1) then { the safety checks and the move itself are delegated to TrySwapMovOp }
+        Exit;
+
      if taicpu(hp1).opcode = A_LEA then
        { The flags will be overwritten by the CMP/TEST instruction }
        ConvertLEA(taicpu(hp1));

      Result := True; { the swap succeeded; the attempt below is optional and never changes Result }
+
+      { Can we move it one further back?  hp2 is the instruction returned by
          GetLastInstruction for hp1 (presumably the one preceding it) }
+      if GetLastInstruction(hp1, hp2) and (hp2.typ = ait_instruction) and
+        { Check to see if CMP/TEST is a comparison against zero }
+        (
+          (
+            (taicpu(p).opcode = A_CMP) and
+            MatchOperand(taicpu(p).oper[0]^, 0)
+          ) or
+          (
+            (taicpu(p).opcode = A_TEST) and
+            (
+              OpsEqual(taicpu(p).oper[0]^, taicpu(p).oper[1]^) or
+              MatchOperand(taicpu(p).oper[0]^, -1)
+            )
+          )
+        ) and
+        { These instructions set the zero flag if the result is zero }
+        MatchInstruction(hp2, [A_ADD, A_SUB, A_OR, A_XOR, A_AND, A_POPCNT, A_LZCNT], []) and
+        OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^) then { hp2's second operand must be the same one p compares }
+          { Looks like we can - if successful, this benefits PostPeepholeOptTestOr.
              The return value is discarded because Result is already True }
+          TrySwapMovOp(hp2, hp1);
    end;