From e7218d09fbf59d858cc952569537d739e93401f5 Mon Sep 17 00:00:00 2001
From: "J. Gareth \"Curious Kit\" Moreton" <gareth@moreton-family.com>
Date: Thu, 18 Aug 2022 12:23:55 +0100
Subject: [PATCH]   * x86: New MovShr/Sar2Movx optimisation

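The pair

    movl   x(ref),%reg
    shrl   $24,%reg

now becomes a single extending load from an adjusted offset:

    movzbl x+3(ref),%reg

(The example is taken from the comments in the new code path; movw with
shrw $8, movl with shrl $16 and, on x86_64, movq with shrq $48 or $56
are handled the same way.)  SAR is accepted as well and is converted to
MOVSX rather than MOVZX, or to MOVSXD for the 64-bit "movq; sarq $32"
case.  The existing MovShr2Mov / MovShr2MovShr handling of
"movq; shrq $y" with y >= 32 is kept and now shares the same entry
condition.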
---
 compiler/x86/aoptx86.pas | 183 ++++++++++++++++++++++++++++++---------
 1 file changed, 142 insertions(+), 41 deletions(-)

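For reference, the (load size, shift count) combinations the new
MovShr/Sar2Movx path accepts, and the extending loads they turn into
(AT&T mnemonics; x is the original reference offset, as in the code
comments):

    load   shift   SHR becomes     SAR becomes   new operand
    movw   $8      movzbw          movsbw        x+1(ref)
    movl   $16     movzwl          movswl        x+2(ref)
    movl   $24     movzbl          movsbl        x+3(ref)
    movq   $32     (MovShr2Mov)    movslq        x+4(ref)
    movq   $48     movzwq          movswq        x+6(ref)
    movq   $56     movzbq          movsbq        x+7(ref)
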
diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas
index 490facfd40..64cd41fed9 100644
--- a/compiler/x86/aoptx86.pas
+++ b/compiler/x86/aoptx86.pas
@@ -2761,7 +2761,7 @@ unit aoptx86;
       var
         GetNextInstruction_p, TempRegUsed, CrossJump: Boolean;
         PreMessage, RegName1, RegName2, InputVal, MaskNum: string;
-        NewSize: topsize;
+        NewSize: topsize; NewOffset: asizeint;
         p_SourceReg, p_TargetReg, NewMMReg: TRegister;
         SourceRef, TargetRef: TReference;
         MovAligned, MovUnaligned: TAsmOp;
@@ -4609,54 +4609,155 @@ unit aoptx86;
             exit;
           end;
 
-{$ifdef x86_64}
-        { Convert:
-            movq x(ref),%reg64
-            shrq y,%reg64
-          To:
-            movl x+4(ref),%reg32
-            shrl y-32,%reg32 (Remove if y = 32)
-        }
-        if (taicpu(p).opsize = S_Q) and
-          (taicpu(p).oper[0]^.typ = top_ref) and { Second operand will be a register }
-          (taicpu(p).oper[0]^.ref^.offset <= $7FFFFFFB) and
-          MatchInstruction(hp1, A_SHR, [taicpu(p).opsize]) and
+        if (taicpu(p).oper[0]^.typ = top_ref) and { Second operand will be a register }
+          (taicpu(p).oper[0]^.ref^.offset <= $7FFFFFF8) and MatchInstruction(hp1, A_SHR, A_SAR, [taicpu(p).opsize]) and
           MatchOpType(taicpu(hp1), top_const, top_reg) and
-          (taicpu(hp1).oper[0]^.val >= 32) and
           (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
           begin
             RegName1 := debug_regname(taicpu(hp1).oper[1]^.reg);
-            PreMessage := 'movq ' + debug_operstr(taicpu(p).oper[0]^) + ',' + RegName1 + '; ' +
-              'shrq $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + RegName1 + ' -> movl ';
-
-            { Convert to 32-bit }
-            setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
-            taicpu(p).opsize := S_L;
-
-            Inc(taicpu(p).oper[0]^.ref^.offset, 4);
-
-            PreMessage := PreMessage + debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(p).oper[1]^.reg);
-            if (taicpu(hp1).oper[0]^.val = 32) then
+{$ifdef x86_64}
+            { Convert:
+                movq x(ref),%reg64
+                shrq y,%reg64
+              To:
+                movl x+4(ref),%reg32
+                shrl y-32,%reg32 (Remove if y = 32)
+            }
+            if (taicpu(p).opsize = S_Q) and
+              (taicpu(hp1).opcode = A_SHR) and
+              (taicpu(hp1).oper[0]^.val >= 32) then
               begin
-                DebugMsg(SPeepholeOptimization + PreMessage + ' (MovShr2Mov)', p);
-                RemoveInstruction(hp1);
-              end
-            else
-              begin
-                { This will potentially open up more arithmetic operations since
-                  the peephole optimizer now has a big hint that only the lower
-                  32 bits are currently in use (and opcodes are smaller in size) }
-                setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
-                taicpu(hp1).opsize := S_L;
+                PreMessage := 'movq ' + debug_operstr(taicpu(p).oper[0]^) + ',' + RegName1 + '; ' +
+                  'shrq $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + RegName1 + ' -> movl ';
 
-                Dec(taicpu(hp1).oper[0]^.val, 32);
-                DebugMsg(SPeepholeOptimization + PreMessage +
-                  '; shrl $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' (MovShr2MovShr)', p);
+                { Convert to 32-bit }
+                setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
+                taicpu(p).opsize := S_L;
+
+                Inc(taicpu(p).oper[0]^.ref^.offset, 4);
+
+                PreMessage := PreMessage + debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(p).oper[1]^.reg);
+                if (taicpu(hp1).oper[0]^.val = 32) then
+                  begin
+                    DebugMsg(SPeepholeOptimization + PreMessage + ' (MovShr2Mov)', p);
+                    RemoveInstruction(hp1);
+                  end
+                else
+                  begin
+                    { This will potentially open up more arithmetic operations since
+                      the peephole optimizer now has a big hint that only the lower
+                      32 bits are currently in use (and opcodes are smaller in size) }
+                    setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
+                    taicpu(hp1).opsize := S_L;
+
+                    Dec(taicpu(hp1).oper[0]^.val, 32);
+                    DebugMsg(SPeepholeOptimization + PreMessage +
+                      '; shrl $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' (MovShr2MovShr)', p);
+                  end;
+                Result := True;
+                Exit;
               end;
-            Result := True;
-            Exit;
-          end;
 {$endif x86_64}
+            { Convert:
+                movl   x(ref),%reg
+                shrl   $24,%reg
+              To:
+                movzbl x+3(ref),%reg
+
+              Similarly, movl; shrl $16 -> movzwl, and movw; shrw $8 -> movzbw
+
+              Also accept sar instead of shr, but convert to movsx instead of movzx
+            }
+            if taicpu(hp1).opcode = A_SHR then
+              MovUnaligned := A_MOVZX
+            else
+              MovUnaligned := A_MOVSX;
+
+            NewSize := S_NO;
+            NewOffset := 0;
+            case taicpu(p).opsize of
+              S_B:
+                { No valid combinations };
+              S_W:
+                if (taicpu(hp1).oper[0]^.val = 8) then
+                  begin
+                    NewSize := S_BW;
+                    NewOffset := 1;
+                  end;
+              S_L:
+                case taicpu(hp1).oper[0]^.val of
+                  16:
+                    begin
+                      NewSize := S_WL;
+                      NewOffset := 2;
+                    end;
+                  24:
+                    begin
+                      NewSize := S_BL;
+                      NewOffset := 3;
+                    end;
+                  else
+                    ;
+                end;
+{$ifdef x86_64}
+              S_Q:
+                case taicpu(hp1).oper[0]^.val of
+                  32:
+                    begin
+                      if taicpu(hp1).opcode = A_SAR then
+                        begin
+                          { 32-bit to 64-bit sign extension is a distinct instruction (MOVSXD) }
+                          MovUnaligned := A_MOVSXD;
+                          NewSize := S_LQ;
+                          NewOffset := 4;
+                        end
+                      else
+                        { Should have been handled by MovShr2Mov above }
+                        InternalError(2022081811);
+                    end;
+                  48:
+                    begin
+                      NewSize := S_WQ;
+                      NewOffset := 6;
+                    end;
+                  56:
+                    begin
+                      NewSize := S_BQ;
+                      NewOffset := 7;
+                    end;
+                  else
+                    ;
+                end;
+{$endif x86_64}
+              else
+                InternalError(2022081810);
+            end;
+
+            if (NewSize <> S_NO) and
+              (taicpu(p).oper[0]^.ref^.offset <= $7FFFFFFF - NewOffset) then
+              begin
+                PreMessage := 'mov' + debug_opsize2str(taicpu(p).opsize) + ' ' + debug_operstr(taicpu(p).oper[0]^) + ',' + RegName1 + '; ' +
+                  debug_op2str(taicpu(hp1).opcode) + debug_opsize2str(taicpu(p).opsize) + ' $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + RegName1 + ' -> ' +
+                  debug_op2str(MovUnaligned);
+
+{$ifdef x86_64}
+                if MovUnaligned <> A_MOVSXD then
+                  { Don't add size suffix for MOVSXD }
+{$endif x86_64}
+                  PreMessage := PreMessage + debug_opsize2str(NewSize);
+
+                Inc(taicpu(p).oper[0]^.ref^.offset, NewOffset);
+                taicpu(p).opcode := MovUnaligned;
+                taicpu(p).opsize := NewSize;
+
+                DebugMsg(SPeepholeOptimization + PreMessage + ' ' +
+                  debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' (MovShr/Sar2Movx)', p);
+
+                RemoveInstruction(hp1);
+                Result := True;
+                Exit;
+              end;
+          end;
 
         { Backward optimisation.  If we have:
             func.  %reg1,%reg2