* patch by J. Gareth Moreton: AND/CMP optimisation, resolves #39287
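The new A_CMP rule relies on a simple identity: for a constant m with exactly one
bit set (the PopCnt = 1 guard in the code below), "(x and m) = m" holds exactly
when "(x and m) <> 0". A following "cmp m, %reg" can therefore become
"test %reg, %reg", which needs no immediate operand, provided the attached
condition is inverted (equal <-> not equal). A minimal sketch of that
equivalence, separate from the patch itself and with illustrative names only:

    program AndCmpSketch;
    {$mode objfpc}

    { Roughly the shape the peephole matches:
      and m, reg ; cmp m, reg ; j(c)/set(c)/cmov(c), c equal or not equal }
    function BitSetViaCmp(x, m: DWord): Boolean;
    begin
      Result := (x and m) = m;
    end;

    { Roughly the replacement shape:
      and m, reg ; test reg, reg ; inverted condition }
    function BitSetViaTest(x, m: DWord): Boolean;
    begin
      Result := (x and m) <> 0;
    end;

    var
      x: DWord;
    begin
      { 16 = %10000: a power-of-two mask, as the PopCnt check requires }
      for x := 0 to 255 do
        if BitSetViaCmp(x, 16) <> BitSetViaTest(x, 16) then
          WriteLn('mismatch at x = ', x);
      WriteLn('AND/CMP and AND/TEST agree for the single-bit mask');
    end.

As the comment in the patch notes, the TEST instruction is kept in rather than
removed outright so that later peepholes such as MovAndTest2Test can still fire.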

florian 2021-08-11 22:59:40 +02:00
parent 25e937b0c9
commit 306fae299e

@@ -8618,39 +8618,77 @@ unit aoptx86;
 begin
   if (taicpu(p).oper[0]^.typ = top_const) then
     begin
-      if (taicpu(hp1).opcode = A_AND) and
-        MatchOpType(taicpu(hp1),top_const,top_reg) and
-        (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
-        { the second register must contain the first one, so compare their subreg types }
-        (getsubreg(taicpu(p).oper[1]^.reg)<=getsubreg(taicpu(hp1).oper[1]^.reg)) and
-        (abs(taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val)<$80000000) then
-        { change
-          and const1, reg
-          and const2, reg
-          to
-          and (const1 and const2), reg
-        }
-        begin
-          taicpu(hp1).loadConst(0, taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val);
-          DebugMsg(SPeepholeOptimization + 'AndAnd2And done',hp1);
-          RemoveCurrentP(p, hp1);
-          Result:=true;
-          exit;
-        end
-      else if (taicpu(hp1).opcode = A_MOVZX) and
-        MatchOpType(taicpu(hp1),top_reg,top_reg) and
-        SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg) and
-        (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
-        (((taicpu(p).opsize=S_W) and
-          (taicpu(hp1).opsize=S_BW)) or
-         ((taicpu(p).opsize=S_L) and
-          (taicpu(hp1).opsize in [S_WL,S_BL{$ifdef x86_64},S_BQ,S_WQ{$endif x86_64}]))
-{$ifdef x86_64}
-         or
-         ((taicpu(p).opsize=S_Q) and
-          (taicpu(hp1).opsize in [S_BQ,S_WQ,S_BL,S_WL]))
-{$endif x86_64}
-        ) then
-        begin
+      case taicpu(hp1).opcode of
+        A_AND:
+          if MatchOpType(taicpu(hp1),top_const,top_reg) and
+            (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
+            { the second register must contain the first one, so compare their subreg types }
+            (getsubreg(taicpu(p).oper[1]^.reg)<=getsubreg(taicpu(hp1).oper[1]^.reg)) and
+            (abs(taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val)<$80000000) then
+            { change
+              and const1, reg
+              and const2, reg
+              to
+              and (const1 and const2), reg
+            }
+            begin
+              taicpu(hp1).loadConst(0, taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val);
+              DebugMsg(SPeepholeOptimization + 'AndAnd2And done',hp1);
+              RemoveCurrentP(p, hp1);
+              Result:=true;
+              exit;
+            end;
+
+        A_CMP:
+          if (PopCnt(DWord(taicpu(p).oper[0]^.val)) = 1) and { Only 1 bit set }
+            MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.val) and
+            MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
+            { Just check that the condition on the next instruction is compatible }
+            GetNextInstruction(hp1, hp2) and
+            (hp2.typ = ait_instruction) and
+            (taicpu(hp2).condition in [C_Z, C_E, C_NZ, C_NE])
+            then
+            { change
+              and  2^n, reg
+              cmp  2^n, reg
+              j(c) / set(c) / cmov(c)  (c is equal or not equal)
+              to
+              and  2^n, reg
+              test reg, reg
+              j(~c) / set(~c) / cmov(~c)
+            }
+            begin
+              { Keep TEST instruction in, rather than remove it, because
+                it may trigger other optimisations such as MovAndTest2Test }
+              taicpu(hp1).loadreg(0, taicpu(hp1).oper[1]^.reg);
+              taicpu(hp1).opcode := A_TEST;
+              DebugMsg(SPeepholeOptimization + 'AND/CMP/J(c) -> AND/J(~c) with power of 2 constant', p);
+              taicpu(hp2).condition := inverse_cond(taicpu(hp2).condition);
+              Result := True;
+              Exit;
+            end;
+
+        A_MOVZX:
+          if MatchOpType(taicpu(hp1),top_reg,top_reg) and
+            SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg) and
+            (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
+            (
+              (
+                (taicpu(p).opsize=S_W) and
+                (taicpu(hp1).opsize=S_BW)
+              ) or
+              (
+                (taicpu(p).opsize=S_L) and
+                (taicpu(hp1).opsize in [S_WL,S_BL{$ifdef x86_64},S_BQ,S_WQ{$endif x86_64}])
+              )
+{$ifdef x86_64}
+              or
+              (
+                (taicpu(p).opsize=S_Q) and
+                (taicpu(hp1).opsize in [S_BQ,S_WQ,S_BL,S_WL])
+              )
+{$endif x86_64}
+            ) then
+            begin
           if (((taicpu(hp1).opsize) in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
               ((taicpu(p).oper[0]^.val and $ff)=taicpu(p).oper[0]^.val)
@@ -8673,108 +8711,114 @@ unit aoptx86;
               { See if there are other optimisations possible }
               Continue;
             end;
-        end
-      else if (taicpu(hp1).opcode = A_SHL) and
-        MatchOpType(taicpu(hp1),top_const,top_reg) and
-        (getsupreg(taicpu(p).oper[1]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) then
-        begin
+            end;
+
+        A_SHL:
+          if MatchOpType(taicpu(hp1),top_const,top_reg) and
+            (getsupreg(taicpu(p).oper[1]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) then
+            begin
 {$ifopt R+}
 {$define RANGE_WAS_ON}
 {$R-}
 {$endif}
           { get length of potential and mask }
           MaskLength:=SizeOf(taicpu(p).oper[0]^.val)*8-BsrQWord(taicpu(p).oper[0]^.val)-1;
           { really a mask? }
 {$ifdef RANGE_WAS_ON}
 {$R+}
 {$endif}
           if (((QWord(1) shl MaskLength)-1)=taicpu(p).oper[0]^.val) and
             { unmasked part shifted out? }
             ((MaskLength+taicpu(hp1).oper[0]^.val)>=topsize2memsize[taicpu(hp1).opsize]) then
             begin
               DebugMsg(SPeepholeOptimization + 'AndShlToShl done',p);
               RemoveCurrentP(p, hp1);
               Result:=true;
               exit;
             end;
-        end
-      else if (taicpu(hp1).opcode = A_SHR) and
-        MatchOpType(taicpu(hp1),top_const,top_reg) and
-        (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
-        (taicpu(hp1).oper[0]^.val <= 63) then
-        begin
-          { Does SHR combined with the AND cover all the bits?
-            e.g. for "andb $252,%reg; shrb $2,%reg" - the "and" can be removed }
-          MaskedBits := taicpu(p).oper[0]^.val or ((TCgInt(1) shl taicpu(hp1).oper[0]^.val) - 1);
-          if ((taicpu(p).opsize = S_B) and ((MaskedBits and $FF) = $FF)) or
-            ((taicpu(p).opsize = S_W) and ((MaskedBits and $FFFF) = $FFFF)) or
-            ((taicpu(p).opsize = S_L) and ((MaskedBits and $FFFFFFFF) = $FFFFFFFF)) then
-            begin
-              DebugMsg(SPeepholeOptimization + 'AndShrToShr done', p);
-              RemoveCurrentP(p, hp1);
-              Result := True;
-              Exit;
-            end;
-        end
-      else if ((taicpu(hp1).opcode = A_MOVSX){$ifdef x86_64} or (taicpu(hp1).opcode = A_MOVSXD){$endif x86_64}) and
-        (taicpu(hp1).oper[0]^.typ = top_reg) and
-        SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
-        begin
-          if SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) and
-            (
-              (
-                (taicpu(hp1).opsize in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
-                ((taicpu(p).oper[0]^.val and $7F) = taicpu(p).oper[0]^.val)
-              ) or (
-                (taicpu(hp1).opsize in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
-                ((taicpu(p).oper[0]^.val and $7FFF) = taicpu(p).oper[0]^.val)
-{$ifdef x86_64}
-              ) or (
-                (taicpu(hp1).opsize = S_LQ) and
-                ((taicpu(p).oper[0]^.val and $7fffffff) = taicpu(p).oper[0]^.val)
-{$endif x86_64}
-              )
-            ) then
-            begin
-              if (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg){$ifdef x86_64} or (taicpu(hp1).opsize = S_LQ){$endif x86_64} then
-                begin
-                  DebugMsg(SPeepholeOptimization + 'AndMovsxToAnd',p);
-                  RemoveInstruction(hp1);
-                  { See if there are other optimisations possible }
-                  Continue;
-                end;
-              { The super-registers are the same though.
-                Note that this change by itself doesn't improve
-                code speed, but it opens up other optimisations. }
-{$ifdef x86_64}
-              { Convert 64-bit register to 32-bit }
-              case taicpu(hp1).opsize of
-                S_BQ:
-                  begin
-                    taicpu(hp1).opsize := S_BL;
-                    taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
-                  end;
-                S_WQ:
-                  begin
-                    taicpu(hp1).opsize := S_WL;
-                    taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
-                  end
-                else
-                  ;
-              end;
-{$endif x86_64}
-              DebugMsg(SPeepholeOptimization + 'AndMovsxToAndMovzx', hp1);
-              taicpu(hp1).opcode := A_MOVZX;
-              { See if there are other optimisations possible }
-              Continue;
-            end;
-        end;
+            end;
+
+        A_SHR:
+          if MatchOpType(taicpu(hp1),top_const,top_reg) and
+            (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
+            (taicpu(hp1).oper[0]^.val <= 63) then
+            begin
+              { Does SHR combined with the AND cover all the bits?
+                e.g. for "andb $252,%reg; shrb $2,%reg" - the "and" can be removed }
+              MaskedBits := taicpu(p).oper[0]^.val or ((TCgInt(1) shl taicpu(hp1).oper[0]^.val) - 1);
+              if ((taicpu(p).opsize = S_B) and ((MaskedBits and $FF) = $FF)) or
+                ((taicpu(p).opsize = S_W) and ((MaskedBits and $FFFF) = $FFFF)) or
+                ((taicpu(p).opsize = S_L) and ((MaskedBits and $FFFFFFFF) = $FFFFFFFF)) then
+                begin
+                  DebugMsg(SPeepholeOptimization + 'AndShrToShr done', p);
+                  RemoveCurrentP(p, hp1);
+                  Result := True;
+                  Exit;
+                end;
+            end;
+
+        A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
+          if (taicpu(hp1).oper[0]^.typ = top_reg) and
+            SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
+            begin
+              if SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) and
+                (
+                  (
+                    (taicpu(hp1).opsize in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
+                    ((taicpu(p).oper[0]^.val and $7F) = taicpu(p).oper[0]^.val)
+                  ) or (
+                    (taicpu(hp1).opsize in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
+                    ((taicpu(p).oper[0]^.val and $7FFF) = taicpu(p).oper[0]^.val)
+{$ifdef x86_64}
+                  ) or (
+                    (taicpu(hp1).opsize = S_LQ) and
+                    ((taicpu(p).oper[0]^.val and $7fffffff) = taicpu(p).oper[0]^.val)
+{$endif x86_64}
+                  )
+                ) then
+                begin
+                  if (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg){$ifdef x86_64} or (taicpu(hp1).opsize = S_LQ){$endif x86_64} then
+                    begin
+                      DebugMsg(SPeepholeOptimization + 'AndMovsxToAnd',p);
+                      RemoveInstruction(hp1);
+                      { See if there are other optimisations possible }
+                      Continue;
+                    end;
+                  { The super-registers are the same though.
+                    Note that this change by itself doesn't improve
+                    code speed, but it opens up other optimisations. }
+{$ifdef x86_64}
+                  { Convert 64-bit register to 32-bit }
+                  case taicpu(hp1).opsize of
+                    S_BQ:
+                      begin
+                        taicpu(hp1).opsize := S_BL;
+                        taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
+                      end;
+                    S_WQ:
+                      begin
+                        taicpu(hp1).opsize := S_WL;
+                        taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
+                      end
+                    else
+                      ;
+                  end;
+{$endif x86_64}
+                  DebugMsg(SPeepholeOptimization + 'AndMovsxToAndMovzx', hp1);
+                  taicpu(hp1).opcode := A_MOVZX;
+                  { See if there are other optimisations possible }
+                  Continue;
+                end;
+            end;
+        else
+          ;
+      end;
     end;
 if (taicpu(hp1).is_jmp) and
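Two of the pre-existing rules that the patch merely reshuffles into the case
statement rest on identities that can be checked exhaustively. A small
self-contained sketch, again separate from the patch and with illustrative
names only:

    program AndFoldSketch;
    {$mode objfpc}

    var
      x: Integer;
    begin
      for x := 0 to 255 do
        begin
          { AndAnd2And: two consecutive AND constants fold into their conjunction }
          if ((x and $F0) and $3C) <> (x and ($F0 and $3C)) then
            WriteLn('AndAnd2And mismatch at ', x);
          { AndShrToShr: $FC or ((1 shl 2) - 1) = $FF, so the mask only clears
            bits that the 2-bit shift discards anyway and the AND is redundant;
            this is the "andb $252,%reg; shrb $2,%reg" example from the source }
          if ((x and $FC) shr 2) <> (x shr 2) then
            WriteLn('AndShrToShr mismatch at ', x);
        end;
      WriteLn('both identities hold for every byte value');
    end.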