* patch by J. Gareth Moreton: AND/CMP optimisation, resolves #39287
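The new A_CMP rule relies on a simple identity: for a constant m with exactly one
bit set (the PopCnt = 1 guard in the code below), "(x and m) = m" holds exactly
when "(x and m) <> 0". A following "cmp m, %reg" can therefore become
"test %reg, %reg", which needs no immediate operand, provided the attached
condition is inverted (equal <-> not equal). A minimal sketch of that
equivalence, separate from the patch itself and with illustrative names only:

    program AndCmpSketch;
    {$mode objfpc}

    { Roughly the shape the peephole matches:
      and m, reg ; cmp m, reg ; j(c)/set(c)/cmov(c), c equal or not equal }
    function BitSetViaCmp(x, m: DWord): Boolean;
    begin
      Result := (x and m) = m;
    end;

    { Roughly the replacement shape:
      and m, reg ; test reg, reg ; inverted condition }
    function BitSetViaTest(x, m: DWord): Boolean;
    begin
      Result := (x and m) <> 0;
    end;

    var
      x: DWord;
    begin
      { 16 = %10000: a power-of-two mask, as the PopCnt check requires }
      for x := 0 to 255 do
        if BitSetViaCmp(x, 16) <> BitSetViaTest(x, 16) then
          WriteLn('mismatch at x = ', x);
      WriteLn('AND/CMP and AND/TEST agree for the single-bit mask');
    end.

As the comment in the patch notes, the TEST instruction is kept in rather than
removed outright so that later peepholes such as MovAndTest2Test can still fire.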

florian 2021-08-11 22:59:40 +02:00
parent 25e937b0c9
commit 306fae299e

@@ -8618,39 +8618,77 @@ unit aoptx86;
 begin
   if (taicpu(p).oper[0]^.typ = top_const) then
     begin
-      if (taicpu(hp1).opcode = A_AND) and
-        MatchOpType(taicpu(hp1),top_const,top_reg) and
-        (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
-        { the second register must contain the first one, so compare their subreg types }
-        (getsubreg(taicpu(p).oper[1]^.reg)<=getsubreg(taicpu(hp1).oper[1]^.reg)) and
-        (abs(taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val)<$80000000) then
-        { change
-          and const1, reg
-          and const2, reg
-          to
-          and (const1 and const2), reg
-        }
-        begin
-          taicpu(hp1).loadConst(0, taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val);
-          DebugMsg(SPeepholeOptimization + 'AndAnd2And done',hp1);
-          RemoveCurrentP(p, hp1);
-          Result:=true;
-          exit;
-        end
-      else if (taicpu(hp1).opcode = A_MOVZX) and
-        MatchOpType(taicpu(hp1),top_reg,top_reg) and
-        SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg) and
-        (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
-        (((taicpu(p).opsize=S_W) and
-          (taicpu(hp1).opsize=S_BW)) or
-         ((taicpu(p).opsize=S_L) and
-          (taicpu(hp1).opsize in [S_WL,S_BL{$ifdef x86_64},S_BQ,S_WQ{$endif x86_64}]))
-{$ifdef x86_64}
-         or
-         ((taicpu(p).opsize=S_Q) and
-          (taicpu(hp1).opsize in [S_BQ,S_WQ,S_BL,S_WL]))
-{$endif x86_64}
-        ) then
-        begin
+      case taicpu(hp1).opcode of
+        A_AND:
+          if MatchOpType(taicpu(hp1),top_const,top_reg) and
+            (getsupreg(taicpu(p).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) and
+            { the second register must contain the first one, so compare their subreg types }
+            (getsubreg(taicpu(p).oper[1]^.reg)<=getsubreg(taicpu(hp1).oper[1]^.reg)) and
+            (abs(taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val)<$80000000) then
+            { change
+              and const1, reg
+              and const2, reg
+              to
+              and (const1 and const2), reg
+            }
+            begin
+              taicpu(hp1).loadConst(0, taicpu(p).oper[0]^.val and taicpu(hp1).oper[0]^.val);
+              DebugMsg(SPeepholeOptimization + 'AndAnd2And done',hp1);
+              RemoveCurrentP(p, hp1);
+              Result:=true;
+              exit;
+            end;
+
+        A_CMP:
+          if (PopCnt(DWord(taicpu(p).oper[0]^.val)) = 1) and { Only 1 bit set }
+            MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.val) and
+            MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
+            { Just check that the condition on the next instruction is compatible }
+            GetNextInstruction(hp1, hp2) and
+            (hp2.typ = ait_instruction) and
+            (taicpu(hp2).condition in [C_Z, C_E, C_NZ, C_NE])
+            then
+            { change
+              and  2^n, reg
+              cmp  2^n, reg
+              j(c) / set(c) / cmov(c)  (c is equal or not equal)
+              to
+              and  2^n, reg
+              test reg, reg
+              j(~c) / set(~c) / cmov(~c)
+            }
+            begin
+              { Keep TEST instruction in, rather than remove it, because
+                it may trigger other optimisations such as MovAndTest2Test }
+              taicpu(hp1).loadreg(0, taicpu(hp1).oper[1]^.reg);
+              taicpu(hp1).opcode := A_TEST;
+              DebugMsg(SPeepholeOptimization + 'AND/CMP/J(c) -> AND/J(~c) with power of 2 constant', p);
+              taicpu(hp2).condition := inverse_cond(taicpu(hp2).condition);
+              Result := True;
+              Exit;
+            end;
+
+        A_MOVZX:
+          if MatchOpType(taicpu(hp1),top_reg,top_reg) and
+            SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.reg) and
+            (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) and
+            (
+              (
+                (taicpu(p).opsize=S_W) and
+                (taicpu(hp1).opsize=S_BW)
+              ) or
+              (
+                (taicpu(p).opsize=S_L) and
+                (taicpu(hp1).opsize in [S_WL,S_BL{$ifdef x86_64},S_BQ,S_WQ{$endif x86_64}])
+              )
+{$ifdef x86_64}
+              or
+              (
+                (taicpu(p).opsize=S_Q) and
+                (taicpu(hp1).opsize in [S_BQ,S_WQ,S_BL,S_WL])
+              )
+{$endif x86_64}
+            ) then
+            begin
           if (((taicpu(hp1).opsize) in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
               ((taicpu(p).oper[0]^.val and $ff)=taicpu(p).oper[0]^.val)
@@ -8673,108 +8711,114 @@ unit aoptx86;
               { See if there are other optimisations possible }
               Continue;
             end;
-        end
-      else if (taicpu(hp1).opcode = A_SHL) and
-        MatchOpType(taicpu(hp1),top_const,top_reg) and
-        (getsupreg(taicpu(p).oper[1]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) then
-        begin
+            end;
+
+        A_SHL:
+          if MatchOpType(taicpu(hp1),top_const,top_reg) and
+            (getsupreg(taicpu(p).oper[1]^.reg)=getsupreg(taicpu(hp1).oper[1]^.reg)) then
+            begin
 {$ifopt R+}
 {$define RANGE_WAS_ON}
 {$R-}
 {$endif}
           { get length of potential and mask }
           MaskLength:=SizeOf(taicpu(p).oper[0]^.val)*8-BsrQWord(taicpu(p).oper[0]^.val)-1;
           { really a mask? }
 {$ifdef RANGE_WAS_ON}
 {$R+}
 {$endif}
           if (((QWord(1) shl MaskLength)-1)=taicpu(p).oper[0]^.val) and
             { unmasked part shifted out? }
             ((MaskLength+taicpu(hp1).oper[0]^.val)>=topsize2memsize[taicpu(hp1).opsize]) then
             begin
               DebugMsg(SPeepholeOptimization + 'AndShlToShl done',p);
               RemoveCurrentP(p, hp1);
               Result:=true;
               exit;
             end;
-        end
-      else if (taicpu(hp1).opcode = A_SHR) and
-        MatchOpType(taicpu(hp1),top_const,top_reg) and
-        (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
-        (taicpu(hp1).oper[0]^.val <= 63) then
-        begin
-          { Does SHR combined with the AND cover all the bits?
-            e.g. for "andb $252,%reg; shrb $2,%reg" - the "and" can be removed }
-          MaskedBits := taicpu(p).oper[0]^.val or ((TCgInt(1) shl taicpu(hp1).oper[0]^.val) - 1);
-          if ((taicpu(p).opsize = S_B) and ((MaskedBits and $FF) = $FF)) or
-            ((taicpu(p).opsize = S_W) and ((MaskedBits and $FFFF) = $FFFF)) or
-            ((taicpu(p).opsize = S_L) and ((MaskedBits and $FFFFFFFF) = $FFFFFFFF)) then
-            begin
-              DebugMsg(SPeepholeOptimization + 'AndShrToShr done', p);
-              RemoveCurrentP(p, hp1);
-              Result := True;
-              Exit;
-            end;
-        end
-      else if ((taicpu(hp1).opcode = A_MOVSX){$ifdef x86_64} or (taicpu(hp1).opcode = A_MOVSXD){$endif x86_64}) and
-        (taicpu(hp1).oper[0]^.typ = top_reg) and
-        SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
-        begin
-          if SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) and
-            (
-              (
-                (taicpu(hp1).opsize in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
-                ((taicpu(p).oper[0]^.val and $7F) = taicpu(p).oper[0]^.val)
-              ) or (
-                (taicpu(hp1).opsize in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
-                ((taicpu(p).oper[0]^.val and $7FFF) = taicpu(p).oper[0]^.val)
-{$ifdef x86_64}
-              ) or (
-                (taicpu(hp1).opsize = S_LQ) and
-                ((taicpu(p).oper[0]^.val and $7fffffff) = taicpu(p).oper[0]^.val)
-{$endif x86_64}
-              )
-            ) then
-            begin
-              if (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg){$ifdef x86_64} or (taicpu(hp1).opsize = S_LQ){$endif x86_64} then
-                begin
-                  DebugMsg(SPeepholeOptimization + 'AndMovsxToAnd',p);
-                  RemoveInstruction(hp1);
-                  { See if there are other optimisations possible }
-                  Continue;
-                end;
-              { The super-registers are the same though.
-                Note that this change by itself doesn't improve
-                code speed, but it opens up other optimisations. }
-{$ifdef x86_64}
-              { Convert 64-bit register to 32-bit }
-              case taicpu(hp1).opsize of
-                S_BQ:
-                  begin
-                    taicpu(hp1).opsize := S_BL;
-                    taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
-                  end;
-                S_WQ:
-                  begin
-                    taicpu(hp1).opsize := S_WL;
-                    taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
-                  end
-                else
-                  ;
-              end;
-{$endif x86_64}
-              DebugMsg(SPeepholeOptimization + 'AndMovsxToAndMovzx', hp1);
-              taicpu(hp1).opcode := A_MOVZX;
-              { See if there are other optimisations possible }
-              Continue;
-            end;
-        end;
+            end;
+
+        A_SHR:
+          if MatchOpType(taicpu(hp1),top_const,top_reg) and
+            (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
+            (taicpu(hp1).oper[0]^.val <= 63) then
+            begin
+              { Does SHR combined with the AND cover all the bits?
+                e.g. for "andb $252,%reg; shrb $2,%reg" - the "and" can be removed }
+              MaskedBits := taicpu(p).oper[0]^.val or ((TCgInt(1) shl taicpu(hp1).oper[0]^.val) - 1);
+              if ((taicpu(p).opsize = S_B) and ((MaskedBits and $FF) = $FF)) or
+                ((taicpu(p).opsize = S_W) and ((MaskedBits and $FFFF) = $FFFF)) or
+                ((taicpu(p).opsize = S_L) and ((MaskedBits and $FFFFFFFF) = $FFFFFFFF)) then
+                begin
+                  DebugMsg(SPeepholeOptimization + 'AndShrToShr done', p);
+                  RemoveCurrentP(p, hp1);
+                  Result := True;
+                  Exit;
+                end;
+            end;
+
+        A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
+          if (taicpu(hp1).oper[0]^.typ = top_reg) and
+            SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
+            begin
+              if SuperRegistersEqual(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.reg) and
+                (
+                  (
+                    (taicpu(hp1).opsize in [S_BW,S_BL{$ifdef x86_64},S_BQ{$endif x86_64}]) and
+                    ((taicpu(p).oper[0]^.val and $7F) = taicpu(p).oper[0]^.val)
+                  ) or (
+                    (taicpu(hp1).opsize in [S_WL{$ifdef x86_64},S_WQ{$endif x86_64}]) and
+                    ((taicpu(p).oper[0]^.val and $7FFF) = taicpu(p).oper[0]^.val)
+{$ifdef x86_64}
+                  ) or (
+                    (taicpu(hp1).opsize = S_LQ) and
+                    ((taicpu(p).oper[0]^.val and $7fffffff) = taicpu(p).oper[0]^.val)
+{$endif x86_64}
+                  )
+                ) then
+                begin
+                  if (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg){$ifdef x86_64} or (taicpu(hp1).opsize = S_LQ){$endif x86_64} then
+                    begin
+                      DebugMsg(SPeepholeOptimization + 'AndMovsxToAnd',p);
+                      RemoveInstruction(hp1);
+                      { See if there are other optimisations possible }
+                      Continue;
+                    end;
+                  { The super-registers are the same though.
+                    Note that this change by itself doesn't improve
+                    code speed, but it opens up other optimisations. }
+{$ifdef x86_64}
+                  { Convert 64-bit register to 32-bit }
+                  case taicpu(hp1).opsize of
+                    S_BQ:
+                      begin
+                        taicpu(hp1).opsize := S_BL;
+                        taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
+                      end;
+                    S_WQ:
+                      begin
+                        taicpu(hp1).opsize := S_WL;
+                        taicpu(hp1).oper[1]^.reg := newreg(R_INTREGISTER, getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBD);
+                      end
+                    else
+                      ;
+                  end;
+{$endif x86_64}
+                  DebugMsg(SPeepholeOptimization + 'AndMovsxToAndMovzx', hp1);
+                  taicpu(hp1).opcode := A_MOVZX;
+                  { See if there are other optimisations possible }
+                  Continue;
+                end;
+            end;
+        else
+          ;
+      end;
     end;
 if (taicpu(hp1).is_jmp) and
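Two of the pre-existing rules that the patch merely reshuffles into the case
statement rest on identities that can be checked exhaustively. A small
self-contained sketch, again separate from the patch and with illustrative
names only:

    program AndFoldSketch;
    {$mode objfpc}

    var
      x: Integer;
    begin
      for x := 0 to 255 do
        begin
          { AndAnd2And: two consecutive AND constants fold into their conjunction }
          if ((x and $F0) and $3C) <> (x and ($F0 and $3C)) then
            WriteLn('AndAnd2And mismatch at ', x);
          { AndShrToShr: $FC or ((1 shl 2) - 1) = $FF, so the mask only clears
            bits that the 2-bit shift discards anyway and the AND is redundant;
            this is the "andb $252,%reg; shrb $2,%reg" example from the source }
          if ((x and $FC) shr 2) <> (x shr 2) then
            WriteLn('AndShrToShr mismatch at ', x);
        end;
      WriteLn('both identities hold for every byte value');
    end.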