mirror of
				https://gitlab.com/freepascal.org/fpc/source.git
				synced 2025-11-04 09:19:39 +01:00 
			
		
		
		
	+ patch by J. Gareth Moreton: SHL-centric peephole optimisations, resolves #37389
git-svn-id: trunk@45811 -
This commit is contained in:
		
							parent
							
								
									d6d92e3666
								
							
						
					
					
						commit
						09125e834f
					
				@ -3396,10 +3396,15 @@ unit aoptx86;
 | 
			
		||||
        TmpBool1,TmpBool2 : Boolean;
 | 
			
		||||
        tmpref : treference;
 | 
			
		||||
        hp1,hp2: tai;
 | 
			
		||||
        mask:    tcgint;
 | 
			
		||||
      begin
 | 
			
		||||
        Result:=false;
 | 
			
		||||
        if MatchOpType(taicpu(p),top_const,top_reg) and
 | 
			
		||||
           (taicpu(p).opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
 | 
			
		||||
 | 
			
		||||
        { All these optimisations work on "shl/sal const,%reg" }
 | 
			
		||||
        if not MatchOpType(taicpu(p),top_const,top_reg) then
 | 
			
		||||
          Exit;
 | 
			
		||||
 | 
			
		||||
        if (taicpu(p).opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
 | 
			
		||||
           (taicpu(p).oper[0]^.val <= 3) then
 | 
			
		||||
          { Changes "shl const, %reg32; add const/reg, %reg32" to one lea statement }
 | 
			
		||||
          begin
 | 
			
		||||
@ -3511,8 +3516,7 @@ unit aoptx86;
 | 
			
		||||
              end;
 | 
			
		||||
          end
 | 
			
		||||
{$ifndef x86_64}
 | 
			
		||||
        else if (current_settings.optimizecputype < cpu_Pentium2) and
 | 
			
		||||
          MatchOpType(taicpu(p),top_const,top_reg) then
 | 
			
		||||
        else if (current_settings.optimizecputype < cpu_Pentium2) then
 | 
			
		||||
          begin
 | 
			
		||||
            { changes "shl $1, %reg" to "add %reg, %reg", which is the same on a 386,
 | 
			
		||||
              but faster on a 486, and pairable in both U and V pipes on the Pentium
 | 
			
		||||
@ -3540,7 +3544,130 @@ unit aoptx86;
 | 
			
		||||
             end;
 | 
			
		||||
          end
 | 
			
		||||
{$endif x86_64}
 | 
			
		||||
          ;
 | 
			
		||||
        else if
 | 
			
		||||
          GetNextInstruction(p, hp1) and (hp1.typ = ait_instruction) and MatchOpType(taicpu(hp1), top_const, top_reg) and
 | 
			
		||||
          (
 | 
			
		||||
            (
 | 
			
		||||
              MatchInstruction(hp1, A_AND, [taicpu(p).opsize]) and
 | 
			
		||||
              SetAndTest(hp1, hp2)
 | 
			
		||||
{$ifdef x86_64}
 | 
			
		||||
            ) or
 | 
			
		||||
            (
 | 
			
		||||
              MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
 | 
			
		||||
              GetNextInstruction(hp1, hp2) and
 | 
			
		||||
              MatchInstruction(hp2, A_AND, [taicpu(p).opsize]) and
 | 
			
		||||
              MatchOpType(taicpu(hp2), top_reg, top_reg) and
 | 
			
		||||
              (taicpu(hp1).oper[1]^.reg = taicpu(hp2).oper[0]^.reg)
 | 
			
		||||
{$endif x86_64}
 | 
			
		||||
            )
 | 
			
		||||
          ) and
 | 
			
		||||
          (taicpu(p).oper[1]^.reg = taicpu(hp2).oper[1]^.reg) then
 | 
			
		||||
          begin
 | 
			
		||||
            { Change:
 | 
			
		||||
                shl x, %reg1
 | 
			
		||||
                mov -(1<<x), %reg2
 | 
			
		||||
                and %reg2, %reg1
 | 
			
		||||
 | 
			
		||||
              Or:
 | 
			
		||||
                shl x, %reg1
 | 
			
		||||
                and -(1<<x), %reg1
 | 
			
		||||
 | 
			
		||||
              To just:
 | 
			
		||||
                shl x, %reg1
 | 
			
		||||
 | 
			
		||||
              Since the and operation only zeroes bits that are already zero from the shl operation
 | 
			
		||||
            }
 | 
			
		||||
            case taicpu(p).oper[0]^.val of
 | 
			
		||||
               8:
 | 
			
		||||
                 mask:=$FFFFFFFFFFFFFF00;
 | 
			
		||||
               16:
 | 
			
		||||
                 mask:=$FFFFFFFFFFFF0000;
 | 
			
		||||
               32:
 | 
			
		||||
                 mask:=$FFFFFFFF00000000;
 | 
			
		||||
               63:
 | 
			
		||||
                 { Constant pre-calculated to prevent overflow errors with Int64 }
 | 
			
		||||
                 mask:=$8000000000000000;
 | 
			
		||||
               else
 | 
			
		||||
                 begin
 | 
			
		||||
                   if taicpu(p).oper[0]^.val >= 64 then
 | 
			
		||||
                     { Shouldn't happen realistically, since the register
 | 
			
		||||
                       is guaranteed to be set to zero at this point }
 | 
			
		||||
                     mask := 0
 | 
			
		||||
                   else
 | 
			
		||||
                     mask := -(Int64(1 shl taicpu(p).oper[0]^.val));
 | 
			
		||||
                 end;
 | 
			
		||||
            end;
 | 
			
		||||
 | 
			
		||||
            if taicpu(hp1).oper[0]^.val = mask then
 | 
			
		||||
              begin
 | 
			
		||||
                { Everything checks out, perform the optimisation, as long as
 | 
			
		||||
                  the FLAGS register isn't being used }
 | 
			
		||||
                TransferUsedRegs(TmpUsedRegs);
 | 
			
		||||
                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
 | 
			
		||||
 | 
			
		||||
{$ifdef x86_64}
 | 
			
		||||
                if (hp1 <> hp2) then
 | 
			
		||||
                  begin
 | 
			
		||||
                    { "shl/mov/and" version }
 | 
			
		||||
                    UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
 | 
			
		||||
 | 
			
		||||
                    { Don't do the optimisation if the FLAGS register is in use }
 | 
			
		||||
                    if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp2, TmpUsedRegs)) then
 | 
			
		||||
                      begin
 | 
			
		||||
                        DebugMsg(SPeepholeOptimization + 'ShlMovAnd2Shl', p);
 | 
			
		||||
                        { Don't remove the 'mov' instruction if its register is used elsewhere }
 | 
			
		||||
                        if not(RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, hp2, TmpUsedRegs)) then
 | 
			
		||||
                          begin
 | 
			
		||||
                            asml.Remove(hp1);
 | 
			
		||||
                            hp1.Free;
 | 
			
		||||
                            Result := True;
 | 
			
		||||
                          end;
 | 
			
		||||
 | 
			
		||||
                        { Only set Result to True if the 'mov' instruction was removed }
 | 
			
		||||
                        asml.Remove(hp2);
 | 
			
		||||
                        hp2.Free;
 | 
			
		||||
                      end;
 | 
			
		||||
                  end
 | 
			
		||||
                else
 | 
			
		||||
{$endif x86_64}
 | 
			
		||||
                  begin
 | 
			
		||||
                    { "shl/and" version }
 | 
			
		||||
 | 
			
		||||
                    { Don't do the optimisation if the FLAGS register is in use }
 | 
			
		||||
                    if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)) then
 | 
			
		||||
                      begin
 | 
			
		||||
                        DebugMsg(SPeepholeOptimization + 'ShlAnd2Shl', p);
 | 
			
		||||
                        asml.Remove(hp1);
 | 
			
		||||
                        hp1.Free;
 | 
			
		||||
                        Result := True;
 | 
			
		||||
                      end;
 | 
			
		||||
                  end;
 | 
			
		||||
 | 
			
		||||
                Exit;
 | 
			
		||||
              end
 | 
			
		||||
            else {$ifdef x86_64}if (hp1 = hp2) then{$endif x86_64}
 | 
			
		||||
              begin
 | 
			
		||||
                { Even if the mask doesn't allow for its removal, we might be
 | 
			
		||||
                  able to optimise the mask for the "shl/and" version, which
 | 
			
		||||
                  may permit other peephole optimisations }
 | 
			
		||||
{$ifdef DEBUG_AOPTCPU}
 | 
			
		||||
                mask := taicpu(hp1).oper[0]^.val and mask;
 | 
			
		||||
                if taicpu(hp1).oper[0]^.val <> mask then
 | 
			
		||||
                  begin
 | 
			
		||||
                    DebugMsg(
 | 
			
		||||
                      SPeepholeOptimization +
 | 
			
		||||
                      'Changed mask from $' + debug_tostr(taicpu(hp1).oper[0]^.val) +
 | 
			
		||||
                      ' to $' + debug_tostr(mask) +
 | 
			
		||||
                      'based on previous instruction (ShlAnd2ShlAnd)', hp1);
 | 
			
		||||
                    taicpu(hp1).oper[0]^.val := mask;
 | 
			
		||||
                  end;
 | 
			
		||||
{$else DEBUG_AOPTCPU}
 | 
			
		||||
                { If debugging is off, just set the operand even if it's the same }
 | 
			
		||||
                taicpu(hp1).oper[0]^.val := taicpu(hp1).oper[0]^.val and mask;
 | 
			
		||||
{$endif DEBUG_AOPTCPU}
 | 
			
		||||
              end;
 | 
			
		||||
 | 
			
		||||
          end;
 | 
			
		||||
      end;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -5357,6 +5484,35 @@ unit aoptx86;
 | 
			
		||||
                hp1.Free;
 | 
			
		||||
              end;
 | 
			
		||||
          end
 | 
			
		||||
        else if reg_and_hp1_is_instr and
 | 
			
		||||
          (taicpu(p).oper[0]^.typ = top_reg) and
 | 
			
		||||
          (
 | 
			
		||||
            (taicpu(hp1).opcode = A_SHL) or (taicpu(hp1).opcode = A_SAL)
 | 
			
		||||
          ) and
 | 
			
		||||
          (taicpu(hp1).oper[0]^.typ = top_const) and
 | 
			
		||||
          SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
 | 
			
		||||
          MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
 | 
			
		||||
          { Minimum shift value allowed is the bit difference between the sizes }
 | 
			
		||||
          (taicpu(hp1).oper[0]^.val >=
 | 
			
		||||
            { Multiply by 8 because tcgsize2size returns bytes, not bits }
 | 
			
		||||
            8 * (
 | 
			
		||||
              tcgsize2size[reg_cgsize(taicpu(p).oper[1]^.reg)] -
 | 
			
		||||
              tcgsize2size[reg_cgsize(taicpu(p).oper[0]^.reg)]
 | 
			
		||||
            )
 | 
			
		||||
          ) then
 | 
			
		||||
          begin
 | 
			
		||||
            { For:
 | 
			
		||||
                movsx/movzx %reg1,%reg1 (same register, just different sizes)
 | 
			
		||||
                shl/sal     ##,   %reg1
 | 
			
		||||
 | 
			
		||||
              Remove the movsx/movzx instruction if the shift overwrites the
 | 
			
		||||
              extended bits of the register (e.g. movslq %eax,%rax; shlq $32,%rax)
 | 
			
		||||
            }
 | 
			
		||||
            DebugMsg(SPeepholeOptimization + 'MovxShl2Shl',p);
 | 
			
		||||
            RemoveCurrentP(p, hp1);
 | 
			
		||||
            Result := True;
 | 
			
		||||
            Exit;
 | 
			
		||||
          end
 | 
			
		||||
        else if taicpu(p).opcode=A_MOVZX then
 | 
			
		||||
          begin
 | 
			
		||||
            { removes superfluous And's after movzx's }
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user