fpc/compiler/llvm/nllvminl.pas

{
    Copyright (c) 2014 by Jonas Maebe

    Generate LLVM bytecode for inline nodes

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 ****************************************************************************
}
unit nllvminl;

{$i fpcdefs.inc}

interface

    uses
      node,
      ncginl;

    type
      tllvminlinenode = class(tcginlinenode)
       protected
        procedure maybe_remove_round_trunc_typeconv;

        function first_get_frame: tnode; override;
        function first_abs_real: tnode; override;
        function first_bitscan: tnode; override;
        function first_fma: tnode; override;
        function first_sqr_real: tnode; override;
        function first_sqrt_real: tnode; override;
        function first_trunc_real: tnode; override;
        function first_popcnt: tnode; override;
       public
        procedure second_length; override;
        procedure second_high; override;
        procedure second_sqr_real; override;
        procedure second_trunc_real; override;
      end;


implementation

     uses
       verbose,globals,globtype,constexp,cutils,
       aasmbase, aasmdata,
       symconst,symtype,symdef,defutil,
       compinnr,
       nutils,nadd,nbas,ncal,ncnv,ncon,nflw,ninl,nld,nmat,
       pass_2,
       cgbase,cgutils,tgobj,hlcgobj,
       cpubase,
       llvmbase,aasmllvm,aasmllvmmetadata;

     procedure tllvminlinenode.maybe_remove_round_trunc_typeconv;
       var
         temp: tnode;
       begin
         { the prototype of trunc()/round() in the system unit is declared
           with valreal as parameter type, so the argument will always be
           extended -> remove the typeconversion to extended if any; not done
           in ninl, because there are other code generators that assume that
           the parameter to trunc has been converted to valreal (e.g. PowerPC).

           (copy from code in nx64inl, should be refactored)
         }
         if (left.nodetype=typeconvn) and
            not(nf_explicit in left.flags) and
            (ttypeconvnode(left).left.resultdef.typ=floatdef) then
           begin
             { get rid of the type conversion, so the use_vectorfpu will be
               applied to the original type }
             temp:=ttypeconvnode(left).left;
             ttypeconvnode(left).left:=nil;
             left.free;
             left:=temp;
           end;
       end;


     function tllvminlinenode.first_get_frame: tnode;
       begin
         result:=ccallnode.createintern('llvm_frameaddress',
           ccallparanode.create(genintconstnode(0),nil));
       end;

    { in general, generate regular expression rather than intrinsics: according
      to the "Performance Tips for Frontend Authors", "The optimizer is quite
      good at reasoning about general control flow and arithmetic, it is not
      anywhere near as strong at reasoning about the various intrinsics. If
      profitable for code generation purposes, the optimizer will likely form
      the intrinsics itself late in the optimization pipeline." }

    function tllvminlinenode.first_abs_real: tnode;
      var
        lefttemp,
        resulttemp: ttempcreatenode;
        stat: tstatementnode;
      begin
        result:=internalstatements(stat);
        lefttemp:=ctempcreatenode.create(left.resultdef,left.resultdef.size,tt_persistent,true);
        { assigned twice -> will be spilled if put in register }
        resulttemp:=ctempcreatenode.create(resultdef,resultdef.size,tt_persistent,false);

        addstatement(stat,lefttemp);
        addstatement(stat,resulttemp);

        { lefttemp:=left }
        addstatement(stat,
          cassignmentnode.create(ctemprefnode.create(lefttemp),left)
        );

        { if lefttemp>=0 then
            resulttemp:=lefttemp
          else
            resulttemp:=-lefttemp
        }
        addstatement(stat,
          cifnode.create(
            caddnode.create(
              gten,
              ctemprefnode.create(lefttemp),
              crealconstnode.create(0.0,left.resultdef)
            ),
            cassignmentnode.create(
              ctemprefnode.create(resulttemp),
              ctemprefnode.create(lefttemp)
            ),
            cassignmentnode.create(
              ctemprefnode.create(resulttemp),
              cunaryminusnode.create(ctemprefnode.create(lefttemp))
            )
          )
        );
        addstatement(stat,ctempdeletenode.create(lefttemp));
        addstatement(stat,ctempdeletenode.create_normal_temp(resulttemp));
        { return resulttemp }
        addstatement(stat,ctemprefnode.create(resulttemp));
        { reused }
        left:=nil;
      end;


    function tllvminlinenode.first_bitscan: tnode;
      var
        leftdef: tdef;
        resulttemp,
        lefttemp: ttempcreatenode;
        stat: tstatementnode;
        block: tblocknode;
        cntresult: tnode;
        procname: string[15];
      begin
        {
          if left<>0 then
            result:=llvm_ctlz/cttz(unsigned(left),true)
          else
            result:=255;
        }
        if inlinenumber=in_bsr_x then
          procname:='LLVM_CTLZ'
        else
          procname:='LLVM_CTTZ';
        leftdef:=left.resultdef;
        block:=internalstatements(stat);
        resulttemp:=ctempcreatenode.create(resultdef,resultdef.size,tt_persistent,false);
        addstatement(stat,resulttemp);
        lefttemp:=maybereplacewithtemp(left,block,stat,left.resultdef.size,true);
        cntresult:=
          ccallnode.createintern(
            procname,
            ccallparanode.create(cordconstnode.create(1,llvmbool1type,false),
              ccallparanode.create(
                ctypeconvnode.create_explicit(left,get_unsigned_inttype(leftdef)),nil
              )
            )
          );
        { ctlz returns the number of leading zero bits, while bsr returns the bit
          number of the first non-zero bit (with the least significant bit as 0)
          -> invert result }
        if inlinenumber=in_bsr_x then
          begin
            cntresult:=
              caddnode.create(xorn,
                cntresult,
                genintconstnode(leftdef.size*8-1)
              );
          end;
        addstatement(stat,
          cifnode.create(caddnode.create(unequaln,left.getcopy,genintconstnode(0)),
            cassignmentnode.create(
              ctemprefnode.create(resulttemp),
              cntresult
            ),
            cassignmentnode.create(
              ctemprefnode.create(resulttemp),
              genintconstnode(255)
            )
          )
        );
        if assigned(lefttemp) then
          addstatement(stat,ctempdeletenode.create(lefttemp));
        addstatement(stat,ctempdeletenode.create_normal_temp(resulttemp));
        addstatement(stat,ctemprefnode.create(resulttemp));
        left:=nil;
        result:=block;
      end;


    function tllvminlinenode.first_fma: tnode;
      var
        exceptmode: ansistring;
        procname: string[40];
      begin
        if cs_opt_fastmath in current_settings.optimizerswitches then
          begin
            case inlinenumber of
              in_fma_single:
                procname:='llvm_fma_f32';
              in_fma_double:
                procname:='llvm_fma_f64';
              in_fma_extended:
                procname:='llvm_fma_f80';
              in_fma_float128:
                procname:='llvm_fma_f128';
              else
                internalerror(2018122101);
            end;
            result:=ccallnode.createintern(procname,left);
          end
        else
          begin
            case inlinenumber of
              in_fma_single,
              in_fma_double,
              in_fma_extended,
              in_fma_float128:
                procname:='LLVM_EXPERIMENTAL_CONSTRAINED_FMA';
              else
                internalerror(2019122811);
            end;
            exceptmode:=llvm_constrainedexceptmodestring;
            result:=ccallnode.createintern(procname,
              ccallparanode.create(cstringconstnode.createpchar(ansistring2pchar(exceptmode),length(exceptmode),llvm_metadatatype),
                ccallparanode.create(cstringconstnode.createpchar(ansistring2pchar('round.dynamic'),length('round.dynamic'),llvm_metadatatype),
                  left
                )
              )
            );
          end;
        left:=nil;
      end;


    function tllvminlinenode.first_sqr_real: tnode;
      begin
        result:=nil;
        if use_vectorfpu(left.resultdef) then
          expectloc:=LOC_MMREGISTER
        else
          expectloc:=LOC_FPUREGISTER;
      end;


    function tllvminlinenode.first_sqrt_real: tnode;
      var
        exceptmode: ansistring;
        intrinsic: string[40];
      begin
        if left.resultdef.typ<>floatdef then
          internalerror(2018121601);
        if cs_opt_fastmath in current_settings.optimizerswitches then
          begin
            case tfloatdef(left.resultdef).floattype of
              s32real:
                intrinsic:='llvm_sqrt_f32';
              s64real:
                intrinsic:='llvm_sqrt_f64';
              s80real,sc80real:
                intrinsic:='llvm_sqrt_f80';
              s128real:
                intrinsic:='llvm_sqrt_f128';
              else
                internalerror(2018121602);
            end;
            result:=ccallnode.createintern(intrinsic, ccallparanode.create(left,nil));
          end
        else
          begin
            case tfloatdef(left.resultdef).floattype of
              s32real,
              s64real,
              s80real,sc80real,
              s128real:
                intrinsic:='LLVM_EXPERIMENTAL_CONSTRAINED_SQRT';
              else
                internalerror(2019122810);
            end;
            exceptmode:=llvm_constrainedexceptmodestring;
            result:=ccallnode.createintern(intrinsic,
              ccallparanode.create(cstringconstnode.createpchar(ansistring2pchar(exceptmode),length(exceptmode),llvm_metadatatype),
                ccallparanode.create(cstringconstnode.createpchar(ansistring2pchar('round.dynamic'),length('round.dynamic'),llvm_metadatatype),
                  ccallparanode.create(left,nil)
                )
              )
            );
          end;
        left:=nil;
      end;


    function tllvminlinenode.first_trunc_real: tnode;
      begin
        { fptosi is undefined if the value is out of range -> only generate
          in cast of fastmath }
        if cs_opt_fastmath in current_settings.optimizerswitches then
          begin
            maybe_remove_round_trunc_typeconv;
            expectloc:=LOC_REGISTER;
            result:=nil;
          end
        else
          result:=inherited;
      end;

    function tllvminlinenode.first_popcnt: tnode;
      begin
        result:=ctypeconvnode.create(ccallnode.createintern('LLVM_CTPOP', ccallparanode.create(left,nil)),resultdef);
        left:=nil;
      end;


    procedure tllvminlinenode.second_length;
      var
        hreg: tregister;
      begin
        second_high;
        { Dynamic arrays do not have their length attached but their maximum index }
        if is_dynamic_array(left.resultdef) then
          begin
            hreg:=hlcg.getintregister(current_asmdata.CurrAsmList,resultdef);
            hlcg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_ADD,resultdef,1,location.register,hreg);
            location.register:=hreg;
          end;
      end;


    procedure tllvminlinenode.second_high;
      var
        lengthlab, nillab: tasmlabel;
        hregister: tregister;
        href: treference;
        lendef: tdef;
      begin
        secondpass(left);
        if is_shortstring(left.resultdef) then
         begin
            if not(left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
              internalerror(2014080806);
           { typecast the shortstring reference into a length byte reference }
           location_reset_ref(location,left.location.loc,def_cgsize(resultdef),left.location.reference.alignment,left.location.reference.volatility);
           hregister:=hlcg.getaddressregister(current_asmdata.CurrAsmList,cpointerdef.getreusable(resultdef));
           hlcg.a_loadaddr_ref_reg(current_asmdata.CurrAsmList,left.resultdef,cpointerdef.getreusable(resultdef),left.location.reference,hregister);
           hlcg.reference_reset_base(location.reference,cpointerdef.getreusable(resultdef),hregister,0,left.location.reference.temppos,left.location.reference.alignment,left.location.reference.volatility);
         end
        else
         begin
           { length in ansi/wide strings and high in dynamic arrays is at offset
             -sizeof(sizeint), for widestrings it's at -4 }
           if is_widestring(left.resultdef) then
             lendef:=u32inttype
           else
             lendef:=ossinttype;
           hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,
             left.resultdef,cpointerdef.getreusable(lendef),true);
           current_asmdata.getjumplabel(nillab);
           current_asmdata.getjumplabel(lengthlab);
           hlcg.a_cmp_const_reg_label(current_asmdata.CurrAsmList,cpointerdef.getreusable(lendef),OC_EQ,0,left.location.register,nillab);
           { volatility of the ansistring/widestring refers to the volatility of the
             string pointer, not of the string data }
           hlcg.reference_reset_base(href,cpointerdef.getreusable(lendef),left.location.register,-lendef.size,ctempposinvalid,lendef.alignment,[]);
           hregister:=hlcg.getintregister(current_asmdata.CurrAsmList,resultdef);
           hlcg.a_load_ref_reg(current_asmdata.CurrAsmList,lendef,resultdef,href,hregister);
           if is_widestring(left.resultdef) then
             hlcg.a_op_const_reg(current_asmdata.CurrAsmList,OP_SHR,resultdef,1,hregister);
           hlcg.a_jmp_always(current_asmdata.CurrAsmList,lengthlab);

           hlcg.a_label(current_asmdata.CurrAsmList,nillab);
           if is_dynamic_array(left.resultdef) then
             hlcg.a_load_const_reg(current_asmdata.CurrAsmList,resultdef,-1,hregister)
           else
             hlcg.a_load_const_reg(current_asmdata.CurrAsmList,resultdef,0,hregister);

           hlcg.a_label(current_asmdata.CurrAsmList,lengthlab);
           location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
           location.register:=hregister;
         end;
      end;


    procedure tllvminlinenode.second_sqr_real;
      begin
        secondpass(left);
        location.loc:=expectloc;
        if expectloc=LOC_MMREGISTER then
          begin
            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
            location.register:=hlcg.getmmregister(current_asmdata.CurrAsmList,resultdef);
          end
        else
          begin
            hlcg.location_force_fpureg(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
            location.register:=hlcg.getfpuregister(current_asmdata.CurrAsmList,resultdef);
          end;
        current_asmdata.CurrAsmList.concat(
          taillvm.op_reg_size_reg_reg(la_fmul,
            location.register,resultdef,
            left.location.register,left.location.register
          )
        );
      end;


    procedure tllvminlinenode.second_trunc_real;
      begin
        secondpass(left);
        if use_vectorfpu(left.resultdef) then
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true)
        else
          hlcg.location_force_fpureg(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
        location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
        location.register:=hlcg.getregisterfordef(current_asmdata.CurrAsmList,resultdef);
        current_asmdata.CurrAsmList.concat(
          taillvm.op_reg_size_reg_size(la_fptosi,location.register,left.resultdef,left.location.register,resultdef)
        );
      end;

begin
  cinlinenode:=tllvminlinenode;
end.