Added initial support for the Cortex-M4F FPv4_S16 FPU

git-svn-id: branches/laksen/arm-embedded@22597 -
2025-10-23 17:51:37 +02:00 · 2012-10-08 20:10:45 +00:00 · 2012-10-08 20:10:45 +00:00 · a8f9b0dac4
commit a8f9b0dac4
parent 0087661fb5
15 changed files with 461 additions and 36 deletions
--- a/compiler/arm/agarmgas.pas
+++ b/compiler/arm/agarmgas.pas
@ -106,6 +106,8 @@ unit agarmgas;
          result:='-mfpu=vfpv3 '+result;
        if (current_settings.fputype = fpu_vfpv3_d16) then
          result:='-mfpu=vfpv3-d16 '+result;
+        if (current_settings.fputype = fpu_fpv4_s16) then
+          result:='-mfpu=fpv4-sp-d16 '+result;

        if current_settings.cputype=cpu_armv7m then
          result:='-march=armv7m -mthumb -mthumb-interwork '+result
@ -292,8 +294,10 @@ unit agarmgas;

          if taicpu(hp).ops = 0 then
            s:=#9+gas_op2str[op]+' '+cond2str[taicpu(hp).condition]+oppostfix2str[taicpu(hp).oppostfix]
+          else if (taicpu(hp).opcode>=A_VABS) and (taicpu(hp).opcode<=A_VSUB) then
+            s:=#9+gas_op2str[op]+cond2str[taicpu(hp).condition]+oppostfix2str[taicpu(hp).oppostfix]
          else
-            s:=#9+gas_op2str[op]+oppostfix2str[taicpu(hp).oppostfix]+postfix+cond2str[taicpu(hp).condition]; // Conditional infixes are deprecated in unified syntax
+            s:=#9+gas_op2str[op]+oppostfix2str[taicpu(hp).oppostfix]+cond2str[taicpu(hp).condition]+postfix; // Conditional infixes are deprecated in unified syntax
        end
      else
        s:=#9+gas_op2str[op]+cond2str[taicpu(hp).condition]+oppostfix2str[taicpu(hp).oppostfix];
--- a/compiler/arm/cgcpu.pas
+++ b/compiler/arm/cgcpu.pas
@ -161,6 +161,12 @@ unit cgcpu;
        procedure g_proc_exit(list : TAsmList;parasize : longint;nostackframe:boolean); override;

        function handle_load_store(list:TAsmList;op: tasmop;oppostfix : toppostfix;reg:tregister;ref: treference):treference; override;
+
+        procedure a_loadmm_reg_reg(list: TAsmList; fromsize, tosize : tcgsize;reg1, reg2: tregister;shuffle : pmmshuffle); override;
+        procedure a_loadmm_ref_reg(list: TAsmList; fromsize, tosize : tcgsize;const ref: treference; reg: tregister;shuffle : pmmshuffle); override;
+        procedure a_loadmm_reg_ref(list: TAsmList; fromsize, tosize : tcgsize;reg: tregister; const ref: treference;shuffle : pmmshuffle); override;
+        procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize : tcgsize;intreg, mmreg: tregister; shuffle: pmmshuffle); override;
+        procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize : tcgsize;mmreg, intreg: tregister; shuffle : pmmshuffle); override;
      end;

      tthumb2cg64farm = class(tcg64farm)
@ -3120,10 +3126,17 @@ unit cgcpu;
          rg[R_INTREGISTER]:=trgintcputhumb2.create(R_INTREGISTER,R_SUBWHOLE,
              [RS_R0,RS_R1,RS_R2,RS_R3,RS_R4,RS_R5,RS_R6,RS_R7,RS_R8,
               RS_R10,RS_R12,RS_R14],first_int_imreg,[]);
-        rg[R_FPUREGISTER]:=trgcputhumb2.create(R_FPUREGISTER,R_SUBNONE,
+        rg[R_FPUREGISTER]:=trgcpu.create(R_FPUREGISTER,R_SUBNONE,
            [RS_F0,RS_F1,RS_F2,RS_F3,RS_F4,RS_F5,RS_F6,RS_F7],first_fpu_imreg,[]);
-        rg[R_MMREGISTER]:=trgcputhumb2.create(R_MMREGISTER,R_SUBNONE,
-            [RS_S0,RS_S1,RS_R2,RS_R3,RS_R4,RS_S31],first_mm_imreg,[]);
+
+        if current_settings.fputype=fpu_fpv4_s16 then
+          rg[R_MMREGISTER]:=trgcpu.create(R_MMREGISTER,R_SUBFD,
+              [RS_D0,RS_D1,RS_D2,RS_D3,RS_D4,RS_D5,RS_D6,RS_D7,
+               RS_D8,RS_D9,RS_D10,RS_D11,RS_D12,RS_D13,RS_D14,RS_D15
+              ],first_mm_imreg,[])
+        else
+          rg[R_MMREGISTER]:=trgcpu.create(R_MMREGISTER,R_SUBNONE,
+              [RS_S0,RS_S1,RS_R2,RS_R3,RS_R4,RS_S31],first_mm_imreg,[]);
      end;


@ -3959,6 +3972,127 @@ unit cgcpu;
        Result := ref;
      end;

+     procedure Tthumb2cgarm.a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister; shuffle: pmmshuffle);
+      { Copies the FPU register reg1 into reg2.
+
+        Only the single precision case (OS_F32 to OS_F32) emits code: one
+        VMOV with the .f32 postfix, which is appended to list and then
+        handed to add_move_instruction. Every other size combination,
+        including OS_F64 to OS_F64 and OS_F32 to OS_F64, emits nothing at
+        all. The shuffle parameter is not inspected.
+
+        NOTE(review): the silent no-op for the non-F32 cases looks like
+        unfinished work - confirm that no caller expects a real move or a
+        conversion to be generated for them. }
+      var
+        move: taicpu;
+      begin
+        if (fromsize<>OS_F32) or
+          (tosize<>OS_F32) then
+          exit;
+
+        move:=setoppostfix(taicpu.op_reg_reg(A_VMOV,reg2,reg1), PF_F32);
+        list.Concat(move);
+        add_move_instruction(move);
+      end;
+
+     procedure Tthumb2cgarm.a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister; shuffle: pmmshuffle);
+      { Loads the FPU register reg from the memory location ref with VLDR.
+
+        The reference is first reduced, on a local copy, to a form with a
+        single base register:
+          - base+index (optionally shifted) is added up into a newly
+            requested integer register;
+          - a symbolic reference has its address loaded into a newly
+            requested integer register via a_loadaddr_ref_reg.
+        The postfix is .f32 when fromsize is OS_F32 and .f64 otherwise.
+        tosize and shuffle are not inspected.
+
+        NOTE(review): the remaining offset is not range checked here -
+        confirm that callers only pass offsets VLDR can encode. }
+      var
+        addr: treference;
+        addrreg: TRegister;
+        shift: tshifterop;
+        pf: toppostfix;
+      begin
+        addr:=ref;
+
+        if (addr.base<>NR_NO) and
+          (addr.index<>NR_NO) then
+          begin
+            addrreg:=getintregister(list,OS_INT);
+            if addr.shiftmode=SM_None then
+              a_op_reg_reg_reg(list,OP_ADD,OS_INT,addr.index,addr.base,addrreg)
+            else
+              begin
+                { addrreg := base + (index shifted as the reference asks) }
+                shift.rs:=addr.index;
+                shift.shiftimm:=addr.shiftimm;
+                shift.shiftmode:=addr.shiftmode;
+                list.concat(taicpu.op_reg_reg_shifterop(A_ADD,addrreg,addr.base,shift));
+              end;
+
+            { continue with addrreg as the only register, same offset }
+            reference_reset_base(addr,addrreg,addr.offset,0);
+          end;
+
+        if assigned(addr.symbol) then
+          begin
+            addrreg:=getintregister(list,OS_INT);
+            a_loadaddr_ref_reg(list,addr,addrreg);
+
+            reference_reset_base(addr,addrreg,0,0);
+          end;
+
+        if fromsize=OS_F32 then
+          pf:=PF_F32
+        else
+          pf:=PF_F64;
+        list.Concat(setoppostfix(taicpu.op_reg_ref(A_VLDR,reg,addr), pf));
+      end;
+
+     procedure Tthumb2cgarm.a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference; shuffle: pmmshuffle);
+      { Stores the FPU register reg to the memory location ref with VSTR.
+
+        The reference is first reduced, on a local copy, to a form with a
+        single base register:
+          - base+index (optionally shifted) is added up into a newly
+            requested integer register;
+          - a symbolic reference has its address loaded into a newly
+            requested integer register via a_loadaddr_ref_reg.
+        The postfix is .32 when fromsize is OS_F32 and .64 otherwise.
+        tosize and shuffle are not inspected.
+
+        NOTE(review): the load counterpart uses the .f32/.f64 postfixes
+        while this store uses .32/.64 - confirm the difference is
+        intentional. }
+      var
+        addr: treference;
+        shift: tshifterop;
+        addrreg: TRegister;
+        pf: toppostfix;
+      begin
+        addr:=ref;
+
+        if (addr.base<>NR_NO) and
+          (addr.index<>NR_NO) then
+          begin
+            addrreg:=getintregister(list,OS_INT);
+            if addr.shiftmode=SM_None then
+              a_op_reg_reg_reg(list,OP_ADD,OS_INT,addr.index,addr.base,addrreg)
+            else
+              begin
+                { addrreg := base + (index shifted as the reference asks) }
+                shift.rs:=addr.index;
+                shift.shiftimm:=addr.shiftimm;
+                shift.shiftmode:=addr.shiftmode;
+                list.concat(taicpu.op_reg_reg_shifterop(A_ADD,addrreg,addr.base,shift));
+              end;
+
+            { continue with addrreg as the only register, same offset }
+            reference_reset_base(addr,addrreg,addr.offset,0);
+          end;
+
+        if assigned(addr.symbol) then
+          begin
+            addrreg:=getintregister(list,OS_INT);
+            a_loadaddr_ref_reg(list,addr,addrreg);
+
+            reference_reset_base(addr,addrreg,0,0);
+          end;
+
+        if fromsize=OS_F32 then
+          pf:=PF_32
+        else
+          pf:=PF_64;
+        list.Concat(setoppostfix(taicpu.op_reg_ref(A_VSTR,reg,addr), pf));
+      end;
+
+     procedure Tthumb2cgarm.a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle);
+      { Moves the integer register intreg into the FPU register mmreg
+        with a plain VMOV. Only a destination size of OS_F32 is accepted;
+        anything else raises internal error 2012100813. fromsize and
+        shuffle are not inspected. }
+      begin
+        if tosize<>OS_F32 then
+          internalerror(2012100813)
+        else
+          list.Concat(taicpu.op_reg_reg(A_VMOV,mmreg,intreg));
+      end;
+
+     procedure Tthumb2cgarm.a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle);
+      { Moves the FPU register mmreg into the integer register intreg
+        with a plain VMOV. Only a source size of OS_F32 is accepted;
+        anything else raises internal error 2012100814. tosize and
+        shuffle are not inspected. }
+      begin
+        if fromsize<>OS_F32 then
+          internalerror(2012100814)
+        else
+          list.Concat(taicpu.op_reg_reg(A_VMOV,intreg,mmreg));
+      end;
+

    procedure tthumb2cg64farm.a_op64_reg_reg(list : TAsmList;op:TOpCG;size : tcgsize;regsrc,regdst : tregister64);
      var tmpreg: tregister;
--- a/compiler/arm/cpubase.pas
+++ b/compiler/arm/cpubase.pas
@ -139,7 +139,11 @@ unit cpubase;
        { multiple load/store vfp address modes }
        PF_IAD,PF_DBD,PF_FDD,PF_EAD,
        PF_IAS,PF_DBS,PF_FDS,PF_EAS,
-        PF_IAX,PF_DBX,PF_FDX,PF_EAX
+        PF_IAX,PF_DBX,PF_FDX,PF_EAX,
+        { FPv4 postfixes }
+        PF_32,PF_64,PF_F32,PF_F64,
+        PF_F32S32,PF_F32U32,
+        PF_S32F32,PF_U32F32
      );

      TOpPostfixes = set of TOpPostfix;
@ -152,14 +156,17 @@ unit cpubase;
        PF_None,PF_None,PF_None,PF_None,PF_None,PF_None,PF_None,PF_None,PF_None,PF_None,
        PF_S,PF_D,PF_E,PF_None,PF_None);

-      oppostfix2str : array[TOpPostfix] of string[3] = ('',
+      oppostfix2str : array[TOpPostfix] of string[8] = ('',
        's',
        'd','e','p','ep',
        'b','sb','bt','h','sh','t',
        'ia','ib','da','db','fd','fa','ed','ea',
        'iad','dbd','fdd','ead',
        'ias','dbs','fds','eas',
-        'iax','dbx','fdx','eax');
+        'iax','dbx','fdx','eax',
+        '.32','.64','.f32','.f64',
+        '.f32.s32','.f32.u32',
+        '.s32.f32','.u32.f32');

      roundingmode2str : array[TRoundingMode] of string[1] = ('',
        'p','m','z');
@ -371,7 +378,7 @@ unit cpubase;


    const
-      std_regname_table : array[tregisterindex] of string[7] = (
+      std_regname_table : array[tregisterindex] of string[10] = (
        {$i rarmstd.inc}
      );

--- a/compiler/arm/cpuinfo.pas
+++ b/compiler/arm/cpuinfo.pas
@ -65,7 +65,8 @@ Type
      fpu_fpa11,
      fpu_vfpv2,
      fpu_vfpv3,
-      fpu_vfpv3_d16
+      fpu_vfpv3_d16,
+      fpu_fpv4_s16
     );

   tcontrollertype =
@ -227,7 +228,8 @@ Const
     'FPA11',
     'VFPV2',
     'VFPV3',
-     'VFPV3_D16'
+     'VFPV3_D16',
+     'FPV4_S16'
   );


@ -1004,7 +1006,7 @@ Const
        )
    );

-   vfp_scalar = [fpu_vfpv2,fpu_vfpv3,fpu_vfpv3_d16];
+   vfp_scalar = [fpu_vfpv2,fpu_vfpv3,fpu_vfpv3_d16,fpu_fpv4_s16];

   { Supported optimizations, only used for information }
   supported_optimizerswitches = genericlevel1optimizerswitches+
--- a/compiler/arm/cpupara.pas
+++ b/compiler/arm/cpupara.pas
@ -124,7 +124,7 @@ unit cpupara;
                getparaloc:=LOC_MMREGISTER
              else if (calloption in [pocall_cdecl,pocall_cppdecl,pocall_softfloat]) or
                 (cs_fp_emulation in current_settings.moduleswitches) or
-                 (current_settings.fputype in [fpu_vfpv2,fpu_vfpv3,fpu_vfpv3_d16]) then
+                 (current_settings.fputype in [fpu_vfpv2,fpu_vfpv3,fpu_vfpv3_d16,fpu_fpv4_s16]) then
                { the ARM eabi also allows passing VFP values via VFP registers,
                  but Mac OS X doesn't seem to do that and linux only does it if
                  built with the "-mfloat-abi=hard" option }
@ -608,7 +608,7 @@ unit cpupara;
              end
            else if (p.proccalloption in [pocall_softfloat]) or
               (cs_fp_emulation in current_settings.moduleswitches) or
-               (current_settings.fputype in [fpu_vfpv2,fpu_vfpv3,fpu_vfpv3_d16]) then
+               (current_settings.fputype in [fpu_vfpv2,fpu_vfpv3,fpu_vfpv3_d16,fpu_fpv4_s16]) then
              begin
                case retcgsize of
                  OS_64,
--- a/compiler/arm/cpupi.pas
+++ b/compiler/arm/cpupi.pas
@ -118,6 +118,14 @@ unit cpupi;
                if r in regs then
                  inc(floatsavesize,8);
            end;
+          fpu_fpv4_s16:
+            begin
+              floatsavesize:=0;
+              regs:=cg.rg[R_MMREGISTER].used_in_proc-paramanager.get_volatile_registers_mm(pocall_stdcall);
+              for r:=RS_D0 to RS_D15 do
+                if r in regs then
+                  inc(floatsavesize,8);
+            end;
        end;
        floatsavesize:=align(floatsavesize,max(current_settings.alignment.localalignmin,4));
        result:=Align(tg.direction*tg.lasttemp,max(current_settings.alignment.localalignmin,4))+maxpushedparasize+aint(floatsavesize);
--- a/compiler/arm/itcpugas.pas
+++ b/compiler/arm/itcpugas.pas
@ -46,7 +46,7 @@ implementation
      cutils,verbose;

    const
-      gas_regname_table : array[tregisterindex] of string[7] = (
+      gas_regname_table : array[tregisterindex] of string[10] = (
        {$i rarmstd.inc}
      );

--- a/compiler/arm/narmadd.pas
+++ b/compiler/arm/narmadd.pas
@ -35,6 +35,7 @@ interface
       public
          function pass_1 : tnode;override;
       protected
+          function first_addfloat: tnode; override;
          procedure second_addfloat;override;
          procedure second_cmpfloat;override;
          procedure second_cmpordinal;override;
@ -48,12 +49,12 @@ interface
      globtype,systems,
      cutils,verbose,globals,
      constexp,
-      symconst,symdef,paramgr,
+      symconst,symdef,paramgr,symtable,symtype,
      aasmbase,aasmtai,aasmdata,aasmcpu,defutil,htypechk,
      cgbase,cgutils,cgcpu,
      cpuinfo,pass_1,pass_2,regvars,procinfo,
      cpupara,
-      ncon,nset,nadd,
+      ncon,nset,nadd,ncnv,ncal,nmat,
      ncgutil,tgobj,rgobj,rgcpu,cgobj,cg64f32,
      hlcgobj
      ;
@ -212,6 +213,36 @@ interface
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,
                 location.register,left.location.register,right.location.register));
            end;
+          fpu_fpv4_s16:
+            begin
+              { force mmreg as location, left right doesn't matter
+                as both will be in a fpureg }
+              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,true);
+              location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,true);
+
+              location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
+              if left.location.loc<>LOC_CMMREGISTER then
+                location.register:=left.location.register
+              else if right.location.loc<>LOC_CMMREGISTER then
+                location.register:=right.location.register
+              else
+                location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
+
+              case nodetype of
+                addn :
+                  op:=A_VADD;
+                muln :
+                  op:=A_VMUL;
+                subn :
+                  op:=A_VSUB;
+                slashn :
+                  op:=A_VDIV;
+                else
+                  internalerror(2009111401);
+              end;
+
+              current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg_reg(op, location.register,left.location.register,right.location.register), PF_F32));
+            end;
          fpu_soft:
            { this case should be handled already by pass1 }
            internalerror(200308252);
@ -273,6 +304,21 @@ interface
              cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
              current_asmdata.CurrAsmList.concat(taicpu.op_none(A_FMSTAT));
            end;
+          fpu_fpv4_s16:
+            begin
+              { single precision compare: both operands are forced into
+                mm registers, compared, and the FPU status flags are then
+                copied into the CPU flags with VMRS }
+              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,true);
+              location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,true);
+
+              if nodetype in [equaln,unequaln] then
+                op:=A_VCMP
+              else
+                op:=A_VCMPE;
+
+              { carry the .f32 data type postfix like the other FPv4
+                arithmetic instructions do }
+              current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(op,
+                left.location.register,right.location.register), PF_F32));
+              cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
+              current_asmdata.CurrAsmList.Concat(taicpu.op_reg_reg(A_VMRS, NR_APSR_nzcv, NR_FPSCR));
+            end;
          fpu_soft:
            { this case should be handled already by pass1 }
            internalerror(2009112404);
@ -464,6 +510,83 @@ interface
          end;
      end;

+    function tarmaddnode.first_addfloat: tnode;
+      { First-pass handling of floating point add/sub/mul/div and
+        comparisons.
+
+        For every FPU type other than fpu_fpv4_s16 the inherited
+        implementation decides. For fpu_fpv4_s16:
+          - a double (s64real) left operand turns the node into a call of
+            the matching float64_* helper, both operands being converted
+            to the system type FLOAT64 and the call result converted back
+            to resultdef (pasbool8type for comparisons);
+          - any other float type returns nil, i.e. no replacement node.
+        Greater, greater-or-equal and unequal are expressed through the
+        le, lt and eq helpers with the result wrapped in a not node. }
+      var
+        procname: string[31];
+        { true when the helper result has to be inverted }
+        notnode : boolean;
+        fdef : tdef;
+        paras : tnode;
+      begin
+        if current_settings.fputype<>fpu_fpv4_s16 then
+          exit(inherited first_addfloat);
+
+        result:=nil;
+        if tfloatdef(left.resultdef).floattype<>s64real then
+          exit;
+
+        fdef:=search_system_type('FLOAT64').typedef;
+        notnode:=nodetype in [gtn,gten,unequaln];
+
+        case nodetype of
+          addn:
+            procname:='float64_add';
+          muln:
+            procname:='float64_mul';
+          subn:
+            procname:='float64_sub';
+          slashn:
+            procname:='float64_div';
+          ltn,gten:
+            procname:='float64_lt';
+          lten,gtn:
+            procname:='float64_le';
+          equaln,unequaln:
+            procname:='float64_eq';
+          else
+            begin
+              procname:='float64';
+              CGMessage3(type_e_operator_not_supported_for_types,node2opstr(nodetype),left.resultdef.typename,right.resultdef.typename);
+            end;
+        end;
+
+        if nodetype in [ltn,lten,gtn,gten,equaln,unequaln] then
+          resultdef:=pasbool8type;
+
+        { left is chained in first, right is put in front of it }
+        paras:=ccallparanode.create(ctypeconvnode.create_internal(left,fdef),nil);
+        paras:=ccallparanode.create(ctypeconvnode.create_internal(right,fdef),paras);
+        result:=ctypeconvnode.create_internal(ccallnode.createintern(procname,paras),resultdef);
+
+        { both operands now belong to the call node }
+        left:=nil;
+        right:=nil;
+
+        if notnode then
+          result:=cnotnode.create(result);
+      end;
+

    procedure tarmaddnode.second_cmpordinal;
      var
--- a/compiler/arm/narmcal.pas
+++ b/compiler/arm/narmcal.pas
@ -49,7 +49,7 @@ implementation
      if (realresdef.typ=floatdef) and 
         (target_info.abi <> abi_eabihf) and
         ((cs_fp_emulation in current_settings.moduleswitches) or
-          (current_settings.fputype in [fpu_vfpv2,fpu_vfpv3,fpu_vfpv3_d16])) then
+          (current_settings.fputype in [fpu_vfpv2,fpu_vfpv3,fpu_vfpv3_d16,fpu_fpv4_s16])) then
        begin
          { keep the fpu values in integer registers for now, the code
            generator will move them to memory or an mmregister when necessary
--- a/compiler/arm/narmcnv.pas
+++ b/compiler/arm/narmcnv.pas
@ -32,6 +32,7 @@ interface
       tarmtypeconvnode = class(tcgtypeconvnode)
         protected
           function first_int_to_real: tnode;override;
+           function first_real_to_real: tnode; override;
         { procedure second_int_to_int;override; }
         { procedure second_string_to_string;override; }
         { procedure second_cstring_to_pchar;override; }
@ -58,7 +59,7 @@ implementation

   uses
      verbose,globtype,globals,systems,
-      symconst,symdef,aasmbase,aasmtai,aasmdata,
+      symconst,symdef,aasmbase,aasmtai,aasmdata,symtable,
      defutil,
      cgbase,cgutils,
      pass_1,pass_2,procinfo,
@ -76,7 +77,8 @@ implementation
      var
        fname: string[19];
      begin
-        if cs_fp_emulation in current_settings.moduleswitches then
+        if (cs_fp_emulation in current_settings.moduleswitches) or
+          (current_settings.fputype=fpu_fpv4_s16) then
          result:=inherited first_int_to_real
        else
          begin
@ -117,7 +119,8 @@ implementation
                expectloc:=LOC_FPUREGISTER;
              fpu_vfpv2,
              fpu_vfpv3,
-              fpu_vfpv3_d16:
+              fpu_vfpv3_d16,
+              fpu_fpv4_s16:
                expectloc:=LOC_MMREGISTER;
              else
                internalerror(2009112702);
@ -125,6 +128,48 @@ implementation
          end;
      end;

+    function tarmtypeconvnode.first_real_to_real: tnode;
+      { First-pass handling of float to float conversions.
+
+        For every FPU type other than fpu_fpv4_s16 the inherited
+        implementation decides. For fpu_fpv4_s16:
+          - single to double becomes a call of float32_to_float64 with the
+            operand converted to the system type FLOAT32REC;
+          - double to single becomes a call of float64_to_float32 with the
+            operand converted to the system type FLOAT64;
+          - a conversion between equal types simply returns the operand.
+        Any other source or destination float type raises an internal
+        error. The replacement node is first-passed before it is
+        returned. }
+      begin
+        if current_settings.fputype<>fpu_fpv4_s16 then
+          exit(inherited first_real_to_real);
+
+        case tfloatdef(left.resultdef).floattype of
+          s32real:
+            case tfloatdef(resultdef).floattype of
+              s32real:
+                result:=left;
+              s64real:
+                result:=ctypeconvnode.create_explicit(ccallnode.createintern('float32_to_float64',ccallparanode.create(
+                  ctypeconvnode.create_internal(left,search_system_type('FLOAT32REC').typedef),nil)),resultdef);
+              else
+                internalerror(200610151);
+            end;
+          s64real:
+            case tfloatdef(resultdef).floattype of
+              s64real:
+                result:=left;
+              s32real:
+                result:=ctypeconvnode.create_explicit(ccallnode.createintern('float64_to_float32',ccallparanode.create(
+                  ctypeconvnode.create_internal(left,search_system_type('FLOAT64').typedef),nil)),resultdef);
+              else
+                internalerror(200610152);
+            end;
+          else
+            internalerror(200610153);
+        end;
+
+        { whichever branch ran, the operand now belongs to result }
+        left:=nil;
+        firstpass(result);
+      end;
+

    procedure tarmtypeconvnode.second_int_to_real;
      const
@ -214,6 +259,22 @@ implementation
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(
                signedprec2vfpop[signed,location.size],location.register,left.location.register));
            end;
+          fpu_fpv4_s16:
+            begin
+              location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
+              signed:=left.location.size=OS_S32;
+              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
+              if (left.location.size<>OS_F32) then
+                internalerror(2009112703);
+              if left.location.size<>location.size then
+                location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size)
+              else
+                location.register:=left.location.register;
+              if signed then
+                current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_VCVT,location.register,left.location.register), PF_F32S32))
+              else
+                current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_VCVT,location.register,left.location.register), PF_F32U32));
+            end;
        end;
      end;

--- a/compiler/arm/narminl.pas
+++ b/compiler/arm/narminl.pas
@ -91,7 +91,8 @@ implementation
            end;
          fpu_vfpv2,
          fpu_vfpv3,
-          fpu_vfpv3_d16:
+          fpu_vfpv3_d16,
+          fpu_fpv4_s16:
            begin
              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,true);
              location_copy(location,left.location);
@ -123,6 +124,13 @@ implementation
              fpu_vfpv3,
              fpu_vfpv3_d16:
                expectloc:=LOC_MMREGISTER;
+              fpu_fpv4_s16:
+                begin
+                  if tfloatdef(left.resultdef).floattype=s32real then
+                    expectloc:=LOC_MMREGISTER
+                  else
+                    exit(inherited first_abs_real);
+                end;
              else
                internalerror(2009112401);
            end;
@ -146,6 +154,13 @@ implementation
              fpu_vfpv3,
              fpu_vfpv3_d16:
                expectloc:=LOC_MMREGISTER;
+              fpu_fpv4_s16:
+                begin
+                  if tfloatdef(left.resultdef).floattype=s32real then
+                    expectloc:=LOC_MMREGISTER
+                  else
+                    exit(inherited first_sqr_real);
+                end;
              else
                internalerror(2009112402);
            end;
@ -169,6 +184,13 @@ implementation
              fpu_vfpv3,
              fpu_vfpv3_d16:
                expectloc:=LOC_MMREGISTER;
+              fpu_fpv4_s16:
+                begin
+                  if tfloatdef(left.resultdef).floattype=s32real then
+                    expectloc:=LOC_MMREGISTER
+                  else
+                    exit(inherited first_sqrt_real);
+                end;
              else
                internalerror(2009112403);
            end;
@ -227,6 +249,8 @@ implementation
                op:=A_FABSD;
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,location.register,left.location.register));
            end;
+          fpu_fpv4_s16:
+            current_asmdata.CurrAsmList.Concat(setoppostfix(taicpu.op_reg_reg(A_VABS,location.register,left.location.register), PF_F32));
        else
          internalerror(2009111402);
        end;
@ -254,6 +278,8 @@ implementation
                op:=A_FMULD;
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,location.register,left.location.register,left.location.register));
            end;
+          fpu_fpv4_s16:
+            current_asmdata.CurrAsmList.Concat(setoppostfix(taicpu.op_reg_reg_reg(A_VMUL,location.register,left.location.register,left.location.register), PF_F32));
        else
          internalerror(2009111403);
        end;
@ -281,6 +307,8 @@ implementation
                op:=A_FSQRTD;
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,location.register,left.location.register));
            end;
+          fpu_fpv4_s16:
+            { single precision square root; carries the .f32 postfix like
+              the VABS and VMUL variants in this unit }
+            current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_VSQRT,location.register,left.location.register), PF_F32));
        else
          internalerror(2009111402);
        end;
--- a/compiler/arm/narmmat.pas
+++ b/compiler/arm/narmmat.pas
@ -39,6 +39,7 @@ interface
      end;

      tarmunaryminusnode = class(tcgunaryminusnode)
+        function pass_1: tnode; override;
        procedure second_float;override;
      end;

@ -54,9 +55,10 @@ implementation
      cutils,verbose,globals,constexp,
      aasmbase,aasmcpu,aasmtai,aasmdata,
      defutil,
+      symtype,symconst,symtable,
      cgbase,cgobj,hlcgobj,cgutils,
      pass_2,procinfo,
-      ncon,
+      ncon,ncnv,ncal,
      cpubase,cpuinfo,
      ncgutil,cgcpu,
      nadd,pass_1,symdef;
@ -326,6 +328,46 @@ implementation
                               TARMUNARYMINUSNODE
 *****************************************************************************}

+    function tarmunaryminusnode.pass_1: tnode;
+      { First pass of unary minus.
+
+        The inherited implementation is used unless the FPU type is
+        fpu_fpv4_s16 and the result is a float other than single
+        precision. In that remaining case a double (s64real) operand is
+        replaced by a call of float64_sub taking a real constant 0 and
+        the operand, both converted to the system type FLOAT64, with the
+        call result converted back to resultdef. Any other float type
+        raises internal error 2005082801.
+
+        NOTE(review): presumably this evaluates 0 - operand; confirm the
+        parameter order and whether the sign of a zero operand matters
+        to callers. }
+      var
+        procname: string[31];
+        fdef : tdef;
+      begin
+        { resultdef is only typecast to tfloatdef once it is known to be
+          a floatdef; everything that is not a float goes to the
+          inherited code }
+        if (current_settings.fputype<>fpu_fpv4_s16) or
+          (resultdef.typ<>floatdef) or
+          (tfloatdef(resultdef).floattype=s32real) then
+          exit(inherited pass_1);
+
+        result:=nil;
+        firstpass(left);
+        if codegenerror then
+          exit;
+
+        if (left.resultdef.typ=floatdef) then
+          begin
+            case tfloatdef(resultdef).floattype of
+              s64real:
+                begin
+                  procname:='float64_sub';
+                  fdef:=search_system_type('FLOAT64').typedef;
+                end;
+              else
+                internalerror(2005082801);
+            end;
+            result:=ctypeconvnode.create_internal(ccallnode.createintern(procname,ccallparanode.create(
+              ctypeconvnode.create_internal(left,fDef),
+              ccallparanode.create(ctypeconvnode.create_internal(crealconstnode.create(0,resultdef),fdef),nil))),resultdef);
+
+            { the operand now belongs to the call node }
+            left:=nil;
+          end
+        else if (left.resultdef.typ=orddef) then
+          expectloc:=LOC_REGISTER;
+      end;
+
    procedure tarmunaryminusnode.second_float;
      var
        op: tasmop;
@ -357,6 +399,15 @@ implementation
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,
                location.register,left.location.register));
            end;
+          fpu_fpv4_s16:
+            begin
+              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,true);
+              location:=left.location;
+              if (left.location.loc=LOC_CMMREGISTER) then
+                location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
+              current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_VNEG,
+                location.register,left.location.register), PF_F32));
+            end
          else
            internalerror(2009112602);
        end;
--- a/compiler/nadd.pas
+++ b/compiler/nadd.pas
@ -2608,7 +2608,11 @@ implementation
        { In non-emulation mode, real opcodes are
          emitted for floating point values.
        }
-        if not (cs_fp_emulation in current_settings.moduleswitches) then
+        if not ((cs_fp_emulation in current_settings.moduleswitches)
+{$ifdef cpufpemu}
+                or (current_settings.fputype=fpu_soft)
+{$endif cpufpemu}
+                ) then
          exit;

        if not(target_info.system in systems_wince) then
@ -2768,12 +2772,9 @@ implementation
         if nodetype=slashn then
           begin
 {$ifdef cpufpemu}
-             if (current_settings.fputype=fpu_soft) or (cs_fp_emulation in current_settings.moduleswitches) then
-               begin
-                 result:=first_addfloat;
-                 if assigned(result) then
-                   exit;
-               end;
+             result:=first_addfloat;
+             if assigned(result) then
+               exit;
 {$endif cpufpemu}
             expectloc:=LOC_FPUREGISTER;
           end
@ -2984,12 +2985,9 @@ implementation
         else if (rd.typ=floatdef) or (ld.typ=floatdef) then
            begin
 {$ifdef cpufpemu}
-             if (current_settings.fputype=fpu_soft) or (cs_fp_emulation in current_settings.moduleswitches) then
-               begin
-                 result:=first_addfloat;
-                 if assigned(result) then
-                   exit;
-               end;
+             result:=first_addfloat;
+             if assigned(result) then
+               exit;
 {$endif cpufpemu}
              if nodetype in [addn,subn,muln,andn,orn,xorn] then
                expectloc:=LOC_FPUREGISTER
--- a/compiler/rgbase.pas
+++ b/compiler/rgbase.pas
@ -29,7 +29,7 @@ interface
      cpuBase,cgBase;

    type
-      TRegNameTable = array[tregisterindex] of string[7];
+      TRegNameTable = array[tregisterindex] of string[10];
      TRegisterIndexTable = array[tregisterindex] of tregisterindex;

    function findreg_by_number_table(r:Tregister;const regnumber_index:TRegisterIndexTable):tregisterindex;
--- a/rtl/arm/thumb2.inc
+++ b/rtl/arm/thumb2.inc
@ -33,10 +33,19 @@ Procedure SysInitFPU;{$ifdef SYSTEMINLINE}inline;{$endif}
 begin
  { Enable FPU exceptions, but disable INEXACT, UNDERFLOW, DENORMAL }
+  { With FPUFPV4_S16 defined the code instead sets bits 20..23 of the
+    word at address 0xE000ED88 (on Cortex-M4 the coprocessor access
+    control register; these bits grant full access to CP10/CP11, which
+    switches the FPU on). The asm block contains no explicit return:
+    execution falls through to the exit code the compiler generates for
+    this procedure.
+    NOTE(review): ARM's FPU enable sequence follows the store with
+    DSB/ISB barriers - confirm whether they are needed here. }
   asm
+    {$IFDEF FPUFPV4_S16}
+    movw r0, #(0xed88)
+    movt r0, #(0xe000)
+    ldr r1, [r0]
+    orr r1, r1, #(0xF << 20)
+    str r1, [r0]
+    {$ELSE FPUFPV4_S16}
     rfs r0
     and r0,r0,#0xffe0ffff
     orr r0,r0,#0x00070000
     wfs r0
+    {$endif FPUFPV4_S16}
   end;
 end;
 {$endif}