Browse Source

Added initial support for the Cortex-M4F FPv4_S16 FPU

git-svn-id: branches/laksen/arm-embedded@22597 -
Jeppe Johansen 12 years ago
parent
commit
a8f9b0dac4

+ 5 - 1
compiler/arm/agarmgas.pas

@@ -106,6 +106,8 @@ unit agarmgas;
           result:='-mfpu=vfpv3 '+result;
         if (current_settings.fputype = fpu_vfpv3_d16) then
           result:='-mfpu=vfpv3-d16 '+result;
+        if (current_settings.fputype = fpu_fpv4_s16) then
+          result:='-mfpu=fpv4-sp-d16 '+result;
 
         if current_settings.cputype=cpu_armv7m then
           result:='-march=armv7m -mthumb -mthumb-interwork '+result
@@ -292,8 +294,10 @@ unit agarmgas;
 
           if taicpu(hp).ops = 0 then
             s:=#9+gas_op2str[op]+' '+cond2str[taicpu(hp).condition]+oppostfix2str[taicpu(hp).oppostfix]
+          else if (taicpu(hp).opcode>=A_VABS) and (taicpu(hp).opcode<=A_VSUB) then
+            s:=#9+gas_op2str[op]+cond2str[taicpu(hp).condition]+oppostfix2str[taicpu(hp).oppostfix]
           else
-            s:=#9+gas_op2str[op]+oppostfix2str[taicpu(hp).oppostfix]+postfix+cond2str[taicpu(hp).condition]; // Conditional infixes are deprecated in unified syntax
+            s:=#9+gas_op2str[op]+oppostfix2str[taicpu(hp).oppostfix]+cond2str[taicpu(hp).condition]+postfix; // Conditional infixes are deprecated in unified syntax
         end
       else
         s:=#9+gas_op2str[op]+cond2str[taicpu(hp).condition]+oppostfix2str[taicpu(hp).oppostfix];

+ 137 - 3
compiler/arm/cgcpu.pas

@@ -161,6 +161,12 @@ unit cgcpu;
         procedure g_proc_exit(list : TAsmList;parasize : longint;nostackframe:boolean); override;
 
         function handle_load_store(list:TAsmList;op: tasmop;oppostfix : toppostfix;reg:tregister;ref: treference):treference; override;
+
+        procedure a_loadmm_reg_reg(list: TAsmList; fromsize, tosize : tcgsize;reg1, reg2: tregister;shuffle : pmmshuffle); override;
+        procedure a_loadmm_ref_reg(list: TAsmList; fromsize, tosize : tcgsize;const ref: treference; reg: tregister;shuffle : pmmshuffle); override;
+        procedure a_loadmm_reg_ref(list: TAsmList; fromsize, tosize : tcgsize;reg: tregister; const ref: treference;shuffle : pmmshuffle); override;
+        procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize : tcgsize;intreg, mmreg: tregister; shuffle: pmmshuffle); override;
+        procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize : tcgsize;mmreg, intreg: tregister; shuffle : pmmshuffle); override;
       end;
 
       tthumb2cg64farm = class(tcg64farm)
@@ -3120,10 +3126,17 @@ unit cgcpu;
           rg[R_INTREGISTER]:=trgintcputhumb2.create(R_INTREGISTER,R_SUBWHOLE,
               [RS_R0,RS_R1,RS_R2,RS_R3,RS_R4,RS_R5,RS_R6,RS_R7,RS_R8,
                RS_R10,RS_R12,RS_R14],first_int_imreg,[]);
-        rg[R_FPUREGISTER]:=trgcputhumb2.create(R_FPUREGISTER,R_SUBNONE,
+        rg[R_FPUREGISTER]:=trgcpu.create(R_FPUREGISTER,R_SUBNONE,
             [RS_F0,RS_F1,RS_F2,RS_F3,RS_F4,RS_F5,RS_F6,RS_F7],first_fpu_imreg,[]);
-        rg[R_MMREGISTER]:=trgcputhumb2.create(R_MMREGISTER,R_SUBNONE,
-            [RS_S0,RS_S1,RS_R2,RS_R3,RS_R4,RS_S31],first_mm_imreg,[]);
+
+        if current_settings.fputype=fpu_fpv4_s16 then
+          rg[R_MMREGISTER]:=trgcpu.create(R_MMREGISTER,R_SUBFD,
+              [RS_D0,RS_D1,RS_D2,RS_D3,RS_D4,RS_D5,RS_D6,RS_D7,
+               RS_D8,RS_D9,RS_D10,RS_D11,RS_D12,RS_D13,RS_D14,RS_D15
+              ],first_mm_imreg,[])
+        else
+          rg[R_MMREGISTER]:=trgcpu.create(R_MMREGISTER,R_SUBNONE,
+              [RS_S0,RS_S1,RS_R2,RS_R3,RS_R4,RS_S31],first_mm_imreg,[]);
       end;
 
 
@@ -3959,6 +3972,127 @@ unit cgcpu;
         Result := ref;
       end;
 
+     procedure Tthumb2cgarm.a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister; shuffle: pmmshuffle);
+      var
+        instr: taicpu;
+      begin
+        if (fromsize=OS_F32) and
+          (tosize=OS_F32) then
+          begin
+            instr:=setoppostfix(taicpu.op_reg_reg(A_VMOV,reg2,reg1), PF_F32);
+            list.Concat(instr);
+            add_move_instruction(instr);
+          end
+        else if (fromsize=OS_F64) and
+          (tosize=OS_F64) then
+          begin
+            //list.Concat(setoppostfix(taicpu.op_reg_reg(A_VMOV,tregister(longint(reg2)+1),tregister(longint(reg1)+1)), PF_F32));
+            //list.Concat(setoppostfix(taicpu.op_reg_reg(A_VMOV,reg2,reg1), PF_F32));
+          end
+        else if (fromsize=OS_F32) and
+          (tosize=OS_F64) then
+          //list.Concat(setoppostfix(taicpu.op_reg_reg(A_VCVT,reg2,reg1), PF_F32))
+          begin
+            //list.concat(nil);
+          end;
+      end;
+
+     procedure Tthumb2cgarm.a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister; shuffle: pmmshuffle);
+      var
+        href: treference;
+        tmpreg: TRegister;
+        so: tshifterop;
+      begin
+        href:=ref;
+
+        if (href.base<>NR_NO) and
+          (href.index<>NR_NO) then
+          begin
+            tmpreg:=getintregister(list,OS_INT);
+            if href.shiftmode<>SM_None then
+              begin
+                so.rs:=href.index;
+                so.shiftimm:=href.shiftimm;
+                so.shiftmode:=href.shiftmode;
+                list.concat(taicpu.op_reg_reg_shifterop(A_ADD,tmpreg,href.base,so));
+              end
+            else
+              a_op_reg_reg_reg(list,OP_ADD,OS_INT,href.index,href.base,tmpreg);
+
+            reference_reset_base(href,tmpreg,href.offset,0);
+          end;
+
+        if assigned(href.symbol) then
+          begin
+            tmpreg:=getintregister(list,OS_INT);
+            a_loadaddr_ref_reg(list,href,tmpreg);
+
+            reference_reset_base(href,tmpreg,0,0);
+          end;
+
+        if fromsize=OS_F32 then
+          list.Concat(setoppostfix(taicpu.op_reg_ref(A_VLDR,reg,href), PF_F32))
+        else
+          list.Concat(setoppostfix(taicpu.op_reg_ref(A_VLDR,reg,href), PF_F64));
+      end;
+
+     procedure Tthumb2cgarm.a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference; shuffle: pmmshuffle);
+      var
+        href: treference;
+        so: tshifterop;
+        tmpreg: TRegister;
+      begin
+        href:=ref;
+
+        if (href.base<>NR_NO) and
+          (href.index<>NR_NO) then
+          begin
+            tmpreg:=getintregister(list,OS_INT);
+            if href.shiftmode<>SM_None then
+              begin
+                so.rs:=href.index;
+                so.shiftimm:=href.shiftimm;
+                so.shiftmode:=href.shiftmode;
+                list.concat(taicpu.op_reg_reg_shifterop(A_ADD,tmpreg,href.base,so));
+              end
+            else
+              a_op_reg_reg_reg(list,OP_ADD,OS_INT,href.index,href.base,tmpreg);
+
+            reference_reset_base(href,tmpreg,href.offset,0);
+          end;
+
+        if assigned(href.symbol) then
+          begin
+            tmpreg:=getintregister(list,OS_INT);
+            a_loadaddr_ref_reg(list,href,tmpreg);
+
+            reference_reset_base(href,tmpreg,0,0);
+          end;
+
+        if fromsize=OS_F32 then
+          list.Concat(setoppostfix(taicpu.op_reg_ref(A_VSTR,reg,href), PF_32))
+        else
+          list.Concat(setoppostfix(taicpu.op_reg_ref(A_VSTR,reg,href), PF_64));
+      end;
+
+     procedure Tthumb2cgarm.a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle);
+      begin
+        if //(shuffle=nil) and
+          (tosize=OS_F32) then
+          list.Concat(taicpu.op_reg_reg(A_VMOV,mmreg,intreg))
+        else
+          internalerror(2012100813);
+      end;
+
+     procedure Tthumb2cgarm.a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle);
+      begin
+        if //(shuffle=nil) and
+          (fromsize=OS_F32) then
+          list.Concat(taicpu.op_reg_reg(A_VMOV,intreg,mmreg))
+        else
+          internalerror(2012100814);
+      end;
+
 
     procedure tthumb2cg64farm.a_op64_reg_reg(list : TAsmList;op:TOpCG;size : tcgsize;regsrc,regdst : tregister64);
       var tmpreg: tregister;

+ 11 - 4
compiler/arm/cpubase.pas

@@ -139,7 +139,11 @@ unit cpubase;
         { multiple load/store vfp address modes }
         PF_IAD,PF_DBD,PF_FDD,PF_EAD,
         PF_IAS,PF_DBS,PF_FDS,PF_EAS,
-        PF_IAX,PF_DBX,PF_FDX,PF_EAX
+        PF_IAX,PF_DBX,PF_FDX,PF_EAX,
+        { FPv4 postfixes }
+        PF_32,PF_64,PF_F32,PF_F64,
+        PF_F32S32,PF_F32U32,
+        PF_S32F32,PF_U32F32
       );
 
       TOpPostfixes = set of TOpPostfix;
@@ -152,14 +156,17 @@ unit cpubase;
         PF_None,PF_None,PF_None,PF_None,PF_None,PF_None,PF_None,PF_None,PF_None,PF_None,
         PF_S,PF_D,PF_E,PF_None,PF_None);
 
-      oppostfix2str : array[TOpPostfix] of string[3] = ('',
+      oppostfix2str : array[TOpPostfix] of string[8] = ('',
         's',
         'd','e','p','ep',
         'b','sb','bt','h','sh','t',
         'ia','ib','da','db','fd','fa','ed','ea',
         'iad','dbd','fdd','ead',
         'ias','dbs','fds','eas',
-        'iax','dbx','fdx','eax');
+        'iax','dbx','fdx','eax',
+        '.32','.64','.f32','.f64',
+        '.f32.s32','.f32.u32',
+        '.s32.f32','.u32.f32');
 
       roundingmode2str : array[TRoundingMode] of string[1] = ('',
         'p','m','z');
@@ -371,7 +378,7 @@ unit cpubase;
 
 
     const
-      std_regname_table : array[tregisterindex] of string[7] = (
+      std_regname_table : array[tregisterindex] of string[10] = (
         {$i rarmstd.inc}
       );
 

+ 5 - 3
compiler/arm/cpuinfo.pas

@@ -65,7 +65,8 @@ Type
       fpu_fpa11,
       fpu_vfpv2,
       fpu_vfpv3,
-      fpu_vfpv3_d16
+      fpu_vfpv3_d16,
+      fpu_fpv4_s16
      );
 
    tcontrollertype =
@@ -227,7 +228,8 @@ Const
      'FPA11',
      'VFPV2',
      'VFPV3',
-     'VFPV3_D16'
+     'VFPV3_D16',
+     'FPV4_S16'
    );
 
 
@@ -1004,7 +1006,7 @@ Const
         )
     );
 
-   vfp_scalar = [fpu_vfpv2,fpu_vfpv3,fpu_vfpv3_d16];
+   vfp_scalar = [fpu_vfpv2,fpu_vfpv3,fpu_vfpv3_d16,fpu_fpv4_s16];
 
    { Supported optimizations, only used for information }
    supported_optimizerswitches = genericlevel1optimizerswitches+

+ 2 - 2
compiler/arm/cpupara.pas

@@ -124,7 +124,7 @@ unit cpupara;
                 getparaloc:=LOC_MMREGISTER
               else if (calloption in [pocall_cdecl,pocall_cppdecl,pocall_softfloat]) or
                  (cs_fp_emulation in current_settings.moduleswitches) or
-                 (current_settings.fputype in [fpu_vfpv2,fpu_vfpv3,fpu_vfpv3_d16]) then
+                 (current_settings.fputype in [fpu_vfpv2,fpu_vfpv3,fpu_vfpv3_d16,fpu_fpv4_s16]) then
                 { the ARM eabi also allows passing VFP values via VFP registers,
                   but Mac OS X doesn't seem to do that and linux only does it if
                   built with the "-mfloat-abi=hard" option }
@@ -608,7 +608,7 @@ unit cpupara;
               end
             else if (p.proccalloption in [pocall_softfloat]) or
                (cs_fp_emulation in current_settings.moduleswitches) or
-               (current_settings.fputype in [fpu_vfpv2,fpu_vfpv3,fpu_vfpv3_d16]) then
+               (current_settings.fputype in [fpu_vfpv2,fpu_vfpv3,fpu_vfpv3_d16,fpu_fpv4_s16]) then
               begin
                 case retcgsize of
                   OS_64,

+ 8 - 0
compiler/arm/cpupi.pas

@@ -118,6 +118,14 @@ unit cpupi;
                 if r in regs then
                   inc(floatsavesize,8);
             end;
+          fpu_fpv4_s16:
+            begin
+              floatsavesize:=0;
+              regs:=cg.rg[R_MMREGISTER].used_in_proc-paramanager.get_volatile_registers_mm(pocall_stdcall);
+              for r:=RS_D0 to RS_D15 do
+                if r in regs then
+                  inc(floatsavesize,8);
+            end;
         end;
         floatsavesize:=align(floatsavesize,max(current_settings.alignment.localalignmin,4));
         result:=Align(tg.direction*tg.lasttemp,max(current_settings.alignment.localalignmin,4))+maxpushedparasize+aint(floatsavesize);

+ 1 - 1
compiler/arm/itcpugas.pas

@@ -46,7 +46,7 @@ implementation
       cutils,verbose;
 
     const
-      gas_regname_table : array[tregisterindex] of string[7] = (
+      gas_regname_table : array[tregisterindex] of string[10] = (
         {$i rarmstd.inc}
       );
 

+ 125 - 2
compiler/arm/narmadd.pas

@@ -35,6 +35,7 @@ interface
        public
           function pass_1 : tnode;override;
        protected
+          function first_addfloat: tnode; override;
           procedure second_addfloat;override;
           procedure second_cmpfloat;override;
           procedure second_cmpordinal;override;
@@ -48,12 +49,12 @@ interface
       globtype,systems,
       cutils,verbose,globals,
       constexp,
-      symconst,symdef,paramgr,
+      symconst,symdef,paramgr,symtable,symtype,
       aasmbase,aasmtai,aasmdata,aasmcpu,defutil,htypechk,
       cgbase,cgutils,cgcpu,
       cpuinfo,pass_1,pass_2,regvars,procinfo,
       cpupara,
-      ncon,nset,nadd,
+      ncon,nset,nadd,ncnv,ncal,nmat,
       ncgutil,tgobj,rgobj,rgcpu,cgobj,cg64f32,
       hlcgobj
       ;
@@ -212,6 +213,36 @@ interface
               current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,
                  location.register,left.location.register,right.location.register));
             end;
+          fpu_fpv4_s16:
+            begin
+              { force mmreg as location, left right doesn't matter
+                as both will be in a fpureg }
+              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,true);
+              location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,true);
+
+              location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
+              if left.location.loc<>LOC_CMMREGISTER then
+                location.register:=left.location.register
+              else if right.location.loc<>LOC_CMMREGISTER then
+                location.register:=right.location.register
+              else
+                location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
+
+              case nodetype of
+                addn :
+                  op:=A_VADD;
+                muln :
+                  op:=A_VMUL;
+                subn :
+                  op:=A_VSUB;
+                slashn :
+                  op:=A_VDIV;
+                else
+                  internalerror(2009111401);
+              end;
+
+              current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg_reg(op, location.register,left.location.register,right.location.register), PF_F32));
+            end;
           fpu_soft:
             { this case should be handled already by pass1 }
             internalerror(200308252);
@@ -273,6 +304,21 @@ interface
               cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
               current_asmdata.CurrAsmList.concat(taicpu.op_none(A_FMSTAT));
             end;
+          fpu_fpv4_s16:
+            begin
+              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,true);
+              location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,true);
+
+              if nodetype in [equaln,unequaln] then
+                op:=A_VCMP
+              else
+                op:=A_VCMPE;
+
+              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,
+                left.location.register,right.location.register));
+              cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
+              current_asmdata.CurrAsmList.Concat(taicpu.op_reg_reg(A_VMRS, NR_APSR_nzcv, NR_FPSCR));
+            end;
           fpu_soft:
             { this case should be handled already by pass1 }
             internalerror(2009112404);
@@ -464,6 +510,83 @@ interface
           end;
       end;
 
+    function tarmaddnode.first_addfloat: tnode;
+      var
+        procname: string[31];
+        { do we need to reverse the result ? }
+        notnode : boolean;
+        fdef : tdef;
+      begin
+        result := nil;
+        notnode := false;
+
+        if current_settings.fputype = fpu_fpv4_s16 then
+          begin
+            case tfloatdef(left.resultdef).floattype of
+              s32real:
+                begin
+                  result:=nil;
+                  notnode:=false;
+                end;
+              s64real:
+                begin
+                  fdef:=search_system_type('FLOAT64').typedef;
+                  procname:='float64';
+
+                  case nodetype of
+                    addn:
+                      procname:=procname+'_add';
+                    muln:
+                      procname:=procname+'_mul';
+                    subn:
+                      procname:=procname+'_sub';
+                    slashn:
+                      procname:=procname+'_div';
+                    ltn:
+                      procname:=procname+'_lt';
+                    lten:
+                      procname:=procname+'_le';
+                    gtn:
+                      begin
+                        procname:=procname+'_le';
+                        notnode:=true;
+                      end;
+                    gten:
+                      begin
+                        procname:=procname+'_lt';
+                        notnode:=true;
+                      end;
+                    equaln:
+                      procname:=procname+'_eq';
+                    unequaln:
+                      begin
+                        procname:=procname+'_eq';
+                        notnode:=true;
+                      end;
+                    else
+                      CGMessage3(type_e_operator_not_supported_for_types,node2opstr(nodetype),left.resultdef.typename,right.resultdef.typename);
+                  end;
+
+                  if nodetype in [ltn,lten,gtn,gten,equaln,unequaln] then
+                    resultdef:=pasbool8type;
+                  result:=ctypeconvnode.create_internal(ccallnode.createintern(procname,ccallparanode.create(
+                      ctypeconvnode.create_internal(right,fdef),
+                      ccallparanode.create(
+                        ctypeconvnode.create_internal(left,fdef),nil))),resultdef);
+
+                  left:=nil;
+                  right:=nil;
+
+                  { do we need to reverse the result }
+                  if notnode then
+                    result:=cnotnode.create(result);
+                end;
+            end;
+          end
+        else
+          result:=inherited first_addfloat;
+      end;
+
 
     procedure tarmaddnode.second_cmpordinal;
       var

+ 1 - 1
compiler/arm/narmcal.pas

@@ -49,7 +49,7 @@ implementation
       if (realresdef.typ=floatdef) and 
          (target_info.abi <> abi_eabihf) and
          ((cs_fp_emulation in current_settings.moduleswitches) or
-          (current_settings.fputype in [fpu_vfpv2,fpu_vfpv3,fpu_vfpv3_d16])) then
+          (current_settings.fputype in [fpu_vfpv2,fpu_vfpv3,fpu_vfpv3_d16,fpu_fpv4_s16])) then
         begin
           { keep the fpu values in integer registers for now, the code
             generator will move them to memory or an mmregister when necessary

+ 64 - 3
compiler/arm/narmcnv.pas

@@ -32,6 +32,7 @@ interface
        tarmtypeconvnode = class(tcgtypeconvnode)
          protected
            function first_int_to_real: tnode;override;
+           function first_real_to_real: tnode; override;
          { procedure second_int_to_int;override; }
          { procedure second_string_to_string;override; }
          { procedure second_cstring_to_pchar;override; }
@@ -58,7 +59,7 @@ implementation
 
    uses
       verbose,globtype,globals,systems,
-      symconst,symdef,aasmbase,aasmtai,aasmdata,
+      symconst,symdef,aasmbase,aasmtai,aasmdata,symtable,
       defutil,
       cgbase,cgutils,
       pass_1,pass_2,procinfo,
@@ -76,7 +77,8 @@ implementation
       var
         fname: string[19];
       begin
-        if cs_fp_emulation in current_settings.moduleswitches then
+        if (cs_fp_emulation in current_settings.moduleswitches) or
+          (current_settings.fputype=fpu_fpv4_s16) then
           result:=inherited first_int_to_real
         else
           begin
@@ -117,7 +119,8 @@ implementation
                 expectloc:=LOC_FPUREGISTER;
               fpu_vfpv2,
               fpu_vfpv3,
-              fpu_vfpv3_d16:
+              fpu_vfpv3_d16,
+              fpu_fpv4_s16:
                 expectloc:=LOC_MMREGISTER;
               else
                 internalerror(2009112702);
@@ -125,6 +128,48 @@ implementation
           end;
       end;
 
+    function tarmtypeconvnode.first_real_to_real: tnode;
+      begin
+        if (current_settings.fputype=fpu_fpv4_s16) then
+          begin
+            case tfloatdef(left.resultdef).floattype of
+              s32real:
+                case tfloatdef(resultdef).floattype of
+                  s64real:
+                    result:=ctypeconvnode.create_explicit(ccallnode.createintern('float32_to_float64',ccallparanode.create(
+                      ctypeconvnode.create_internal(left,search_system_type('FLOAT32REC').typedef),nil)),resultdef);
+                  s32real:
+                    begin
+                      result:=left;
+                      left:=nil;
+                    end;
+                  else
+                    internalerror(200610151);
+                end;
+              s64real:
+                case tfloatdef(resultdef).floattype of
+                  s32real:
+                    result:=ctypeconvnode.create_explicit(ccallnode.createintern('float64_to_float32',ccallparanode.create(
+                      ctypeconvnode.create_internal(left,search_system_type('FLOAT64').typedef),nil)),resultdef);
+                  s64real:
+                    begin
+                      result:=left;
+                      left:=nil;
+                    end;
+                  else
+                    internalerror(200610152);
+                end;
+              else
+                internalerror(200610153);
+            end;
+            left:=nil;
+            firstpass(result);
+            exit;
+          end
+        else
+          Result := inherited first_real_to_real;
+      end;
+
 
     procedure tarmtypeconvnode.second_int_to_real;
       const
@@ -214,6 +259,22 @@ implementation
               current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(
                 signedprec2vfpop[signed,location.size],location.register,left.location.register));
             end;
+          fpu_fpv4_s16:
+            begin
+              location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
+              signed:=left.location.size=OS_S32;
+              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
+              if (left.location.size<>OS_F32) then
+                internalerror(2009112703);
+              if left.location.size<>location.size then
+                location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size)
+              else
+                location.register:=left.location.register;
+              if signed then
+                current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_VCVT,location.register,left.location.register), PF_F32S32))
+              else
+                current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_VCVT,location.register,left.location.register), PF_F32U32));
+            end;
         end;
       end;
 

+ 29 - 1
compiler/arm/narminl.pas

@@ -91,7 +91,8 @@ implementation
             end;
           fpu_vfpv2,
           fpu_vfpv3,
-          fpu_vfpv3_d16:
+          fpu_vfpv3_d16,
+          fpu_fpv4_s16:
             begin
               location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,true);
               location_copy(location,left.location);
@@ -123,6 +124,13 @@ implementation
               fpu_vfpv3,
               fpu_vfpv3_d16:
                 expectloc:=LOC_MMREGISTER;
+              fpu_fpv4_s16:
+                begin
+                  if tfloatdef(left.resultdef).floattype=s32real then
+                    expectloc:=LOC_MMREGISTER
+                  else
+                    exit(inherited first_abs_real);
+                end;
               else
                 internalerror(2009112401);
             end;
@@ -146,6 +154,13 @@ implementation
               fpu_vfpv3,
               fpu_vfpv3_d16:
                 expectloc:=LOC_MMREGISTER;
+              fpu_fpv4_s16:
+                begin
+                  if tfloatdef(left.resultdef).floattype=s32real then
+                    expectloc:=LOC_MMREGISTER
+                  else
+                    exit(inherited first_sqr_real);
+                end;
               else
                 internalerror(2009112402);
             end;
@@ -169,6 +184,13 @@ implementation
               fpu_vfpv3,
               fpu_vfpv3_d16:
                 expectloc:=LOC_MMREGISTER;
+              fpu_fpv4_s16:
+                begin
+                  if tfloatdef(left.resultdef).floattype=s32real then
+                    expectloc:=LOC_MMREGISTER
+                  else
+                    exit(inherited first_sqrt_real);
+                end;
               else
                 internalerror(2009112403);
             end;
@@ -227,6 +249,8 @@ implementation
                 op:=A_FABSD;
               current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,location.register,left.location.register));
             end;
+          fpu_fpv4_s16:
+            current_asmdata.CurrAsmList.Concat(setoppostfix(taicpu.op_reg_reg(A_VABS,location.register,left.location.register), PF_F32));
         else
           internalerror(2009111402);
         end;
@@ -254,6 +278,8 @@ implementation
                 op:=A_FMULD;
               current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,location.register,left.location.register,left.location.register));
             end;
+          fpu_fpv4_s16:
+            current_asmdata.CurrAsmList.Concat(setoppostfix(taicpu.op_reg_reg_reg(A_VMUL,location.register,left.location.register,left.location.register), PF_F32));
         else
           internalerror(2009111403);
         end;
@@ -281,6 +307,8 @@ implementation
                 op:=A_FSQRTD;
               current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,location.register,left.location.register));
             end;
+          fpu_fpv4_s16:
+            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VSQRT,location.register,left.location.register));
         else
           internalerror(2009111402);
         end;

+ 52 - 1
compiler/arm/narmmat.pas

@@ -39,6 +39,7 @@ interface
       end;
 
       tarmunaryminusnode = class(tcgunaryminusnode)
+        function pass_1: tnode; override;
         procedure second_float;override;
       end;
 
@@ -54,9 +55,10 @@ implementation
       cutils,verbose,globals,constexp,
       aasmbase,aasmcpu,aasmtai,aasmdata,
       defutil,
+      symtype,symconst,symtable,
       cgbase,cgobj,hlcgobj,cgutils,
       pass_2,procinfo,
-      ncon,
+      ncon,ncnv,ncal,
       cpubase,cpuinfo,
       ncgutil,cgcpu,
       nadd,pass_1,symdef;
@@ -326,6 +328,46 @@ implementation
                                TARMUNARYMINUSNODE
 *****************************************************************************}
 
+    function tarmunaryminusnode.pass_1: tnode;
+      var
+        procname: string[31];
+        fdef : tdef;
+      begin
+        if (current_settings.fputype<>fpu_fpv4_s16) or
+          (tfloatdef(resultdef).floattype=s32real) then
+          exit(inherited pass_1);
+
+        result:=nil;
+        firstpass(left);
+        if codegenerror then
+          exit;
+
+        if (left.resultdef.typ=floatdef) then
+          begin
+            case tfloatdef(resultdef).floattype of
+              s64real:
+                begin
+                  procname:='float64_sub';
+                  fdef:=search_system_type('FLOAT64').typedef;
+                end;
+              else
+                internalerror(2005082801);
+            end;
+            result:=ctypeconvnode.create_internal(ccallnode.createintern(procname,ccallparanode.create(
+              ctypeconvnode.create_internal(left,fDef),
+              ccallparanode.create(ctypeconvnode.create_internal(crealconstnode.create(0,resultdef),fdef),nil))),resultdef);
+
+            left:=nil;
+          end
+        else
+          begin
+            if (left.resultdef.typ=floatdef) then
+              expectloc:=LOC_FPUREGISTER
+             else if (left.resultdef.typ=orddef) then
+               expectloc:=LOC_REGISTER;
+          end;
+      end;
+
     procedure tarmunaryminusnode.second_float;
       var
         op: tasmop;
@@ -357,6 +399,15 @@ implementation
               current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,
                 location.register,left.location.register));
             end;
+          fpu_fpv4_s16:
+            begin
+              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,true);
+              location:=left.location;
+              if (left.location.loc=LOC_CMMREGISTER) then
+                location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
+              current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_VNEG,
+                location.register,left.location.register), PF_F32));
+            end
           else
             internalerror(2009112602);
         end;

+ 11 - 13
compiler/nadd.pas

@@ -2608,7 +2608,11 @@ implementation
         { In non-emulation mode, real opcodes are
           emitted for floating point values.
         }
-        if not (cs_fp_emulation in current_settings.moduleswitches) then
+        if not ((cs_fp_emulation in current_settings.moduleswitches)
+{$ifdef cpufpemu}
+                or (current_settings.fputype=fpu_soft)
+{$endif cpufpemu}
+                ) then
           exit;
 
         if not(target_info.system in systems_wince) then
@@ -2768,12 +2772,9 @@ implementation
          if nodetype=slashn then
            begin
 {$ifdef cpufpemu}
-             if (current_settings.fputype=fpu_soft) or (cs_fp_emulation in current_settings.moduleswitches) then
-               begin
-                 result:=first_addfloat;
-                 if assigned(result) then
-                   exit;
-               end;
+             result:=first_addfloat;
+             if assigned(result) then
+               exit;
 {$endif cpufpemu}
              expectloc:=LOC_FPUREGISTER;
            end
@@ -2984,12 +2985,9 @@ implementation
          else if (rd.typ=floatdef) or (ld.typ=floatdef) then
             begin
 {$ifdef cpufpemu}
-             if (current_settings.fputype=fpu_soft) or (cs_fp_emulation in current_settings.moduleswitches) then
-               begin
-                 result:=first_addfloat;
-                 if assigned(result) then
-                   exit;
-               end;
+             result:=first_addfloat;
+             if assigned(result) then
+               exit;
 {$endif cpufpemu}
               if nodetype in [addn,subn,muln,andn,orn,xorn] then
                 expectloc:=LOC_FPUREGISTER

+ 1 - 1
compiler/rgbase.pas

@@ -29,7 +29,7 @@ interface
       cpuBase,cgBase;
 
     type
-      TRegNameTable = array[tregisterindex] of string[7];
+      TRegNameTable = array[tregisterindex] of string[10];
       TRegisterIndexTable = array[tregisterindex] of tregisterindex;
 
     function findreg_by_number_table(r:Tregister;const regnumber_index:TRegisterIndexTable):tregisterindex;

+ 9 - 0
rtl/arm/thumb2.inc

@@ -33,10 +33,19 @@ Procedure SysInitFPU;{$ifdef SYSTEMINLINE}inline;{$endif}
 begin
   { Enable FPU exceptions, but disable INEXACT, UNDERFLOW, DENORMAL }
   asm
+    {$IFDEF FPUFPV4_S16}
+    movw r0, #(0xed88)
+    movt r0, #(0xe000)
+    ldr r1, [r0]
+    orr r1, r1, #(0xF << 20)
+    str r1, [r0]
+    bx lr
+    {$ELSE FPUFPV4_S16}
     rfs r0
     and r0,r0,#0xffe0ffff
     orr r0,r0,#0x00070000
     wfs r0
+    {$endif FPUFPV4_S16}
   end;
 end;
 {$endif}