Browse Source

o added ARM VFPv2/VFPv3 support:
+ RTL support:
o VFP exceptions are disabled by default on Darwin,
because they cause kernel panics on iPhoneOS 2.2.1 at least
o all denormals are truncated to 0 on Darwin, because disabling
that also causes kernel panics on iPhoneOS 2.2.1 (probably
because otherwise denormals can also cause exceptions)
* set softfloat rounding mode correctly for non-wince/darwin/vfp
targets
+ compiler support: only half the number of single precision
registers is available due to limitations of the register
allocator
+ added a number of comments about why the stackframe on ARM is
set up the way it is by the compiler
+ added regtype and subregtype info to regsets, because they're
also used for VFP registers (+ support in assembler reader)
+ various generic support routines for dealing with floating point
values located in integer registers that have to be transferred to
mm registers (needed for VFP)
* renamed use_sse() to use_vectorfpu() and also use it for
ARM/vfp support
o only superficially tested for Linux (compiler compiled with -Cparmv6
-Cfvfpv2 works on a Cortex-A8, no testsuite run performed -- at least
the fpu exception handler still needs to be implemented), Darwin has
been tested more thoroughly
+ added ARMv6 cpu type and made it default for Darwin/ARM
+ ARMv6+ implementations of atomic operations using ldrex/strex
* don't use r9 on Darwin/ARM, as it's reserved under certain
circumstances (don't know yet which ones)
* changed C-test object files for ARM/Darwin to ARMv6 versions
* check in assembler reader that regsets are not empty, because
instructions with a regset operand have undefined behaviour in that
case
* fixed resultdef of tarmtypeconvnode.first_int_to_real in case of
int64->single type conversion
* fixed constant pool locations in case 64 bit constants are generated,
and/or when vfp instructions with limited reach are present

WARNING: when using VFP on an ARMv6 or later cpu, you *must* compile all
code with -Cparmv6 (or higher), or you will get crashes. The reason is
that storing/restoring multiple VFP registers must happen using
different instructions on pre/post-ARMv6.

git-svn-id: trunk@14317 -

Jonas Maebe 15 years ago
parent
commit
d1538ab023
67 changed files with 2531 additions and 879 deletions
  1. 1 0
      .gitattributes
  2. 1 1
      compiler/aasmtai.pas
  3. 211 79
      compiler/arm/aasmcpu.pas
  4. 5 5
      compiler/arm/agarmgas.pas
  5. 4 6
      compiler/arm/armatt.inc
  6. 0 2
      compiler/arm/armatts.inc
  7. 8 10
      compiler/arm/armins.dat
  8. 4 6
      compiler/arm/armop.inc
  9. 96 75
      compiler/arm/armreg.dat
  10. 467 73
      compiler/arm/cgcpu.pas
  11. 41 5
      compiler/arm/cpubase.pas
  12. 8 2
      compiler/arm/cpuinfo.pas
  13. 19 3
      compiler/arm/cpupara.pas
  14. 38 12
      compiler/arm/cpupi.pas
  15. 103 27
      compiler/arm/narmadd.pas
  16. 82 44
      compiler/arm/narmcnv.pas
  17. 128 20
      compiler/arm/narminl.pas
  18. 31 7
      compiler/arm/narmmat.pas
  19. 28 3
      compiler/arm/raarmgas.pas
  20. 66 49
      compiler/arm/rarmcon.inc
  21. 17 0
      compiler/arm/rarmdwa.inc
  22. 1 1
      compiler/arm/rarmnor.inc
  23. 66 49
      compiler/arm/rarmnum.inc
  24. 41 24
      compiler/arm/rarmrni.inc
  25. 18 1
      compiler/arm/rarmsri.inc
  26. 17 0
      compiler/arm/rarmsta.inc
  27. 18 1
      compiler/arm/rarmstd.inc
  28. 62 45
      compiler/arm/rarmsup.inc
  29. 28 0
      compiler/arm/rgcpu.pas
  30. 4 1
      compiler/assemble.pas
  31. 32 1
      compiler/cg64f32.pas
  32. 94 3
      compiler/cgobj.pas
  33. 7 1
      compiler/msg/errore.msg
  34. 4 2
      compiler/msgidx.inc
  35. 217 213
      compiler/msgtxt.inc
  36. 9 9
      compiler/ncgadd.pas
  37. 1 3
      compiler/ncgbas.pas
  38. 2 7
      compiler/ncgcal.pas
  39. 3 0
      compiler/ncgcnv.pas
  40. 15 7
      compiler/ncgld.pas
  41. 96 24
      compiler/ncgutil.pas
  42. 4 1
      compiler/ncnv.pas
  43. 10 1
      compiler/nld.pas
  44. 38 6
      compiler/options.pas
  45. 2 2
      compiler/rautils.pas
  46. 7 2
      compiler/rgobj.pas
  47. 14 9
      compiler/symdef.pas
  48. 1 4
      compiler/symsym.pas
  49. 1 1
      compiler/systems/t_bsd.pas
  50. 4 1
      compiler/utils/mkarmreg.pp
  51. 2 2
      compiler/x86/nx86add.pas
  52. 3 3
      compiler/x86/nx86cnv.pas
  53. 3 3
      compiler/x86/nx86con.pas
  54. 9 9
      compiler/x86/nx86inl.pas
  55. 1 1
      compiler/x86/nx86mat.pas
  56. 34 1
      compiler/x86_64/cgcpu.pas
  57. 3 3
      compiler/x86_64/nx64cnv.pas
  58. 83 0
      rtl/arm/arm.inc
  59. 1 1
      rtl/arm/math.inc
  60. 194 5
      rtl/arm/mathu.inc
  61. 21 3
      rtl/arm/setjump.inc
  62. 3 0
      rtl/arm/setjumph.inc
  63. BIN
      tests/test/cg/obj/darwin/arm/cpptcl1.o
  64. BIN
      tests/test/cg/obj/darwin/arm/ctest.o
  65. BIN
      tests/test/cg/obj/darwin/arm/tcext3.o
  66. BIN
      tests/test/cg/obj/darwin/arm/tcext4.o
  67. BIN
      tests/test/cg/obj/darwin/arm/tcext5.o

+ 1 - 0
.gitattributes

@@ -8283,6 +8283,7 @@ tests/test/cg/obj/beos/i386/tcext3.o -text
 tests/test/cg/obj/beos/i386/tcext4.o -text
 tests/test/cg/obj/beos/i386/tcext5.o -text
 tests/test/cg/obj/cpptcl1.cpp svneol=native#text/plain
+tests/test/cg/obj/darwin/arm/cpptcl1.o -text
 tests/test/cg/obj/darwin/arm/ctest.o -text
 tests/test/cg/obj/darwin/arm/tcext3.o -text
 tests/test/cg/obj/darwin/arm/tcext4.o -text

+ 1 - 1
compiler/aasmtai.pas

@@ -212,7 +212,7 @@ interface
           { local varsym that will be inserted in pass_generate_code }
           top_local  : (localoper:plocaloper);
       {$ifdef arm}
-          top_regset : (regset:^tcpuregisterset);
+          top_regset : (regset:^tcpuregisterset; regtyp: tregistertype; subreg: tsubregister);
           top_shifterop : (shifterop : pshifterop);
           top_conditioncode: (cc: TAsmCond);
       {$endif arm}

+ 211 - 79
compiler/arm/aasmcpu.pas

@@ -160,7 +160,7 @@ uses
          oppostfix : TOpPostfix;
          roundingmode : troundingmode;
          procedure loadshifterop(opidx:longint;const so:tshifterop);
-         procedure loadregset(opidx:longint;const s:tcpuregisterset);
+         procedure loadregset(opidx:longint; regsetregtype: tregistertype; regsetsubregtype: tsubregister; const s:tcpuregisterset);
          procedure loadconditioncode(opidx:longint;const cond:tasmcond);
          constructor op_none(op : tasmop);
 
@@ -172,7 +172,7 @@ uses
          constructor op_reg_ref(op : tasmop;_op1 : tregister;const _op2 : treference);
          constructor op_reg_const(op:tasmop; _op1: tregister; _op2: aint);
 
-         constructor op_ref_regset(op:tasmop; _op1: treference; _op2: tcpuregisterset);
+         constructor op_ref_regset(op:tasmop; _op1: treference; regtype: tregistertype; subreg: tsubregister; _op2: tcpuregisterset);
 
          constructor op_reg_reg_reg(op : tasmop;_op1,_op2,_op3 : tregister);
          constructor op_reg_reg_const(op : tasmop;_op1,_op2 : tregister; _op3: aint);
@@ -279,7 +279,7 @@ implementation
       end;
 
 
-    procedure taicpu.loadregset(opidx:longint;const s:tcpuregisterset);
+    procedure taicpu.loadregset(opidx:longint; regsetregtype: tregistertype; regsetsubregtype: tsubregister; const s:tcpuregisterset);
       var
         i : byte;
       begin
@@ -287,15 +287,29 @@ implementation
         with oper[opidx]^ do
          begin
            if typ<>top_regset then
-             clearop(opidx);
-           new(regset);
-           regset^:=s;
-           typ:=top_regset;
-           for i:=RS_R0 to RS_R15 do
              begin
-               if assigned(add_reg_instruction_hook) and (i in regset^) then
-                 add_reg_instruction_hook(self,newreg(R_INTREGISTER,i,R_SUBWHOLE));
+               clearop(opidx);
+               new(regset);
              end;
+           regset^:=s;
+           regtyp:=regsetregtype;
+           subreg:=regsetsubregtype;
+           typ:=top_regset;
+           case regsetregtype of
+             R_INTREGISTER:
+               for i:=RS_R0 to RS_R15 do
+                 begin
+                   if assigned(add_reg_instruction_hook) and (i in regset^) then
+                     add_reg_instruction_hook(self,newreg(R_INTREGISTER,i,regsetsubregtype));
+                 end;
+             R_MMREGISTER:
+               { both RS_S0 and RS_D0 range from 0 to 31 }
+               for i:=RS_D0 to RS_D31 do
+                 begin
+                   if assigned(add_reg_instruction_hook) and (i in regset^) then
+                     add_reg_instruction_hook(self,newreg(R_MMREGISTER,i,regsetsubregtype));
+                 end;
+           end;
          end;
       end;
 
@@ -366,12 +380,12 @@ implementation
       end;
 
 
-    constructor taicpu.op_ref_regset(op:tasmop; _op1: treference; _op2: tcpuregisterset);
+    constructor taicpu.op_ref_regset(op:tasmop; _op1: treference; regtype: tregistertype; subreg: tsubregister; _op2: tcpuregisterset);
       begin
          inherited create(op);
          ops:=2;
          loadref(0,_op1);
-         loadregset(1,_op2);
+         loadregset(1,regtype,subreg,_op2);
       end;
 
 
@@ -521,7 +535,8 @@ implementation
       begin
         { allow the register allocator to remove unnecessary moves }
         result:=(((opcode=A_MOV) and (regtype = R_INTREGISTER)) or
-                 ((opcode=A_MVF) and (regtype = R_FPUREGISTER) and (oppostfix in [PF_None,PF_D]))
+                 ((opcode=A_MVF) and (regtype = R_FPUREGISTER) and (oppostfix in [PF_None,PF_D])) or
+                 (((opcode=A_FCPYS) or (opcode=A_FCPYD)) and (regtype = R_MMREGISTER))
                 ) and
                 (condition=C_None) and
                 (ops=2) and
@@ -532,6 +547,8 @@ implementation
 
 
     function spilling_create_load(const ref:treference;r:tregister):Taicpu;
+      var
+        op: tasmop;
       begin
         case getregtype(r) of
           R_INTREGISTER :
@@ -541,6 +558,18 @@ implementation
               and avoid exceptions
             }
             result:=taicpu.op_reg_const_ref(A_LFM,r,1,ref);
+          R_MMREGISTER :
+            begin
+              case getsubreg(r) of
+                R_SUBFD:
+                  op:=A_FLDD;
+                R_SUBFS:
+                  op:=A_FLDS;
+                else
+                  internalerror(2009112905);
+              end;
+              result:=taicpu.op_reg_ref(op,r,ref);
+            end;
           else
             internalerror(200401041);
         end;
@@ -548,6 +577,8 @@ implementation
 
 
     function spilling_create_store(r:tregister; const ref:treference):Taicpu;
+      var
+        op: tasmop;
       begin
         case getregtype(r) of
           R_INTREGISTER :
@@ -557,6 +588,18 @@ implementation
               and avoid exceptions
             }
             result:=taicpu.op_reg_const_ref(A_SFM,r,1,ref);
+          R_MMREGISTER :
+            begin
+              case getsubreg(r) of
+                R_SUBFD:
+                  op:=A_FSTD;
+                R_SUBFS:
+                  op:=A_FSTS;
+                else
+                  internalerror(2009112904);
+              end;
+              result:=taicpu.op_reg_ref(op,r,ref);
+            end;
           else
             internalerror(200401041);
         end;
@@ -578,27 +621,43 @@ implementation
           A_RFS,A_RFC,A_RDF,
           A_RMF,A_RPW,A_RSF,A_SUF,A_ABS,A_ACS,A_ASN,A_ATN,A_COS,
           A_EXP,A_LOG,A_LGN,A_MVF,A_MNF,A_FRD,A_MUF,A_POL,A_RND,A_SIN,A_SQT,A_TAN,
-          A_LFM:
+          A_LFM,
+          A_FLDS,A_FLDD,
+          A_FMRX,A_FMXR,A_FMSTAT,
+          A_FMSR,A_FMRS,A_FMDRR,
+          A_FCPYS,A_FCPYD,A_FCVTSD,A_FCVTDS,
+          A_FABSS,A_FABSD,A_FSQRTS,A_FSQRTD,A_FMULS,A_FMULD,
+          A_FADDS,A_FADDD,A_FSUBS,A_FSUBD,A_FDIVS,A_FDIVD,
+          A_FMACS,A_FMACD,A_FMSCS,A_FMSCD,A_FNMACS,A_FNMACD,
+          A_FNMSCS,A_FNMSCD,A_FNMULS,A_FNMULD,
+          A_FMDHR,A_FMRDH,A_FMDLR,A_FMRDL,
+          A_FNEGS,A_FNEGD,
+          A_FSITOS,A_FSITOD,A_FTOSIS,A_FTOSID,
+          A_FTOUIS,A_FTOUID,A_FUITOS,A_FUITOD:
             if opnr=0 then
               result:=operand_write
             else
               result:=operand_read;
           A_BIC,A_BKPT,A_B,A_BL,A_BLX,A_BX,
           A_CMN,A_CMP,A_TEQ,A_TST,
-          A_CMF,A_CMFE,A_WFS,A_CNF:
+          A_CMF,A_CMFE,A_WFS,A_CNF,
+          A_FCMPS,A_FCMPD,A_FCMPES,A_FCMPED,A_FCMPEZS,A_FCMPEZD,
+          A_FCMPZS,A_FCMPZD:
             result:=operand_read;
           A_SMLAL,A_UMLAL:
             if opnr in [0,1] then
               result:=operand_readwrite
             else
               result:=operand_read;
-           A_SMULL,A_UMULL:
+           A_SMULL,A_UMULL,
+           A_FMRRD:
             if opnr in [0,1] then
               result:=operand_write
             else
               result:=operand_read;
           A_STR,A_STRB,A_STRBT,
-          A_STRH,A_STRT,A_STF,A_SFM:
+          A_STRH,A_STRT,A_STF,A_SFM,
+          A_FSTS,A_FSTD:
             { important is what happens with the involved registers }
             if opnr=0 then
               result := operand_read
@@ -695,11 +754,35 @@ implementation
       End;
 
 
+(*
+    function armconstequal(hp1,hp2: tai): boolean;
+      begin
+        result:=false;
+        if hp1.typ<>hp2.typ then
+          exit;
+        case hp1.typ of
+          tai_const:
+            result:=
+              (tai_const(hp2).sym=tai_const(hp).sym) and
+              (tai_const(hp2).value=tai_const(hp).value) and
+              (tai(hp2.previous).typ=ait_label);
+            tai_const:
+              result:=
+                (tai_const(hp2).sym=tai_const(hp).sym) and
+                (tai_const(hp2).value=tai_const(hp).value) and
+                (tai(hp2.previous).typ=ait_label);
+        end;
+      end;
+*)
+
     procedure insertpcrelativedata(list,listtoinsert : TAsmList);
       var
-        curpos,
+        curinspos,
         penalty,
-        lastpos : longint;
+        lastinspos,
+        { increased for every data element > 4 bytes inserted }
+        extradataoffset,
+        limit: longint;
         curop : longint;
         curtai : tai;
         curdatatai,hp,hp2 : tai;
@@ -709,72 +792,111 @@ implementation
         removeref : boolean;
       begin
         curdata:=TAsmList.create;
-        lastpos:=-1;
-        curpos:=0;
+        lastinspos:=-1;
+        curinspos:=0;
+        extradataoffset:=0;
+        limit:=1016;
         curtai:=tai(list.first);
         doinsert:=false;
         while assigned(curtai) do
           begin
             { instruction? }
-            if curtai.typ=ait_instruction then
-              begin
-                { walk through all operand of the instruction }
-                for curop:=0 to taicpu(curtai).ops-1 do
-                  begin
-                    { reference? }
-                    if (taicpu(curtai).oper[curop]^.typ=top_ref) then
-                      begin
-                        { pc relative symbol? }
-                        curdatatai:=tai(taicpu(curtai).oper[curop]^.ref^.symboldata);
-                        if assigned(curdatatai) and
-                          { move only if we're at the first reference of a label }
-                          (taicpu(curtai).oper[curop]^.ref^.offset=0) then
-                          begin
-                            { check if symbol already used. }
-                            { if yes, reuse the symbol }
-                            hp:=tai(curdatatai.next);
-                            removeref:=false;
-                            if assigned(hp) and (hp.typ=ait_const) then
-                              begin
-                                hp2:=tai(curdata.first);
-                                while assigned(hp2) do
-                                  begin
-                                    if (hp2.typ=ait_const) and (tai_const(hp2).sym=tai_const(hp).sym)
-                                      and (tai_const(hp2).value=tai_const(hp).value) and (tai(hp2.previous).typ=ait_label)
-                                    then
+            case curtai.typ of
+              ait_instruction:
+                begin
+                  { walk through all operand of the instruction }
+                  for curop:=0 to taicpu(curtai).ops-1 do
+                    begin
+                      { reference? }
+                      if (taicpu(curtai).oper[curop]^.typ=top_ref) then
+                        begin
+                          { pc relative symbol? }
+                          curdatatai:=tai(taicpu(curtai).oper[curop]^.ref^.symboldata);
+                          if assigned(curdatatai) and
+                            { move only if we're at the first reference of a label }
+                            (taicpu(curtai).oper[curop]^.ref^.offset=0) then
+                            begin
+                              { check if symbol already used. }
+                              { if yes, reuse the symbol }
+                              hp:=tai(curdatatai.next);
+                              removeref:=false;
+                              if assigned(hp) then
+                                begin
+                                  case hp.typ of
+                                    ait_const:
+                                      begin
+                                        if (tai_const(hp).consttype=aitconst_64bit) then
+                                          inc(extradataoffset);
+                                      end;
+                                    ait_comp_64bit,
+                                    ait_real_64bit:
+                                      begin
+                                        inc(extradataoffset);
+                                      end;
+                                    ait_real_80bit:
                                       begin
-                                        with taicpu(curtai).oper[curop]^.ref^ do
-                                          begin
-                                            symboldata:=hp2.previous;
-                                            symbol:=tai_label(hp2.previous).labsym;
-                                          end;
-                                        removeref:=true;
-                                        break;
+                                        inc(extradataoffset,2);
                                       end;
-                                    hp2:=tai(hp2.next);
                                   end;
-                              end;
-                            { move or remove symbol reference }
-                            repeat
-                              hp:=tai(curdatatai.next);
-                              listtoinsert.remove(curdatatai);
-                              if removeref then
-                                curdatatai.free
-                              else
-                                curdata.concat(curdatatai);
-                              curdatatai:=hp;
-                            until (curdatatai=nil) or (curdatatai.typ=ait_label);
-                            if lastpos=-1 then
-                              lastpos:=curpos;
-                          end;
-                      end;
-                  end;
-                inc(curpos);
-              end
-            else
-              if curtai.typ=ait_const then
-                inc(curpos);
-
+                                  if (hp.typ=ait_const) then
+                                    begin
+                                      hp2:=tai(curdata.first);
+                                      while assigned(hp2) do
+                                        begin
+    {                                      if armconstequal(hp2,hp) then }
+                                          if (hp2.typ=ait_const) and (tai_const(hp2).sym=tai_const(hp).sym)
+                                            and (tai_const(hp2).value=tai_const(hp).value) and (tai(hp2.previous).typ=ait_label)
+                                          then
+                                            begin
+                                              with taicpu(curtai).oper[curop]^.ref^ do
+                                                begin
+                                                  symboldata:=hp2.previous;
+                                                  symbol:=tai_label(hp2.previous).labsym;
+                                                end;
+                                              removeref:=true;
+                                              break;
+                                            end;
+                                          hp2:=tai(hp2.next);
+                                        end;
+                                    end;
+                                end;
+                              { move or remove symbol reference }
+                              repeat
+                                hp:=tai(curdatatai.next);
+                                listtoinsert.remove(curdatatai);
+                                if removeref then
+                                  curdatatai.free
+                                else
+                                  curdata.concat(curdatatai);
+                                curdatatai:=hp;
+                              until (curdatatai=nil) or (curdatatai.typ=ait_label);
+                              if lastinspos=-1 then
+                                lastinspos:=curinspos;
+                            end;
+                        end;
+                    end;
+                  inc(curinspos);
+                end;
+              ait_const:
+                begin
+                  inc(curinspos);
+                  if (tai_const(curtai).consttype=aitconst_64bit) then
+                    inc(curinspos);
+                end;
+              ait_real_32bit:
+                begin
+                  inc(curinspos);
+                end;
+              ait_comp_64bit,
+              ait_real_64bit:
+                begin
+                  inc(curinspos,2);
+                end;
+              ait_real_80bit:
+                begin
+                  inc(curinspos,3);
+                end;
+            end;
             { special case for case jump tables }
             if SimpleGetNextInstruction(curtai,hp) and
               (tai(hp).typ=ait_instruction) and
@@ -793,8 +915,16 @@ implementation
             else
               penalty:=0;
 
+            { FLD/FST VFP instructions have a limit of +/- 1024, not 4096 }
+            if SimpleGetNextInstruction(curtai,hp) and
+               (tai(hp).typ=ait_instruction) and
+               ((taicpu(hp).opcode=A_FLDS) or
+                (taicpu(hp).opcode=A_FLDD)) then
+              limit:=254;
+
             { don't miss an insert }
-            doinsert:=doinsert or (curpos-lastpos+penalty>1016);
+            doinsert:=doinsert or
+              (curinspos-lastinspos+penalty+extradataoffset>limit);
 
             { split only at real instructions else the test below fails }
             if doinsert and (curtai.typ=ait_instruction) and
@@ -809,7 +939,9 @@ implementation
                    )
               ) then
               begin
-                lastpos:=curpos;
+                lastinspos:=curinspos;
+                extradataoffset:=0;
+                limit:=1016;
                 doinsert:=false;
                 hp:=tai(curtai.next);
                 current_asmdata.getjumplabel(l);

+ 5 - 5
compiler/arm/agarmgas.pas

@@ -197,13 +197,13 @@ unit agarmgas;
                   begin
                     if not(first) then
                       getopstr:=getopstr+',';
-                    getopstr:=getopstr+gas_regname(newreg(R_INTREGISTER,r,R_SUBWHOLE));
+                    getopstr:=getopstr+gas_regname(newreg(o.regtyp,r,o.subreg));
                     first:=false;
                   end;
               getopstr:=getopstr+'}';
             end;
-			    top_conditioncode:
-			      getopstr:=cond2str[o.cc];
+          top_conditioncode:
+            getopstr:=cond2str[o.cc];
           top_ref:
             if o.ref^.refaddr=addr_full then
               begin
@@ -249,7 +249,7 @@ unit agarmgas;
                // writeln(taicpu(hp).fileinfo.line);
 
                { LDM and STM use references as first operand but they are written like a register }
-               if (i=0) and (op in [A_LDM,A_STM]) then
+               if (i=0) and (op in [A_LDM,A_STM,A_FSTM,A_FLDM]) then
                  begin
                    case taicpu(hp).oper[0]^.typ of
                      top_ref:
@@ -303,7 +303,7 @@ unit agarmgas;
             id     : as_darwin;
             idtxt  : 'AS-Darwin';
             asmbin : 'as';
-            asmcmd : '-o $OBJ $ASM -arch arm';
+            asmcmd : '-o $OBJ $ASM -arch $ARCH';
             supported_targets : [system_arm_darwin];
             flags : [af_allowdirect,af_needar,af_smartlink_sections,af_supports_dwarf];
             labelprefix : 'L';

+ 4 - 6
compiler/arm/armatt.inc

@@ -134,9 +134,7 @@
 'fdivd',
 'fdivs',
 'fldd',
-'fldmd',
-'fldms',
-'fldmx',
+'fldm',
 'flds',
 'fmacd',
 'fmacs',
@@ -166,9 +164,7 @@
 'fsqrtd',
 'fsqrts',
 'fstd',
-'fstmd',
-'fstms',
-'fstmx',
+'fstm',
 'fsts',
 'fsubd',
 'fsubs',
@@ -178,6 +174,8 @@
 'ftouis',
 'fuitod',
 'fuitos',
+'fmdrr',
+'fmrrd',
 'asr',
 'lsr',
 'lsl',

+ 0 - 2
compiler/arm/armatts.inc

@@ -199,7 +199,5 @@ attsufNONE,
 attsufNONE,
 attsufNONE,
 attsufNONE,
-attsufNONE,
-attsufNONE,
 attsufNONE
 );

+ 8 - 10
compiler/arm/armins.dat

@@ -486,11 +486,7 @@ reg32,reg32,reg32,reg32  \x16\x00\x80\x90		 ARM7
 
 [FLDDcc]
 
-[FLDMDcc]
-
-[FLDMScc]
-
-[FLDMXcc]
+[FLDMcc]
 
 [FLDScc]
 
@@ -550,11 +546,7 @@ reg32,reg32,reg32,reg32  \x16\x00\x80\x90		 ARM7
 
 [FSTDcc]
 
-[FSTMDcc]
-
-[FSTMScc]
-
-[FSTMXcc]
+[FSTMcc]
 
 [FSTScc]
 
@@ -574,6 +566,12 @@ reg32,reg32,reg32,reg32  \x16\x00\x80\x90		 ARM7
 
 [FUITOScc]
 
+[FMDRRcc]
+
+[FMRRDcc]
+
+; Thumb-2
+
 [ASRcc]
 
 [LSRcc]

+ 4 - 6
compiler/arm/armop.inc

@@ -134,9 +134,7 @@ A_FCVTSD,
 A_FDIVD,
 A_FDIVS,
 A_FLDD,
-A_FLDMD,
-A_FLDMS,
-A_FLDMX,
+A_FLDM,
 A_FLDS,
 A_FMACD,
 A_FMACS,
@@ -166,9 +164,7 @@ A_FSITOS,
 A_FSQRTD,
 A_FSQRTS,
 A_FSTD,
-A_FSTMD,
-A_FSTMS,
-A_FSTMX,
+A_FSTM,
 A_FSTS,
 A_FSUBD,
 A_FSUBS,
@@ -178,6 +174,8 @@ A_FTOUID,
 A_FTOUIS,
 A_FUITOD,
 A_FUITOS,
+A_FMDRR,
+A_FMRRD,
 A_ASR,
 A_LSR,
 A_LSL,

+ 96 - 75
compiler/arm/armreg.dat

@@ -2,86 +2,107 @@
 ; ARM registers
 ;
 ; layout
-; <name>,<type>,<value>,<stdname>,<stab idx>,<dwarf idx>
+; <name>,<type>,<subtype>,<value>,<stdname>,<stab idx>,<dwarf idx>
 ;
-NO,$00,$00,INVALID,-1,-1
+NO,$00,$00,$00,INVALID,-1,-1
 ; Integer registers
-R0,$01,$00,r0,0,0
-R1,$01,$01,r1,1,1
-R2,$01,$02,r2,2,2
-R3,$01,$03,r3,3,3
-R4,$01,$04,r4,4,4
-R5,$01,$05,r5,5,5
-R6,$01,$06,r6,6,6
-R7,$01,$07,r7,7,7
-R8,$01,$08,r8,8,8
-R9,$01,$09,r9,9,9
-R10,$01,$0a,r10,10,10
-R11,$01,$0b,r11,11,11
-R12,$01,$0c,r12,12,12
-R13,$01,$0d,r13,13,13
-R14,$01,$0e,r14,14,14
-R15,$01,$0f,r15,15,15
+R0,$01,$00,$00,r0,0,0
+R1,$01,$00,$01,r1,1,1
+R2,$01,$00,$02,r2,2,2
+R3,$01,$00,$03,r3,3,3
+R4,$01,$00,$04,r4,4,4
+R5,$01,$00,$05,r5,5,5
+R6,$01,$00,$06,r6,6,6
+R7,$01,$00,$07,r7,7,7
+R8,$01,$00,$08,r8,8,8
+R9,$01,$00,$09,r9,9,9
+R10,$01,$00,$0a,r10,10,10
+R11,$01,$00,$0b,r11,11,11
+R12,$01,$00,$0c,r12,12,12
+R13,$01,$00,$0d,r13,13,13
+R14,$01,$00,$0e,r14,14,14
+R15,$01,$00,$0f,r15,15,15
 
 ; Float registers
-F0,$02,$00,f0,32,16
-F1,$02,$01,f1,32,17
-F2,$02,$02,f2,32,18
-F3,$02,$03,f3,32,19
-F4,$02,$04,f4,32,20
-F5,$02,$05,f5,32,21
-F6,$02,$06,f6,32,22
-F7,$02,$07,f7,32,23
+F0,$02,$00,$00,f0,32,16
+F1,$02,$00,$01,f1,32,17
+F2,$02,$00,$02,f2,32,18
+F3,$02,$00,$03,f3,32,19
+F4,$02,$00,$04,f4,32,20
+F5,$02,$00,$05,f5,32,21
+F6,$02,$00,$06,f6,32,22
+F7,$02,$00,$07,f7,32,23
 
 ; MM registers
-S0,$03,$00,s0,0,0
-S1,$03,$00,s1,0,0
-D0,$03,$00,d0,0,0
-S2,$03,$00,s2,0,0
-S3,$03,$00,s3,0,0
-D1,$03,$00,d1,0,0
-S4,$03,$00,s4,0,0
-S5,$03,$00,s5,0,0
-D2,$03,$00,d2,0,0
-S6,$03,$00,s6,0,0
-S7,$03,$00,s7,0,0
-D3,$03,$00,d3,0,0
-S8,$03,$00,s8,0,0
-S9,$03,$00,s9,0,0
-D4,$03,$00,d4,0,0
-S10,$03,$00,s10,0,0
-S11,$03,$00,s11,0,0
-D5,$03,$00,d5,0,0
-S12,$03,$00,s12,0,0
-S13,$03,$00,s13,0,0
-D6,$03,$00,d6,0,0
-S14,$03,$00,s14,0,0
-S15,$03,$00,s15,0,0
-D7,$03,$00,d7,0,0
-S16,$03,$00,s16,0,0
-S17,$03,$00,s17,0,0
-D8,$03,$00,d8,0,0
-S18,$03,$00,s18,0,0
-S19,$03,$00,s19,0,0
-D9,$03,$00,d9,0,0
-S20,$03,$00,s20,0,0
-S21,$03,$00,s21,0,0
-D10,$03,$00,d10,0,0
-S22,$03,$00,s22,0,0
-S23,$03,$00,s23,0,0
-D11,$03,$00,d11,0,0
-S24,$03,$00,s24,0,0
-S25,$03,$00,s25,0,0
-D12,$03,$00,d12,0,0
-S26,$03,$00,s26,0,0
-S27,$03,$00,s27,0,0
-D13,$03,$00,d13,0,0
-S28,$03,$00,s28,0,0
-S29,$03,$00,s29,0,0
-D14,$03,$00,d14,0,0
-S30,$03,$00,s20,0,0
-S31,$03,$00,s21,0,0
-D15,$03,$00,d15,0,0
+; S0/S1/D0 etc have the same register number because the register allocator
+; cannot deal with D0 conflicting with both S0 and S1. This unfortunately
+; means that we can only use 16 single precision registers instead of 32,
+; even if no double precision ones are used...
+S0,$04,$06,$00,s0,0,0
+S1,$04,$06,$00,s1,0,0
+D0,$04,$07,$00,d0,0,0
+S2,$04,$06,$01,s2,0,0
+S3,$04,$06,$01,s3,0,0
+D1,$04,$07,$01,d1,0,0
+S4,$04,$06,$02,s4,0,0
+S5,$04,$06,$02,s5,0,0
+D2,$04,$07,$02,d2,0,0
+S6,$04,$06,$03,s6,0,0
+S7,$04,$06,$03,s7,0,0
+D3,$04,$07,$03,d3,0,0
+S8,$04,$06,$04,s8,0,0
+S9,$04,$06,$04,s9,0,0
+D4,$04,$07,$04,d4,0,0
+S10,$04,$06,$05,s10,0,0
+S11,$04,$06,$05,s11,0,0
+D5,$04,$07,$05,d5,0,0
+S12,$04,$06,$06,s12,0,0
+S13,$04,$06,$06,s13,0,0
+D6,$04,$07,$06,d6,0,0
+S14,$04,$06,$07,s14,0,0
+S15,$04,$06,$07,s15,0,0
+D7,$04,$07,$07,d7,0,0
+S16,$04,$06,$08,s16,0,0
+S17,$04,$06,$08,s17,0,0
+D8,$04,$07,$08,d8,0,0
+S18,$04,$06,$09,s18,0,0
+S19,$04,$06,$09,s19,0,0
+D9,$04,$07,$09,d9,0,0
+S20,$04,$06,$0A,s20,0,0
+S21,$04,$06,$0A,s21,0,0
+D10,$04,$07,$0A,d10,0,0
+S22,$04,$06,$0B,s22,0,0
+S23,$04,$06,$0B,s23,0,0
+D11,$04,$07,$0B,d11,0,0
+S24,$04,$06,$0C,s24,0,0
+S25,$04,$06,$0C,s25,0,0
+D12,$04,$07,$0C,d12,0,0
+S26,$04,$06,$0D,s26,0,0
+S27,$04,$06,$0D,s27,0,0
+D13,$04,$07,$0D,d13,0,0
+S28,$04,$06,$0E,s28,0,0
+S29,$04,$06,$0E,s29,0,0
+D14,$04,$07,$0E,d14,0,0
+S30,$04,$06,$0F,s20,0,0
+S31,$04,$06,$0F,s21,0,0
+D15,$04,$07,$0F,d15,0,0
+D16,$04,$07,$10,d16,0,0
+D17,$04,$07,$11,d17,0,0
+D18,$04,$07,$12,d18,0,0
+D19,$04,$07,$13,d19,0,0
+D20,$04,$07,$14,d20,0,0
+D21,$04,$07,$15,d21,0,0
+D22,$04,$07,$16,d22,0,0
+D23,$04,$07,$17,d23,0,0
+D24,$04,$07,$18,d24,0,0
+D25,$04,$07,$19,d25,0,0
+D26,$04,$07,$1A,d26,0,0
+D27,$04,$07,$1B,d27,0,0
+D28,$04,$07,$1C,d28,0,0
+D29,$04,$07,$1D,d29,0,0
+D30,$04,$07,$1E,d30,0,0
+D31,$04,$07,$1F,d31,0,0
 
 ; special registers
-CPSR_C,$04,$00,cpsr_c,0,0
+CPSR_C,$05,$00,$00,cpsr_c,0,0
+FPSCR,$05,$00,$01,fpscr,0,0

+ 467 - 73
compiler/arm/cgcpu.pas

@@ -104,6 +104,14 @@ unit cgcpu;
         procedure g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);override;
         procedure g_adjust_self_value(list:TAsmList;procdef: tprocdef;ioffset: aint); override;
         procedure g_stackpointer_alloc(list : TAsmList;size : longint);override;
+
+        procedure a_loadmm_reg_reg(list: TAsmList; fromsize, tosize : tcgsize;reg1, reg2: tregister;shuffle : pmmshuffle); override;
+        procedure a_loadmm_ref_reg(list: TAsmList; fromsize, tosize : tcgsize;const ref: treference; reg: tregister;shuffle : pmmshuffle); override;
+        procedure a_loadmm_reg_ref(list: TAsmList; fromsize, tosize : tcgsize;reg: tregister; const ref: treference;shuffle : pmmshuffle); override;
+        procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize : tcgsize;intreg, mmreg: tregister; shuffle: pmmshuffle); override;
+        procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize : tcgsize;mmreg, intreg: tregister; shuffle : pmmshuffle); override;
+
+        procedure a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;src,dst: tregister;shuffle : pmmshuffle); override;
       private
         { clear out potential overflow bits from 8 or 16 bit operations  }
         { the upper 24/16 bits of a register after an operation          }
@@ -126,6 +134,8 @@ unit cgcpu;
         procedure a_op64_reg_reg_reg(list: TAsmList;op:TOpCG;size : tcgsize;regsrc1,regsrc2,regdst : tregister64);override;
         procedure a_op64_const_reg_reg_checkoverflow(list: TAsmList;op:TOpCG;size : tcgsize;value : int64;regsrc,regdst : tregister64;setflags : boolean;var ovloc : tlocation);override;
         procedure a_op64_reg_reg_reg_checkoverflow(list: TAsmList;op:TOpCG;size : tcgsize;regsrc1,regsrc2,regdst : tregister64;setflags : boolean;var ovloc : tlocation);override;
+        procedure a_loadmm_intreg64_reg(list: TAsmList; mmsize: tcgsize; intreg: tregister64; mmreg: tregister);override;
+        procedure a_loadmm_reg_intreg64(list: TAsmList; mmsize: tcgsize; mmreg: tregister; intreg: tregister64);override;
       end;
 
       Tthumb2cgarm = class(tcgarm)
@@ -203,14 +213,26 @@ unit cgcpu;
               [RS_R0,RS_R1,RS_R2,RS_R3,RS_R4,RS_R5,RS_R6,RS_R7,RS_R8,
                RS_R9,RS_R10,RS_R12,RS_R14],first_int_imreg,[])
         else
-          { r9 is not available on Darwin according to the llvm code generator }
+          { r9 is not (always) available on Darwin according to the llvm code
+            generator. }
           rg[R_INTREGISTER]:=trgintcpu.create(R_INTREGISTER,R_SUBWHOLE,
               [RS_R0,RS_R1,RS_R2,RS_R3,RS_R4,RS_R5,RS_R6,RS_R7,RS_R8,
                RS_R10,RS_R12,RS_R14],first_int_imreg,[]);
         rg[R_FPUREGISTER]:=trgcpu.create(R_FPUREGISTER,R_SUBNONE,
             [RS_F0,RS_F1,RS_F2,RS_F3,RS_F4,RS_F5,RS_F6,RS_F7],first_fpu_imreg,[]);
-        rg[R_MMREGISTER]:=trgcpu.create(R_MMREGISTER,R_SUBNONE,
-            [RS_S0,RS_S1,RS_R2,RS_R3,RS_R4,RS_S31],first_mm_imreg,[]);
+        { The register allocator currently cannot deal with multiple
+          non-overlapping subregs per register, so we can only use
+          half the single precision registers for now (as sub registers of the
+          double precision ones). }
+        if current_settings.fputype=fpu_vfpv3 then
+          rg[R_MMREGISTER]:=trgcpu.create(R_MMREGISTER,R_SUBFD,
+              [RS_D0,RS_D1,RS_D2,RS_D3,RS_D4,RS_D5,RS_D6,RS_D7,
+               RS_D16,RS_D17,RS_D18,RS_D19,RS_D20,RS_D21,RS_D22,RS_D23,RS_D24,RS_D25,RS_D26,RS_D27,RS_D28,RS_D29,RS_D30,RS_D31,
+               RS_D8,RS_D9,RS_D10,RS_D11,RS_D12,RS_D13,RS_D14,RS_D15
+              ],first_mm_imreg,[])
+        else
+          rg[R_MMREGISTER]:=trgcpu.create(R_MMREGISTER,R_SUBFD,
+              [RS_D0,RS_D1,RS_D2,RS_D3,RS_D4,RS_D5,RS_D6,RS_D7,RS_D8,RS_D9,RS_D10,RS_D11,RS_D12,RS_D13,RS_D14,RS_D15],first_mm_imreg,[]);
       end;
 
 
@@ -895,7 +917,7 @@ unit cgcpu;
              (ref.offset>255)
             )
            ) or
-           ((op in [A_LDF,A_STF]) and
+           ((op in [A_LDF,A_STF,A_FLDS,A_FLDD,A_FSTS,A_FSTD]) and
             ((ref.offset<-1020) or
              (ref.offset>1020) or
              { the usual pc relative symbol handling assumes possible offsets of +/- 4095 }
@@ -965,7 +987,7 @@ unit cgcpu;
 
         { floating point operations have only limited references
           we expect here, that a base is already set }
-        if (op in [A_LDF,A_STF]) and (ref.index<>NR_NO) then
+        if (op in [A_LDF,A_STF,A_FLDS,A_FLDD,A_FSTS,A_FSTD]) and (ref.index<>NR_NO) then
           begin
             if ref.shiftmode<>SM_none then
               internalerror(200309121);
@@ -1023,7 +1045,9 @@ unit cgcpu;
            OS_S16:
              oppostfix:=PF_H;
            OS_32,
-           OS_S32:
+           OS_S32,
+           { for vfp value stored in integer register }
+           OS_F32:
              oppostfix:=PF_None;
            else
              InternalError(200308295);
@@ -1212,6 +1236,7 @@ unit cgcpu;
                 a_loadfpu_ref_reg(list,size,size,ref,hloc^.register);
               LOC_REGISTER :
                 case hloc^.size of
+                  OS_32,
                   OS_F32:
                     a_load_ref_reg(list,OS_32,OS_32,href,hloc^.register);
                   OS_64,
@@ -1224,7 +1249,7 @@ unit cgcpu;
                 begin
                   reference_reset_base(href2,hloc^.reference.index,hloc^.reference.offset,paraloc.alignment);
                   { concatcopy should choose the best way to copy the data }
-                  g_concatcopy(list,href,href2,tcgsize2size[size]);
+                  g_concatcopy(list,href,href2,tcgsize2size[hloc^.size]);
                 end;
               else
                 internalerror(200408241);
@@ -1354,8 +1379,10 @@ unit cgcpu;
          shift : byte;
          firstfloatreg,lastfloatreg,
          r : byte;
+         mmregs,
          regs : tcpuregisterset;
-         stackmisalignment: pint;
+         stackmisalignment : pint;
+         postfix: toppostfix;
       begin
         LocalSize:=align(LocalSize,4);
         { call instruction does not put anything on the stack }
@@ -1363,15 +1390,29 @@ unit cgcpu;
         if not(nostackframe) then
           begin
             firstfloatreg:=RS_NO;
-            { save floating point registers? }
-            for r:=RS_F0 to RS_F7 do
-              if r in rg[R_FPUREGISTER].used_in_proc-paramanager.get_volatile_registers_fpu(pocall_stdcall) then
+            mmregs:=[];
+            case current_settings.fputype of
+              fpu_fpa,
+              fpu_fpa10,
+              fpu_fpa11:
                 begin
-                  if firstfloatreg=RS_NO then
-                    firstfloatreg:=r;
-                  lastfloatreg:=r;
-                  inc(stackmisalignment,12);
+                  { save floating point registers? }
+                  regs:=rg[R_FPUREGISTER].used_in_proc-paramanager.get_volatile_registers_fpu(pocall_stdcall);
+                  for r:=RS_F0 to RS_F7 do
+                    if r in regs then
+                      begin
+                        if firstfloatreg=RS_NO then
+                          firstfloatreg:=r;
+                        lastfloatreg:=r;
+                        inc(stackmisalignment,12);
+                      end;
                 end;
+              fpu_vfpv2,
+              fpu_vfpv3:
+                begin;
+                  mmregs:=rg[R_MMREGISTER].used_in_proc-paramanager.get_volatile_registers_mm(pocall_stdcall);
+                end;
+            end;
             a_reg_alloc(list,NR_STACK_POINTER_REG);
             if current_procinfo.framepointer<>NR_STACK_POINTER_REG then
               begin
@@ -1385,21 +1426,31 @@ unit cgcpu;
             ref.index:=NR_STACK_POINTER_REG;
             ref.addressmode:=AM_PREINDEXED;
             regs:=rg[R_INTREGISTER].used_in_proc-paramanager.get_volatile_registers_int(pocall_stdcall);
+            { the (old) ARM APCS requires saving both the stack pointer (to
+              crawl the stack) and the PC (to identify the function this
+              stack frame belongs to) -> also save R12 (= copy of R13 on entry)
+              and R15 -- still needs updating for EABI and Darwin, they don't
+              need that }
             if current_procinfo.framepointer<>NR_STACK_POINTER_REG then
-              regs:=regs+[RS_R11,RS_R12,RS_R14,RS_R15]
+              regs:=regs+[RS_FRAME_POINTER_REG,RS_R12,RS_R14,RS_R15]
             else
               if (regs<>[]) or (pi_do_call in current_procinfo.flags) then
                 include(regs,RS_R14);
             if regs<>[] then
-              begin
-                for r:=RS_R0 to RS_R15 do
-                  if (r in regs) then
-                    inc(stackmisalignment,4);
-                list.concat(setoppostfix(taicpu.op_ref_regset(A_STM,ref,regs),PF_FD));
-              end;
+               begin
+                 for r:=RS_R0 to RS_R15 do
+                   if (r in regs) then
+                     inc(stackmisalignment,4);
+                 list.concat(setoppostfix(taicpu.op_ref_regset(A_STM,ref,R_INTREGISTER,R_SUBWHOLE,regs),PF_FD));
+               end;
 
             if current_procinfo.framepointer<>NR_STACK_POINTER_REG then
-              list.concat(taicpu.op_reg_reg_const(A_SUB,NR_FRAME_POINTER_REG,NR_R12,4));
+              begin
+                { the framepointer now points to the saved R15, so the saved
+                  framepointer is at R11-12 (for get_caller_frame) }
+                list.concat(taicpu.op_reg_reg_const(A_SUB,NR_FRAME_POINTER_REG,NR_R12,4));
+                a_reg_dealloc(list,NR_R12);
+              end;
 
             stackmisalignment:=stackmisalignment mod current_settings.alignment.localalignmax;
             if (LocalSize<>0) or
@@ -1423,62 +1474,116 @@ unit cgcpu;
                   end;
               end;
 
-            if firstfloatreg<>RS_NO then
-              begin
-                reference_reset(ref,4);
-                if tg.direction*tarmprocinfo(current_procinfo).floatregstart>=1023 then
-                  begin
-                    a_load_const_reg(list,OS_ADDR,-tarmprocinfo(current_procinfo).floatregstart,NR_R12);
-                    list.concat(taicpu.op_reg_reg_reg(A_SUB,NR_R12,current_procinfo.framepointer,NR_R12));
-                    ref.base:=NR_R12;
-                  end
-                else
-                  begin
-                    ref.base:=current_procinfo.framepointer;
-                    ref.offset:=tarmprocinfo(current_procinfo).floatregstart;
-                  end;
-                list.concat(taicpu.op_reg_const_ref(A_SFM,newreg(R_FPUREGISTER,firstfloatreg,R_SUBWHOLE),
-                  lastfloatreg-firstfloatreg+1,ref));
-              end;
-          end;
+            if (mmregs<>[]) or
+               (firstfloatreg<>RS_NO) then
+             begin
+               reference_reset(ref,4);
+               if (tg.direction*tarmprocinfo(current_procinfo).floatregstart>=1023) or
+                  (current_settings.fputype in [fpu_vfpv2,fpu_vfpv3]) then
+                 begin
+                   if not is_shifter_const(tarmprocinfo(current_procinfo).floatregstart,shift) then
+                     begin
+                       a_reg_alloc(list,NR_R12);
+                       a_load_const_reg(list,OS_ADDR,-tarmprocinfo(current_procinfo).floatregstart,NR_R12);
+                       list.concat(taicpu.op_reg_reg_reg(A_SUB,NR_R12,current_procinfo.framepointer,NR_R12));
+                       a_reg_dealloc(list,NR_R12);
+                     end
+                   else
+                     list.concat(taicpu.op_reg_reg_const(A_SUB,NR_R12,current_procinfo.framepointer,-tarmprocinfo(current_procinfo).floatregstart));
+                   ref.base:=NR_R12;
+                 end
+               else
+                 begin
+                   ref.base:=current_procinfo.framepointer;
+                   ref.offset:=tarmprocinfo(current_procinfo).floatregstart;
+                 end;
+
+               case current_settings.fputype of
+                 fpu_fpa,
+                 fpu_fpa10,
+                 fpu_fpa11:
+                   begin
+                     list.concat(taicpu.op_reg_const_ref(A_SFM,newreg(R_FPUREGISTER,firstfloatreg,R_SUBWHOLE),
+                       lastfloatreg-firstfloatreg+1,ref));
+                   end;
+                 fpu_vfpv2,
+                 fpu_vfpv3:
+                   begin
+                     ref.index:=ref.base;
+                     ref.base:=NR_NO;
+                     { FSTMX is deprecated on ARMv6 and later }
+                     if (current_settings.cputype<cpu_armv6) then
+                       postfix:=PF_IAX
+                     else
+                       postfix:=PF_IAD;
+                     list.concat(setoppostfix(taicpu.op_ref_regset(A_FSTM,ref,R_MMREGISTER,R_SUBFD,mmregs),postfix));
+                   end;
+               end;
+             end;
+        end;
       end;
 
 
     procedure tcgarm.g_proc_exit(list : TAsmList;parasize : longint;nostackframe:boolean);
       var
          ref : treference;
+         LocalSize : longint;
          firstfloatreg,lastfloatreg,
-         r : byte;
+         r,
          shift : byte;
+         mmregs,
          regs : tcpuregisterset;
-         LocalSize : longint;
          stackmisalignment: pint;
+         mmpostfix: toppostfix;
       begin
         if not(nostackframe) then
           begin
             stackmisalignment:=0;
-            { restore floating point register }
             firstfloatreg:=RS_NO;
-            { save floating point registers? }
-            for r:=RS_F0 to RS_F7 do
-              if r in rg[R_FPUREGISTER].used_in_proc-paramanager.get_volatile_registers_fpu(pocall_stdcall) then
+            mmregs:=[];
+            case current_settings.fputype of
+              fpu_fpa,
+              fpu_fpa10,
+              fpu_fpa11:
                 begin
-                  if firstfloatreg=RS_NO then
-                    firstfloatreg:=r;
-                  lastfloatreg:=r;
-                  { floating point register space is already included in
-                    localsize below by calc_stackframe_size
-                   inc(stackmisalignment,12);
-                  }
+                  { restore floating point registers? }
+                  regs:=rg[R_FPUREGISTER].used_in_proc-paramanager.get_volatile_registers_fpu(pocall_stdcall);
+                  for r:=RS_F0 to RS_F7 do
+                    if r in regs then
+                      begin
+                        if firstfloatreg=RS_NO then
+                          firstfloatreg:=r;
+                        lastfloatreg:=r;
+                        { floating point register space is already included in
+                          localsize below by calc_stackframe_size
+                         inc(stackmisalignment,12);
+                        }
+                      end;
+                end;
+              fpu_vfpv2,
+              fpu_vfpv3:
+                begin;
+                  { restore vfp registers? }
+                  mmregs:=rg[R_MMREGISTER].used_in_proc-paramanager.get_volatile_registers_mm(pocall_stdcall);
                 end;
+            end;
 
-            if firstfloatreg<>RS_NO then
+            if (firstfloatreg<>RS_NO) or
+               (mmregs<>[]) then
               begin
                 reference_reset(ref,4);
-                if tg.direction*tarmprocinfo(current_procinfo).floatregstart>=1023 then
+                if (tg.direction*tarmprocinfo(current_procinfo).floatregstart>=1023) or
+                   (current_settings.fputype in [fpu_vfpv2,fpu_vfpv3]) then
                   begin
-                    a_load_const_reg(list,OS_ADDR,-tarmprocinfo(current_procinfo).floatregstart,NR_R12);
-                    list.concat(taicpu.op_reg_reg_reg(A_SUB,NR_R12,current_procinfo.framepointer,NR_R12));
+                    if not is_shifter_const(tarmprocinfo(current_procinfo).floatregstart,shift) then
+                      begin
+                        a_reg_alloc(list,NR_R12);
+                        a_load_const_reg(list,OS_ADDR,-tarmprocinfo(current_procinfo).floatregstart,NR_R12);
+                        list.concat(taicpu.op_reg_reg_reg(A_SUB,NR_R12,current_procinfo.framepointer,NR_R12));
+                        a_reg_dealloc(list,NR_R12);
+                      end
+                    else
+                      list.concat(taicpu.op_reg_reg_const(A_SUB,NR_R12,current_procinfo.framepointer,-tarmprocinfo(current_procinfo).floatregstart));
                     ref.base:=NR_R12;
                   end
                 else
@@ -1486,23 +1591,44 @@ unit cgcpu;
                     ref.base:=current_procinfo.framepointer;
                     ref.offset:=tarmprocinfo(current_procinfo).floatregstart;
                   end;
-                list.concat(taicpu.op_reg_const_ref(A_LFM,newreg(R_FPUREGISTER,firstfloatreg,R_SUBWHOLE),
-                  lastfloatreg-firstfloatreg+1,ref));
+                case current_settings.fputype of
+                  fpu_fpa,
+                  fpu_fpa10,
+                  fpu_fpa11:
+                    begin
+                      list.concat(taicpu.op_reg_const_ref(A_LFM,newreg(R_FPUREGISTER,firstfloatreg,R_SUBWHOLE),
+                        lastfloatreg-firstfloatreg+1,ref));
+                    end;
+                  fpu_vfpv2,
+                  fpu_vfpv3:
+                    begin
+                      ref.index:=ref.base;
+                      ref.base:=NR_NO;
+                      { FLDMX is deprecated on ARMv6 and later }
+                      if (current_settings.cputype<cpu_armv6) then
+                        mmpostfix:=PF_IAX
+                      else
+                        mmpostfix:=PF_IAD;
+                      list.concat(setoppostfix(taicpu.op_ref_regset(A_FLDM,ref,R_MMREGISTER,R_SUBFD,mmregs),mmpostfix));
+                    end;
+                end;
               end;
 
-            regs:=rg[R_INTREGISTER].used_in_proc-paramanager.get_volatile_registers_int(pocall_stdcall);
+            regs:=rg[R_INTREGISTER].used_in_proc-paramanager.get_volatile_registers_int(pocall_stdcall)        ;
             if (pi_do_call in current_procinfo.flags) or (regs<>[]) then
               begin
                 exclude(regs,RS_R14);
                 include(regs,RS_R15);
               end;
+            { restore saved stack pointer to SP (R13) and saved lr to PC (R15).
+              The saved PC came after that but is discarded, since we restore
+              the stack pointer }
             if (current_procinfo.framepointer<>NR_STACK_POINTER_REG) then
-              regs:=regs+[RS_R11,RS_R13,RS_R15];
+              regs:=regs+[RS_FRAME_POINTER_REG,RS_R13,RS_R15];
 
             for r:=RS_R0 to RS_R15 do
               if (r in regs) then
                 inc(stackmisalignment,4);
-
             stackmisalignment:=stackmisalignment mod current_settings.alignment.localalignmax;
             if (current_procinfo.framepointer=NR_STACK_POINTER_REG) then
               begin
@@ -1533,7 +1659,7 @@ unit cgcpu;
                     reference_reset(ref,4);
                     ref.index:=NR_STACK_POINTER_REG;
                     ref.addressmode:=AM_PREINDEXED;
-                    list.concat(setoppostfix(taicpu.op_ref_regset(A_LDM,ref,regs),PF_FD));
+                    list.concat(setoppostfix(taicpu.op_ref_regset(A_LDM,ref,R_INTREGISTER,R_SUBWHOLE,regs),PF_FD));
                   end;
               end
             else
@@ -1541,7 +1667,7 @@ unit cgcpu;
                 { restore int registers and return }
                 reference_reset(ref,4);
                 ref.index:=NR_FRAME_POINTER_REG;
-                list.concat(setoppostfix(taicpu.op_ref_regset(A_LDM,ref,regs),PF_EA));
+                list.concat(setoppostfix(taicpu.op_ref_regset(A_LDM,ref,R_INTREGISTER,R_SUBWHOLE,regs),PF_EA));
               end;
           end
         else
@@ -2046,6 +2172,254 @@ unit cgcpu;
       end;
 
 
+    { Looks up the VFP opcode for a scalar register-to-register transfer
+      between two floating point sizes: FCPYS/FCPYD when both sizes are the
+      same, FCVTSD/FCVTDS for a single/double pair. Every combination that
+      involves a size other than OS_F32/OS_F64 maps to A_NONE and ends in
+      internalerror(200312205).
+      NOTE(review): the only caller visible in this unit (a_loadmm_reg_reg)
+      passes (tosize,fromsize), i.e. swapped with respect to the parameter
+      names, so the first table index is effectively the destination size.
+      That matches FCVTSD producing a single from a double, but confirm
+      before adding a caller that passes the sizes in declared order. }
+    function get_scalar_mm_op(fromsize,tosize : tcgsize) : tasmop;
+      const
+        { rows/columns: OS_F32, OS_F64 and the three remaining float sizes up
+          to OS_F128; only the upper-left 2x2 corner is populated }
+        convertop : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
+          (A_FCPYS,A_FCVTSD,A_NONE,A_NONE,A_NONE),
+          (A_FCVTDS,A_FCPYD,A_NONE,A_NONE,A_NONE),
+          (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
+          (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
+          (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
+      begin
+        result:=convertop[fromsize,tosize];
+        { unsupported size combination }
+        if result=A_NONE then
+          internalerror(200312205);
+      end;
+
+
+    { Emits a VFP register-to-register move from reg1 to reg2.
+      Without a shuffle descriptor only same-size copies are accepted; with a
+      scalar shuffle the opcode (copy or single<->double conversion) comes
+      from get_scalar_mm_op. Plain copies are additionally registered via
+      add_move_instruction. }
+    procedure tcgarm.a_loadmm_reg_reg(list: tasmlist; fromsize,tosize: tcgsize; reg1,reg2: tregister; shuffle: pmmshuffle);
+      var
+        moveop: tasmop;
+        moveinstr: taicpu;
+      begin
+        if assigned(shuffle) then
+          begin
+            { only scalar shuffles are supported }
+            if not shufflescalar(shuffle) then
+              internalerror(2009112407);
+            moveop:=get_scalar_mm_op(tosize,fromsize);
+          end
+        else
+          begin
+            { no shuffle info: no conversion allowed }
+            if fromsize<>tosize then
+              internalerror(2009112406);
+            { needs correct size in case of spilling }
+            case fromsize of
+              OS_F32:
+                moveop:=A_FCPYS;
+              OS_F64:
+                moveop:=A_FCPYD;
+              else
+                internalerror(2009112405);
+            end;
+          end;
+        moveinstr:=taicpu.op_reg_reg(moveop,reg2,reg1);
+        list.concat(moveinstr);
+        { conversions are not plain moves, so only copies are registered }
+        if moveinstr.opcode in [A_FCPYS,A_FCPYD] then
+          add_move_instruction(moveinstr);
+      end;
+
+
+    { Loads a value of size fromsize from memory reference ref into the VFP
+      register reg, which is to hold a value of size tosize.
+      - Only nil or scalar shuffles are accepted.
+      - Integer source sizes (OS_32/OS_S32, OS_64/OS_S64) are treated as raw
+        bit patterns of the float size with the same width; tosize must then
+        be exactly that float size.
+      - If fromsize and tosize differ, the value is first loaded into a
+        temporary mm register and afterwards moved/converted into reg by
+        a_loadmm_reg_reg.
+      - References with alignment 1 or 2 are loaded through integer
+        register(s) and then transferred to the mm register; all other
+        references use FLDS/FLDD via handle_load_store. }
+    procedure tcgarm.a_loadmm_ref_reg(list: tasmlist; fromsize,tosize: tcgsize; const ref: treference; reg: tregister; shuffle: pmmshuffle);
+      var
+        intreg,
+        tmpmmreg : tregister;
+        reg64    : tregister64;
+        op       : tasmop;
+      begin
+        if assigned(shuffle) and
+           not(shufflescalar(shuffle)) then
+          internalerror(2009112413);
+
+        { map integer source sizes onto the float size of the same width }
+        case fromsize of
+          OS_32,OS_S32:
+            begin
+              fromsize:=OS_F32;
+              { since we are loading an integer, no conversion may be required }
+              if (fromsize<>tosize) then
+                internalerror(2009112801);
+            end;
+          OS_64,OS_S64:
+            begin
+              fromsize:=OS_F64;
+              { since we are loading an integer, no conversion may be required }
+              if (fromsize<>tosize) then
+                internalerror(2009112901);
+            end;
+        end;
+
+        { a size conversion needs an intermediate register of the source size }
+        if (fromsize<>tosize) then
+          tmpmmreg:=getmmregister(list,fromsize)
+        else
+          tmpmmreg:=reg;
+        if (ref.alignment in [1,2]) then
+          begin
+            { presumably FLDS/FLDD cannot be used on such references, hence the
+              detour via integer registers -- TODO confirm }
+            case fromsize of
+              OS_F32:
+                begin
+                  intreg:=getintregister(list,OS_32);
+                  a_load_ref_reg(list,OS_32,OS_32,ref,intreg);
+                  a_loadmm_intreg_reg(list,OS_32,OS_F32,intreg,tmpmmreg,mms_movescalar);
+                end;
+              OS_F64:
+                begin
+                  reg64.reglo:=getintregister(list,OS_32);
+                  reg64.reghi:=getintregister(list,OS_32);
+                  cg64.a_load64_ref_reg(list,ref,reg64);
+                  cg64.a_loadmm_intreg64_reg(list,OS_F64,reg64,tmpmmreg);
+                end;
+              else
+                internalerror(2009112412);
+            end;
+          end
+        else
+          begin
+             { direct VFP load, opcode selected by the size in memory }
+             case fromsize of
+               OS_F32:
+                 op:=A_FLDS;
+               OS_F64:
+                 op:=A_FLDD;
+               else
+                 internalerror(2009112415);
+             end;
+             handle_load_store(list,op,PF_None,tmpmmreg,ref);
+          end;
+
+        { convert from the temporary into the real destination if required }
+        if (tmpmmreg<>reg) then
+          a_loadmm_reg_reg(list,fromsize,tosize,tmpmmreg,reg,shuffle);
+      end;
+
+
+    { Stores the VFP register reg (holding a value of size fromsize) to the
+      memory reference ref as a value of size tosize.
+      - Only nil or scalar shuffles are accepted.
+      - Integer destination sizes (OS_32/OS_S32, OS_64/OS_S64) are treated as
+        raw bit patterns of the float size with the same width; fromsize must
+        then be exactly that float size.
+      - If fromsize and tosize differ, the value is first converted into a
+        temporary mm register of size tosize, and that register is stored.
+      - References with alignment 1 or 2 are written through integer
+        register(s); all other references use FSTS/FSTD via
+        handle_load_store. }
+    procedure tcgarm.a_loadmm_reg_ref(list: tasmlist; fromsize,tosize: tcgsize; reg: tregister; const ref: treference; shuffle: pmmshuffle);
+      var
+        intreg,
+        tmpmmreg : tregister;
+        reg64    : tregister64;
+        op       : tasmop;
+      begin
+        if assigned(shuffle) and
+           not(shufflescalar(shuffle)) then
+          internalerror(2009112416);
+
+        { map integer destination sizes onto the float size of the same width }
+        case tosize of
+          OS_32,OS_S32:
+            begin
+              tosize:=OS_F32;
+              { since we are storing to an integer, no conversion may be required }
+              if (fromsize<>tosize) then
+                internalerror(2009112801);
+            end;
+          OS_64,OS_S64:
+            begin
+              tosize:=OS_F64;
+              { since we are storing to an integer, no conversion may be required }
+              if (fromsize<>tosize) then
+                internalerror(2009112901);
+            end;
+        end;
+
+        { after this point tmpmmreg always contains a value of size tosize }
+        if (fromsize<>tosize) then
+          begin
+            tmpmmreg:=getmmregister(list,tosize);
+            a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpmmreg,shuffle);
+          end
+        else
+          tmpmmreg:=reg;
+        if (ref.alignment in [1,2]) then
+          begin
+            case tosize of
+              OS_F32:
+                begin
+                  intreg:=getintregister(list,OS_32);
+                  a_loadmm_reg_intreg(list,OS_F32,OS_32,tmpmmreg,intreg,shuffle);
+                  a_load_reg_ref(list,OS_32,OS_32,intreg,ref);
+                end;
+              OS_F64:
+                begin
+                  reg64.reglo:=getintregister(list,OS_32);
+                  reg64.reghi:=getintregister(list,OS_32);
+                  cg64.a_loadmm_reg_intreg64(list,OS_F64,tmpmmreg,reg64);
+                  cg64.a_load64_reg_ref(list,reg64,ref);
+                end;
+              else
+                internalerror(2009112417);
+            end;
+          end
+        else
+          begin
+             { the store width must follow the size of the value that is
+               actually in tmpmmreg (tosize), not the size of the original
+               source register: otherwise a double->single store would emit
+               FSTD and write 8 bytes }
+             case tosize of
+               OS_F32:
+                 op:=A_FSTS;
+               OS_F64:
+                 op:=A_FSTD;
+               else
+                 internalerror(2009112418);
+             end;
+             handle_load_store(list,op,PF_None,tmpmmreg,ref);
+          end;
+      end;
+
+
+    { Transfers the contents of a 32 bit integer register to a VFP register
+      by emitting FMSR mmreg,intreg. Accepted only for fromsize OS_32/OS_S32
+      and tosize OS_F32, with a nil or scalar shuffle; anything else raises
+      an internal error. }
+    procedure tcgarm.a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize : tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle);
+      begin
+        { this code can only be used to transfer raw data, not to perform
+          conversions }
+        if (tosize<>OS_F32) then
+          internalerror(2009112419);
+        if not(fromsize in [OS_32,OS_S32]) then
+          internalerror(2009112420);
+        if assigned(shuffle) and
+           not shufflescalar(shuffle) then
+          internalerror(2009112516);
+        list.concat(taicpu.op_reg_reg(A_FMSR,mmreg,intreg));
+      end;
+
+
+    { Transfers the contents of a VFP register to a 32 bit integer register
+      by emitting FMRS intreg,mmreg. Accepted only for fromsize OS_F32 and
+      tosize OS_32/OS_S32, with a nil or scalar shuffle; anything else raises
+      an internal error. }
+    procedure tcgarm.a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize : tcgsize; mmreg, intreg: tregister;shuffle : pmmshuffle);
+      begin
+        { this code can only be used to transfer raw data, not to perform
+          conversions }
+        if (fromsize<>OS_F32) then
+          internalerror(2009112430);
+        { NOTE(review): error id 2009112420 is also used by
+          a_loadmm_intreg_reg, so it does not identify this site uniquely }
+        if not(tosize in [OS_32,OS_S32]) then
+          internalerror(2009112420);
+        if assigned(shuffle) and
+           not shufflescalar(shuffle) then
+          internalerror(2009112514);
+        list.concat(taicpu.op_reg_reg(A_FMRS,intreg,mmreg));
+      end;
+
+
+      { Register/register operation on VFP registers. Only "xor reg,reg"
+        (i.e. clearing a register) is supported: a zero is loaded into a
+        temporary integer register and transferred to dst with FMSR (single)
+        or FMDRR (double). Every other request raises an internal error. }
+      procedure tcgarm.a_opmm_reg_reg(list: tasmlist; op: topcg; size: tcgsize; src, dst: tregister; shuffle: pmmshuffle);
+        var
+          zeroreg: tregister;
+        begin
+          { the vfp doesn't support xor nor any other logical operation, but
+            this routine is used to initialise global mm regvars. We can
+            easily initialise an mm reg with 0 though. }
+          if op<>OP_XOR then
+            internalerror(2009112906);
+          { must be a plain self-xor of a register of the stated size }
+          if (src<>dst) or
+             (reg_cgsize(src)<>size) or
+             assigned(shuffle) then
+            internalerror(2009112907);
+          zeroreg:=getintregister(list,OS_32);
+          a_load_const_reg(list,OS_32,0,zeroreg);
+          if size=OS_F32 then
+            list.concat(taicpu.op_reg_reg(A_FMSR,dst,zeroreg))
+          else if size=OS_F64 then
+            { both halves of the double come from the same zero register }
+            list.concat(taicpu.op_reg_reg_reg(A_FMDRR,dst,zeroreg,zeroreg))
+          else
+            internalerror(2009112908);
+        end;
+
+
     procedure tcgarm.g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);
 
       procedure loadvmttor12;
@@ -2213,6 +2587,26 @@ unit cgcpu;
       end;
 
 
+    { Transfers a 64 bit value held in an integer register pair to a VFP
+      register by emitting FMDRR mmreg,intreg.reglo,intreg.reghi. Only
+      mmsize=OS_F64 is accepted; anything else raises an internal error. }
+    procedure tcg64farm.a_loadmm_intreg64_reg(list: TAsmList; mmsize: tcgsize; intreg: tregister64; mmreg: tregister);
+      begin
+        { this code can only be used to transfer raw data, not to perform
+          conversions }
+        if (mmsize<>OS_F64) then
+          internalerror(2009112405);
+        list.concat(taicpu.op_reg_reg_reg(A_FMDRR,mmreg,intreg.reglo,intreg.reghi));
+      end;
+
+
+    { Transfers the contents of a VFP register to a 64 bit integer register
+      pair by emitting FMRRD intreg.reglo,intreg.reghi,mmreg. Only
+      mmsize=OS_F64 is accepted; anything else raises an internal error. }
+    procedure tcg64farm.a_loadmm_reg_intreg64(list: TAsmList; mmsize: tcgsize; mmreg: tregister; intreg: tregister64);
+      begin
+        { this code can only be used to transfer raw data, not to perform
+          conversions }
+        if (mmsize<>OS_F64) then
+          internalerror(2009112406);
+        list.concat(taicpu.op_reg_reg_reg(A_FMRRD,intreg.reglo,intreg.reghi,mmreg));
+      end;
+
+
     procedure tcg64farm.a_op64_const_reg_reg_checkoverflow(list: TAsmList;op:TOpCG;size : tcgsize;value : int64;regsrc,regdst : tregister64;setflags : boolean;var ovloc : tlocation);
       var
         tmpreg : tregister;
@@ -2913,7 +3307,7 @@ unit cgcpu;
             regs:=rg[R_INTREGISTER].used_in_proc-paramanager.get_volatile_registers_int(pocall_stdcall);
 
             if current_procinfo.framepointer<>NR_STACK_POINTER_REG then
-              regs:=regs+[RS_R11,RS_R14]
+              regs:=regs+[RS_FRAME_POINTER_REG,RS_R14]
             else if (regs<>[]) or (pi_do_call in current_procinfo.flags) then
               include(regs,RS_R14);
 
@@ -2922,7 +3316,7 @@ unit cgcpu;
                 for r:=RS_R0 to RS_R15 do
                   if (r in regs) then
                     inc(stackmisalignment,4);
-                list.concat(setoppostfix(taicpu.op_ref_regset(A_STM,ref,regs),PF_FD));
+                list.concat(setoppostfix(taicpu.op_ref_regset(A_STM,ref,R_INTREGISTER,R_SUBWHOLE,regs),PF_FD));
               end;
 
             if current_procinfo.framepointer<>NR_STACK_POINTER_REG then
@@ -3024,7 +3418,7 @@ unit cgcpu;
                 include(regs,RS_R15);
               end;
             if (current_procinfo.framepointer<>NR_STACK_POINTER_REG) then
-              regs:=regs+[RS_R11,RS_R15];
+              regs:=regs+[RS_FRAME_POINTER_REG,RS_R15];
 
             for r:=RS_R0 to RS_R15 do
               if (r in regs) then
@@ -3060,17 +3454,17 @@ unit cgcpu;
                     reference_reset(ref,4);
                     ref.index:=NR_STACK_POINTER_REG;
                     ref.addressmode:=AM_PREINDEXED;
-                    list.concat(setoppostfix(taicpu.op_ref_regset(A_LDM,ref,regs),PF_FD));
+                    list.concat(setoppostfix(taicpu.op_ref_regset(A_LDM,ref,R_INTREGISTER,R_SUBWHOLE,regs),PF_FD));
                   end;
               end
             else
               begin
                 { restore int registers and return }
-                list.concat(taicpu.op_reg_reg(A_MOV, NR_STACK_POINTER_REG, NR_R11));
+                list.concat(taicpu.op_reg_reg(A_MOV, NR_STACK_POINTER_REG, NR_FRAME_POINTER_REG));
 
                 reference_reset(ref,4);
                 ref.index:=NR_STACK_POINTER_REG;
-                list.concat(setoppostfix(taicpu.op_ref_regset(A_LDM,ref,regs),PF_DB));
+                list.concat(setoppostfix(taicpu.op_ref_regset(A_LDM,ref,R_INTREGISTER,R_SUBWHOLE,regs),PF_DB));
               end;
           end
         else
@@ -3121,7 +3515,7 @@ unit cgcpu;
              (ref.offset>255)
             )
            ) or
-           ((op in [A_LDF,A_STF]) and
+           ((op in [A_LDF,A_STF,A_FLDS,A_FLDD,A_FSTS,A_FSTD]) and
             ((ref.offset<-1020) or
              (ref.offset>1020) or
              { the usual pc relative symbol handling assumes possible offsets of +/- 4095 }
@@ -3201,7 +3595,7 @@ unit cgcpu;
 
         { floating point operations have only limited references
           we expect here, that a base is already set }
-        if (op in [A_LDF,A_STF]) and (ref.index<>NR_NO) then
+        if (op in [A_LDF,A_STF,A_FLDS,A_FLDD,A_FSTS,A_FSTD]) and (ref.index<>NR_NO) then
           begin
             if ref.shiftmode<>SM_none then
               internalerror(200309121);

+ 41 - 5
compiler/arm/cpubase.pas

@@ -104,8 +104,11 @@ unit cpubase;
         {$i rarmdwa.inc}
       );
       { registers which may be destroyed by calls }
-      VOLATILE_INTREGISTERS = [RS_R0..RS_R3,RS_R12..RS_R15];
+      VOLATILE_INTREGISTERS = [RS_R0..RS_R3,RS_R12..RS_R14];
       VOLATILE_FPUREGISTERS = [RS_F0..RS_F3];
+      VOLATILE_MMREGISTERS =  [RS_D0..RS_D7,RS_D16..RS_D31];
+
+      VOLATILE_INTREGISTERS_DARWIN = [RS_R0..RS_R3,RS_R9,RS_R12..RS_R14];
 
     type
       totherregisterset = set of tregisterindex;
@@ -127,7 +130,11 @@ unit cpubase;
         { load/store }
         PF_B,PF_SB,PF_BT,PF_H,PF_SH,PF_T,
         { multiple load/store address modes }
-        PF_IA,PF_IB,PF_DA,PF_DB,PF_FD,PF_FA,PF_ED,PF_EA
+        PF_IA,PF_IB,PF_DA,PF_DB,PF_FD,PF_FA,PF_ED,PF_EA,
+        { multiple load/store vfp address modes }
+        PF_IAD,PF_DBD,PF_FDD,PF_EAD,
+        PF_IAS,PF_DBS,PF_FDS,PF_EAS,
+        PF_IAX,PF_DBX,PF_FDX,PF_EAX
       );
 
       TRoundingMode = (RM_None,RM_P,RM_M,RM_Z);
@@ -138,11 +145,14 @@ unit cpubase;
         PF_None,PF_None,PF_None,PF_None,PF_None,PF_None,PF_None,PF_None,PF_None,PF_None,
         PF_S,PF_D,PF_E,PF_None,PF_None);
 
-      oppostfix2str : array[TOpPostfix] of string[2] = ('',
+      oppostfix2str : array[TOpPostfix] of string[3] = ('',
         's',
         'd','e','p','ep',
         'b','sb','bt','h','sh','t',
-        'ia','ib','da','db','fd','fa','ed','ea');
+        'ia','ib','da','db','fd','fa','ed','ea',
+        'iad','dbd','fdd','ead',
+        'ias','dbs','fds','eas',
+        'iax','dbx','fdx','eax');
 
       roundingmode2str : array[TRoundingMode] of string[1] = ('',
         'p','m','z');
@@ -393,7 +403,21 @@ unit cpubase;
 
     function cgsize2subreg(regtype: tregistertype; s:Tcgsize):Tsubregister;
       begin
-        cgsize2subreg:=R_SUBWHOLE;
+        case regtype of { the subregister depends on the register type and, for mm registers, on the size }
+          R_MMREGISTER: { mm registers: the subregister follows the float size s }
+            begin
+              case s of
+                OS_F32:
+                  cgsize2subreg:=R_SUBFS;
+                OS_F64:
+                  cgsize2subreg:=R_SUBFD;
+                else
+                  internalerror(2009112701); { only OS_F32 and OS_F64 are accepted for mm registers }
+              end;
+            end;
+          else
+            cgsize2subreg:=R_SUBWHOLE; { every other register type maps to the whole register, whatever s is }
+        end;
       end;
 
 
@@ -404,6 +428,18 @@ unit cpubase;
             reg_cgsize:=OS_32;
           R_FPUREGISTER :
             reg_cgsize:=OS_F80;
+          R_MMREGISTER :
+            begin
+              case getsubreg(reg) of
+                R_SUBFD,
+                R_SUBWHOLE:
+                  result:=OS_F64;
+                R_SUBFS:
+                  result:=OS_F32;
+                else
+                  internalerror(2009112903);
+              end;
+            end;
           else
             internalerror(200303181);
           end;

+ 8 - 2
compiler/arm/cpuinfo.pas

@@ -35,6 +35,7 @@ Type
        cpu_armv3,
        cpu_armv4,
        cpu_armv5,
+       cpu_armv6,
        cpu_armv7m,
        cpu_cortexm3
       );
@@ -52,7 +53,8 @@ Type
       fpu_fpa,
       fpu_fpa10,
       fpu_fpa11,
-      fpu_vfp
+      fpu_vfpv2,
+      fpu_vfpv3
      );
 
    tcontrollertype =
@@ -101,6 +103,7 @@ Const
      'ARMV3',
      'ARMV4',
      'ARMV5',
+     'ARMV6',
      'ARMV7M',
      'CORTEXM3'
    );
@@ -111,7 +114,8 @@ Const
      'FPA',
      'FPA10',
      'FPA11',
-     'VFP'
+     'VFPV2',
+     'VFPV3'
    );
 
    controllertypestr : array[tcontrollertype] of string[20] =
@@ -138,6 +142,8 @@ Const
       'STM32F103'
      );
 
+   vfp_scalar = [fpu_vfpv2,fpu_vfpv3];
+
    { Supported optimizations, only used for information }
    supported_optimizerswitches = genericlevel1optimizerswitches+
                                  genericlevel2optimizerswitches+

+ 19 - 3
compiler/arm/cpupara.pas

@@ -36,6 +36,7 @@ unit cpupara;
        tarmparamanager = class(tparamanager)
           function get_volatile_registers_int(calloption : tproccalloption):tcpuregisterset;override;
           function get_volatile_registers_fpu(calloption : tproccalloption):tcpuregisterset;override;
+          function get_volatile_registers_mm(calloption : tproccalloption):tcpuregisterset;override;
           function push_addr_param(varspez:tvarspez;def : tdef;calloption : tproccalloption) : boolean;override;
           function ret_in_param(def : tdef;calloption : tproccalloption) : boolean;override;
           procedure getintparaloc(calloption : tproccalloption; nr : longint;var cgpara:TCGPara);override;
@@ -59,7 +60,10 @@ unit cpupara;
 
     function tarmparamanager.get_volatile_registers_int(calloption : tproccalloption):tcpuregisterset;
       begin
-        result:=VOLATILE_INTREGISTERS;
+        if (target_info.system<>system_arm_darwin) then { calloption is not consulted; only the target system selects the set }
+          result:=VOLATILE_INTREGISTERS
+        else
+          result:=VOLATILE_INTREGISTERS_DARWIN; { the Darwin set declared in cpubase additionally lists RS_R9 }
       end;
 
 
@@ -69,6 +73,12 @@ unit cpupara;
       end;
 
 
+    function tarmparamanager.get_volatile_registers_mm(calloption: tproccalloption): tcpuregisterset;
+      begin { returns the mm registers a call may destroy; calloption is ignored }
+        result:=VOLATILE_MMREGISTERS; { constant set declared in cpubase }
+      end;
+
+
     procedure tarmparamanager.getintparaloc(calloption : tproccalloption; nr : longint;var cgpara:TCGPara);
       var
         paraloc : pcgparalocation;
@@ -109,7 +119,11 @@ unit cpupara;
             orddef:
               getparaloc:=LOC_REGISTER;
             floatdef:
-              if (calloption in [pocall_cdecl,pocall_cppdecl,pocall_softfloat]) or (cs_fp_emulation in current_settings.moduleswitches) then
+              if (calloption in [pocall_cdecl,pocall_cppdecl,pocall_softfloat]) or
+                 (cs_fp_emulation in current_settings.moduleswitches) or
+                 (current_settings.fputype in [fpu_vfpv2,fpu_vfpv3]) then
+                { the ARM eabi also allows passing VFP values via VFP registers,
+                  but at least neither Mac OS X nor Linux seems to do that }
                 getparaloc:=LOC_REGISTER
               else
                 getparaloc:=LOC_FPUREGISTER;
@@ -466,7 +480,9 @@ unit cpupara;
         { Return in FPU register? }
         if def.typ=floatdef then
           begin
-            if (p.proccalloption in [pocall_softfloat]) or (cs_fp_emulation in current_settings.moduleswitches) then
+            if (p.proccalloption in [pocall_softfloat]) or
+               (cs_fp_emulation in current_settings.moduleswitches) or
+               (current_settings.fputype in [fpu_vfpv2,fpu_vfpv3]) then
               begin
                 case retcgsize of
                   OS_64,

+ 38 - 12
compiler/arm/cpupi.pas

@@ -63,7 +63,16 @@ unit cpupi;
           this extra memory should hurt less than generating all local contants with offsets
           >256 as non shifter constants }
         if tg.direction = -1 then
-          tg.setfirsttemp(-12-28)
+          begin
+            if (target_info.system<>system_arm_darwin) then
+              { Non-Darwin, worst case: r4-r10,r11,r13,r14,r15 is saved -> -28-16, but we
+                always adjust the frame pointer to point to the first stored
+                register (= last register in list above) -> + 4 }
+              tg.setfirsttemp(-28-16+4)
+            else
+              { on Darwin r9 is not usable -> one less register to save }
+              tg.setfirsttemp(-24-16+4)
+          end
         else
           tg.setfirsttemp(maxpushedparasize);
       end;
@@ -74,21 +83,38 @@ unit cpupi;
          firstfloatreg,lastfloatreg,
          r : byte;
          floatsavesize : aword;
+         regs: tcpuregisterset;
       begin
         maxpushedparasize:=align(maxpushedparasize,max(current_settings.alignment.localalignmin,4));
-        firstfloatreg:=RS_NO;
-        { save floating point registers? }
-        for r:=RS_F0 to RS_F7 do
-          if r in cg.rg[R_FPUREGISTER].used_in_proc-paramanager.get_volatile_registers_fpu(pocall_stdcall) then
+        floatsavesize:=0;
+        case current_settings.fputype of
+          fpu_fpa,
+          fpu_fpa10,
+          fpu_fpa11:
             begin
-              if firstfloatreg=RS_NO then
-                firstfloatreg:=r;
-              lastfloatreg:=r;
+              { save floating point registers? }
+              firstfloatreg:=RS_NO;
+              regs:=cg.rg[R_FPUREGISTER].used_in_proc-paramanager.get_volatile_registers_fpu(pocall_stdcall);
+              for r:=RS_F0 to RS_F7 do
+                if r in regs then
+                  begin
+                    if firstfloatreg=RS_NO then
+                      firstfloatreg:=r;
+                    lastfloatreg:=r;
+                  end;
+              if firstfloatreg<>RS_NO then
+                floatsavesize:=(lastfloatreg-firstfloatreg+1)*12;
             end;
-        if firstfloatreg<>RS_NO then
-          floatsavesize:=(lastfloatreg-firstfloatreg+1)*12
-        else
-          floatsavesize:=0;
+          fpu_vfpv2,
+          fpu_vfpv3:
+            begin
+              floatsavesize:=0;
+              regs:=cg.rg[R_MMREGISTER].used_in_proc-paramanager.get_volatile_registers_mm(pocall_stdcall);
+              for r:=RS_D0 to RS_D31 do
+                if r in regs then
+                  inc(floatsavesize,8);
+            end;
+        end;
         floatsavesize:=align(floatsavesize,max(current_settings.alignment.localalignmin,4));
         result:=Align(tg.direction*tg.lasttemp,max(current_settings.alignment.localalignmin,4))+maxpushedparasize+aint(floatsavesize);
         floatregstart:=tg.direction*result+maxpushedparasize;

+ 103 - 27
compiler/arm/narmadd.pas

@@ -32,8 +32,9 @@ interface
        tarmaddnode = class(tcgaddnode)
        private
           function  GetResFlags(unsigned:Boolean):TResFlags;
-       protected
+       public
           function pass_1 : tnode;override;
+       protected
           procedure second_addfloat;override;
           procedure second_cmpfloat;override;
           procedure second_cmpordinal;override;
@@ -123,15 +124,27 @@ interface
     procedure tarmaddnode.second_addfloat;
       var
         op : TAsmOp;
+        singleprec: boolean;
       begin
+        pass_left_right;
+        if (nf_swapped in flags) then
+          swapleftright;
+
         case current_settings.fputype of
           fpu_fpa,
           fpu_fpa10,
           fpu_fpa11:
             begin
-              pass_left_right;
-              if (nf_swapped in flags) then
-                swapleftright;
+              { force fpureg as location, left right doesn't matter
+                as both will be in a fpureg }
+              location_force_fpureg(current_asmdata.CurrAsmList,left.location,true);
+              location_force_fpureg(current_asmdata.CurrAsmList,right.location,(left.location.loc<>LOC_CFPUREGISTER));
+
+              location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
+              if left.location.loc<>LOC_CFPUREGISTER then
+                location.register:=left.location.register
+              else
+                location.register:=right.location.register;
 
               case nodetype of
                 addn :
@@ -146,22 +159,54 @@ interface
                   internalerror(200308313);
               end;
 
-              { force fpureg as location, left right doesn't matter
+              current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg_reg(op,
+                 location.register,left.location.register,right.location.register),
+                 cgsize2fpuoppostfix[def_cgsize(resultdef)]));
+            end;
+          fpu_vfpv2,
+          fpu_vfpv3:
+            begin
+              { force mmreg as location, left right doesn't matter
                 as both will be in a fpureg }
-              location_force_fpureg(current_asmdata.CurrAsmList,left.location,true);
-              location_force_fpureg(current_asmdata.CurrAsmList,right.location,(left.location.loc<>LOC_CFPUREGISTER));
+              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,true);
+              location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,true);
 
-              location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
-              if left.location.loc<>LOC_CFPUREGISTER then
+              location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
+              if left.location.loc<>LOC_CMMREGISTER then
                 location.register:=left.location.register
+              else if right.location.loc<>LOC_CMMREGISTER then
+                location.register:=right.location.register
               else
-                location.register:=right.location.register;
+                location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
 
-              current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg_reg(op,
-                 location.register,left.location.register,right.location.register),
-                 cgsize2fpuoppostfix[def_cgsize(resultdef)]));
+              singleprec:=tfloatdef(left.resultdef).floattype=s32real;
+              case nodetype of
+                addn :
+                  if singleprec then
+                    op:=A_FADDS
+                  else
+                    op:=A_FADDD;
+                muln :
+                  if singleprec then
+                    op:=A_FMULS
+                  else
+                    op:=A_FMULD;
+                subn :
+                  if singleprec then
+                    op:=A_FSUBS
+                  else
+                    op:=A_FSUBD;
+                slashn :
+                  if singleprec then
+                    op:=A_FDIVS
+                  else
+                    op:=A_FDIVD;
+                else
+                  internalerror(2009111401);
+              end;
 
-              location.loc:=LOC_FPUREGISTER;
+              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,
+                 location.register,left.location.register,right.location.register));
             end;
           fpu_soft:
             { this case should be handled already by pass1 }
@@ -173,27 +218,58 @@ interface
 
 
     procedure tarmaddnode.second_cmpfloat;
+      var
+        op: TAsmOp;
       begin
         pass_left_right;
         if (nf_swapped in flags) then
           swapleftright;
 
-        { force fpureg as location, left right doesn't matter
-          as both will be in a fpureg }
-        location_force_fpureg(current_asmdata.CurrAsmList,left.location,true);
-        location_force_fpureg(current_asmdata.CurrAsmList,right.location,true);
-
         location_reset(location,LOC_FLAGS,OS_NO);
         location.resflags:=getresflags(true);
 
-        if nodetype in [equaln,unequaln] then
-          current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_CMF,
-             left.location.register,right.location.register),
-             cgsize2fpuoppostfix[def_cgsize(resultdef)]))
-        else
-          current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_CMFE,
-             left.location.register,right.location.register),
-             cgsize2fpuoppostfix[def_cgsize(resultdef)]));
+        case current_settings.fputype of
+          fpu_fpa,
+          fpu_fpa10,
+          fpu_fpa11:
+            begin
+              { force fpureg as location, left right doesn't matter
+                as both will be in a fpureg }
+              location_force_fpureg(current_asmdata.CurrAsmList,left.location,true);
+              location_force_fpureg(current_asmdata.CurrAsmList,right.location,true);
+
+              if nodetype in [equaln,unequaln] then
+                current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_CMF,
+                   left.location.register,right.location.register),
+                   cgsize2fpuoppostfix[def_cgsize(resultdef)]))
+              else
+                current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_CMFE,
+                   left.location.register,right.location.register),
+                   cgsize2fpuoppostfix[def_cgsize(resultdef)]));
+            end;
+          fpu_vfpv2,
+          fpu_vfpv3:
+            begin
+              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,true);
+              location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,true);
+
+              if (tfloatdef(left.resultdef).floattype=s32real) then
+                if nodetype in [equaln,unequaln] then
+                  op:=A_FCMPS
+                 else
+                   op:=A_FCMPES
+              else if nodetype in [equaln,unequaln] then
+                op:=A_FCMPD
+              else
+                op:=A_FCMPED;
+              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,
+                left.location.register,right.location.register));
+              current_asmdata.CurrAsmList.concat(taicpu.op_none(A_FMSTAT));
+            end;
+          fpu_soft:
+            { this case should be handled already by pass1 }
+            internalerror(2009112404);
+        end;
 
         location_reset(location,LOC_FLAGS,OS_NO);
         location.resflags:=getresflags(false);

+ 82 - 44
compiler/arm/narmcnv.pas

@@ -64,7 +64,7 @@ implementation
       pass_1,pass_2,procinfo,
       ncon,ncal,
       ncgutil,
-      cpubase,aasmcpu,
+      cpubase,cpuinfo,aasmcpu,
       rgobj,tgobj,cgobj,cgcpu;
 
 
@@ -95,6 +95,8 @@ implementation
                 result := ccallnode.createintern(fname,ccallparanode.create(
                   left,nil));
                 left:=nil;
+                if (tfloatdef(resultdef).floattype=s32real) then
+                  inserttypeconv(result,s32floattype);
                 firstpass(result);
                 exit;
               end
@@ -108,68 +110,104 @@ implementation
                 firstpass(left);
               end;
             result := nil;
-            expectloc:=LOC_FPUREGISTER;
+            case current_settings.fputype of
+              fpu_fpa,
+              fpu_fpa10,
+              fpu_fpa11:
+                expectloc:=LOC_FPUREGISTER;
+              fpu_vfpv2,
+              fpu_vfpv3:
+                expectloc:=LOC_MMREGISTER;
+              else
+                internalerror(2009112702);
+            end;
           end;
       end;
 
 
     procedure tarmtypeconvnode.second_int_to_real;
+      const
+        signedprec2vfpop: array[boolean,OS_F32..OS_F64] of tasmop =
+          ((A_FUITOS,A_FUITOD),
+           (A_FSITOS,A_FSITOD)); { indexed [signed,result size]: row false is used for a non-signed source, row true for a signed one }
       var
         instr : taicpu;
         href : treference;
         l1,l2 : tasmlabel;
         hregister : tregister;
+        signed : boolean;
       begin
-
-        { convert first to double to avoid precision loss }
-        location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
-        location_force_reg(current_asmdata.CurrAsmList,left.location,OS_32,true);
-        location.register:=cg.getfpuregister(current_asmdata.CurrAsmList,location.size);
-        instr:=taicpu.op_reg_reg(A_FLT,location.register,left.location.register);
-        if is_signed(left.resultdef) then
-          begin
-            instr.oppostfix:=cgsize2fpuoppostfix[def_cgsize(resultdef)];
-            current_asmdata.CurrAsmList.concat(instr);
-          end
-        else
-          begin
-            { flt does a signed load, fix this }
-            case tfloatdef(resultdef).floattype of
-              s32real,
-              s64real:
+        case current_settings.fputype of { FPA and VFP use different conversion sequences; other fpu types emit nothing here }
+          fpu_fpa,
+          fpu_fpa10,
+          fpu_fpa11:
+            begin
+              { convert first to double to avoid precision loss }
+              location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
+              location_force_reg(current_asmdata.CurrAsmList,left.location,OS_32,true);
+              location.register:=cg.getfpuregister(current_asmdata.CurrAsmList,location.size);
+              instr:=taicpu.op_reg_reg(A_FLT,location.register,left.location.register);
+              if is_signed(left.resultdef) then
                 begin
-                  { converting dword to s64real first and cut off at the end avoids precision loss }
-                  instr.oppostfix:=PF_D;
+                  instr.oppostfix:=cgsize2fpuoppostfix[def_cgsize(resultdef)];
                   current_asmdata.CurrAsmList.concat(instr);
+                end
+              else
+                begin
+                  { flt does a signed load, fix this }
+                  case tfloatdef(resultdef).floattype of
+                    s32real,
+                    s64real:
+                      begin
+                        { converting dword to s64real first and cut off at the end avoids precision loss }
+                        instr.oppostfix:=PF_D;
+                        current_asmdata.CurrAsmList.concat(instr);
 
-                  current_asmdata.getdatalabel(l1);
-                  current_asmdata.getjumplabel(l2);
-                  reference_reset_symbol(href,l1,0,const_align(8));
+                        current_asmdata.getdatalabel(l1);
+                        current_asmdata.getjumplabel(l2);
+                        reference_reset_symbol(href,l1,0,const_align(8));
 
-                  current_asmdata.CurrAsmList.concat(Taicpu.op_reg_const(A_CMP,left.location.register,0));
-                  cg.a_jmp_flags(current_asmdata.CurrAsmList,F_GE,l2);
+                        current_asmdata.CurrAsmList.concat(Taicpu.op_reg_const(A_CMP,left.location.register,0));
+                        cg.a_jmp_flags(current_asmdata.CurrAsmList,F_GE,l2);
 
-                  hregister:=cg.getfpuregister(current_asmdata.CurrAsmList,OS_F64);
-                  current_asmdata.asmlists[al_typedconsts].concat(tai_align.create(const_align(8)));
-                  current_asmdata.asmlists[al_typedconsts].concat(Tai_label.Create(l1));
-                  { I got this constant from a test program (FK) }
-                  current_asmdata.asmlists[al_typedconsts].concat(Tai_const.Create_32bit($41f00000));
-                  current_asmdata.asmlists[al_typedconsts].concat(Tai_const.Create_32bit(0));
+                        hregister:=cg.getfpuregister(current_asmdata.CurrAsmList,OS_F64);
+                        current_asmdata.asmlists[al_typedconsts].concat(tai_align.create(const_align(8)));
+                        current_asmdata.asmlists[al_typedconsts].concat(Tai_label.Create(l1));
+                        { I got this constant from a test program (FK) }
+                        current_asmdata.asmlists[al_typedconsts].concat(Tai_const.Create_32bit($41f00000));
+                        current_asmdata.asmlists[al_typedconsts].concat(Tai_const.Create_32bit(0));
 
-                  cg.a_loadfpu_ref_reg(current_asmdata.CurrAsmList,OS_F64,OS_F64,href,hregister);
-                  current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg_reg(A_ADF,location.register,hregister,location.register),PF_D));
-                  cg.a_label(current_asmdata.CurrAsmList,l2);
+                        cg.a_loadfpu_ref_reg(current_asmdata.CurrAsmList,OS_F64,OS_F64,href,hregister);
+                        current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg_reg(A_ADF,location.register,hregister,location.register),PF_D));
+                        cg.a_label(current_asmdata.CurrAsmList,l2);
 
-                  { cut off if we should convert to single }
-                  if tfloatdef(resultdef).floattype=s32real then
-                    begin
-                      hregister:=location.register;
-                      location.register:=cg.getfpuregister(current_asmdata.CurrAsmList,location.size);
-                      current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_MVF,location.register,hregister),PF_S));
-                    end;
-                end;
+                        { cut off if we should convert to single }
+                        if tfloatdef(resultdef).floattype=s32real then
+                          begin
+                            hregister:=location.register;
+                            location.register:=cg.getfpuregister(current_asmdata.CurrAsmList,location.size);
+                            current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_MVF,location.register,hregister),PF_S));
+                          end;
+                      end;
+                    else
+                      internalerror(200410031);
+                  end;
+              end;
+            end;
+          fpu_vfpv2,
+          fpu_vfpv3: { VFP: the integer is moved into an mm register and converted there by a single instruction }
+            begin
+              location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
+              signed:=left.location.size=OS_S32; { sampled before left is forced into an mm register; the check below expects its size to be OS_F32 afterwards }
+              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
+              if (left.location.size<>OS_F32) then
+                internalerror(2009112703); { after forcing, the source must be reported as OS_F32 }
+              if left.location.size<>location.size then { a result of a different size gets its own register, otherwise the source register is reused }
+                location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size)
               else
-                internalerror(200410031);
+                location.register:=left.location.register;
+              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(
+                signedprec2vfpop[signed,location.size],location.register,left.location.register)); { opcode picked from the table by signedness and result size }
             end;
         end;
       end;

+ 128 - 20
compiler/arm/narminl.pas

@@ -50,7 +50,7 @@ interface
         }
         procedure second_prefetch; override;
       private
-        procedure load_fpu_location;
+        procedure load_fpu_location(out singleprec: boolean);
       end;
 
 
@@ -72,26 +72,57 @@ implementation
                               tarminlinenode
 *****************************************************************************}
 
-    procedure tarminlinenode.load_fpu_location;
+    procedure tarminlinenode.load_fpu_location(out singleprec: boolean);
       begin
         secondpass(left);
-        location_force_fpureg(current_asmdata.CurrAsmList,left.location,true);
-        location_copy(location,left.location);
-        if left.location.loc=LOC_CFPUREGISTER then
-          begin
-           location.register:=cg.getfpuregister(current_asmdata.CurrAsmList,location.size);
-           location.loc := LOC_FPUREGISTER;
-         end;
+        case current_settings.fputype of { force left into the register class of the selected fpu and derive the result location from it }
+          fpu_fpa,
+          fpu_fpa10,
+          fpu_fpa11:
+            begin
+              location_force_fpureg(current_asmdata.CurrAsmList,left.location,true);
+              location_copy(location,left.location);
+              if left.location.loc=LOC_CFPUREGISTER then { left sits in a LOC_CFPUREGISTER: give the result a newly allocated register }
+                begin
+                 location.register:=cg.getfpuregister(current_asmdata.CurrAsmList,location.size);
+                 location.loc := LOC_FPUREGISTER;
+               end;
+            end;
+          fpu_vfpv2,
+          fpu_vfpv3:
+            begin
+              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,true);
+              location_copy(location,left.location);
+              if left.location.loc=LOC_CMMREGISTER then { left sits in a LOC_CMMREGISTER: give the result a newly allocated register }
+                begin
+                 location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
+                 location.loc := LOC_MMREGISTER;
+               end;
+            end;
+          else
+            internalerror(2009111801); { any other fpu type is rejected here }
+        end;
+        singleprec:=tfloatdef(left.resultdef).floattype=s32real; { true when the operand type is s32real; the callers use it to pick the VFP opcode }
       end;
 
 
     function tarminlinenode.first_abs_real : tnode;
       begin
-        if cs_fp_emulation in current_settings.moduleswitches then
+        if (cs_fp_emulation in current_settings.moduleswitches) then
           result:=inherited first_abs_real
         else
           begin
-            expectloc:=LOC_FPUREGISTER;
+            case current_settings.fputype of { the expected result location follows the selected fpu type }
+              fpu_fpa,
+              fpu_fpa10,
+              fpu_fpa11:
+                expectloc:=LOC_FPUREGISTER;
+              fpu_vfpv2,
+              fpu_vfpv3:
+                expectloc:=LOC_MMREGISTER;
+              else
+                internalerror(2009112401); { reached for any other fpu type when fp emulation is off }
+            end;
             first_abs_real:=nil;
           end;
       end;
@@ -99,11 +130,21 @@ implementation
 
     function tarminlinenode.first_sqr_real : tnode;
       begin
-        if cs_fp_emulation in current_settings.moduleswitches then
+        if (cs_fp_emulation in current_settings.moduleswitches) then
           result:=inherited first_sqr_real
         else
           begin
-            expectloc:=LOC_FPUREGISTER;
+            case current_settings.fputype of { the expected result location follows the selected fpu type }
+              fpu_fpa,
+              fpu_fpa10,
+              fpu_fpa11:
+                expectloc:=LOC_FPUREGISTER;
+              fpu_vfpv2,
+              fpu_vfpv3:
+                expectloc:=LOC_MMREGISTER;
+              else
+                internalerror(2009112402); { reached for any other fpu type when fp emulation is off }
+            end;
             first_sqr_real:=nil;
           end;
       end;
           result:=inherited first_sqrt_real
         else
           begin
-            expectloc:=LOC_FPUREGISTER;
+            case current_settings.fputype of
+              fpu_fpa,
+              fpu_fpa10,
+              fpu_fpa11:
+                expectloc:=LOC_FPUREGISTER;
+              fpu_vfpv2,
+              fpu_vfpv3:
+                expectloc:=LOC_MMREGISTER;
+              else
+                internalerror(2009112403);
+            end;
             first_sqrt_real := nil;
           end;
       end;
@@ -151,23 +202,80 @@ implementation
 
 
     procedure tarminlinenode.second_abs_real;
+      var
+        singleprec: boolean;
+        op: TAsmOp;
       begin
-        load_fpu_location;
-        current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_ABS,location.register,left.location.register),get_fpu_postfix(resultdef)));
+        load_fpu_location(singleprec); { evaluates left, sets up location and reports whether the operand is s32real }
+        case current_settings.fputype of
+          fpu_fpa,
+          fpu_fpa10,
+          fpu_fpa11:
+            current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_ABS,location.register,left.location.register),get_fpu_postfix(resultdef)));
+          fpu_vfpv2,
+          fpu_vfpv3:
+            begin
+              if singleprec then { VFP: the precision is part of the opcode, so no postfix is attached }
+                op:=A_FABSS
+              else
+                op:=A_FABSD;
+              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,location.register,left.location.register));
+            end;
+        else
+          internalerror(2009111402); { any other fpu type is rejected here }
+        end;
       end;
 
 
     procedure tarminlinenode.second_sqr_real;
+      var
+        singleprec: boolean;
+        op: TAsmOp;
       begin
-        load_fpu_location;
-        current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg_reg(A_MUF,location.register,left.location.register,left.location.register),get_fpu_postfix(resultdef)));
+        load_fpu_location(singleprec); { evaluates left, sets up location and reports whether the operand is s32real }
+        case current_settings.fputype of
+          fpu_fpa,
+          fpu_fpa10,
+          fpu_fpa11:
+            current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg_reg(A_MUF,location.register,left.location.register,left.location.register),get_fpu_postfix(resultdef)));
+          fpu_vfpv2,
+          fpu_vfpv3:
+            begin
+              if singleprec then { VFP: the precision is part of the opcode, so no postfix is attached }
+                op:=A_FMULS
+              else
+                op:=A_FMULD;
+              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,location.register,left.location.register,left.location.register)); { the square is emitted as left multiplied by left }
+            end;
+        else
+          internalerror(2009111403); { any other fpu type is rejected here }
+        end;
       end;
 
 
     procedure tarminlinenode.second_sqrt_real;
+      var
+        singleprec: boolean;
+        op: TAsmOp;
       begin
-        load_fpu_location;
-        current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_SQT,location.register,left.location.register),get_fpu_postfix(resultdef)));
+        load_fpu_location(singleprec); { evaluates left, sets up location and reports whether the operand is s32real }
+        case current_settings.fputype of
+          fpu_fpa,
+          fpu_fpa10,
+          fpu_fpa11:
+            current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_SQT,location.register,left.location.register),get_fpu_postfix(resultdef)));
+          fpu_vfpv2,
+          fpu_vfpv3:
+            begin
+              if singleprec then { VFP: the precision is part of the opcode, so no postfix is attached }
+                op:=A_FSQRTS
+              else
+                op:=A_FSQRTD;
+              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,location.register,left.location.register));
+            end;
+        else
+          internalerror(2009111404); { any other fpu type is rejected here; the id is distinct from the abs/sqr handlers so the failing site can be identified }
+        end;
       end;
 
 

+ 31 - 7
compiler/arm/narmmat.pas

@@ -53,7 +53,7 @@ implementation
       cgbase,cgobj,cgutils,
       pass_2,procinfo,
       ncon,
-      cpubase,
+      cpubase,cpuinfo,
       ncgutil,cgcpu;
 
 {*****************************************************************************
@@ -257,14 +257,38 @@ implementation
 *****************************************************************************}
 
     procedure tarmunaryminusnode.second_float; { generates the floating point negation of left, using FPA or VFP instructions depending on current_settings.fputype }
+      var
+        op: tasmop;
       begin
         secondpass(left);
-        location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
-        location_force_fpureg(current_asmdata.CurrAsmList,left.location,false);
-        location:=left.location;
-        current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg_const(A_RSF,
-          location.register,left.location.register,0),
-          cgsize2fpuoppostfix[def_cgsize(resultdef)]));
+        case current_settings.fputype of
+          fpu_fpa,
+          fpu_fpa10,
+          fpu_fpa11:
+            begin
+              location_force_fpureg(current_asmdata.CurrAsmList,left.location,false);
+              location:=left.location; { the result reuses the register that holds left }
+              current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg_const(A_RSF,
+                location.register,left.location.register,0),
+                cgsize2fpuoppostfix[def_cgsize(resultdef)])); { FPA: RSF against the constant 0, postfix chosen from the result size }
+            end;
+          fpu_vfpv2,
+          fpu_vfpv3:
+            begin
+              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,true);
+              location:=left.location;
+              if (left.location.loc=LOC_CMMREGISTER) then { left is still a LOC_CMMREGISTER: write the result to a freshly allocated mm register instead of left's own }
+                location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
+              if (location.size=OS_F32) then
+                op:=A_FNEGS
+              else
+                op:=A_FNEGD; { every size other than OS_F32 uses the double precision opcode }
+              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,
+                location.register,left.location.register));
+            end;
+          else
+            internalerror(2009112602); { every other fputype is rejected here }
+        end;
       end;
 
 

+ 28 - 3
compiler/arm/raarmgas.pas

@@ -674,6 +674,8 @@ Unit raarmgas;
       var
         tempreg : tregister;
         ireg : tsuperregister;
+        regtype: tregistertype;
+        subreg: tsubregister;
         hl : tasmlabel;
         {ofs : longint;}
         registerset : tcpuregisterset;
@@ -822,7 +824,7 @@ Unit raarmgas;
                   oper.opr.typ:=OPR_REGISTER;
                   oper.opr.reg:=tempreg;
                 end
-              else if (actasmtoken=AS_NOT) and (actopcode in [A_LDM,A_STM]) then
+              else if (actasmtoken=AS_NOT) and (actopcode in [A_LDM,A_STM,A_FLDM,A_FSTM]) then
                 begin
                   consume(AS_NOT);
                   oper.opr.typ:=OPR_REFERENCE;
@@ -838,11 +840,24 @@ Unit raarmgas;
             begin
               consume(AS_LSBRACKET);
               registerset:=[];
+              regtype:=R_INVALIDREGISTER;
+              subreg:=R_SUBNONE;
               while true do
                 begin
                   if actasmtoken=AS_REGISTER then
                     begin
                       include(registerset,getsupreg(actasmregister));
+                      if regtype<>R_INVALIDREGISTER then
+                        begin
+                          if (getregtype(actasmregister)<>regtype) or
+                             (getsubreg(actasmregister)<>subreg) then
+                            Message(asmr_e_mixing_regtypes);
+                        end
+                      else
+                        begin
+                          regtype:=getregtype(actasmregister);
+                          subreg:=getsubreg(actasmregister);
+                        end;
                       tempreg:=actasmregister;
                       consume(AS_REGISTER);
                       if actasmtoken=AS_MINUS then
@@ -862,7 +877,11 @@ Unit raarmgas;
                 end;
               consume(AS_RSBRACKET);
               oper.opr.typ:=OPR_REGSET;
+              oper.opr.regtype:=regtype;
+              oper.opr.subreg:=subreg;
               oper.opr.regset:=registerset;
+              if (registerset=[]) then
+                Message(asmr_e_empty_regset);
             end;
           AS_end,
           AS_SEPARATOR,
@@ -947,12 +966,18 @@ Unit raarmgas;
 
       const
         { sorted by length so longer postfixes will match first }
-        postfix2strsorted : array[1..19] of string[2] = (
+        postfix2strsorted : array[1..31] of string[3] = (
+          'IAD','DBD','FDD','EAD',
+          'IAS','DBS','FDS','EAS',
+          'IAX','DBX','FDX','EAX',
           'EP','SB','BT','SH',
           'IA','IB','DA','DB','FD','FA','ED','EA',
           'B','D','E','P','T','H','S');
 
-        postfixsorted : array[1..19] of TOpPostfix = (
+        postfixsorted : array[1..31] of TOpPostfix = (
+          PF_IAD,PF_DBD,PF_FDD,PF_EAD,
+          PF_IAS,PF_DBS,PF_FDS,PF_EAS,
+          PF_IAX,PF_DBX,PF_FDX,PF_EAX,
           PF_EP,PF_SB,PF_BT,PF_SH,
           PF_IA,PF_IB,PF_DA,PF_DB,PF_FD,PF_FA,PF_ED,PF_EA,
           PF_B,PF_D,PF_E,PF_P,PF_T,PF_H,PF_S);

+ 66 - 49
compiler/arm/rarmcon.inc

@@ -24,52 +24,69 @@ NR_F4 = tregister($02000004);
 NR_F5 = tregister($02000005);
 NR_F6 = tregister($02000006);
 NR_F7 = tregister($02000007);
-NR_S0 = tregister($03000000);
-NR_S1 = tregister($03000000);
-NR_D0 = tregister($03000000);
-NR_S2 = tregister($03000000);
-NR_S3 = tregister($03000000);
-NR_D1 = tregister($03000000);
-NR_S4 = tregister($03000000);
-NR_S5 = tregister($03000000);
-NR_D2 = tregister($03000000);
-NR_S6 = tregister($03000000);
-NR_S7 = tregister($03000000);
-NR_D3 = tregister($03000000);
-NR_S8 = tregister($03000000);
-NR_S9 = tregister($03000000);
-NR_D4 = tregister($03000000);
-NR_S10 = tregister($03000000);
-NR_S11 = tregister($03000000);
-NR_D5 = tregister($03000000);
-NR_S12 = tregister($03000000);
-NR_S13 = tregister($03000000);
-NR_D6 = tregister($03000000);
-NR_S14 = tregister($03000000);
-NR_S15 = tregister($03000000);
-NR_D7 = tregister($03000000);
-NR_S16 = tregister($03000000);
-NR_S17 = tregister($03000000);
-NR_D8 = tregister($03000000);
-NR_S18 = tregister($03000000);
-NR_S19 = tregister($03000000);
-NR_D9 = tregister($03000000);
-NR_S20 = tregister($03000000);
-NR_S21 = tregister($03000000);
-NR_D10 = tregister($03000000);
-NR_S22 = tregister($03000000);
-NR_S23 = tregister($03000000);
-NR_D11 = tregister($03000000);
-NR_S24 = tregister($03000000);
-NR_S25 = tregister($03000000);
-NR_D12 = tregister($03000000);
-NR_S26 = tregister($03000000);
-NR_S27 = tregister($03000000);
-NR_D13 = tregister($03000000);
-NR_S28 = tregister($03000000);
-NR_S29 = tregister($03000000);
-NR_D14 = tregister($03000000);
-NR_S30 = tregister($03000000);
-NR_S31 = tregister($03000000);
-NR_D15 = tregister($03000000);
-NR_CPSR_C = tregister($04000000);
+NR_S0 = tregister($04060000);
+NR_S1 = tregister($04060000);
+NR_D0 = tregister($04070000);
+NR_S2 = tregister($04060001);
+NR_S3 = tregister($04060001);
+NR_D1 = tregister($04070001);
+NR_S4 = tregister($04060002);
+NR_S5 = tregister($04060002);
+NR_D2 = tregister($04070002);
+NR_S6 = tregister($04060003);
+NR_S7 = tregister($04060003);
+NR_D3 = tregister($04070003);
+NR_S8 = tregister($04060004);
+NR_S9 = tregister($04060004);
+NR_D4 = tregister($04070004);
+NR_S10 = tregister($04060005);
+NR_S11 = tregister($04060005);
+NR_D5 = tregister($04070005);
+NR_S12 = tregister($04060006);
+NR_S13 = tregister($04060006);
+NR_D6 = tregister($04070006);
+NR_S14 = tregister($04060007);
+NR_S15 = tregister($04060007);
+NR_D7 = tregister($04070007);
+NR_S16 = tregister($04060008);
+NR_S17 = tregister($04060008);
+NR_D8 = tregister($04070008);
+NR_S18 = tregister($04060009);
+NR_S19 = tregister($04060009);
+NR_D9 = tregister($04070009);
+NR_S20 = tregister($0406000A);
+NR_S21 = tregister($0406000A);
+NR_D10 = tregister($0407000A);
+NR_S22 = tregister($0406000B);
+NR_S23 = tregister($0406000B);
+NR_D11 = tregister($0407000B);
+NR_S24 = tregister($0406000C);
+NR_S25 = tregister($0406000C);
+NR_D12 = tregister($0407000C);
+NR_S26 = tregister($0406000D);
+NR_S27 = tregister($0406000D);
+NR_D13 = tregister($0407000D);
+NR_S28 = tregister($0406000E);
+NR_S29 = tregister($0406000E);
+NR_D14 = tregister($0407000E);
+NR_S30 = tregister($0406000F);
+NR_S31 = tregister($0406000F);
+NR_D15 = tregister($0407000F);
+NR_D16 = tregister($04070010);
+NR_D17 = tregister($04070011);
+NR_D18 = tregister($04070012);
+NR_D19 = tregister($04070013);
+NR_D20 = tregister($04070014);
+NR_D21 = tregister($04070015);
+NR_D22 = tregister($04070016);
+NR_D23 = tregister($04070017);
+NR_D24 = tregister($04070018);
+NR_D25 = tregister($04070019);
+NR_D26 = tregister($0407001A);
+NR_D27 = tregister($0407001B);
+NR_D28 = tregister($0407001C);
+NR_D29 = tregister($0407001D);
+NR_D30 = tregister($0407001E);
+NR_D31 = tregister($0407001F);
+NR_CPSR_C = tregister($05000000);
+NR_FPSCR = tregister($05000001);

+ 17 - 0
compiler/arm/rarmdwa.inc

@@ -72,4 +72,21 @@
 0,
 0,
 0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
 0

+ 1 - 1
compiler/arm/rarmnor.inc

@@ -1,2 +1,2 @@
 { don't edit, this file is generated from armreg.dat }
-74
+91

+ 66 - 49
compiler/arm/rarmnum.inc

@@ -24,52 +24,69 @@ tregister($02000004),
 tregister($02000005),
 tregister($02000006),
 tregister($02000007),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($03000000),
-tregister($04000000)
+tregister($04060000),
+tregister($04060000),
+tregister($04070000),
+tregister($04060001),
+tregister($04060001),
+tregister($04070001),
+tregister($04060002),
+tregister($04060002),
+tregister($04070002),
+tregister($04060003),
+tregister($04060003),
+tregister($04070003),
+tregister($04060004),
+tregister($04060004),
+tregister($04070004),
+tregister($04060005),
+tregister($04060005),
+tregister($04070005),
+tregister($04060006),
+tregister($04060006),
+tregister($04070006),
+tregister($04060007),
+tregister($04060007),
+tregister($04070007),
+tregister($04060008),
+tregister($04060008),
+tregister($04070008),
+tregister($04060009),
+tregister($04060009),
+tregister($04070009),
+tregister($0406000A),
+tregister($0406000A),
+tregister($0407000A),
+tregister($0406000B),
+tregister($0406000B),
+tregister($0407000B),
+tregister($0406000C),
+tregister($0406000C),
+tregister($0407000C),
+tregister($0406000D),
+tregister($0406000D),
+tregister($0407000D),
+tregister($0406000E),
+tregister($0406000E),
+tregister($0407000E),
+tregister($0406000F),
+tregister($0406000F),
+tregister($0407000F),
+tregister($04070010),
+tregister($04070011),
+tregister($04070012),
+tregister($04070013),
+tregister($04070014),
+tregister($04070015),
+tregister($04070016),
+tregister($04070017),
+tregister($04070018),
+tregister($04070019),
+tregister($0407001A),
+tregister($0407001B),
+tregister($0407001C),
+tregister($0407001D),
+tregister($0407001E),
+tregister($0407001F),
+tregister($05000000),
+tregister($05000001)

+ 41 - 24
compiler/arm/rarmrni.inc

@@ -26,50 +26,67 @@
 24,
 25,
 26,
-27,
-28,
 29,
-30,
+28,
 31,
 32,
-33,
-34,
 35,
-36,
+34,
 37,
 38,
-39,
-40,
 41,
-42,
+40,
 43,
 44,
-45,
-46,
 47,
-48,
+46,
 49,
 50,
-51,
-52,
 53,
-54,
+52,
 55,
 56,
-57,
-58,
 59,
-60,
+58,
 61,
 62,
-63,
-64,
 65,
-66,
+64,
 67,
 68,
-69,
-70,
 71,
+70,
+27,
+30,
+33,
+36,
+39,
+42,
+45,
+48,
+51,
+54,
+57,
+60,
+63,
+66,
+69,
 72,
-73
+73,
+74,
+75,
+76,
+77,
+78,
+79,
+80,
+81,
+82,
+83,
+84,
+85,
+86,
+87,
+88,
+89,
+90

+ 18 - 1
compiler/arm/rarmsri.inc

@@ -1,6 +1,6 @@
 { don't edit, this file is generated from armreg.dat }
 0,
-73,
+89,
 27,
 30,
 57,
@@ -9,8 +9,24 @@
 66,
 69,
 72,
+73,
+74,
+75,
+76,
 33,
+77,
+78,
+79,
+80,
+81,
+82,
+83,
+84,
+85,
+86,
 36,
+87,
+88,
 39,
 42,
 45,
@@ -25,6 +41,7 @@
 22,
 23,
 24,
+90,
 1,
 2,
 11,

+ 17 - 0
compiler/arm/rarmsta.inc

@@ -72,4 +72,21 @@
 0,
 0,
 0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
+0,
 0

+ 18 - 1
compiler/arm/rarmstd.inc

@@ -72,4 +72,21 @@
 's20',
 's21',
 'd15',
-'cpsr_c'
+'d16',
+'d17',
+'d18',
+'d19',
+'d20',
+'d21',
+'d22',
+'d23',
+'d24',
+'d25',
+'d26',
+'d27',
+'d28',
+'d29',
+'d30',
+'d31',
+'cpsr_c',
+'fpscr'

+ 62 - 45
compiler/arm/rarmsup.inc

@@ -27,49 +27,66 @@ RS_F7 = $07;
 RS_S0 = $00;
 RS_S1 = $00;
 RS_D0 = $00;
-RS_S2 = $00;
-RS_S3 = $00;
-RS_D1 = $00;
-RS_S4 = $00;
-RS_S5 = $00;
-RS_D2 = $00;
-RS_S6 = $00;
-RS_S7 = $00;
-RS_D3 = $00;
-RS_S8 = $00;
-RS_S9 = $00;
-RS_D4 = $00;
-RS_S10 = $00;
-RS_S11 = $00;
-RS_D5 = $00;
-RS_S12 = $00;
-RS_S13 = $00;
-RS_D6 = $00;
-RS_S14 = $00;
-RS_S15 = $00;
-RS_D7 = $00;
-RS_S16 = $00;
-RS_S17 = $00;
-RS_D8 = $00;
-RS_S18 = $00;
-RS_S19 = $00;
-RS_D9 = $00;
-RS_S20 = $00;
-RS_S21 = $00;
-RS_D10 = $00;
-RS_S22 = $00;
-RS_S23 = $00;
-RS_D11 = $00;
-RS_S24 = $00;
-RS_S25 = $00;
-RS_D12 = $00;
-RS_S26 = $00;
-RS_S27 = $00;
-RS_D13 = $00;
-RS_S28 = $00;
-RS_S29 = $00;
-RS_D14 = $00;
-RS_S30 = $00;
-RS_S31 = $00;
-RS_D15 = $00;
+RS_S2 = $01;
+RS_S3 = $01;
+RS_D1 = $01;
+RS_S4 = $02;
+RS_S5 = $02;
+RS_D2 = $02;
+RS_S6 = $03;
+RS_S7 = $03;
+RS_D3 = $03;
+RS_S8 = $04;
+RS_S9 = $04;
+RS_D4 = $04;
+RS_S10 = $05;
+RS_S11 = $05;
+RS_D5 = $05;
+RS_S12 = $06;
+RS_S13 = $06;
+RS_D6 = $06;
+RS_S14 = $07;
+RS_S15 = $07;
+RS_D7 = $07;
+RS_S16 = $08;
+RS_S17 = $08;
+RS_D8 = $08;
+RS_S18 = $09;
+RS_S19 = $09;
+RS_D9 = $09;
+RS_S20 = $0A;
+RS_S21 = $0A;
+RS_D10 = $0A;
+RS_S22 = $0B;
+RS_S23 = $0B;
+RS_D11 = $0B;
+RS_S24 = $0C;
+RS_S25 = $0C;
+RS_D12 = $0C;
+RS_S26 = $0D;
+RS_S27 = $0D;
+RS_D13 = $0D;
+RS_S28 = $0E;
+RS_S29 = $0E;
+RS_D14 = $0E;
+RS_S30 = $0F;
+RS_S31 = $0F;
+RS_D15 = $0F;
+RS_D16 = $10;
+RS_D17 = $11;
+RS_D18 = $12;
+RS_D19 = $13;
+RS_D20 = $14;
+RS_D21 = $15;
+RS_D22 = $16;
+RS_D23 = $17;
+RS_D24 = $18;
+RS_D25 = $19;
+RS_D26 = $1A;
+RS_D27 = $1B;
+RS_D28 = $1C;
+RS_D29 = $1D;
+RS_D30 = $1E;
+RS_D31 = $1F;
 RS_CPSR_C = $00;
+RS_FPSCR = $01;

+ 28 - 0
compiler/arm/rgcpu.pas

@@ -37,6 +37,8 @@ unit rgcpu;
        trgcpu = class(trgobj)
          procedure do_spill_read(list:TAsmList;pos:tai;const spilltemp:treference;tempreg:tregister);override;
          procedure do_spill_written(list:TAsmList;pos:tai;const spilltemp:treference;tempreg:tregister);override;
+         procedure add_constraints(reg:tregister);override;
+         function  get_spill_subreg(r:tregister) : tsubregister;override;
        end;
 
        trgcputhumb2 = class(trgobj)
@@ -162,6 +164,32 @@ unit rgcpu;
       end;
 
 
+    procedure trgcpu.add_constraints(reg:tregister);
+      var
+        sreg,dreg : Tsuperregister;
+      begin
+        { A single precision (32 bit) float must not be placed in one of
+          the double precision registers above d15, since those have no
+          32 bit equivalents: let reg conflict with every register in
+          the range d16..d31 }
+        if getsubreg(reg)=R_SUBFS then
+          begin
+            sreg:=getsupreg(reg);
+            for dreg:=RS_D16 to RS_D31 do
+              add_edge(sreg,dreg);
+          end;
+      end;
+
+
+    function  trgcpu.get_spill_subreg(r:tregister) : tsubregister;
+      begin
+        { only mm registers keep their own subregister when spilled }
+        result:=defaultsub;
+        if (getregtype(r)=R_MMREGISTER) then
+          result:=getsubreg(r);
+      end;
+
+
     procedure trgcputhumb2.do_spill_read(list:TAsmList;pos:tai;const spilltemp:treference;tempreg:tregister);
       var
         tmpref : treference;

+ 4 - 1
compiler/assemble.pas

@@ -509,7 +509,10 @@ Implementation
         else
           result:='-m68000 '+result;
 {$endif}
-
+{$ifdef arm}
+        if (target_info.system=system_arm_darwin) then
+          Replace(result,'$ARCH',lower(cputypestr[current_settings.cputype]));
+{$endif arm}
         if (cs_link_on_target in current_settings.globalswitches) then
          begin
            Replace(result,'$ASM',maybequoted(ScriptFixFileName(AsmFileName)));

+ 32 - 1
compiler/cg64f32.pas

@@ -81,6 +81,8 @@ unit cg64f32;
         procedure a_param64_ref(list : TAsmList;const r : treference;const paraloc : tcgpara);override;
         procedure a_param64_loc(list : TAsmList;const l : tlocation;const paraloc : tcgpara);override;
 
+        procedure a_loadmm_intreg64_reg(list: TAsmList; mmsize: tcgsize; intreg: tregister64; mmreg: tregister);override;
+        procedure a_loadmm_reg_intreg64(list: TAsmList; mmsize: tcgsize; mmreg: tregister; intreg: tregister64);override;
         {# This routine tries to optimize the a_op64_const_reg operation, by
            removing superfluous opcodes. Returns TRUE if normal processing
            must continue in op64_const_reg, otherwise, everything is processed
@@ -99,7 +101,8 @@ unit cg64f32;
     uses
        globtype,systems,constexp,
        verbose,cutils,
-       symbase,symconst,symdef,symtable,defutil,paramgr;
+       symbase,symconst,symdef,symtable,defutil,paramgr,
+       tgobj;
 
 {****************************************************************************
                                      Helpers
@@ -460,6 +463,8 @@ unit cg64f32;
             a_load64_reg_reg(list,reg,l.register64);
           LOC_SUBSETREF, LOC_CSUBSETREF:
             a_load64_reg_subsetref(list,reg,l.sref);
+          LOC_MMREGISTER, LOC_CMMREGISTER:
+            a_loadmm_intreg64_reg(list,l.size,reg,l.register);
           else
             internalerror(200112293);
         end;
@@ -708,6 +713,32 @@ unit cg64f32;
       end;
 
 
+    procedure tcg64f32.a_loadmm_intreg64_reg(list: TAsmList; mmsize: tcgsize; intreg: tregister64; mmreg: tregister);
+      var
+        scratch: treference; { 8 byte temp used to pass the value through memory }
+      begin
+        if not(tcgsize2size[mmsize]=8) then { only 8 byte mm values are accepted }
+          internalerror(2009112501);
+        tg.gettemp(list,8,8,tt_normal,scratch);
+        a_load64_reg_ref(list,intreg,scratch); { store the integer register pair ... }
+        cg.a_loadmm_ref_reg(list,mmsize,mmsize,scratch,mmreg,mms_movescalar); { ... and reload it into the mm register }
+        tg.ungettemp(list,scratch);
+      end;
+
+
+    procedure tcg64f32.a_loadmm_reg_intreg64(list: TAsmList; mmsize: tcgsize; mmreg: tregister; intreg: tregister64);
+      var
+        scratch: treference; { 8 byte temp used to pass the value through memory }
+      begin
+        if not(tcgsize2size[mmsize]=8) then { only 8 byte mm values are accepted }
+          internalerror(2009112502);
+        tg.gettemp(list,8,8,tt_normal,scratch);
+        cg.a_loadmm_reg_ref(list,mmsize,mmsize,mmreg,scratch,mms_movescalar); { store the mm register ... }
+        a_load64_ref_reg(list,scratch,intreg); { ... and reload it into the integer register pair }
+        tg.ungettemp(list,scratch);
+      end;
+
+
     procedure tcg64f32.g_rangecheck64(list : TAsmList;const l:tlocation;fromdef,todef:tdef);
 
       var

+ 94 - 3
compiler/cgobj.pas

@@ -280,6 +280,9 @@ unit cgobj;
           procedure a_opmm_loc_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const loc: tlocation; reg: tregister;shuffle : pmmshuffle); virtual;
           procedure a_opmm_reg_ref(list: TAsmList; Op: TOpCG; size : tcgsize;reg: tregister;const ref: treference; shuffle : pmmshuffle); virtual;
 
+          procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize : tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle); virtual;
+          procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize : tcgsize; mmreg, intreg: tregister; shuffle : pmmshuffle); virtual;
+
           { basic arithmetic operations }
           { note: for operators which require only one argument (not, neg), use }
           { the op_reg_reg, op_reg_ref or op_reg_loc methods and keep in mind   }
@@ -556,6 +559,8 @@ unit cgobj;
         procedure a_param64_ref(list : TAsmList;const r : treference;const loc : TCGPara);virtual;abstract;
         procedure a_param64_loc(list : TAsmList;const l : tlocation;const loc : TCGPara);virtual;abstract;
 
+        procedure a_loadmm_intreg64_reg(list: TAsmList; mmsize: tcgsize; intreg: tregister64; mmreg: tregister); virtual;abstract;
+        procedure a_loadmm_reg_intreg64(list: TAsmList; mmsize: tcgsize; mmreg: tregister; intreg: tregister64); virtual;abstract;
         {
              This routine tries to optimize the const_reg opcode, and should be
              called at the start of a_op64_const_reg. It returns the actual opcode
@@ -2307,6 +2312,8 @@ implementation
             a_load_reg_subsetreg(list,fromsize,loc.size,reg,loc.sreg);
           LOC_SUBSETREF,LOC_CSUBSETREF:
             a_load_reg_subsetref(list,fromsize,loc.size,reg,loc.sref);
+          LOC_MMREGISTER,LOC_CMMREGISTER:
+            a_loadmm_intreg_reg(list,fromsize,loc.size,reg,loc.register,mms_movescalar);
           else
             internalerror(200203271);
         end;
@@ -2855,7 +2862,7 @@ implementation
       end;
 
 
-    procedure tcg.a_cmp_subsetref_reg_label(list : TAsmList; subsetsize : tcgsize; cmpsize : tcgsize; cmp_op : topcmp; const sref: tsubsetreference; reg : tregister; l : tasmlabel);
+    procedure tcg.a_cmp_subsetref_reg_label(list : TAsmList; subsetsize, cmpsize : tcgsize; cmp_op : topcmp; const sref: tsubsetreference; reg : tregister; l : tasmlabel);
       var
         tmpreg: tregister;
       begin
@@ -2904,6 +2911,8 @@ implementation
             a_loadmm_reg_reg(list,loc.size,size,loc.register,reg,shuffle);
           LOC_REFERENCE,LOC_CREFERENCE:
             a_loadmm_ref_reg(list,loc.size,size,loc.reference,reg,shuffle);
+          LOC_REGISTER,LOC_CREGISTER:
+            a_loadmm_intreg_reg(list,loc.size,size,loc.register,reg,shuffle);
           else
             internalerror(200310121);
         end;
@@ -2925,9 +2934,17 @@ implementation
 
     procedure tcg.a_parammm_reg(list: TAsmList; size: tcgsize; reg: tregister;const cgpara : TCGPara;shuffle : pmmshuffle);
       var
-        href : treference;
+        href  : treference;
+{$ifndef cpu64bitalu}
+        tmpreg : tregister;
+        reg64 : tregister64;
+{$endif not cpu64bitalu}
       begin
-         cgpara.check_simple_location;
+{$ifndef cpu64bitalu}
+         if not(cgpara.location^.loc in [LOC_REGISTER,LOC_CREGISTER]) or
+            (size<>OS_F64) then
+{$endif not cpu64bitalu}
+           cgpara.check_simple_location;
          case cgpara.location^.loc of
           LOC_MMREGISTER,LOC_CMMREGISTER:
             a_loadmm_reg_reg(list,size,cgpara.location^.size,reg,cgpara.location^.register,shuffle);
@@ -2935,6 +2952,52 @@ implementation
             begin
               reference_reset_base(href,cgpara.location^.reference.index,cgpara.location^.reference.offset,cgpara.alignment);
               a_loadmm_reg_ref(list,size,cgpara.location^.size,reg,href,shuffle);
+            end;
+          LOC_REGISTER,LOC_CREGISTER:
+            begin
+              if assigned(shuffle) and
+                 not shufflescalar(shuffle) then
+                internalerror(2009112510);
+{$ifndef cpu64bitalu}
+              if (size=OS_F64) then
+                begin
+                  if not assigned(cgpara.location^.next) or
+                     assigned(cgpara.location^.next^.next) then
+                    internalerror(2009112512);
+                  case cgpara.location^.next^.loc of
+                    LOC_REGISTER,LOC_CREGISTER:
+                      tmpreg:=cgpara.location^.next^.register;
+                    LOC_REFERENCE,LOC_CREFERENCE:
+                      tmpreg:=getintregister(list,OS_32);
+                    else
+                      internalerror(2009112910);
+                  end;
+                  if (target_info.endian=ENDIAN_BIG) then
+                    begin
+                      { paraloc^ -> high
+                        paraloc^.next -> low }
+                      reg64.reghi:=cgpara.location^.register;
+                      reg64.reglo:=tmpreg;
+                    end
+                  else
+                    begin
+                      { paraloc^ -> low
+                        paraloc^.next -> high }
+                      reg64.reglo:=cgpara.location^.register;
+                      reg64.reghi:=tmpreg;
+                    end;
+                  cg64.a_loadmm_reg_intreg64(list,size,reg,reg64);
+                  if (cgpara.location^.next^.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
+                    begin
+                      if not(cgpara.location^.next^.size in [OS_32,OS_S32]) then
+                        internalerror(2009112911);
+                      reference_reset_base(href,cgpara.location^.next^.reference.index,cgpara.location^.next^.reference.offset,cgpara.alignment);
+                      a_load_reg_ref(list,OS_32,cgpara.location^.next^.size,tmpreg,href);
+                    end;
+                end
+              else
+{$endif not cpu64bitalu}
+                a_loadmm_reg_intreg(list,size,cgpara.location^.size,reg,cgpara.location^.register,mms_movescalar);
             end
           else
             internalerror(200310123);
@@ -3014,6 +3077,34 @@ implementation
       end;
 
 
+    procedure tcg.a_loadmm_intreg_reg(list: tasmlist; fromsize,tosize: tcgsize; intreg,mmreg: tregister; shuffle: pmmshuffle);
+      var
+        scratch: treference; { 4 byte temp used to pass the value through memory }
+      begin
+        if not((tcgsize2size[fromsize]=4) and
+               (tcgsize2size[tosize]=4)) then { this generic version only handles 4 byte values }
+          internalerror(2009112503);
+        tg.gettemp(list,4,4,tt_normal,scratch);
+        a_load_reg_ref(list,fromsize,fromsize,intreg,scratch); { store the integer register ... }
+        a_loadmm_ref_reg(list,tosize,tosize,scratch,mmreg,shuffle); { ... and reload it into the mm register }
+        tg.ungettemp(list,scratch);
+      end;
+
+
+    procedure tcg.a_loadmm_reg_intreg(list: tasmlist; fromsize,tosize: tcgsize; mmreg,intreg: tregister; shuffle: pmmshuffle); { generic fallback: moves a 4 byte value from an mm register to an integer register via a stack temp }
+      var
+        tmpref: treference;
+      begin
+        if (tcgsize2size[fromsize]<>4) or
+           (tcgsize2size[tosize]<>4) then { only 4 byte values are handled here }
+          internalerror(2009112504);
+        tg.gettemp(list,4,4,tt_normal,tmpref); { the guard above limits the transfer to 4 bytes, so a 4 byte temp is enough (same as a_loadmm_intreg_reg) }
+        cg.a_loadmm_reg_ref(list,fromsize,fromsize,mmreg,tmpref,shuffle); { store the mm register ... }
+        a_load_ref_reg(list,tosize,tosize,tmpref,intreg); { ... and reload it into the integer register }
+        tg.ungettemp(list,tmpref);
+      end;
+
+
     procedure tcg.a_opmm_loc_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const loc: tlocation; reg: tregister;shuffle : pmmshuffle);
       begin
         case loc.loc of

+ 7 - 1
compiler/msg/errore.msg

@@ -1939,7 +1939,7 @@ cg_f_unknown_type_in_unit=06050_F_Cannot find type "$1" in unit "$2". Check if y
 #
 # Assembler reader
 #
-# 07107 is the last used one
+# 07109 is the last used one
 #
 asmr_d_start_reading=07000_DL_Starting $1 styled assembler parsing
 % This informs you that an assembler block is being parsed
@@ -2185,6 +2185,12 @@ asmr_e_need_pic_ref=07107_E_Generating PIC, but reference is not PIC-safe
 % The compiler has been configured to generate position-independent code
 % (PIC), but there are position-dependent references in the current
 % handwritten assembler instruction.
+asmr_e_mixing_regtypes=07108_E_All registers in a register set must be of the same kind and width
+% Instructions on the ARM architecture that take a register set as argument require that all registers
+% in this set are of the same kind (e.g., integer, vfp) and width (e.g., single precision, double precision).
+asmr_e_empty_regset=07109_E_A register set cannot be empty
+% Instructions on the ARM architecture that take a register set as argument require that such a set
+% contains at least one register.
 #
 # Assembler/binary writers
 #

+ 4 - 2
compiler/msgidx.inc

@@ -641,6 +641,8 @@ const
   asmr_w_direct_esp_neg_offset=07105;
   asmr_e_no_vmtoffset_possible=07106;
   asmr_e_need_pic_ref=07107;
+  asmr_e_mixing_regtypes=07108;
+  asmr_e_empty_regset=07109;
   asmw_f_too_many_asm_files=08000;
   asmw_f_assembler_output_not_supported=08001;
   asmw_f_comp_not_supported=08002;
@@ -832,9 +834,9 @@ const
   option_info=11024;
   option_help_pages=11025;
 
-  MsgTxtSize = 54546;
+  MsgTxtSize = 54660;
 
   MsgIdxMax : array[1..20] of longint=(
-    24,87,281,95,71,51,108,22,202,62,
+    24,87,281,95,71,51,110,22,202,62,
     48,20,1,1,1,1,1,1,1,1
   );

+ 217 - 213
compiler/msgtxt.inc

@@ -742,239 +742,241 @@ const msgtxt : array[0..000227,1..240] of char=(
   '07106_E_VMTOffset must be used in combination with a virtual method, a'+
   'nd "$1" is no','t virtual'#000+
   '07107_E_Generating PIC, but reference is not PIC-safe'#000+
+  '07108_E_All registers in a register set must be of the same kind and w'+
+  'idth'#000+
+  '07109_E_A register set cannot be empty'#000+
   '08000_F_Too many assembler files'#000+
-  '08001_F_Selected assembler output not supported'#000+
+  '08001_F_Selected assembler ou','tput not supported'#000+
   '08002_F_Comp not supported'#000+
   '08003_F_Direct not support for binary writers'#000+
-  '08004_E_Allocating of ','data is only allowed in bss section'#000+
+  '08004_E_Allocating of data is only allowed in bss section'#000+
   '08005_F_No binary writer selected'#000+
   '08006_E_Asm: Opcode $1 not in table'#000+
-  '08007_E_Asm: $1 invalid combination of opcode and operands'#000+
+  '08007_E_Asm: $1 inva','lid combination of opcode and operands'#000+
   '08008_E_Asm: 16 Bit references not supported'#000+
-  '08009_E_Asm: Invalid effective',' address'#000+
+  '08009_E_Asm: Invalid effective address'#000+
   '08010_E_Asm: Immediate or reference expected'#000+
   '08011_E_Asm: $1 value exceeds bounds $2'#000+
-  '08012_E_Asm: Short jump is out of range $1'#000+
+  '08012_E_Asm: Short jump is out o','f range $1'#000+
   '08013_E_Asm: Undefined label $1'#000+
   '08014_E_Asm: Comp type not supported for this target'#000+
-  '08015_E_Asm: Exten','ded type not supported for this target'#000+
+  '08015_E_Asm: Extended type not supported for this target'#000+
   '08016_E_Asm: Duplicate label $1'#000+
   '08017_E_Asm: Redefined label $1'#000+
-  '08018_E_Asm: First defined here'#000+
+  '08018_E_Asm: First defi','ned here'#000+
   '08019_E_Asm: Invalid register $1'#000+
   '08020_E_Asm: 16 or 32 Bit references not supported'#000+
-  '08021_E_Asm: 64 Bit o','perands not supported'#000+
+  '08021_E_Asm: 64 Bit operands not supported'#000+
   '09000_W_Source operating system redefined'#000+
   '09001_I_Assembling (pipe) $1'#000+
-  '09002_E_Can'#039't create assembler file: $1'#000+
+  '09002_E_Can'#039't create assembler fi','le: $1'#000+
   '09003_E_Can'#039't create object file: $1'#000+
   '09004_E_Can'#039't create archive file: $1'#000+
-  '09005_E_Assembler $1 not found, ','switching to external assembling'#000+
+  '09005_E_Assembler $1 not found, switching to external assembling'#000+
   '09006_T_Using assembler: $1'#000+
   '09007_E_Error while assembling exitcode $1'#000+
-  '09008_E_Can'#039't call the assembler, error $1 switching to external a'+
-  'ssembling'#000+
+  '09008_E_Can'#039't call the',' assembler, error $1 switching to external'+
+  ' assembling'#000+
   '09009_I_Assembling $1'#000+
-  '09010_I_Assembling with smartlinking $','1'#000+
+  '09010_I_Assembling with smartlinking $1'#000+
   '09011_W_Object $1 not found, Linking may fail !'#000+
   '09012_W_Library $1 not found, Linking may fail !'#000+
-  '09013_E_Error while linking'#000+
+  '09013_E_Error while linking',#000+
   '09014_E_Can'#039't call the linker, switching to external linking'#000+
   '09015_I_Linking $1'#000+
-  '09016_E_Util $1 not found, switch','ing to external linking'#000+
+  '09016_E_Util $1 not found, switching to external linking'#000+
   '09017_T_Using util $1'#000+
   '09018_E_Creation of Executables not supported'#000+
-  '09019_E_Creation of Dynamic/Shared Libraries not supported'#000+
+  '09019_E_Creation of Dynamic/Shared',' Libraries not supported'#000+
   '09020_I_Closing script $1'#000+
-  '09021_E_resource compiler "$1" not found, switching to external',' mode'+
-  #000+
+  '09021_E_resource compiler "$1" not found, switching to external mode'#000+
   '09022_I_Compiling resource $1'#000+
   '09023_T_unit $1 can'#039't be statically linked, switching to smart lin'+
   'king'#000+
-  '09024_T_unit $1 can'#039't be smart linked, switching to static linking'+
+  '09024_T_unit $1 can',#039't be smart linked, switching to static linking'+
   #000+
-  '09025_T_unit $1 can'#039't be shared linked, switching to static linkin',
+  '09025_T_unit $1 can'#039't be shared linked, switching to static linkin'+
   'g'#000+
   '09026_E_unit $1 can'#039't be smart or static linked'#000+
   '09027_E_unit $1 can'#039't be shared or static linked'#000+
-  '09028_D_Calling resource compiler "$1" with "$2" as command line'#000+
+  '09028_D_Calling resource co','mpiler "$1" with "$2" as command line'#000+
   '09029_E_Error while compiling resources'#000+
-  '09030_E_Can'#039't call the resource comp','iler "$1", switching to exte'+
-  'rnal mode'#000+
+  '09030_E_Can'#039't call the resource compiler "$1", switching to extern'+
+  'al mode'#000+
   '09031_E_Can'#039't open resource file "$1"'#000+
   '09032_E_Can'#039't write resource file "$1"'#000+
-  '09128_F_Can'#039't post process executable $1'#000+
+  '09128_F_Can',#039't post process executable $1'#000+
   '09129_F_Can'#039't open executable $1'#000+
   '09130_X_Size of Code: $1 bytes'#000+
-  '09131_X_Size of init','ialized data: $1 bytes'#000+
+  '09131_X_Size of initialized data: $1 bytes'#000+
   '09132_X_Size of uninitialized data: $1 bytes'#000+
   '09133_X_Stack space reserved: $1 bytes'#000+
-  '09134_X_Stack space committed: $1 bytes'#000+
+  '09134_X_Stack space',' committed: $1 bytes'#000+
   '09200_F_Executable image size is too big for $1 target.'#000+
-  '09201_W_Object file "$1" contains 32-','bit absolute relocation to symb'+
-  'ol "$2".'#000+
+  '09201_W_Object file "$1" contains 32-bit absolute relocation to symbol'+
+  ' "$2".'#000+
   '10000_T_Unitsearch: $1'#000+
   '10001_T_PPU Loading $1'#000+
   '10002_U_PPU Name: $1'#000+
-  '10003_U_PPU Flags: $1'#000+
+  '10003_U_PPU Flags: ','$1'#000+
   '10004_U_PPU Crc: $1'#000+
   '10005_U_PPU Time: $1'#000+
   '10006_U_PPU File too short'#000+
-  '10007_U_PPU Invalid Header (no PPU at the b','egin)'#000+
+  '10007_U_PPU Invalid Header (no PPU at the begin)'#000+
   '10008_U_PPU Invalid Version $1'#000+
   '10009_U_PPU is compiled for another processor'#000+
-  '10010_U_PPU is compiled for an other target'#000+
+  '10010_U_PPU is compiled for an other target',#000+
   '10011_U_PPU Source: $1'#000+
   '10012_U_Writing $1'#000+
   '10013_F_Can'#039't Write PPU-File'#000+
   '10014_F_Error reading PPU-File'#000+
-  '10015_F_une','xpected end of PPU-File'#000+
+  '10015_F_unexpected end of PPU-File'#000+
   '10016_F_Invalid PPU-File entry: $1'#000+
   '10017_F_PPU Dbx count problem'#000+
   '10018_E_Illegal unit name: $1'#000+
-  '10019_F_Too much units'#000+
+  '10019_F','_Too much units'#000+
   '10020_F_Circular unit reference between $1 and $2'#000+
-  '10021_F_Can'#039't compile unit $1, no sources availa','ble'#000+
+  '10021_F_Can'#039't compile unit $1, no sources available'#000+
   '10022_F_Can'#039't find unit $1 used by $2'#000+
   '10023_W_Unit $1 was not found but $2 exists'#000+
   '10024_F_Unit $1 searched but $2 found'#000+
-  '10025_W_Compiling the system unit requires the -Us switch'#000+
+  '10','025_W_Compiling the system unit requires the -Us switch'#000+
   '10026_F_There were $1 errors compiling module, stopping'#000+
-  '10','027_U_Load from $1 ($2) unit $3'#000+
+  '10027_U_Load from $1 ($2) unit $3'#000+
   '10028_U_Recompiling $1, checksum changed for $2'#000+
   '10029_U_Recompiling $1, source found only'#000+
-  '10030_U_Recompiling unit, static lib is older than ppufile'#000+
-  '10031_U_Recompiling unit, shared lib is older than ppufile'#000,
+  '1003','0_U_Recompiling unit, static lib is older than ppufile'#000+
+  '10031_U_Recompiling unit, shared lib is older than ppufile'#000+
   '10032_U_Recompiling unit, obj and asm are older than ppufile'#000+
   '10033_U_Recompiling unit, obj is older than asm'#000+
-  '10034_U_Parsing interface of $1'#000+
+  '10034_U_Parsing i','nterface of $1'#000+
   '10035_U_Parsing implementation of $1'#000+
   '10036_U_Second load for unit $1'#000+
-  '10037_U_PPU Check file $1 time',' $2'#000+
+  '10037_U_PPU Check file $1 time $2'#000+
   '10040_W_Can'#039't recompile unit $1, but found modifed include files'#000+
   '10041_U_File $1 is newer than PPU file $2'#000+
-  '10042_U_Trying to use a unit which was compiled with a different FPU m'+
-  'ode'#000+
+  '10042_U_Trying ','to use a unit which was compiled with a different FPU'+
+  ' mode'#000+
   '10043_U_Loading interface units from $1'#000+
-  '10044_U_Loading',' implementation units from $1'#000+
+  '10044_U_Loading implementation units from $1'#000+
   '10045_U_Interface CRC changed for unit $1'#000+
   '10046_U_Implementation CRC changed for unit $1'#000+
-  '10047_U_Finished compiling unit $1'#000+
+  '10047_U','_Finished compiling unit $1'#000+
   '10048_U_Add dependency of $1 to $2'#000+
   '10049_U_No reload, is caller: $1'#000+
-  '10050_U_No reload,',' already in second compile: $1'#000+
+  '10050_U_No reload, already in second compile: $1'#000+
   '10051_U_Flag for reload: $1'#000+
   '10052_U_Forced reloading'#000+
   '10053_U_Previous state of $1: $2'#000+
-  '10054_U_Already compiling $1, setting second compile'#000+
+  '10054_U_A','lready compiling $1, setting second compile'#000+
   '10055_U_Loading unit $1'#000+
   '10056_U_Finished loading unit $1'#000+
-  '10057_U_Regis','tering new unit $1'#000+
+  '10057_U_Registering new unit $1'#000+
   '10058_U_Re-resolving unit $1'#000+
   '10059_U_Skipping re-resolving unit $1, still loading used units'#000+
-  '10060_U_Unloading resource unit $1 (not needed)'#000+
+  '10060_U_Unload','ing resource unit $1 (not needed)'#000+
   '10061_E_Unit $1 was compiled using a different whole program optimizat'+
-  'ion feedba','ck input ($2, $3); recompile it without wpo or use the sam'+
-  'e wpo feedback input file for this compilation invocation'#000+
-  '11000_O_$1 [options] <inputfile> [options]'#000+
+  'ion feedback input ($2, $3); recompile it without wpo or use the same '+
+  'wpo feedback input file for this compilation invocation'#000+
+  '11000_O_$1',' [options] <inputfile> [options]'#000+
   '11001_W_Only one source file supported, changing source file to compil'+
-  'e from "$1"',' into "$2"'#000+
+  'e from "$1" into "$2"'#000+
   '11002_W_DEF file can be created only for OS/2'#000+
   '11003_E_nested response files are not supported'#000+
-  '11004_F_No source file name in command line'#000+
+  '11004_F_No source fil','e name in command line'#000+
   '11005_N_No option inside $1 config file'#000+
   '11006_E_Illegal parameter: $1'#000+
-  '11007_H_-? writes hel','p pages'#000+
+  '11007_H_-? writes help pages'#000+
   '11008_F_Too many config files nested'#000+
   '11009_F_Unable to open file $1'#000+
   '11010_D_Reading further options from $1'#000+
-  '11011_W_Target is already set to: $1'#000+
+  '11011_W_Ta','rget is already set to: $1'#000+
   '11012_W_Shared libs not supported on DOS platform, reverting to static'+
   #000+
-  '11013_F_In optio','ns file $1 at line $2 too many \var{\#IF(N)DEFs} enc'+
-  'ountered'#000+
-  '11014_F_In options file $1 at line $2 unexpected \var{\#ENDIFs} encoun'+
-  'tered'#000+
+  '11013_F_In options file $1 at line $2 too many \var{\#IF(N)DEFs} encou'+
+  'ntered'#000+
+  '11014_F_In options file $1 at line $2 unexpected \var{\#ENDIFs} e','nco'+
+  'untered'#000+
   '11015_F_Open conditional at the end of the options file'#000+
-  '11016_W_Debug information generation is not sup','ported by this execut'+
-  'able'#000+
+  '11016_W_Debug information generation is not supported by this executab'+
+  'le'#000+
   '11017_H_Try recompiling with -dGDB'#000+
   '11018_W_You are using the obsolete switch $1'#000+
-  '11019_W_You are using the obsolete switch $1, please use $2'#000+
+  '11019_W_You are usin','g the obsolete switch $1, please use $2'#000+
   '11020_N_Switching assembler to default source writing assembler'#000+
-  '11021_W_As','sembler output selected "$1" is not compatible with "$2"'#000+
+  '11021_W_Assembler output selected "$1" is not compatible with "$2"'#000+
   '11022_W_"$1" assembler use forced'#000+
-  '11026_T_Reading options from file $1'#000+
+  '11026_T_Reading options from file $','1'#000+
   '11027_T_Reading options from environment $1'#000+
   '11028_D_Handling option "$1"'#000+
   '11029_O_*** press enter ***'#000+
-  '11030_H_Sta','rt of reading config file $1'#000+
+  '11030_H_Start of reading config file $1'#000+
   '11031_H_End of reading config file $1'#000+
   '11032_D_interpreting option "$1"'#000+
-  '11036_D_interpreting firstpass option "$1"'#000+
+  '11036_D_interpreting first','pass option "$1"'#000+
   '11033_D_interpreting file option "$1"'#000+
   '11034_D_Reading config file "$1"'#000+
-  '11035_D_found source file ','name "$1"'#000+
+  '11035_D_found source file name "$1"'#000+
   '11039_E_Unknown code page'#000+
   '11040_F_Config file $1 is a directory'#000+
-  '11041_W_Assembler output selected "$1" cannot generate debug info, deb'+
-  'ugging disabled'#000+
+  '11041_W_Assembler output selected "$1" cannot genera','te debug info, d'+
+  'ebugging disabled'#000+
   '11042_W_Use of ppc386.cfg is deprecated, please use fpc.cfg instead'#000+
-  '11043_F_In o','ptions file $1 at line $2 \var{\#ELSE} directive without'+
-  ' \var{\#IF(N)DEF} found'#000+
-  '11044_F_Option "$1" is not, or not yet, supported on the current targe'+
-  't platform'#000+
+  '11043_F_In options file $1 at line $2 \var{\#ELSE} directive without \'+
+  'var{\#IF(N)DEF} found'#000+
+  '11044_F_Option "$1" is not, or not yet, suppor','ted on the current tar'+
+  'get platform'#000+
   '11045_F_The feature "$1" is not, or not yet, supported on the selected'+
-  ' target p','latform'#000+
+  ' target platform'#000+
   '11046_N_DWARF debug information cannot be used with smart linking on t'+
   'his target, switching to static linking'#000+
-  '11047_W_Option "$1" is ignored for the current target platform.'#000+
-  '12000_F_Cannot open whole program optimization feedback fi','le "$1"'#000+
+  '11047_W_','Option "$1" is ignored for the current target platform.'#000+
+  '12000_F_Cannot open whole program optimization feedback file "$1"'#000+
   '12001_D_Processing whole program optimization information in wpo feedb'+
   'ack file "$1"'#000+
-  '12002_D_Finished processing the whole program optimization information'+
-  ' in wpo feedback file "$1"'#000+
-  '12003_E_Expected section header, but got "$2" at li','ne $1 of wpo feed'+
-  'back file'#000+
+  '12002_D_Finished processing the wh','ole program optimization informati'+
+  'on in wpo feedback file "$1"'#000+
+  '12003_E_Expected section header, but got "$2" at line $1 of wpo feedba'+
+  'ck file'#000+
   '12004_W_No handler registered for whole program optimization section "'+
-  '$2" at line $1 of wpo feedback file, ignoring'#000+
+  '$2" at line $1 of wpo feedbac','k file, ignoring'#000+
   '12005_D_Found whole program optimization section "$1" with information'+
   ' about "$2"'#000+
-  '12006_F_The sel','ected whole program optimizations require a previousl'+
-  'y generated feedback file (use -Fw to specify)'#000+
-  '12007_E_No collected information necessary to perform "$1" whole progr'+
-  'am optimization found'#000+
-  '12008_F_Specify a whole program optimization fee','dback file to store '+
-  'the generated info in (using -FW)'#000+
+  '12006_F_The selected whole program optimizations require a previously '+
+  'generated feedback file (use -Fw to specify)'#000+
+  '12007_E_No collected infor','mation necessary to perform "$1" whole pro'+
+  'gram optimization found'#000+
+  '12008_F_Specify a whole program optimization feedback file to store th'+
+  'e generated info in (using -FW)'#000+
   '12009_E_Not generating any whole program optimization information, yet'+
-  ' a feedback file was specified (using -FW)'#000+
+  ' a',' feedback file was specified (using -FW)'#000+
   '12010_E_Not performing any whole program optimizations, yet an input f'+
-  'eed','back file was specified (using -Fw)'#000+
+  'eedback file was specified (using -Fw)'#000+
   '12011_D_Skipping whole program optimization section "$1", because not '+
-  'needed by the requested optimizations'#000+
+  'needed by the reques','ted optimizations'#000+
   '12012_W_Overriding previously read information for "$1" from feedback '+
-  'input file using informati','on in section "$2"'#000+
+  'input file using information in section "$2"'#000+
   '12013_E_Cannot extract symbol liveness information from program when s'+
   'tripping symbols, use -Xs-'#000+
-  '12014_E_Cannot extract symbol liveness information from program when w'+
-  'hen not linking'#000+
-  '12015_F_Cannot find "$1" or "$2" to ex','tract symbol liveness informat'+
-  'ion from linked program'#000+
+  '12014_E_Ca','nnot extract symbol liveness information from program when'+
+  ' when not linking'#000+
+  '12015_F_Cannot find "$1" or "$2" to extract symbol liveness informatio'+
+  'n from linked program'#000+
   '12016_E_Error during reading symbol liveness information produced by "'+
-  '$1"'#000+
+  '$1','"'#000+
   '12017_F_Error executing "$1" (exitcode: $2) to extract symbol informat'+
   'ion from linked program'#000+
-  '12018_E_Collection',' of symbol liveness information can only help when'+
-  ' using smart linking, use -CX -XX'#000+
-  '12019_E_Cannot create specified whole program optimisation feedback fi'+
-  'le "$1"'#000+
+  '12018_E_Collection of symbol liveness information can only help when u'+
+  'sing smart linking, use -CX -XX'#000+
+  '12019_E_Cannot create specified whole prog','ram optimisation feedback '+
+  'file "$1"'#000+
   '11023_Free Pascal Compiler version $FPCFULLVERSION [$FPCDATE] for $FPC'+
   'CPU'#010+
-  'Copy','right (c) 1993-2009 by Florian Klaempfl'#000+
+  'Copyright (c) 1993-2009 by Florian Klaempfl'#000+
   '11024_Free Pascal Compiler version $FPCVERSION'#010+
   #010+
   'Compiler Date      : $FPCDATE'#010+
-  'Compiler CPU Target: $FPCCPU'#010+
+  'Compiler',' CPU Target: $FPCCPU'#010+
   #010+
   'Supported targets:'#010+
   '  $OSTARGETS'#010+
@@ -982,305 +984,307 @@ const msgtxt : array[0..000227,1..240] of char=(
   'Supported CPU instruction sets:'#010+
   '  $INSTRUCTIONSETS'#010+
   #010+
-  'Support','ed FPU instruction sets:'#010+
+  'Supported FPU instruction sets:'#010+
   '  $FPUINSTRUCTIONSETS'#010+
   #010+
   'Supported ABI targets:'#010+
   '  $ABITARGETS'#010+
   #010+
   'Supported Optimizations:'#010+
-  '  $OPTIMIZATIONS'#010+
+  '  $OPTIMIZATION','S'#010+
   #010+
   'Supported Whole Program Optimizations:'#010+
   '  All'#010+
   '  $WPOPTIMIZATIONS'#010+
   #010+
   'Supported Microcontroller types:'#010+
-  '  $CONTROLLER','TYPES'#010+
+  '  $CONTROLLERTYPES'#010+
   #010+
   'This program comes under the GNU General Public Licence'#010+
   'For more information read COPYING.FPC'#010+
   #010+
-  'Report bugs, suggestions, etc. to:'#010+
+  'Report bugs, suggestions',', etc. to:'#010+
   '                 http://bugs.freepascal.org'#010+
   'or'#010+
   '                 bugs@freepascal.org'#000+
-  '11025_**0*_Put + af','ter a boolean switch option to enable it, - to di'+
-  'sable it'#010+
+  '11025_**0*_Put + after a boolean switch option to enable it, - to disa'+
+  'ble it'#010+
   '**1a_The compiler doesn'#039't delete the generated assembler file'#010+
-  '**2al_List sourcecode lines in assembler file'#010+
+  '**2al_','List sourcecode lines in assembler file'#010+
   '**2an_List node info in assembler file'#010+
-  '*L2ap_Use pipes instead of creating',' temporary assembler files'#010+
+  '*L2ap_Use pipes instead of creating temporary assembler files'#010+
   '**2ar_List register allocation/release info in assembler file'#010+
-  '**2at_List temp allocation/release info in assembler file'#010+
+  '**2at_List temp allocation/release in','fo in assembler file'#010+
   '**1A<x>_Output format:'#010+
   '**2Adefault_Use default assembler'#010+
   '3*2Aas_Assemble using GNU AS'#010+
-  '3*2Anas','mcoff_COFF (Go32v2) file using Nasm'#010+
+  '3*2Anasmcoff_COFF (Go32v2) file using Nasm'#010+
   '3*2Anasmelf_ELF32 (Linux) file using Nasm'#010+
   '3*2Anasmwin32_Win32 object file using Nasm'#010+
-  '3*2Anasmwdosx_Win32/WDOSX object file using Nasm'#010+
+  '3*2An','asmwdosx_Win32/WDOSX object file using Nasm'#010+
   '3*2Awasm_Obj file using Wasm (Watcom)'#010+
-  '3*2Anasmobj_Obj file using Nasm'#010,
+  '3*2Anasmobj_Obj file using Nasm'#010+
   '3*2Amasm_Obj file using Masm (Microsoft)'#010+
   '3*2Atasm_Obj file using Tasm (Borland)'#010+
   '3*2Aelf_ELF (Linux) using internal writer'#010+
-  '3*2Acoff_COFF (Go32v2) using internal writer'#010+
+  '3*2A','coff_COFF (Go32v2) using internal writer'#010+
   '3*2Apecoff_PE-COFF (Win32) using internal writer'#010+
-  '4*2Aas_Assemble using GN','U AS'#010+
+  '4*2Aas_Assemble using GNU AS'#010+
   '6*2Aas_Unix o-file using GNU AS'#010+
   '6*2Agas_GNU Motorola assembler'#010+
   '6*2Amit_MIT Syntax (old GAS)'#010+
-  '6*2Amot_Standard Motorola assembler'#010+
+  '6*2Amot_Standard Motorola ass','embler'#010+
   'A*2Aas_Assemble using GNU AS'#010+
   'P*2Aas_Assemble using GNU AS'#010+
   'S*2Aas_Assemble using GNU AS'#010+
-  '**1b_Generate browse','r info'#010+
+  '**1b_Generate browser info'#010+
   '**2bl_Generate local symbol info'#010+
   '**1B_Build all modules'#010+
   '**1C<x>_Code generation options:'#010+
-  '**2Ca<x>_Select ABI, see fpc -i for possible values'#010+
+  '**2Ca<x>_Select ABI, see fpc -','i for possible values'#010+
   '**2Cb_Generate big-endian code'#010+
   '**2Cc<x>_Set default calling convention to <x>'#010+
-  '**2CD_Create a','lso dynamic library (not supported)'#010+
+  '**2CD_Create also dynamic library (not supported)'#010+
   '**2Ce_Compilation with emulated floating point opcodes'#010+
-  '**2Cf<x>_Select fpu instruction set to use, see fpc -i for possible va'+
-  'lues'#010+
+  '**2Cf<x>_Select fpu instruction set',' to use, see fpc -i for possible '+
+  'values'#010+
   '**2CF<x>_Minimal floating point constant precision (default, 32, 64)'#010+
-  '**2Cg','_Generate PIC code'#010+
+  '**2Cg_Generate PIC code'#010+
   '**2Ch<n>_<n> bytes heap (between 1023 and 67107840)'#010+
   '**2Ci_IO-checking'#010+
   '**2Cn_Omit linking stage'#010+
-  '**2Co_Check overflow of integer operations'#010+
+  '**2Co_Check ','overflow of integer operations'#010+
   '**2CO_Check for possible overflow of integer operations'#010+
-  '**2Cp<x>_Select instruction',' set, see fpc -i for possible values'#010+
+  '**2Cp<x>_Select instruction set, see fpc -i for possible values'#010+
   '**2CP<x>=<y>_ packing settings'#010+
-  '**3CPPACKSET=<y>_ <y> set allocation: 0, 1 or DEFAULT or NORMAL, 2, 4 '+
-  'and 8'#010+
+  '**3CPPACKSET=<y>_ <y> set allocation: 0, 1 or DEFAULT or N','ORMAL, 2, '+
+  '4 and 8'#010+
   '**2Cr_Range checking'#010+
   '**2CR_Verify object method call validity'#010+
-  '**2Cs<n>_Set stack checking size t','o <n>'#010+
+  '**2Cs<n>_Set stack checking size to <n>'#010+
   '**2Ct_Stack checking (for testing only, see manual)'#010+
   '**2CX_Create also smartlinked library'#010+
-  '**1d<x>_Defines the symbol <x>'#010+
+  '**1d<x>_Defines the symbol <x>',#010+
   '**1D_Generate a DEF file'#010+
   '**2Dd<x>_Set description to <x>'#010+
   '**2Dv<x>_Set DLL version to <x>'#010+
   '*O2Dw_PM application'#010+
-  '**1','e<x>_Set path to executable'#010+
+  '**1e<x>_Set path to executable'#010+
   '**1E_Same as -Cn'#010+
   '**1fPIC_Same as -Cg'#010+
   '**1F<x>_Set file names and paths:'#010+
-  '**2Fa<x>[,y]_(for a program) load units <x> and [y] before uses is par'+
-  'sed'#010+
+  '**2Fa<x>[,y]_(for a program',') load units <x> and [y] before uses is p'+
+  'arsed'#010+
   '**2Fc<x>_Set input codepage to <x>'#010+
-  '**2FC<x>_Set RC compiler binary ','name to <x>'#010+
+  '**2FC<x>_Set RC compiler binary name to <x>'#010+
   '**2Fd_Disable the compiler'#039's internal directory cache'#010+
-  '**2FD<x>_Set the directory where to search for compiler utilities'#010+
+  '**2FD<x>_Set the directory where to search for compiler util','ities'#010+
   '**2Fe<x>_Redirect error output to <x>'#010+
   '**2Ff<x>_Add <x> to framework path (Darwin only)'#010+
-  '**2FE<x>_Set exe/unit',' output path to <x>'#010+
+  '**2FE<x>_Set exe/unit output path to <x>'#010+
   '**2Fi<x>_Add <x> to include path'#010+
   '**2Fl<x>_Add <x> to library path'#010+
   '**2FL<x>_Use <x> as dynamic linker'#010+
-  '**2Fm<x>_Load unicode conversion table from <x>.txt in the compiler di'+
-  'r'#010+
+  '**2Fm','<x>_Load unicode conversion table from <x>.txt in the compiler '+
+  'dir'#010+
   '**2Fo<x>_Add <x> to object path'#010+
-  '**2Fr<x>_Load e','rror message file <x>'#010+
+  '**2Fr<x>_Load error message file <x>'#010+
   '**2FR<x>_Set resource (.res) linker to <x>'#010+
   '**2Fu<x>_Add <x> to unit path'#010+
-  '**2FU<x>_Set unit output path to <x>, overrides -FE'#010+
+  '**2FU<x>_Set unit output path t','o <x>, overrides -FE'#010+
   '**2FW<x>_Store generated whole-program optimization feedback in <x>'#010+
-  '**2Fw<x>_Load previously ','stored whole-program optimization feedback '+
-  'from <x>'#010+
+  '**2Fw<x>_Load previously stored whole-program optimization feedback fr'+
+  'om <x>'#010+
   '*g1g_Generate debug information (default format for target)'#010+
-  '*g2gc_Generate checks for pointers'#010+
+  '*g2gc_Generate',' checks for pointers'#010+
   '*g2gh_Use heaptrace unit (for memory leak/corruption debugging)'#010+
-  '*g2gl_Use line info unit (sho','w more info with backtraces)'#010+
+  '*g2gl_Use line info unit (show more info with backtraces)'#010+
   '*g2go<x>_Set debug information options'#010+
-  '*g3godwarfsets_ Enable DWARF set debug information (breaks gdb < 6.5)'#010+
+  '*g3godwarfsets_ Enable DWARF set debug information (breaks',' gdb < 6.5'+
+  ')'#010+
   '*g3gostabsabsincludes_ Store absolute/full include file paths in Stabs'+
   #010+
-  '*g2gp_Preserve case in stabs sy','mbol names'#010+
+  '*g2gp_Preserve case in stabs symbol names'#010+
   '*g2gs_Generate Stabs debug information'#010+
   '*g2gt_Trash local variables (to detect uninitialized uses)'#010+
-  '*g2gv_Generates programs traceable with Valgrind'#010+
+  '*g2gv_Generates p','rograms traceable with Valgrind'#010+
   '*g2gw_Generate DWARFv2 debug information (same as -gw2)'#010+
-  '*g2gw2_Generate DWARFv2 de','bug information'#010+
+  '*g2gw2_Generate DWARFv2 debug information'#010+
   '*g2gw3_Generate DWARFv3 debug information'#010+
   '**1i_Information'#010+
   '**2iD_Return compiler date'#010+
-  '**2iV_Return short compiler version'#010+
+  '**2iV_Return short compi','ler version'#010+
   '**2iW_Return full compiler version'#010+
   '**2iSO_Return compiler OS'#010+
   '**2iSP_Return compiler host processor'#010+
-  '**2','iTO_Return target OS'#010+
+  '**2iTO_Return target OS'#010+
   '**2iTP_Return target processor'#010+
   '**1I<x>_Add <x> to include path'#010+
   '**1k<x>_Pass <x> to the linker'#010+
-  '**1l_Write logo'#010+
+  '**1l_Write ','logo'#010+
   '**1M<x>_Set language mode to <x>'#010+
   '**2Mfpc_Free Pascal dialect (default)'#010+
-  '**2Mobjfpc_FPC mode with Object Pascal',' support'#010+
+  '**2Mobjfpc_FPC mode with Object Pascal support'#010+
   '**2Mdelphi_Delphi 7 compatibility mode'#010+
   '**2Mtp_TP/BP 7.0 compatibility mode'#010+
-  '**2Mmacpas_Macintosh Pascal dialects compatibility mode'#010+
+  '**2Mmacpas_Macintosh Pascal dialects compa','tibility mode'#010+
   '**1n_Do not read the default config files'#010+
   '**1N<x>_Node tree optimizations'#010+
   '**2Nu_Unroll loops'#010+
-  '**1o<x>','_Change the name of the executable produced to <x>'#010+
+  '**1o<x>_Change the name of the executable produced to <x>'#010+
   '**1O<x>_Optimizations:'#010+
   '**2O-_Disable optimizations'#010+
-  '**2O1_Level 1 optimizations (quick and debugger friendly)'#010+
+  '**2O1_Level 1 optimizati','ons (quick and debugger friendly)'#010+
   '**2O2_Level 2 optimizations (-O1 + quick optimizations)'#010+
-  '**2O3_Level 3 optimizati','ons (-O2 + slow optimizations)'#010+
+  '**2O3_Level 3 optimizations (-O2 + slow optimizations)'#010+
   '**2Oa<x>=<y>_Set alignment'#010+
-  '**2Oo[NO]<x>_Enable or disable optimizations, see fpc -i for possible '+
-  'values'#010+
+  '**2Oo[NO]<x>_Enable or disable optimizations, see fpc -i for possibl','e'+
+  ' values'#010+
   '**2Op<x>_Set target cpu for optimizing, see fpc -i for possible values'+
   #010+
-  '**2OW<x>_Generate whole-program op','timization feedback for optimizati'+
-  'on <x>, see fpc -i for possible values'#010+
-  '**2Ow<x>_Perform whole-program optimization <x>, see fpc -i for possib'+
-  'le values'#010+
+  '**2OW<x>_Generate whole-program optimization feedback for optimization'+
+  ' <x>, see fpc -i for possible values'#010+
+  '**2Ow<x>_Perform whole-program optimization <x>, see ','fpc -i for poss'+
+  'ible values'#010+
   '**2Os_Optimize for size rather than speed'#010+
-  '**1pg_Generate profile code for gprof (define','s FPC_PROFILE)'#010+
+  '**1pg_Generate profile code for gprof (defines FPC_PROFILE)'#010+
   '**1R<x>_Assembler reading style:'#010+
   '**2Rdefault_Use default assembler for target'#010+
-  '3*2Ratt_Read AT&T style assembler'#010+
+  '3*2Ratt_Read AT&T style assembler',#010+
   '3*2Rintel_Read Intel style assembler'#010+
   '6*2RMOT_Read motorola style assembler'#010+
   '**1S<x>_Syntax options:'#010+
-  '**2S2_Same as ','-Mobjfpc'#010+
+  '**2S2_Same as -Mobjfpc'#010+
   '**2Sc_Support operators like C (*=,+=,/= and -=)'#010+
   '**2Sa_Turn on assertions'#010+
   '**2Sd_Same as -Mdelphi'#010+
-  '**2Se<x>_Error options. <x> is a combination of the following:'#010+
+  '**2Se<x>_Error optio','ns. <x> is a combination of the following:'#010+
   '**3*_<n> : Compiler halts after the <n> errors (default is 1)'#010+
-  '**3*_w : ','Compiler also halts after warnings'#010+
+  '**3*_w : Compiler also halts after warnings'#010+
   '**3*_n : Compiler also halts after notes'#010+
   '**3*_h : Compiler also halts after hints'#010+
-  '**2Sg_Enable LABEL and GOTO (default in -Mtp and -Mdelphi)'#010+
+  '**2Sg_Ena','ble LABEL and GOTO (default in -Mtp and -Mdelphi)'#010+
   '**2Sh_Use ansistrings by default instead of shortstrings'#010+
-  '**2Si_T','urn on inlining of procedures/functions declared as "inline"'#010+
+  '**2Si_Turn on inlining of procedures/functions declared as "inline"'#010+
   '**2Sk_Load fpcylix unit'#010+
   '**2SI<x>_Set interface style to <x>'#010+
-  '**3SIcom_COM compatible interface (default)'#010+
+  '**3SI','com_COM compatible interface (default)'#010+
   '**3SIcorba_CORBA compatible interface'#010+
-  '**2Sm_Support macros like C (global)'#010,
+  '**2Sm_Support macros like C (global)'#010+
   '**2So_Same as -Mtp'#010+
   '**2Ss_Constructor name must be init (destructor must be done)'#010+
   '**2St_Allow static keyword in objects'#010+
-  '**2Sx_Enable exception keywords (default in Delphi/ObjFPC modes)'#010+
+  '**2Sx_E','nable exception keywords (default in Delphi/ObjFPC modes)'#010+
   '**1s_Do not call assembler and linker'#010+
-  '**2sh_Generate scr','ipt to link on host'#010+
+  '**2sh_Generate script to link on host'#010+
   '**2st_Generate script to link on target'#010+
   '**2sr_Skip register allocation phase (use with -alr)'#010+
-  '**1T<x>_Target operating system:'#010+
+  '**1T<x>_Targe','t operating system:'#010+
   '3*2Temx_OS/2 via EMX (including EMX/RSX extender)'#010+
   '3*2Tfreebsd_FreeBSD'#010+
-  '3*2Tgo32v2_Version 2 of ','DJ Delorie DOS extender'#010+
+  '3*2Tgo32v2_Version 2 of DJ Delorie DOS extender'#010+
   '3*2Tlinux_Linux'#010+
   '3*2Tnetbsd_NetBSD'#010+
   '3*2Tnetware_Novell Netware Module (clib)'#010+
-  '3*2Tnetwlibc_Novell Netware Module (libc)'#010+
+  '3*2Tnetwlibc_Novell Netware',' Module (libc)'#010+
   '3*2Topenbsd_OpenBSD'#010+
   '3*2Tos2_OS/2 / eComStation'#010+
   '3*2Tsunos_SunOS/Solaris'#010+
   '3*2Tsymbian_Symbian OS'#010+
-  '3*2Tw','atcom_Watcom compatible DOS extender'#010+
+  '3*2Twatcom_Watcom compatible DOS extender'#010+
   '3*2Twdosx_WDOSX DOS extender'#010+
   '3*2Twin32_Windows 32 Bit'#010+
   '3*2Twince_Windows CE'#010+
-  '4*2Tlinux_Linux'#010+
+  '4*2Tlinux_Linu','x'#010+
   '6*2Tamiga_Commodore Amiga'#010+
   '6*2Tatari_Atari ST/STe/TT'#010+
   '6*2Tlinux_Linux/m68k'#010+
-  '6*2Tmacos_Macintosh m68k (not supported',')'#010+
+  '6*2Tmacos_Macintosh m68k (not supported)'#010+
   '6*2Tpalmos_PalmOS'#010+
   'A*2Tlinux_Linux'#010+
   'A*2Twince_Windows CE'#010+
   'P*2Tamiga_AmigaOS on PowerPC'#010+
-  'P*2Tdarwin_Darwin and Mac OS X on PowerPC'#010+
+  'P*2Tdarwin_Darwin and Mac OS X on PowerP','C'#010+
   'P*2Tlinux_Linux on PowerPC'#010+
   'P*2Tmacos_Mac OS (classic) on PowerPC'#010+
   'P*2Tmorphos_MorphOS'#010+
   'S*2Tlinux_Linux'#010+
-  '**1u<x>_Und','efines the symbol <x>'#010+
+  '**1u<x>_Undefines the symbol <x>'#010+
   '**1U_Unit options:'#010+
   '**2Un_Do not check where the unit name matches the file name'#010+
-  '**2Ur_Generate release unit files (never automatically recompiled)'#010+
+  '**2Ur_Generate release u','nit files (never automatically recompiled)'#010+
   '**2Us_Compile a system unit'#010+
-  '**1v<x>_Be verbose. <x> is a combination of',' the following letters:'#010+
+  '**1v<x>_Be verbose. <x> is a combination of the following letters:'#010+
   '**2*_e : Show errors (default)       0 : Show nothing (except errors)'#010+
-  '**2*_w : Show warnings               u : Show unit info'#010+
+  '**2*_w : Show warnings          ','     u : Show unit info'#010+
   '**2*_n : Show notes                  t : Show tried/used files'#010+
-  '**2*_h : Show hints        ','          c : Show conditionals'#010+
+  '**2*_h : Show hints                  c : Show conditionals'#010+
   '**2*_i : Show general info           d : Show debug info'#010+
-  '**2*_l : Show linenumbers            r : Rhide/GCC compatibility mode'#010+
+  '**2*_l : Show linenumbers            ','r : Rhide/GCC compatibility mod'+
+  'e'#010+
   '**2*_s : Show time stamps            q : Show message numbers'#010+
-  '**2*_a : Show every','thing             x : Executable info (Win32 only'+
-  ')'#010+
+  '**2*_a : Show everything             x : Executable info (Win32 only)'#010+
   '**2*_b : Write file names messages   p : Write tree.log with parse tre'+
   'e'#010+
-  '**2*_    with full path              v : Write fpcdebug.txt with'#010+
-  '**2*_                                    lots of deb','ugging info'#010+
+  '**2','*_    with full path              v : Write fpcdebug.txt with'#010+
+  '**2*_                                    lots of debugging info'#010+
   '**2*_m<x>,<y> : Don'#039't show messages numbered <x> and <y>'#010+
   '3*1W<x>_Target-specific options (targets)'#010+
-  'A*1W<x>_Target-specific options (targets)'#010+
+  'A*1W<x>_Target-','specific options (targets)'#010+
   'P*1W<x>_Target-specific options (targets)'#010+
   'p*1W<x>_Target-specific options (targets)'#010+
-  '3*2','Wb_Create a bundle instead of a library (Darwin)'#010+
+  '3*2Wb_Create a bundle instead of a library (Darwin)'#010+
   'P*2Wb_Create a bundle instead of a library (Darwin)'#010+
-  'p*2Wb_Create a bundle instead of a library (Darwin)'#010+
+  'p*2Wb_Create a bundle ins','tead of a library (Darwin)'#010+
   '3*2WB_Create a relocatable image (Windows)'#010+
-  'A*2WB_Create a relocatable image (Windows, S','ymbian)'#010+
+  'A*2WB_Create a relocatable image (Windows, Symbian)'#010+
   '3*2WC_Specify console type application (EMX, OS/2, Windows)'#010+
   'A*2WC_Specify console type application (Windows)'#010+
-  'P*2WC_Specify console type application (Classic Mac OS)'#010+
+  'P*2WC_Spe','cify console type application (Classic Mac OS)'#010+
   '3*2WD_Use DEFFILE to export functions of DLL or EXE (Windows)'#010+
-  'A*2WD','_Use DEFFILE to export functions of DLL or EXE (Windows)'#010+
+  'A*2WD_Use DEFFILE to export functions of DLL or EXE (Windows)'#010+
   '3*2We_Use external resources (Darwin)'#010+
-  'P*2We_Use external resources (Darwin)'#010+
+  'P*2We_Use external resources (D','arwin)'#010+
   'p*2We_Use external resources (Darwin)'#010+
   '3*2WF_Specify full-screen type application (EMX, OS/2)'#010+
-  '3*2WG_Specify ','graphic type application (EMX, OS/2, Windows)'#010+
+  '3*2WG_Specify graphic type application (EMX, OS/2, Windows)'#010+
   'A*2WG_Specify graphic type application (Windows)'#010+
-  'P*2WG_Specify graphic type application (Classic Mac OS)'#010+
+  'P*2WG_Specify graphic type appl','ication (Classic Mac OS)'#010+
   '3*2Wi_Use internal resources (Darwin)'#010+
   'P*2Wi_Use internal resources (Darwin)'#010+
-  'p*2Wi_Use int','ernal resources (Darwin)'#010+
+  'p*2Wi_Use internal resources (Darwin)'#010+
   '3*2WN_Do not generate relocation code, needed for debugging (Windows)'#010+
-  'A*2WN_Do not generate relocation code, needed for debugging (Windows)'#010+
+  'A*2WN_Do not generate relocatio','n code, needed for debugging (Windows'+
+  ')'#010+
   '3*2WR_Generate relocation code (Windows)'#010+
-  'A*2WR_Generate relocation code (Wi','ndows)'#010+
+  'A*2WR_Generate relocation code (Windows)'#010+
   'P*2WT_Specify MPW tool type application (Classic Mac OS)'#010+
   '3*2WX_Enable executable stack (Linux)'#010+
-  'A*2WX_Enable executable stack (Linux)'#010+
+  'A*2WX_Enable executable ','stack (Linux)'#010+
   'p*2WX_Enable executable stack (Linux)'#010+
   'P*2WX_Enable executable stack (Linux)'#010+
-  '**1X_Executable options:',#010+
+  '**1X_Executable options:'#010+
   '**2Xc_Pass --shared/-dynamic to the linker (BeOS, Darwin, FreeBSD, Lin'+
   'ux)'#010+
-  '**2Xd_Do not use standard library search path (needed for cross compil'+
-  'e)'#010+
+  '**2Xd_Do not use standard library search path (need','ed for cross comp'+
+  'ile)'#010+
   '**2Xe_Use external linker'#010+
-  '**2Xg_Create debuginfo in a separate file and add a debuglink sect','io'+
-  'n to executable'#010+
+  '**2Xg_Create debuginfo in a separate file and add a debuglink section '+
+  'to executable'#010+
   '**2XD_Try to link units dynamically      (defines FPC_LINK_DYNAMIC)'#010+
   '**2Xi_Use internal linker'#010+
-  '**2Xm_Generate link map'#010+
+  '**2Xm_Generate',' link map'#010+
   '**2XM<x>_Set the name of the '#039'main'#039' program routine (default i'+
   's '#039'main'#039')'#010+
-  '**2XP<x>_Prepend the binutils na','mes with the prefix <x>'#010+
+  '**2XP<x>_Prepend the binutils names with the prefix <x>'#010+
   '**2Xr<x>_Set the linker'#039's rlink-path to <x> (needed for cross comp'+
-  'ile, see the ld manual for more information) (BeOS, Linux)'#010+
+  'ile, see the ld manual for more info','rmation) (BeOS, Linux)'#010+
   '**2XR<x>_Prepend <x> to all linker search paths (BeOS, Darwin, FreeBSD'+
-  ', Linux, Mac OS, Sola','ris)'#010+
+  ', Linux, Mac OS, Solaris)'#010+
   '**2Xs_Strip all symbols from executable'#010+
   '**2XS_Try to link units statically (default, defines FPC_LINK_STATIC)'#010+
-  '**2Xt_Link with static libraries (-static is passed to linker)'#010+
-  '**2XX_Try to smartlink units             (defines FPC_LINK_SMA','RT)'#010+
+  '**2Xt_Link ','with static libraries (-static is passed to linker)'#010+
+  '**2XX_Try to smartlink units             (defines FPC_LINK_SMART)'#010+
   '**1*_'#010+
   '**1?_Show this help'#010+
   '**1h_Shows this help without waiting'

+ 9 - 9
compiler/ncgadd.pas

@@ -85,8 +85,10 @@ interface
     procedure tcgaddnode.pass_left_right;
       var
         tmpreg     : tregister;
-        isjump,
-        pushedfpu  : boolean;
+{$ifdef i386}
+        pushedfpu,
+{$endif i386}
+        isjump     : boolean;
         otl,ofl    : tasmlabel;
       begin
         { calculate the operator which is more difficult }
@@ -113,9 +115,9 @@ interface
             current_procinfo.CurrFalseLabel:=ofl;
           end;
 
+{$ifdef i386}
         { are too few registers free? }
         pushedfpu:=false;
-{$ifdef i386}
         if (left.location.loc=LOC_FPUREGISTER) and
            (node_resources_fpu(right)>=maxfpuregs) then
           begin
@@ -140,32 +142,30 @@ interface
             current_procinfo.CurrTrueLabel:=otl;
             current_procinfo.CurrFalseLabel:=ofl;
           end;
+{$ifdef i386}
         if pushedfpu then
           begin
-{$ifdef x86}
-            if use_sse(left.resultdef) then
+            if use_vectorfpu(left.resultdef) then
               begin
                 tmpreg := cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
                 cg.a_loadmm_loc_reg(current_asmdata.CurrAsmList,left.location.size,left.location,tmpreg,mms_movescalar);
                 location_freetemp(current_asmdata.CurrAsmList,left.location);
                 location_reset(left.location,LOC_MMREGISTER,left.location.size);
-                left.location.register := tmpreg;
+                left.location.register:=tmpreg;
               end
             else
-{$endif x86}
               begin
                 tmpreg := cg.getfpuregister(current_asmdata.CurrAsmList,left.location.size);
                 cg.a_loadfpu_loc_reg(current_asmdata.CurrAsmList,left.location.size,left.location,tmpreg);
                 location_freetemp(current_asmdata.CurrAsmList,left.location);
                 location_reset(left.location,LOC_FPUREGISTER,left.location.size);
                 left.location.register := tmpreg;
-{$ifdef x86}
                 { left operand is now on top of the stack, instead of the right one! }
                 if (right.location.loc=LOC_FPUREGISTER) then
                   toggleflag(nf_swapped);
-{$endif x86}
               end;
           end;
+{$endif i386}
       end;
 
 

+ 1 - 3
compiler/ncgbas.pas

@@ -393,8 +393,7 @@ interface
           begin
             if tempinfo^.typedef.typ=floatdef then
               begin
-{$ifdef x86}
-                if use_sse(tempinfo^.typedef) then
+                if use_vectorfpu(tempinfo^.typedef) then
                   begin
                     if (tempinfo^.temptype = tt_persistent) then
                       location_reset(tempinfo^.location,LOC_CMMREGISTER,def_cgsize(tempinfo^.typedef))
@@ -403,7 +402,6 @@ interface
                     tempinfo^.location.register:=cg.getmmregister(current_asmdata.CurrAsmList,tempinfo^.location.size);
                   end
                 else
-{$endif x86}
                   begin
                     if (tempinfo^.temptype = tt_persistent) then
                       location_reset(tempinfo^.location,LOC_CFPUREGISTER,def_cgsize(tempinfo^.typedef))

+ 2 - 7
compiler/ncgcal.pas

@@ -194,15 +194,10 @@ implementation
                  LOC_REFERENCE,
                  LOC_CREFERENCE,
                  LOC_MMREGISTER,
-                 LOC_CMMREGISTER:
-                   cg.a_parammm_reg(current_asmdata.CurrAsmList,left.location.size,left.location.register,tempcgpara,mms_movescalar);
-{$ifdef x86_64}
+                 LOC_CMMREGISTER,
                  LOC_REGISTER,
                  LOC_CREGISTER :
-                   begin
-                     current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MOVD,S_NO,left.location.register,tempcgpara.location^.register));
-                   end;
-{$endif x86_64}
+                   cg.a_parammm_reg(current_asmdata.CurrAsmList,left.location.size,left.location.register,tempcgpara,mms_movescalar);
                  LOC_FPUREGISTER,
                  LOC_CFPUREGISTER:
                    begin

+ 3 - 0
compiler/ncgcnv.pas

@@ -284,6 +284,9 @@ interface
              left.location.reference:=tr;
            end;
 {$endif x86}
+         { ARM VFP values are in integer registers when they are function results }
+         if (left.location.loc in [LOC_REGISTER,LOC_CREGISTER]) then
+           location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
          case left.location.loc of
             LOC_FPUREGISTER,
             LOC_CFPUREGISTER:

+ 15 - 7
compiler/ncgld.pas

@@ -784,7 +784,8 @@ implementation
                     LOC_CMMREGISTER:
                       begin
 {$ifdef x86}
-                        if not use_sse(right.resultdef) then
+                        if (right.resultdef.typ=floatdef) and
+                           not use_vectorfpu(right.resultdef) then
                           begin
                             { perform size conversion if needed (the mm-code cannot }
                             { convert an extended into a double/single, since sse   }
@@ -839,17 +840,24 @@ implementation
                     end
                   else
                     begin
-                      if left.location.loc=LOC_CMMREGISTER then
-                        cg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.location.size,left.location.size,right.location.register,left.location.register,mms_movescalar)
-                      else
-                        cg.a_loadmm_reg_ref(current_asmdata.CurrAsmList,right.location.size,left.location.size,right.location.register,left.location.reference,mms_movescalar);
+                      case left.location.loc of
+                        LOC_CMMREGISTER,
+                        LOC_MMREGISTER:
+                          cg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.location.size,left.location.size,right.location.register,left.location.register,mms_movescalar);
+                        LOC_REFERENCE,
+                        LOC_CREFERENCE:
+                          cg.a_loadmm_reg_ref(current_asmdata.CurrAsmList,right.location.size,left.location.size,right.location.register,left.location.reference,mms_movescalar);
+                        else
+                          internalerror(2009112601);
+                      end;
                     end;
                 end;
               LOC_REGISTER,
               LOC_CREGISTER :
                 begin
 {$ifndef cpu64bitalu}
-                  if left.location.size in [OS_64,OS_S64] then
+                  { also OS_F64 in case of mmreg -> intreg }
+                  if left.location.size in [OS_64,OS_S64,OS_F64] then
                     cg64.a_load64_reg_loc(current_asmdata.CurrAsmList,
                       right.location.register64,left.location)
                   else
@@ -863,7 +871,7 @@ implementation
                   if left.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER] then
                     begin
 {$ifdef x86}
-                      if not use_sse(right.resultdef) then
+                      if not use_vectorfpu(right.resultdef) then
                         begin
                           { perform size conversion if needed (the mm-code cannot convert an   }
                           { extended into a double/single, since sse doesn't support extended) }

+ 96 - 24
compiler/ncgutil.pas

@@ -731,6 +731,7 @@ implementation
       var
         reg : tregister;
         href : treference;
+        newsize : tcgsize;
       begin
         if (l.loc<>LOC_MMREGISTER)  and
            ((l.loc<>LOC_CMMREGISTER) or (not maybeconst)) then
@@ -743,8 +744,30 @@ implementation
                 location_reset_ref(l,LOC_REFERENCE,l.size,0);
                 l.reference:=href;
               end;
-            reg:=cg.getmmregister(list,l.size);
-            cg.a_loadmm_loc_reg(list,l.size,l,reg,mms_movescalar);
+{$ifndef cpu64bitalu}
+            if (l.loc in [LOC_REGISTER,LOC_CREGISTER]) and
+               (l.size in [OS_64,OS_S64]) then
+              begin
+                reg:=cg.getmmregister(list,OS_F64);
+                cg64.a_loadmm_intreg64_reg(list,OS_F64,l.register64,reg);
+                l.size:=OS_F64
+              end
+            else
+{$endif not cpu64bitalu}
+              begin
+                 { on ARM, VFP values may be located in integer registers,
+                   and its second_int_to_real() also uses this routine to
+                   force integer (memory) values into an mmregister }
+                 if (l.size in [OS_32,OS_S32]) then
+                   newsize:=OS_F32
+                 else if (l.size in [OS_64,OS_S64]) then
+                   newsize:=OS_F64
+                 else
+                   newsize:=l.size;
+                 reg:=cg.getmmregister(list,newsize);
+                 cg.a_loadmm_loc_reg(list,newsize,l,reg,mms_movescalar);
+                 l.size:=newsize;
+               end;
             location_freetemp(list,l);
             location_reset(l,LOC_MMREGISTER,l.size);
             l.register:=reg;
@@ -1505,6 +1528,11 @@ implementation
                           end;
                         LOC_CREGISTER :
                           cg.a_load_reg_reg(list,OS_32,OS_32,restmploc.register64.reglo,resloc.register64.reglo);
+                        LOC_CMMREGISTER :
+                          { perform the whole move at once below, both result
+                            registers are required (and since restmploc is an mmreg
+                            and resloc intregs, they don't conflict anyway) }
+                          ;
                         else
                           internalerror(200409203);
                       end;
@@ -1522,6 +1550,8 @@ implementation
                           end;
                         LOC_CREGISTER :
                           cg.a_load_reg_reg(list,OS_32,OS_32,restmploc.register64.reghi,resloc.register64.reghi);
+                        LOC_CMMREGISTER :
+                          cg64.a_loadmm_reg_intreg64(list,restmploc.size,restmploc.register,resloc.register64);
                         else
                           internalerror(200409204);
                       end;
@@ -1550,6 +1580,8 @@ implementation
                           end;
                         LOC_CREGISTER :
                           cg.a_load_reg_reg(list,OS_32,OS_32,restmploc.register,resloc.register);
+                        LOC_CMMREGISTER :
+                          cg.a_loadmm_reg_intreg(list,restmploc.size,resloc.size,restmploc.register,resloc.register,mms_movescalar);
                         else
                           internalerror(200409203);
                       end;
@@ -1726,27 +1758,39 @@ implementation
          end;
 
 
-       procedure gen_load_reg(const paraloc:TCGParaLocation;reg:tregister; alignment: longint);
+       procedure gen_load_reg(const paraloc:TCGParaLocation; regsize: tcgsize; reg:tregister; alignment: longint);
          var
            href : treference;
          begin
             case paraloc.loc of
               LOC_REGISTER :
-                cg.a_load_reg_reg(list,paraloc.size,paraloc.size,paraloc.register,reg);
+                begin
+                  case getregtype(reg) of
+                    R_INTREGISTER:
+                      cg.a_load_reg_reg(list,paraloc.size,regsize,paraloc.register,reg);
+                    R_MMREGISTER:
+                      cg.a_loadmm_intreg_reg(list,paraloc.size,regsize,paraloc.register,reg,mms_movescalar);
+                    else
+                      internalerror(2009112422);
+                  end;
+                end;
               LOC_MMREGISTER :
-                cg.a_loadmm_reg_reg(list,paraloc.size,paraloc.size,paraloc.register,reg,mms_movescalar);
+                cg.a_loadmm_reg_reg(list,paraloc.size,regsize,paraloc.register,reg,mms_movescalar);
               LOC_FPUREGISTER :
-                cg.a_loadfpu_reg_reg(list,paraloc.size,paraloc.size,paraloc.register,reg);
+                cg.a_loadfpu_reg_reg(list,paraloc.size,regsize,paraloc.register,reg);
               LOC_REFERENCE :
                 begin
                   reference_reset_base(href,paraloc.reference.index,paraloc.reference.offset,alignment);
                   case getregtype(reg) of
                     R_INTREGISTER :
-                      cg.a_load_ref_reg(list,paraloc.size,paraloc.size,href,reg);
+                      cg.a_load_ref_reg(list,paraloc.size,regsize,href,reg);
                     R_FPUREGISTER :
-                      cg.a_loadfpu_ref_reg(list,paraloc.size,paraloc.size,href,reg);
+                      cg.a_loadfpu_ref_reg(list,paraloc.size,regsize,href,reg);
                     R_MMREGISTER :
-                      cg.a_loadmm_ref_reg(list,paraloc.size,paraloc.size,href,reg,mms_movescalar);
+                      { not paraloc.size, because it may be OS_64 instead of
+                        OS_F64 in case the parameter is passed using integer
+                        conventions (e.g., on ARM) }
+                      cg.a_loadmm_ref_reg(list,regsize,regsize,href,reg,mms_movescalar);
                     else
                       internalerror(2004101012);
                   end;
@@ -1765,6 +1809,9 @@ implementation
 {$if defined(sparc) or defined(arm)}
         tempref  : treference;
 {$endif sparc}
+{$ifndef cpu64bitalu}
+        reg64: tregister64;
+{$endif not cpu64bitalu}
       begin
         if (po_assembler in current_procinfo.procdef.procoptions) then
           exit;
@@ -1843,9 +1890,9 @@ implementation
                                 unget_para(paraloc^);
                                 gen_alloc_regvar(list,currpara);
                                 { reg->reg, alignment is irrelevant }
-                                gen_load_reg(paraloc^,currpara.initialloc.register64.reghi,4);
+                                gen_load_reg(paraloc^,OS_32,currpara.initialloc.register64.reghi,4);
                                 unget_para(paraloc^.next^);
-                                gen_load_reg(paraloc^.next^,currpara.initialloc.register64.reglo,4);
+                                gen_load_reg(paraloc^.next^,OS_32,currpara.initialloc.register64.reglo,4);
                               end
                             else
                               begin
@@ -1853,9 +1900,9 @@ implementation
                                   paraloc^.next -> high }
                                 unget_para(paraloc^);
                                 gen_alloc_regvar(list,currpara);
-                                gen_load_reg(paraloc^,currpara.initialloc.register64.reglo,4);
+                                gen_load_reg(paraloc^,OS_32,currpara.initialloc.register64.reglo,4);
                                 unget_para(paraloc^.next^);
-                                gen_load_reg(paraloc^.next^,currpara.initialloc.register64.reghi,4);
+                                gen_load_reg(paraloc^.next^,OS_32,currpara.initialloc.register64.reghi,4);
                               end;
                           end;
                         LOC_REFERENCE:
@@ -1876,7 +1923,7 @@ implementation
                         internalerror(200410105);
                       unget_para(paraloc^);
                       gen_alloc_regvar(list,currpara);
-                      gen_load_reg(paraloc^,currpara.initialloc.register,sizeof(aint));
+                      gen_load_reg(paraloc^,currpara.initialloc.size,currpara.initialloc.register,sizeof(aint));
                     end;
                 end;
               LOC_CFPUREGISTER :
@@ -1902,22 +1949,47 @@ implementation
                   unget_para(paraloc^);
                   gen_alloc_regvar(list,currpara);
                   { from register to register -> alignment is irrelevant }
-                  gen_load_reg(paraloc^,currpara.initialloc.register,0);
+                  gen_load_reg(paraloc^,currpara.initialloc.size,currpara.initialloc.register,0);
                   if assigned(paraloc^.next) then
                     internalerror(200410109);
 {$endif sparc}
                 end;
               LOC_CMMREGISTER :
                 begin
-                  unget_para(paraloc^);
-                  gen_alloc_regvar(list,currpara);
-                  { from register to register -> alignment is irrelevant }
-                  gen_load_reg(paraloc^,currpara.initialloc.register,0);
-                  { data could come in two memory locations, for now
-                    we simply ignore the sanity check (FK)
-                  if assigned(paraloc^.next) then
-                    internalerror(200410108);
-                  }
+{$ifndef cpu64bitalu}
+                  { ARM vfp floats are passed in integer registers }
+                  if (currpara.paraloc[calleeside].size=OS_F64) and
+                     (paraloc^.size in [OS_32,OS_S32]) and
+                     use_vectorfpu(currpara.vardef) then
+                    begin
+                      { we need 2x32bit reg }
+                      if not assigned(paraloc^.next) or
+                         assigned(paraloc^.next^.next) then
+                        internalerror(2009112421);
+                      unget_para(paraloc^);
+                      unget_para(paraloc^.next^);
+                      gen_alloc_regvar(list,currpara);
+                      if (target_info.endian=endian_big) then
+                        { paraloc^ -> high
+                          paraloc^.next -> low }
+                        reg64:=joinreg64(paraloc^.next^.register,paraloc^.register)
+                      else
+                        reg64:=joinreg64(paraloc^.register,paraloc^.next^.register);
+                      cg64.a_loadmm_intreg64_reg(list,OS_F64,reg64,currpara.initialloc.register);
+                    end
+                  else
+{$endif not cpu64bitalu}
+                    begin
+                      unget_para(paraloc^);
+                      gen_alloc_regvar(list,currpara);
+                      { from register to register -> alignment is irrelevant }
+                      gen_load_reg(paraloc^,currpara.initialloc.size,currpara.initialloc.register,0);
+                      { data could come in two memory locations, for now
+                        we simply ignore the sanity check (FK)
+                      if assigned(paraloc^.next) then
+                        internalerror(200410108);
+                      }
+                    end;
                 end;
             end;
           end;

+ 4 - 1
compiler/ncnv.pas

@@ -2466,7 +2466,10 @@ implementation
 {$endif cpufpemu}
           begin
             first_real_to_real:=nil;
-            expectloc:=LOC_FPUREGISTER;
+            if not use_vectorfpu(resultdef) then
+              expectloc:=LOC_FPUREGISTER
+            else
+              expectloc:=LOC_MMREGISTER;
           end;
       end;
 

+ 10 - 1
compiler/nld.pas

@@ -584,7 +584,16 @@ implementation
                 { sse register to an extended value in memory more      }
                 { efficiently than a type conversion node, so don't     }
                 { bother implementing support for that                  }
-                and (use_sse(left.resultdef) or not(use_sse(right.resultdef)))
+                and (use_vectorfpu(left.resultdef) or not(use_vectorfpu(right.resultdef)))
+{$endif}
+
+{$ifdef arm}
+                { the assignment node code can't convert a single in
+                  an integer register to a double in an mmregister or
+                  vice versa }
+                and (use_vectorfpu(left.resultdef) and
+                     use_vectorfpu(right.resultdef) and
+                     (tfloatdef(left.resultdef).floattype=tfloatdef(right.resultdef).floattype))
 {$endif}
         then
           begin

+ 38 - 6
compiler/options.pas

@@ -34,7 +34,10 @@ Type
     FirstPass,
     ParaLogo,
     NoPressEnter,
-    LogoWritten : boolean;
+    LogoWritten,
+    FPUSetExplicitly,
+    CPUSetExplicitly,
+    OptCPUSetExplicitly: boolean;
     FileLevel : longint;
     QuickInfo : string;
     ParaIncludePath,
@@ -629,6 +632,7 @@ begin
                         s:=upper(copy(more,j+1,length(more)-j));
                         if not(SetFpuType(s,init_settings.fputype)) then
                           IllegalPara(opt);
+                        FPUSetExplicitly:=True;
                         break;
                       end;
                     'F' :
@@ -682,6 +686,7 @@ begin
                         s:=upper(copy(more,j+1,length(more)-j));
                         if not(Setcputype(s,init_settings.cputype)) then
                           IllegalPara(opt);
+                        CPUSetExplicitly:=true;
                         break;
                       end;
                     'P':
@@ -1140,6 +1145,7 @@ begin
                       begin
                         if not Setcputype(copy(more,j+1,length(more)),init_settings.optimizecputype) then
                           begin
+                            OptCPUSetExplicitly:=true;
                             { Give warning for old i386 switches }
                             if (Length(More)-j=1) and
                                (More[j+1]>='1') and (More[j+1]<='5')then
@@ -2240,6 +2246,13 @@ begin
           ForceStaticLinking;
         end;
     end;
+{$ifdef arm}
+  if (init_settings.cputype in [cpu_armv7m,cpu_cortexm3]) and
+     (init_settings.fputype in [fpu_vfpv2,fpu_vfpv3]) then
+    begin
+      Writeln(
+    end;
+{$endif arm}
 end;
 
 
@@ -2248,6 +2261,9 @@ begin
   LogoWritten:=false;
   NoPressEnter:=false;
   FirstPass:=false;
+  FPUSetExplicitly:=false;
+  CPUSetExplicitly:=false;
+  OptCPUSetExplicitly:=false;
   FileLevel:=0;
   Quickinfo:='';
   ParaIncludePath:=TSearchPathList.Create;
@@ -2698,12 +2714,17 @@ begin
      not(cs_link_separate_dbg_file in init_settings.globalswitches) then
     exclude(init_settings.globalswitches,cs_link_strip);
 
-  { force fpu emulation on arm/wince, arm/gba, arm/embedded and arm/nds}
-  if (target_info.system in [system_arm_wince,system_arm_gba,system_m68k_amiga,
-    system_m68k_linux,system_arm_nds,system_arm_darwin,system_arm_embedded])
+  { force fpu emulation on arm/wince, arm/gba, arm/embedded, arm/nds and
+    arm/darwin if fpu type not explicitly set }
+  if not(option.FPUSetExplicitly) and
+     ((target_info.system in [system_arm_wince,system_arm_gba,system_m68k_amiga,
+         system_m68k_linux,system_arm_nds,system_arm_embedded,system_arm_darwin])
+{$ifdef arm}
+      or (target_info.abi=abi_eabi)
+{$endif arm}
+     )
 {$ifdef arm}
-    or (init_settings.fputype=fpu_soft)
-    or (target_info.abi=abi_eabi)
+     or (init_settings.fputype=fpu_soft)
 {$endif arm}
   then
     begin
@@ -2714,6 +2735,17 @@ begin
 {$endif cpufpemu}
     end;
 
+{$ifdef arm}
+{ set default cpu type to ARMv6 for Darwin unless specified otherwise }
+if (target_info.system=system_arm_darwin) then
+  begin
+    if not option.CPUSetExplicitly then
+      init_settings.cputype:=cpu_armv6;
+    if not option.OptCPUSetExplicitly then
+      init_settings.optimizecputype:=cpu_armv6;
+  end;
+{$endif arm}
+
   { now we can define cpu and fpu type }
   def_system_macro('CPU'+Cputypestr[init_settings.cputype]);
 

+ 2 - 2
compiler/rautils.pas

@@ -85,7 +85,7 @@ type
       OPR_COND      : (cond : tasmcond);
 {$endif POWERPC64}
 {$ifdef arm}
-      OPR_REGSET    : (regset : tcpuregisterset);
+      OPR_REGSET    : (regset : tcpuregisterset; regtype: tregistertype; subreg: tsubregister);
       OPR_SHIFTEROP : (shifterop : tshifterop);
       OPR_COND      : (cc : tasmcond);
 {$endif arm}
@@ -1060,7 +1060,7 @@ end;
                 ai.loadref(i-1,ref);
 {$ifdef ARM}
               OPR_REGSET:
-                ai.loadregset(i-1,regset);
+                ai.loadregset(i-1,regtype,subreg,regset);
               OPR_SHIFTEROP:
                 ai.loadshifterop(i-1,shifterop);
               OPR_COND:

+ 7 - 2
compiler/rgobj.pas

@@ -1740,8 +1740,13 @@ unit rgobj;
               {Get a temp for the spilled register, the size must at least equal a complete register,
                take also care of the fact that subreg can be larger than a single register like doubles
                that occupy 2 registers }
-              size:=max(tcgsize2size[reg_cgsize(newreg(regtype,t,R_SUBWHOLE))],
-                             tcgsize2size[reg_cgsize(newreg(regtype,t,reginfo[t].subreg))]);
+              { only force the whole register in case of integers. Storing a register that contains
+                a single precision value as a double can cause conversion errors on e.g. ARM VFP }
+              if (regtype=R_INTREGISTER) then
+                size:=max(tcgsize2size[reg_cgsize(newreg(regtype,t,R_SUBWHOLE))],
+                               tcgsize2size[reg_cgsize(newreg(regtype,t,reginfo[t].subreg))])
+              else
+                size:=tcgsize2size[reg_cgsize(newreg(regtype,t,reginfo[t].subreg))];
               tg.gettemp(templist,
                          size,size,
                          tt_noreuse,spill_temps^[t]);

+ 14 - 9
compiler/symdef.pas

@@ -751,9 +751,7 @@ interface
 
     procedure loadobjctypes;
 
-{$ifdef x86}
-    function use_sse(def : tdef) : boolean;
-{$endif x86}
+    function use_vectorfpu(def : tdef) : boolean;
 
 implementation
 
@@ -1160,7 +1158,7 @@ implementation
    function tstoreddef.is_fpuregable : boolean;
      begin
 {$ifdef x86}
-       result:=use_sse(self);
+       result:=use_vectorfpu(self);
 {$else x86}
        result:=(typ=floatdef) and not(cs_fp_emulation in current_settings.moduleswitches);
 {$endif x86}
@@ -5220,13 +5218,20 @@ implementation
       end;
 
 
-{$ifdef x86}
-
-    function use_sse(def : tdef) : boolean;
+    function use_vectorfpu(def : tdef) : boolean;
       begin
-        use_sse:=(is_single(def) and (current_settings.fputype in sse_singlescalar)) or
+{$ifdef x86}
+{$define use_vectorfpuimplemented}
+        use_vectorfpu:=(is_single(def) and (current_settings.fputype in sse_singlescalar)) or
           (is_double(def) and (current_settings.fputype in sse_doublescalar));
-      end;
 {$endif x86}
+{$ifdef arm}
+{$define use_vectorfpuimplemented}
+        use_vectorfpu:=(current_settings.fputype in vfp_scalar);
+{$endif arm}
+{$ifndef use_vectorfpuimplemented}
+        use_vectorfpu:=false;
+{$endif}
+      end;
 
 end.

+ 1 - 4
compiler/symsym.pas

@@ -1136,13 +1136,10 @@ implementation
                  ) and }
                  tstoreddef(vardef).is_fpuregable then
                  begin
-{$ifdef x86}
-                   if use_sse(vardef) then
+                   if use_vectorfpu(vardef) then
                      varregable:=vr_mmreg
                    else
-{$else x86}
                      varregable:=vr_fpureg;
-{$endif x86}
                  end;
           end;
       end;

+ 1 - 1
compiler/systems/t_bsd.pas

@@ -333,7 +333,7 @@ begin
             system_x86_64_darwin:
               LinkRes.Add('x86_64');
             system_arm_darwin:
-              LinkRes.Add('arm');
+              LinkRes.Add(lower(cputypestr[current_settings.cputype]));
           end;
       end;
   end;

+ 4 - 1
compiler/utils/mkarmreg.pp

@@ -24,6 +24,7 @@ var s : string;
     regcount_bsstart:byte;
     names,
     regtypes,
+    subtypes,
     supregs,
     numbers,
     stdnames,
@@ -173,6 +174,8 @@ begin
         readcomma;
         regtypes[regcount]:=readstr;
         readcomma;
+        subtypes[regcount]:=readstr;
+        readcomma;
         supregs[regcount]:=readstr;
         readcomma;
         stdnames[regcount]:=readstr;
@@ -187,7 +190,7 @@ begin
             writeln('Line: "',s,'"');
             halt(1);
           end;
-        numbers[regcount]:=regtypes[regcount]+'0000'+copy(supregs[regcount],2,255);
+        numbers[regcount]:=regtypes[regcount]+copy(subtypes[regcount],2,255)+'00'+copy(supregs[regcount],2,255);
         if i<length(s) then
           begin
             writeln('Extra chars at end of line, at line ',line);

+ 2 - 2
compiler/x86/nx86add.pas

@@ -834,7 +834,7 @@ unit nx86add;
       var
         op : TAsmOp;
       begin
-        if use_sse(resultdef) then
+        if use_vectorfpu(resultdef) then
           begin
             second_addfloatsse;
             exit;
@@ -878,7 +878,7 @@ unit nx86add;
       var
         resflags   : tresflags;
       begin
-        if use_sse(left.resultdef) or use_sse(right.resultdef) then
+        if use_vectorfpu(left.resultdef) or use_vectorfpu(right.resultdef) then
           begin
             second_cmpfloatsse;
             exit;

+ 3 - 3
compiler/x86/nx86cnv.pas

@@ -76,7 +76,7 @@ implementation
             (tfloatdef(left.resultdef).floattype<>s64comp) and
             not (nf_explicit in flags) then
            CGMessage(type_w_convert_real_2_comp);
-         if use_sse(resultdef) then
+         if use_vectorfpu(resultdef) then
            expectloc:=LOC_MMREGISTER
          else
            expectloc:=LOC_FPUREGISTER;
@@ -224,7 +224,7 @@ implementation
             firstpass(left)
           end;
 
-        if use_sse(resultdef) and
+        if use_vectorfpu(resultdef) and
            (torddef(left.resultdef).ordtype = s32bit) then
           expectloc:=LOC_MMREGISTER
         else
@@ -243,7 +243,7 @@ implementation
       begin
         if not(left.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_REFERENCE,LOC_CREFERENCE]) then
           location_force_reg(current_asmdata.CurrAsmList,left.location,left.location.size,false);
-        if use_sse(resultdef) and
+        if use_vectorfpu(resultdef) and
 {$ifdef cpu64bitalu}
            (torddef(left.resultdef).ordtype in [s32bit,s64bit]) then
 {$else cpu64bitalu}

+ 3 - 3
compiler/x86/nx86con.pas

@@ -50,7 +50,7 @@ implementation
     function tx86realconstnode.pass_1 : tnode;
       begin
          result:=nil;
-         if is_number_float(value_real) and not(use_sse(resultdef)) and (value_real=1.0) or (value_real=0.0) then
+         if is_number_float(value_real) and not(use_vectorfpu(resultdef)) and (value_real=1.0) or (value_real=0.0) then
            expectloc:=LOC_FPUREGISTER
          else
            expectloc:=LOC_CREFERENCE;
@@ -61,14 +61,14 @@ implementation
       begin
          if is_number_float(value_real) then
            begin
-             if (value_real=1.0) and not(use_sse(resultdef)) then
+             if (value_real=1.0) and not(use_vectorfpu(resultdef)) then
                begin
                   emit_none(A_FLD1,S_NO);
                   location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
                   location.register:=NR_ST;
                   tcgx86(cg).inc_fpu_stack;
                end
-             else if (value_real=0.0) and not(use_sse(resultdef)) then
+             else if (value_real=0.0) and not(use_vectorfpu(resultdef)) then
                begin
                   emit_none(A_FLDZ,S_NO);
                   if (get_real_sign(value_real) < 0) then

+ 9 - 9
compiler/x86/nx86inl.pas

@@ -100,7 +100,7 @@ implementation
 
      function tx86inlinenode.first_abs_real : tnode;
        begin
-         if use_sse(resultdef) then
+         if use_vectorfpu(resultdef) then
            expectloc:=LOC_MMREGISTER
          else
            expectloc:=LOC_FPUREGISTER;
@@ -141,7 +141,7 @@ implementation
      function tx86inlinenode.first_round_real : tnode;
       begin
 {$ifdef x86_64}
-        if use_sse(left.resultdef) then
+        if use_vectorfpu(left.resultdef) then
           expectloc:=LOC_REGISTER
         else
 {$endif x86_64}
@@ -154,14 +154,14 @@ implementation
        begin
          if (cs_opt_size in current_settings.optimizerswitches)
 {$ifdef x86_64}
-           and not(use_sse(left.resultdef))
+           and not(use_vectorfpu(left.resultdef))
 {$endif x86_64}
            then
            result:=inherited
          else
            begin
 {$ifdef x86_64}
-             if use_sse(left.resultdef) then
+             if use_vectorfpu(left.resultdef) then
                expectloc:=LOC_REGISTER
              else
 {$endif x86_64}
@@ -222,7 +222,7 @@ implementation
        var
          href : treference;
        begin
-         if use_sse(resultdef) then
+         if use_vectorfpu(resultdef) then
            begin
              secondpass(left);
              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
@@ -249,7 +249,7 @@ implementation
      procedure tx86inlinenode.second_round_real;
        begin
 {$ifdef x86_64}
-         if use_sse(left.resultdef) then
+         if use_vectorfpu(left.resultdef) then
            begin
              secondpass(left);
              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
@@ -281,7 +281,7 @@ implementation
          oldcw,newcw : treference;
        begin
 {$ifdef x86_64}
-         if use_sse(left.resultdef) and
+         if use_vectorfpu(left.resultdef) and
            not((left.location.loc=LOC_FPUREGISTER) and (current_settings.fputype>=fpu_sse3)) then
            begin
              secondpass(left);
@@ -331,7 +331,7 @@ implementation
      procedure tx86inlinenode.second_sqr_real;
 
        begin
-         if use_sse(resultdef) then
+         if use_vectorfpu(resultdef) then
            begin
              secondpass(left);
              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
@@ -348,7 +348,7 @@ implementation
 
      procedure tx86inlinenode.second_sqrt_real;
        begin
-         if use_sse(resultdef) then
+         if use_vectorfpu(resultdef) then
            begin
              secondpass(left);
              location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);

+ 1 - 1
compiler/x86/nx86mat.pas

@@ -71,7 +71,7 @@ interface
 
          if (left.resultdef.typ=floatdef) then
            begin
-             if use_sse(left.resultdef) then
+             if use_vectorfpu(left.resultdef) then
                expectloc:=LOC_MMREGISTER
              else
                expectloc:=LOC_FPUREGISTER;

+ 34 - 1
compiler/x86_64/cgcpu.pas

@@ -41,6 +41,9 @@ unit cgcpu;
         procedure g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);override;
 
         procedure a_param_ref(list : TAsmList;size : tcgsize;const r : treference;const paraloc : TCGPara);override;
+
+        procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize : tcgsize;intreg, mmreg: tregister; shuffle: pmmshuffle); override;
+        procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize : tcgsize;mmreg, intreg: tregister;shuffle : pmmshuffle); override;
       end;
 
     procedure create_codegen;
@@ -246,7 +249,37 @@ unit cgcpu;
         List.concat(Tai_symbol_end.Createname(labelname));
       end;
 
-      
+
+    { Appends a register-to-register A_MOVD from intreg to mmreg to list.
+      Only a raw bit transfer of a 64 bit integer into a double is accepted:
+      tosize must be OS_F64, fromsize must be OS_64 or OS_S64 and shuffle,
+      if assigned, must be a scalar shuffle. Anything else triggers an
+      internalerror. }
+    procedure tcgx86_64.a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize : tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle);
+      begin
+        { this code can only be used to transfer raw data, not to perform
+          conversions }
+        if (tosize<>OS_F64) then
+          internalerror(2009112505);
+        if not(fromsize in [OS_64,OS_S64]) then
+          internalerror(2009112506);
+        { a nil shuffle is accepted, a non-scalar one is not }
+        if assigned(shuffle) and
+           not shufflescalar(shuffle) then
+          internalerror(2009112517);
+        list.concat(taicpu.op_reg_reg(A_MOVD,S_NO,intreg,mmreg));
+      end;
+
+
+    { Appends a register-to-register A_MOVD from mmreg to intreg to list.
+      Only a raw bit transfer of a double into a 64 bit integer is accepted:
+      fromsize must be OS_F64, tosize must be OS_64 or OS_S64 and shuffle,
+      if assigned, must be a scalar shuffle. Anything else triggers an
+      internalerror. }
+    procedure tcgx86_64.a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize : tcgsize; mmreg, intreg: tregister;shuffle : pmmshuffle);
+      begin
+        { this code can only be used to transfer raw data, not to perform
+          conversions }
+        if (fromsize<>OS_F64) then
+          internalerror(2009112507);
+        if not(tosize in [OS_64,OS_S64]) then
+          internalerror(2009112408);
+        { a nil shuffle is accepted, a non-scalar one is not }
+        if assigned(shuffle) and
+           not shufflescalar(shuffle) then
+          internalerror(2009112515);
+        list.concat(taicpu.op_reg_reg(A_MOVD,S_NO,mmreg,intreg));
+      end;
+
+
     procedure create_codegen;
       begin
         cg:=tcgx86_64.create;

+ 3 - 3
compiler/x86_64/nx64cnv.pas

@@ -72,7 +72,7 @@ implementation
     function tx8664typeconvnode.first_int_to_real : tnode;
       begin
         result:=nil;
-        if use_sse(resultdef) and
+        if use_vectorfpu(resultdef) and
            (torddef(left.resultdef).ordtype=u32bit) then
           begin
             inserttypeconv(left,s64inttype);
@@ -80,7 +80,7 @@ implementation
           end
         else
           result:=inherited first_int_to_real;
-       if use_sse(resultdef) then
+       if use_vectorfpu(resultdef) then
          expectloc:=LOC_MMREGISTER;
       end;
 
@@ -91,7 +91,7 @@ implementation
          l1,l2 : tasmlabel;
          op : tasmop;
       begin
-        if use_sse(resultdef) then
+        if use_vectorfpu(resultdef) then
           begin
             if is_double(resultdef) then
               op:=A_CVTSI2SD

+ 83 - 0
rtl/arm/arm.inc

@@ -28,7 +28,9 @@ const
 {$endif FPC_SYSTEM_FPC_MOVE}
 
 {$if not(defined(wince)) and not(defined(gba)) and not(defined(nds)) and not(defined(FPUSOFT)) and not(defined(FPULIBGCC))}
+
 {$define FPC_SYSTEM_HAS_SYSINITFPU}
+{$if not defined(darwin) and not defined(FPUVFPV2) and not defined(FPUVFPV3)}
 Procedure SysInitFPU;{$ifdef SYSTEMINLINE}inline;{$endif}
 begin
   { Enable FPU exceptions, but disable INEXACT, UNDERFLOW, DENORMAL }
@@ -39,6 +41,30 @@ begin
     wfs r0
   end;
 end;
+{$else}
+{ VFP variant of SysInitFPU: rewrites the VFP control/status word (FPSCR).
+  It selects "round to nearest", clears the pending exception flags and
+  clears the exception enable bits. On non-Darwin targets it then turns
+  flush-to-zero mode off and enables the invalid operation, div-by-zero and
+  overflow exceptions (the others stay disabled). On Darwin no exception is
+  enabled and the flush-to-zero bit is left as it was read. }
+Procedure SysInitFPU;{$ifdef SYSTEMINLINE}inline;{$endif}
+begin
+  { Enable FPU exceptions (not on Darwin), but disable INEXACT, UNDERFLOW, DENORMAL }
+  asm
+    fmrx r0,fpscr
+    // set "round to nearest" mode (clears bits 22-23)
+    and  r0,r0,#0xff3fffff
+    // clear the "exception happened" flags (clears bits 0-4, 6 and 7)
+    and  r0,r0,#0xffffff20
+    // disable all exceptions (clears the enable bits 8-13 and 15)
+    and  r0,r0,#0xffff40ff    
+{$ifndef darwin}
+    // Floating point exceptions cause kernel panics on iPhoneOS 2.2.1...
+
+    // disable flush-to-zero mode (IEEE math compliant) (clears bit 24)
+    and  r0,r0,#0xfeffffff
+    // enable invalid operation, div-by-zero and overflow exceptions (sets bits 8-10)
+    orr  r0,r0,#0x00000700
+{$endif}
+    fmxr fpscr,r0
+  end;
+end;
+{$endif}
 {$endif}
 
 procedure fpc_cpuinit;
@@ -92,6 +118,7 @@ function get_caller_frame(framebp:pointer):pointer;assembler;
 asm
   movs r0,r0
   beq .Lgnf_null
+  // see comments in arm/cgcpu.pas, g_proc_entry
   ldr r0,[r0,#-12]
 .Lgnf_null:
 end;
@@ -477,6 +504,16 @@ var
 
 function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
 asm
+{$if defined(cpuarmv6) or defined(cpuarmv7m) or defined(cpucortexm3)}
+.Lloop:
+  ldrex r1, [r0]
+  sub   r1, r1, #1
+  strex r2, r1, [r0]
+  cmp r2, #0
+  bne .Lloop
+  mov r0, r1
+  mov pc, lr
+{$else}
 // lock
   ldr r3, .Lfpc_system_lock
   mov r1, #1
@@ -495,11 +532,22 @@ asm
 
 .Lfpc_system_lock:
   .long fpc_system_lock
+{$endif}
 end;
 
 
 function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
 asm
+{$if defined(cpuarmv6) or defined(cpuarmv7m) or defined(cpucortexm3)}
+.Lloop:
+  ldrex r1, [r0]
+  add   r1, r1, #1
+  strex r2, r1, [r0]
+  cmp r2, #0
+  bne .Lloop
+  mov r0, r1
+  mov pc, lr
+{$else}
 // lock
   ldr r3, .Lfpc_system_lock
   mov r1, #1
@@ -518,17 +566,39 @@ asm
 
 .Lfpc_system_lock:
   .long fpc_system_lock
+{$endif}
 end;
 
 
+{ Stores a new value at the address held in r0 and returns the value that
+  was there before in r0. As used by the code below, r0 holds the address
+  that is loaded from and stored to, and r1 the value that gets stored
+  (presumably Target and Source respectively -- confirm against the
+  calling convention). }
 function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
 asm
+{$if defined(cpuarmv6) or defined(cpuarmv7m) or defined(cpucortexm3)}
+// swp is deprecated on ARMv6 and above
+.Lloop:
+  // r2 = old value at [r0]
+  ldrex r2, [r0]
+  // try to store r1; r3 receives the status of the store
+  strex r3, r1, [r0]
+  // retry as long as the store did not report success (r3 <> 0)
+  cmp r3, #0
+  bne .Lloop
+  // return the old value
+  mov r0, r2
+  mov pc, lr
+{$else}
   swp r1, r1, [r0]
   mov r0,r1
+{$endif}
 end;
 
 function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
 asm
+{$if defined(cpuarmv6) or defined(cpuarmv7m) or defined(cpucortexm3)}
+.Lloop:
+  ldrex r2, [r0]
+  add   r12, r1, r2
+  strex r3, r12, [r0]
+  cmp r3, #0
+  bne .Lloop
+  mov  r0, r2
+  mov pc, lr
+{$else}
 // lock
   ldr r3, .Lfpc_system_lock
   mov r2, #1
@@ -548,11 +618,23 @@ asm
 
 .Lfpc_system_lock:
   .long fpc_system_lock
+{$endif}
 end;
 
 
 function InterlockedCompareExchange(var Target: longint; NewValue: longint; Comperand: longint): longint; assembler; nostackframe;
 asm
+{$if defined(cpuarmv6) or defined(cpuarmv7m) or defined(cpucortexm3)}
+.Lloop:
+  ldrex    r3, [r0]
+  mov      r12, #0
+  cmp      r3, r2
+  strexeq  r12, r1, [r0]
+  cmp      r12, #0
+  bne      .Lloop
+  mov      r0, r3
+  mov      pc, lr
+{$else}
 // lock
   ldr r12, .Lfpc_system_lock
   mov r3, #1
@@ -572,6 +654,7 @@ asm
 
 .Lfpc_system_lock:
   .long fpc_system_lock
+{$endif}
 end;
 
 {$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}

+ 1 - 1
rtl/arm/math.inc

@@ -14,7 +14,7 @@
 
  **********************************************************************}
 
-{$if defined(FPUFPA) or defined(FPUFPA10) or defined(FPUFPA11)}
+{$if defined(FPUFPA) or defined(FPUFPA10) or defined(FPUFPA11) or defined(FPUVFPV2) or defined(FPUVFPV3)}
     {$define FPC_SYSTEM_HAS_ABS}
     function fpc_abs_real(d : ValReal) : ValReal;compilerproc;
     begin

+ 194 - 5
rtl/arm/mathu.inc

@@ -46,7 +46,7 @@ begin
       include(result,exPrecision);
 end;
 
-{$ifdef wince}
+{$if defined(wince)}
 
 const
   _DN_SAVE  = $00000000;
@@ -177,7 +177,167 @@ procedure ClearExceptions(RaisePending: Boolean =true);
 begin
 end;
 
-{$else wince}
+{$elseif defined(darwin) or defined(FPUVFPV2) or defined(FPUVFPV3)}
+
+{ Bit positions in the VFP control word as read/written by VFP_GetCW and
+  VFP_SetCW. }
+const
+  { exception bits tested and changed by Get/SetExceptionMask }
+  _VFP_MASK_IM  =  1 shl 8;         { invalid operation      }
+  _VFP_MASK_ZM  =  1 shl 9;         { divide by zero         }
+  _VFP_MASK_OM  =  1 shl 10;        { overflow               }
+  _VFP_MASK_UM  =  1 shl 11;        { underflow              }
+  _VFP_MASK_PM  =  1 shl 12;        { inexact                }
+  _VFP_MASK_DM  =  1 shl 15;        { denormalized operation }
+  _VFP_MASK_ALL =  _VFP_MASK_IM or
+                   _VFP_MASK_ZM or
+                   _VFP_MASK_OM or
+                   _VFP_MASK_UM or
+                   _VFP_MASK_PM or
+                   _VFP_MASK_DM;    { mask for all flags     }
+                   
+  { two bit rounding mode field, located at bits 22-23 }
+  _VFP_ROUNDINGMODE_MASK_SHIFT = 22;
+  _VFP_ROUNDINGMODE_MASK = 3 shl _VFP_ROUNDINGMODE_MASK_SHIFT;
+
+  { bits 0-4 and 7: the bits cleared by ClearExceptions }
+  _VFP_EXCEPTIONS_PENDING_MASK =
+    (1 shl 0) or
+    (1 shl 1) or
+    (1 shl 2) or
+    (1 shl 3) or
+    (1 shl 4) or
+    (1 shl 7);
+
+{ Reads the VFP control/status register (fpscr) into r0, which is what the
+  function hands back to its caller. }
+function VFP_GetCW : dword; nostackframe; assembler;
+  asm
+    fmrx r0,fpscr
+  end;
+
+
+{ Writes r0 to the VFP control/status register (fpscr); r0 is presumably
+  where the cw parameter arrives -- confirm against the calling convention. }
+procedure VFP_SetCW(cw : dword); nostackframe; assembler;
+  asm
+    fmxr fpscr,r0
+  end;
+
+
+{ Returns the rounding mode currently selected in the VFP control word.
+  The two bit rounding mode field is extracted with _VFP_ROUNDINGMODE_MASK
+  and _VFP_ROUNDINGMODE_MASK_SHIFT and decoded as
+  0 = rmNearest, 1 = rmUp, 2 = rmDown, 3 = rmTruncate. }
+function GetRoundMode: TFPURoundingMode;
+  begin
+    { the masked and shifted field can only be 0..3, so all cases are covered }
+    case (VFP_GetCW and _VFP_ROUNDINGMODE_MASK) shr _VFP_ROUNDINGMODE_MASK_SHIFT of
+      0 : result := rmNearest;
+      1 : result := rmUp;
+      2 : result := rmDown;
+      3 : result := rmTruncate;
+    end;
+  end;
+
+
+{ Selects RoundMode both in the VFP control word and in
+  softfloat_rounding_mode.
+  Returns the RoundMode that was passed in (not the previously active
+  mode). }
+function SetRoundMode(const RoundMode: TFPURoundingMode): TFPURoundingMode;
+  var
+    mode: dword;
+  begin
+    { translate RoundMode into the value of the two bit rounding mode field
+      and keep the softfloat rounding mode in sync }
+    case (RoundMode) of
+      rmNearest :
+        begin
+          mode := 0;
+          softfloat_rounding_mode := float_round_nearest_even;
+        end;
+      rmUp :
+        begin
+          mode := 1;
+          softfloat_rounding_mode := float_round_up;
+        end;
+      rmDown :
+        begin
+          mode := 2;
+          softfloat_rounding_mode := float_round_down;
+        end;
+      rmTruncate :
+        begin
+          mode := 3;
+          softfloat_rounding_mode := float_round_to_zero;
+        end;
+    end;
+    { move the field value into place and replace only the rounding mode
+      bits of the current control word }
+    mode:=mode shl _VFP_ROUNDINGMODE_MASK_SHIFT;
+    VFP_SetCW((VFP_GetCW and (not _VFP_ROUNDINGMODE_MASK)) or mode);
+    result := RoundMode;
+  end;
+
+
+{ Always reports pmDouble; no hardware state is read. }
+function GetPrecisionMode: TFPUPrecisionMode;
+  begin
+    result := pmDouble;
+  end;
+
+
+{ Ignores Precision, changes nothing and always returns pmDouble. }
+function SetPrecisionMode(const Precision: TFPUPrecisionMode): TFPUPrecisionMode;
+  begin
+    { nothing to do, not supported }
+    result := pmDouble;
+  end;
+
+
+{ Builds the set of masked exceptions from the VFP control word: an
+  exception is included in the result when its _VFP_MASK_* bit is clear. }
+function GetExceptionMask: TFPUExceptionMask;
+  var
+    cw : dword;
+  begin
+    Result:=[];
+    cw:=VFP_GetCW;
+
+    { bit clear -> the exception is reported as masked }
+    if (cw and _VFP_MASK_IM)=0 then
+      include(Result,exInvalidOp);
+
+    if (cw and _VFP_MASK_DM)=0 then
+      include(Result,exDenormalized);
+
+    if (cw and _VFP_MASK_ZM)=0 then
+      include(Result,exZeroDivide);
+
+    if (cw and _VFP_MASK_OM)=0 then
+      include(Result,exOverflow);
+
+    if (cw and _VFP_MASK_UM)=0 then
+      include(Result,exUnderflow);
+
+    if (cw and _VFP_MASK_PM)=0 then
+      include(Result,exPrecision);
+  end;
+
+
+{ Applies Mask to the VFP control word and to softfloat_exception_mask.
+  All _VFP_MASK_* bits are first set, then the bit of every exception that
+  is a member of Mask is cleared again.
+  Returns the Mask that was passed in (not the previously active mask). }
+function SetExceptionMask(const Mask: TFPUExceptionMask): TFPUExceptionMask;
+  var
+    cw : dword;
+  begin
+    { start from the current control word with every exception bit set }
+    cw:=VFP_GetCW or _VFP_MASK_ALL;
+
+    { clear the bit of each exception that has to be masked }
+    if exInvalidOp in Mask then
+      cw:=cw and not(_VFP_MASK_IM);
+
+    if exDenormalized in Mask then
+      cw:=cw and not(_VFP_MASK_DM);
+
+    if exZeroDivide in Mask then
+      cw:=cw and not(_VFP_MASK_ZM);
+
+    if exOverflow in Mask then
+      cw:=cw and not(_VFP_MASK_OM);
+
+    if exUnderflow in Mask then
+      cw:=cw and not(_VFP_MASK_UM);
+
+    if exPrecision in Mask then
+      cw:=cw and not(_VFP_MASK_PM);
+    VFP_SetCW(cw);
+    result:=Mask;
+
+    { keep the softfloat exception mask in sync }
+    softfloat_exception_mask:=FPUExceptionMaskToSoftFloatMask(Mask);
+  end;
+
+
+{ Clears the bits of _VFP_EXCEPTIONS_PENDING_MASK in the VFP control word
+  and leaves all other bits untouched. RaisePending is not used. }
+procedure ClearExceptions(RaisePending: Boolean =true);
+  begin
+    { RaisePending has no effect on ARM, it always raises them at the correct location }
+    VFP_SetCW(VFP_GetCW and (not _VFP_EXCEPTIONS_PENDING_MASK));
+  end;
+
+{$else wince/darwin/vfpv2/vfpv3}
 
 {*****************************************************************************
                                    FPA code
@@ -281,25 +441,53 @@ procedure FPU_SetCW(cw : dword); nostackframe; assembler;
 
 function GetRoundMode: TFPURoundingMode;
   begin
-    { does not apply }
+    case softfloat_rounding_mode of
+      float_round_nearest_even:
+        GetRoundMode:=rmNearest;
+      float_round_up:
+        GetRoundMode:=rmUp;
+      float_round_down:
+        GetRoundMode:=rmDown;
+      float_round_to_zero:
+        GetRoundMode:=rmTruncate;
+    end;
   end;
 
 
 function SetRoundMode(const RoundMode: TFPURoundingMode): TFPURoundingMode;
   begin
-    { does not apply }
+    case (RoundMode) of
+      rmNearest :
+        begin
+          softfloat_rounding_mode := float_round_nearest_even;
+        end;
+      rmUp :
+        begin
+          softfloat_rounding_mode := float_round_up;
+        end;
+      rmDown :
+        begin
+          softfloat_rounding_mode := float_round_down;
+        end;
+      rmTruncate :
+        begin
+          softfloat_rounding_mode := float_round_to_zero;
+        end;
+    end;
+    SetRoundMode:=RoundMode;
   end;
 
 
 function GetPrecisionMode: TFPUPrecisionMode;
   begin
-    { does not apply }
+    result := pmDouble;
   end;
 
 
 function SetPrecisionMode(const Precision: TFPUPrecisionMode): TFPUPrecisionMode;
   begin
     { does not apply }
+    result := pmDouble;
   end;
 
 
@@ -362,6 +550,7 @@ function SetExceptionMask(const Mask: TFPUExceptionMask): TFPUExceptionMask;
     FPU_SetCW(cw);
 {$endif}
     softfloat_exception_mask:=FPUExceptionMaskToSoftFloatMask(Mask);
+    Result:=Mask;
   end;
 
 

+ 21 - 3
rtl/arm/setjump.inc

@@ -16,6 +16,14 @@
 
 function setjmp(var S : jmp_buf) : longint;assembler;[Public, alias : 'FPC_SETJMP'];nostackframe;
   asm
+    {$if defined(FPUVFPV2) or defined(FPUVFPV3)}
+    {$if defined(CPUARMV3) or defined(CPUARMV4) or defined(CPUARMV5)}
+    fstmiax r0!, {d8-d15}
+    {$else}
+    fstmiad r0!, {d8-d15}
+    {$endif}
+    {$endif}
+
     {$if defined(CPUCORTEXM3) or defined(CPUARMV7M)}
     stmia	r0!, {v1-v6, sl, fp}
     mov	   r2, sp
@@ -38,14 +46,24 @@ procedure longjmp(var S : jmp_buf;value : longint);assembler;[Public, alias : 'F
     movs    r0, r1
     it eq
     moveq   r0, #1
+    {$if defined(FPUVFPV2) or defined(FPUVFPV3)}
+    fldmiad ip!, {d8-d15}
+    {$endif}
     ldmia   ip,{v1-v6, sl, fp}
-    ldr		 sp, [ip]
-    add		 ip, ip, #4
-    ldr		 pc, [ip]
+    ldr     sp, [ip]
+    add     ip, ip, #4
+    ldr     pc, [ip]
     {$else}
     mov     ip, r0
     movs    r0, r1
     moveq   r0, #1
+    {$if defined(FPUVFPV2) or defined(FPUVFPV3)}
+    {$if defined(CPUARMV3) or defined(CPUARMV4) or defined(CPUARMV5)}
+    fldmiax ip!, {d8-d15}
+    {$else}
+    fldmiad ip!, {d8-d15}
+    {$endif}
+    {$endif}
     ldmia   ip,{v1-v6, sl, fp, sp, pc}
     {$endif}
   end;

+ 3 - 0
rtl/arm/setjumph.inc

@@ -16,6 +16,9 @@
 
 type
    jmp_buf = packed record
+{$if defined(FPUVFPV2) or defined(FPUVFPV3)}
+      { callee-saved VFP registers, stored first by setjmp }
+      d8,d9,d10,d11,d12,d13,d14,d15: double;
+{$if defined(CPUARMV3) or defined(CPUARMV4) or defined(CPUARMV5)}
+      { on these cpus setjmp/longjmp use fstmiax/fldmiax with writeback,
+        which transfer 2n+1 words (17 for d8-d15) and advance the base
+        register by that amount: reserve the extra word, otherwise the
+        integer registers saved after it end 4 bytes past the record }
+      fstmx_pad: dword;
+{$endif}
+{$endif}
       { integer registers, stored after the VFP registers (if any) }
       v1,v2,v3,v4,v5,v6,sl,fp,sp,pc : dword;
    end;
    pjmp_buf = ^jmp_buf;

BIN
tests/test/cg/obj/darwin/arm/cpptcl1.o


BIN
tests/test/cg/obj/darwin/arm/ctest.o


BIN
tests/test/cg/obj/darwin/arm/tcext3.o


BIN
tests/test/cg/obj/darwin/arm/tcext4.o


BIN
tests/test/cg/obj/darwin/arm/tcext5.o