2
0
Эх сурвалжийг харах

* first part of merging parts of Jeppe's intrinsics patch, mainly r31135
is merged by this commit with a lot of adaptions

git-svn-id: trunk@43949 -

florian 5 жил өмнө
parent
commit
70a836c4a2

+ 10 - 0
.gitattributes

@@ -910,6 +910,7 @@ compiler/utils/mkjvmreg.pp svneol=native#text/plain
 compiler/utils/mkmpsreg.pp svneol=native#text/plain
 compiler/utils/mkppcreg.pp svneol=native#text/plain
 compiler/utils/mkspreg.pp svneol=native#text/plain
+compiler/utils/mkx86inl.pp svneol=native#text/plain
 compiler/utils/mkx86ins.pp svneol=native#text/plain
 compiler/utils/mkx86reg.pp svneol=native#text/plain
 compiler/utils/msg2inc.pp svneol=native#text/plain
@@ -937,6 +938,7 @@ compiler/x86/cga.pas svneol=native#text/plain
 compiler/x86/cgx86.pas svneol=native#text/plain
 compiler/x86/cpubase.pas svneol=native#text/plain
 compiler/x86/cx86innr.inc svneol=native#text/plain
+compiler/x86/cx86mminnr.inc svneol=native#text/plain
 compiler/x86/hlcgx86.pas svneol=native#text/plain
 compiler/x86/itcpugas.pas svneol=native#text/plain
 compiler/x86/itx86int.pas svneol=native#text/plain
@@ -957,6 +959,10 @@ compiler/x86/rgx86.pas svneol=native#text/plain
 compiler/x86/symi86.pas svneol=native#text/plain
 compiler/x86/symx86.pas svneol=native#text/plain
 compiler/x86/x86ins.dat svneol=native#text/plain
+compiler/x86/x86intr.dat svneol=native#text/plain
+compiler/x86/x86mmfirst.inc svneol=native#text/plain
+compiler/x86/x86mmsecond.inc svneol=native#text/plain
+compiler/x86/x86mmtype.inc svneol=native#text/plain
 compiler/x86/x86reg.dat svneol=native#text/plain
 compiler/x86_64/aoptcpu.pas svneol=native#text/plain
 compiler/x86_64/aoptcpub.pas svneol=native#text/plain
@@ -10636,6 +10642,8 @@ rtl/haiku/x86_64/sighnd.inc svneol=native#text/plain
 rtl/i386/cpu.pp svneol=native#text/plain
 rtl/i386/cpuh.inc svneol=native#text/plain
 rtl/i386/cpuinnr.inc svneol=native#text/plain
+rtl/i386/cpumminnr.inc svneol=native#text/plain
+rtl/i386/cpummprocs.inc svneol=native#text/plain
 rtl/i386/fastmove.inc svneol=native#text/plain
 rtl/i386/i386.inc svneol=native#text/plain
 rtl/i386/int64p.inc svneol=native#text/plain
@@ -11931,6 +11939,8 @@ rtl/wince/winres.inc svneol=native#text/plain
 rtl/x86_64/cpu.pp svneol=native#text/pascal
 rtl/x86_64/cpuh.inc svneol=native#text/plain
 rtl/x86_64/cpuinnr.inc svneol=native#text/plain
+rtl/x86_64/cpumminnr.inc svneol=native#text/plain
+rtl/x86_64/cpummprocs.inc svneol=native#text/plain
 rtl/x86_64/int64p.inc svneol=native#text/plain
 rtl/x86_64/makefile.cpu svneol=native#text/plain
 rtl/x86_64/math.inc svneol=native#text/plain

+ 2 - 2
compiler/Makefile

@@ -529,10 +529,10 @@ endif
 override LOCALOPT+=-dllvm -Fullvm
 endif
 ifeq ($(PPC_TARGET),i386)
-override LOCALOPT+=-Fux86
+override LOCALOPT+=-Fux86 -Fix86
 endif
 ifeq ($(PPC_TARGET),x86_64)
-override LOCALOPT+=-Fux86
+override LOCALOPT+=-Fux86 -Fix86
 endif
 ifeq ($(PPC_TARGET),powerpc)
 override LOCALOPT+=-Fuppcgen

+ 2 - 2
compiler/Makefile.fpc

@@ -282,12 +282,12 @@ endif
 
 # i386 specific
 ifeq ($(PPC_TARGET),i386)
-override LOCALOPT+=-Fux86
+override LOCALOPT+=-Fux86 -Fix86
 endif
 
 # x86_64 specific
 ifeq ($(PPC_TARGET),x86_64)
-override LOCALOPT+=-Fux86
+override LOCALOPT+=-Fux86 -Fix86
 endif
 
 # PowerPC specific

+ 45 - 12
compiler/cgbase.pas

@@ -299,11 +299,12 @@ interface
         passed to an mm operation is nil, it means that the whole location is moved }
       tmmshuffle = record
         { describes how many shuffles are actually described, if len=0 then
-          moving the scalar with index 0 to the scalar with index 0 is meant }
-        len : byte;
-        { lower nibble of each entry of this array describes index of the source data index while
-          the upper nibble describes the destination index }
-        shuffles : array[1..1] of byte;
+          moving the scalar with index 0 to the scalar with index 0 is meant,
+          if len=-1, then a variable/unknown length is assumed }
+        len : Shortint;
+        { lower byte of each entry of this array describes index of the source data index while
+          the upper byte describes the destination index }
+        shuffles : array[1..1] of word;
       end;
 
       Tsuperregisterarray=array[0..$ffff] of Tsuperregister;
@@ -417,7 +418,13 @@ interface
             );
 
     var
-       mms_movescalar : pmmshuffle;
+       mms_movescalar,
+       mms_variable,
+       mms_2,
+       mms_4,
+       mms_8,
+       mms_16,
+       mms_32 : pmmshuffle;
 
     procedure supregset_reset(var regs:tsuperregisterset;setall:boolean;
                               maxreg:Tsuperregister);{$ifdef USEINLINE}inline;{$endif}
@@ -465,7 +472,8 @@ interface
 implementation
 
     uses
-      verbose;
+      verbose,
+      cutils;
 
 {******************************************************************************
                              tsuperregisterworklist
@@ -815,13 +823,13 @@ implementation
         i : longint;
       begin
         realshuffle:=true;
-        if (shuffle=nil) or (shuffle^.len=0) then
+        if (shuffle=nil) or (shuffle^.len<1) then
           realshuffle:=false
         else
           begin
             for i:=1 to shuffle^.len do
               begin
-                if (shuffle^.shuffles[i] and $f)<>((shuffle^.shuffles[i] and $f0) shr 4) then
+                if (shuffle^.shuffles[i] and $ff)<>((shuffle^.shuffles[i] and $ff00) shr 8) then
                   exit;
               end;
             realshuffle:=false;
@@ -846,9 +854,34 @@ implementation
       end;
 
 
+   procedure Initmms(var p : pmmshuffle;len : ShortInt);
+     var
+       i : Integer;
+     begin
+       Getmem(p,sizeof(tmmshuffle)+(max(len,0)-1)*2);
+       p^.len:=len;
+       for i:=1 to len do
+{$push}
+{$R-}
+         p^.shuffles[i]:=i;
+{$pop}
+     end;
+
 initialization
-  new(mms_movescalar);
-  mms_movescalar^.len:=0;
+  Initmms(mms_movescalar,0);
+  Initmms(mms_variable,-1);
+  Initmms(mms_2,2);
+  Initmms(mms_4,4);
+  Initmms(mms_8,8);
+  Initmms(mms_16,16);
+  Initmms(mms_32,32);
 finalization
-  dispose(mms_movescalar);
+  Freemem(mms_movescalar);
+  Freemem(mms_variable);
+  Freemem(mms_2);
+  Freemem(mms_4);
+  Freemem(mms_8);
+  Freemem(mms_16);
+  Freemem(mms_32);
 end.
+

+ 3 - 1
compiler/compinnr.pas

@@ -21,7 +21,9 @@ unit compinnr;
 interface
 
 const
-  fpc_in_cpu_first   = 10000;
+  { this file needs to be kept in sync with rtl/inc/innr.in }
+  in_cpu_first   = 10000;
+  in_x86_mm_first    = 11000;
 
 type
    tinlinenumber=(

+ 55 - 15
compiler/defutil.pas

@@ -1337,8 +1337,9 @@ implementation
       begin
         result:=(p.typ=arraydef) and
                 not(is_special_array(p)) and
+                (tarraydef(p).elementdef.typ in [floatdef,orddef]) {and
                 (tarraydef(p).elementdef.typ=floatdef) and
-                (tfloatdef(tarraydef(p).elementdef).floattype in [s32real,s64real]);
+                (tfloatdef(tarraydef(p).elementdef).floattype in [s32real,s64real])};
       end;
 
 
@@ -1348,21 +1349,60 @@ implementation
 {$ifdef x86}
         result:= is_vector(p) and
                  (
-                  (tarraydef(p).elementdef.typ=floatdef) and
                   (
-                   (tarraydef(p).lowrange=0) and
-                   (tarraydef(p).highrange=3) and
-                   (tfloatdef(tarraydef(p).elementdef).floattype=s32real)
-                  )
-                 ) or
+                   (tarraydef(p).elementdef.typ=floatdef) and
+                   (
+                    (tarraydef(p).lowrange=0) and
+                    (tarraydef(p).highrange=3) and
+                    (tfloatdef(tarraydef(p).elementdef).floattype=s32real)
+                   )
+                  ) or
 
-                 (
-                  (tarraydef(p).elementdef.typ=floatdef) and
                   (
-                   (tarraydef(p).lowrange=0) and
-                   (tarraydef(p).highrange=1) and
-                   (tfloatdef(tarraydef(p).elementdef).floattype=s64real)
-                  )
+                   (tarraydef(p).elementdef.typ=floatdef) and
+                   (
+                    (tarraydef(p).lowrange=0) and
+                    (tarraydef(p).highrange=1) and
+                    (tfloatdef(tarraydef(p).elementdef).floattype=s64real)
+                   )
+                  ) {or
+
+                  // MMX registers
+                  (
+                   (tarraydef(p).elementdef.typ=floatdef) and
+                   (
+                    (tarraydef(p).lowrange=0) and
+                    (tarraydef(p).highrange=1) and
+                    (tfloatdef(tarraydef(p).elementdef).floattype=s32real)
+                   )
+                  ) or
+
+                  (
+                   (tarraydef(p).elementdef.typ=orddef) and
+                   (
+                    (tarraydef(p).lowrange=0) and
+                    (tarraydef(p).highrange=1) and
+                    (torddef(tarraydef(p).elementdef).ordtype in [s32bit,u32bit])
+                   )
+                  )  or
+
+                  (
+                   (tarraydef(p).elementdef.typ=orddef) and
+                   (
+                    (tarraydef(p).lowrange=0) and
+                    (tarraydef(p).highrange=3) and
+                    (torddef(tarraydef(p).elementdef).ordtype in [s16bit,u16bit])
+                   )
+                  ) or
+
+                  (
+                   (tarraydef(p).elementdef.typ=orddef) and
+                   (
+                    (tarraydef(p).lowrange=0) and
+                    (tarraydef(p).highrange=7) and
+                    (torddef(tarraydef(p).elementdef).ordtype in [s8bit,u8bit])
+                   )
+                  ) }
                  );
 {$else x86}
         result:=false;
@@ -1488,11 +1528,11 @@ implementation
             begin
               if is_dynamic_array(def) or not is_special_array(def) then
                 begin
-                  if (cs_support_vectors in current_settings.globalswitches) and is_vector(def) and ((TArrayDef(def).elementdef.typ = floatdef) and not (cs_fp_emulation in current_settings.moduleswitches)) then
+                  if is_vector(def) and ((TArrayDef(def).elementdef.typ = floatdef) and not (cs_fp_emulation in current_settings.moduleswitches)) then
                     begin
                       { Determine if, based on the floating-point type and the size
                         of the array, if it can be made into a vector }
-                      case TFloatDef(def).floattype of
+                      case tfloatdef(tarraydef(def).elementdef).floattype of
                         s32real:
                           result := float_array_cgsize(def.size);
                         s64real:

+ 0 - 1
compiler/ncginl.pas

@@ -960,7 +960,6 @@ implementation
         internalerror(2014032701);
       end;
 
-
 begin
    cinlinenode:=tcginlinenode;
 end.

+ 8 - 4
compiler/ncgutil.pas

@@ -417,10 +417,10 @@ implementation
         if (l.loc<>LOC_MMREGISTER)  and
            ((l.loc<>LOC_CMMREGISTER) or (not maybeconst)) then
           begin
-            reg:=cg.getmmregister(list,OS_VECTOR);
-            cg.a_loadmm_loc_reg(list,OS_VECTOR,l,reg,nil);
+            reg:=cg.getmmregister(list,l.size);
+            cg.a_loadmm_loc_reg(list,l.size,l,reg,nil);
             location_freetemp(list,l);
-            location_reset(l,LOC_MMREGISTER,OS_VECTOR);
+            location_reset(l,LOC_MMREGISTER,l.size);
             l.register:=reg;
           end;
       end;
@@ -909,7 +909,11 @@ implementation
               localvarsym :
                 begin
                   vs:=tabstractnormalvarsym(sym);
-                  vs.initialloc.size:=def_cgsize(vs.vardef);
+                  if is_vector(vs.vardef) and
+                     fits_in_mm_register(vs.vardef) then
+                    vs.initialloc.size:=def_cgmmsize(vs.vardef)
+                  else
+                    vs.initialloc.size:=def_cgsize(vs.vardef);
                   if ([po_assembler,po_nostackframe] * pd.procoptions = [po_assembler,po_nostackframe]) and
                      (vo_is_funcret in vs.varoptions) then
                     begin

+ 15 - 2
compiler/ninl.pas

@@ -27,6 +27,7 @@ interface
 
     uses
        node,htypechk,symtype,compinnr;
+
     type
        tinlinenode = class(tunarynode)
           inlinenumber : tinlinenumber;
@@ -2902,7 +2903,6 @@ implementation
           result:=cstringconstnode.createpchar(ansistring2pchar(encodedtype),length(encodedtype),nil);
         end;
 
-
       var
          hightree,
          hp        : tnode;
@@ -5308,7 +5308,20 @@ implementation
          result:=nil;
        end;
 
-
+//
+//||||||| .merge-left.r31134
+//
+//{$ifdef ARM}
+//              {$i armtype.inc}
+//{$endif ARM}
+//=======
+//
+//{$ifdef x86}
+//              {$i x86type.inc}
+//{$endif x86}
+//{$ifdef ARM}
+//              {$i armtype.inc}
+//{$endif ARM}
 {$if not defined(cpu64bitalu) and not defined(cpuhighleveltarget)}
      function tinlinenode.first_ShiftRot_assign_64bitint: tnode;
        var

+ 36 - 0
compiler/psystem.pas

@@ -397,6 +397,21 @@ implementation
         wordfarpointertype:=tcpupointerdefclass(cpointerdef).createx86(u16inttype,x86pt_far);
         longintfarpointertype:=tcpupointerdefclass(cpointerdef).createx86(s32inttype,x86pt_far);
   {$endif i8086}
+        x86_m64type:=carraydef.create(0,1,s32inttype);
+        x86_m128type:=carraydef.create(0,3,s32inttype);
+        x86_m128dtype:=carraydef.create(0,1,s32inttype);
+        x86_m128itype:=carraydef.create(0,3,s32inttype);
+        x86_m256type:=carraydef.create(0,7,s32inttype);
+        x86_m256dtype:=carraydef.create(0,3,s32inttype);
+        x86_m256itype:=carraydef.create(0,7,s32inttype);
+
+        tarraydef(x86_m64type).elementdef:=s32floattype;
+        tarraydef(x86_m128type).elementdef:=s32floattype;
+        tarraydef(x86_m128dtype).elementdef:=s64floattype;
+        tarraydef(x86_m128itype).elementdef:=s32floattype;
+        tarraydef(x86_m256type).elementdef:=s32floattype;
+        tarraydef(x86_m256dtype).elementdef:=s64floattype;
+        tarraydef(x86_m256itype).elementdef:=s32floattype;
 {$endif x86}
         set_default_ptr_types;
         openchararraytype:=carraydef.create_openarray;
@@ -461,6 +476,13 @@ implementation
         addtype('FarPointer',voidfarpointertype);
         addtype('HugePointer',voidhugepointertype);
   {$endif i8086}
+        addtype('__m64',x86_m64type);
+        addtype('__m128', x86_m128type);
+        addtype('__m128d',x86_m128dtype);
+        addtype('__m128i',x86_m128itype);
+        addtype('__m256', x86_m256type);
+        addtype('__m256d',x86_m256dtype);
+        addtype('__m256i',x86_m256itype);
 {$endif x86}
         addtype('ShortString',cshortstringtype);
 {$ifdef support_longstring}
@@ -569,6 +591,13 @@ implementation
         addtype('$word_farpointer',wordfarpointertype);
         addtype('$longint_farpointer',longintfarpointertype);
   {$endif i8086}
+        addtype('$__m64',  x86_m64type);
+        addtype('$__m128', x86_m128type);
+        addtype('$__m128d',x86_m128dtype);
+        addtype('$__m128i',x86_m128itype);
+        addtype('$__m256', x86_m256type);
+        addtype('$__m256d',x86_m256dtype);
+        addtype('$__m256i',x86_m256itype);
 {$endif x86}
         addtype('$openchararray',openchararraytype);
         addtype('$file',cfiletype);
@@ -721,6 +750,13 @@ implementation
         loadtype('word_farpointer',wordfarpointertype);
         loadtype('longint_farpointer',longintfarpointertype);
   {$endif i8086}
+        loadtype('__m64',  x86_m64type);
+        loadtype('__m128', x86_m128type);
+        loadtype('__m128d',x86_m128dtype);
+        loadtype('__m128i',x86_m128itype);
+        loadtype('__m256', x86_m256type);
+        loadtype('__m256d',x86_m256dtype);
+        loadtype('__m256i',x86_m256itype);
 {$endif x86}
 {$ifdef llvm}
         loadtype('llvmbool1',llvmbool1type);

+ 9 - 0
compiler/symdef.pas

@@ -1233,6 +1233,15 @@ interface
        { FPC java procvar base class }
        java_procvarbase          : tobjectdef;
 
+       { x86 vector types }
+       x86_m64type,
+       x86_m128type,
+       x86_m128dtype,
+       x86_m128itype,
+       x86_m256type,
+       x86_m256dtype,
+       x86_m256itype             : tdef;
+
 
     function make_mangledname(const typeprefix:TSymStr;st:TSymtable;const suffix:TSymStr):TSymStr;
     function make_dllmangledname(const dllname,importname:TSymStr;

+ 5 - 0
compiler/symsym.pas

@@ -1784,6 +1784,11 @@ implementation
                   varregable:=vr_mmreg
                 else
                   varregable:=vr_fpureg;
+              end
+            else if is_vector(vardef) and
+              fits_in_mm_register(vardef) then
+              begin
+                varregable:=vr_mmreg;
               end;
           end;
       end;

+ 543 - 0
compiler/utils/mkx86inl.pp

@@ -0,0 +1,543 @@
+program mkx86inl;
+
+{$mode objfpc}
+{$H+}
+
+uses
+  sysutils, classes,
+  strutils;
+
+type
+  TOperDirection = (operIn, operVar, operOut);
+
+  TOperand = record
+    name,
+    typ: string;
+    direction: TOperDirection;
+  end;
+
+const
+  DirLUT: array[TOperDirection] of string = ('','var ','out ');
+
+function GetPascalType(const ATyp: string): string;
+  begin
+    case ATyp of
+      'r32':   exit('longword');
+      'rs32':  exit('longint');
+      'r64':   exit('qword');
+      'rs64':  exit('int64');
+      'f32':   exit('single');
+      'mm':    exit('__m64');
+      'xmm':   exit('__m128');
+      'i32':   exit('longint');
+      'ptr32': exit('pointer');
+    else
+      exit(ATyp);
+    end;
+  end;
+
+function GetTypeDef(const ATyp: string): string;
+  begin
+    case ATyp of
+      'r32':   exit('u32inttype');
+      'rs32':  exit('s32inttype');
+      'r64':   exit('u64inttype');
+      'rs64':  exit('s64inttype');
+      'f32':   exit('s32floattype');
+      'mm':    exit('x86_m64type');
+      'xmm':   exit('x86_m128type');
+      'i32':   exit('s32inttype');
+      'ptr32': exit('voidpointertype');
+    else
+      exit(ATyp);
+    end;
+  end;
+
+function GetOper(const ATyp: string): string;
+  begin
+    case ATyp of
+      'r32':   exit('_reg');
+      'rs32':  exit('_reg');
+      'r64':   exit('_reg_reg');
+      'rs64':  exit('_reg_reg');
+      'f32':   exit('_reg');
+      'mm':    exit('_reg');
+      'xmm':   exit('_reg');
+      'i32':   exit('_const');
+      'ptr32': exit('_ref');
+    else
+      exit('');
+    end;
+  end;
+
+function GetOperand(const ATyp: string; AIndex: longint): string;
+  begin
+    case ATyp of
+      'r32':   exit(format(',paraarray[%d].location.register', [AIndex]));
+      'rs32':  exit(format(',paraarray[%d].location.register', [AIndex]));
+      'r64':   exit(format(',paraarray[%d].location.register64.reglo,paraarray[%d].location.register64.reghi', [AIndex,AIndex]));
+      'rs64':  exit(format(',paraarray[%d].location.register64.reglo,paraarray[%d].location.register64.reghi', [AIndex,AIndex]));
+      'f32':   exit(format(',paraarray[%d].location.register', [AIndex]));
+      'mm':    exit(format(',paraarray[%d].location.register', [AIndex]));
+      'xmm':   exit(format(',paraarray[%d].location.register', [AIndex]));
+      'i32':   exit(format(',GetConstInt(paraarray[%d])',[AIndex]));
+      'ptr32': exit(format(',paraarray[%d].location.reference', [AIndex]));
+    else
+      exit(ATyp);
+    end;
+  end;
+
+function GetOperandLoc(const ATyp: string): string;
+  begin
+    result:='';
+    case ATyp of
+      'r32':  exit(',location.register');
+      'rs32': exit(',location.register');
+      'r64':  exit(',location.register64.reglo,location.register64.reghi');
+      'rs64': exit(',location.register64.reglo,location.register64.reghi');
+      'f32':  exit(',location.register');
+      'mm':   exit(',location.register');
+      'xmm':  exit(',location.register');
+    end;
+  end;
+
+function GetLocStatement(AIndex: longint; const ATyp: string; AConst: boolean): string;
+  begin
+    result:='';
+    case ATyp of
+      'r32':   exit(format('hlcg.location_force_reg(current_asmdata.CurrAsmList, paraarray[%d].location, paraarray[%d].resultdef,u32inttype,%s);', [AIndex+1, AIndex+1, BoolToStr(aconst,'true','false')]));
+      'rs32':  exit(format('hlcg.location_force_reg(current_asmdata.CurrAsmList, paraarray[%d].location, paraarray[%d].resultdef,u32inttype,%s);', [AIndex+1, AIndex+1, BoolToStr(aconst,'true','false')]));
+      'r64':   exit(format('hlcg.location_force_reg(current_asmdata.CurrAsmList, paraarray[%d].location, paraarray[%d].resultdef,u64inttype,%s);', [AIndex+1, AIndex+1, BoolToStr(aconst,'true','false')]));
+      'rs64':  exit(format('hlcg.location_force_reg(current_asmdata.CurrAsmList, paraarray[%d].location, paraarray[%d].resultdef,u64inttype,%s);', [AIndex+1, AIndex+1, BoolToStr(aconst,'true','false')]));
+      'f32':   exit(format('location_force_mmreg(current_asmdata.CurrAsmList, paraarray[%d].location, %s);', [AIndex+1, BoolToStr(aconst,'true','false')]));
+      'mm':    exit(format('location_force_mmxreg(current_asmdata.CurrAsmList, paraarray[%d].location, %s);', [AIndex+1, BoolToStr(aconst,'true','false')]));
+      'xmm':   exit(format('location_force_mmreg(current_asmdata.CurrAsmList, paraarray[%d].location, %s);', [AIndex+1, BoolToStr(aconst,'true','false')]));
+      'ptr32': exit(format('location_make_ref(paraarray[%d].location);', [AIndex+1]));
+    end;
+  end;
+
+function GetLoc(const ATyp: string; AWithSize: boolean = true): string;
+  begin
+    result:='';
+    if AWithSize then
+      case ATyp of
+        'r32':   exit('LOC_REGISTER,OS_32');
+        'rs32':  exit('LOC_REGISTER,OS_S32');
+        'r64':   exit('LOC_REGISTER,OS_64');
+        'rs64':  exit('LOC_REGISTER,OS_S64');
+        'f32':   exit('LOC_MMREGISTER,OS_M128');
+        'mm':    exit('LOC_MMXREGISTER,OS_M64');
+        'xmm':   exit('LOC_MMREGISTER,OS_M128');
+        'ptr32': exit('LOC_MEM,OS_32');
+      end
+    else
+      case ATyp of
+        'r32':   exit('LOC_REGISTER');
+        'rs32':  exit('LOC_REGISTER');
+        'r64':   exit('LOC_REGISTER');
+        'rs64':  exit('LOC_REGISTER');
+        'f32':   exit('LOC_MMREGISTER');
+        'mm':    exit('LOC_MMXREGISTER');
+        'xmm':   exit('LOC_MMREGISTER');
+        'ptr32': exit('LOC_MEM');
+      end;
+  end;
+
+function GetLocAllocation(const ATyp: string): string;
+  begin
+    result:='';
+    case ATyp of
+      'r32':  exit('location.register:=cg.getintregister(current_asmdata.CurrAsmList, OS_32);');
+      'rs32': exit('location.register:=cg.getintregister(current_asmdata.CurrAsmList, OS_32);');
+      'r64':  exit('location.register64.reglo:=cg.getintregister(current_asmdata.CurrAsmList, OS_32); location.register64.reghi:=cg.getintregister(current_asmdata.CurrAsmList, OS_32);');
+      'rs64': exit('location.register64.reglo:=cg.getintregister(current_asmdata.CurrAsmList, OS_32); location.register64.reghi:=cg.getintregister(current_asmdata.CurrAsmList, OS_32);');
+      'f32':  exit('location.register:=cg.getmmregister(current_asmdata.CurrAsmList, OS_M128);');
+      'mm':   exit('location.register:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);');
+      'xmm':  exit('location.register:=cg.getmmregister(current_asmdata.CurrAsmList, OS_M128);');
+    end;
+  end;
+
+function GetPostFix(const APF: string): string;
+  begin
+    if APF<>'' then
+      result:='PF_'+APF
+    else
+      result:='PF_None';
+  end;
+
+procedure ParseList(const APrefix, AFilename: string);
+  var
+    f: TextFile;
+
+    fprocs,
+    fcinnr, fcpumminnr: TextFile;
+    ftypechk, ffirst, fsecond: TStringList;
+
+    str,
+    instrPart,postfix,_alias,
+    params, operline: String;
+
+    opers: array[0..7] of TOperand;
+    opercnt: longint;
+
+    hasOutput: boolean;
+    outputType: string;
+    cnt,
+    i, intrnum: longint;
+    tmp: String;
+
+  function ParseOperands(AIndex: longint = -1): string;
+    var
+      idx: LongInt;
+      pt: Integer;
+      c: Char;
+    begin
+      idx:=opercnt;
+
+      params:=trim(params);
+      if params='' then
+        exit('');
+
+      inc(opercnt);
+
+      if pos('var ', params)=1 then
+        begin
+          opers[idx].direction:=operVar;
+          Delete(params,1,4);
+          params:=trim(params);
+          hasOutput:=true;
+        end
+      else if pos('out ', params)=1 then
+        begin
+          opers[idx].direction:=operOut;
+          Delete(params,1,4);
+          params:=trim(params);
+          hasOutput:=true;
+        end
+      else
+        begin
+          if AIndex<>-1 then
+            opers[idx].direction:=opers[AIndex].direction
+          else
+            opers[idx].direction:=operIn;
+        end;
+
+          pt:=PosSet([',',':'], params);
+
+      c:=params[pt];
+      opers[idx].name:=Copy2SymbDel(params, c);
+      params:=trim(params);
+
+      if c = ':' then
+        begin
+          opers[idx].typ:=Copy2SymbDel(params, ';');
+          result:=opers[idx].typ;
+        end
+      else
+        begin
+          opers[idx].typ:=ParseOperands(idx);
+          result:=opers[idx].typ;
+        end;
+
+      if opers[idx].direction<>operIn then
+        outputType:=opers[idx].typ;
+    end;
+
+  function GetOperLine: string;
+    var
+      i: longint;
+    begin
+      result:='';
+      for i := 0 to opercnt-1 do
+        result:=result+DirLUT[opers[i].direction]+opers[i].name+':'+opers[i].typ+';';
+    end;
+
+  function GetParams: longint;
+    var
+      i: longint;
+    begin
+      result:=0;
+      for i := 0 to opercnt-1 do
+        if opers[i].direction in [operIn,operVar] then
+          inc(result);
+    end;
+
+  function FindOperIdx(const AOper: string): longint;
+    var
+      i,cnt: longint;
+    begin
+      cnt:=0;
+      result:=0;
+      for i := 0 to opercnt-1 do
+        if (opers[i].direction in [operIn,operVar]) then
+          begin
+            if opers[i].name=AOper then
+              exit(cnt);
+            inc(cnt);
+          end;
+    end;
+
+  begin
+    intrnum:=0;
+
+    assignfile(f, AFilename);
+    reset(f);
+
+    assignfile(fprocs, 'cpummprocs.inc'); rewrite(fprocs);
+    assignfile(fcinnr, 'c'+APrefix+'mminnr.inc'); rewrite(fcinnr);
+    assignfile(fcpumminnr, 'cpumminnr.inc'); rewrite(fcpumminnr);
+
+//    writeln(finnr,'const');
+
+    ftypechk:=TStringList.Create;
+    ffirst:=TStringList.Create;
+    fsecond:=TStringList.Create;
+
+//    writeln(finnr, '  fpc_in_', APrefix,'_first = fpc_in_',APrefix,'_base;');
+
+    while not EOF(f) do
+      begin
+        readln(f, str);
+
+        str:=trim(str);
+
+        if (str='') or (Pos(';',str)=1) then
+          continue;
+
+        instrPart:=Copy2SymbDel(str, '(');
+
+        // Check for postfix
+        if pos('{',instrPart)>0 then
+          begin
+            postfix:=instrPart;
+            instrPart:=Copy2SymbDel(postfix, '{');
+            postfix:=TrimRightSet(postfix,['}']);
+          end
+        else
+          postfix:='';
+
+        // Check for alias
+        if pos('[',instrPart)>0 then
+          begin
+            _alias:=instrPart;
+            instrPart:=Copy2SymbDel(_alias, '[');
+            _alias:='_'+TrimRightSet(_alias,[']']);
+          end
+        else
+          _alias:='';
+
+        // Get parameters
+        params:=trim(Copy2SymbDel(str,')'));
+        str:=trim(str);
+
+        hasOutput:=false;
+        opercnt:=0;
+        outputType:='';
+
+        while params<>'' do
+          ParseOperands;
+
+        operline:=GetOperLine;
+        // Write typecheck code
+        i:=ftypechk.IndexOf(': //'+operline);
+        if i>=0 then
+          ftypechk.Insert(i,',in_'+APrefix+'_'+instrPart+postfix+_alias)
+        else
+          begin
+            ftypechk.Add('in_'+APrefix+'_'+instrPart+postfix+_alias);
+            ftypechk.Add(': //'+operline);
+            ftypechk.Add('  begin');
+            ftypechk.Add('    CheckParameters('+inttostr(GetParams())+');');
+            if hasOutput then
+              ftypechk.Add('    resultdef:='+GetTypeDef(outputType)+';')
+            else
+              ftypechk.Add('    resultdef:=voidtype;');
+            ftypechk.Add('  end;')
+          end;
+
+        // Write firstpass code
+        i:=ffirst.IndexOf(': //'+operline);
+        if i>=0 then
+          ffirst.Insert(i,',in_'+APrefix+'_'+instrPart+postfix+_alias)
+        else
+          begin
+            ffirst.Add('in_'+APrefix+'_'+instrPart+postfix+_alias);
+            ffirst.Add(': //'+operline);
+            ffirst.Add('  begin');
+            if hasOutput then
+              ffirst.Add('    expectloc:='+GetLoc(outputType,false)+';')
+            else
+              ffirst.Add('    expectloc:=LOC_VOID;');
+            ffirst.Add('    result:=nil;');
+            ffirst.Add('  end;')
+          end;
+
+        // Write secondpass code
+        i:=fsecond.IndexOf(': //'+operline);
+        if i>=0 then
+          begin
+            fsecond.Insert(i+3,'      in_'+APrefix+'_'+instrPart+postfix+_alias+': begin op:=A_'+instrPart+' end;');
+            fsecond.Insert(i,',in_'+APrefix+'_'+instrPart+postfix+_alias);
+          end
+        else
+          begin
+            fsecond.Add('in_'+APrefix+'_'+instrPart+postfix+_alias);
+            fsecond.Add(': //'+operline);
+            fsecond.Add('  begin');
+            fsecond.Add('    case inlinenumber of');
+            fsecond.Add('      in_'+APrefix+'_'+instrPart+postfix+_alias+': begin op:=A_'+instrPart+'; end;');
+            fsecond.Add('      else');
+            fsecond.Add('        Internalerror(2020010201);');
+            fsecond.Add('    end;');
+            fsecond.Add('');
+
+            i:=GetParams;
+            fsecond.Add('    GetParameters('+inttostr(i)+');');
+            fsecond.Add('');
+
+            fsecond.Add('    for i := 1 to '+inttostr(i)+' do secondpass(paraarray[i]);');
+            fsecond.Add('');
+
+            // Force inputs
+            cnt:=0;
+            for i := 0 to opercnt-1 do
+              begin
+                case opers[i].direction of
+                  operIn:
+                    begin
+                      tmp:=GetLocStatement(cnt, opers[i].typ, true);
+                      if tmp<>'' then
+                        fsecond.add('    '+tmp);
+                      inc(cnt);
+                    end;
+                  operVar:
+                    begin
+                      tmp:=GetLocStatement(cnt, opers[i].typ, false);
+                      if tmp<>'' then
+                        fsecond.add('    '+tmp);
+                      inc(cnt);
+                    end;
+                end;
+              end;
+
+            // Allocate output
+            cnt:=0;
+            for i := 0 to opercnt-1 do
+              begin
+                case opers[i].direction of
+                  operOut:
+                    begin
+                      fsecond.add('    location_reset(location,'+GetLoc(opers[i].typ)+');');
+                      fsecond.Add('    '+GetLocAllocation(opers[i].typ));
+                    end;
+                  operVar:
+                    begin
+                      fsecond.Add('    location:=paraarray['+inttostr(cnt+1)+'].location;');
+                      inc(cnt);
+                    end;
+                  operIn:
+                    inc(cnt);
+                end;
+              end;
+
+            operline:='taicpu.op';
+            //for i := 0 to opercnt-1 do
+            for i := opercnt-1 downto 0 do
+              begin
+                case opers[i].direction of
+                  operOut:
+                    operline:=operline+GetOper(opers[i].typ);
+                  operVar:
+                    operline:=operline+GetOper(opers[i].typ);
+                  operIn:
+                    operline:=operline+GetOper(opers[i].typ);
+                end;
+              end;
+
+            if operline='taicpu.op' then
+              operline:='taicpu.op_none(op,S_NO'
+            else
+              operline:=operline+'(op,S_NO';
+
+            //for i := 0 to opercnt-1 do
+            for i := opercnt-1 downto 0 do
+              begin
+                case opers[i].direction of
+                  operOut:
+                    operline:=operline+GetOperandLoc(opers[i].typ);
+                  operIn,
+                  operVar:
+                    begin
+                      dec(cnt);
+                      operline:=operline+GetOperand(opers[i].typ, cnt+1);
+                    end;
+                end;
+              end;
+
+            operline:=operline+')';
+
+            fsecond.Add('    current_asmdata.CurrAsmList.concat('+operline+');');
+
+            fsecond.Add('  end;')
+          end;
+
+        // Write innr
+        writeln(fcinnr, '  in_', APrefix,'_',instrPart,postfix+_alias,' = in_',APrefix,'_mm_first+',intrnum,',');
+        writeln(fcpumminnr, '  fpc_in_', APrefix,'_',instrPart,postfix+_alias,' = fpc_in_',APrefix,'_mm_first+',intrnum,';');
+
+        // Write function
+        if hasOutput then write(fprocs,'function ') else write(fprocs,'procedure ');
+        write(fprocs,APrefix,'_',instrPart,postfix,'(');
+
+        cnt:=0;
+        for i:=0 to opercnt-1 do
+          begin
+            if opers[i].direction=operOut then
+              Continue;
+
+            if cnt>0 then
+              begin
+                if opers[i].typ<>opers[i-1].typ then
+                  write(fprocs,': ',GetPascalType(opers[i-1].typ),'; ')
+                else
+                  write(fprocs,', ');
+              end;
+
+            write(fprocs,opers[i].name);
+            if i=opercnt-1 then
+              write(fprocs,': ',GetPascalType(opers[i].typ));
+
+            inc(cnt);
+          end;
+
+        write(fprocs,')');
+
+        if hasOutput then write(fprocs,': ',GetPascalType(outputType));
+        writeln(fprocs,'; [INTERNPROC: fpc_in_',APrefix,'_',instrPart,postfix+_alias,'];');
+
+        // Str now contains conditionals
+
+        inc(intrnum);
+      end;
+
+    writeln(fcinnr, '  in_', APrefix,'mm_last = in_',APrefix,'_mm_first+',intrnum-1);
+
+    ftypechk.SaveToFile(APrefix+'mmtype.inc');
+    ffirst.SaveToFile(APrefix+'mmfirst.inc');
+    fsecond.SaveToFile(APrefix+'mmsecond.inc');
+
+    ftypechk.Free;
+    ffirst.Free;
+    fsecond.Free;
+
+    CloseFile(fprocs);
+    CloseFile(fcinnr);
+    CloseFile(fcpumminnr);
+
+    closefile(f);
+  end;
+
+begin
+  ParseList('x86', 'x86intr.dat');
+end.
+

+ 23 - 8
compiler/x86/cgx86.pas

@@ -1341,7 +1341,7 @@ unit cgx86;
       end;
 
 
-    function get_scalar_mm_op(fromsize,tosize : tcgsize) : tasmop;
+    function get_scalar_mm_op(fromsize,tosize : tcgsize;aligned : boolean) : tasmop;
       const
         convertopsse : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
           (A_MOVSS,A_CVTSS2SD,A_NONE,A_NONE,A_NONE),
@@ -1391,9 +1391,16 @@ unit cgx86;
               OS_M128:
                 { 128-bit aligned vector }
                 if UseAVX then
-                  result:=A_VMOVAPS
+                  begin
+                    if aligned then
+                      result:=A_VMOVAPS
+                    else
+                      result:=A_VMOVUPS;
+                  end
+                else if aligned then
+                  result:=A_MOVAPS
                 else
-                  result:=A_MOVAPS;
+                  result:=A_MOVUPS;
               OS_M256,
               OS_M512:
                 { 256-bit aligned vector }
@@ -1406,6 +1413,14 @@ unit cgx86;
                 InternalError(2018012920);
             end;
           end
+        else if (tcgsize2size[fromsize]=tcgsize2size[tosize]) and
+          (fromsize=OS_M128) then
+          begin
+            if UseAVX then
+              result:=A_VMOVDQU
+            else
+              result:=A_MOVDQU;
+          end
         else
           internalerror(2010060104);
         if result=A_NONE then
@@ -1459,7 +1474,7 @@ unit cgx86;
           end
         else if shufflescalar(shuffle) then
           begin
-            op:=get_scalar_mm_op(fromsize,tosize);
+            op:=get_scalar_mm_op(fromsize,tosize,true);
 
             { MOVAPD/MOVAPS are normally faster }
             if op=A_MOVSD then
@@ -1579,7 +1594,7 @@ unit cgx86;
            end
          else if shufflescalar(shuffle) then
            begin
-             op:=get_scalar_mm_op(fromsize,tosize);
+             op:=get_scalar_mm_op(fromsize,tosize,tcgsize2size[fromsize]=ref.alignment);
 
              { A_VCVTSD2SS and A_VCVTSS2SD require always three operands }
              if (op=A_VCVTSD2SS) or (op=A_VCVTSS2SD) then
@@ -1673,7 +1688,7 @@ unit cgx86;
              if tcgsize2size[tosize]<>tcgsize2size[fromsize] then
                begin
                  hreg:=getmmregister(list,tosize);
-                 op:=get_scalar_mm_op(fromsize,tosize);
+                 op:=get_scalar_mm_op(fromsize,tosize,tcgsize2size[tosize]=ref.alignment);
 
                  { A_VCVTSD2SS and A_VCVTSS2SD require always three operands }
                  if (op=A_VCVTSD2SS) or (op=A_VCVTSS2SD) then
@@ -1681,10 +1696,10 @@ unit cgx86;
                  else
                    list.concat(taicpu.op_reg_reg(op,S_NO,reg,hreg));
 
-                 list.concat(taicpu.op_reg_ref(get_scalar_mm_op(tosize,tosize),S_NO,hreg,tmpref))
+                 list.concat(taicpu.op_reg_ref(get_scalar_mm_op(tosize,tosize,tcgsize2size[tosize]=tmpref.alignment),S_NO,hreg,tmpref))
                end
              else
-               list.concat(taicpu.op_reg_ref(get_scalar_mm_op(fromsize,tosize),S_NO,reg,tmpref));
+               list.concat(taicpu.op_reg_ref(get_scalar_mm_op(fromsize,tosize,tcgsize2size[tosize]=tmpref.alignment),S_NO,reg,tmpref));
            end
          else
            internalerror(200312252);

+ 19 - 14
compiler/x86/cx86innr.inc

@@ -12,17 +12,22 @@
 
  **********************************************************************}
 
-  in_x86_inportb = fpc_in_cpu_first,
-  in_x86_inportw = fpc_in_cpu_first+1,
-  in_x86_inportl = fpc_in_cpu_first+2,
-  in_x86_outportb = fpc_in_cpu_first+3,
-  in_x86_outportw = fpc_in_cpu_first+4,
-  in_x86_outportl = fpc_in_cpu_first+5,
-  in_x86_cli      = fpc_in_cpu_first+6,
-  in_x86_sti      = fpc_in_cpu_first+7,
-  in_x86_get_cs   = fpc_in_cpu_first+8,
-  in_x86_get_ss   = fpc_in_cpu_first+9,
-  in_x86_get_ds   = fpc_in_cpu_first+10,
-  in_x86_get_es   = fpc_in_cpu_first+11,
-  in_x86_get_fs   = fpc_in_cpu_first+12,
-  in_x86_get_gs   = fpc_in_cpu_first+13
+  in_x86_inportb = in_cpu_first,
+  in_x86_inportw = in_cpu_first+1,
+  in_x86_inportl = in_cpu_first+2,
+  in_x86_outportb = in_cpu_first+3,
+  in_x86_outportw = in_cpu_first+4,
+  in_x86_outportl = in_cpu_first+5,
+  in_x86_cli      = in_cpu_first+6,
+  in_x86_sti      = in_cpu_first+7,
+  in_x86_get_cs   = in_cpu_first+8,
+  in_x86_get_ss   = in_cpu_first+9,
+  in_x86_get_ds   = in_cpu_first+10,
+  in_x86_get_es   = in_cpu_first+11,
+  in_x86_get_fs   = in_cpu_first+12,
+  in_x86_get_gs   = in_cpu_first+13,
+
+  { include mm inline routines }
+  {$I cx86mminnr.inc}
+
+

+ 68 - 0
compiler/x86/cx86mminnr.inc

@@ -0,0 +1,68 @@
+  in_x86_movss = in_x86_mm_first+0,
+  in_x86_movaps = in_x86_mm_first+1,
+  in_x86_movups = in_x86_mm_first+2,
+  in_x86_movss_to_mem = in_x86_mm_first+3,
+  in_x86_movaps_to_mem = in_x86_mm_first+4,
+  in_x86_movups_to_mem = in_x86_mm_first+5,
+  in_x86_movss_to_val = in_x86_mm_first+6,
+  in_x86_movss_from_val = in_x86_mm_first+7,
+  in_x86_movlps = in_x86_mm_first+8,
+  in_x86_movhps = in_x86_mm_first+9,
+  in_x86_movlhps = in_x86_mm_first+10,
+  in_x86_movhlps = in_x86_mm_first+11,
+  in_x86_addss = in_x86_mm_first+12,
+  in_x86_subss = in_x86_mm_first+13,
+  in_x86_mulss = in_x86_mm_first+14,
+  in_x86_divss = in_x86_mm_first+15,
+  in_x86_rcpss = in_x86_mm_first+16,
+  in_x86_sqrtss = in_x86_mm_first+17,
+  in_x86_maxss = in_x86_mm_first+18,
+  in_x86_minss = in_x86_mm_first+19,
+  in_x86_rsqrtss = in_x86_mm_first+20,
+  in_x86_addps = in_x86_mm_first+21,
+  in_x86_subps = in_x86_mm_first+22,
+  in_x86_mulps = in_x86_mm_first+23,
+  in_x86_divps = in_x86_mm_first+24,
+  in_x86_rcpps = in_x86_mm_first+25,
+  in_x86_sqrtps = in_x86_mm_first+26,
+  in_x86_maxps = in_x86_mm_first+27,
+  in_x86_minps = in_x86_mm_first+28,
+  in_x86_rsqrtps = in_x86_mm_first+29,
+  in_x86_andps = in_x86_mm_first+30,
+  in_x86_orps = in_x86_mm_first+31,
+  in_x86_xorps = in_x86_mm_first+32,
+  in_x86_andnps = in_x86_mm_first+33,
+  in_x86_cmpss = in_x86_mm_first+34,
+  in_x86_cmpps = in_x86_mm_first+35,
+  in_x86_shufps = in_x86_mm_first+36,
+  in_x86_unpckhps = in_x86_mm_first+37,
+  in_x86_unpcklps = in_x86_mm_first+38,
+  in_x86_cvtsi2ss = in_x86_mm_first+39,
+  in_x86_cvtss2si = in_x86_mm_first+40,
+  in_x86_cvttss2si = in_x86_mm_first+41,
+  in_x86_cvtpi2ps = in_x86_mm_first+42,
+  in_x86_cvtps2pi = in_x86_mm_first+43,
+  in_x86_cvttps2pi = in_x86_mm_first+44,
+  in_x86_pmulhuw_mmx = in_x86_mm_first+45,
+  in_x86_psadbw_mmx = in_x86_mm_first+46,
+  in_x86_pavgb_mmx = in_x86_mm_first+47,
+  in_x86_pavgw_mmx = in_x86_mm_first+48,
+  in_x86_pmaxub_mmx = in_x86_mm_first+49,
+  in_x86_pminub_mmx = in_x86_mm_first+50,
+  in_x86_pmaxsw_mmx = in_x86_mm_first+51,
+  in_x86_pminsw_mmx = in_x86_mm_first+52,
+  in_x86_pextrw_mmx = in_x86_mm_first+53,
+  in_x86_pinsrw_mmx = in_x86_mm_first+54,
+  in_x86_pmovmskb = in_x86_mm_first+55,
+  in_x86_pshufw = in_x86_mm_first+56,
+  in_x86_pmulhuw = in_x86_mm_first+57,
+  in_x86_psadbw = in_x86_mm_first+58,
+  in_x86_pavgb = in_x86_mm_first+59,
+  in_x86_pavgw = in_x86_mm_first+60,
+  in_x86_pmaxub = in_x86_mm_first+61,
+  in_x86_pminub = in_x86_mm_first+62,
+  in_x86_pmaxsw = in_x86_mm_first+63,
+  in_x86_pminsw = in_x86_mm_first+64,
+  in_x86_pextrw = in_x86_mm_first+65,
+  in_x86_pinsrw = in_x86_mm_first+66,
+  in_x86mm_last = in_x86_mm_first+66

+ 88 - 3
compiler/x86/nx86inl.pas

@@ -96,7 +96,7 @@ implementation
       htypechk,
       cgbase,pass_1,pass_2,
       cpuinfo,cpubase,nutils,
-      ncal,ncgutil,nld,
+      ncal,ncgutil,nld,ncon,
       tgobj,
       cga,cgutils,cgx86,cgobj,hlcgobj;
 
@@ -151,6 +151,8 @@ implementation
 {$else i8086}
              resultdef:=s32inttype;
 {$endif i8086}
+           { include automatically generated code }
+           {$i x86mmtype.inc}
            else
              Result:=inherited pass_typecheck_cpu;
          end;
@@ -177,6 +179,8 @@ implementation
            in_x86_cli,
            in_x86_sti:
              expectloc:=LOC_VOID;
+           { include automatically generated code }
+           {$i x86mmfirst.inc}
            else
              Result:=inherited first_cpu;
          end;
@@ -411,6 +415,11 @@ implementation
 
      procedure tx86inlinenode.pass_generate_code_cpu;
 
+       var
+         paraarray : array[1..4] of tnode;
+         i : integer;
+         op: TAsmOp;
+
        procedure inport(dreg:TRegister;dsize:topsize;dtype:tdef);
          var
            portnumber: tnode;
@@ -442,6 +451,7 @@ implementation
              end;
          end;
 
+
        procedure outport(dreg:TRegister;dsize:topsize;dtype:tdef);
          var
            portnumber, portdata: tnode;
@@ -466,6 +476,7 @@ implementation
            hlcg.ungetcpuregister(current_asmdata.CurrAsmList,dreg);
          end;
 
+
        procedure get_segreg(segreg:tregister);
          begin
            location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
@@ -473,7 +484,82 @@ implementation
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MOV,TCGSize2OpSize[def_cgsize(resultdef)],segreg,location.register));
          end;
 
+
+      function GetConstInt(n: tnode): longint;
+        begin
+          Result:=0;
+          if is_constintnode(n) then
+            result:=tordconstnode(n).value.svalue
+          else
+            Message(type_e_constant_expr_expected);
+        end;
+
+
+      procedure GetParameters(count: longint);
+        var
+          i: longint;
+          p: tnode;
+        begin
+          if (count=1) and
+             (not (left is tcallparanode)) then
+            paraarray[1]:=left
+          else
+            begin
+              p:=left;
+              for i := count downto 1 do
+                begin
+                  paraarray[i]:=tcallparanode(p).paravalue;
+                  p:=tcallparanode(p).nextpara;
+                end;
+            end;
+        end;
+
+      procedure location_force_mmxreg(list:TAsmList;var l: tlocation;maybeconst:boolean);
+        var
+          reg : tregister;
+        begin
+          if (l.loc<>LOC_MMXREGISTER)  and
+             ((l.loc<>LOC_CMMXREGISTER) or (not maybeconst)) then
+            begin
+              reg:=tcgx86(cg).getmmxregister(list);
+              cg.a_loadmm_loc_reg(list,OS_M64,l,reg,nil);
+              location_freetemp(list,l);
+              location_reset(l,LOC_MMXREGISTER,OS_M64);
+              l.register:=reg;
+            end;
+        end;
+
+      procedure location_make_ref(var loc: tlocation);
+        var
+          hloc: tlocation;
+        begin
+          case loc.loc of
+            LOC_CREGISTER,
+            LOC_REGISTER:
+              begin
+                location_reset_ref(hloc, LOC_REFERENCE, OS_32, 1, []);
+                hloc.reference.base:=loc.register;
+
+                loc:=hloc;
+              end;
+            LOC_CREFERENCE,
+            LOC_REFERENCE:
+              begin
+              end;
+          else
+            begin
+              hlcg.location_force_reg(current_asmdata.CurrAsmList,loc,u32inttype,u32inttype,false);
+
+              location_reset_ref(hloc, LOC_REFERENCE, OS_32, 1, []);
+              hloc.reference.base:=loc.register;
+
+              loc:=hloc;
+            end;
+          end;
+        end;
+
        begin
+         FillChar(paraarray,sizeof(paraarray),0);
          case inlinenumber of
            in_x86_inportb:
              inport(NR_AL,S_B,u8inttype);
@@ -503,6 +589,7 @@ implementation
              get_segreg(NR_FS);
            in_x86_get_gs:
              get_segreg(NR_GS);
+           {$i x86mmsecond.inc}
            else
              inherited pass_generate_code_cpu;
          end;
@@ -1308,6 +1395,4 @@ implementation
         location.register:=hregister;
       end;
 
-
-
 end.

+ 81 - 0
compiler/x86/x86intr.dat

@@ -0,0 +1,81 @@
+movss(out r0: xmm; r1: ptr32)
+movaps(out r0: xmm; r1: ptr32)
+movups(out r0: xmm; r1: ptr32)
+
+movss[to_mem](r0: ptr32; r1: xmm)
+movaps[to_mem](r0: ptr32; r1: xmm)
+movups[to_mem](r0: ptr32; r1: xmm)
+
+movss[to_val](out r0: f32; r1: xmm)
+movss[from_val](out r0: xmm; r1: f32)
+
+movlps(var r0: xmm; r1: ptr32)
+movhps(var r0: xmm; r1: ptr32)
+
+movlhps(var r0: xmm; r1: xmm)
+movhlps(var r0: xmm; r1: xmm)
+
+addss(var r0: xmm; r1: xmm)
+subss(var r0: xmm; r1: xmm)
+mulss(var r0: xmm; r1: xmm)
+divss(var r0: xmm; r1: xmm)
+rcpss(var r0: xmm; r1: xmm)
+sqrtss(var r0: xmm; r1: xmm)
+maxss(var r0: xmm; r1: xmm)
+minss(var r0: xmm; r1: xmm)
+rsqrtss(var r0: xmm; r1: xmm)
+
+addps(var r0: xmm; r1: xmm)
+subps(var r0: xmm; r1: xmm)
+mulps(var r0: xmm; r1: xmm)
+divps(var r0: xmm; r1: xmm)
+rcpps(var r0: xmm; r1: xmm)
+sqrtps(var r0: xmm; r1: xmm)
+maxps(var r0: xmm; r1: xmm)
+minps(var r0: xmm; r1: xmm)
+rsqrtps(var r0: xmm; r1: xmm)
+
+andps(var r0: xmm; r1: xmm)
+orps(var r0: xmm; r1: xmm)
+xorps(var r0: xmm; r1: xmm)
+andnps(var r0: xmm; r1: xmm)
+
+cmpss(var r0: xmm; r1: xmm; imm: i32)             (imm in [0..7])
+cmpps(var r0: xmm; r1: xmm; imm: i32)             (imm in [0..7])
+
+shufps(var r0: xmm; r1: xmm; imm: i32)            (imm in [0..$ff])
+unpckhps(var r0: xmm; r1: xmm)
+unpcklps(var r0: xmm; r1: xmm)
+
+cvtsi2ss(var r0: xmm; r1: r32)
+cvtss2si(out r0: r32; r1: xmm)
+cvttss2si(out r0: r32; r1: xmm)
+
+cvtpi2ps(var r0: xmm; r1: mm)
+cvtps2pi(out r0: mm; r1: xmm)
+cvttps2pi(out r0: mm; r1: xmm)
+
+pmulhuw[mmx](var r0: mm; r1: mm)
+psadbw[mmx](var r0: mm; r1: mm)
+pavgb[mmx](var r0: mm; r1: mm)
+pavgw[mmx](var r0: mm; r1: mm)
+pmaxub[mmx](var r0: mm; r1: mm)
+pminub[mmx](var r0: mm; r1: mm)
+pmaxsw[mmx](var r0: mm; r1: mm)
+pminsw[mmx](var r0: mm; r1: mm)
+pextrw[mmx](out r0: r32; r1: mm; imm: i32)             (imm in [0..3])
+pinsrw[mmx](var r0: mm; r1: r32; imm: i32)             (imm in [0..3])
+
+pmovmskb(out r0: r32; r1: mm)
+pshufw(out r0: mm; r1: mm; imm: i32)                   (imm in [0..$ff])
+
+pmulhuw(var r0: xmm; r1: xmm)
+psadbw(var r0: xmm; r1: xmm)
+pavgb(var r0: xmm; r1: xmm)
+pavgw(var r0: xmm; r1: xmm)
+pmaxub(var r0: xmm; r1: xmm)
+pminub(var r0: xmm; r1: xmm)
+pmaxsw(var r0: xmm; r1: xmm)
+pminsw(var r0: xmm; r1: xmm)
+pextrw(out r0: r32; r1: xmm; imm: i32)            (imm in [0..7])
+pinsrw(var r0: xmm; r1: r32; imm: i32)            (imm in [0..7])         

+ 157 - 0
compiler/x86/x86mmfirst.inc

@@ -0,0 +1,157 @@
+in_x86_movss
+,in_x86_movaps
+,in_x86_movups
+: //out r0:xmm;r1:ptr32;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;
+in_x86_movss_to_mem
+,in_x86_movaps_to_mem
+,in_x86_movups_to_mem
+: //r0:ptr32;r1:xmm;
+  begin
+    expectloc:=LOC_VOID;
+    result:=nil;
+  end;
+in_x86_movss_to_val
+: //out r0:f32;r1:xmm;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;
+in_x86_movss_from_val
+: //out r0:xmm;r1:f32;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;
+in_x86_movlps
+,in_x86_movhps
+: //var r0:xmm;r1:ptr32;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;
+in_x86_movlhps
+,in_x86_movhlps
+,in_x86_addss
+,in_x86_subss
+,in_x86_mulss
+,in_x86_divss
+,in_x86_rcpss
+,in_x86_sqrtss
+,in_x86_maxss
+,in_x86_minss
+,in_x86_rsqrtss
+,in_x86_addps
+,in_x86_subps
+,in_x86_mulps
+,in_x86_divps
+,in_x86_rcpps
+,in_x86_sqrtps
+,in_x86_maxps
+,in_x86_minps
+,in_x86_rsqrtps
+,in_x86_andps
+,in_x86_orps
+,in_x86_xorps
+,in_x86_andnps
+,in_x86_unpckhps
+,in_x86_unpcklps
+,in_x86_pmulhuw
+,in_x86_psadbw
+,in_x86_pavgb
+,in_x86_pavgw
+,in_x86_pmaxub
+,in_x86_pminub
+,in_x86_pmaxsw
+,in_x86_pminsw
+: //var r0:xmm;r1:xmm;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;
+in_x86_cmpss
+,in_x86_cmpps
+,in_x86_shufps
+: //var r0:xmm;r1:xmm;imm:i32;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;
+in_x86_cvtsi2ss
+: //var r0:xmm;r1:r32;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;
+in_x86_cvtss2si
+,in_x86_cvttss2si
+: //out r0:r32;r1:xmm;
+  begin
+    expectloc:=LOC_REGISTER;
+    result:=nil;
+  end;
+in_x86_cvtpi2ps
+: //var r0:xmm;r1:mm;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;
+in_x86_cvtps2pi
+,in_x86_cvttps2pi
+: //out r0:mm;r1:xmm;
+  begin
+    expectloc:=LOC_MMXREGISTER;
+    result:=nil;
+  end;
+in_x86_pmulhuw_mmx
+,in_x86_psadbw_mmx
+,in_x86_pavgb_mmx
+,in_x86_pavgw_mmx
+,in_x86_pmaxub_mmx
+,in_x86_pminub_mmx
+,in_x86_pmaxsw_mmx
+,in_x86_pminsw_mmx
+: //var r0:mm;r1:mm;
+  begin
+    expectloc:=LOC_MMXREGISTER;
+    result:=nil;
+  end;
+in_x86_pextrw_mmx
+: //out r0:r32;r1:mm;imm:i32;
+  begin
+    expectloc:=LOC_REGISTER;
+    result:=nil;
+  end;
+in_x86_pinsrw_mmx
+: //var r0:mm;r1:r32;imm:i32;
+  begin
+    expectloc:=LOC_MMXREGISTER;
+    result:=nil;
+  end;
+in_x86_pmovmskb
+: //out r0:r32;r1:mm;
+  begin
+    expectloc:=LOC_REGISTER;
+    result:=nil;
+  end;
+in_x86_pshufw
+: //out r0:mm;r1:mm;imm:i32;
+  begin
+    expectloc:=LOC_MMXREGISTER;
+    result:=nil;
+  end;
+in_x86_pextrw
+: //out r0:r32;r1:xmm;imm:i32;
+  begin
+    expectloc:=LOC_REGISTER;
+    result:=nil;
+  end;
+in_x86_pinsrw
+: //var r0:xmm;r1:r32;imm:i32;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;

+ 421 - 0
compiler/x86/x86mmsecond.inc

@@ -0,0 +1,421 @@
+in_x86_movss
+,in_x86_movaps
+,in_x86_movups
+: //out r0:xmm;r1:ptr32;
+  begin
+    case inlinenumber of
+      in_x86_movups: begin op:=A_movups end;
+      in_x86_movaps: begin op:=A_movaps end;
+      in_x86_movss: begin op:=A_movss; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(1);
+
+    for i := 1 to 1 do secondpass(paraarray[i]);
+
+    location_make_ref(paraarray[1].location);
+    location_reset(location,LOC_MMREGISTER,OS_M128);
+    location.register:=cg.getmmregister(current_asmdata.CurrAsmList, OS_M128);
+    current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,S_NO,paraarray[1].location.reference,location.register));
+  end;
+in_x86_movss_to_mem
+,in_x86_movaps_to_mem
+,in_x86_movups_to_mem
+: //r0:ptr32;r1:xmm;
+  begin
+    case inlinenumber of
+      in_x86_movups_to_mem: begin op:=A_movups end;
+      in_x86_movaps_to_mem: begin op:=A_movaps end;
+      in_x86_movss_to_mem: begin op:=A_movss; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_make_ref(paraarray[1].location);
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[2].location, true);
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_ref(op,S_NO,paraarray[2].location.register,paraarray[1].location.reference));
+  end;
+in_x86_movss_to_val
+: //out r0:f32;r1:xmm;
+  begin
+    case inlinenumber of
+      in_x86_movss_to_val: begin op:=A_movss; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(1);
+
+    for i := 1 to 1 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, true);
+    location_reset(location,LOC_MMREGISTER,OS_M128);
+    location.register:=cg.getmmregister(current_asmdata.CurrAsmList, OS_M128);
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[1].location.register,location.register));
+  end;
+in_x86_movss_from_val
+: //out r0:xmm;r1:f32;
+  begin
+    case inlinenumber of
+      in_x86_movss_from_val: begin op:=A_movss; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(1);
+
+    for i := 1 to 1 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, true);
+    location_reset(location,LOC_MMREGISTER,OS_M128);
+    location.register:=cg.getmmregister(current_asmdata.CurrAsmList, OS_M128);
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[1].location.register,location.register));
+  end;
+in_x86_movlps
+,in_x86_movhps
+: //var r0:xmm;r1:ptr32;
+  begin
+    case inlinenumber of
+      in_x86_movhps: begin op:=A_movhps end;
+      in_x86_movlps: begin op:=A_movlps; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, false);
+    location_make_ref(paraarray[2].location);
+    location:=paraarray[1].location;
+    current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,S_NO,paraarray[2].location.reference,paraarray[1].location.register));
+  end;
+in_x86_movlhps
+,in_x86_movhlps
+,in_x86_addss
+,in_x86_subss
+,in_x86_mulss
+,in_x86_divss
+,in_x86_rcpss
+,in_x86_sqrtss
+,in_x86_maxss
+,in_x86_minss
+,in_x86_rsqrtss
+,in_x86_addps
+,in_x86_subps
+,in_x86_mulps
+,in_x86_divps
+,in_x86_rcpps
+,in_x86_sqrtps
+,in_x86_maxps
+,in_x86_minps
+,in_x86_rsqrtps
+,in_x86_andps
+,in_x86_orps
+,in_x86_xorps
+,in_x86_andnps
+,in_x86_unpckhps
+,in_x86_unpcklps
+,in_x86_pmulhuw
+,in_x86_psadbw
+,in_x86_pavgb
+,in_x86_pavgw
+,in_x86_pmaxub
+,in_x86_pminub
+,in_x86_pmaxsw
+,in_x86_pminsw
+: //var r0:xmm;r1:xmm;
+  begin
+    case inlinenumber of
+      in_x86_pminsw: begin op:=A_pminsw end;
+      in_x86_pmaxsw: begin op:=A_pmaxsw end;
+      in_x86_pminub: begin op:=A_pminub end;
+      in_x86_pmaxub: begin op:=A_pmaxub end;
+      in_x86_pavgw: begin op:=A_pavgw end;
+      in_x86_pavgb: begin op:=A_pavgb end;
+      in_x86_psadbw: begin op:=A_psadbw end;
+      in_x86_pmulhuw: begin op:=A_pmulhuw end;
+      in_x86_unpcklps: begin op:=A_unpcklps end;
+      in_x86_unpckhps: begin op:=A_unpckhps end;
+      in_x86_andnps: begin op:=A_andnps end;
+      in_x86_xorps: begin op:=A_xorps end;
+      in_x86_orps: begin op:=A_orps end;
+      in_x86_andps: begin op:=A_andps end;
+      in_x86_rsqrtps: begin op:=A_rsqrtps end;
+      in_x86_minps: begin op:=A_minps end;
+      in_x86_maxps: begin op:=A_maxps end;
+      in_x86_sqrtps: begin op:=A_sqrtps end;
+      in_x86_rcpps: begin op:=A_rcpps end;
+      in_x86_divps: begin op:=A_divps end;
+      in_x86_mulps: begin op:=A_mulps end;
+      in_x86_subps: begin op:=A_subps end;
+      in_x86_addps: begin op:=A_addps end;
+      in_x86_rsqrtss: begin op:=A_rsqrtss end;
+      in_x86_minss: begin op:=A_minss end;
+      in_x86_maxss: begin op:=A_maxss end;
+      in_x86_sqrtss: begin op:=A_sqrtss end;
+      in_x86_rcpss: begin op:=A_rcpss end;
+      in_x86_divss: begin op:=A_divss end;
+      in_x86_mulss: begin op:=A_mulss end;
+      in_x86_subss: begin op:=A_subss end;
+      in_x86_addss: begin op:=A_addss end;
+      in_x86_movhlps: begin op:=A_movhlps end;
+      in_x86_movlhps: begin op:=A_movlhps; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, false);
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[2].location, true);
+    location:=paraarray[1].location;
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[2].location.register,paraarray[1].location.register));
+  end;
+in_x86_cmpss
+,in_x86_cmpps
+,in_x86_shufps
+: //var r0:xmm;r1:xmm;imm:i32;
+  begin
+    case inlinenumber of
+      in_x86_shufps: begin op:=A_shufps end;
+      in_x86_cmpps: begin op:=A_cmpps end;
+      in_x86_cmpss: begin op:=A_cmpss; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(3);
+
+    for i := 1 to 3 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, false);
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[2].location, true);
+    location:=paraarray[1].location;
+    current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(op,S_NO,GetConstInt(paraarray[3]),paraarray[2].location.register,paraarray[1].location.register));
+  end;
+in_x86_cvtsi2ss
+: //var r0:xmm;r1:r32;
+  begin
+    case inlinenumber of
+      in_x86_cvtsi2ss: begin op:=A_cvtsi2ss; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, false);
+    hlcg.location_force_reg(current_asmdata.CurrAsmList, paraarray[2].location, paraarray[2].resultdef,u32inttype,true);
+    location:=paraarray[1].location;
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[2].location.register,paraarray[1].location.register));
+  end;
+in_x86_cvtss2si
+,in_x86_cvttss2si
+: //out r0:r32;r1:xmm;
+  begin
+    case inlinenumber of
+      in_x86_cvttss2si: begin op:=A_cvttss2si end;
+      in_x86_cvtss2si: begin op:=A_cvtss2si; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(1);
+
+    for i := 1 to 1 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, true);
+    location_reset(location,LOC_REGISTER,OS_32);
+    location.register:=cg.getintregister(current_asmdata.CurrAsmList, OS_32);
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[1].location.register,location.register));
+  end;
+in_x86_cvtpi2ps
+: //var r0:xmm;r1:mm;
+  begin
+    case inlinenumber of
+      in_x86_cvtpi2ps: begin op:=A_cvtpi2ps; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, false);
+    location_force_mmxreg(current_asmdata.CurrAsmList, paraarray[2].location, true);
+    location:=paraarray[1].location;
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[2].location.register,paraarray[1].location.register));
+  end;
+in_x86_cvtps2pi
+,in_x86_cvttps2pi
+: //out r0:mm;r1:xmm;
+  begin
+    case inlinenumber of
+      in_x86_cvttps2pi: begin op:=A_cvttps2pi end;
+      in_x86_cvtps2pi: begin op:=A_cvtps2pi; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(1);
+
+    for i := 1 to 1 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, true);
+    location_reset(location,LOC_MMXREGISTER,OS_M64);
+    location.register:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[1].location.register,location.register));
+  end;
+in_x86_pmulhuw_mmx
+,in_x86_psadbw_mmx
+,in_x86_pavgb_mmx
+,in_x86_pavgw_mmx
+,in_x86_pmaxub_mmx
+,in_x86_pminub_mmx
+,in_x86_pmaxsw_mmx
+,in_x86_pminsw_mmx
+: //var r0:mm;r1:mm;
+  begin
+    case inlinenumber of
+      in_x86_pminsw_mmx: begin op:=A_pminsw end;
+      in_x86_pmaxsw_mmx: begin op:=A_pmaxsw end;
+      in_x86_pminub_mmx: begin op:=A_pminub end;
+      in_x86_pmaxub_mmx: begin op:=A_pmaxub end;
+      in_x86_pavgw_mmx: begin op:=A_pavgw end;
+      in_x86_pavgb_mmx: begin op:=A_pavgb end;
+      in_x86_psadbw_mmx: begin op:=A_psadbw end;
+      in_x86_pmulhuw_mmx: begin op:=A_pmulhuw; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_force_mmxreg(current_asmdata.CurrAsmList, paraarray[1].location, false);
+    location_force_mmxreg(current_asmdata.CurrAsmList, paraarray[2].location, true);
+    location:=paraarray[1].location;
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[2].location.register,paraarray[1].location.register));
+  end;
+in_x86_pextrw_mmx
+: //out r0:r32;r1:mm;imm:i32;
+  begin
+    case inlinenumber of
+      in_x86_pextrw_mmx: begin op:=A_pextrw; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_force_mmxreg(current_asmdata.CurrAsmList, paraarray[1].location, true);
+    location_reset(location,LOC_REGISTER,OS_32);
+    location.register:=cg.getintregister(current_asmdata.CurrAsmList, OS_32);
+    current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(op,S_NO,GetConstInt(paraarray[2]),paraarray[1].location.register,location.register));
+  end;
+in_x86_pinsrw_mmx
+: //var r0:mm;r1:r32;imm:i32;
+  begin
+    case inlinenumber of
+      in_x86_pinsrw_mmx: begin op:=A_pinsrw; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(3);
+
+    for i := 1 to 3 do secondpass(paraarray[i]);
+
+    location_force_mmxreg(current_asmdata.CurrAsmList, paraarray[1].location, false);
+    hlcg.location_force_reg(current_asmdata.CurrAsmList, paraarray[2].location, paraarray[2].resultdef,u32inttype,true);
+    location:=paraarray[1].location;
+    current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(op,S_NO,GetConstInt(paraarray[3]),paraarray[2].location.register,paraarray[1].location.register));
+  end;
+in_x86_pmovmskb
+: //out r0:r32;r1:mm;
+  begin
+    case inlinenumber of
+      in_x86_pmovmskb: begin op:=A_pmovmskb; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(1);
+
+    for i := 1 to 1 do secondpass(paraarray[i]);
+
+    location_force_mmxreg(current_asmdata.CurrAsmList, paraarray[1].location, true);
+    location_reset(location,LOC_REGISTER,OS_32);
+    location.register:=cg.getintregister(current_asmdata.CurrAsmList, OS_32);
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[1].location.register,location.register));
+  end;
+in_x86_pshufw
+: //out r0:mm;r1:mm;imm:i32;
+  begin
+    case inlinenumber of
+      in_x86_pshufw: begin op:=A_pshufw; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_force_mmxreg(current_asmdata.CurrAsmList, paraarray[1].location, true);
+    location_reset(location,LOC_MMXREGISTER,OS_M64);
+    location.register:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
+    current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(op,S_NO,GetConstInt(paraarray[2]),paraarray[1].location.register,location.register));
+  end;
+in_x86_pextrw
+: //out r0:r32;r1:xmm;imm:i32;
+  begin
+    case inlinenumber of
+      in_x86_pextrw: begin op:=A_pextrw; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, true);
+    location_reset(location,LOC_REGISTER,OS_32);
+    location.register:=cg.getintregister(current_asmdata.CurrAsmList, OS_32);
+    current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(op,S_NO,GetConstInt(paraarray[2]),paraarray[1].location.register,location.register));
+  end;
+in_x86_pinsrw
+: //var r0:xmm;r1:r32;imm:i32;
+  begin
+    case inlinenumber of
+      in_x86_pinsrw: begin op:=A_pinsrw; end;
+      else
+        Internalerror(2020010201);
+    end;
+
+    GetParameters(3);
+
+    for i := 1 to 3 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, false);
+    hlcg.location_force_reg(current_asmdata.CurrAsmList, paraarray[2].location, paraarray[2].resultdef,u32inttype,true);
+    location:=paraarray[1].location;
+    current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(op,S_NO,GetConstInt(paraarray[3]),paraarray[2].location.register,paraarray[1].location.register));
+  end;

+ 157 - 0
compiler/x86/x86mmtype.inc

@@ -0,0 +1,157 @@
+in_x86_movss
+,in_x86_movaps
+,in_x86_movups
+: //out r0:xmm;r1:ptr32;
+  begin
+    CheckParameters(1);
+    resultdef:=x86_m128type;
+  end;
+in_x86_movss_to_mem
+,in_x86_movaps_to_mem
+,in_x86_movups_to_mem
+: //r0:ptr32;r1:xmm;
+  begin
+    CheckParameters(2);
+    resultdef:=voidtype;
+  end;
+in_x86_movss_to_val
+: //out r0:f32;r1:xmm;
+  begin
+    CheckParameters(1);
+    resultdef:=s32floattype;
+  end;
+in_x86_movss_from_val
+: //out r0:xmm;r1:f32;
+  begin
+    CheckParameters(1);
+    resultdef:=x86_m128type;
+  end;
+in_x86_movlps
+,in_x86_movhps
+: //var r0:xmm;r1:ptr32;
+  begin
+    CheckParameters(2);
+    resultdef:=x86_m128type;
+  end;
+in_x86_movlhps
+,in_x86_movhlps
+,in_x86_addss
+,in_x86_subss
+,in_x86_mulss
+,in_x86_divss
+,in_x86_rcpss
+,in_x86_sqrtss
+,in_x86_maxss
+,in_x86_minss
+,in_x86_rsqrtss
+,in_x86_addps
+,in_x86_subps
+,in_x86_mulps
+,in_x86_divps
+,in_x86_rcpps
+,in_x86_sqrtps
+,in_x86_maxps
+,in_x86_minps
+,in_x86_rsqrtps
+,in_x86_andps
+,in_x86_orps
+,in_x86_xorps
+,in_x86_andnps
+,in_x86_unpckhps
+,in_x86_unpcklps
+,in_x86_pmulhuw
+,in_x86_psadbw
+,in_x86_pavgb
+,in_x86_pavgw
+,in_x86_pmaxub
+,in_x86_pminub
+,in_x86_pmaxsw
+,in_x86_pminsw
+: //var r0:xmm;r1:xmm;
+  begin
+    CheckParameters(2);
+    resultdef:=x86_m128type;
+  end;
+in_x86_cmpss
+,in_x86_cmpps
+,in_x86_shufps
+: //var r0:xmm;r1:xmm;imm:i32;
+  begin
+    CheckParameters(3);
+    resultdef:=x86_m128type;
+  end;
+in_x86_cvtsi2ss
+: //var r0:xmm;r1:r32;
+  begin
+    CheckParameters(2);
+    resultdef:=x86_m128type;
+  end;
+in_x86_cvtss2si
+,in_x86_cvttss2si
+: //out r0:r32;r1:xmm;
+  begin
+    CheckParameters(1);
+    resultdef:=u32inttype;
+  end;
+in_x86_cvtpi2ps
+: //var r0:xmm;r1:mm;
+  begin
+    CheckParameters(2);
+    resultdef:=x86_m128type;
+  end;
+in_x86_cvtps2pi
+,in_x86_cvttps2pi
+: //out r0:mm;r1:xmm;
+  begin
+    CheckParameters(1);
+    resultdef:=x86_m64type;
+  end;
+in_x86_pmulhuw_mmx
+,in_x86_psadbw_mmx
+,in_x86_pavgb_mmx
+,in_x86_pavgw_mmx
+,in_x86_pmaxub_mmx
+,in_x86_pminub_mmx
+,in_x86_pmaxsw_mmx
+,in_x86_pminsw_mmx
+: //var r0:mm;r1:mm;
+  begin
+    CheckParameters(2);
+    resultdef:=x86_m64type;
+  end;
+in_x86_pextrw_mmx
+: //out r0:r32;r1:mm;imm:i32;
+  begin
+    CheckParameters(2);
+    resultdef:=u32inttype;
+  end;
+in_x86_pinsrw_mmx
+: //var r0:mm;r1:r32;imm:i32;
+  begin
+    CheckParameters(3);
+    resultdef:=x86_m64type;
+  end;
+in_x86_pmovmskb
+: //out r0:r32;r1:mm;
+  begin
+    CheckParameters(1);
+    resultdef:=u32inttype;
+  end;
+in_x86_pshufw
+: //out r0:mm;r1:mm;imm:i32;
+  begin
+    CheckParameters(2);
+    resultdef:=x86_m64type;
+  end;
+in_x86_pextrw
+: //out r0:r32;r1:xmm;imm:i32;
+  begin
+    CheckParameters(2);
+    resultdef:=u32inttype;
+  end;
+in_x86_pinsrw
+: //var r0:xmm;r1:r32;imm:i32;
+  begin
+    CheckParameters(3);
+    resultdef:=x86_m128type;
+  end;

+ 3 - 0
rtl/i386/cpuh.inc

@@ -40,3 +40,6 @@ function fpc_x86_get_ds:longint;[internproc:fpc_in_x86_get_ds];
 function fpc_x86_get_es:longint;[internproc:fpc_in_x86_get_es];
 function fpc_x86_get_fs:longint;[internproc:fpc_in_x86_get_fs];
 function fpc_x86_get_gs:longint;[internproc:fpc_in_x86_get_gs];
+
+{ include automatically generated procs }
+{$i cpummprocs.inc}  

+ 4 - 1
rtl/i386/cpuinnr.inc

@@ -25,4 +25,7 @@
   fpc_in_x86_get_ds   = fpc_in_cpu_first+10;
   fpc_in_x86_get_es   = fpc_in_cpu_first+11;
   fpc_in_x86_get_fs   = fpc_in_cpu_first+12;
-  fpc_in_x86_get_gs   = fpc_in_cpu_first+13;
+  fpc_in_x86_get_gs   = fpc_in_cpu_first+13;
+  
+  { include automatically generated numbers }
+  {$i cpumminnr.inc}  

+ 67 - 0
rtl/i386/cpumminnr.inc

@@ -0,0 +1,67 @@
+  fpc_in_x86_movss = fpc_in_x86_mm_first+0;
+  fpc_in_x86_movaps = fpc_in_x86_mm_first+1;
+  fpc_in_x86_movups = fpc_in_x86_mm_first+2;
+  fpc_in_x86_movss_to_mem = fpc_in_x86_mm_first+3;
+  fpc_in_x86_movaps_to_mem = fpc_in_x86_mm_first+4;
+  fpc_in_x86_movups_to_mem = fpc_in_x86_mm_first+5;
+  fpc_in_x86_movss_to_val = fpc_in_x86_mm_first+6;
+  fpc_in_x86_movss_from_val = fpc_in_x86_mm_first+7;
+  fpc_in_x86_movlps = fpc_in_x86_mm_first+8;
+  fpc_in_x86_movhps = fpc_in_x86_mm_first+9;
+  fpc_in_x86_movlhps = fpc_in_x86_mm_first+10;
+  fpc_in_x86_movhlps = fpc_in_x86_mm_first+11;
+  fpc_in_x86_addss = fpc_in_x86_mm_first+12;
+  fpc_in_x86_subss = fpc_in_x86_mm_first+13;
+  fpc_in_x86_mulss = fpc_in_x86_mm_first+14;
+  fpc_in_x86_divss = fpc_in_x86_mm_first+15;
+  fpc_in_x86_rcpss = fpc_in_x86_mm_first+16;
+  fpc_in_x86_sqrtss = fpc_in_x86_mm_first+17;
+  fpc_in_x86_maxss = fpc_in_x86_mm_first+18;
+  fpc_in_x86_minss = fpc_in_x86_mm_first+19;
+  fpc_in_x86_rsqrtss = fpc_in_x86_mm_first+20;
+  fpc_in_x86_addps = fpc_in_x86_mm_first+21;
+  fpc_in_x86_subps = fpc_in_x86_mm_first+22;
+  fpc_in_x86_mulps = fpc_in_x86_mm_first+23;
+  fpc_in_x86_divps = fpc_in_x86_mm_first+24;
+  fpc_in_x86_rcpps = fpc_in_x86_mm_first+25;
+  fpc_in_x86_sqrtps = fpc_in_x86_mm_first+26;
+  fpc_in_x86_maxps = fpc_in_x86_mm_first+27;
+  fpc_in_x86_minps = fpc_in_x86_mm_first+28;
+  fpc_in_x86_rsqrtps = fpc_in_x86_mm_first+29;
+  fpc_in_x86_andps = fpc_in_x86_mm_first+30;
+  fpc_in_x86_orps = fpc_in_x86_mm_first+31;
+  fpc_in_x86_xorps = fpc_in_x86_mm_first+32;
+  fpc_in_x86_andnps = fpc_in_x86_mm_first+33;
+  fpc_in_x86_cmpss = fpc_in_x86_mm_first+34;
+  fpc_in_x86_cmpps = fpc_in_x86_mm_first+35;
+  fpc_in_x86_shufps = fpc_in_x86_mm_first+36;
+  fpc_in_x86_unpckhps = fpc_in_x86_mm_first+37;
+  fpc_in_x86_unpcklps = fpc_in_x86_mm_first+38;
+  fpc_in_x86_cvtsi2ss = fpc_in_x86_mm_first+39;
+  fpc_in_x86_cvtss2si = fpc_in_x86_mm_first+40;
+  fpc_in_x86_cvttss2si = fpc_in_x86_mm_first+41;
+  fpc_in_x86_cvtpi2ps = fpc_in_x86_mm_first+42;
+  fpc_in_x86_cvtps2pi = fpc_in_x86_mm_first+43;
+  fpc_in_x86_cvttps2pi = fpc_in_x86_mm_first+44;
+  fpc_in_x86_pmulhuw_mmx = fpc_in_x86_mm_first+45;
+  fpc_in_x86_psadbw_mmx = fpc_in_x86_mm_first+46;
+  fpc_in_x86_pavgb_mmx = fpc_in_x86_mm_first+47;
+  fpc_in_x86_pavgw_mmx = fpc_in_x86_mm_first+48;
+  fpc_in_x86_pmaxub_mmx = fpc_in_x86_mm_first+49;
+  fpc_in_x86_pminub_mmx = fpc_in_x86_mm_first+50;
+  fpc_in_x86_pmaxsw_mmx = fpc_in_x86_mm_first+51;
+  fpc_in_x86_pminsw_mmx = fpc_in_x86_mm_first+52;
+  fpc_in_x86_pextrw_mmx = fpc_in_x86_mm_first+53;
+  fpc_in_x86_pinsrw_mmx = fpc_in_x86_mm_first+54;
+  fpc_in_x86_pmovmskb = fpc_in_x86_mm_first+55;
+  fpc_in_x86_pshufw = fpc_in_x86_mm_first+56;
+  fpc_in_x86_pmulhuw = fpc_in_x86_mm_first+57;
+  fpc_in_x86_psadbw = fpc_in_x86_mm_first+58;
+  fpc_in_x86_pavgb = fpc_in_x86_mm_first+59;
+  fpc_in_x86_pavgw = fpc_in_x86_mm_first+60;
+  fpc_in_x86_pmaxub = fpc_in_x86_mm_first+61;
+  fpc_in_x86_pminub = fpc_in_x86_mm_first+62;
+  fpc_in_x86_pmaxsw = fpc_in_x86_mm_first+63;
+  fpc_in_x86_pminsw = fpc_in_x86_mm_first+64;
+  fpc_in_x86_pextrw = fpc_in_x86_mm_first+65;
+  fpc_in_x86_pinsrw = fpc_in_x86_mm_first+66;

+ 67 - 0
rtl/i386/cpummprocs.inc

@@ -0,0 +1,67 @@
+function x86_movss(r1: pointer): __m128; [INTERNPROC: fpc_in_x86_movss];
+function x86_movaps(r1: pointer): __m128; [INTERNPROC: fpc_in_x86_movaps];
+function x86_movups(r1: pointer): __m128; [INTERNPROC: fpc_in_x86_movups];
+procedure x86_movss(r0: pointer; r1: __m128); [INTERNPROC: fpc_in_x86_movss_to_mem];
+procedure x86_movaps(r0: pointer; r1: __m128); [INTERNPROC: fpc_in_x86_movaps_to_mem];
+procedure x86_movups(r0: pointer; r1: __m128); [INTERNPROC: fpc_in_x86_movups_to_mem];
+function x86_movss(r1: __m128): single; [INTERNPROC: fpc_in_x86_movss_to_val];
+function x86_movss(r1: single): __m128; [INTERNPROC: fpc_in_x86_movss_from_val];
+function x86_movlps(r0: __m128; r1: pointer): __m128; [INTERNPROC: fpc_in_x86_movlps];
+function x86_movhps(r0: __m128; r1: pointer): __m128; [INTERNPROC: fpc_in_x86_movhps];
+function x86_movlhps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_movlhps];
+function x86_movhlps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_movhlps];
+function x86_addss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_addss];
+function x86_subss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_subss];
+function x86_mulss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_mulss];
+function x86_divss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_divss];
+function x86_rcpss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_rcpss];
+function x86_sqrtss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_sqrtss];
+function x86_maxss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_maxss];
+function x86_minss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_minss];
+function x86_rsqrtss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_rsqrtss];
+function x86_addps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_addps];
+function x86_subps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_subps];
+function x86_mulps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_mulps];
+function x86_divps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_divps];
+function x86_rcpps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_rcpps];
+function x86_sqrtps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_sqrtps];
+function x86_maxps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_maxps];
+function x86_minps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_minps];
+function x86_rsqrtps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_rsqrtps];
+function x86_andps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_andps];
+function x86_orps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_orps];
+function x86_xorps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_xorps];
+function x86_andnps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_andnps];
+function x86_cmpss(r0, r1: __m128; imm: longint): __m128; [INTERNPROC: fpc_in_x86_cmpss];
+function x86_cmpps(r0, r1: __m128; imm: longint): __m128; [INTERNPROC: fpc_in_x86_cmpps];
+function x86_shufps(r0, r1: __m128; imm: longint): __m128; [INTERNPROC: fpc_in_x86_shufps];
+function x86_unpckhps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_unpckhps];
+function x86_unpcklps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_unpcklps];
+function x86_cvtsi2ss(r0: __m128; r1: longword): __m128; [INTERNPROC: fpc_in_x86_cvtsi2ss];
+function x86_cvtss2si(r1: __m128): longword; [INTERNPROC: fpc_in_x86_cvtss2si];
+function x86_cvttss2si(r1: __m128): longword; [INTERNPROC: fpc_in_x86_cvttss2si];
+function x86_cvtpi2ps(r0: __m128; r1: __m64): __m128; [INTERNPROC: fpc_in_x86_cvtpi2ps];
+function x86_cvtps2pi(r1: __m128): __m64; [INTERNPROC: fpc_in_x86_cvtps2pi];
+function x86_cvttps2pi(r1: __m128): __m64; [INTERNPROC: fpc_in_x86_cvttps2pi];
+function x86_pmulhuw(r0, r1: __m64): __m64; [INTERNPROC: fpc_in_x86_pmulhuw_mmx];
+function x86_psadbw(r0, r1: __m64): __m64; [INTERNPROC: fpc_in_x86_psadbw_mmx];
+function x86_pavgb(r0, r1: __m64): __m64; [INTERNPROC: fpc_in_x86_pavgb_mmx];
+function x86_pavgw(r0, r1: __m64): __m64; [INTERNPROC: fpc_in_x86_pavgw_mmx];
+function x86_pmaxub(r0, r1: __m64): __m64; [INTERNPROC: fpc_in_x86_pmaxub_mmx];
+function x86_pminub(r0, r1: __m64): __m64; [INTERNPROC: fpc_in_x86_pminub_mmx];
+function x86_pmaxsw(r0, r1: __m64): __m64; [INTERNPROC: fpc_in_x86_pmaxsw_mmx];
+function x86_pminsw(r0, r1: __m64): __m64; [INTERNPROC: fpc_in_x86_pminsw_mmx];
+function x86_pextrw(r1: __m64; imm: longint): longword; [INTERNPROC: fpc_in_x86_pextrw_mmx];
+function x86_pinsrw(r0: __m64; r1: longword; imm: longint): __m64; [INTERNPROC: fpc_in_x86_pinsrw_mmx];
+function x86_pmovmskb(r1: __m64): longword; [INTERNPROC: fpc_in_x86_pmovmskb];
+function x86_pshufw(r1: __m64; imm: longint): __m64; [INTERNPROC: fpc_in_x86_pshufw];
+function x86_pmulhuw(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_pmulhuw];
+function x86_psadbw(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_psadbw];
+function x86_pavgb(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_pavgb];
+function x86_pavgw(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_pavgw];
+function x86_pmaxub(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_pmaxub];
+function x86_pminub(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_pminub];
+function x86_pmaxsw(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_pmaxsw];
+function x86_pminsw(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_pminsw];
+function x86_pextrw(r1: __m128; imm: longint): longword; [INTERNPROC: fpc_in_x86_pextrw];
+function x86_pinsrw(r0: __m128; r1: longword; imm: longint): __m128; [INTERNPROC: fpc_in_x86_pinsrw];

+ 2 - 0
rtl/inc/innr.inc

@@ -154,3 +154,5 @@ const
    { SSE }
 
    fpc_in_cpu_first        = 10000;
+   fpc_in_x86_mm_first     = 11000;
+   

+ 0 - 1
rtl/inc/systemh.inc

@@ -837,7 +837,6 @@ procedure WriteBarrier;
 procedure MoveData(srcseg,srcoff,destseg,destoff:Word;n:Word);
 {$endif cpui8086}
 
-
 {****************************************************************************
                           Math Routines
 ****************************************************************************}

+ 3 - 0
rtl/x86_64/cpuh.inc

@@ -31,3 +31,6 @@ function fpc_x86_get_ds:longint;[internproc:fpc_in_x86_get_ds];
 function fpc_x86_get_es:longint;[internproc:fpc_in_x86_get_es];
 function fpc_x86_get_fs:longint;[internproc:fpc_in_x86_get_fs];
 function fpc_x86_get_gs:longint;[internproc:fpc_in_x86_get_gs];
+
+{ include automatically generated procs }
+{ $i cpummprocs.inc}

+ 4 - 1
rtl/x86_64/cpuinnr.inc

@@ -25,4 +25,7 @@
   fpc_in_x86_get_ds   = fpc_in_cpu_first+10;
   fpc_in_x86_get_es   = fpc_in_cpu_first+11;
   fpc_in_x86_get_fs   = fpc_in_cpu_first+12;
-  fpc_in_x86_get_gs   = fpc_in_cpu_first+13;
+  fpc_in_x86_get_gs   = fpc_in_cpu_first+13;
+  
+   { include automatically generated numbers }
+   { $i cpumminnr.inc}

+ 67 - 0
rtl/x86_64/cpumminnr.inc

@@ -0,0 +1,67 @@
+  fpc_in_x86_movss = fpc_in_x86_mm_first+0;
+  fpc_in_x86_movaps = fpc_in_x86_mm_first+1;
+  fpc_in_x86_movups = fpc_in_x86_mm_first+2;
+  fpc_in_x86_movss_to_mem = fpc_in_x86_mm_first+3;
+  fpc_in_x86_movaps_to_mem = fpc_in_x86_mm_first+4;
+  fpc_in_x86_movups_to_mem = fpc_in_x86_mm_first+5;
+  fpc_in_x86_movss_to_val = fpc_in_x86_mm_first+6;
+  fpc_in_x86_movss_from_val = fpc_in_x86_mm_first+7;
+  fpc_in_x86_movlps = fpc_in_x86_mm_first+8;
+  fpc_in_x86_movhps = fpc_in_x86_mm_first+9;
+  fpc_in_x86_movlhps = fpc_in_x86_mm_first+10;
+  fpc_in_x86_movhlps = fpc_in_x86_mm_first+11;
+  fpc_in_x86_addss = fpc_in_x86_mm_first+12;
+  fpc_in_x86_subss = fpc_in_x86_mm_first+13;
+  fpc_in_x86_mulss = fpc_in_x86_mm_first+14;
+  fpc_in_x86_divss = fpc_in_x86_mm_first+15;
+  fpc_in_x86_rcpss = fpc_in_x86_mm_first+16;
+  fpc_in_x86_sqrtss = fpc_in_x86_mm_first+17;
+  fpc_in_x86_maxss = fpc_in_x86_mm_first+18;
+  fpc_in_x86_minss = fpc_in_x86_mm_first+19;
+  fpc_in_x86_rsqrtss = fpc_in_x86_mm_first+20;
+  fpc_in_x86_addps = fpc_in_x86_mm_first+21;
+  fpc_in_x86_subps = fpc_in_x86_mm_first+22;
+  fpc_in_x86_mulps = fpc_in_x86_mm_first+23;
+  fpc_in_x86_divps = fpc_in_x86_mm_first+24;
+  fpc_in_x86_rcpps = fpc_in_x86_mm_first+25;
+  fpc_in_x86_sqrtps = fpc_in_x86_mm_first+26;
+  fpc_in_x86_maxps = fpc_in_x86_mm_first+27;
+  fpc_in_x86_minps = fpc_in_x86_mm_first+28;
+  fpc_in_x86_rsqrtps = fpc_in_x86_mm_first+29;
+  fpc_in_x86_andps = fpc_in_x86_mm_first+30;
+  fpc_in_x86_orps = fpc_in_x86_mm_first+31;
+  fpc_in_x86_xorps = fpc_in_x86_mm_first+32;
+  fpc_in_x86_andnps = fpc_in_x86_mm_first+33;
+  fpc_in_x86_cmpss = fpc_in_x86_mm_first+34;
+  fpc_in_x86_cmpps = fpc_in_x86_mm_first+35;
+  fpc_in_x86_shufps = fpc_in_x86_mm_first+36;
+  fpc_in_x86_unpckhps = fpc_in_x86_mm_first+37;
+  fpc_in_x86_unpcklps = fpc_in_x86_mm_first+38;
+  fpc_in_x86_cvtsi2ss = fpc_in_x86_mm_first+39;
+  fpc_in_x86_cvtss2si = fpc_in_x86_mm_first+40;
+  fpc_in_x86_cvttss2si = fpc_in_x86_mm_first+41;
+  fpc_in_x86_cvtpi2ps = fpc_in_x86_mm_first+42;
+  fpc_in_x86_cvtps2pi = fpc_in_x86_mm_first+43;
+  fpc_in_x86_cvttps2pi = fpc_in_x86_mm_first+44;
+  fpc_in_x86_pmulhuw_mmx = fpc_in_x86_mm_first+45;
+  fpc_in_x86_psadbw_mmx = fpc_in_x86_mm_first+46;
+  fpc_in_x86_pavgb_mmx = fpc_in_x86_mm_first+47;
+  fpc_in_x86_pavgw_mmx = fpc_in_x86_mm_first+48;
+  fpc_in_x86_pmaxub_mmx = fpc_in_x86_mm_first+49;
+  fpc_in_x86_pminub_mmx = fpc_in_x86_mm_first+50;
+  fpc_in_x86_pmaxsw_mmx = fpc_in_x86_mm_first+51;
+  fpc_in_x86_pminsw_mmx = fpc_in_x86_mm_first+52;
+  fpc_in_x86_pextrw_mmx = fpc_in_x86_mm_first+53;
+  fpc_in_x86_pinsrw_mmx = fpc_in_x86_mm_first+54;
+  fpc_in_x86_pmovmskb = fpc_in_x86_mm_first+55;
+  fpc_in_x86_pshufw = fpc_in_x86_mm_first+56;
+  fpc_in_x86_pmulhuw = fpc_in_x86_mm_first+57;
+  fpc_in_x86_psadbw = fpc_in_x86_mm_first+58;
+  fpc_in_x86_pavgb = fpc_in_x86_mm_first+59;
+  fpc_in_x86_pavgw = fpc_in_x86_mm_first+60;
+  fpc_in_x86_pmaxub = fpc_in_x86_mm_first+61;
+  fpc_in_x86_pminub = fpc_in_x86_mm_first+62;
+  fpc_in_x86_pmaxsw = fpc_in_x86_mm_first+63;
+  fpc_in_x86_pminsw = fpc_in_x86_mm_first+64;
+  fpc_in_x86_pextrw = fpc_in_x86_mm_first+65;
+  fpc_in_x86_pinsrw = fpc_in_x86_mm_first+66;

+ 68 - 0
rtl/x86_64/cpummprocs.inc

@@ -0,0 +1,68 @@
+function x86_movss(r1: pointer): __m128; [INTERNPROC: fpc_in_x86_movss];
+function x86_movaps(r1: pointer): __m128; [INTERNPROC: fpc_in_x86_movaps];
+function x86_movups(r1: pointer): __m128; [INTERNPROC: fpc_in_x86_movups];
+procedure x86_movss(r0: pointer; r1: __m128); [INTERNPROC: fpc_in_x86_movss_to_mem];
+procedure x86_movaps(r0: pointer; r1: __m128); [INTERNPROC: fpc_in_x86_movaps_to_mem];
+procedure x86_movups(r0: pointer; r1: __m128); [INTERNPROC: fpc_in_x86_movups_to_mem];
+function x86_movss(r1: __m128): single; [INTERNPROC: fpc_in_x86_movss_to_val];
+function x86_movss(r1: single): __m128; [INTERNPROC: fpc_in_x86_movss_from_val];
+function x86_movlps(r0: __m128; r1: pointer): __m128; [INTERNPROC: fpc_in_x86_movlps];
+function x86_movhps(r0: __m128; r1: pointer): __m128; [INTERNPROC: fpc_in_x86_movhps];
+function x86_movlhps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_movlhps];
+function x86_movhlps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_movhlps];
+function x86_addss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_addss];
+function x86_subss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_subss];
+function x86_mulss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_mulss];
+function x86_divss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_divss];
+function x86_rcpss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_rcpss];
+function x86_sqrtss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_sqrtss];
+function x86_maxss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_maxss];
+function x86_minss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_minss];
+function x86_rsqrtss(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_rsqrtss];
+function x86_addps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_addps];
+function x86_subps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_subps];
+function x86_mulps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_mulps];
+function x86_divps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_divps];
+function x86_rcpps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_rcpps];
+function x86_sqrtps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_sqrtps];
+function x86_maxps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_maxps];
+function x86_minps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_minps];
+function x86_rsqrtps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_rsqrtps];
+function x86_andps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_andps];
+function x86_orps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_orps];
+function x86_xorps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_xorps];
+function x86_andnps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_andnps];
+function x86_cmpss(r0, r1: __m128; imm: longint): __m128; [INTERNPROC: fpc_in_x86_cmpss];
+function x86_cmpps(r0, r1: __m128; imm: longint): __m128; [INTERNPROC: fpc_in_x86_cmpps];
+function x86_shufps(r0, r1: __m128; imm: longint): __m128; [INTERNPROC: fpc_in_x86_shufps];
+function x86_unpckhps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_unpckhps];
+function x86_unpcklps(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_unpcklps];
+function x86_cvtsi2ss(r0: __m128; r1: longword): __m128; [INTERNPROC: fpc_in_x86_cvtsi2ss];
+function x86_cvtss2si(r1: __m128): longword; [INTERNPROC: fpc_in_x86_cvtss2si];
+function x86_cvttss2si(r1: __m128): longword; [INTERNPROC: fpc_in_x86_cvttss2si];
+function x86_cvtpi2ps(r0: __m128; r1: __m64): __m128; [INTERNPROC: fpc_in_x86_cvtpi2ps];
+function x86_cvtps2pi(r1: __m128): __m64; [INTERNPROC: fpc_in_x86_cvtps2pi];
+function x86_cvttps2pi(r1: __m128): __m64; [INTERNPROC: fpc_in_x86_cvttps2pi];
+function x86_pmulhuw(r0, r1: __m64): __m64; [INTERNPROC: fpc_in_x86_pmulhuw_mmx];
+function x86_psadbw(r0, r1: __m64): __m64; [INTERNPROC: fpc_in_x86_psadbw_mmx];
+function x86_pavgb(r0, r1: __m64): __m64; [INTERNPROC: fpc_in_x86_pavgb_mmx];
+function x86_pavgw(r0, r1: __m64): __m64; [INTERNPROC: fpc_in_x86_pavgw_mmx];
+function x86_pmaxub(r0, r1: __m64): __m64; [INTERNPROC: fpc_in_x86_pmaxub_mmx];
+function x86_pminub(r0, r1: __m64): __m64; [INTERNPROC: fpc_in_x86_pminub_mmx];
+function x86_pmaxsw(r0, r1: __m64): __m64; [INTERNPROC: fpc_in_x86_pmaxsw_mmx];
+function x86_pminsw(r0, r1: __m64): __m64; [INTERNPROC: fpc_in_x86_pminsw_mmx];
+function x86_pextrw(r1: __m64; imm: longint): longword; [INTERNPROC: fpc_in_x86_pextrw_mmx];
+function x86_pinsrw(r0: __m64; r1: longword; imm: longint): __m64; [INTERNPROC: fpc_in_x86_pinsrw_mmx];
+function x86_pmovmskb(r1: __m64): longword; [INTERNPROC: fpc_in_x86_pmovmskb];
+function x86_pshufw(r1: __m64; imm: longint): __m64; [INTERNPROC: fpc_in_x86_pshufw];
+function x86_pmulhuw(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_pmulhuw];
+function x86_psadbw(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_psadbw];
+function x86_pavgb(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_pavgb];
+function x86_pavgw(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_pavgw];
+function x86_pmaxub(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_pmaxub];
+function x86_pminub(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_pminub];
+function x86_pmaxsw(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_pmaxsw];
+function x86_pminsw(r0, r1: __m128): __m128; [INTERNPROC: fpc_in_x86_pminsw];
+function x86_pextrw(r1: __m128; imm: longint): longword; [INTERNPROC: fpc_in_x86_pextrw];
+function x86_pinsrw(r0: __m128; r1: longword; imm: longint): __m128; [INTERNPROC: fpc_in_x86_pinsrw];
+