瀏覽代碼

Add support for SSE and related MMX intrinsics. Still needs a lot of polishing for mmreg vars and parameter passing.

git-svn-id: branches/laksen/intrinsics@31135 -
Jeppe Johansen 10 年之前
父節點
當前提交
939da7273a

+ 10 - 0
.gitattributes

@@ -782,6 +782,7 @@ compiler/utils/mkjvmreg.pp svneol=native#text/plain
 compiler/utils/mkmpsreg.pp svneol=native#text/plain
 compiler/utils/mkppcreg.pp svneol=native#text/plain
 compiler/utils/mkspreg.pp svneol=native#text/plain
+compiler/utils/mkx86inl.pp svneol=native#text/plain
 compiler/utils/mkx86ins.pp svneol=native#text/plain
 compiler/utils/mkx86reg.pp svneol=native#text/plain
 compiler/utils/msg2inc.pp svneol=native#text/plain
@@ -825,8 +826,13 @@ compiler/x86/rax86int.pas svneol=native#text/plain
 compiler/x86/rgx86.pas svneol=native#text/plain
 compiler/x86/symi86.pas svneol=native#text/plain
 compiler/x86/symx86.pas svneol=native#text/plain
+compiler/x86/x86first.inc svneol=native#text/plain
+compiler/x86/x86innr.inc svneol=native#text/plain
 compiler/x86/x86ins.dat svneol=native#text/plain
+compiler/x86/x86intr.dat svneol=native#text/plain
 compiler/x86/x86reg.dat svneol=native#text/plain
+compiler/x86/x86second.inc svneol=native#text/plain
+compiler/x86/x86type.inc svneol=native#text/plain
 compiler/x86_64/aoptcpu.pas svneol=native#text/plain
 compiler/x86_64/aoptcpub.pas svneol=native#text/plain
 compiler/x86_64/aoptcpud.pas svneol=native#text/plain
@@ -8497,6 +8503,8 @@ rtl/i386/setjump.inc svneol=native#text/plain
 rtl/i386/setjumph.inc svneol=native#text/plain
 rtl/i386/strings.inc svneol=native#text/plain
 rtl/i386/stringss.inc svneol=native#text/plain
+rtl/i386/x86innr.inc svneol=native#text/plain
+rtl/i386/x86procs.inc svneol=native#text/plain
 rtl/i8086/hugeptr.inc svneol=native#text/plain
 rtl/i8086/i8086.inc svneol=native#text/plain
 rtl/i8086/int32p.inc svneol=native#text/plain
@@ -9601,6 +9609,8 @@ rtl/x86_64/setjumph.inc svneol=native#text/plain
 rtl/x86_64/strings.inc svneol=native#text/plain
 rtl/x86_64/stringss.inc svneol=native#text/plain
 rtl/x86_64/x86_64.inc svneol=native#text/plain
+rtl/x86_64/x86innr.inc svneol=native#text/plain
+rtl/x86_64/x86procs.inc svneol=native#text/plain
 tests/MPWMake -text
 tests/Makefile svneol=native#text/plain
 tests/Makefile.fpc svneol=native#text/plain

+ 2 - 2
compiler/Makefile

@@ -489,10 +489,10 @@ endif
 endif
 override LOCALOPT+=-d$(CPC_TARGET) -dGDB -dBROWSERLOG
 ifeq ($(PPC_TARGET),i386)
-override LOCALOPT+=-Fux86
+override LOCALOPT+=-Fux86 -Fix86
 endif
 ifeq ($(PPC_TARGET),x86_64)
-override LOCALOPT+=-Fux86
+override LOCALOPT+=-Fux86 -Fix86
 endif
 ifeq ($(PPC_TARGET),powerpc)
 override LOCALOPT+=-Fuppcgen

+ 2 - 2
compiler/Makefile.fpc

@@ -241,12 +241,12 @@ override LOCALOPT+=-d$(CPC_TARGET) -dGDB -dBROWSERLOG
 
 # i386 specific
 ifeq ($(PPC_TARGET),i386)
-override LOCALOPT+=-Fux86
+override LOCALOPT+=-Fux86 -Fix86
 endif
 
 # x86_64 specific
 ifeq ($(PPC_TARGET),x86_64)
-override LOCALOPT+=-Fux86
+override LOCALOPT+=-Fux86 -Fix86
 endif
 
 # PowerPC specific

+ 3 - 0
compiler/compinnr.inc

@@ -138,3 +138,6 @@ const
 
 { ARM }
    in_arm_base         = 300;
+
+{ X86 }
+   in_x86_base         = 500;

+ 53 - 13
compiler/defutil.pas

@@ -1081,8 +1081,9 @@ implementation
       begin
         result:=(p.typ=arraydef) and
                 not(is_special_array(p)) and
+                (tarraydef(p).elementdef.typ in [floatdef,orddef]) {and
                 (tarraydef(p).elementdef.typ=floatdef) and
-                (tfloatdef(tarraydef(p).elementdef).floattype in [s32real,s64real]);
+                (tfloatdef(tarraydef(p).elementdef).floattype in [s32real,s64real])};
       end;
 
 
@@ -1092,21 +1093,60 @@ implementation
 {$ifdef x86}
         result:= is_vector(p) and
                  (
-                  (tarraydef(p).elementdef.typ=floatdef) and
                   (
-                   (tarraydef(p).lowrange=0) and
-                   (tarraydef(p).highrange=3) and
-                   (tfloatdef(tarraydef(p).elementdef).floattype=s32real)
-                  )
-                 ) or
+                   (tarraydef(p).elementdef.typ=floatdef) and
+                   (
+                    (tarraydef(p).lowrange=0) and
+                    (tarraydef(p).highrange=3) and
+                    (tfloatdef(tarraydef(p).elementdef).floattype=s32real)
+                   )
+                  ) or
 
-                 (
-                  (tarraydef(p).elementdef.typ=floatdef) and
                   (
-                   (tarraydef(p).lowrange=0) and
-                   (tarraydef(p).highrange=1) and
-                   (tfloatdef(tarraydef(p).elementdef).floattype=s64real)
-                  )
+                   (tarraydef(p).elementdef.typ=floatdef) and
+                   (
+                    (tarraydef(p).lowrange=0) and
+                    (tarraydef(p).highrange=1) and
+                    (tfloatdef(tarraydef(p).elementdef).floattype=s64real)
+                   )
+                  ) {or
+
+                  // MMX registers
+                  (
+                   (tarraydef(p).elementdef.typ=floatdef) and
+                   (
+                    (tarraydef(p).lowrange=0) and
+                    (tarraydef(p).highrange=1) and
+                    (tfloatdef(tarraydef(p).elementdef).floattype=s32real)
+                   )
+                  ) or
+
+                  (
+                   (tarraydef(p).elementdef.typ=orddef) and
+                   (
+                    (tarraydef(p).lowrange=0) and
+                    (tarraydef(p).highrange=1) and
+                    (torddef(tarraydef(p).elementdef).ordtype in [s32bit,u32bit])
+                   )
+                  )  or
+
+                  (
+                   (tarraydef(p).elementdef.typ=orddef) and
+                   (
+                    (tarraydef(p).lowrange=0) and
+                    (tarraydef(p).highrange=3) and
+                    (torddef(tarraydef(p).elementdef).ordtype in [s16bit,u16bit])
+                   )
+                  ) or
+
+                  (
+                   (tarraydef(p).elementdef.typ=orddef) and
+                   (
+                    (tarraydef(p).lowrange=0) and
+                    (tarraydef(p).highrange=7) and
+                    (torddef(tarraydef(p).elementdef).ordtype in [s8bit,u8bit])
+                   )
+                  ) }
                  );
 {$else x86}
         result:=false;

+ 10 - 0
compiler/ncginl.pas

@@ -61,6 +61,7 @@ interface
           procedure second_popcnt; virtual;
           procedure second_seg; virtual; abstract;
           procedure second_fma; virtual;
+          procedure second_sse; virtual;
           procedure second_arm; virtual;
        end;
 
@@ -198,6 +199,8 @@ implementation
             in_fma_float128:
                second_fma;
 
+            in_x86_first..in_x86_last:
+               second_sse;
             in_arm_first..in_arm_last:
                second_arm;
             else internalerror(9);
@@ -784,6 +787,13 @@ implementation
         internalerror(2014032701);
       end;
 
+
+    procedure tcginlinenode.second_sse;
+      begin
+        internalerror(2015062001);
+      end;
+
+
     procedure tcginlinenode.second_arm;
       begin
         internalerror(2015061701);

+ 10 - 0
compiler/ncgld.pas

@@ -872,6 +872,16 @@ implementation
                 begin
                   if left.resultdef.typ=arraydef then
                     begin
+                      case left.location.loc of
+                        LOC_CMMREGISTER,
+                        LOC_MMREGISTER:
+                          hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.register,nil);
+                        LOC_REFERENCE,
+                        LOC_CREFERENCE:
+                          hlcg.a_loadmm_reg_ref(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.reference,nil);
+                        else
+                          internalerror(2009112601);
+                      end;
                     end
                   else
                     begin

+ 5 - 1
compiler/ncgutil.pas

@@ -1580,7 +1580,11 @@ implementation
               localvarsym :
                 begin
                   vs:=tabstractnormalvarsym(sym);
-                  vs.initialloc.size:=def_cgsize(vs.vardef);
+                  if is_vector(vs.vardef) and
+                     fits_in_mm_register(vs.vardef) then
+                    vs.initialloc.size:=def_cgmmsize(vs.vardef)
+                  else
+                    vs.initialloc.size:=def_cgsize(vs.vardef);
                   if ([po_assembler,po_nostackframe] * pd.procoptions = [po_assembler,po_nostackframe]) and
                      (vo_is_funcret in vs.varoptions) then
                     begin

+ 19 - 2
compiler/ninl.pas

@@ -30,6 +30,7 @@ interface
 
     {$i compinnr.inc}
     {$i arm/arminnr.inc}
+    {$i x86/x86innr.inc}
 
     type
        tinlinenode = class(tunarynode)
@@ -87,6 +88,7 @@ interface
           function first_seg: tnode; virtual;
           function first_sar: tnode; virtual;
           function first_fma : tnode; virtual;
+          function first_sse : tnode; virtual;
           function first_arm : tnode; virtual;
         private
           function handle_str: tnode;
@@ -2584,8 +2586,11 @@ implementation
         var
           p: tnode;
         begin
-          if count=1 then
-            set_varstate(left,vs_read,[vsf_must_be_valid])
+          if (count=1) and
+             (not (left is tcallparanode)) then
+            begin
+              set_varstate(left,vs_read,[vsf_must_be_valid])
+            end
           else
             begin
               p:=left;
@@ -3294,6 +3299,9 @@ implementation
                   resultdef:=tcallparanode(left).left.resultdef;
                 end;
 
+{$ifdef x86}
+              {$i x86type.inc}
+{$endif x86}
 {$ifdef ARM}
               {$i armtype.inc}
 {$endif ARM}
@@ -3621,6 +3629,8 @@ implementation
          in_fma_extended,
          in_fma_float128:
            result:=first_fma;
+         in_x86_first..in_x86_last:
+           result:=first_sse;
          in_arm_first..in_arm_last:
            result:=first_arm;
          else
@@ -4335,6 +4345,13 @@ implementation
        end;
 
 
+     function tinlinenode.first_sse: tnode;
+       begin
+         CGMessage1(cg_e_function_not_support_by_selected_instruction_set,'SSE');
+         result:=nil;
+       end;
+
+
      function tinlinenode.first_arm: tnode;
        begin
          CGMessage1(cg_e_function_not_support_by_selected_instruction_set,'ARM');

+ 1 - 1
compiler/pexpr.pas

@@ -255,7 +255,7 @@ implementation
        end;
 
 
-     function statement_syssym(l : byte) : tnode;
+     function statement_syssym(l : word) : tnode;
       var
         p1,p2,paras  : tnode;
         err,

+ 36 - 0
compiler/psystem.pas

@@ -316,6 +316,21 @@ implementation
         wordfarpointertype:=tcpupointerdefclass(cpointerdef).createx86(u16inttype,x86pt_far);
         longintfarpointertype:=tcpupointerdefclass(cpointerdef).createx86(s32inttype,x86pt_far);
   {$endif i8086}
+        x86_m64type:=carraydef.create(0,1,s32inttype);
+        x86_m128type:=carraydef.create(0,3,s32inttype);
+        x86_m128dtype:=carraydef.create(0,1,s32inttype);
+        x86_m128itype:=carraydef.create(0,3,s32inttype);
+        x86_m256type:=carraydef.create(0,7,s32inttype);
+        x86_m256dtype:=carraydef.create(0,3,s32inttype);
+        x86_m256itype:=carraydef.create(0,7,s32inttype);
+
+        tarraydef(x86_m64type).elementdef:=s32floattype;
+        tarraydef(x86_m128type).elementdef:=s32floattype;
+        tarraydef(x86_m128dtype).elementdef:=s64floattype;
+        tarraydef(x86_m128itype).elementdef:=s32floattype;
+        tarraydef(x86_m256type).elementdef:=s32floattype;
+        tarraydef(x86_m256dtype).elementdef:=s64floattype;
+        tarraydef(x86_m256itype).elementdef:=s32floattype;
 {$endif x86}
         set_default_ptr_types;
         cfiletype:=cfiledef.createuntyped;
@@ -375,6 +390,13 @@ implementation
         addtype('FarPointer',voidfarpointertype);
         addtype('HugePointer',voidhugepointertype);
   {$endif i8086}
+        addtype('__m64',x86_m64type);
+        addtype('__m128', x86_m128type);
+        addtype('__m128d',x86_m128dtype);
+        addtype('__m128i',x86_m128itype);
+        addtype('__m256', x86_m256type);
+        addtype('__m256d',x86_m256dtype);
+        addtype('__m256i',x86_m256itype);
 {$endif x86}
         addtype('ShortString',cshortstringtype);
 {$ifdef support_longstring}
@@ -458,6 +480,13 @@ implementation
         addtype('$word_farpointer',wordfarpointertype);
         addtype('$longint_farpointer',longintfarpointertype);
   {$endif i8086}
+        addtype('$__m64',  x86_m64type);
+        addtype('$__m128', x86_m128type);
+        addtype('$__m128d',x86_m128dtype);
+        addtype('$__m128i',x86_m128itype);
+        addtype('$__m256', x86_m256type);
+        addtype('$__m256d',x86_m256dtype);
+        addtype('$__m256i',x86_m256itype);
 {$endif x86}
         addtype('$openchararray',openchararraytype);
         addtype('$file',cfiletype);
@@ -596,6 +625,13 @@ implementation
         loadtype('word_farpointer',wordfarpointertype);
         loadtype('longint_farpointer',longintfarpointertype);
   {$endif i8086}
+        loadtype('__m64',  x86_m64type);
+        loadtype('__m128', x86_m128type);
+        loadtype('__m128d',x86_m128dtype);
+        loadtype('__m128i',x86_m128itype);
+        loadtype('__m256', x86_m256type);
+        loadtype('__m256d',x86_m256dtype);
+        loadtype('__m256i',x86_m256itype);
 {$endif x86}
         loadtype('file',cfiletype);
         if not(target_info.system in systems_managed_vm) then

+ 9 - 0
compiler/symdef.pas

@@ -1077,6 +1077,15 @@ interface
        { FPC java procvar base class }
        java_procvarbase          : tobjectdef;
 
+       { x86 vector types }
+       x86_m64type,
+       x86_m128type,
+       x86_m128dtype,
+       x86_m128itype,
+       x86_m256type,
+       x86_m256dtype,
+       x86_m256itype             : tdef;
+
 
     function make_mangledname(const typeprefix:TSymStr;st:TSymtable;const suffix:TSymStr):TSymStr;
     function make_dllmangledname(const dllname,importname:TSymStr;

+ 6 - 1
compiler/symsym.pas

@@ -1770,7 +1770,12 @@ implementation
                      varregable:=vr_mmreg
                    else
                      varregable:=vr_fpureg;
-                 end;
+                 end
+            else if is_vector(vardef) and
+                    fits_in_mm_register(vardef) then
+              begin
+                varregable:=vr_mmreg;
+              end;
           end;
       end;
 

+ 538 - 0
compiler/utils/mkx86inl.pp

@@ -0,0 +1,538 @@
+program mkx86inl;
+
+{$mode objfpc}
+{$H+}
+
+uses
+  sysutils, classes,
+  strutils;
+
+type
+  TOperDirection = (operIn, operVar, operOut);
+
+  TOperand = record
+    name,
+    typ: string;
+    direction: TOperDirection;
+  end;
+
+const
+  DirLUT: array[TOperDirection] of string = ('','var ','out ');
+
+function GetPascalType(const ATyp: string): string;
+  begin
+    case ATyp of
+      'r32':   exit('longword');
+      'rs32':  exit('longint');
+      'r64':   exit('qword');
+      'rs64':  exit('int64');
+      'f32':   exit('single');
+      'mm':    exit('__m64');
+      'xmm':   exit('__m128');
+      'i32':   exit('longint');
+      'ptr32': exit('pointer');
+    else
+      exit(ATyp);
+    end;
+  end;
+
+function GetTypeDef(const ATyp: string): string;
+  begin
+    case ATyp of
+      'r32':   exit('u32inttype');
+      'rs32':  exit('s32inttype');
+      'r64':   exit('u64inttype');
+      'rs64':  exit('s64inttype');
+      'f32':   exit('s32floattype');
+      'mm':    exit('x86_m64type');
+      'xmm':   exit('x86_m128type');
+      'i32':   exit('s32inttype');
+      'ptr32': exit('voidpointertype');
+    else
+      exit(ATyp);
+    end;
+  end;
+
+function GetOper(const ATyp: string): string;
+  begin
+    case ATyp of
+      'r32':   exit('_reg');
+      'rs32':  exit('_reg');
+      'r64':   exit('_reg_reg');
+      'rs64':  exit('_reg_reg');
+      'f32':   exit('_reg');
+      'mm':    exit('_reg');
+      'xmm':   exit('_reg');
+      'i32':   exit('_const');
+      'ptr32': exit('_ref');
+    else
+      exit('');
+    end;
+  end;
+
+function GetOperand(const ATyp: string; AIndex: longint): string;
+  begin
+    case ATyp of
+      'r32':   exit(format(',paraarray[%d].location.register', [AIndex]));
+      'rs32':  exit(format(',paraarray[%d].location.register', [AIndex]));
+      'r64':   exit(format(',paraarray[%d].location.register64.reglo,paraarray[%d].location.register64.reghi', [AIndex,AIndex]));
+      'rs64':  exit(format(',paraarray[%d].location.register64.reglo,paraarray[%d].location.register64.reghi', [AIndex,AIndex]));
+      'f32':   exit(format(',paraarray[%d].location.register', [AIndex]));
+      'mm':    exit(format(',paraarray[%d].location.register', [AIndex]));
+      'xmm':   exit(format(',paraarray[%d].location.register', [AIndex]));
+      'i32':   exit(format(',GetConstInt(paraarray[%d])',[AIndex]));
+      'ptr32': exit(format(',paraarray[%d].location.reference', [AIndex]));
+    else
+      exit(ATyp);
+    end;
+  end;
+
+function GetOperandLoc(const ATyp: string): string;
+  begin
+    result:='';
+    case ATyp of
+      'r32':  exit(',location.register');
+      'rs32': exit(',location.register');
+      'r64':  exit(',location.register64.reglo,location.register64.reghi');
+      'rs64': exit(',location.register64.reglo,location.register64.reghi');
+      'f32':  exit(',location.register');
+      'mm':   exit(',location.register');
+      'xmm':  exit(',location.register');
+    end;
+  end;
+
+function GetLocStatement(AIndex: longint; const ATyp: string; AConst: boolean): string;
+  begin
+    result:='';
+    case ATyp of
+      'r32':   exit(format('hlcg.location_force_reg(current_asmdata.CurrAsmList, paraarray[%d].location, paraarray[%d].resultdef,u32inttype,%s);', [AIndex+1, AIndex+1, BoolToStr(aconst,'true','false')]));
+      'rs32':  exit(format('hlcg.location_force_reg(current_asmdata.CurrAsmList, paraarray[%d].location, paraarray[%d].resultdef,u32inttype,%s);', [AIndex+1, AIndex+1, BoolToStr(aconst,'true','false')]));
+      'r64':   exit(format('hlcg.location_force_reg(current_asmdata.CurrAsmList, paraarray[%d].location, paraarray[%d].resultdef,u64inttype,%s);', [AIndex+1, AIndex+1, BoolToStr(aconst,'true','false')]));
+      'rs64':  exit(format('hlcg.location_force_reg(current_asmdata.CurrAsmList, paraarray[%d].location, paraarray[%d].resultdef,u64inttype,%s);', [AIndex+1, AIndex+1, BoolToStr(aconst,'true','false')]));
+      'f32':   exit(format('location_force_mmreg(current_asmdata.CurrAsmList, paraarray[%d].location, %s);', [AIndex+1, BoolToStr(aconst,'true','false')]));
+      'mm':    exit(format('location_force_mmxreg(current_asmdata.CurrAsmList, paraarray[%d].location, %s);', [AIndex+1, BoolToStr(aconst,'true','false')]));
+      'xmm':   exit(format('location_force_mmreg(current_asmdata.CurrAsmList, paraarray[%d].location, %s);', [AIndex+1, BoolToStr(aconst,'true','false')]));
+      'ptr32': exit(format('location_make_ref(paraarray[%d].location);', [AIndex+1]));
+    end;
+  end;
+
+function GetLoc(const ATyp: string; AWithSize: boolean = true): string;
+  begin
+    result:='';
+    if AWithSize then
+      case ATyp of
+        'r32':   exit('LOC_REGISTER,OS_32');
+        'rs32':  exit('LOC_REGISTER,OS_S32');
+        'r64':   exit('LOC_REGISTER,OS_64');
+        'rs64':  exit('LOC_REGISTER,OS_S64');
+        'f32':   exit('LOC_MMREGISTER,OS_M128');
+        'mm':    exit('LOC_MMXREGISTER,OS_M64');
+        'xmm':   exit('LOC_MMREGISTER,OS_M128');
+        'ptr32': exit('LOC_MEM,OS_32');
+      end
+    else
+      case ATyp of
+        'r32':   exit('LOC_REGISTER');
+        'rs32':  exit('LOC_REGISTER');
+        'r64':   exit('LOC_REGISTER');
+        'rs64':  exit('LOC_REGISTER');
+        'f32':   exit('LOC_MMREGISTER');
+        'mm':    exit('LOC_MMXREGISTER');
+        'xmm':   exit('LOC_MMREGISTER');
+        'ptr32': exit('LOC_MEM');
+      end;
+  end;
+
+function GetLocAllocation(const ATyp: string): string;
+  begin
+    result:='';
+    case ATyp of
+      'r32':  exit('location.register:=cg.getintregister(current_asmdata.CurrAsmList, OS_32);');
+      'rs32': exit('location.register:=cg.getintregister(current_asmdata.CurrAsmList, OS_32);');
+      'r64':  exit('location.register64.reglo:=cg.getintregister(current_asmdata.CurrAsmList, OS_32); location.register64.reghi:=cg.getintregister(current_asmdata.CurrAsmList, OS_32);');
+      'rs64': exit('location.register64.reglo:=cg.getintregister(current_asmdata.CurrAsmList, OS_32); location.register64.reghi:=cg.getintregister(current_asmdata.CurrAsmList, OS_32);');
+      'f32':  exit('location.register:=cg.getmmregister(current_asmdata.CurrAsmList, OS_M128);');
+      'mm':   exit('location.register:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);');
+      'xmm':  exit('location.register:=cg.getmmregister(current_asmdata.CurrAsmList, OS_M128);');
+    end;
+  end;
+
+function GetPostFix(const APF: string): string;
+  begin
+    if APF<>'' then
+      result:='PF_'+APF
+    else
+      result:='PF_None';
+  end;
+
+procedure ParseList(const APrefix, AFilename: string);
+  var
+    f: TextFile;
+
+    fprocs,
+    finnr: TextFile;
+    ftypechk, ffirst, fsecond: TStringList;
+
+    str,
+    instrPart,postfix,_alias,
+    params, operline: String;
+
+    opers: array[0..7] of TOperand;
+    opercnt: longint;
+
+    hasOutput: boolean;
+    outputType: string;
+    cnt,
+    i, intrnum: longint;
+    tmp: String;
+
+  function ParseOperands(AIndex: longint = -1): string;
+    var
+      idx: LongInt;
+      pt: Integer;
+      c: Char;
+    begin
+      idx:=opercnt;
+
+      params:=trim(params);
+      if params='' then
+        exit('');
+
+      inc(opercnt);
+
+      if pos('var ', params)=1 then
+        begin
+          opers[idx].direction:=operVar;
+          Delete(params,1,4);
+          params:=trim(params);
+          hasOutput:=true;
+        end
+      else if pos('out ', params)=1 then
+        begin
+          opers[idx].direction:=operOut;
+          Delete(params,1,4);
+          params:=trim(params);
+          hasOutput:=true;
+        end
+      else
+        begin
+          if AIndex<>-1 then
+            opers[idx].direction:=opers[AIndex].direction
+          else
+            opers[idx].direction:=operIn;
+        end;
+
+          pt:=PosSet([',',':'], params);
+
+      c:=params[pt];
+      opers[idx].name:=Copy2SymbDel(params, c);
+      params:=trim(params);
+
+      if c = ':' then
+        begin
+          opers[idx].typ:=Copy2SymbDel(params, ';');
+          result:=opers[idx].typ;
+        end
+      else
+        begin
+          opers[idx].typ:=ParseOperands(idx);
+          result:=opers[idx].typ;
+        end;
+
+      if opers[idx].direction<>operIn then
+        outputType:=opers[idx].typ;
+    end;
+
+  function GetOperLine: string;
+    var
+      i: longint;
+    begin
+      result:='';
+      for i := 0 to opercnt-1 do
+        result:=result+DirLUT[opers[i].direction]+opers[i].name+':'+opers[i].typ+';';
+    end;
+
+  function GetParams: longint;
+    var
+      i: longint;
+    begin
+      result:=0;
+      for i := 0 to opercnt-1 do
+        if opers[i].direction in [operIn,operVar] then
+          inc(result);
+    end;
+
+  function FindOperIdx(const AOper: string): longint;
+    var
+      i,cnt: longint;
+    begin
+      cnt:=0;
+      result:=0;
+      for i := 0 to opercnt-1 do
+        if (opers[i].direction in [operIn,operVar]) then
+          begin
+            if opers[i].name=AOper then
+              exit(cnt);
+            inc(cnt);
+          end;
+    end;
+
+  begin
+    intrnum:=0;
+
+    assignfile(f, AFilename);
+    reset(f);
+
+    assignfile(fprocs, APrefix+'procs.inc'); rewrite(fprocs);
+    assignfile(finnr, APrefix+'innr.inc'); rewrite(finnr);
+
+    writeln(finnr,'const');
+
+    ftypechk:=TStringList.Create;
+    ffirst:=TStringList.Create;
+    fsecond:=TStringList.Create;
+
+    writeln(finnr, '  in_', APrefix,'_first = in_',APrefix,'_base;');
+
+    while not EOF(f) do
+      begin
+        readln(f, str);
+
+        str:=trim(str);
+
+        if (str='') or (Pos(';',str)=1) then
+          continue;
+
+        instrPart:=Copy2SymbDel(str, '(');
+
+        // Check for postfix
+        if pos('{',instrPart)>0 then
+          begin
+            postfix:=instrPart;
+            instrPart:=Copy2SymbDel(postfix, '{');
+            postfix:=TrimRightSet(postfix,['}']);
+          end
+        else
+          postfix:='';
+
+        // Check for alias
+        if pos('[',instrPart)>0 then
+          begin
+            _alias:=instrPart;
+            instrPart:=Copy2SymbDel(_alias, '[');
+            _alias:='_'+TrimRightSet(_alias,[']']);
+          end
+        else
+          _alias:='';
+
+        // Get parameters
+        params:=trim(Copy2SymbDel(str,')'));
+        str:=trim(str);
+
+        hasOutput:=false;
+        opercnt:=0;
+        outputType:='';
+
+        while params<>'' do
+          ParseOperands;
+
+        operline:=GetOperLine;
+        // Write typecheck code
+        i:=ftypechk.IndexOf(': //'+operline);
+        if i>=0 then
+          ftypechk.Insert(i,',in_'+APrefix+'_'+instrPart+postfix+_alias)
+        else
+          begin
+            ftypechk.Add('in_'+APrefix+'_'+instrPart+postfix+_alias);
+            ftypechk.Add(': //'+operline);
+            ftypechk.Add('  begin');
+            ftypechk.Add('    CheckParameters('+inttostr(GetParams())+');');
+            if hasOutput then
+              ftypechk.Add('    resultdef:='+GetTypeDef(outputType)+';')
+            else
+              ftypechk.Add('    resultdef:=voidtype;');
+            ftypechk.Add('  end;')
+          end;
+
+        // Write firstpass code
+        i:=ffirst.IndexOf(': //'+operline);
+        if i>=0 then
+          ffirst.Insert(i,',in_'+APrefix+'_'+instrPart+postfix+_alias)
+        else
+          begin
+            ffirst.Add('in_'+APrefix+'_'+instrPart+postfix+_alias);
+            ffirst.Add(': //'+operline);
+            ffirst.Add('  begin');
+            if hasOutput then
+              ffirst.Add('    expectloc:='+GetLoc(outputType,false)+';')
+            else
+              ffirst.Add('    expectloc:=LOC_VOID;');
+            ffirst.Add('    result:=nil;');
+            ffirst.Add('  end;')
+          end;
+
+        // Write secondpass code
+        i:=fsecond.IndexOf(': //'+operline);
+        if i>=0 then
+          begin
+            fsecond.Insert(i+3,'      in_'+APrefix+'_'+instrPart+postfix+_alias+': begin op:=A_'+instrPart+' end;');
+            fsecond.Insert(i,',in_'+APrefix+'_'+instrPart+postfix+_alias);
+          end
+        else
+          begin
+            fsecond.Add('in_'+APrefix+'_'+instrPart+postfix+_alias);
+            fsecond.Add(': //'+operline);
+            fsecond.Add('  begin');
+            fsecond.add('    case inlinenumber of');
+            fsecond.Add('      in_'+APrefix+'_'+instrPart+postfix+_alias+': begin op:=A_'+instrPart+'; end;');
+            fsecond.add('    end;');
+            fsecond.Add('');
+
+            i:=GetParams;
+            fsecond.Add('    GetParameters('+inttostr(i)+');');
+            fsecond.Add('');
+
+            fsecond.add('    for i := 1 to '+inttostr(i)+' do secondpass(paraarray[i]);');
+            fsecond.Add('');
+
+            // Force inputs
+            cnt:=0;
+            for i := 0 to opercnt-1 do
+              begin
+                case opers[i].direction of
+                  operIn:
+                    begin
+                      tmp:=GetLocStatement(cnt, opers[i].typ, true);
+                      if tmp<>'' then
+                        fsecond.add('    '+tmp);
+                      inc(cnt);
+                    end;
+                  operVar:
+                    begin
+                      tmp:=GetLocStatement(cnt, opers[i].typ, false);
+                      if tmp<>'' then
+                        fsecond.add('    '+tmp);
+                      inc(cnt);
+                    end;
+                end;
+              end;
+
+            // Allocate output
+            cnt:=0;
+            for i := 0 to opercnt-1 do
+              begin
+                case opers[i].direction of
+                  operOut:
+                    begin
+                      fsecond.add('    location_reset(location,'+GetLoc(opers[i].typ)+');');
+                      fsecond.Add('    '+GetLocAllocation(opers[i].typ));
+                    end;
+                  operVar:
+                    begin
+                      fsecond.Add('    location:=paraarray['+inttostr(cnt+1)+'].location;');
+                      inc(cnt);
+                    end;
+                  operIn:
+                    inc(cnt);
+                end;
+              end;
+
+            operline:='taicpu.op';
+            //for i := 0 to opercnt-1 do
+            for i := opercnt-1 downto 0 do
+              begin
+                case opers[i].direction of
+                  operOut:
+                    operline:=operline+GetOper(opers[i].typ);
+                  operVar:
+                    operline:=operline+GetOper(opers[i].typ);
+                  operIn:
+                    operline:=operline+GetOper(opers[i].typ);
+                end;
+              end;
+
+            if operline='taicpu.op' then
+              operline:='taicpu.op_none(op,S_NO'
+            else
+              operline:=operline+'(op,S_NO';
+
+            //for i := 0 to opercnt-1 do
+            for i := opercnt-1 downto 0 do
+              begin
+                case opers[i].direction of
+                  operOut:
+                    operline:=operline+GetOperandLoc(opers[i].typ);
+                  operIn,
+                  operVar:
+                    begin
+                      dec(cnt);
+                      operline:=operline+GetOperand(opers[i].typ, cnt+1);
+                    end;
+                end;
+              end;
+
+            operline:=operline+')';
+
+            fsecond.Add('    current_asmdata.CurrAsmList.concat('+operline+');');
+
+            fsecond.Add('  end;')
+          end;
+
+        // Write innr
+        writeln(finnr, '  in_', APrefix,'_',instrPart,postfix+_alias,' = in_',APrefix,'_base+',intrnum,';');
+
+        // Write function
+        if hasOutput then write(fprocs,'function ') else write(fprocs,'procedure ');
+        write(fprocs,APrefix,'_',instrPart,postfix,'(');
+
+        cnt:=0;
+        for i:=0 to opercnt-1 do
+          begin
+            if opers[i].direction=operOut then
+              Continue;
+
+            if cnt>0 then
+              begin
+                if opers[i].typ<>opers[i-1].typ then
+                  write(fprocs,': ',GetPascalType(opers[i-1].typ),'; ')
+                else
+                  write(fprocs,', ');
+              end;
+
+            write(fprocs,opers[i].name);
+            if i=opercnt-1 then
+              write(fprocs,': ',GetPascalType(opers[i].typ));
+
+            inc(cnt);
+          end;
+
+        write(fprocs,')');
+
+        if hasOutput then write(fprocs,': ',GetPascalType(outputType));
+        writeln(fprocs,'; [INTERNPROC: in_',APrefix,'_',instrPart,postfix+_alias,'];');
+
+        // Str now contains conditionals
+
+        inc(intrnum);
+      end;
+
+    writeln(finnr, '  in_', APrefix,'_last = in_',APrefix,'_base+',intrnum-1,';');
+
+    ftypechk.SaveToFile(APrefix+'type.inc');
+    ffirst.SaveToFile(APrefix+'first.inc');
+    fsecond.SaveToFile(APrefix+'second.inc');
+
+    ftypechk.Free;
+    ffirst.Free;
+    fsecond.Free;
+
+    CloseFile(fprocs);
+    CloseFile(finnr);
+
+    closefile(f);
+  end;
+
+begin
+  ParseList('x86', 'x86intr.dat');
+end.
+

+ 12 - 2
compiler/x86/cgx86.pas

@@ -1237,6 +1237,14 @@ unit cgx86;
             else
               result:=A_MOVQ;
           end
+        else if (tcgsize2size[fromsize]=tcgsize2size[tosize]) and
+                (fromsize=OS_M128) then
+          begin
+            if UseAVX then
+              result:=A_VMOVDQU
+            else
+              result:=A_MOVDQU;
+          end
         else
           internalerror(2010060104);
         if result=A_NONE then
@@ -1260,6 +1268,8 @@ unit cgx86;
                   instr:=taicpu.op_reg_reg(A_MOVAPD,S_NO,reg1,reg2);
                 OS_M64:
                   instr:=taicpu.op_reg_reg(A_MOVQ,S_NO,reg1,reg2);
+                OS_M128:
+                  instr:=taicpu.op_reg_reg(A_MOVAPS,S_NO,reg1,reg2);
                 else
                   internalerror(2006091201);
               end
@@ -1317,7 +1327,7 @@ unit cgx86;
          make_simple_ref(list,tmpref);
          if shuffle=nil then
            begin
-             if fromsize=OS_M64 then
+             if fromsize in [OS_64,OS_S64,OS_M64] then
                list.concat(taicpu.op_ref_reg(A_MOVQ,S_NO,tmpref,reg))
              else
 {$ifdef x86_64}
@@ -1352,7 +1362,7 @@ unit cgx86;
          make_simple_ref(list,tmpref);
          if shuffle=nil then
            begin
-             if fromsize=OS_M64 then
+             if fromsize in [OS_64,OS_S64,OS_M64] then
                list.concat(taicpu.op_reg_ref(A_MOVQ,S_NO,reg,tmpref))
              else
 {$ifdef x86_64}

+ 99 - 0
compiler/x86/nx86inl.pas

@@ -46,6 +46,7 @@ interface
           function first_trunc_real: tnode; override;
           function first_popcnt: tnode; override;
           function first_fma: tnode; override;
+          function first_sse: tnode; override;
           { second pass override to generate these nodes }
           procedure second_IncludeExclude;override;
           procedure second_pi; override;
@@ -66,6 +67,7 @@ interface
 {$endif not i8086}
           procedure second_popcnt;override;
           procedure second_fma;override;
+          procedure second_sse;override;
        private
           procedure load_fpu_location(lnode: tnode);
        end;
@@ -264,6 +266,19 @@ implementation
        end;
 
 
+     function tx86inlinenode.first_sse : tnode;
+       begin
+{$ifndef i8086}
+         if ((cpu_capabilities[current_settings.cputype]*[CPUX86_HAS_SSEUNIT])<>[]) then
+           case inlinenumber of
+             {$i x86first.inc}
+           end
+         else
+{$endif i8086}
+           Result:=inherited first_fma;
+       end;
+
+
      procedure tx86inlinenode.second_pi;
        begin
          location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
@@ -918,4 +933,88 @@ implementation
            internalerror(2014032301);
       end;
 
+    procedure tx86inlinenode.second_sse;
+
+      var
+        paraarray : array[1..4] of tnode;
+        i : integer;
+        op: TAsmOp;
+
+      function GetConstInt(n: tnode): longint;
+        begin
+          if is_constintnode(n) then
+            result:=tordconstnode(n).value.svalue
+          else
+            Message(type_e_constant_expr_expected);
+        end;
+
+      procedure GetParameters(count: longint);
+        var
+          i: longint;
+          p: tnode;
+        begin
+          if (count=1) and
+             (not (left is tcallparanode)) then
+            paraarray[1]:=left
+          else
+            begin
+              p:=left;
+              for i := count downto 1 do
+                begin
+                  paraarray[i]:=tcallparanode(p).paravalue;
+                  p:=tcallparanode(p).nextpara;
+                end;
+            end;
+        end;
+
+      procedure location_force_mmxreg(list:TAsmList;var l: tlocation;maybeconst:boolean);
+        var
+          reg : tregister;
+        begin
+          if (l.loc<>LOC_MMXREGISTER)  and
+             ((l.loc<>LOC_CMMXREGISTER) or (not maybeconst)) then
+            begin
+              reg:=tcgx86(cg).getmmxregister(list);
+              cg.a_loadmm_loc_reg(list,OS_M64,l,reg,nil);
+              location_freetemp(list,l);
+              location_reset(l,LOC_MMXREGISTER,OS_M64);
+              l.register:=reg;
+            end;
+        end;
+
+      procedure location_make_ref(var loc: tlocation);
+        var
+          hloc: tlocation;
+        begin
+          case loc.loc of
+            LOC_CREGISTER,
+            LOC_REGISTER:
+              begin
+                location_reset_ref(hloc, LOC_REFERENCE, OS_32, 1);
+                hloc.reference.base:=loc.register;
+
+                loc:=hloc;
+              end;
+            LOC_CREFERENCE,
+            LOC_REFERENCE:
+              begin
+              end;
+          else
+            begin
+              hlcg.location_force_reg(current_asmdata.CurrAsmList,loc,u32inttype,u32inttype,false);
+
+              location_reset_ref(hloc, LOC_REFERENCE, OS_32, 1);
+              hloc.reference.base:=loc.register;
+
+              loc:=hloc;
+            end;
+          end;
+        end;
+
+      begin
+        case inlinenumber of
+          {$i x86second.inc}
+        end;
+      end;
+
 end.

+ 157 - 0
compiler/x86/x86first.inc

@@ -0,0 +1,157 @@
+in_x86_movss
+,in_x86_movaps
+,in_x86_movups
+: //out r0:xmm;r1:ptr32;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;
+in_x86_movss_to_mem
+,in_x86_movaps_to_mem
+,in_x86_movups_to_mem
+: //r0:ptr32;r1:xmm;
+  begin
+    expectloc:=LOC_VOID;
+    result:=nil;
+  end;
+in_x86_movss_to_val
+: //out r0:f32;r1:xmm;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;
+in_x86_movss_from_val
+: //out r0:xmm;r1:f32;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;
+in_x86_movlps
+,in_x86_movhps
+: //var r0:xmm;r1:ptr32;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;
+in_x86_movlhps
+,in_x86_movhlps
+,in_x86_addss
+,in_x86_subss
+,in_x86_mulss
+,in_x86_divss
+,in_x86_rcpss
+,in_x86_sqrtss
+,in_x86_maxss
+,in_x86_minss
+,in_x86_rsqrtss
+,in_x86_addps
+,in_x86_subps
+,in_x86_mulps
+,in_x86_divps
+,in_x86_rcpps
+,in_x86_sqrtps
+,in_x86_maxps
+,in_x86_minps
+,in_x86_rsqrtps
+,in_x86_andps
+,in_x86_orps
+,in_x86_xorps
+,in_x86_andnps
+,in_x86_unpckhps
+,in_x86_unpcklps
+,in_x86_pmulhuw
+,in_x86_psadbw
+,in_x86_pavgb
+,in_x86_pavgw
+,in_x86_pmaxub
+,in_x86_pminub
+,in_x86_pmaxsw
+,in_x86_pminsw
+: //var r0:xmm;r1:xmm;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;
+in_x86_cmpss
+,in_x86_cmpps
+,in_x86_shufps
+: //var r0:xmm;r1:xmm;imm:i32;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;
+in_x86_cvtsi2ss
+: //var r0:xmm;r1:r32;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;
+in_x86_cvtss2si
+,in_x86_cvttss2si
+: //out r0:r32;r1:xmm;
+  begin
+    expectloc:=LOC_REGISTER;
+    result:=nil;
+  end;
+in_x86_cvtpi2ps
+: //var r0:xmm;r1:mm;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;
+in_x86_cvtps2pi
+,in_x86_cvttps2pi
+: //out r0:mm;r1:xmm;
+  begin
+    expectloc:=LOC_MMXREGISTER;
+    result:=nil;
+  end;
+in_x86_pmulhuw_mmx
+,in_x86_psadbw_mmx
+,in_x86_pavgb_mmx
+,in_x86_pavgw_mmx
+,in_x86_pmaxub_mmx
+,in_x86_pminub_mmx
+,in_x86_pmaxsw_mmx
+,in_x86_pminsw_mmx
+: //var r0:mm;r1:mm;
+  begin
+    expectloc:=LOC_MMXREGISTER;
+    result:=nil;
+  end;
+in_x86_pextrw_mmx
+: //out r0:r32;r1:mm;imm:i32;
+  begin
+    expectloc:=LOC_REGISTER;
+    result:=nil;
+  end;
+in_x86_pinsrw_mmx
+: //var r0:mm;r1:r32;imm:i32;
+  begin
+    expectloc:=LOC_MMXREGISTER;
+    result:=nil;
+  end;
+in_x86_pmovmskb
+: //out r0:r32;r1:mm;
+  begin
+    expectloc:=LOC_REGISTER;
+    result:=nil;
+  end;
+in_x86_pshufw
+: //out r0:mm;r1:mm;imm:i32;
+  begin
+    expectloc:=LOC_MMXREGISTER;
+    result:=nil;
+  end;
+in_x86_pextrw
+: //out r0:r32;r1:xmm;imm:i32;
+  begin
+    expectloc:=LOC_REGISTER;
+    result:=nil;
+  end;
+in_x86_pinsrw
+: //var r0:xmm;r1:r32;imm:i32;
+  begin
+    expectloc:=LOC_MMREGISTER;
+    result:=nil;
+  end;

+ 70 - 0
compiler/x86/x86innr.inc

@@ -0,0 +1,70 @@
+const
+  in_x86_first = in_x86_base;
+  in_x86_movss = in_x86_base+0;
+  in_x86_movaps = in_x86_base+1;
+  in_x86_movups = in_x86_base+2;
+  in_x86_movss_to_mem = in_x86_base+3;
+  in_x86_movaps_to_mem = in_x86_base+4;
+  in_x86_movups_to_mem = in_x86_base+5;
+  in_x86_movss_to_val = in_x86_base+6;
+  in_x86_movss_from_val = in_x86_base+7;
+  in_x86_movlps = in_x86_base+8;
+  in_x86_movhps = in_x86_base+9;
+  in_x86_movlhps = in_x86_base+10;
+  in_x86_movhlps = in_x86_base+11;
+  in_x86_addss = in_x86_base+12;
+  in_x86_subss = in_x86_base+13;
+  in_x86_mulss = in_x86_base+14;
+  in_x86_divss = in_x86_base+15;
+  in_x86_rcpss = in_x86_base+16;
+  in_x86_sqrtss = in_x86_base+17;
+  in_x86_maxss = in_x86_base+18;
+  in_x86_minss = in_x86_base+19;
+  in_x86_rsqrtss = in_x86_base+20;
+  in_x86_addps = in_x86_base+21;
+  in_x86_subps = in_x86_base+22;
+  in_x86_mulps = in_x86_base+23;
+  in_x86_divps = in_x86_base+24;
+  in_x86_rcpps = in_x86_base+25;
+  in_x86_sqrtps = in_x86_base+26;
+  in_x86_maxps = in_x86_base+27;
+  in_x86_minps = in_x86_base+28;
+  in_x86_rsqrtps = in_x86_base+29;
+  in_x86_andps = in_x86_base+30;
+  in_x86_orps = in_x86_base+31;
+  in_x86_xorps = in_x86_base+32;
+  in_x86_andnps = in_x86_base+33;
+  in_x86_cmpss = in_x86_base+34;
+  in_x86_cmpps = in_x86_base+35;
+  in_x86_shufps = in_x86_base+36;
+  in_x86_unpckhps = in_x86_base+37;
+  in_x86_unpcklps = in_x86_base+38;
+  in_x86_cvtsi2ss = in_x86_base+39;
+  in_x86_cvtss2si = in_x86_base+40;
+  in_x86_cvttss2si = in_x86_base+41;
+  in_x86_cvtpi2ps = in_x86_base+42;
+  in_x86_cvtps2pi = in_x86_base+43;
+  in_x86_cvttps2pi = in_x86_base+44;
+  in_x86_pmulhuw_mmx = in_x86_base+45;
+  in_x86_psadbw_mmx = in_x86_base+46;
+  in_x86_pavgb_mmx = in_x86_base+47;
+  in_x86_pavgw_mmx = in_x86_base+48;
+  in_x86_pmaxub_mmx = in_x86_base+49;
+  in_x86_pminub_mmx = in_x86_base+50;
+  in_x86_pmaxsw_mmx = in_x86_base+51;
+  in_x86_pminsw_mmx = in_x86_base+52;
+  in_x86_pextrw_mmx = in_x86_base+53;
+  in_x86_pinsrw_mmx = in_x86_base+54;
+  in_x86_pmovmskb = in_x86_base+55;
+  in_x86_pshufw = in_x86_base+56;
+  in_x86_pmulhuw = in_x86_base+57;
+  in_x86_psadbw = in_x86_base+58;
+  in_x86_pavgb = in_x86_base+59;
+  in_x86_pavgw = in_x86_base+60;
+  in_x86_pmaxub = in_x86_base+61;
+  in_x86_pminub = in_x86_base+62;
+  in_x86_pmaxsw = in_x86_base+63;
+  in_x86_pminsw = in_x86_base+64;
+  in_x86_pextrw = in_x86_base+65;
+  in_x86_pinsrw = in_x86_base+66;
+  in_x86_last = in_x86_base+66;

+ 81 - 0
compiler/x86/x86intr.dat

@@ -0,0 +1,81 @@
+movss(out r0: xmm; r1: ptr32)
+movaps(out r0: xmm; r1: ptr32)
+movups(out r0: xmm; r1: ptr32)
+
+movss[to_mem](r0: ptr32; r1: xmm)
+movaps[to_mem](r0: ptr32; r1: xmm)
+movups[to_mem](r0: ptr32; r1: xmm)
+
+movss[to_val](out r0: f32; r1: xmm)
+movss[from_val](out r0: xmm; r1: f32)
+
+movlps(var r0: xmm; r1: ptr32)
+movhps(var r0: xmm; r1: ptr32)
+
+movlhps(var r0: xmm; r1: xmm)
+movhlps(var r0: xmm; r1: xmm)
+
+addss(var r0: xmm; r1: xmm)
+subss(var r0: xmm; r1: xmm)
+mulss(var r0: xmm; r1: xmm)
+divss(var r0: xmm; r1: xmm)
+rcpss(var r0: xmm; r1: xmm)
+sqrtss(var r0: xmm; r1: xmm)
+maxss(var r0: xmm; r1: xmm)
+minss(var r0: xmm; r1: xmm)
+rsqrtss(var r0: xmm; r1: xmm)
+
+addps(var r0: xmm; r1: xmm)
+subps(var r0: xmm; r1: xmm)
+mulps(var r0: xmm; r1: xmm)
+divps(var r0: xmm; r1: xmm)
+rcpps(var r0: xmm; r1: xmm)
+sqrtps(var r0: xmm; r1: xmm)
+maxps(var r0: xmm; r1: xmm)
+minps(var r0: xmm; r1: xmm)
+rsqrtps(var r0: xmm; r1: xmm)
+
+andps(var r0: xmm; r1: xmm)
+orps(var r0: xmm; r1: xmm)
+xorps(var r0: xmm; r1: xmm)
+andnps(var r0: xmm; r1: xmm)
+
+cmpss(var r0: xmm; r1: xmm; imm: i32)             (imm in [0..7])
+cmpps(var r0: xmm; r1: xmm; imm: i32)             (imm in [0..7])
+
+shufps(var r0: xmm; r1: xmm; imm: i32)            (imm in [0..$ff])
+unpckhps(var r0: xmm; r1: xmm)
+unpcklps(var r0: xmm; r1: xmm)
+
+cvtsi2ss(var r0: xmm; r1: r32)
+cvtss2si(out r0: r32; r1: xmm)
+cvttss2si(out r0: r32; r1: xmm)
+
+cvtpi2ps(var r0: xmm; r1: mm)
+cvtps2pi(out r0: mm; r1: xmm)
+cvttps2pi(out r0: mm; r1: xmm)
+
+pmulhuw[mmx](var r0: mm; r1: mm)
+psadbw[mmx](var r0: mm; r1: mm)
+pavgb[mmx](var r0: mm; r1: mm)
+pavgw[mmx](var r0: mm; r1: mm)
+pmaxub[mmx](var r0: mm; r1: mm)
+pminub[mmx](var r0: mm; r1: mm)
+pmaxsw[mmx](var r0: mm; r1: mm)
+pminsw[mmx](var r0: mm; r1: mm)
+pextrw[mmx](out r0: r32; r1: mm; imm: i32)             (imm in [0..3])
+pinsrw[mmx](var r0: mm; r1: r32; imm: i32)             (imm in [0..3])
+
+pmovmskb(out r0: r32; r1: mm)
+pshufw(out r0: mm; r1: mm; imm: i32)                   (imm in [0..$ff])
+
+pmulhuw(var r0: xmm; r1: xmm)
+psadbw(var r0: xmm; r1: xmm)
+pavgb(var r0: xmm; r1: xmm)
+pavgw(var r0: xmm; r1: xmm)
+pmaxub(var r0: xmm; r1: xmm)
+pminub(var r0: xmm; r1: xmm)
+pmaxsw(var r0: xmm; r1: xmm)
+pminsw(var r0: xmm; r1: xmm)
+pextrw(out r0: r32; r1: xmm; imm: i32)            (imm in [0..7])
+pinsrw(var r0: xmm; r1: r32; imm: i32)            (imm in [0..7])         

+ 385 - 0
compiler/x86/x86second.inc

@@ -0,0 +1,385 @@
+in_x86_movss
+,in_x86_movaps
+,in_x86_movups
+: //out r0:xmm;r1:ptr32;
+  begin
+    case inlinenumber of
+      in_x86_movups: begin op:=A_movups end;
+      in_x86_movaps: begin op:=A_movaps end;
+      in_x86_movss: begin op:=A_movss; end;
+    end;
+
+    GetParameters(1);
+
+    for i := 1 to 1 do secondpass(paraarray[i]);
+
+    location_make_ref(paraarray[1].location);
+    location_reset(location,LOC_MMREGISTER,OS_M128);
+    location.register:=cg.getmmregister(current_asmdata.CurrAsmList, OS_M128);
+    current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,S_NO,paraarray[1].location.reference,location.register));
+  end;
+in_x86_movss_to_mem
+,in_x86_movaps_to_mem
+,in_x86_movups_to_mem
+: //r0:ptr32;r1:xmm;
+  begin
+    case inlinenumber of
+      in_x86_movups_to_mem: begin op:=A_movups end;
+      in_x86_movaps_to_mem: begin op:=A_movaps end;
+      in_x86_movss_to_mem: begin op:=A_movss; end;
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_make_ref(paraarray[1].location);
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[2].location, true);
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_ref(op,S_NO,paraarray[2].location.register,paraarray[1].location.reference));
+  end;
+in_x86_movss_to_val
+: //out r0:f32;r1:xmm;
+  begin
+    case inlinenumber of
+      in_x86_movss_to_val: begin op:=A_movss; end;
+    end;
+
+    GetParameters(1);
+
+    for i := 1 to 1 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, true);
+    location_reset(location,LOC_MMREGISTER,OS_M128);
+    location.register:=cg.getmmregister(current_asmdata.CurrAsmList, OS_M128);
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[1].location.register,location.register));
+  end;
+in_x86_movss_from_val
+: //out r0:xmm;r1:f32;
+  begin
+    case inlinenumber of
+      in_x86_movss_from_val: begin op:=A_movss; end;
+    end;
+
+    GetParameters(1);
+
+    for i := 1 to 1 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, true);
+    location_reset(location,LOC_MMREGISTER,OS_M128);
+    location.register:=cg.getmmregister(current_asmdata.CurrAsmList, OS_M128);
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[1].location.register,location.register));
+  end;
+in_x86_movlps
+,in_x86_movhps
+: //var r0:xmm;r1:ptr32;
+  begin
+    case inlinenumber of
+      in_x86_movhps: begin op:=A_movhps end;
+      in_x86_movlps: begin op:=A_movlps; end;
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, false);
+    location_make_ref(paraarray[2].location);
+    location:=paraarray[1].location;
+    current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,S_NO,paraarray[2].location.reference,paraarray[1].location.register));
+  end;
+in_x86_movlhps
+,in_x86_movhlps
+,in_x86_addss
+,in_x86_subss
+,in_x86_mulss
+,in_x86_divss
+,in_x86_rcpss
+,in_x86_sqrtss
+,in_x86_maxss
+,in_x86_minss
+,in_x86_rsqrtss
+,in_x86_addps
+,in_x86_subps
+,in_x86_mulps
+,in_x86_divps
+,in_x86_rcpps
+,in_x86_sqrtps
+,in_x86_maxps
+,in_x86_minps
+,in_x86_rsqrtps
+,in_x86_andps
+,in_x86_orps
+,in_x86_xorps
+,in_x86_andnps
+,in_x86_unpckhps
+,in_x86_unpcklps
+,in_x86_pmulhuw
+,in_x86_psadbw
+,in_x86_pavgb
+,in_x86_pavgw
+,in_x86_pmaxub
+,in_x86_pminub
+,in_x86_pmaxsw
+,in_x86_pminsw
+: //var r0:xmm;r1:xmm;
+  begin
+    case inlinenumber of
+      in_x86_pminsw: begin op:=A_pminsw end;
+      in_x86_pmaxsw: begin op:=A_pmaxsw end;
+      in_x86_pminub: begin op:=A_pminub end;
+      in_x86_pmaxub: begin op:=A_pmaxub end;
+      in_x86_pavgw: begin op:=A_pavgw end;
+      in_x86_pavgb: begin op:=A_pavgb end;
+      in_x86_psadbw: begin op:=A_psadbw end;
+      in_x86_pmulhuw: begin op:=A_pmulhuw end;
+      in_x86_unpcklps: begin op:=A_unpcklps end;
+      in_x86_unpckhps: begin op:=A_unpckhps end;
+      in_x86_andnps: begin op:=A_andnps end;
+      in_x86_xorps: begin op:=A_xorps end;
+      in_x86_orps: begin op:=A_orps end;
+      in_x86_andps: begin op:=A_andps end;
+      in_x86_rsqrtps: begin op:=A_rsqrtps end;
+      in_x86_minps: begin op:=A_minps end;
+      in_x86_maxps: begin op:=A_maxps end;
+      in_x86_sqrtps: begin op:=A_sqrtps end;
+      in_x86_rcpps: begin op:=A_rcpps end;
+      in_x86_divps: begin op:=A_divps end;
+      in_x86_mulps: begin op:=A_mulps end;
+      in_x86_subps: begin op:=A_subps end;
+      in_x86_addps: begin op:=A_addps end;
+      in_x86_rsqrtss: begin op:=A_rsqrtss end;
+      in_x86_minss: begin op:=A_minss end;
+      in_x86_maxss: begin op:=A_maxss end;
+      in_x86_sqrtss: begin op:=A_sqrtss end;
+      in_x86_rcpss: begin op:=A_rcpss end;
+      in_x86_divss: begin op:=A_divss end;
+      in_x86_mulss: begin op:=A_mulss end;
+      in_x86_subss: begin op:=A_subss end;
+      in_x86_addss: begin op:=A_addss end;
+      in_x86_movhlps: begin op:=A_movhlps end;
+      in_x86_movlhps: begin op:=A_movlhps; end;
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, false);
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[2].location, true);
+    location:=paraarray[1].location;
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[2].location.register,paraarray[1].location.register));
+  end;
+in_x86_cmpss
+,in_x86_cmpps
+,in_x86_shufps
+: //var r0:xmm;r1:xmm;imm:i32;
+  begin
+    case inlinenumber of
+      in_x86_shufps: begin op:=A_shufps end;
+      in_x86_cmpps: begin op:=A_cmpps end;
+      in_x86_cmpss: begin op:=A_cmpss; end;
+    end;
+
+    GetParameters(3);
+
+    for i := 1 to 3 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, false);
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[2].location, true);
+    location:=paraarray[1].location;
+    current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(op,S_NO,GetConstInt(paraarray[3]),paraarray[2].location.register,paraarray[1].location.register));
+  end;
+in_x86_cvtsi2ss
+: //var r0:xmm;r1:r32;
+  begin
+    case inlinenumber of
+      in_x86_cvtsi2ss: begin op:=A_cvtsi2ss; end;
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, false);
+    hlcg.location_force_reg(current_asmdata.CurrAsmList, paraarray[2].location, paraarray[2].resultdef,u32inttype,true);
+    location:=paraarray[1].location;
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[2].location.register,paraarray[1].location.register));
+  end;
+in_x86_cvtss2si
+,in_x86_cvttss2si
+: //out r0:r32;r1:xmm;
+  begin
+    case inlinenumber of
+      in_x86_cvttss2si: begin op:=A_cvttss2si end;
+      in_x86_cvtss2si: begin op:=A_cvtss2si; end;
+    end;
+
+    GetParameters(1);
+
+    for i := 1 to 1 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, true);
+    location_reset(location,LOC_REGISTER,OS_32);
+    location.register:=cg.getintregister(current_asmdata.CurrAsmList, OS_32);
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[1].location.register,location.register));
+  end;
+in_x86_cvtpi2ps
+: //var r0:xmm;r1:mm;
+  begin
+    case inlinenumber of
+      in_x86_cvtpi2ps: begin op:=A_cvtpi2ps; end;
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, false);
+    location_force_mmxreg(current_asmdata.CurrAsmList, paraarray[2].location, true);
+    location:=paraarray[1].location;
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[2].location.register,paraarray[1].location.register));
+  end;
+in_x86_cvtps2pi
+,in_x86_cvttps2pi
+: //out r0:mm;r1:xmm;
+  begin
+    case inlinenumber of
+      in_x86_cvttps2pi: begin op:=A_cvttps2pi end;
+      in_x86_cvtps2pi: begin op:=A_cvtps2pi; end;
+    end;
+
+    GetParameters(1);
+
+    for i := 1 to 1 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, true);
+    location_reset(location,LOC_MMXREGISTER,OS_M64);
+    location.register:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[1].location.register,location.register));
+  end;
+in_x86_pmulhuw_mmx
+,in_x86_psadbw_mmx
+,in_x86_pavgb_mmx
+,in_x86_pavgw_mmx
+,in_x86_pmaxub_mmx
+,in_x86_pminub_mmx
+,in_x86_pmaxsw_mmx
+,in_x86_pminsw_mmx
+: //var r0:mm;r1:mm;
+  begin
+    case inlinenumber of
+      in_x86_pminsw_mmx: begin op:=A_pminsw end;
+      in_x86_pmaxsw_mmx: begin op:=A_pmaxsw end;
+      in_x86_pminub_mmx: begin op:=A_pminub end;
+      in_x86_pmaxub_mmx: begin op:=A_pmaxub end;
+      in_x86_pavgw_mmx: begin op:=A_pavgw end;
+      in_x86_pavgb_mmx: begin op:=A_pavgb end;
+      in_x86_psadbw_mmx: begin op:=A_psadbw end;
+      in_x86_pmulhuw_mmx: begin op:=A_pmulhuw; end;
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_force_mmxreg(current_asmdata.CurrAsmList, paraarray[1].location, false);
+    location_force_mmxreg(current_asmdata.CurrAsmList, paraarray[2].location, true);
+    location:=paraarray[1].location;
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[2].location.register,paraarray[1].location.register));
+  end;
+in_x86_pextrw_mmx
+: //out r0:r32;r1:mm;imm:i32;
+  begin
+    case inlinenumber of
+      in_x86_pextrw_mmx: begin op:=A_pextrw; end;
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_force_mmxreg(current_asmdata.CurrAsmList, paraarray[1].location, true);
+    location_reset(location,LOC_REGISTER,OS_32);
+    location.register:=cg.getintregister(current_asmdata.CurrAsmList, OS_32);
+    current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(op,S_NO,GetConstInt(paraarray[2]),paraarray[1].location.register,location.register));
+  end;
+in_x86_pinsrw_mmx
+: //var r0:mm;r1:r32;imm:i32;
+  begin
+    case inlinenumber of
+      in_x86_pinsrw_mmx: begin op:=A_pinsrw; end;
+    end;
+
+    GetParameters(3);
+
+    for i := 1 to 3 do secondpass(paraarray[i]);
+
+    location_force_mmxreg(current_asmdata.CurrAsmList, paraarray[1].location, false);
+    hlcg.location_force_reg(current_asmdata.CurrAsmList, paraarray[2].location, paraarray[2].resultdef,u32inttype,true);
+    location:=paraarray[1].location;
+    current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(op,S_NO,GetConstInt(paraarray[3]),paraarray[2].location.register,paraarray[1].location.register));
+  end;
+in_x86_pmovmskb
+: //out r0:r32;r1:mm;
+  begin
+    case inlinenumber of
+      in_x86_pmovmskb: begin op:=A_pmovmskb; end;
+    end;
+
+    GetParameters(1);
+
+    for i := 1 to 1 do secondpass(paraarray[i]);
+
+    location_force_mmxreg(current_asmdata.CurrAsmList, paraarray[1].location, true);
+    location_reset(location,LOC_REGISTER,OS_32);
+    location.register:=cg.getintregister(current_asmdata.CurrAsmList, OS_32);
+    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,paraarray[1].location.register,location.register));
+  end;
+in_x86_pshufw
+: //out r0:mm;r1:mm;imm:i32;
+  begin
+    case inlinenumber of
+      in_x86_pshufw: begin op:=A_pshufw; end;
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_force_mmxreg(current_asmdata.CurrAsmList, paraarray[1].location, true);
+    location_reset(location,LOC_MMXREGISTER,OS_M64);
+    location.register:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
+    current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(op,S_NO,GetConstInt(paraarray[2]),paraarray[1].location.register,location.register));
+  end;
+in_x86_pextrw
+: //out r0:r32;r1:xmm;imm:i32;
+  begin
+    case inlinenumber of
+      in_x86_pextrw: begin op:=A_pextrw; end;
+    end;
+
+    GetParameters(2);
+
+    for i := 1 to 2 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, true);
+    location_reset(location,LOC_REGISTER,OS_32);
+    location.register:=cg.getintregister(current_asmdata.CurrAsmList, OS_32);
+    current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(op,S_NO,GetConstInt(paraarray[2]),paraarray[1].location.register,location.register));
+  end;
+in_x86_pinsrw
+: //var r0:xmm;r1:r32;imm:i32;
+  begin
+    case inlinenumber of
+      in_x86_pinsrw: begin op:=A_pinsrw; end;
+    end;
+
+    GetParameters(3);
+
+    for i := 1 to 3 do secondpass(paraarray[i]);
+
+    location_force_mmreg(current_asmdata.CurrAsmList, paraarray[1].location, false);
+    hlcg.location_force_reg(current_asmdata.CurrAsmList, paraarray[2].location, paraarray[2].resultdef,u32inttype,true);
+    location:=paraarray[1].location;
+    current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(op,S_NO,GetConstInt(paraarray[3]),paraarray[2].location.register,paraarray[1].location.register));
+  end;

+ 157 - 0
compiler/x86/x86type.inc

@@ -0,0 +1,157 @@
+in_x86_movss
+,in_x86_movaps
+,in_x86_movups
+: //out r0:xmm;r1:ptr32;
+  begin
+    CheckParameters(1);
+    resultdef:=x86_m128type;
+  end;
+in_x86_movss_to_mem
+,in_x86_movaps_to_mem
+,in_x86_movups_to_mem
+: //r0:ptr32;r1:xmm;
+  begin
+    CheckParameters(2);
+    resultdef:=voidtype;
+  end;
+in_x86_movss_to_val
+: //out r0:f32;r1:xmm;
+  begin
+    CheckParameters(1);
+    resultdef:=s32floattype;
+  end;
+in_x86_movss_from_val
+: //out r0:xmm;r1:f32;
+  begin
+    CheckParameters(1);
+    resultdef:=x86_m128type;
+  end;
+in_x86_movlps
+,in_x86_movhps
+: //var r0:xmm;r1:ptr32;
+  begin
+    CheckParameters(2);
+    resultdef:=x86_m128type;
+  end;
+in_x86_movlhps
+,in_x86_movhlps
+,in_x86_addss
+,in_x86_subss
+,in_x86_mulss
+,in_x86_divss
+,in_x86_rcpss
+,in_x86_sqrtss
+,in_x86_maxss
+,in_x86_minss
+,in_x86_rsqrtss
+,in_x86_addps
+,in_x86_subps
+,in_x86_mulps
+,in_x86_divps
+,in_x86_rcpps
+,in_x86_sqrtps
+,in_x86_maxps
+,in_x86_minps
+,in_x86_rsqrtps
+,in_x86_andps
+,in_x86_orps
+,in_x86_xorps
+,in_x86_andnps
+,in_x86_unpckhps
+,in_x86_unpcklps
+,in_x86_pmulhuw
+,in_x86_psadbw
+,in_x86_pavgb
+,in_x86_pavgw
+,in_x86_pmaxub
+,in_x86_pminub
+,in_x86_pmaxsw
+,in_x86_pminsw
+: //var r0:xmm;r1:xmm;
+  begin
+    CheckParameters(2);
+    resultdef:=x86_m128type;
+  end;
+in_x86_cmpss
+,in_x86_cmpps
+,in_x86_shufps
+: //var r0:xmm;r1:xmm;imm:i32;
+  begin
+    CheckParameters(3);
+    resultdef:=x86_m128type;
+  end;
+in_x86_cvtsi2ss
+: //var r0:xmm;r1:r32;
+  begin
+    CheckParameters(2);
+    resultdef:=x86_m128type;
+  end;
+in_x86_cvtss2si
+,in_x86_cvttss2si
+: //out r0:r32;r1:xmm;
+  begin
+    CheckParameters(1);
+    resultdef:=u32inttype;
+  end;
+in_x86_cvtpi2ps
+: //var r0:xmm;r1:mm;
+  begin
+    CheckParameters(2);
+    resultdef:=x86_m128type;
+  end;
+in_x86_cvtps2pi
+,in_x86_cvttps2pi
+: //out r0:mm;r1:xmm;
+  begin
+    CheckParameters(1);
+    resultdef:=x86_m64type;
+  end;
+in_x86_pmulhuw_mmx
+,in_x86_psadbw_mmx
+,in_x86_pavgb_mmx
+,in_x86_pavgw_mmx
+,in_x86_pmaxub_mmx
+,in_x86_pminub_mmx
+,in_x86_pmaxsw_mmx
+,in_x86_pminsw_mmx
+: //var r0:mm;r1:mm;
+  begin
+    CheckParameters(2);
+    resultdef:=x86_m64type;
+  end;
+in_x86_pextrw_mmx
+: //out r0:r32;r1:mm;imm:i32;
+  begin
+    CheckParameters(2);
+    resultdef:=u32inttype;
+  end;
+in_x86_pinsrw_mmx
+: //var r0:mm;r1:r32;imm:i32;
+  begin
+    CheckParameters(3);
+    resultdef:=x86_m64type;
+  end;
+in_x86_pmovmskb
+: //out r0:r32;r1:mm;
+  begin
+    CheckParameters(1);
+    resultdef:=u32inttype;
+  end;
+in_x86_pshufw
+: //out r0:mm;r1:mm;imm:i32;
+  begin
+    CheckParameters(2);
+    resultdef:=x86_m64type;
+  end;
+in_x86_pextrw
+: //out r0:r32;r1:xmm;imm:i32;
+  begin
+    CheckParameters(2);
+    resultdef:=u32inttype;
+  end;
+in_x86_pinsrw
+: //var r0:xmm;r1:r32;imm:i32;
+  begin
+    CheckParameters(3);
+    resultdef:=x86_m128type;
+  end;

+ 2 - 2
ide/compiler/Makefile

@@ -341,10 +341,10 @@ override NOCPUDEF=1
 override FPCOPT+= -d$(PPC_TARGET)
 override FPCOPT+=-dBrowserCol -dGDB
 ifeq ($(PPC_TARGET),i386)
-override FPCOPT+= -Fu$(COMPILERDIR)/x86
+override FPCOPT+= -Fu$(COMPILERDIR)/x86 -Fi$(COMPILERDIR)/x86
 endif
 ifeq ($(PPC_TARGET),x86_64)
-override FPCOPT+= -Fu$(COMPILERDIR)/x86 -dNOOPT
+override FPCOPT+= -Fu$(COMPILERDIR)/x86 -Fi$(COMPILERDIR)/x86 -dNOOPT
 endif
 ifeq ($(PPC_TARGET),powerpc)
 override FPCOPT+= -Fu$(COMPILERDIR)/ppcgen

+ 2 - 2
ide/compiler/Makefile.fpc

@@ -45,11 +45,11 @@ override FPCOPT+= -d$(PPC_TARGET)
 override FPCOPT+=-dBrowserCol -dGDB
 ifeq ($(PPC_TARGET),i386)
 # the x86 dir doesn't hurt for 1.0.x though we could leave it away
-override FPCOPT+= -Fu$(COMPILERDIR)/x86
+override FPCOPT+= -Fu$(COMPILERDIR)/x86 -Fi$(COMPILERDIR)/x86
 #-dSUPPORT_MMX
 endif
 ifeq ($(PPC_TARGET),x86_64)
-override FPCOPT+= -Fu$(COMPILERDIR)/x86 -dNOOPT
+override FPCOPT+= -Fu$(COMPILERDIR)/x86 -Fi$(COMPILERDIR)/x86 -dNOOPT
 endif
 ifeq ($(PPC_TARGET),powerpc)
 override FPCOPT+= -Fu$(COMPILERDIR)/ppcgen

+ 2 - 0
ide/fpmake.pp

@@ -194,6 +194,8 @@ begin
 
     if CompilerTarget in [x86_64, i386] then
       P.Options.Add('-Fu../compiler/x86');
+    if CompilerTarget in [x86_64, i386] then
+      P.Options.Add('-Fi../compiler/x86');
     if CompilerTarget in [powerpc, powerpc64] then
       P.Options.Add('-Fu../compiler/ppcgen');
     if CompilerTarget = x86_64 then

+ 70 - 0
rtl/i386/x86innr.inc

@@ -0,0 +1,70 @@
+const
+  in_x86_first = in_x86_base;
+  in_x86_movss = in_x86_base+0;
+  in_x86_movaps = in_x86_base+1;
+  in_x86_movups = in_x86_base+2;
+  in_x86_movss_to_mem = in_x86_base+3;
+  in_x86_movaps_to_mem = in_x86_base+4;
+  in_x86_movups_to_mem = in_x86_base+5;
+  in_x86_movss_to_val = in_x86_base+6;
+  in_x86_movss_from_val = in_x86_base+7;
+  in_x86_movlps = in_x86_base+8;
+  in_x86_movhps = in_x86_base+9;
+  in_x86_movlhps = in_x86_base+10;
+  in_x86_movhlps = in_x86_base+11;
+  in_x86_addss = in_x86_base+12;
+  in_x86_subss = in_x86_base+13;
+  in_x86_mulss = in_x86_base+14;
+  in_x86_divss = in_x86_base+15;
+  in_x86_rcpss = in_x86_base+16;
+  in_x86_sqrtss = in_x86_base+17;
+  in_x86_maxss = in_x86_base+18;
+  in_x86_minss = in_x86_base+19;
+  in_x86_rsqrtss = in_x86_base+20;
+  in_x86_addps = in_x86_base+21;
+  in_x86_subps = in_x86_base+22;
+  in_x86_mulps = in_x86_base+23;
+  in_x86_divps = in_x86_base+24;
+  in_x86_rcpps = in_x86_base+25;
+  in_x86_sqrtps = in_x86_base+26;
+  in_x86_maxps = in_x86_base+27;
+  in_x86_minps = in_x86_base+28;
+  in_x86_rsqrtps = in_x86_base+29;
+  in_x86_andps = in_x86_base+30;
+  in_x86_orps = in_x86_base+31;
+  in_x86_xorps = in_x86_base+32;
+  in_x86_andnps = in_x86_base+33;
+  in_x86_cmpss = in_x86_base+34;
+  in_x86_cmpps = in_x86_base+35;
+  in_x86_shufps = in_x86_base+36;
+  in_x86_unpckhps = in_x86_base+37;
+  in_x86_unpcklps = in_x86_base+38;
+  in_x86_cvtsi2ss = in_x86_base+39;
+  in_x86_cvtss2si = in_x86_base+40;
+  in_x86_cvttss2si = in_x86_base+41;
+  in_x86_cvtpi2ps = in_x86_base+42;
+  in_x86_cvtps2pi = in_x86_base+43;
+  in_x86_cvttps2pi = in_x86_base+44;
+  in_x86_pmulhuw_mmx = in_x86_base+45;
+  in_x86_psadbw_mmx = in_x86_base+46;
+  in_x86_pavgb_mmx = in_x86_base+47;
+  in_x86_pavgw_mmx = in_x86_base+48;
+  in_x86_pmaxub_mmx = in_x86_base+49;
+  in_x86_pminub_mmx = in_x86_base+50;
+  in_x86_pmaxsw_mmx = in_x86_base+51;
+  in_x86_pminsw_mmx = in_x86_base+52;
+  in_x86_pextrw_mmx = in_x86_base+53;
+  in_x86_pinsrw_mmx = in_x86_base+54;
+  in_x86_pmovmskb = in_x86_base+55;
+  in_x86_pshufw = in_x86_base+56;
+  in_x86_pmulhuw = in_x86_base+57;
+  in_x86_psadbw = in_x86_base+58;
+  in_x86_pavgb = in_x86_base+59;
+  in_x86_pavgw = in_x86_base+60;
+  in_x86_pmaxub = in_x86_base+61;
+  in_x86_pminub = in_x86_base+62;
+  in_x86_pmaxsw = in_x86_base+63;
+  in_x86_pminsw = in_x86_base+64;
+  in_x86_pextrw = in_x86_base+65;
+  in_x86_pinsrw = in_x86_base+66;
+  in_x86_last = in_x86_base+66;

+ 67 - 0
rtl/i386/x86procs.inc

@@ -0,0 +1,67 @@
+function x86_movss(r1: pointer): __m128; [INTERNPROC: in_x86_movss];
+function x86_movaps(r1: pointer): __m128; [INTERNPROC: in_x86_movaps];
+function x86_movups(r1: pointer): __m128; [INTERNPROC: in_x86_movups];
+procedure x86_movss(r0: pointer; r1: __m128); [INTERNPROC: in_x86_movss_to_mem];
+procedure x86_movaps(r0: pointer; r1: __m128); [INTERNPROC: in_x86_movaps_to_mem];
+procedure x86_movups(r0: pointer; r1: __m128); [INTERNPROC: in_x86_movups_to_mem];
+function x86_movss(r1: __m128): single; [INTERNPROC: in_x86_movss_to_val];
+function x86_movss(r1: single): __m128; [INTERNPROC: in_x86_movss_from_val];
+function x86_movlps(r0: __m128; r1: pointer): __m128; [INTERNPROC: in_x86_movlps];
+function x86_movhps(r0: __m128; r1: pointer): __m128; [INTERNPROC: in_x86_movhps];
+function x86_movlhps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_movlhps];
+function x86_movhlps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_movhlps];
+function x86_addss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_addss];
+function x86_subss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_subss];
+function x86_mulss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_mulss];
+function x86_divss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_divss];
+function x86_rcpss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_rcpss];
+function x86_sqrtss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_sqrtss];
+function x86_maxss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_maxss];
+function x86_minss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_minss];
+function x86_rsqrtss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_rsqrtss];
+function x86_addps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_addps];
+function x86_subps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_subps];
+function x86_mulps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_mulps];
+function x86_divps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_divps];
+function x86_rcpps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_rcpps];
+function x86_sqrtps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_sqrtps];
+function x86_maxps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_maxps];
+function x86_minps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_minps];
+function x86_rsqrtps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_rsqrtps];
+function x86_andps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_andps];
+function x86_orps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_orps];
+function x86_xorps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_xorps];
+function x86_andnps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_andnps];
+function x86_cmpss(r0, r1: __m128; imm: longint): __m128; [INTERNPROC: in_x86_cmpss];
+function x86_cmpps(r0, r1: __m128; imm: longint): __m128; [INTERNPROC: in_x86_cmpps];
+function x86_shufps(r0, r1: __m128; imm: longint): __m128; [INTERNPROC: in_x86_shufps];
+function x86_unpckhps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_unpckhps];
+function x86_unpcklps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_unpcklps];
+function x86_cvtsi2ss(r0: __m128; r1: longword): __m128; [INTERNPROC: in_x86_cvtsi2ss];
+function x86_cvtss2si(r1: __m128): longword; [INTERNPROC: in_x86_cvtss2si];
+function x86_cvttss2si(r1: __m128): longword; [INTERNPROC: in_x86_cvttss2si];
+function x86_cvtpi2ps(r0: __m128; r1: __m64): __m128; [INTERNPROC: in_x86_cvtpi2ps];
+function x86_cvtps2pi(r1: __m128): __m64; [INTERNPROC: in_x86_cvtps2pi];
+function x86_cvttps2pi(r1: __m128): __m64; [INTERNPROC: in_x86_cvttps2pi];
+function x86_pmulhuw(r0, r1: __m64): __m64; [INTERNPROC: in_x86_pmulhuw_mmx];
+function x86_psadbw(r0, r1: __m64): __m64; [INTERNPROC: in_x86_psadbw_mmx];
+function x86_pavgb(r0, r1: __m64): __m64; [INTERNPROC: in_x86_pavgb_mmx];
+function x86_pavgw(r0, r1: __m64): __m64; [INTERNPROC: in_x86_pavgw_mmx];
+function x86_pmaxub(r0, r1: __m64): __m64; [INTERNPROC: in_x86_pmaxub_mmx];
+function x86_pminub(r0, r1: __m64): __m64; [INTERNPROC: in_x86_pminub_mmx];
+function x86_pmaxsw(r0, r1: __m64): __m64; [INTERNPROC: in_x86_pmaxsw_mmx];
+function x86_pminsw(r0, r1: __m64): __m64; [INTERNPROC: in_x86_pminsw_mmx];
+function x86_pextrw(r1: __m64; imm: longint): longword; [INTERNPROC: in_x86_pextrw_mmx];
+function x86_pinsrw(r0: __m64; r1: longword; imm: longint): __m64; [INTERNPROC: in_x86_pinsrw_mmx];
+function x86_pmovmskb(r1: __m64): longword; [INTERNPROC: in_x86_pmovmskb];
+function x86_pshufw(r1: __m64; imm: longint): __m64; [INTERNPROC: in_x86_pshufw];
+function x86_pmulhuw(r0, r1: __m128): __m128; [INTERNPROC: in_x86_pmulhuw];
+function x86_psadbw(r0, r1: __m128): __m128; [INTERNPROC: in_x86_psadbw];
+function x86_pavgb(r0, r1: __m128): __m128; [INTERNPROC: in_x86_pavgb];
+function x86_pavgw(r0, r1: __m128): __m128; [INTERNPROC: in_x86_pavgw];
+function x86_pmaxub(r0, r1: __m128): __m128; [INTERNPROC: in_x86_pmaxub];
+function x86_pminub(r0, r1: __m128): __m128; [INTERNPROC: in_x86_pminub];
+function x86_pmaxsw(r0, r1: __m128): __m128; [INTERNPROC: in_x86_pmaxsw];
+function x86_pminsw(r0, r1: __m128): __m128; [INTERNPROC: in_x86_pminsw];
+function x86_pextrw(r1: __m128; imm: longint): longword; [INTERNPROC: in_x86_pextrw];
+function x86_pinsrw(r0: __m128; r1: longword; imm: longint): __m128; [INTERNPROC: in_x86_pinsrw];

+ 11 - 0
rtl/inc/innr.inc

@@ -139,7 +139,18 @@ const
 
 { ARM }
    in_arm_base             = 300;
+
+{ X86 }
+   in_x86_base         = 500;  
    
 {$ifdef ARM}
   {$i arminnr.inc}
 {$endif ARM}
+
+{$ifdef i386}
+  {$i x86innr.inc}
+{$endif i386}
+
+{$ifdef x86_64}
+  {$i x86innr.inc}
+{$endif x86_64}

+ 6 - 0
rtl/inc/systemh.inc

@@ -801,6 +801,12 @@ procedure WriteBarrier;
 {$ifdef ARM}
   {$i armprocs.inc}
 {$endif ARM}
+{$ifdef i386}
+  {$i x86procs.inc}
+{$endif i386}
+{$ifdef x86_64}
+  {$i x86procs.inc}
+{$endif x86_64}
 
 
 {****************************************************************************

+ 70 - 0
rtl/x86_64/x86innr.inc

@@ -0,0 +1,70 @@
+const
+  in_x86_first = in_x86_base;
+  in_x86_movss = in_x86_base+0;
+  in_x86_movaps = in_x86_base+1;
+  in_x86_movups = in_x86_base+2;
+  in_x86_movss_to_mem = in_x86_base+3;
+  in_x86_movaps_to_mem = in_x86_base+4;
+  in_x86_movups_to_mem = in_x86_base+5;
+  in_x86_movss_to_val = in_x86_base+6;
+  in_x86_movss_from_val = in_x86_base+7;
+  in_x86_movlps = in_x86_base+8;
+  in_x86_movhps = in_x86_base+9;
+  in_x86_movlhps = in_x86_base+10;
+  in_x86_movhlps = in_x86_base+11;
+  in_x86_addss = in_x86_base+12;
+  in_x86_subss = in_x86_base+13;
+  in_x86_mulss = in_x86_base+14;
+  in_x86_divss = in_x86_base+15;
+  in_x86_rcpss = in_x86_base+16;
+  in_x86_sqrtss = in_x86_base+17;
+  in_x86_maxss = in_x86_base+18;
+  in_x86_minss = in_x86_base+19;
+  in_x86_rsqrtss = in_x86_base+20;
+  in_x86_addps = in_x86_base+21;
+  in_x86_subps = in_x86_base+22;
+  in_x86_mulps = in_x86_base+23;
+  in_x86_divps = in_x86_base+24;
+  in_x86_rcpps = in_x86_base+25;
+  in_x86_sqrtps = in_x86_base+26;
+  in_x86_maxps = in_x86_base+27;
+  in_x86_minps = in_x86_base+28;
+  in_x86_rsqrtps = in_x86_base+29;
+  in_x86_andps = in_x86_base+30;
+  in_x86_orps = in_x86_base+31;
+  in_x86_xorps = in_x86_base+32;
+  in_x86_andnps = in_x86_base+33;
+  in_x86_cmpss = in_x86_base+34;
+  in_x86_cmpps = in_x86_base+35;
+  in_x86_shufps = in_x86_base+36;
+  in_x86_unpckhps = in_x86_base+37;
+  in_x86_unpcklps = in_x86_base+38;
+  in_x86_cvtsi2ss = in_x86_base+39;
+  in_x86_cvtss2si = in_x86_base+40;
+  in_x86_cvttss2si = in_x86_base+41;
+  in_x86_cvtpi2ps = in_x86_base+42;
+  in_x86_cvtps2pi = in_x86_base+43;
+  in_x86_cvttps2pi = in_x86_base+44;
+  in_x86_pmulhuw_mmx = in_x86_base+45;
+  in_x86_psadbw_mmx = in_x86_base+46;
+  in_x86_pavgb_mmx = in_x86_base+47;
+  in_x86_pavgw_mmx = in_x86_base+48;
+  in_x86_pmaxub_mmx = in_x86_base+49;
+  in_x86_pminub_mmx = in_x86_base+50;
+  in_x86_pmaxsw_mmx = in_x86_base+51;
+  in_x86_pminsw_mmx = in_x86_base+52;
+  in_x86_pextrw_mmx = in_x86_base+53;
+  in_x86_pinsrw_mmx = in_x86_base+54;
+  in_x86_pmovmskb = in_x86_base+55;
+  in_x86_pshufw = in_x86_base+56;
+  in_x86_pmulhuw = in_x86_base+57;
+  in_x86_psadbw = in_x86_base+58;
+  in_x86_pavgb = in_x86_base+59;
+  in_x86_pavgw = in_x86_base+60;
+  in_x86_pmaxub = in_x86_base+61;
+  in_x86_pminub = in_x86_base+62;
+  in_x86_pmaxsw = in_x86_base+63;
+  in_x86_pminsw = in_x86_base+64;
+  in_x86_pextrw = in_x86_base+65;
+  in_x86_pinsrw = in_x86_base+66;
+  in_x86_last = in_x86_base+66;

+ 67 - 0
rtl/x86_64/x86procs.inc

@@ -0,0 +1,67 @@
+function x86_movss(r1: pointer): __m128; [INTERNPROC: in_x86_movss];
+function x86_movaps(r1: pointer): __m128; [INTERNPROC: in_x86_movaps];
+function x86_movups(r1: pointer): __m128; [INTERNPROC: in_x86_movups];
+procedure x86_movss(r0: pointer; r1: __m128); [INTERNPROC: in_x86_movss_to_mem];
+procedure x86_movaps(r0: pointer; r1: __m128); [INTERNPROC: in_x86_movaps_to_mem];
+procedure x86_movups(r0: pointer; r1: __m128); [INTERNPROC: in_x86_movups_to_mem];
+function x86_movss(r1: __m128): single; [INTERNPROC: in_x86_movss_to_val];
+function x86_movss(r1: single): __m128; [INTERNPROC: in_x86_movss_from_val];
+function x86_movlps(r0: __m128; r1: pointer): __m128; [INTERNPROC: in_x86_movlps];
+function x86_movhps(r0: __m128; r1: pointer): __m128; [INTERNPROC: in_x86_movhps];
+function x86_movlhps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_movlhps];
+function x86_movhlps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_movhlps];
+function x86_addss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_addss];
+function x86_subss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_subss];
+function x86_mulss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_mulss];
+function x86_divss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_divss];
+function x86_rcpss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_rcpss];
+function x86_sqrtss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_sqrtss];
+function x86_maxss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_maxss];
+function x86_minss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_minss];
+function x86_rsqrtss(r0, r1: __m128): __m128; [INTERNPROC: in_x86_rsqrtss];
+function x86_addps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_addps];
+function x86_subps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_subps];
+function x86_mulps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_mulps];
+function x86_divps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_divps];
+function x86_rcpps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_rcpps];
+function x86_sqrtps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_sqrtps];
+function x86_maxps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_maxps];
+function x86_minps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_minps];
+function x86_rsqrtps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_rsqrtps];
+function x86_andps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_andps];
+function x86_orps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_orps];
+function x86_xorps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_xorps];
+function x86_andnps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_andnps];
+function x86_cmpss(r0, r1: __m128; imm: longint): __m128; [INTERNPROC: in_x86_cmpss];
+function x86_cmpps(r0, r1: __m128; imm: longint): __m128; [INTERNPROC: in_x86_cmpps];
+function x86_shufps(r0, r1: __m128; imm: longint): __m128; [INTERNPROC: in_x86_shufps];
+function x86_unpckhps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_unpckhps];
+function x86_unpcklps(r0, r1: __m128): __m128; [INTERNPROC: in_x86_unpcklps];
+function x86_cvtsi2ss(r0: __m128; r1: longword): __m128; [INTERNPROC: in_x86_cvtsi2ss];
+function x86_cvtss2si(r1: __m128): longword; [INTERNPROC: in_x86_cvtss2si];
+function x86_cvttss2si(r1: __m128): longword; [INTERNPROC: in_x86_cvttss2si];
+function x86_cvtpi2ps(r0: __m128; r1: __m64): __m128; [INTERNPROC: in_x86_cvtpi2ps];
+function x86_cvtps2pi(r1: __m128): __m64; [INTERNPROC: in_x86_cvtps2pi];
+function x86_cvttps2pi(r1: __m128): __m64; [INTERNPROC: in_x86_cvttps2pi];
+function x86_pmulhuw(r0, r1: __m64): __m64; [INTERNPROC: in_x86_pmulhuw_mmx];
+function x86_psadbw(r0, r1: __m64): __m64; [INTERNPROC: in_x86_psadbw_mmx];
+function x86_pavgb(r0, r1: __m64): __m64; [INTERNPROC: in_x86_pavgb_mmx];
+function x86_pavgw(r0, r1: __m64): __m64; [INTERNPROC: in_x86_pavgw_mmx];
+function x86_pmaxub(r0, r1: __m64): __m64; [INTERNPROC: in_x86_pmaxub_mmx];
+function x86_pminub(r0, r1: __m64): __m64; [INTERNPROC: in_x86_pminub_mmx];
+function x86_pmaxsw(r0, r1: __m64): __m64; [INTERNPROC: in_x86_pmaxsw_mmx];
+function x86_pminsw(r0, r1: __m64): __m64; [INTERNPROC: in_x86_pminsw_mmx];
+function x86_pextrw(r1: __m64; imm: longint): longword; [INTERNPROC: in_x86_pextrw_mmx];
+function x86_pinsrw(r0: __m64; r1: longword; imm: longint): __m64; [INTERNPROC: in_x86_pinsrw_mmx];
+function x86_pmovmskb(r1: __m64): longword; [INTERNPROC: in_x86_pmovmskb];
+function x86_pshufw(r1: __m64; imm: longint): __m64; [INTERNPROC: in_x86_pshufw];
+function x86_pmulhuw(r0, r1: __m128): __m128; [INTERNPROC: in_x86_pmulhuw];
+function x86_psadbw(r0, r1: __m128): __m128; [INTERNPROC: in_x86_psadbw];
+function x86_pavgb(r0, r1: __m128): __m128; [INTERNPROC: in_x86_pavgb];
+function x86_pavgw(r0, r1: __m128): __m128; [INTERNPROC: in_x86_pavgw];
+function x86_pmaxub(r0, r1: __m128): __m128; [INTERNPROC: in_x86_pmaxub];
+function x86_pminub(r0, r1: __m128): __m128; [INTERNPROC: in_x86_pminub];
+function x86_pmaxsw(r0, r1: __m128): __m128; [INTERNPROC: in_x86_pmaxsw];
+function x86_pminsw(r0, r1: __m128): __m128; [INTERNPROC: in_x86_pminsw];
+function x86_pextrw(r1: __m128; imm: longint): longword; [INTERNPROC: in_x86_pextrw];
+function x86_pinsrw(r0: __m128; r1: longword; imm: longint): __m128; [INTERNPROC: in_x86_pinsrw];