فهرست منبع

* synchronised with trunk up till r26975

git-svn-id: branches/hlcgllvm@26976 -
Jonas Maebe 11 سال پیش
والد
کامیت
e9268a0a14
100فایلهای تغییر یافته به همراه4466 افزوده شده و 1943 حذف شده
  1. 274 119
      .gitattributes
  2. 32 6
      Makefile
  3. 8 2
      Makefile.fpc
  4. 50 11
      compiler/Makefile
  5. 13 7
      compiler/Makefile.fpc
  6. 6 1
      compiler/aasmtai.pas
  7. 77 32
      compiler/aggas.pas
  8. 3 2
      compiler/agjasmin.pas
  9. 121 113
      compiler/aoptobj.pas
  10. 4 7
      compiler/arm/aasmcpu.pas
  11. 2 2
      compiler/arm/agarmgas.pas
  12. 461 260
      compiler/arm/aoptcpu.pas
  13. 69 44
      compiler/arm/cgcpu.pas
  14. 16 28
      compiler/arm/cpubase.pas
  15. 10 0
      compiler/arm/cpuelf.pas
  16. 15 14
      compiler/arm/cpuinfo.pas
  17. 19 10
      compiler/arm/cpupara.pas
  18. 9 4
      compiler/arm/cpupi.pas
  19. 42 18
      compiler/arm/narmadd.pas
  20. 91 77
      compiler/arm/narmmat.pas
  21. 48 3
      compiler/arm/rgcpu.pas
  22. 31 1
      compiler/assemble.pas
  23. 7 1
      compiler/avr/aasmcpu.pas
  24. 1 1
      compiler/avr/agavrgas.pas
  25. 233 14
      compiler/avr/aoptcpu.pas
  26. 26 8
      compiler/avr/aoptcpub.pas
  27. 173 62
      compiler/avr/cgcpu.pas
  28. 12 4
      compiler/avr/cpubase.pas
  29. 10 10
      compiler/avr/cpuinfo.pas
  30. 60 45
      compiler/avr/cpupara.pas
  31. 1 1
      compiler/avr/cpupi.pas
  32. 24 10
      compiler/avr/raavrgas.pas
  33. 6 6
      compiler/cclasses.pas
  34. 30 3
      compiler/cfileutl.pas
  35. 5 1
      compiler/cg64f32.pas
  36. 48 24
      compiler/cgobj.pas
  37. 62 2
      compiler/cgutils.pas
  38. 1 4
      compiler/crefs.pas
  39. 45 61
      compiler/cresstr.pas
  40. 9 1
      compiler/cstreams.pas
  41. 5 1
      compiler/dbgdwarf.pas
  42. 1 0
      compiler/dbgstabs.pas
  43. 1 0
      compiler/dbgstabx.pas
  44. 1 0
      compiler/dirparse.pas
  45. 3 0
      compiler/fpcdefs.inc
  46. 14 8
      compiler/globals.pas
  47. 9 6
      compiler/globtype.pas
  48. 15 10
      compiler/hlcg2ll.pas
  49. 16 29
      compiler/hlcgobj.pas
  50. 9 2
      compiler/htypechk.pas
  51. 16 10
      compiler/i386/aopt386.pas
  52. 17 22
      compiler/i386/cgcpu.pas
  53. 2 0
      compiler/i386/cpuelf.pas
  54. 34 2
      compiler/i386/cpuinfo.pas
  55. 3 0
      compiler/i386/cpunode.pas
  56. 0 53
      compiler/i386/cpupara.pas
  57. 17 7
      compiler/i386/csopt386.pas
  58. 1 0
      compiler/i386/daopt386.pas
  59. 1 0
      compiler/i386/i386att.inc
  60. 1 0
      compiler/i386/i386atts.inc
  61. 1 0
      compiler/i386/i386int.inc
  62. 1 1
      compiler/i386/i386nop.inc
  63. 1 0
      compiler/i386/i386op.inc
  64. 79 78
      compiler/i386/i386prop.inc
  65. 7 0
      compiler/i386/i386tab.inc
  66. 146 21
      compiler/i386/n386add.pas
  67. 2 1
      compiler/i386/n386cal.pas
  68. 685 0
      compiler/i386/n386flw.pas
  69. 5 3
      compiler/i386/popt386.pas
  70. 1 0
      compiler/i386/rropt386.pas
  71. 445 105
      compiler/i8086/cgcpu.pas
  72. 1 1
      compiler/i8086/cpuinfo.pas
  73. 1 0
      compiler/i8086/i8086att.inc
  74. 1 0
      compiler/i8086/i8086atts.inc
  75. 1 0
      compiler/i8086/i8086int.inc
  76. 1 1
      compiler/i8086/i8086nop.inc
  77. 1 0
      compiler/i8086/i8086op.inc
  78. 79 78
      compiler/i8086/i8086prop.inc
  79. 7 0
      compiler/i8086/i8086tab.inc
  80. 10 11
      compiler/i8086/n8086add.pas
  81. 32 3
      compiler/i8086/n8086cal.pas
  82. 17 1
      compiler/i8086/n8086inl.pas
  83. 3 3
      compiler/jvm/cpubase.pas
  84. 4 2
      compiler/jvm/hlcgcpu.pas
  85. 4 1
      compiler/jvm/jvmdef.pas
  86. 2 29
      compiler/jvm/njvmadd.pas
  87. 10 5
      compiler/jvm/njvmmem.pas
  88. 3 0
      compiler/jvm/pjvm.pas
  89. 25 6
      compiler/link.pas
  90. 1 1
      compiler/m68k/ag68kgas.pas
  91. 399 242
      compiler/m68k/cgcpu.pas
  92. 1 1
      compiler/m68k/cpubase.pas
  93. 25 91
      compiler/m68k/cpupara.pas
  94. 12 12
      compiler/m68k/n68kadd.pas
  95. 27 36
      compiler/m68k/n68kcnv.pas
  96. 65 11
      compiler/mips/aasmcpu.pas
  97. 19 8
      compiler/mips/aoptcpu.pas
  98. 16 17
      compiler/mips/cgcpu.pas
  99. 4 4
      compiler/mips/cpugas.pas
  100. 2 1
      compiler/mips/cpuinfo.pas

تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
+ 274 - 119
.gitattributes


+ 32 - 6
Makefile

@@ -1,8 +1,8 @@
 #
-# Don't edit, this file is generated by FPCMake Version 2.0.0 [2013-10-05 rev 25642]
+# Don't edit, this file is generated by FPCMake Version 2.0.0 [2014-02-06 rev 26692]
 #
 default: help
-MAKEFILETARGETS=i386-linux i386-go32v2 i386-win32 i386-os2 i386-freebsd i386-beos i386-haiku i386-netbsd i386-solaris i386-qnx i386-netware i386-openbsd i386-wdosx i386-darwin i386-emx i386-watcom i386-netwlibc i386-wince i386-embedded i386-symbian i386-nativent i386-iphonesim i386-android m68k-linux m68k-freebsd m68k-netbsd m68k-amiga m68k-atari m68k-openbsd m68k-palmos m68k-embedded powerpc-linux powerpc-netbsd powerpc-amiga powerpc-macos powerpc-darwin powerpc-morphos powerpc-embedded powerpc-wii powerpc-aix sparc-linux sparc-netbsd sparc-solaris sparc-embedded x86_64-linux x86_64-freebsd x86_64-netbsd x86_64-solaris x86_64-openbsd x86_64-darwin x86_64-win64 x86_64-embedded arm-linux arm-palmos arm-darwin arm-wince arm-gba arm-nds arm-embedded arm-symbian arm-android powerpc64-linux powerpc64-darwin powerpc64-embedded powerpc64-aix avr-embedded armeb-linux armeb-embedded mips-linux mipsel-linux jvm-java jvm-android i8086-msdos
+MAKEFILETARGETS=i386-linux i386-go32v2 i386-win32 i386-os2 i386-freebsd i386-beos i386-haiku i386-netbsd i386-solaris i386-qnx i386-netware i386-openbsd i386-wdosx i386-darwin i386-emx i386-watcom i386-netwlibc i386-wince i386-embedded i386-symbian i386-nativent i386-iphonesim i386-android m68k-linux m68k-freebsd m68k-netbsd m68k-amiga m68k-atari m68k-openbsd m68k-palmos m68k-embedded powerpc-linux powerpc-netbsd powerpc-amiga powerpc-macos powerpc-darwin powerpc-morphos powerpc-embedded powerpc-wii powerpc-aix sparc-linux sparc-netbsd sparc-solaris sparc-embedded x86_64-linux x86_64-freebsd x86_64-netbsd x86_64-solaris x86_64-openbsd x86_64-darwin x86_64-win64 x86_64-embedded arm-linux arm-palmos arm-darwin arm-wince arm-gba arm-nds arm-embedded arm-symbian arm-android powerpc64-linux powerpc64-darwin powerpc64-embedded powerpc64-aix avr-embedded armeb-linux armeb-embedded mips-linux mipsel-linux mipsel-android jvm-java jvm-android i8086-msdos
 BSDs = freebsd netbsd openbsd darwin
 UNIXs = linux $(BSDs) solaris qnx haiku aix
 LIMIT83fs = go32v2 os2 emx watcom msdos
@@ -276,7 +276,7 @@ else
 ifeq ($(CPU_TARGET),i386)
 BINUTILSPREFIX=i686-linux-android-
 else
-ifeq ($(CPU_TARGET),mips)
+ifeq ($(CPU_TARGET),mipsel)
 BINUTILSPREFIX=mipsel-linux-android-
 endif
 endif
@@ -322,7 +322,7 @@ endif
 override PACKAGE_NAME=fpc
 override PACKAGE_VERSION=2.7.1
 REQUIREDVERSION=2.6.2
-REQUIREDVERSION2=2.6.2
+REQUIREDVERSION2=2.6.4
 ifndef inOS2
 override FPCDIR:=$(BASEDIR)
 export FPCDIR
@@ -370,6 +370,9 @@ endif
 ifeq ($(CPU_TARGET),i8086)
 PPSUF=8086
 endif
+ifeq ($(CPU_TARGET),avr)
+PPSUF=avr
+endif
 ifdef CROSSCOMPILE
 ifneq ($(CPU_TARGET),jvm)
 PPPRE=ppcross
@@ -442,8 +445,11 @@ else
 FPCMAKENEW=$(BASEDIR)/utils/fpcm/bin/$(SOURCESUFFIX)/fpcmake$(SRCEXEEXT)
 endif
 endif
+ifneq ($(OPT),)
+OPTNEW+=$(OPT)
+endif
 CLEANOPTS=FPC=$(PPNEW)
-BUILDOPTS=FPC=$(PPNEW) FPCFPMAKE=$(FPCFPMAKENEW) RELEASE=1
+BUILDOPTS=FPC=$(PPNEW) FPCFPMAKE=$(FPCFPMAKENEW) RELEASE=1 'OPT=$(OPTNEW)'
 INSTALLOPTS=FPC=$(PPNEW) ZIPDESTDIR=$(BASEDIR) FPCMAKE=$(FPCMAKENEW)
 ifndef CROSSCOMPILE
 ifneq ($(wildcard ide),)
@@ -672,6 +678,9 @@ endif
 ifeq ($(FULL_TARGET),mipsel-linux)
 override TARGET_DIRS+=compiler rtl utils packages ide installer
 endif
+ifeq ($(FULL_TARGET),mipsel-android)
+override TARGET_DIRS+=compiler rtl utils packages ide installer
+endif
 ifeq ($(FULL_TARGET),jvm-java)
 override TARGET_DIRS+=compiler rtl utils packages ide installer
 endif
@@ -1255,6 +1264,7 @@ ASNAME=$(BINUTILSPREFIX)as
 LDNAME=$(BINUTILSPREFIX)ld
 ARNAME=$(BINUTILSPREFIX)ar
 RCNAME=$(BINUTILSPREFIX)rc
+NASMNAME=$(BINUTILSPREFIX)nasm
 ifndef ASPROG
 ifdef CROSSBINDIR
 ASPROG=$(CROSSBINDIR)/$(ASNAME)$(SRCEXEEXT)
@@ -1283,10 +1293,18 @@ else
 ARPROG=$(ARNAME)
 endif
 endif
+ifndef NASMPROG
+ifdef CROSSBINDIR
+NASMPROG=$(CROSSBINDIR)/$(NASMNAME)$(SRCEXEEXT)
+else
+NASMPROG=$(NASMNAME)
+endif
+endif
 AS=$(ASPROG)
 LD=$(LDPROG)
 RC=$(RCPROG)
 AR=$(ARPROG)
+NASM=$(NASMPROG)
 ifdef inUnix
 PPAS=./ppas$(SRCBATCHEXT)
 else
@@ -1453,7 +1471,7 @@ endif
 ifdef ACROSSCOMPILE
 override FPCOPT+=$(CROSSOPT)
 endif
-override COMPILER:=$(FPC) $(FPCOPT)
+override COMPILER:=$(strip $(FPC) $(FPCOPT))
 ifeq (,$(findstring -s ,$(COMPILER)))
 EXECPPAS=
 else
@@ -2366,6 +2384,14 @@ TARGET_DIRS_PACKAGES=1
 TARGET_DIRS_IDE=1
 TARGET_DIRS_INSTALLER=1
 endif
+ifeq ($(FULL_TARGET),mipsel-android)
+TARGET_DIRS_COMPILER=1
+TARGET_DIRS_RTL=1
+TARGET_DIRS_UTILS=1
+TARGET_DIRS_PACKAGES=1
+TARGET_DIRS_IDE=1
+TARGET_DIRS_INSTALLER=1
+endif
 ifeq ($(FULL_TARGET),jvm-java)
 TARGET_DIRS_COMPILER=1
 TARGET_DIRS_RTL=1

+ 8 - 2
Makefile.fpc

@@ -21,7 +21,7 @@ rule=help
 
 [prerules]
 REQUIREDVERSION=2.6.2
-REQUIREDVERSION2=2.6.2
+REQUIREDVERSION2=2.6.4
 
 
 # make versions < 3.77 (OS2 version) are buggy
@@ -76,6 +76,9 @@ endif
 ifeq ($(CPU_TARGET),i8086)
 PPSUF=8086
 endif
+ifeq ($(CPU_TARGET),avr)
+PPSUF=avr
+endif
 
 # cross compilers uses full cpu_target, not just ppc-suffix
 # (except if the target cannot run a native compiler)
@@ -179,8 +182,11 @@ endif
 endif
 
 # Build/install options
+ifneq ($(OPT),)
+OPTNEW+=$(OPT)
+endif
 CLEANOPTS=FPC=$(PPNEW)
-BUILDOPTS=FPC=$(PPNEW) FPCFPMAKE=$(FPCFPMAKENEW) RELEASE=1
+BUILDOPTS=FPC=$(PPNEW) FPCFPMAKE=$(FPCFPMAKENEW) RELEASE=1 'OPT=$(OPTNEW)'
 INSTALLOPTS=FPC=$(PPNEW) ZIPDESTDIR=$(BASEDIR) FPCMAKE=$(FPCMAKENEW)
 
 # Compile also IDE (check for ide and fv dir)

+ 50 - 11
compiler/Makefile

@@ -1,8 +1,8 @@
 #
-# Don't edit, this file is generated by FPCMake Version 2.0.0 [2013/10/05]
+# Don't edit, this file is generated by FPCMake Version 2.0.0 [2014-02-06 rev 26692]
 #
 default: all
-MAKEFILETARGETS=i386-linux i386-go32v2 i386-win32 i386-os2 i386-freebsd i386-beos i386-haiku i386-netbsd i386-solaris i386-qnx i386-netware i386-openbsd i386-wdosx i386-darwin i386-emx i386-watcom i386-netwlibc i386-wince i386-embedded i386-symbian i386-nativent i386-iphonesim i386-android m68k-linux m68k-freebsd m68k-netbsd m68k-amiga m68k-atari m68k-openbsd m68k-palmos m68k-embedded powerpc-linux powerpc-netbsd powerpc-amiga powerpc-macos powerpc-darwin powerpc-morphos powerpc-embedded powerpc-wii powerpc-aix sparc-linux sparc-netbsd sparc-solaris sparc-embedded x86_64-linux x86_64-freebsd x86_64-netbsd x86_64-solaris x86_64-openbsd x86_64-darwin x86_64-win64 x86_64-embedded arm-linux arm-palmos arm-darwin arm-wince arm-gba arm-nds arm-embedded arm-symbian arm-android powerpc64-linux powerpc64-darwin powerpc64-embedded powerpc64-aix avr-embedded armeb-linux armeb-embedded mips-linux mipsel-linux jvm-java jvm-android i8086-msdos
+MAKEFILETARGETS=i386-linux i386-go32v2 i386-win32 i386-os2 i386-freebsd i386-beos i386-haiku i386-netbsd i386-solaris i386-qnx i386-netware i386-openbsd i386-wdosx i386-darwin i386-emx i386-watcom i386-netwlibc i386-wince i386-embedded i386-symbian i386-nativent i386-iphonesim i386-android m68k-linux m68k-freebsd m68k-netbsd m68k-amiga m68k-atari m68k-openbsd m68k-palmos m68k-embedded powerpc-linux powerpc-netbsd powerpc-amiga powerpc-macos powerpc-darwin powerpc-morphos powerpc-embedded powerpc-wii powerpc-aix sparc-linux sparc-netbsd sparc-solaris sparc-embedded x86_64-linux x86_64-freebsd x86_64-netbsd x86_64-solaris x86_64-openbsd x86_64-darwin x86_64-win64 x86_64-embedded arm-linux arm-palmos arm-darwin arm-wince arm-gba arm-nds arm-embedded arm-symbian arm-android powerpc64-linux powerpc64-darwin powerpc64-embedded powerpc64-aix avr-embedded armeb-linux armeb-embedded mips-linux mipsel-linux mipsel-android jvm-java jvm-android i8086-msdos
 BSDs = freebsd netbsd openbsd darwin
 UNIXs = linux $(BSDs) solaris qnx haiku aix
 LIMIT83fs = go32v2 os2 emx watcom msdos
@@ -276,7 +276,7 @@ else
 ifeq ($(CPU_TARGET),i386)
 BINUTILSPREFIX=i686-linux-android-
 else
-ifeq ($(CPU_TARGET),mips)
+ifeq ($(CPU_TARGET),mipsel)
 BINUTILSPREFIX=mipsel-linux-android-
 endif
 endif
@@ -402,18 +402,24 @@ override LOCALOPT+=$(OPTLEVEL2)
 override RTLOPT+=$(OPTLEVEL2)
 override LOCALOPT+=$(LOCALOPTLEVEL2)
 override RTLOPT+=$(RTLOPTLEVEL2)
+override LOCALOPT+=$(OPTNEW)
+override RTLOPT+=$(OPTNEW)
 endif
 ifeq ($(CYCLELEVEL),3)
 override LOCALOPT+=$(OPTLEVEL3)
 override RTLOPT+=$(OPTLEVEL3)
 override LOCALOPT+=$(LOCALOPTLEVEL3)
 override RTLOPT+=$(RTLOPTLEVEL3)
+override LOCALOPT+=$(OPTNEW)
+override RTLOPT+=$(OPTNEW)
 endif
 ifeq ($(CYCLELEVEL),4)
 override LOCALOPT+=$(OPTLEVEL4)
 override RTLOPT+=$(OPTLEVEL4)
 override LOCALOPT+=$(LOCALOPTLEVEL4)
 override RTLOPT+=$(RTLOPTLEVEL4)
+override LOCALOPT+=$(OPTNEW)
+override RTLOPT+=$(OPTNEW)
 endif
 endif
 override OPT=
@@ -743,6 +749,9 @@ endif
 ifeq ($(FULL_TARGET),mipsel-linux)
 override TARGET_DIRS+=utils
 endif
+ifeq ($(FULL_TARGET),mipsel-android)
+override TARGET_DIRS+=utils
+endif
 ifeq ($(FULL_TARGET),jvm-java)
 override TARGET_DIRS+=utils
 endif
@@ -962,6 +971,9 @@ endif
 ifeq ($(FULL_TARGET),mipsel-linux)
 override TARGET_PROGRAMS+=pp
 endif
+ifeq ($(FULL_TARGET),mipsel-android)
+override TARGET_PROGRAMS+=pp
+endif
 ifeq ($(FULL_TARGET),jvm-java)
 override TARGET_PROGRAMS+=pp
 endif
@@ -1182,6 +1194,9 @@ endif
 ifeq ($(FULL_TARGET),mipsel-linux)
 override COMPILER_INCLUDEDIR+=$(CPC_TARGET)
 endif
+ifeq ($(FULL_TARGET),mipsel-android)
+override COMPILER_INCLUDEDIR+=$(CPC_TARGET)
+endif
 ifeq ($(FULL_TARGET),jvm-java)
 override COMPILER_INCLUDEDIR+=$(CPC_TARGET)
 endif
@@ -1401,6 +1416,9 @@ endif
 ifeq ($(FULL_TARGET),mipsel-linux)
 override COMPILER_UNITDIR+=$(COMPILERSOURCEDIR)
 endif
+ifeq ($(FULL_TARGET),mipsel-android)
+override COMPILER_UNITDIR+=$(COMPILERSOURCEDIR)
+endif
 ifeq ($(FULL_TARGET),jvm-java)
 override COMPILER_UNITDIR+=$(COMPILERSOURCEDIR)
 endif
@@ -1620,6 +1638,9 @@ endif
 ifeq ($(FULL_TARGET),mipsel-linux)
 override COMPILER_TARGETDIR+=.
 endif
+ifeq ($(FULL_TARGET),mipsel-android)
+override COMPILER_TARGETDIR+=.
+endif
 ifeq ($(FULL_TARGET),jvm-java)
 override COMPILER_TARGETDIR+=.
 endif
@@ -1839,6 +1860,9 @@ endif
 ifeq ($(FULL_TARGET),mipsel-linux)
 override COMPILER_UNITTARGETDIR+=$(CPU_UNITDIR)/units/$(FULL_TARGET)
 endif
+ifeq ($(FULL_TARGET),mipsel-android)
+override COMPILER_UNITTARGETDIR+=$(CPU_UNITDIR)/units/$(FULL_TARGET)
+endif
 ifeq ($(FULL_TARGET),jvm-java)
 override COMPILER_UNITTARGETDIR+=$(CPU_UNITDIR)/units/$(FULL_TARGET)
 endif
@@ -2421,6 +2445,7 @@ ASNAME=$(BINUTILSPREFIX)as
 LDNAME=$(BINUTILSPREFIX)ld
 ARNAME=$(BINUTILSPREFIX)ar
 RCNAME=$(BINUTILSPREFIX)rc
+NASMNAME=$(BINUTILSPREFIX)nasm
 ifndef ASPROG
 ifdef CROSSBINDIR
 ASPROG=$(CROSSBINDIR)/$(ASNAME)$(SRCEXEEXT)
@@ -2449,10 +2474,18 @@ else
 ARPROG=$(ARNAME)
 endif
 endif
+ifndef NASMPROG
+ifdef CROSSBINDIR
+NASMPROG=$(CROSSBINDIR)/$(NASMNAME)$(SRCEXEEXT)
+else
+NASMPROG=$(NASMNAME)
+endif
+endif
 AS=$(ASPROG)
 LD=$(LDPROG)
 RC=$(RCPROG)
 AR=$(ARPROG)
+NASM=$(NASMPROG)
 ifdef inUnix
 PPAS=./ppas$(SRCBATCHEXT)
 else
@@ -2688,6 +2721,9 @@ endif
 ifeq ($(FULL_TARGET),mipsel-linux)
 REQUIRE_PACKAGES_RTL=1
 endif
+ifeq ($(FULL_TARGET),mipsel-android)
+REQUIRE_PACKAGES_RTL=1
+endif
 ifeq ($(FULL_TARGET),jvm-java)
 REQUIRE_PACKAGES_RTL=1
 endif
@@ -2877,7 +2913,7 @@ endif
 ifdef ACROSSCOMPILE
 override FPCOPT+=$(CROSSOPT)
 endif
-override COMPILER:=$(FPC) $(FPCOPT)
+override COMPILER:=$(strip $(FPC) $(FPCOPT))
 ifeq (,$(findstring -s ,$(COMPILER)))
 EXECPPAS=
 else
@@ -3533,6 +3569,9 @@ endif
 ifeq ($(FULL_TARGET),mipsel-linux)
 TARGET_DIRS_UTILS=1
 endif
+ifeq ($(FULL_TARGET),mipsel-android)
+TARGET_DIRS_UTILS=1
+endif
 ifeq ($(FULL_TARGET),jvm-java)
 TARGET_DIRS_UTILS=1
 endif
@@ -3798,13 +3837,13 @@ ifdef RELEASE
 DOWPOCYCLE=1
 wpocycle:
 	$(RM) $(EXENAME)
-	$(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME3)' 'OPT=$(LOCALOPT) $(OPTWPOCOLLECT)' compiler
+	$(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME3)' 'OPT=$(strip $(LOCALOPT) $(OPTWPOCOLLECT) $(OPTNEW))' compiler
 	$(RM) $(EXENAME)
-	$(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME3)' 'OPT=$(RTLOPT) $(OPTWPOPERFORM)' rtlclean rtl
-	$(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME3)' 'OPT=$(LOCALOPT) $(OPTWPOPERFORM) $(subst pp1.wpo,pp2.wpo,$(OPTWPOCOLLECT))' $(addsuffix _clean,$(ALLTARGETS)) compiler
+	$(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME3)' 'OPT=$(strip $(RTLOPT) $(OPTWPOPERFORM) $(OPTNEW))' rtlclean rtl
+	$(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME3)' 'OPT=$(strip $(LOCALOPT) $(OPTNEW) $(OPTWPOPERFORM) $(subst pp1.wpo,pp2.wpo,$(OPTWPOCOLLECT)))' $(addsuffix _clean,$(ALLTARGETS)) compiler
 	$(MOVE) $(EXENAME) $(TEMPWPONAME1)
-	$(MAKE) 'FPC=$(BASEDIR)/$(TEMPWPONAME1)' 'OPT=$(RTLOPT) $(subst pp1.wpo,pp2.wpo,$(OPTWPOPERFORM))' rtlclean rtl
-	$(MAKE) 'FPC=$(BASEDIR)/$(TEMPWPONAME1)' 'OPT=$(LOCALOPT) $(subst pp1.wpo,pp2.wpo,$(OPTWPOPERFORM))' $(addsuffix _clean,$(ALLTARGETS)) compiler
+	$(MAKE) 'FPC=$(BASEDIR)/$(TEMPWPONAME1)' 'OPT=$(strip $(RTLOPT) $(OPTNEW) $(subst pp1.wpo,pp2.wpo,$(OPTWPOPERFORM)))' rtlclean rtl
+	$(MAKE) 'FPC=$(BASEDIR)/$(TEMPWPONAME1)' 'OPT=$(strip $(LOCALOPT) $(OPTNEW) $(subst pp1.wpo,pp2.wpo,$(OPTWPOPERFORM)))' $(addsuffix _clean,$(ALLTARGETS)) compiler
 	$(COPY) $(EXENAME) $(TEMPWPONAME2)
 endif
 endif
@@ -3873,12 +3912,12 @@ cycle: override FPC=
 cycle:
 	$(MAKE) OS_TARGET=$(OS_SOURCE) CPU_TARGET=$(CPU_SOURCE) CROSSBINDIR= BINUTILSPREFIX= CROSSCYCLEBOOTSTRAP=1 rtlclean rtl CYCLELEVEL=1
 	$(MAKE) OS_TARGET=$(OS_SOURCE) CPU_TARGET=$(CPU_SOURCE) EXENAME=$(TEMPNAME) CROSSBINDIR= BINUTILSPREFIX= CROSSCYCLEBOOTSTRAP=1 cycleclean compiler CYCLELEVEL=1
-	$(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME)' OS_TARGET=$(OS_SOURCE) CPU_TARGET=$(CPU_SOURCE) CROSSBINDIR= BINUTILSPREFIX= CROSSCYCLEBOOTSTRAP=1 rtlclean rtl CYCLELEVEL=
+	$(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME)' OS_TARGET=$(OS_SOURCE) CPU_TARGET=$(CPU_SOURCE) CROSSBINDIR= BINUTILSPREFIX= CROSSCYCLEBOOTSTRAP=1 rtlclean rtl CYCLELEVEL=2
 	$(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME)' OS_TARGET=$(OS_SOURCE) CPU_TARGET=$(CPU_SOURCE) PPC_TARGET=$(CPU_TARGET) EXENAME=$(PPCROSSNAME) CROSSBINDIR= BINUTILSPREFIX= CROSSCYCLEBOOTSTRAP=1 cycleclean compiler CYCLELEVEL=2
 ifndef CROSSINSTALL
 	$(MAKE) 'FPC=$(BASEDIR)/$(PPCROSSNAME)' 'OPT=$(RTLOPT) $(CROSSOPT)' rtlclean rtl CYCLELEVEL=3
 ifndef NoNativeBinaries
-	$(MAKE) 'FPC=$(BASEDIR)/$(PPCROSSNAME)' 'OPT=$(LOCALOPT) $(CROSSOPT)' cycleclean compiler CYCLELEVEL=3
+	$(MAKE) 'FPC=$(BASEDIR)/$(PPCROSSNAME)' 'OPT=$(strip $(LOCALOPT) $(CROSSOPT))' cycleclean compiler CYCLELEVEL=3
 endif
 endif
 endif

+ 13 - 7
compiler/Makefile.fpc

@@ -138,18 +138,24 @@ override LOCALOPT+=$(OPTLEVEL2)
 override RTLOPT+=$(OPTLEVEL2)
 override LOCALOPT+=$(LOCALOPTLEVEL2)
 override RTLOPT+=$(RTLOPTLEVEL2)
+override LOCALOPT+=$(OPTNEW)
+override RTLOPT+=$(OPTNEW)
 endif
 ifeq ($(CYCLELEVEL),3)
 override LOCALOPT+=$(OPTLEVEL3)
 override RTLOPT+=$(OPTLEVEL3)
 override LOCALOPT+=$(LOCALOPTLEVEL3)
 override RTLOPT+=$(RTLOPTLEVEL3)
+override LOCALOPT+=$(OPTNEW)
+override RTLOPT+=$(OPTNEW)
 endif
 ifeq ($(CYCLELEVEL),4)
 override LOCALOPT+=$(OPTLEVEL4)
 override RTLOPT+=$(OPTLEVEL4)
 override LOCALOPT+=$(LOCALOPTLEVEL4)
 override RTLOPT+=$(RTLOPTLEVEL4)
+override LOCALOPT+=$(OPTNEW)
+override RTLOPT+=$(OPTNEW)
 endif
 endif
 
@@ -596,13 +602,13 @@ DOWPOCYCLE=1
 wpocycle:
 # don't use cycle_clean, it will delete the compiler utilities again
         $(RM) $(EXENAME)
-        $(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME3)' 'OPT=$(LOCALOPT) $(OPTWPOCOLLECT)' compiler
+        $(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME3)' 'OPT=$(strip $(LOCALOPT) $(OPTWPOCOLLECT) $(OPTNEW))' compiler
         $(RM) $(EXENAME)
-        $(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME3)' 'OPT=$(RTLOPT) $(OPTWPOPERFORM)' rtlclean rtl
-        $(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME3)' 'OPT=$(LOCALOPT) $(OPTWPOPERFORM) $(subst pp1.wpo,pp2.wpo,$(OPTWPOCOLLECT))' $(addsuffix _clean,$(ALLTARGETS)) compiler
+        $(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME3)' 'OPT=$(strip $(RTLOPT) $(OPTWPOPERFORM) $(OPTNEW))' rtlclean rtl
+        $(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME3)' 'OPT=$(strip $(LOCALOPT) $(OPTNEW) $(OPTWPOPERFORM) $(subst pp1.wpo,pp2.wpo,$(OPTWPOCOLLECT)))' $(addsuffix _clean,$(ALLTARGETS)) compiler
         $(MOVE) $(EXENAME) $(TEMPWPONAME1)
-        $(MAKE) 'FPC=$(BASEDIR)/$(TEMPWPONAME1)' 'OPT=$(RTLOPT) $(subst pp1.wpo,pp2.wpo,$(OPTWPOPERFORM))' rtlclean rtl
-        $(MAKE) 'FPC=$(BASEDIR)/$(TEMPWPONAME1)' 'OPT=$(LOCALOPT) $(subst pp1.wpo,pp2.wpo,$(OPTWPOPERFORM))' $(addsuffix _clean,$(ALLTARGETS)) compiler
+        $(MAKE) 'FPC=$(BASEDIR)/$(TEMPWPONAME1)' 'OPT=$(strip $(RTLOPT) $(OPTNEW) $(subst pp1.wpo,pp2.wpo,$(OPTWPOPERFORM)))' rtlclean rtl
+        $(MAKE) 'FPC=$(BASEDIR)/$(TEMPWPONAME1)' 'OPT=$(strip $(LOCALOPT) $(OPTNEW) $(subst pp1.wpo,pp2.wpo,$(OPTWPOPERFORM)))' $(addsuffix _clean,$(ALLTARGETS)) compiler
         $(COPY) $(EXENAME) $(TEMPWPONAME2)
 endif
 endif
@@ -704,14 +710,14 @@ cycle:
         $(MAKE) OS_TARGET=$(OS_SOURCE) CPU_TARGET=$(CPU_SOURCE) CROSSBINDIR= BINUTILSPREFIX= CROSSCYCLEBOOTSTRAP=1 rtlclean rtl CYCLELEVEL=1
         $(MAKE) OS_TARGET=$(OS_SOURCE) CPU_TARGET=$(CPU_SOURCE) EXENAME=$(TEMPNAME) CROSSBINDIR= BINUTILSPREFIX= CROSSCYCLEBOOTSTRAP=1 cycleclean compiler CYCLELEVEL=1
 # ppcross<ARCH> (source native)
-        $(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME)' OS_TARGET=$(OS_SOURCE) CPU_TARGET=$(CPU_SOURCE) CROSSBINDIR= BINUTILSPREFIX= CROSSCYCLEBOOTSTRAP=1 rtlclean rtl CYCLELEVEL=
+        $(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME)' OS_TARGET=$(OS_SOURCE) CPU_TARGET=$(CPU_SOURCE) CROSSBINDIR= BINUTILSPREFIX= CROSSCYCLEBOOTSTRAP=1 rtlclean rtl CYCLELEVEL=2
         $(MAKE) 'FPC=$(BASEDIR)/$(TEMPNAME)' OS_TARGET=$(OS_SOURCE) CPU_TARGET=$(CPU_SOURCE) PPC_TARGET=$(CPU_TARGET) EXENAME=$(PPCROSSNAME) CROSSBINDIR= BINUTILSPREFIX= CROSSCYCLEBOOTSTRAP=1 cycleclean compiler CYCLELEVEL=2
 # ppc<ARCH> (target native)
 ifndef CROSSINSTALL
         $(MAKE) 'FPC=$(BASEDIR)/$(PPCROSSNAME)' 'OPT=$(RTLOPT) $(CROSSOPT)' rtlclean rtl CYCLELEVEL=3
 # building a native compiler for JVM and embedded targets is not possible
 ifndef NoNativeBinaries
-        $(MAKE) 'FPC=$(BASEDIR)/$(PPCROSSNAME)' 'OPT=$(LOCALOPT) $(CROSSOPT)' cycleclean compiler CYCLELEVEL=3
+        $(MAKE) 'FPC=$(BASEDIR)/$(PPCROSSNAME)' 'OPT=$(strip $(LOCALOPT) $(CROSSOPT))' cycleclean compiler CYCLELEVEL=3
 endif
 endif
 

+ 6 - 1
compiler/aasmtai.pas

@@ -147,7 +147,10 @@ interface
           aitconst_64bit_unaligned,
           { i8086 far pointer; emits: 'DW symbol, SEG symbol' }
           aitconst_farptr,
-          aitconst_got
+          { offset of symbol's GOT slot in GOT }
+          aitconst_got,
+          { offset of symbol itself from GOT }
+          aitconst_gotoff_symbol
         );
 
     const
@@ -1873,6 +1876,8 @@ implementation
             result:=LengthSleb128(value);
           aitconst_half16bit:
             result:=2;
+          aitconst_gotoff_symbol:
+            result:=4;
           else
             internalerror(200603253);
         end;

+ 77 - 32
compiler/aggas.pas

@@ -227,7 +227,7 @@ implementation
         );
 
       { Generic unaligned pseudo-instructions, seems ELF specific }
-      use_ua_elf_systems = [system_mipsel_linux,system_mipseb_linux];
+      use_ua_elf_systems = [system_mipsel_linux,system_mipseb_linux,system_mipsel_android];
       ait_ua_elf_const2str : array[aitconst_16bit_unaligned..aitconst_64bit_unaligned]
         of string[20]=(
           #9'.2byte'#9,#9'.4byte'#9,#9'.8byte'#9
@@ -652,28 +652,18 @@ implementation
               if not(target_info.system in (systems_darwin+systems_aix)) then
                 begin
 {$ifdef m68k}
-                  if assigned(lasthp) and
-                      (
-                        (lasthp.typ=ait_instruction) and
-                        (taicpu(lasthp).opcode<>A_JMP)
-                      ) or
-                      (
-                        (lasthp.typ=ait_label)
-                      ) then
+                  if not use_op and (lastsectype=sec_code) then
                     begin
-                      if ispowerof2(alignment,i) then
-                        begin
-                          { the Coldfire manual suggests the TBF instruction for
-                            alignments, but somehow QEMU does not interpret that
-                            correctly... }
-                          {if current_settings.cputype in cpu_coldfire then
-                            instr:='0x51fc'
-                          else}
-                            instr:='0x4e71';
-                          AsmWrite(#9'.balignw '+tostr(alignment)+','+instr);
-                        end
-                      else
-                        internalerror(2012102101);
+                      if not ispowerof2(alignment,i) then
+                        internalerror(2014022201);
+                      { the Coldfire manual suggests the TBF instruction for
+                        alignments, but somehow QEMU does not interpret that
+                        correctly... }
+                      {if current_settings.cputype in cpu_coldfire then
+                        instr:='0x51fc'
+                      else}
+                        instr:='0x4e71';
+                      AsmWrite(#9'.balignw '+tostr(alignment)+','+instr);
                     end
                   else
                     begin
@@ -961,6 +951,32 @@ implementation
                      AsmWrite(#9'.word'#9+tai_const(hp).sym.name+'(GOT)');
                      Asmln;
                    end;
+
+                 aitconst_gotoff_symbol:
+                   begin
+                     if (tai_const(hp).sym=nil) then
+                       InternalError(2014022601);
+                     case target_info.cpu of
+
+                       cpu_mipseb,cpu_mipsel:
+                         begin
+                           AsmWrite(#9'.gpword'#9);
+                           AsmWrite(tai_const(hp).sym.name);
+                         end;
+
+                       cpu_i386:
+                         begin
+                           AsmWrite(ait_const2str[aitconst_32bit]);
+                           AsmWrite(tai_const(hp).sym.name);
+                         end;
+                     else
+                       InternalError(2014022602);
+                     end;
+                     if (tai_const(hp).value<>0) then
+                       AsmWrite(tostr_with_plus(tai_const(hp).value));
+                     Asmln;
+                   end;
+
                  aitconst_uleb128bit,
                  aitconst_sleb128bit,
 {$ifdef cpu64bitaddr}
@@ -1336,11 +1352,17 @@ implementation
 {$endif arm}
            ait_set:
              begin
-               AsmWriteLn(#9'.set '+tai_set(hp).sym^+', '+tai_set(hp).value^);
+               if replaceforbidden then
+                 AsmWriteLn(#9'.set '+ReplaceForbiddenAsmSymbolChars(tai_set(hp).sym^)+', '+ReplaceForbiddenAsmSymbolChars(tai_set(hp).value^))
+               else
+                 AsmWriteLn(#9'.set '+tai_set(hp).sym^+', '+tai_set(hp).value^);
              end;
            ait_weak:
              begin
-               AsmWriteLn(#9'.weak '+tai_weak(hp).sym^);
+               if replaceforbidden then
+                 AsmWriteLn(#9'.weak '+ReplaceForbiddenAsmSymbolChars(tai_weak(hp).sym^))
+               else
+                 AsmWriteLn(#9'.weak '+tai_weak(hp).sym^);
              end;
            ait_symbol_end :
              begin
@@ -1518,6 +1540,7 @@ implementation
 
       begin
         pos:=0;
+        instring:=false;
         for i:=1 to hp.len do
           begin
             if pos=0 then
@@ -1667,9 +1690,12 @@ implementation
 
       for hal:=low(TasmlistType) to high(TasmlistType) do
         begin
-          AsmWriteLn(target_asm.comment+'Begin asmlist '+AsmlistTypeStr[hal]);
-          writetree(current_asmdata.asmlists[hal]);
-          AsmWriteLn(target_asm.comment+'End asmlist '+AsmlistTypeStr[hal]);
+          if not (current_asmdata.asmlists[hal].empty) then
+            begin
+              AsmWriteLn(target_asm.comment+'Begin asmlist '+AsmlistTypeStr[hal]);
+              writetree(current_asmdata.asmlists[hal]);
+              AsmWriteLn(target_asm.comment+'End asmlist '+AsmlistTypeStr[hal]);
+            end;
         end;
 
       { add weak symbol markers }
@@ -1681,8 +1707,9 @@ implementation
          (target_info.system in systems_darwin) then
         AsmWriteLn(#9'.subsections_via_symbols');
 
-      { "no executable stack" marker for Linux }
-      if (target_info.system in (systems_linux + systems_android)) and
+      { "no executable stack" marker }
+      { TODO: used by OpenBSD/NetBSD as well? }
+      if (target_info.system in (systems_linux + systems_android + systems_freebsd)) and
          not(cs_executable_stack in current_settings.moduleswitches) then
         begin
           AsmWriteLn('.section .note.GNU-stack,"",%progbits');
@@ -1791,7 +1818,10 @@ implementation
               end;
             sec_objc_image_info:
               begin
-                result:='.section __OBJC, __image_info, regular, no_dead_strip';
+                if (target_info.system in systems_objc_nfabi) then
+                  result:='.section __DATA,__objc_imageinfo,regular,no_dead_strip'
+                else
+                  result:='.section __OBJC, __image_info, regular, no_dead_strip';
                 exit;
               end;
             sec_objc_cstring_object:
@@ -1820,12 +1850,27 @@ implementation
                     exit;
                   end;
               end;
-            sec_objc_meth_var_names,
+            sec_objc_meth_var_types:
+              begin
+                if (target_info.system in systems_objc_nfabi) then
+                  begin
+                    result:='.section __TEXT,__objc_methtype,cstring_literals';
+                    exit
+                  end;
+              end;
+            sec_objc_meth_var_names:
+              begin
+                if (target_info.system in systems_objc_nfabi) then
+                  begin
+                    result:='.section __TEXT,__objc_methname,cstring_literals';
+                    exit
+                  end;
+              end;
             sec_objc_class_names:
               begin
                 if (target_info.system in systems_objc_nfabi) then
                   begin
-                    result:='.cstring';
+                    result:='.section __TEXT,__objc_classname,cstring_literals';
                     exit
                   end;
               end;

+ 3 - 2
compiler/agjasmin.pas

@@ -687,7 +687,8 @@ implementation
        if cs_asm_extern in current_settings.globalswitches then
          Replace(result,'$JASMINJAR',maybequoted(ScriptFixFileName(jasminjar)))
        else
-         Replace(result,'$JASMINJAR',ScriptFixFileName(jasminjar))
+         Replace(result,'$JASMINJAR',ScriptFixFileName(jasminjar));
+       Replace(result,'$EXTRAOPT',asmextraopt);
      end;
 
 
@@ -1223,7 +1224,7 @@ implementation
          id     : as_jvm_jasmin;
          idtxt  : 'Jasmin';
          asmbin : 'java';
-         asmcmd : '-jar $JASMINJAR $ASM -d $OBJDIR';
+         asmcmd : '-jar $JASMINJAR $ASM $EXTRAOPT -d $OBJDIR';
          supported_targets : [system_jvm_java32,system_jvm_android32];
          flags : [];
          labelprefix : 'L';

+ 121 - 113
compiler/aoptobj.pas

@@ -1306,130 +1306,138 @@ Unit AoptObj;
     procedure TAOptObj.PeepHoleOptPass1;
       var
         p,hp1,hp2 : tai;
+        stoploop:boolean;
       begin
-        p := BlockStart;
-        ClearUsedRegs;
-        while (p <> BlockEnd) Do
-          begin
-            { I'am not sure why this is done, UsedRegs should reflect the register usage before the instruction
-              If an instruction needs the information of this, it can easily create a TempUsedRegs (FK)
-            UpdateUsedRegs(tai(p.next));
-            }
-{$ifdef DEBUG_OPTALLOC}
-            if p.Typ=ait_instruction then
-              InsertLLItem(tai(p.Previous),p,tai_comment.create(strpnew(GetAllocationString(UsedRegs))));
-{$endif DEBUG_OPTALLOC}
-            if PeepHoleOptPass1Cpu(p) then
-              begin
-                UpdateUsedRegs(p);
-                continue;
-              end;
-            case p.Typ Of
-              ait_instruction:
+        repeat
+          stoploop:=true;
+          p := BlockStart;
+          ClearUsedRegs;
+          while (p <> BlockEnd) Do
+            begin
+              { I'am not sure why this is done, UsedRegs should reflect the register usage before the instruction
+                If an instruction needs the information of this, it can easily create a TempUsedRegs (FK)
+              UpdateUsedRegs(tai(p.next));
+              }
+  {$ifdef DEBUG_OPTALLOC}
+              if p.Typ=ait_instruction then
+                InsertLLItem(tai(p.Previous),p,tai_comment.create(strpnew(GetAllocationString(UsedRegs))));
+  {$endif DEBUG_OPTALLOC}
+              if PeepHoleOptPass1Cpu(p) then
                 begin
-                  { Handle Jmp Optimizations }
-                  if taicpu(p).is_jmp then
-                    begin
-                      { the following if-block removes all code between a jmp and the next label,
-                        because it can never be executed
-                      }
-                      if IsJumpToLabel(taicpu(p)) then
-                        begin
-                          hp2:=p;
-                          while GetNextInstruction(hp2, hp1) and
-                                (hp1.typ <> ait_label) do
-                            if not(hp1.typ in ([ait_label,ait_align]+skipinstr)) then
-                              begin
-                                if (hp1.typ = ait_instruction) and
-                                   taicpu(hp1).is_jmp and
-                                   (JumpTargetOp(taicpu(hp1))^.typ = top_ref) and
-                                   (JumpTargetOp(taicpu(hp1))^.ref^.symbol is TAsmLabel) then
-                                   TAsmLabel(JumpTargetOp(taicpu(hp1))^.ref^.symbol).decrefs;
-                                { don't kill start/end of assembler block,
-                                  no-line-info-start/end etc }
-                                if hp1.typ<>ait_marker then
-                                  begin
-{$if defined(SPARC) or defined(MIPS) }
-                                    if (hp1.typ=ait_instruction) and (taicpu(hp1).is_jmp) then
-                                      RemoveDelaySlot(hp1);
-{$endif SPARC or MIPS }
-                                    asml.remove(hp1);
-                                    hp1.free;
-                                  end
-                                else
-                                  hp2:=hp1;
-                              end
-                            else break;
-                          end;
-                      { remove jumps to a label coming right after them }
-                      if GetNextInstruction(p, hp1) then
-                        begin
-                          SkipEntryExitMarker(hp1,hp1);
-                          if FindLabel(tasmlabel(JumpTargetOp(taicpu(p))^.ref^.symbol), hp1) and
-        { TODO: FIXME removing the first instruction fails}
-                              (p<>blockstart) then
-                            begin
-                              tasmlabel(JumpTargetOp(taicpu(p))^.ref^.symbol).decrefs;
-{$if defined(SPARC) or defined(MIPS)}
-                              RemoveDelaySlot(p);
-{$endif SPARC or MIPS}
-                              hp2:=tai(hp1.next);
-                              asml.remove(p);
-                              p.free;
-                              p:=hp2;
-                              continue;
-                            end
-                          else if assigned(hp1) then
-                            begin
-                              if hp1.typ = ait_label then
-                                SkipLabels(hp1,hp1);
-                              if (tai(hp1).typ=ait_instruction) and
-                                  IsJumpToLabel(taicpu(hp1)) and
-                                  GetNextInstruction(hp1, hp2) and
-                                  FindLabel(tasmlabel(JumpTargetOp(taicpu(p))^.ref^.symbol), hp2) then
+                  stoploop:=false;
+                  UpdateUsedRegs(p);
+                  continue;
+                end;
+              case p.Typ Of
+                ait_instruction:
+                  begin
+                    { Handle Jmp Optimizations }
+                    if taicpu(p).is_jmp then
+                      begin
+                        { the following if-block removes all code between a jmp and the next label,
+                          because it can never be executed
+                        }
+                        if IsJumpToLabel(taicpu(p)) then
+                          begin
+                            hp2:=p;
+                            while GetNextInstruction(hp2, hp1) and
+                                  (hp1.typ <> ait_label) do
+                              if not(hp1.typ in ([ait_label,ait_align]+skipinstr)) then
                                 begin
-                                  if (taicpu(p).opcode=aopt_condjmp)
-{$ifdef arm}
-                                    and (taicpu(p).condition<>C_None)
-{$endif arm}
-                                  then
+                                  if (hp1.typ = ait_instruction) and
+                                     taicpu(hp1).is_jmp and
+                                     (JumpTargetOp(taicpu(hp1))^.typ = top_ref) and
+                                     (JumpTargetOp(taicpu(hp1))^.ref^.symbol is TAsmLabel) then
+                                     TAsmLabel(JumpTargetOp(taicpu(hp1))^.ref^.symbol).decrefs;
+                                  { don't kill start/end of assembler block,
+                                    no-line-info-start/end etc }
+                                  if hp1.typ<>ait_marker then
                                     begin
-                                      taicpu(p).condition:=inverse_cond(taicpu(p).condition);
-                                      tai_label(hp2).labsym.decrefs;
-                                      JumpTargetOp(taicpu(p))^.ref^.symbol:=JumpTargetOp(taicpu(hp1))^.ref^.symbol;
-                                      { when freeing hp1, the reference count
-                                        isn't decreased, so don't increase
-
-                                       taicpu(p).oper[0]^.ref^.symbol.increfs;
-                                      }
-{$if defined(SPARC) or defined(MIPS)}
-                                      RemoveDelaySlot(hp1);
-{$endif SPARC or MIPS}
+  {$if defined(SPARC) or defined(MIPS) }
+                                      if (hp1.typ=ait_instruction) and (taicpu(hp1).is_jmp) then
+                                        RemoveDelaySlot(hp1);
+  {$endif SPARC or MIPS }
                                       asml.remove(hp1);
                                       hp1.free;
-                                      GetFinalDestination(taicpu(p),0);
+                                      stoploop:=false;
                                     end
                                   else
-                                    begin
-                                      GetFinalDestination(taicpu(p),0);
-                                      p:=tai(p.next);
-                                      continue;
-                                    end;
+                                    hp2:=hp1;
                                 end
-                              else
-                                GetFinalDestination(taicpu(p),0);
+                              else break;
                             end;
-                        end;
-                    end
-                  else
-                  { All other optimizes }
-                    begin
-                    end; { if is_jmp }
-                end;
+                        { remove jumps to a label coming right after them }
+                        if GetNextInstruction(p, hp1) then
+                          begin
+                            SkipEntryExitMarker(hp1,hp1);
+                            if FindLabel(tasmlabel(JumpTargetOp(taicpu(p))^.ref^.symbol), hp1) and
+          { TODO: FIXME removing the first instruction fails}
+                                (p<>blockstart) then
+                              begin
+                                tasmlabel(JumpTargetOp(taicpu(p))^.ref^.symbol).decrefs;
+  {$if defined(SPARC) or defined(MIPS)}
+                                RemoveDelaySlot(p);
+  {$endif SPARC or MIPS}
+                                hp2:=tai(hp1.next);
+                                asml.remove(p);
+                                p.free;
+                                p:=hp2;
+                                stoploop:=false;
+                                continue;
+                              end
+                            else if assigned(hp1) then
+                              begin
+                                if hp1.typ = ait_label then
+                                  SkipLabels(hp1,hp1);
+                                if (tai(hp1).typ=ait_instruction) and
+                                    IsJumpToLabel(taicpu(hp1)) and
+                                    GetNextInstruction(hp1, hp2) and
+                                    FindLabel(tasmlabel(JumpTargetOp(taicpu(p))^.ref^.symbol), hp2) then
+                                  begin
+                                    if (taicpu(p).opcode=aopt_condjmp)
+  {$ifdef arm}
+                                      and (taicpu(p).condition<>C_None)
+  {$endif arm}
+                                    then
+                                      begin
+                                        taicpu(p).condition:=inverse_cond(taicpu(p).condition);
+                                        tai_label(hp2).labsym.decrefs;
+                                        JumpTargetOp(taicpu(p))^.ref^.symbol:=JumpTargetOp(taicpu(hp1))^.ref^.symbol;
+                                        { when freeing hp1, the reference count
+                                          isn't decreased, so don't increase
+
+                                         taicpu(p).oper[0]^.ref^.symbol.increfs;
+                                        }
+  {$if defined(SPARC) or defined(MIPS)}
+                                        RemoveDelaySlot(hp1);
+  {$endif SPARC or MIPS}
+                                        asml.remove(hp1);
+                                        hp1.free;
+                                        stoploop:=false;
+                                        GetFinalDestination(taicpu(p),0);
+                                      end
+                                    else
+                                      begin
+                                        GetFinalDestination(taicpu(p),0);
+                                        p:=tai(p.next);
+                                        continue;
+                                      end;
+                                  end
+                                else
+                                  GetFinalDestination(taicpu(p),0);
+                              end;
+                          end;
+                      end
+                    else
+                    { All other optimizes }
+                      begin
+                      end; { if is_jmp }
+                  end;
+              end;
+              UpdateUsedRegs(p);
+              p:=tai(p.next);
             end;
-            UpdateUsedRegs(p);
-            p:=tai(p.next);
-          end;
+        until stoploop or not(cs_opt_level3 in current_settings.optimizerswitches);
       end;
 
 

+ 4 - 7
compiler/arm/aasmcpu.pas

@@ -770,8 +770,7 @@ implementation
             else
               result:=operand_read;
           A_STREX:
-            if opnr in [0,1,2] then
-              result:=operand_write;
+            result:=operand_write;
           else
             internalerror(200403151);
         end;
@@ -1032,6 +1031,7 @@ implementation
                 end;
             end;
             { special case for case jump tables }
+            penalty:=0;
             if SimpleGetNextInstruction(curtai,hp) and
               (tai(hp).typ=ait_instruction) then
               begin
@@ -1081,12 +1081,8 @@ implementation
                   A_ITTTT:
                     if GenerateThumb2Code then
                       penalty:=4*multiplier;
-                  else
-                    penalty:=0;
                 end;
-              end
-            else
-              penalty:=0;
+              end;
 
             { FLD/FST VFP instructions have a limit of +/- 1024, not 4096 }
             if SimpleGetNextInstruction(curtai,hp) and
@@ -2102,6 +2098,7 @@ implementation
 
       begin
         bytes:=$0;
+        i_field:=0;
         { evaluate and set condition code }
 
         { condition code allowed? }

+ 2 - 2
compiler/arm/agarmgas.pas

@@ -356,7 +356,7 @@ unit agarmgas;
 
             idtxt  : 'AS';
             asmbin : 'as';
-            asmcmd : '-o $OBJ $ASM';
+            asmcmd : '-o $OBJ $EXTRAOPT $ASM';
             supported_targets : [system_arm_linux,system_arm_wince,system_arm_gba,system_arm_palmos,system_arm_nds,
                                  system_arm_embedded,system_arm_symbian,system_arm_android];
             flags : [af_needar,af_smartlink_sections];
@@ -370,7 +370,7 @@ unit agarmgas;
             id     : as_darwin;
             idtxt  : 'AS-Darwin';
             asmbin : 'as';
-            asmcmd : '-o $OBJ $ASM -arch $ARCH';
+            asmcmd : '-o $OBJ $EXTRAOPT $ASM -arch $ARCH';
             supported_targets : [system_arm_darwin];
             flags : [af_needar,af_smartlink_sections,af_supports_dwarf,af_stabs_use_function_absolute_addresses];
             labelprefix : 'L';

+ 461 - 260
compiler/arm/aoptcpu.pas

@@ -38,14 +38,14 @@ Type
     function PeepHoleOptPass1Cpu(var p: tai): boolean; override;
     procedure PeepHoleOptPass2;override;
     Function RegInInstruction(Reg: TRegister; p1: tai): Boolean;override;
-    procedure RemoveSuperfluousMove(const p: tai; movp: tai; const optimizer: string);
+    function RemoveSuperfluousMove(const p: tai; movp: tai; const optimizer: string): boolean;
     function RegUsedAfterInstruction(reg: Tregister; p: tai;
                                      var AllUsedRegs: TAllUsedRegs): Boolean;
     { returns true if reg reaches it's end of life at p, this means it is either
       reloaded with a new value or it is deallocated afterwards }
     function RegEndOfLife(reg: TRegister;p: taicpu): boolean;
     { gets the next tai object after current that contains info relevant
-      to the optimizer in p1 which used the given register or does a 
+      to the optimizer in p1 which used the given register or does a
       change in program flow.
       If there is none, it returns false and
       sets p1 to nil                                                     }
@@ -68,6 +68,7 @@ Type
     { uses the same constructor as TAopObj }
     function PeepHoleOptPass1Cpu(var p: tai): boolean; override;
     procedure PeepHoleOptPass2;override;
+    function PostPeepHoleOptsCpu(var p: tai): boolean; override;
   End;
 
   function MustBeLast(p : tai) : boolean;
@@ -151,8 +152,9 @@ Implementation
       result := (oper.typ = top_reg) and (oper.reg = reg);
     end;
 
-  procedure RemoveRedundantMove(const cmpp: tai; movp: tai; asml: TAsmList);
+  function RemoveRedundantMove(const cmpp: tai; movp: tai; asml: TAsmList):Boolean;
     begin
+      Result:=false;
       if (taicpu(movp).condition = C_EQ) and
          (taicpu(cmpp).oper[0]^.reg = taicpu(movp).oper[0]^.reg) and
          (taicpu(cmpp).oper[1]^.val = taicpu(movp).oper[1]^.val) then
@@ -160,6 +162,7 @@ Implementation
         asml.insertafter(tai_comment.Create(strpnew('Peephole CmpMovMov - Removed redundant moveq')), movp);
         asml.remove(movp);
         movp.free;
+        Result:=true;
       end;
     end;
 
@@ -202,6 +205,9 @@ Implementation
       {Loads to all register in the registerset}
       A_LDM:
         regLoadedWithNewValue := (getsupreg(reg) in p.oper[1]^.regset^);
+      A_POP:
+        regLoadedWithNewValue := (getsupreg(reg) in p.oper[0]^.regset^) or
+                                 (reg=NR_STACK_POINTER_REG);
     end;
 
     if regLoadedWithNewValue then
@@ -333,12 +339,13 @@ Implementation
     end;
 {$endif DEBUG_AOPTCPU}
 
-  procedure TCpuAsmOptimizer.RemoveSuperfluousMove(const p: tai; movp: tai; const optimizer: string);
+  function TCpuAsmOptimizer.RemoveSuperfluousMove(const p: tai; movp: tai; const optimizer: string):boolean;
     var
       alloc,
       dealloc : tai_regalloc;
       hp1 : tai;
     begin
+      Result:=false;
       if MatchInstruction(movp, A_MOV, [taicpu(p).condition], [PF_None]) and
          (taicpu(movp).ops=2) and {We can't optimize if there is a shiftop}
          MatchOperand(taicpu(movp).oper[1]^, taicpu(p).oper[0]^.reg) and
@@ -370,6 +377,7 @@ Implementation
           if assigned(dealloc) then
             begin
               DebugMsg('Peephole '+optimizer+' removed superfluous mov', movp);
+              result:=true;
 
               { taicpu(p).oper[0]^.reg is not used anymore, try to find its allocation
                 and remove it if possible }
@@ -529,6 +537,7 @@ Implementation
       i, i2: longint;
       TmpUsedRegs: TAllUsedRegs;
       tempop: tasmop;
+      oldreg: tregister;
 
     function IsPowerOf2(const value: DWord): boolean; inline;
       begin
@@ -564,7 +573,10 @@ Implementation
                 N := result[31];
                 EQ = Z=1; NE = Z=0;
                 MI = N=1; PL = N=0; }
-              MatchInstruction(hp2, A_B, [C_EQ,C_NE,C_MI,C_PL], []) and
+              (MatchInstruction(hp2, A_B, [C_EQ,C_NE,C_MI,C_PL], []) or
+               { mov is also possible, but only if there is no shifter operand, it could be an rxx,
+                 we are too lazy to check if it is rxx or something else }
+               (MatchInstruction(hp2, A_MOV, [C_EQ,C_NE,C_MI,C_PL], []) and (taicpu(hp2).ops=2))) and
               assigned(FindRegDealloc(NR_DEFAULTFLAGS,tai(hp2.Next))) then
              begin
                DebugMsg('Peephole OpCmp2OpS done', p);
@@ -582,6 +594,7 @@ Implementation
 
                asml.remove(hp1);
                hp1.free;
+               Result:=true;
              end
            else
               case taicpu(p).opcode of
@@ -605,7 +618,7 @@ Implementation
                           begin
                             DebugMsg('Peephole StrLdr2StrMov 1 done', hp1);
                             asml.remove(hp1);
-                            hp1.free;                            
+                            hp1.free;
                           end
                         else
                           begin
@@ -643,8 +656,9 @@ Implementation
                         taicpu(p).oppostfix:=PF_D;
                         asml.remove(hp1);
                         hp1.free;
+                        result:=true;
                       end;
-                    LookForPostindexedPattern(taicpu(p));
+                    Result:=LookForPostindexedPattern(taicpu(p)) or Result;
                   end;
                 A_LDR:
                   begin
@@ -707,6 +721,7 @@ Implementation
                             taicpu(p).oppostfix:=PF_D;
                             asml.remove(hp1);
                             hp1.free;
+                            result:=true;
                           end;
                       end;
 
@@ -734,8 +749,9 @@ Implementation
                          taicpu(p).oper[0]^.reg := taicpu(hp1).oper[0]^.reg;
                          asml.remove(hp1);
                          hp1.free;
+                         result:=true;
                        end;
-                    LookForPostindexedPattern(taicpu(p));
+                    Result:=LookForPostindexedPattern(taicpu(p)) or Result;
                     { Remove superfluous mov after ldr
                       changes
                       ldr reg1, ref
@@ -750,8 +766,10 @@ Implementation
                         * ldr+mov have the same conditions
                         * mov does not set flags
                     }
-                    if (taicpu(p).oppostfix<>PF_D) and GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
-                      RemoveSuperfluousMove(p, hp1, 'LdrMov2Ldr');
+                    if (taicpu(p).oppostfix<>PF_D) and
+                       GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+                       RemoveSuperfluousMove(p, hp1, 'LdrMov2Ldr') then
+                      Result:=true;
                   end;
                 A_MOV:
                   begin
@@ -799,6 +817,7 @@ Implementation
                                 p.free;
                                 hp1.free;
                                 p:=hp2;
+                                Result:=true;
                               end;
                             ReleaseUsedRegs(TmpUsedRegs);
                           end
@@ -940,8 +959,11 @@ Implementation
                        (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
                        (taicpu(p).oper[2]^.shifterop^.shiftmode = SM_LSR) and
                        GetNextInstructionUsingReg(p,hp1, taicpu(p).oper[0]^.reg) and
-                       (assigned(FindRegDealloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) or
-                         regLoadedWithNewValue(taicpu(p).oper[0]^.reg, hp1)) then
+                       (hp1.typ=ait_instruction) and
+                       (taicpu(hp1).ops>=1) and
+                       (taicpu(hp1).oper[0]^.typ=top_reg) and
+                       (not RegModifiedBetween(taicpu(hp1).oper[0]^.reg, p, hp1)) and
+                       RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) then
                        begin
                          if (taicpu(p).oper[2]^.shifterop^.shiftimm >= 24 ) and
                            MatchInstruction(hp1, A_AND, [taicpu(p).condition], [taicpu(p).oppostfix]) and
@@ -974,7 +996,87 @@ Implementation
                              result:=true;
                            end;
                        end;
+                    { Change
+                      mov rx, ry, lsr/ror #xxx
+                      uxtb/uxth rz,rx/and rz,rx,0xFF
+                      dealloc rx
 
+                      to
+
+                      uxtb/uxth rz,ry,ror #xxx
+                    }
+                    if (taicpu(p).ops=3) and
+                       (taicpu(p).oper[2]^.typ = top_shifterop) and
+                       (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
+                       (taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_LSR,SM_ROR]) and
+                       (GenerateThumb2Code) and
+                       GetNextInstructionUsingReg(p,hp1, taicpu(p).oper[0]^.reg) and
+                       RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) then
+                       begin
+                         if MatchInstruction(hp1, A_UXTB, [C_None], [PF_None]) and
+                           (taicpu(hp1).ops = 2) and
+                           (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
+                           MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) then
+                           begin
+                             taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+                             taicpu(hp1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
+                             taicpu(hp1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
+                             taicpu(hp1).ops := 3;
+
+                             GetNextInstruction(p,hp1);
+
+                             asml.Remove(p);
+                             p.Free;
+
+                             p:=hp1;
+
+                             result:=true;
+                             exit;
+                           end
+                         else if MatchInstruction(hp1, A_UXTH, [C_None], [PF_None]) and
+                           (taicpu(hp1).ops=2) and
+                           (taicpu(p).oper[2]^.shifterop^.shiftimm in [16]) and
+                           MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) then
+                           begin
+                             taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+                             taicpu(hp1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
+                             taicpu(hp1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
+                             taicpu(hp1).ops := 3;
+
+                             GetNextInstruction(p,hp1);
+
+                             asml.Remove(p);
+                             p.Free;
+
+                             p:=hp1;
+
+                             result:=true;
+                             exit;
+                           end
+                         else if MatchInstruction(hp1, A_AND, [C_None], [PF_None]) and
+                           (taicpu(hp1).ops = 3) and
+                           (taicpu(hp1).oper[2]^.typ = top_const) and
+                           (taicpu(hp1).oper[2]^.val = $FF) and
+                           (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
+                           MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) then
+                           begin
+                             taicpu(hp1).ops := 3;
+                             taicpu(hp1).opcode := A_UXTB;
+                             taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+                             taicpu(hp1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
+                             taicpu(hp1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
+
+                             GetNextInstruction(p,hp1);
+
+                             asml.Remove(p);
+                             p.Free;
+
+                             p:=hp1;
+
+                             result:=true;
+                             exit;
+                           end;
+                       end;
                     {
                       optimize
                       mov rX, yyyy
@@ -1007,6 +1109,7 @@ Implementation
                               GetNextInstruction(hp2,hp1);
                               asml.remove(hp2);
                               hp2.free;
+                              result:=true;
                               if not assigned(hp1) then break;
                             end
                         {
@@ -1026,6 +1129,7 @@ Implementation
                               p.free;
                               p:=hp1;
                               GetNextInstruction(hp1,hp1);
+                              result:=true;
                               if not assigned(hp1) then
                                 break;
                             end;
@@ -1079,9 +1183,46 @@ Implementation
                                 asml.remove(p);
                                 p.free;
                                 p:=hp1;
+                                Result:=true;
                               end;
                             end;
                       end;
+                    { Fold the very common sequence
+                        mov  regA, regB
+                        ldr* regA, [regA]
+                      to
+                        ldr* regA, [regB]
+                      CAUTION! If this one is successful p might not be a mov instruction anymore!
+                    }
+                    if (taicpu(p).opcode = A_MOV) and
+                       (taicpu(p).ops = 2) and
+                       (taicpu(p).oper[1]^.typ = top_reg) and
+                       (taicpu(p).oppostfix = PF_NONE) and
+                       GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+                       MatchInstruction(hp1, [A_LDR, A_STR], [taicpu(p).condition], []) and
+                       { We can change the base register only when the instruction uses AM_OFFSET }
+                       ((taicpu(hp1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) or
+                         ((taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) and
+                          (taicpu(hp1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg))
+                       ) and
+                       not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
+                       RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) then
+                      begin
+                        DebugMsg('Peephole MovLdr2Ldr done', hp1);
+                        if (taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) and
+                           (taicpu(hp1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
+                          taicpu(hp1).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg;
+
+                        if taicpu(hp1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then
+                          taicpu(hp1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
+
+                        GetNextInstruction(p, hp1);
+                        asml.remove(p);
+                        p.free;
+                        p:=hp1;
+                        result:=true;
+                      end;
+
                     { This folds shifterops into following instructions
                       mov r0, r1, lsl #8
                       add r2, r3, r0
@@ -1103,12 +1244,11 @@ Implementation
                                         [taicpu(p).condition], [PF_None]) and
                        (not ((GenerateThumb2Code) and
                              (taicpu(hp1).opcode in [A_SBC]) and
-                             (((taicpu(hp1).ops=3) and 
+                             (((taicpu(hp1).ops=3) and
                                MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[1]^.reg)) or
-                              ((taicpu(hp1).ops=2) and 
+                              ((taicpu(hp1).ops=2) and
                                MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[0]^.reg))))) and
-                       (assigned(FindRegDealloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) or
-                         regLoadedWithNewValue(taicpu(p).oper[0]^.reg, hp1)) and
+                       RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
                        (taicpu(hp1).ops >= 2) and
                        {Currently we can't fold into another shifterop}
                        (taicpu(hp1).oper[taicpu(hp1).ops-1]^.typ = top_reg) and
@@ -1176,13 +1316,14 @@ Implementation
                                        taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
                                        taicpu(p).oper[2]^.shifterop^);
                               asml.insertbefore(hp2, hp1);
+                              GetNextInstruction(p, hp2);
                               asml.remove(p);
                               asml.remove(hp1);
                               p.free;
                               hp1.free;
                               p:=hp2;
-                              GetNextInstruction(p,hp1);
                               DebugMsg('Peephole FoldShiftProcess done', p);
+                              Result:=true;
                               break;
                             end;
                       end;
@@ -1220,8 +1361,10 @@ Implementation
                        (taicpu(p).oppostfix = PF_NONE) and
                        GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
                        {Only LDR, LDRB, STR, STRB can handle scaled register indexing}
-                       MatchInstruction(hp1, [A_LDR, A_STR], [taicpu(p).condition],
-                                             [PF_None, PF_B]) and
+                       (MatchInstruction(hp1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B]) or
+                        (GenerateThumb2Code and
+                         MatchInstruction(hp1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B, PF_SB, PF_H, PF_SH]))
+                       ) and
                        (
                          {If this is address by offset, one of the two registers can be used}
                          ((taicpu(hp1).oper[1]^.ref^.addressmode=AM_OFFSET) and
@@ -1235,14 +1378,15 @@ Implementation
                            (
                              (taicpu(hp1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) and
                              (taicpu(hp1).oper[1]^.ref^.base <> taicpu(p).oper[0]^.reg)
-                           )
+                           ) and
+                           (not GenerateThumb2Code)
                          )
                        ) and
-                       { Only fold if there isn't another shifterop already. }
+                       { Only fold if there isn't another shifterop already, and offset is zero. }
+                       (taicpu(hp1).oper[1]^.ref^.offset = 0) and
                        (taicpu(hp1).oper[1]^.ref^.shiftmode = SM_None) and
                        not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
-                       (assigned(FindRegDealloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) or
-                         regLoadedWithNewValue(taicpu(p).oper[0]^.reg, hp1)) then
+                       RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) then
                        begin
                          { If the register we want to do the shift for resides in base, we need to swap that}
                          if (taicpu(hp1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
@@ -1251,17 +1395,20 @@ Implementation
                          taicpu(hp1).oper[1]^.ref^.shiftmode := taicpu(p).oper[2]^.shifterop^.shiftmode;
                          taicpu(hp1).oper[1]^.ref^.shiftimm := taicpu(p).oper[2]^.shifterop^.shiftimm;
                          DebugMsg('Peephole FoldShiftLdrStr done', hp1);
+                         GetNextInstruction(p, hp1);
                          asml.remove(p);
                          p.free;
                          p:=hp1;
+                         Result:=true;
                        end;
                     {
                       Often we see shifts and then a superfluous mov to another register
                       In the future this might be handled in RedundantMovProcess when it uses RegisterTracking
                     }
-                    if (taicpu(p).opcode = A_MOV) and 
-                        GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
-                      RemoveSuperfluousMove(p, hp1, 'MovMov2Mov');
+                    if (taicpu(p).opcode = A_MOV) and
+                       GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+                       RemoveSuperfluousMove(p, hp1, 'MovMov2Mov') then
+                      Result:=true;
                   end;
                 A_ADD,
                 A_ADC,
@@ -1274,6 +1421,7 @@ Implementation
                 A_EOR,
                 A_ORR,
                 A_MLA,
+                A_MLS,
                 A_MUL:
                   begin
                         {
@@ -1316,6 +1464,7 @@ Implementation
                                 taicpu(hp1).loadConst(2,taicpu(p).oper[2]^.val and taicpu(hp1).oper[2]^.val);
                                 taicpu(hp1).oppostfix:=taicpu(p).oppostfix;
                                 taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
+                                GetNextInstruction(p, hp1);
                                 asml.remove(p);
                                 p.free;
                                 p:=hp1;
@@ -1342,6 +1491,7 @@ Implementation
                           begin
                             DebugMsg('Peephole AndStrb2Strb done', p);
                             taicpu(hp1).loadReg(0,taicpu(p).oper[1]^.reg);
+                            GetNextInstruction(p, hp1);
                             asml.remove(p);
                             p.free;
                             p:=hp1;
@@ -1359,6 +1509,7 @@ Implementation
                           MatchInstruction(p, A_AND, [C_None], [PF_None]) and
                           GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
                           MatchInstruction(hp1, [A_UXTB,A_UXTH], [C_None], [PF_None]) and
+                          (taicpu(hp1).ops = 2) and
                           RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
                           MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
                           { reg1 might not be modified inbetween }
@@ -1433,6 +1584,7 @@ Implementation
                               begin
                                 DebugMsg('Peephole AndLsl2Lsl done', p);
                                 taicpu(hp1).oper[1]^.reg:=taicpu(p).oper[0]^.reg;
+                                GetNextInstruction(p, hp1);
                                 asml.Remove(p);
                                 p.free;
                                 p:=hp1;
@@ -1499,6 +1651,7 @@ Implementation
                                 asml.remove(p);
                                 p.free;
                                 p:=hp1;
+                                result:=true;
                                 break;
                               end;
                           end;
@@ -1510,11 +1663,10 @@ Implementation
                       to
                       add reg2, ...
                     }
-                    if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
-                      begin
-                        if (taicpu(p).ops=3) then
-                          RemoveSuperfluousMove(p, hp1, 'DataMov2Data');
-                      end;
+                    if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+                       (taicpu(p).ops>=3) and
+                       RemoveSuperfluousMove(p, hp1, 'DataMov2Data') then
+                      Result:=true;
 
                     if MatchInstruction(p, [A_ADD,A_SUB], [C_None], [PF_None]) and
                       LookForPreindexedPattern(taicpu(p)) then
@@ -1524,9 +1676,97 @@ Implementation
                         asml.remove(p);
                         p.free;
                         p:=hp1;
+                        Result:=true;
                       end;
+                    {
+                     Turn
+                     mul reg0, z,w
+                     sub/add x, y, reg0
+                     dealloc reg0
+
+                     into
+
+                     mls/mla x,z,w,y
+                     }
+                    if MatchInstruction(p, [A_MUL], [C_None], [PF_None]) and
+                      (taicpu(p).ops=3) and
+                      (taicpu(p).oper[0]^.typ = top_reg) and
+                      (taicpu(p).oper[1]^.typ = top_reg) and
+                      (taicpu(p).oper[2]^.typ = top_reg) and
+                      GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
+                      MatchInstruction(hp1,[A_ADD,A_SUB],[C_None],[PF_None]) and
+
+                      (((taicpu(hp1).opcode=A_ADD) and (current_settings.cputype>=cpu_armv4)) or
+                       ((taicpu(hp1).opcode=A_SUB) and (current_settings.cputype in [cpu_armv6t2,cpu_armv7,cpu_armv7a,cpu_armv7r,cpu_armv7m,cpu_armv7em]))) and
+
+                      // CPUs before ARMv6 don't recommend having the same Rd and Rm for MLA.
+                      // TODO: A workaround would be to swap Rm and Rs
+                      (not ((taicpu(hp1).opcode=A_ADD) and (current_settings.cputype<=cpu_armv6) and MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^))) and
+
+                      (((taicpu(hp1).ops=3) and
+                        (taicpu(hp1).oper[2]^.typ=top_reg) and
+                        ((MatchOperand(taicpu(hp1).oper[2]^, taicpu(p).oper[0]^.reg) and
+                          (not RegModifiedBetween(taicpu(hp1).oper[1]^.reg, p, hp1))) or
+                         ((MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
+                           (taicpu(hp1).opcode=A_ADD) and
+                           (not RegModifiedBetween(taicpu(hp1).oper[2]^.reg, p, hp1)))))) or
+                       ((taicpu(hp1).ops=2) and
+                        (taicpu(hp1).oper[1]^.typ=top_reg) and
+                        MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg))) and
+                      (RegEndOfLife(taicpu(p).oper[0]^.reg,taicpu(hp1))) then
+                      begin
+                        if taicpu(hp1).opcode=A_ADD then
+                          begin
+                            taicpu(hp1).opcode:=A_MLA;
+
+                            if taicpu(hp1).ops=3 then
+                              begin
+                                if MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^) then
+                                  oldreg:=taicpu(hp1).oper[2]^.reg
+                                else
+                                  oldreg:=taicpu(hp1).oper[1]^.reg;
+                              end
+                            else
+                              oldreg:=taicpu(hp1).oper[0]^.reg;
+
+                            taicpu(hp1).loadreg(1,taicpu(p).oper[1]^.reg);
+                            taicpu(hp1).loadreg(2,taicpu(p).oper[2]^.reg);
+                            taicpu(hp1).loadreg(3,oldreg);
+
+                            DebugMsg('MulAdd2MLA done', p);
+
+                            taicpu(hp1).ops:=4;
+
+                            asml.remove(p);
+                            p.free;
+                            p:=hp1;
+                          end
+                        else
+                          begin
+                            taicpu(hp1).opcode:=A_MLS;
+
+                            taicpu(hp1).loadreg(3,taicpu(hp1).oper[1]^.reg);
+
+                            if taicpu(hp1).ops=2 then
+                              taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg)
+                            else
+                              taicpu(hp1).loadreg(1,taicpu(p).oper[2]^.reg);
+
+                            taicpu(hp1).loadreg(2,taicpu(p).oper[1]^.reg);
+
+                            DebugMsg('MulSub2MLS done', p);
+
+                            taicpu(hp1).ops:=4;
+
+                            asml.remove(p);
+                            p.free;
+                            p:=hp1;
+                          end;
+
+                        result:=true;
+                      end
                   end;
-{$ifdef dummy}                  
+{$ifdef dummy}
                 A_MVN:
                   begin
                     {
@@ -1563,12 +1803,13 @@ Implementation
                           end
                         else
                           taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
+                        GetNextInstruction(p, hp1);
                         asml.remove(p);
                         p.free;
                         p:=hp1;
                       end;
                   end;
-{$endif dummy}                                    
+{$endif dummy}
                 A_UXTB:
                   begin
                     {
@@ -1607,18 +1848,16 @@ Implementation
                     else if MatchInstruction(p, A_UXTB, [C_None], [PF_None]) and
                       GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
                       MatchInstruction(hp1, A_UXTH, [C_None], [PF_None]) and
+                      (taicpu(hp1).ops = 2) and
                       MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
                       RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
                       { reg1 might not be modified inbetween }
                       not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
                       begin
                         DebugMsg('Peephole UxtbUxth2Uxtb done', p);
-                        taicpu(hp1).opcode:=A_UXTB;
-                        taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
-                        GetNextInstruction(p,hp2);
-                        asml.remove(p);
-                        p.free;
-                        p:=hp2;
+                        taicpu(p).loadReg(0,taicpu(hp1).oper[0]^.reg);
+                        asml.remove(hp1);
+                        hp1.free;
                         result:=true;
                       end
                     {
@@ -1632,18 +1871,16 @@ Implementation
                     else if MatchInstruction(p, A_UXTB, [C_None], [PF_None]) and
                       GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
                       MatchInstruction(hp1, A_UXTB, [C_None], [PF_None]) and
+                      (taicpu(hp1).ops = 2) and
                       MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
                       RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
                       { reg1 might not be modified inbetween }
                       not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
                       begin
                         DebugMsg('Peephole UxtbUxtb2Uxtb done', p);
-                        taicpu(hp1).opcode:=A_UXTB;
-                        taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
-                        GetNextInstruction(p,hp2);
-                        asml.remove(p);
-                        p.free;
-                        p:=hp2;
+                        taicpu(p).loadReg(0,taicpu(hp1).oper[0]^.reg);
+                        asml.remove(hp1);
+                        hp1.free;
                         result:=true;
                       end
                     {
@@ -1656,6 +1893,7 @@ Implementation
                     }
                     else if MatchInstruction(p, A_UXTB, [C_None], [PF_None]) and
                       GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
+                      (taicpu(p).ops=2) and
                       MatchInstruction(hp1, A_AND, [C_None], [PF_None]) and
                       (taicpu(hp1).ops=3) and
                       (taicpu(hp1).oper[2]^.typ=top_const) and
@@ -1675,11 +1913,9 @@ Implementation
                         p:=hp2;
                         result:=true;
                       end
-                    else if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
-                      begin
-                        //if (taicpu(p).ops=3) then
-                          RemoveSuperfluousMove(p, hp1, 'UxtbMov2Data');
-                      end;
+                    else if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+                         RemoveSuperfluousMove(p, hp1, 'UxtbMov2Data') then
+                      Result:=true;
                   end;
                 A_UXTH:
                   begin
@@ -1702,6 +1938,7 @@ Implementation
                       begin
                         DebugMsg('Peephole UXTHStrh2Strh done', p);
                         taicpu(hp1).loadReg(0,taicpu(p).oper[1]^.reg);
+                        GetNextInstruction(p, hp1);
                         asml.remove(p);
                         p.free;
                         p:=hp1;
@@ -1718,6 +1955,7 @@ Implementation
                     else if MatchInstruction(p, A_UXTH, [C_None], [PF_None]) and
                       GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
                       MatchInstruction(hp1, A_UXTH, [C_None], [PF_None]) and
+                      (taicpu(hp1).ops=2) and
                       MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
                       RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp1)) and
                       { reg1 might not be modified inbetween }
@@ -1726,6 +1964,7 @@ Implementation
                         DebugMsg('Peephole UxthUxth2Uxth done', p);
                         taicpu(hp1).opcode:=A_UXTH;
                         taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
+                        GetNextInstruction(p, hp1);
                         asml.remove(p);
                         p.free;
                         p:=hp1;
@@ -1754,16 +1993,15 @@ Implementation
                         taicpu(hp1).opcode:=A_UXTH;
                         taicpu(hp1).ops:=2;
                         taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
+                        GetNextInstruction(p, hp1);
                         asml.remove(p);
                         p.free;
                         p:=hp1;
                         result:=true;
                       end
-                    else if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) then
-                      begin
-                        //if (taicpu(p).ops=3) then
-                          RemoveSuperfluousMove(p, hp1, 'UxthMov2Data');
-                      end;
+                    else if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+                         RemoveSuperfluousMove(p, hp1, 'UxthMov2Data') then
+                      Result:=true;
                   end;
                 A_CMP:
                   begin
@@ -1784,8 +2022,8 @@ Implementation
                        MatchInstruction(hp2, A_MOV, [C_EQ, C_NE], [PF_NONE]) and
                        (taicpu(hp1).oper[1]^.typ = top_const) then
                       begin
-                        RemoveRedundantMove(p, hp1, asml);
-                        RemoveRedundantMove(p, hp2, asml);
+                        Result:=RemoveRedundantMove(p, hp1, asml) or Result;
+                        Result:=RemoveRedundantMove(p, hp2, asml) or Result;
                       end;
                   end;
                 A_STM:
@@ -2039,6 +2277,9 @@ Implementation
     begin
       If (p1.typ = ait_instruction) and (taicpu(p1).opcode=A_BL) then
         Result:=true
+      else If MatchInstruction(p1, [A_LDR, A_STR], [], [PF_D]) and
+              (getsupreg(taicpu(p1).oper[0]^.reg)+1=getsupreg(reg)) then
+        Result:=true
       else
         Result:=inherited RegInInstruction(Reg, p1);
     end;
@@ -2147,6 +2388,7 @@ Implementation
             GetNextInstruction(p,hp1) and
             (hp1.typ=ait_instruction) and
             (taicpu(hp1).opcode in [A_LDR,A_LDRB,A_LDRH,A_LDRSB,A_LDRSH]) and
+            (taicpu(hp1).oppostfix in [PF_NONE, PF_B, PF_H, PF_SB, PF_SH]) and
             { for now we don't reschedule if the previous instruction changes potentially a memory location }
             ( (not(taicpu(p).opcode in opcode_could_mem_write) and
                not(RegModifiedByInstruction(NR_PC,p))
@@ -2319,6 +2561,7 @@ Implementation
     var
       hp : taicpu;
       hp1,hp2 : tai;
+      oldreg : TRegister;
     begin
       result:=false;
       if inherited PeepHoleOptPass1Cpu(p) then
@@ -2336,7 +2579,7 @@ Implementation
           p:=hp;
           result:=true;
         end
-      else if (p.typ=ait_instruction) and
+      {else if (p.typ=ait_instruction) and
         MatchInstruction(p, A_STR, [C_None], [PF_None]) and
         (taicpu(p).oper[1]^.ref^.addressmode=AM_PREINDEXED) and
         (taicpu(p).oper[1]^.ref^.index=NR_STACK_POINTER_REG) and
@@ -2350,7 +2593,7 @@ Implementation
           p.Free;
           p:=hp;
           result:=true;
-        end
+        end}
       else if (p.typ=ait_instruction) and
         MatchInstruction(p, A_LDM, [C_None], [PF_FD,PF_IA]) and
         (taicpu(p).oper[0]^.ref^.addressmode=AM_PREINDEXED) and
@@ -2365,7 +2608,7 @@ Implementation
           p:=hp;
           result:=true;
         end
-      else if (p.typ=ait_instruction) and
+      {else if (p.typ=ait_instruction) and
         MatchInstruction(p, A_LDR, [C_None], [PF_None]) and
         (taicpu(p).oper[1]^.ref^.addressmode=AM_POSTINDEXED) and
         (taicpu(p).oper[1]^.ref^.index=NR_STACK_POINTER_REG) and
@@ -2379,132 +2622,7 @@ Implementation
           p.Free;
           p:=hp;
           result:=true;
-        end
-      else if (p.typ=ait_instruction) and
-        MatchInstruction(p, A_MOV, [C_None], [PF_None]) and
-        (taicpu(p).oper[1]^.typ=top_const) and
-        (taicpu(p).oper[1]^.val >= 0) and
-        (taicpu(p).oper[1]^.val < 256) and
-        (not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
-        begin
-          DebugMsg('Peephole Mov2Movs done', p);
-          asml.InsertBefore(tai_regalloc.alloc(NR_DEFAULTFLAGS,p), p);
-          asml.InsertAfter(tai_regalloc.dealloc(NR_DEFAULTFLAGS,p), p);
-          IncludeRegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs);
-          taicpu(p).oppostfix:=PF_S;
-          result:=true;
-        end
-      else if (p.typ=ait_instruction) and
-        MatchInstruction(p, A_MVN, [C_None], [PF_None]) and
-        (taicpu(p).oper[1]^.typ=top_reg) and
-        (not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
-        begin
-          DebugMsg('Peephole Mvn2Mvns done', p);
-          asml.InsertBefore(tai_regalloc.alloc(NR_DEFAULTFLAGS,p), p);
-          asml.InsertAfter(tai_regalloc.dealloc(NR_DEFAULTFLAGS,p), p);
-          IncludeRegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs);
-          taicpu(p).oppostfix:=PF_S;
-          result:=true;
-        end
-      else if (p.typ=ait_instruction) and
-        MatchInstruction(p, [A_ADD,A_SUB], [C_None], [PF_None]) and
-        (taicpu(p).ops = 3) and
-        MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) and
-        (not MatchOperand(taicpu(p).oper[0]^, NR_STACK_POINTER_REG)) and
-        (taicpu(p).oper[2]^.typ=top_const) and
-        (taicpu(p).oper[2]^.val >= 0) and
-        (taicpu(p).oper[2]^.val < 256) and
-        (not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
-        begin
-          DebugMsg('Peephole AddSub2*s done', p);
-          asml.InsertBefore(tai_regalloc.alloc(NR_DEFAULTFLAGS,p), p);
-          asml.InsertAfter(tai_regalloc.dealloc(NR_DEFAULTFLAGS,p), p);
-          IncludeRegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs);
-          taicpu(p).loadconst(1,taicpu(p).oper[2]^.val);
-          taicpu(p).oppostfix:=PF_S;
-          taicpu(p).ops := 2;
-          result:=true;
-        end
-      else if (p.typ=ait_instruction) and
-        MatchInstruction(p, [A_ADD], [C_None], [PF_None]) and
-        (taicpu(p).ops = 3) and
-        MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) and
-        (taicpu(p).oper[2]^.typ=top_reg) then
-        begin
-          DebugMsg('Peephole AddRRR2AddRR done', p);
-          taicpu(p).ops := 2;
-          taicpu(p).loadreg(1,taicpu(p).oper[2]^.reg);
-          result:=true;
-        end
-      else if (p.typ=ait_instruction) and
-        MatchInstruction(p, [A_AND,A_ORR,A_EOR,A_BIC,A_LSL,A_LSR,A_ASR,A_ROR], [C_None], [PF_None]) and
-        (taicpu(p).ops = 3) and
-        MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) and
-        (taicpu(p).oper[2]^.typ=top_reg) and
-        (not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
-        begin
-          asml.InsertBefore(tai_regalloc.alloc(NR_DEFAULTFLAGS,p), p);
-          asml.InsertAfter(tai_regalloc.dealloc(NR_DEFAULTFLAGS,p), p);
-          IncludeRegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs);
-          taicpu(p).ops := 2;
-          taicpu(p).loadreg(1,taicpu(p).oper[2]^.reg);
-          taicpu(p).oppostfix:=PF_S;
-          result:=true;
-        end
-      else if (p.typ=ait_instruction) and
-        MatchInstruction(p, [A_AND,A_ORR,A_EOR,A_BIC,A_LSL,A_LSR,A_ASR,A_ROR], [C_None], [PF_S]) and
-        (taicpu(p).ops = 3) and
-        MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) and
-        (taicpu(p).oper[2]^.typ in [top_reg,top_const]) then
-        begin
-          taicpu(p).ops := 2;
-          if taicpu(p).oper[2]^.typ=top_reg then
-            taicpu(p).loadreg(1,taicpu(p).oper[2]^.reg)
-          else
-            taicpu(p).loadconst(1,taicpu(p).oper[2]^.val);
-          result:=true;
-        end
-      else if (p.typ=ait_instruction) and
-        MatchInstruction(p, [A_AND,A_ORR,A_EOR], [C_None], [PF_None,PF_S]) and
-        (taicpu(p).ops = 3) and
-        MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[2]^) and
-        (not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
-        begin
-          asml.InsertBefore(tai_regalloc.alloc(NR_DEFAULTFLAGS,p), p);
-          asml.InsertAfter(tai_regalloc.dealloc(NR_DEFAULTFLAGS,p), p);
-          IncludeRegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs);
-          taicpu(p).oppostfix:=PF_S;
-          taicpu(p).ops := 2;
-          result:=true;
-        end
-      else if (p.typ=ait_instruction) and
-        MatchInstruction(p, [A_MOV], [C_None], [PF_None]) and
-        (taicpu(p).ops=3) and
-        (taicpu(p).oper[2]^.typ=top_shifterop) and
-        (taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_LSL,SM_LSR,SM_ASR,SM_ROR]) and
-        //MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) and
-        (not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
-        begin
-          DebugMsg('Peephole Mov2Shift done', p);
-          asml.InsertBefore(tai_regalloc.alloc(NR_DEFAULTFLAGS,p), p);
-          asml.InsertAfter(tai_regalloc.dealloc(NR_DEFAULTFLAGS,p), p);
-          IncludeRegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs);
-          taicpu(p).oppostfix:=PF_S;
-          //taicpu(p).ops := 2;
-
-          case taicpu(p).oper[2]^.shifterop^.shiftmode of
-            SM_LSL: taicpu(p).opcode:=A_LSL;
-            SM_LSR: taicpu(p).opcode:=A_LSR;
-            SM_ASR: taicpu(p).opcode:=A_ASR;
-            SM_ROR: taicpu(p).opcode:=A_ROR;
-          end;
-
-          if taicpu(p).oper[2]^.shifterop^.rs<>NR_NO then
-            taicpu(p).loadreg(2, taicpu(p).oper[2]^.shifterop^.rs)
-          else
-            taicpu(p).loadconst(2, taicpu(p).oper[2]^.shifterop^.shiftimm);
-          result:=true;
-        end
+        end}
       else if (p.typ=ait_instruction) and
         MatchInstruction(p, [A_AND], [], [PF_None]) and
         (taicpu(p).ops = 2) and
@@ -2539,80 +2657,6 @@ Implementation
 
           result := true;
         end
-      {
-       Turn
-       mul reg0, z,w
-       sub/add x, y, reg0
-       dealloc reg0
-
-       into
-
-       mls/mla x,y,z,w
-       }
-      {
-      According to Jeppe Johansen this currently uses operands in the wrong order.
-
-      else if (p.typ=ait_instruction) and
-        MatchInstruction(p, [A_MUL], [C_None], [PF_None]) and
-        (taicpu(p).ops=3) and
-        (taicpu(p).oper[0]^.typ = top_reg) and
-        (taicpu(p).oper[1]^.typ = top_reg) and
-        (taicpu(p).oper[2]^.typ = top_reg) and
-        GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
-        MatchInstruction(hp1,[A_ADD,A_SUB],[C_None],[PF_None]) and
-        (((taicpu(hp1).ops=3) and
-          (taicpu(hp1).oper[2]^.typ=top_reg) and
-          (MatchOperand(taicpu(hp1).oper[2]^, taicpu(p).oper[0]^.reg) or
-           (MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
-            (taicpu(hp1).opcode=A_ADD)))) or
-         ((taicpu(hp1).ops=2) and
-          (taicpu(hp1).oper[1]^.typ=top_reg) and
-          MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg))) and
-        assigned(FindRegDealloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) and
-        not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
-        not(RegModifiedBetween(taicpu(p).oper[2]^.reg,p,hp1)) then
-        begin
-          if taicpu(hp1).opcode=A_ADD then
-            begin
-              taicpu(hp1).opcode:=A_MLA;
-
-              if taicpu(hp1).ops=3 then
-                if MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^) then
-                  taicpu(hp1).loadreg(1,taicpu(hp1).oper[2]^.reg);
-
-              taicpu(hp1).loadreg(2,taicpu(p).oper[1]^.reg);
-              taicpu(hp1).loadreg(3,taicpu(p).oper[2]^.reg);
-
-              DebugMsg('MulAdd2MLA done', p);
-
-              taicpu(hp1).ops:=4;
-
-              asml.remove(p);
-              p.free;
-              p:=hp1;
-            end
-          else
-            begin
-              taicpu(hp1).opcode:=A_MLS;
-
-              if taicpu(hp1).ops=2 then
-                taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg);
-
-              taicpu(hp1).loadreg(2,taicpu(p).oper[1]^.reg);
-              taicpu(hp1).loadreg(3,taicpu(p).oper[2]^.reg);
-
-              DebugMsg('MulSub2MLS done', p);
-
-              taicpu(hp1).ops:=4;
-
-              asml.remove(p);
-              p.free;
-              p:=hp1;
-            end;
-
-          result:=true;
-        end
-      }
       {else if (p.typ=ait_instruction) and
         MatchInstruction(p, [A_CMP], [C_None], [PF_None]) and
         (taicpu(p).oper[1]^.typ=top_const) and
@@ -2739,6 +2783,163 @@ Implementation
         end;
     end;
 
+
+  function TCpuThumb2AsmOptimizer.PostPeepHoleOptsCpu(var p: tai): boolean;
+    begin
+      result:=false;
+
+      if p.typ = ait_instruction then
+        begin
+          if MatchInstruction(p, A_MOV, [C_None], [PF_None]) and
+            (taicpu(p).oper[1]^.typ=top_const) and
+            (taicpu(p).oper[1]^.val >= 0) and
+            (taicpu(p).oper[1]^.val < 256) and
+            (not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
+            begin
+              DebugMsg('Peephole Mov2Movs done', p);
+              asml.InsertBefore(tai_regalloc.alloc(NR_DEFAULTFLAGS,p), p);
+              asml.InsertAfter(tai_regalloc.dealloc(NR_DEFAULTFLAGS,p), p);
+              IncludeRegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs);
+              taicpu(p).oppostfix:=PF_S;
+              result:=true;
+            end
+          else if MatchInstruction(p, A_MVN, [C_None], [PF_None]) and
+            (taicpu(p).oper[1]^.typ=top_reg) and
+            (not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
+            begin
+              DebugMsg('Peephole Mvn2Mvns done', p);
+              asml.InsertBefore(tai_regalloc.alloc(NR_DEFAULTFLAGS,p), p);
+              asml.InsertAfter(tai_regalloc.dealloc(NR_DEFAULTFLAGS,p), p);
+              IncludeRegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs);
+              taicpu(p).oppostfix:=PF_S;
+              result:=true;
+            end
+          else if MatchInstruction(p, A_RSB, [C_None], [PF_None]) and
+            (taicpu(p).ops = 3) and
+            (taicpu(p).oper[2]^.typ=top_const) and
+            (taicpu(p).oper[2]^.val=0) and
+            (not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
+            begin
+              DebugMsg('Peephole Rsb2Rsbs done', p);
+              asml.InsertBefore(tai_regalloc.alloc(NR_DEFAULTFLAGS,p), p);
+              asml.InsertAfter(tai_regalloc.dealloc(NR_DEFAULTFLAGS,p), p);
+              IncludeRegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs);
+              taicpu(p).oppostfix:=PF_S;
+              result:=true;
+            end
+          else if MatchInstruction(p, [A_ADD,A_SUB], [C_None], [PF_None]) and
+            (taicpu(p).ops = 3) and
+            MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) and
+            (not MatchOperand(taicpu(p).oper[0]^, NR_STACK_POINTER_REG)) and
+            (taicpu(p).oper[2]^.typ=top_const) and
+            (taicpu(p).oper[2]^.val >= 0) and
+            (taicpu(p).oper[2]^.val < 256) and
+            (not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
+            begin
+              DebugMsg('Peephole AddSub2*s done', p);
+              asml.InsertBefore(tai_regalloc.alloc(NR_DEFAULTFLAGS,p), p);
+              asml.InsertAfter(tai_regalloc.dealloc(NR_DEFAULTFLAGS,p), p);
+              IncludeRegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs);
+              taicpu(p).loadconst(1,taicpu(p).oper[2]^.val);
+              taicpu(p).oppostfix:=PF_S;
+              taicpu(p).ops := 2;
+              result:=true;
+            end
+          else if MatchInstruction(p, [A_ADD,A_SUB], [C_None], [PF_None]) and
+            (taicpu(p).ops = 2) and
+            (taicpu(p).oper[1]^.typ=top_reg) and
+            (not MatchOperand(taicpu(p).oper[0]^, NR_STACK_POINTER_REG)) and
+            (not MatchOperand(taicpu(p).oper[1]^, NR_STACK_POINTER_REG)) and
+            (not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
+            begin
+              DebugMsg('Peephole AddSub2*s done', p);
+              asml.InsertBefore(tai_regalloc.alloc(NR_DEFAULTFLAGS,p), p);
+              asml.InsertAfter(tai_regalloc.dealloc(NR_DEFAULTFLAGS,p), p);
+              IncludeRegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs);
+              taicpu(p).oppostfix:=PF_S;
+              result:=true;
+            end
+          else if MatchInstruction(p, [A_ADD], [C_None], [PF_None]) and
+            (taicpu(p).ops = 3) and
+            MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) and
+            (taicpu(p).oper[2]^.typ=top_reg) then
+            begin
+              DebugMsg('Peephole AddRRR2AddRR done', p);
+              taicpu(p).ops := 2;
+              taicpu(p).loadreg(1,taicpu(p).oper[2]^.reg);
+              result:=true;
+            end
+          else if MatchInstruction(p, [A_AND,A_ORR,A_EOR,A_BIC,A_LSL,A_LSR,A_ASR,A_ROR], [C_None], [PF_None]) and
+            (taicpu(p).ops = 3) and
+            MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) and
+            (taicpu(p).oper[2]^.typ=top_reg) and
+            (not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
+            begin
+              DebugMsg('Peephole opXXY2opsXY done', p);
+              asml.InsertBefore(tai_regalloc.alloc(NR_DEFAULTFLAGS,p), p);
+              asml.InsertAfter(tai_regalloc.dealloc(NR_DEFAULTFLAGS,p), p);
+              IncludeRegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs);
+              taicpu(p).ops := 2;
+              taicpu(p).loadreg(1,taicpu(p).oper[2]^.reg);
+              taicpu(p).oppostfix:=PF_S;
+              result:=true;
+            end
+          else if MatchInstruction(p, [A_AND,A_ORR,A_EOR,A_BIC,A_LSL,A_LSR,A_ASR,A_ROR], [C_None], [PF_S]) and
+            (taicpu(p).ops = 3) and
+            MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) and
+            (taicpu(p).oper[2]^.typ in [top_reg,top_const]) then
+            begin
+              DebugMsg('Peephole opXXY2opXY done', p);
+              taicpu(p).ops := 2;
+              if taicpu(p).oper[2]^.typ=top_reg then
+                taicpu(p).loadreg(1,taicpu(p).oper[2]^.reg)
+              else
+                taicpu(p).loadconst(1,taicpu(p).oper[2]^.val);
+              result:=true;
+            end
+          else if MatchInstruction(p, [A_ADD,A_AND,A_ORR,A_EOR], [C_None], [PF_None,PF_S]) and
+            (taicpu(p).ops = 3) and
+            MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[2]^) and
+            (not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
+            begin
+              DebugMsg('Peephole opXYX2opsXY done', p);
+              asml.InsertBefore(tai_regalloc.alloc(NR_DEFAULTFLAGS,p), p);
+              asml.InsertAfter(tai_regalloc.dealloc(NR_DEFAULTFLAGS,p), p);
+              IncludeRegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs);
+              taicpu(p).oppostfix:=PF_S;
+              taicpu(p).ops := 2;
+              result:=true;
+            end
+          else if MatchInstruction(p, [A_MOV], [C_None], [PF_None]) and
+            (taicpu(p).ops=3) and
+            (taicpu(p).oper[2]^.typ=top_shifterop) and
+            (taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_LSL,SM_LSR,SM_ASR,SM_ROR]) and
+            //MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) and
+            (not RegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs)) then
+            begin
+              DebugMsg('Peephole Mov2Shift done', p);
+              asml.InsertBefore(tai_regalloc.alloc(NR_DEFAULTFLAGS,p), p);
+              asml.InsertAfter(tai_regalloc.dealloc(NR_DEFAULTFLAGS,p), p);
+              IncludeRegInUsedRegs(NR_DEFAULTFLAGS,UsedRegs);
+              taicpu(p).oppostfix:=PF_S;
+
+              case taicpu(p).oper[2]^.shifterop^.shiftmode of
+                SM_LSL: taicpu(p).opcode:=A_LSL;
+                SM_LSR: taicpu(p).opcode:=A_LSR;
+                SM_ASR: taicpu(p).opcode:=A_ASR;
+                SM_ROR: taicpu(p).opcode:=A_ROR;
+              end;
+
+              if taicpu(p).oper[2]^.shifterop^.rs<>NR_NO then
+                taicpu(p).loadreg(2, taicpu(p).oper[2]^.shifterop^.rs)
+              else
+                taicpu(p).loadconst(2, taicpu(p).oper[2]^.shifterop^.shiftimm);
+              result:=true;
+            end
+        end;
+    end;
+
+
 begin
   casmoptimizer:=TCpuAsmOptimizer;
   cpreregallocscheduler:=TCpuPreRegallocScheduler;

+ 69 - 44
compiler/arm/cgcpu.pas

@@ -47,7 +47,6 @@ unit cgcpu;
 
         procedure a_call_name(list : TAsmList;const s : string; weak: boolean);override;
         procedure a_call_reg(list : TAsmList;reg: tregister);override;
-        procedure a_call_ref(list : TAsmList;ref: treference);override;
 
         { move instructions }
         procedure a_load_reg_ref(list : TAsmList; fromsize, tosize: tcgsize; reg : tregister;const ref : treference);override;
@@ -681,16 +680,6 @@ unit cgcpu;
       end;
 
 
-    procedure tbasecgarm.a_call_ref(list : TAsmList;ref: treference);
-      begin
-        a_reg_alloc(list,NR_R12);
-        a_load_ref_reg(list,OS_ADDR,OS_ADDR,ref,NR_R12);
-        a_call_reg(list,NR_R12);
-        a_reg_dealloc(list,NR_R12);
-        include(current_procinfo.flags,pi_do_call);
-      end;
-
-
      procedure tcgarm.a_op_const_reg(list : TAsmList; Op: TOpCG; size: TCGSize; a: tcgint; reg: TRegister);
        begin
           a_op_const_reg_reg(list,op,size,a,reg,reg);
@@ -894,12 +883,26 @@ unit cgcpu;
 
     procedure tcgarm.a_op_const_reg_reg_checkoverflow(list: TAsmList; op: TOpCg; size: tcgsize; a: tcgint; src, dst: tregister;setflags : boolean;var ovloc : tlocation);
       var
-        shift : byte;
+        shift, lsb, width : byte;
         tmpreg : tregister;
         so : tshifterop;
         l1 : longint;
         imm1, imm2: DWord;
       begin
+        optimize_op_const(size, op, a);
+        case op of
+          OP_NONE:
+            begin
+              if src <> dst then
+                a_load_reg_reg(list, size, size, src, dst);
+              exit;
+            end;
+          OP_MOVE:
+            begin
+              a_load_const_reg(list, size, a, dst);
+              exit;
+            end;
+        end;
         ovloc.loc:=LOC_VOID;
         if {$ifopt R+}(a<>-2147483648) and{$endif} not setflags and is_shifter_const(-a,shift) then
           case op of
@@ -927,18 +930,13 @@ unit cgcpu;
               begin
                 if a>32 then
                   internalerror(200308294);
-                if a<>0 then
-                  begin
-                    shifterop_reset(so);
-                    so.shiftmode:=opshift2shiftmode(op);
-                    if op = OP_ROL then
-                      so.shiftimm:=32-a
-                    else
-                      so.shiftimm:=a;
-                    list.concat(taicpu.op_reg_reg_shifterop(A_MOV,dst,src,so));
-                  end
+                shifterop_reset(so);
+                so.shiftmode:=opshift2shiftmode(op);
+                if op = OP_ROL then
+                  so.shiftimm:=32-a
                 else
-                 list.concat(taicpu.op_reg_reg(A_MOV,dst,src));
+                  so.shiftimm:=a;
+                list.concat(taicpu.op_reg_reg_shifterop(A_MOV,dst,src,so));
               end;
             else
               {if (op in [OP_SUB, OP_ADD]) and
@@ -972,11 +970,7 @@ unit cgcpu;
         else
           begin
             { there could be added some more sophisticated optimizations }
-            if (op in [OP_MUL,OP_IMUL,OP_DIV,OP_IDIV]) and (a=1) then
-              a_load_reg_reg(list,size,size,src,dst)
-            else if (op in [OP_MUL,OP_IMUL]) and (a=0) then
-              a_load_const_reg(list,size,0,dst)
-            else if (op in [OP_IMUL,OP_IDIV]) and (a=-1) then
+            if (op in [OP_IMUL,OP_IDIV]) and (a=-1) then
               a_op_reg_reg(list,OP_NEG,size,src,dst)
             { we do this here instead in the peephole optimizer because
               it saves us a register }
@@ -1006,23 +1000,48 @@ unit cgcpu;
               begin
                 { nothing to do on success }
               end
-            { x := y and 0; just clears a register, this sometimes gets generated on 64bit ops.
-              Just using mov x, #0 might allow some easier optimizations down the line. }
-            else if (op = OP_AND) and (dword(a)=0) then
-              list.concat(taicpu.op_reg_const(A_MOV,dst,0))
-            { x := y AND $FFFFFFFF just copies the register, so use mov for better optimizations }
-            else if (op = OP_AND) and (not(dword(a))=0) then
-              list.concat(taicpu.op_reg_reg(A_MOV,dst,src))
             { BIC clears the specified bits, while AND keeps them, using BIC allows to use a
               broader range of shifterconstants.}
             else if (op = OP_AND) and is_shifter_const(not(dword(a)),shift) then
               list.concat(taicpu.op_reg_reg_const(A_BIC,dst,src,not(dword(a))))
+            { Doing two shifts instead of two bics might allow the peephole optimizer to fold the second shift
+              into the following instruction}
+            else if (op = OP_AND) and
+                    is_continuous_mask(a, lsb, width) and
+                    ((lsb = 0) or ((lsb + width) = 32)) then
+              begin
+                shifterop_reset(so);
+                if (width = 16) and
+                   (lsb = 0) and
+                   (current_settings.cputype >= cpu_armv6) then
+                  list.concat(taicpu.op_reg_reg(A_UXTH,dst,src))
+                else if (width = 8) and
+                   (lsb = 0) and
+                   (current_settings.cputype >= cpu_armv6) then
+                  list.concat(taicpu.op_reg_reg(A_UXTB,dst,src))
+                else if lsb = 0 then
+                  begin
+                    so.shiftmode:=SM_LSL;
+                    so.shiftimm:=32-width;
+                    list.concat(taicpu.op_reg_reg_shifterop(A_MOV,dst,src,so));
+                    so.shiftmode:=SM_LSR;
+                    list.concat(taicpu.op_reg_reg_shifterop(A_MOV,dst,dst,so));
+                  end
+                else
+                  begin
+                    so.shiftmode:=SM_LSR;
+                    so.shiftimm:=lsb;
+                    list.concat(taicpu.op_reg_reg_shifterop(A_MOV,dst,src,so));
+                    so.shiftmode:=SM_LSL;
+                    list.concat(taicpu.op_reg_reg_shifterop(A_MOV,dst,dst,so));
+                  end;
+              end
             else if (op = OP_AND) and split_into_shifter_const(not(dword(a)), imm1, imm2) then
               begin
                 list.concat(taicpu.op_reg_reg_const(A_BIC,dst,src,imm1));
                 list.concat(taicpu.op_reg_reg_const(A_BIC,dst,dst,imm2));
               end
-            else if (op in [OP_ADD, OP_SUB, OP_OR]) and
+            else if (op in [OP_ADD, OP_SUB, OP_OR, OP_XOR]) and
                     not(cgsetflags or setflags) and
                     split_into_shifter_const(a, imm1, imm2) then
               begin
@@ -1155,7 +1174,7 @@ unit cgcpu;
         if (ref.base=NR_NO) then
           begin
             if ref.shiftmode<>SM_None then
-              internalerror(200308294);
+              internalerror(2014020701);
             ref.base:=ref.index;
             ref.index:=NR_NO;
           end;
@@ -1752,6 +1771,7 @@ unit cgcpu;
         { call instruction does not put anything on the stack }
         stackmisalignment:=0;
         tarmprocinfo(current_procinfo).stackpaddingreg:=High(TSuperRegister);
+        lastfloatreg:=RS_NO;
         if not(nostackframe) then
           begin
             firstfloatreg:=RS_NO;
@@ -1978,7 +1998,9 @@ unit cgcpu;
           begin
             stackmisalignment:=0;
             firstfloatreg:=RS_NO;
+            lastfloatreg:=RS_NO;
             mmregs:=[];
+            saveregs:=[];
             case current_settings.fputype of
               fpu_fpa,
               fpu_fpa10,
@@ -2203,7 +2225,7 @@ unit cgcpu;
         if (tmpref.base=NR_NO) then
           begin
             if tmpref.shiftmode<>SM_None then
-              internalerror(200308294);
+              internalerror(2014020702);
             if tmpref.signindex<0 then
               internalerror(200312023);
             tmpref.base:=tmpref.index;
@@ -2271,6 +2293,7 @@ unit cgcpu;
         cg.a_label(current_procinfo.aktlocaldata,l);
         tmpref.symboldata:=current_procinfo.aktlocaldata.last;
         piclabel:=nil;
+        tmpreg:=NR_NO;
 
         indirection_done:=false;
         if assigned(ref.symbol) then
@@ -3773,7 +3796,7 @@ unit cgcpu;
            OS_S32:
              oppostfix:=PF_None;
            else
-             InternalError(200308297);
+             InternalError(200308298);
          end;
          if (ref.alignment in [1,2]) and (ref.alignment<tcgsize2size[fromsize]) then
            begin
@@ -4255,7 +4278,7 @@ unit cgcpu;
            OS_S32:
              oppostfix:=PF_None;
            else
-             InternalError(200308297);
+             InternalError(200308299);
          end;
          if (ref.alignment in [1,2]) and (ref.alignment<tcgsize2size[fromsize]) then
            begin
@@ -4403,7 +4426,7 @@ unit cgcpu;
             OP_SHL:
               begin
                 if a>32 then
-                  internalerror(200308294);
+                  internalerror(2014020703);
                 if a<>0 then
                   begin
                     shifterop_reset(so);
@@ -4417,7 +4440,7 @@ unit cgcpu;
             OP_ROL:
               begin
                 if a>32 then
-                  internalerror(200308294);
+                  internalerror(2014020704);
                 if a<>0 then
                   begin
                     shifterop_reset(so);
@@ -4431,7 +4454,7 @@ unit cgcpu;
             OP_ROR:
               begin
                 if a>32 then
-                  internalerror(200308294);
+                  internalerror(2014020705);
                 if a<>0 then
                   begin
                     shifterop_reset(so);
@@ -4712,6 +4735,7 @@ unit cgcpu;
         if not(nostackframe) then
           begin
             firstfloatreg:=RS_NO;
+            lastfloatreg:=RS_NO;
             { save floating point registers? }
             for r:=RS_F0 to RS_F7 do
               if r in rg[R_FPUREGISTER].used_in_proc-paramanager.get_volatile_registers_fpu(pocall_stdcall) then
@@ -4816,6 +4840,7 @@ unit cgcpu;
             stackmisalignment:=0;
             { restore floating point register }
             firstfloatreg:=RS_NO;
+            lastfloatreg:=RS_NO;
             { save floating point registers? }
             for r:=RS_F0 to RS_F7 do
               if r in rg[R_FPUREGISTER].used_in_proc-paramanager.get_volatile_registers_fpu(pocall_stdcall) then
@@ -4910,7 +4935,7 @@ unit cgcpu;
         if (ref.base=NR_NO) then
           begin
             if ref.shiftmode<>SM_None then
-              internalerror(200308294);
+              internalerror(2014020706);
             ref.base:=ref.index;
             ref.index:=NR_NO;
           end;

+ 16 - 28
compiler/arm/cpubase.pas

@@ -571,35 +571,23 @@ unit cpubase;
         i : longint;
         imm : byte;
       begin
-        result:=false;
+        {Loading 0-255 is simple}
         if (d and $FF) = d then
-          begin
-            result:=true;
-            exit;
-          end;
-        if ((d and $FF00FF00) = 0) and
-           ((d shr 16)=(d and $FFFF)) then
-          begin
-            result:=true;
-            exit;
-          end;
-        if ((d and $00FF00FF) = 0) and
-           ((d shr 16)=(d and $FFFF)) then
-          begin
-            result:=true;
-            exit;
-          end;
-        if ((d shr 16)=(d and $FFFF)) and
-           ((d shr 8)=(d and $FF)) then
-          begin
-            result:=true;
-            exit;
-          end;
-        if is_shifter_const(d,imm) then
-          begin
-            result:=true;
-            exit;
-          end;
+          result:=true
+        { If top and bottom are equal, check if either all 4 bytes are equal
+          or byte 0 and 2 or byte 1 and 3 are equal }
+        else if ((d shr 16)=(d and $FFFF)) and
+                (
+                  ((d and $FF00FF00) = 0) or
+                  ((d and $00FF00FF) = 0) or
+                  ((d shr 8)=(d and $FF))
+                ) then
+          result:=true
+        {Can an 8-bit value be shifted accordingly?}
+        else if is_shifter_const(d,imm) then
+          result:=true
+        else
+          result:=false;
       end;
     
     function is_continuous_mask(d : aint;var lsb, width: byte) : boolean;

+ 10 - 0
compiler/arm/cpuelf.pas

@@ -168,6 +168,16 @@ implementation
 
     TCB_SIZE = 8;
 
+    { ELF header e_flags }
+    EF_ARM_BE8          = $00800000;
+    EF_ARM_EABIMASK     = $FF000000;
+    EF_ARM_EABI_UNKNOWN = $00000000;
+    EF_ARM_EABI_VER1    = $01000000;
+    EF_ARM_EABI_VER2    = $02000000;
+    EF_ARM_EABI_VER3    = $03000000;
+    EF_ARM_EABI_VER4    = $04000000;
+    EF_ARM_EABI_VER5    = $05000000;
+
   { Using short identifiers to save typing. This ARM thing has more relocations
     than it has instructions... }
   const

+ 15 - 14
compiler/arm/cpuinfo.pas

@@ -664,28 +664,29 @@ Const
        CPUARM_HAS_LDREX,
        CPUARM_HAS_IDIV,
        CPUARM_HAS_THUMB_IDIV,
-       CPUARM_HAS_THUMB2
+       CPUARM_HAS_THUMB2,
+       CPUARM_HAS_UMULL
       );
 
  const
    cpu_capabilities : array[tcputype] of set of tcpuflags =
      ( { cpu_none     } [],
        { cpu_armv3    } [],
-       { cpu_armv4    } [],
-       { cpu_armv4t   } [CPUARM_HAS_BX],
-       { cpu_armv5    } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ],
-       { cpu_armv5t   } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ],
-       { cpu_armv5te  } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP],
-       { cpu_armv5tej } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP],
-       { cpu_armv6    } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_LDREX],
-       { cpu_armv6k   } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_LDREX],
-       { cpu_armv6t2  } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_THUMB2],
-       { cpu_armv6z   } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_LDREX],
+       { cpu_armv4    } [CPUARM_HAS_UMULL],
+       { cpu_armv4t   } [CPUARM_HAS_BX,CPUARM_HAS_UMULL],
+       { cpu_armv5    } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_UMULL],
+       { cpu_armv5t   } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_UMULL],
+       { cpu_armv5te  } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_UMULL],
+       { cpu_armv5tej } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_UMULL],
+       { cpu_armv6    } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_LDREX,CPUARM_HAS_UMULL],
+       { cpu_armv6k   } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_LDREX,CPUARM_HAS_UMULL],
+       { cpu_armv6t2  } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL],
+       { cpu_armv6z   } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_LDREX,CPUARM_HAS_UMULL],
        { cpu_armv6m   } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_REV],
        { the identifier armv7 is should not be used, it is considered being equal to armv7a }
-       { cpu_armv7    } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2],
-       { cpu_armv7a   } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2],
-       { cpu_armv7r   } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_THUMB_IDIV,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2],
+       { cpu_armv7    } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL],
+       { cpu_armv7a   } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL],
+       { cpu_armv7r   } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_THUMB_IDIV,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL],
        { cpu_armv7m   } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_THUMB_IDIV,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2],
        { cpu_armv7em  } [CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_THUMB_IDIV,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2]
      );

+ 19 - 10
compiler/arm/cpupara.pas

@@ -714,16 +714,25 @@ unit cpupara;
               begin
                 paraloc^.loc:=LOC_REGISTER;
                 paraloc^.register:=NR_FUNCTION_RETURN_REG;
-                if (result.intsize<>3) then
-                  begin
-                    paraloc^.size:=retcgsize;
-                    paraloc^.def:=result.def;
-                  end
-                else
-                  begin
-                    paraloc^.size:=OS_32;
-                    paraloc^.def:=u32inttype;
-                  end;
+                case result.IntSize of
+                  0:
+                    begin
+                      paraloc^.loc:=LOC_VOID;
+                      paraloc^.register:=NR_NO;
+                      paraloc^.size:=OS_NO;
+                      paraloc^.def:=voidpointertype;
+                    end;
+                  3:
+                    begin
+                      paraloc^.size:=OS_32;
+                      paraloc^.def:=u32inttype;
+                    end;
+                  else
+                    begin
+                      paraloc^.size:=retcgsize;
+                      paraloc^.def:=result.def;
+                    end;
+                end;
               end;
           end;
       end;

+ 9 - 4
compiler/arm/cpupi.pas

@@ -111,10 +111,14 @@ unit cpupi;
             localsize:=0;
             for i:=0 to procdef.parast.SymList.Count-1 do
               if tsym(procdef.parast.SymList[i]).typ=paravarsym then
-                if is_open_string(tabstractnormalvarsym(procdef.parast.SymList[i]).vardef) then
-                  inc(localsize,256)
-                else
-                  inc(localsize,tabstractnormalvarsym(procdef.parast.SymList[i]).getsize);
+                begin
+                  if tabstractnormalvarsym(procdef.parast.SymList[i]).varspez in [vs_var,vs_out,vs_constref] then
+                    inc(localsize,4)
+                  else if is_open_string(tabstractnormalvarsym(procdef.parast.SymList[i]).vardef) then
+                    inc(localsize,256)
+                  else
+                    inc(localsize,tabstractnormalvarsym(procdef.parast.SymList[i]).getsize);
+                end;
 
             inc(stackframesize,localsize);
 
@@ -154,6 +158,7 @@ unit cpupi;
                 begin
                   { save floating point registers? }
                   firstfloatreg:=RS_NO;
+                  lastfloatreg:=RS_NO;
                   regs:=cg.rg[R_FPUREGISTER].used_in_proc-paramanager.get_volatile_registers_fpu(pocall_stdcall);
                   for r:=RS_F0 to RS_F7 do
                     if r in regs then

+ 42 - 18
compiler/arm/narmadd.pas

@@ -34,8 +34,11 @@ interface
           function  GetResFlags(unsigned:Boolean):TResFlags;
        public
           function pass_1 : tnode;override;
+          function use_generic_mul32to64: boolean; override;
+          function use_generic_mul64bit: boolean; override;
        protected
           function first_addfloat: tnode; override;
+          procedure second_addordinal;override;
           procedure second_addfloat;override;
           procedure second_cmpfloat;override;
           procedure second_cmpordinal;override;
@@ -481,11 +484,7 @@ interface
           begin
             asmList := current_asmdata.CurrAsmList;
             pass_left_right;
-
-            if not(left.location.loc in [LOC_REGISTER,LOC_CREGISTER]) then
-              hlcg.location_force_reg(asmList,left.location,left.resultdef,left.resultdef,true);
-            if not(right.location.loc in [LOC_REGISTER,LOC_CREGISTER]) then
-              hlcg.location_force_reg(asmList,right.location,right.resultdef,right.resultdef,true);
+            force_reg_left_right(true, (left.location.loc<>LOC_CONSTANT) and (right.location.loc<>LOC_CONSTANT));
             set_result_location_reg;
 
             { shortcuts to register64s }
@@ -507,19 +506,7 @@ interface
       var
         unsigned : boolean;
       begin
-        { prepare for MUL64 inlining }
-        if (not(cs_check_overflow in current_settings.localswitches)) and
-           (nodetype in [muln]) and
-           (is_64bitint(left.resultdef)) and
-           (not (GenerateThumbCode)) then
-          begin
-            result := nil;
-            firstpass(left);
-            firstpass(right);
-            expectloc := LOC_REGISTER;
-          end
-        else
-          result:=inherited pass_1;
+        result:=inherited pass_1;
 
         if not(assigned(result)) then
           begin
@@ -644,6 +631,43 @@ interface
         location.resflags:=getresflags(unsigned);
       end;
 
+    const
+      multops: array[boolean] of TAsmOp = (A_SMULL, A_UMULL);
+
+    procedure tarmaddnode.second_addordinal;
+      var
+        unsigned: boolean;
+      begin
+        if (nodetype=muln) and
+           is_64bit(resultdef) and
+           not(GenerateThumbCode) and
+           (CPUARM_HAS_UMULL in cpu_capabilities[current_settings.cputype]) then
+          begin
+            pass_left_right;
+            force_reg_left_right(true, false);
+            set_result_location_reg;
+            unsigned:=not(is_signed(left.resultdef)) or
+                      not(is_signed(right.resultdef));
+            current_asmdata.CurrAsmList.Concat(
+              taicpu.op_reg_reg_reg_reg(multops[unsigned], location.register64.reglo, location.register64.reghi,
+                                        left.location.register,right.location.register));
+          end
+        else
+          inherited second_addordinal;
+      end;
+
+    function tarmaddnode.use_generic_mul32to64: boolean;
+      begin
+        result:=GenerateThumbCode or not(CPUARM_HAS_UMULL in cpu_capabilities[current_settings.cputype]);
+      end;
+
+    function tarmaddnode.use_generic_mul64bit: boolean;
+      begin
+        result:=GenerateThumbCode or
+          not(CPUARM_HAS_UMULL in cpu_capabilities[current_settings.cputype]) or
+          (cs_check_overflow in current_settings.localswitches);
+      end;
+
 begin
   caddnode:=tarmaddnode;
 end.

+ 91 - 77
compiler/arm/narmmat.pas

@@ -71,19 +71,20 @@ implementation
       var
         power  : longint;
       begin
-        if (right.nodetype=ordconstn) and
-          (nodetype=divn) and
-          (ispowerof2(tordconstnode(right).value,power) or
-           (tordconstnode(right).value=1) or
-           (tordconstnode(right).value=int64(-1))
-          ) and
-          not(is_64bitint(resultdef)) then
+        if not(cs_check_overflow in current_settings.localswitches) and
+           (right.nodetype=ordconstn) and
+           (nodetype=divn) and
+           (ispowerof2(tordconstnode(right).value,power) or
+            (tordconstnode(right).value=1) or
+            (tordconstnode(right).value=int64(-1))
+           ) and
+           not(is_64bitint(resultdef)) then
           result:=nil
-        else if ((GenerateThumbCode) and (CPUARM_HAS_THUMB_IDIV in cpu_capabilities[current_settings.cputype])) and
+        else if ((GenerateThumbCode or GenerateThumb2Code) and (CPUARM_HAS_THUMB_IDIV in cpu_capabilities[current_settings.cputype])) and
           (nodetype=divn) and
           not(is_64bitint(resultdef)) then
           result:=nil
-        else if ((GenerateThumbCode) and (CPUARM_HAS_THUMB_IDIV in cpu_capabilities[current_settings.cputype])) and
+        else if ((GenerateThumbCode or GenerateThumb2Code) and (CPUARM_HAS_THUMB_IDIV in cpu_capabilities[current_settings.cputype])) and
           (nodetype=modn) and
           not(is_64bitint(resultdef)) then
           begin
@@ -220,7 +221,7 @@ implementation
         secondpass(left);
         secondpass(right);
 
-        if ((GenerateThumbCode) and (CPUARM_HAS_THUMB_IDIV in cpu_capabilities[current_settings.cputype])) and
+        if ((GenerateThumbCode or GenerateThumb2Code) and (CPUARM_HAS_THUMB_IDIV in cpu_capabilities[current_settings.cputype])) and
            (nodetype=divn) and
            not(is_64bitint(resultdef)) then
           begin
@@ -445,70 +446,80 @@ implementation
 
     procedure tarmshlshrnode.second_64bit;
       var
-        hreg64hi,hreg64lo,shiftreg:Tregister;
         v : TConstExprInt;
-        l1,l2,l3:Tasmlabel;
         so: tshifterop;
+        lreg, resreg: TRegister64;
 
       procedure emit_instr(p: tai);
         begin
           current_asmdata.CurrAsmList.concat(p);
         end;
 
-      {Reg1 gets shifted and moved into reg2, and is set to zero afterwards}
-      procedure shift_more_than_32(reg1, reg2: TRegister; shiftval: Byte ; sm: TShiftMode);
+      {This code is build like it gets called with sm=SM_LSR all the time, for SM_LSL dst* and src* have to be reversed}
+      procedure shift_less_than_32(srchi, srclo, dsthi, dstlo: TRegister; shiftval: Byte; sm: TShiftMode);
         begin
-          shifterop_reset(so); so.shiftimm:=shiftval - 32; so.shiftmode:=sm;
-          emit_instr(taicpu.op_reg_reg_shifterop(A_MOV, reg2, reg1, so));
-          emit_instr(taicpu.op_reg_const(A_MOV, reg1, 0));
-        end;
+          shifterop_reset(so);
 
-      procedure shift_less_than_32(reg1, reg2: TRegister; shiftval: Byte; shiftright: boolean);
-        begin
-          shifterop_reset(so); so.shiftimm:=shiftval;
-          if shiftright then so.shiftmode:=SM_LSR else so.shiftmode:=SM_LSL;
-          emit_instr(taicpu.op_reg_reg_shifterop(A_MOV, reg1, reg1, so));
+          so.shiftimm:=shiftval;
+          so.shiftmode:=sm;
+          emit_instr(taicpu.op_reg_reg_shifterop(A_MOV, dstlo, srclo, so));
+          emit_instr(taicpu.op_reg_reg_shifterop(A_MOV, dsthi, srchi, so));
 
-          if shiftright then so.shiftmode:=SM_LSL else so.shiftmode:=SM_LSR;
+          if sm = SM_LSR then so.shiftmode:=SM_LSL else so.shiftmode:=SM_LSR;
           so.shiftimm:=32-shiftval;
-          emit_instr(taicpu.op_reg_reg_reg_shifterop(A_ORR, reg1, reg1, reg2, so));
+          emit_instr(taicpu.op_reg_reg_reg_shifterop(A_ORR, dstlo, dstlo, srchi, so));
 
-          if shiftright then so.shiftmode:=SM_LSR else so.shiftmode:=SM_LSL;
-          so.shiftimm:=shiftval;
-          emit_instr(taicpu.op_reg_reg_shifterop(A_MOV, reg2, reg2, so));
         end;
 
-      procedure shift_by_variable(reg1, reg2, shiftval: TRegister; shiftright: boolean);
+      {This code is build like it gets called with sm=SM_LSR all the time, for SM_LSL dst* and src* have to be reversed
+       This will generate
+         mov   shiftval1, shiftval
+         cmp   shiftval1, #64
+         movcs shiftval1, #64
+         rsb   shiftval2, shiftval1, #32
+         mov   dstlo, srclo, lsr shiftval1
+         mov   dsthi, srchi, lsr shiftval1
+         orr   dstlo, srchi, lsl shiftval2
+         subs  shiftval2, shiftval1, #32
+         movpl dstlo, srchi, lsr shiftval2
+      }
+      procedure shift_by_variable(srchi, srclo, dsthi, dstlo, shiftval: TRegister; sm: TShiftMode);
         var
-          shiftval2:TRegister;
+          shiftval1,shiftval2:TRegister;
         begin
           shifterop_reset(so);
+          shiftval1:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
           shiftval2:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
 
+          cg.a_load_reg_reg(current_asmdata.CurrAsmList, OS_INT, OS_INT, shiftval, shiftval1);
+
+          {The ARM barrel shifter only considers the lower 8 bits of a register for the shift}
           cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
+          emit_instr(taicpu.op_reg_const(A_CMP, shiftval1, 64));
+          emit_instr(setcondition(taicpu.op_reg_const(A_MOV, shiftval1, 64), C_CS));
+          cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
 
-          {Do we shift more than 32 bits?}
-          emit_instr(setoppostfix(taicpu.op_reg_reg_const(A_RSB, shiftval2, shiftval, 32), PF_S));
+          {Calculate how much the upper register needs to be shifted left}
+          emit_instr(taicpu.op_reg_reg_const(A_RSB, shiftval2, shiftval1, 32));
 
-          {This part cares for 32 bits and more}
-          emit_instr(setcondition(taicpu.op_reg_reg_const(A_SUB, shiftval2, shiftval, 32), C_MI));
-          if shiftright then so.shiftmode:=SM_LSR else so.shiftmode:=SM_LSL;
-          so.rs:=shiftval2;
-          emit_instr(setcondition(taicpu.op_reg_reg_shifterop(A_MOV, reg2, reg1, so), C_MI));
+          so.shiftmode:=sm;
+          so.rs:=shiftval1;
+
+          {Shift and zerofill the hi+lo register}
+          emit_instr(taicpu.op_reg_reg_shifterop(A_MOV, dstlo, srclo, so));
+          emit_instr(taicpu.op_reg_reg_shifterop(A_MOV, dsthi, srchi, so));
 
-          {Less than 32 bits}
-          so.rs:=shiftval;
-          emit_instr(setcondition(taicpu.op_reg_reg_shifterop(A_MOV, reg2, reg2, so), C_PL));
-          if shiftright then so.shiftmode:=SM_LSL else so.shiftmode:=SM_LSR;
+          {Fold in the lower 32-shiftval bits}
+          if sm = SM_LSR then so.shiftmode:=SM_LSL else so.shiftmode:=SM_LSR;
           so.rs:=shiftval2;
-          emit_instr(setcondition(taicpu.op_reg_reg_reg_shifterop(A_ORR, reg2, reg2, reg1, so), C_PL));
+          emit_instr(taicpu.op_reg_reg_reg_shifterop(A_ORR, dstlo, dstlo, srchi, so));
 
-          cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
+          cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
+          emit_instr(setoppostfix(taicpu.op_reg_reg_const(A_SUB, shiftval2, shiftval1, 32), PF_S));
 
-          {Final adjustments}
-          if shiftright then so.shiftmode:=SM_LSR else so.shiftmode:=SM_LSL;
-          so.rs:=shiftval;
-          emit_instr(taicpu.op_reg_reg_shifterop(A_MOV, reg1, reg1, so));
+          so.shiftmode:=sm;
+          emit_instr(setcondition(taicpu.op_reg_reg_shifterop(A_MOV, dstlo, srchi, so), C_PL));
+          cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
         end;
 
       begin
@@ -519,13 +530,17 @@ implementation
         end;
 
         location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
+        location.register64.reghi:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
+        location.register64.reglo:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
 
         { load left operator in a register }
-        hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,resultdef,false);
-        hreg64hi:=left.location.register64.reghi;
-        hreg64lo:=left.location.register64.reglo;
-        location.register64.reghi:=hreg64hi;
-        location.register64.reglo:=hreg64lo;
+        if not(left.location.loc in [LOC_CREGISTER,LOC_REGISTER]) or
+           (left.location.size<>OS_64) then
+          hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,resultdef,true);
+
+        lreg := left.location.register64;
+        resreg := location.register64;
+        shifterop_reset(so);
 
         { shifting by a constant directly coded: }
         if (right.nodetype=ordconstn) then
@@ -537,8 +552,8 @@ implementation
                 begin
                   {Shift left by one by 2 simple 32bit additions}
                   cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
-                  emit_instr(setoppostfix(taicpu.op_reg_reg_reg(A_ADD, hreg64lo, hreg64lo, hreg64lo), PF_S));
-                  emit_instr(taicpu.op_reg_reg_reg(A_ADC, hreg64hi, hreg64hi, hreg64hi));
+                  emit_instr(setoppostfix(taicpu.op_reg_reg_reg(A_ADD, resreg.reglo, lreg.reglo, lreg.reglo), PF_S));
+                  emit_instr(taicpu.op_reg_reg_reg(A_ADC, resreg.reghi, lreg.reghi, lreg.reghi));
                   cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                 end
               else
@@ -546,42 +561,41 @@ implementation
                   {Shift right by first shifting hi by one and then using RRX (rotate right extended), which rotates through the carry}
                   shifterop_reset(so); so.shiftmode:=SM_LSR; so.shiftimm:=1;
                   cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
-                  emit_instr(setoppostfix(taicpu.op_reg_reg_shifterop(A_MOV, hreg64hi, hreg64hi, so), PF_S));
+                  emit_instr(setoppostfix(taicpu.op_reg_reg_shifterop(A_MOV, resreg.reghi, lreg.reghi, so), PF_S));
                   so.shiftmode:=SM_RRX; so.shiftimm:=0; {RRX does NOT have a shift amount}
-                  emit_instr(taicpu.op_reg_reg_shifterop(A_MOV, hreg64lo, hreg64lo, so));
+                  emit_instr(taicpu.op_reg_reg_shifterop(A_MOV, resreg.reglo, lreg.reglo, so));
                   cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                 end
-            {A 32bit shift just replaces a register and clears the other}
-            else if v = 32 then
+            {Clear one register and use the cg to generate a normal 32-bit shift}
+            else if v >= 32 then
+              if nodetype=shln then
               begin
-                if nodetype=shln then
-                  emit_instr(taicpu.op_reg_const(A_MOV, hreg64hi, 0))
-                else
-                  emit_instr(taicpu.op_reg_const(A_MOV, hreg64lo, 0));
-                location.register64.reghi:=hreg64lo;
-                location.register64.reglo:=hreg64hi;
+                emit_instr(taicpu.op_reg_const(A_MOV, resreg.reglo, 0));
+                cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SHL,OS_32,v.uvalue-32,lreg.reglo,resreg.reghi);
               end
-            {Shift LESS than 32}
-            else if (v < 32) and (v > 1) then
-              if nodetype=shln then
-                shift_less_than_32(hreg64hi, hreg64lo, v.uvalue, false)
               else
-                shift_less_than_32(hreg64lo, hreg64hi, v.uvalue, true)
-            {More than 32}
-            else if v > 32 then
+              begin
+                emit_instr(taicpu.op_reg_const(A_MOV, resreg.reghi, 0));
+                cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SHR,OS_32,v.uvalue-32,lreg.reghi,resreg.reglo);
+              end
+            {Shift LESS than 32, thats the tricky one}
+            else if (v < 32) and (v > 1) then
               if nodetype=shln then
-                shift_more_than_32(hreg64lo, hreg64hi, v.uvalue, SM_LSL)
+                shift_less_than_32(lreg.reglo, lreg.reghi, resreg.reglo, resreg.reghi, v.uvalue, SM_LSL)
               else
-                shift_more_than_32(hreg64hi, hreg64lo, v.uvalue, SM_LSR);
+                shift_less_than_32(lreg.reghi, lreg.reglo, resreg.reghi, resreg.reglo, v.uvalue, SM_LSR);
           end
         else
           begin
-            { force right operators in a register }
-            hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,resultdef,false);
+            { force right operator into a register }
+            if not(right.location.loc in [LOC_CREGISTER,LOC_REGISTER]) or
+               (right.location.size<>OS_32) then
+              hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,u32inttype,true);
+
             if nodetype = shln then
-              shift_by_variable(hreg64lo,hreg64hi,right.location.register, false)
+              shift_by_variable(lreg.reglo, lreg.reghi, resreg.reglo, resreg.reghi, right.location.register, SM_LSL)
             else
-              shift_by_variable(hreg64hi,hreg64lo,right.location.register, true);
+              shift_by_variable(lreg.reghi, lreg.reglo, resreg.reghi, resreg.reglo, right.location.register, SM_LSR);
           end;
       end;
 

+ 48 - 3
compiler/arm/rgcpu.pas

@@ -28,7 +28,7 @@ unit rgcpu;
   interface
 
      uses
-       aasmbase,aasmtai,aasmdata,aasmcpu,
+       aasmbase,aasmtai,aasmsym,aasmdata,aasmcpu,
        cgbase,cgutils,
        cpubase,
        {$ifdef DEBUG_SPILLING}
@@ -43,6 +43,8 @@ unit rgcpu;
        public
          procedure do_spill_read(list: TAsmList; pos: tai; const spilltemp: treference; tempreg: tregister; orgsupreg: tsuperregister); override;
          procedure do_spill_written(list: TAsmList; pos: tai; const spilltemp: treference; tempreg: tregister; orgsupreg: tsuperregister); override;
+         function do_spill_replace(list : TAsmList;instr : tai_cpu_abstract_sym;
+           orgreg : tsuperregister;const spilltemp : treference) : boolean;override;
          procedure add_constraints(reg:tregister);override;
          function  get_spill_subreg(r:tregister) : tsubregister;override;
        end;
@@ -126,9 +128,10 @@ unit rgcpu;
                     end;
                 end;
               A_MLA,
+              A_MLS,
               A_MUL:
                 begin
-                  if current_settings.cputype<cpu_armv6 then
+                  if (current_settings.cputype<cpu_armv6) and (taicpu(p).opcode<>A_MLS) then
                     add_edge(getsupreg(taicpu(p).oper[0]^.reg),getsupreg(taicpu(p).oper[1]^.reg));
                    add_edge(getsupreg(taicpu(p).oper[0]^.reg),RS_R13);
                    add_edge(getsupreg(taicpu(p).oper[0]^.reg),RS_R15);
@@ -136,7 +139,7 @@ unit rgcpu;
                    add_edge(getsupreg(taicpu(p).oper[1]^.reg),RS_R15);
                    add_edge(getsupreg(taicpu(p).oper[2]^.reg),RS_R13);
                    add_edge(getsupreg(taicpu(p).oper[2]^.reg),RS_R15);
-                   if taicpu(p).opcode=A_MLA then
+                   if taicpu(p).opcode<>A_MUL then
                      begin
                        add_edge(getsupreg(taicpu(p).oper[3]^.reg),RS_R13);
                        add_edge(getsupreg(taicpu(p).oper[3]^.reg),RS_R15);
@@ -279,6 +282,48 @@ unit rgcpu;
       end;
 
 
+    function trgcpu.do_spill_replace(list:TAsmList;instr:tai_cpu_abstract_sym;orgreg:tsuperregister;const spilltemp:treference):boolean;
+      var
+        b : byte;
+      begin
+        result:=false;
+        if abs(spilltemp.offset)>4095 then
+          exit;
+
+        { Replace 'mov  dst,orgreg' with 'ldr  dst,spilltemp'
+          and     'mov  orgreg,src' with 'str  dst,spilltemp' }
+        with instr do
+          begin
+            if (opcode=A_MOV) and (ops=2) and (oper[1]^.typ=top_reg) and (oper[0]^.typ=top_reg) then
+              begin
+                if (getregtype(oper[0]^.reg)=regtype) and
+                   (get_alias(getsupreg(oper[0]^.reg))=orgreg) and
+                   (get_alias(getsupreg(oper[1]^.reg))<>orgreg) then
+                  begin
+                    { str expects the register in oper[0] }
+                    oper[0]^.typ:=top_reg;
+                    oper[0]^.reg:=oper[1]^.reg;
+                    oper[1]^.typ:=top_ref;
+                    new(oper[1]^.ref);
+                    oper[1]^.ref^:=spilltemp;
+                    opcode:=A_STR;
+                    result:=true;
+                  end
+                else if (getregtype(oper[1]^.reg)=regtype) and
+                   (get_alias(getsupreg(oper[1]^.reg))=orgreg) and
+                   (get_alias(getsupreg(oper[0]^.reg))<>orgreg) then
+                  begin
+                    oper[1]^.typ:=top_ref;
+                    new(oper[1]^.ref);
+                    oper[1]^.ref^:=spilltemp;
+                    opcode:=A_LDR;
+                    result:=true;
+                  end;
+              end;
+          end;
+      end;
+
+
     procedure trgcpu.add_constraints(reg:tregister);
       var
         supreg,i : Tsuperregister;

+ 31 - 1
compiler/assemble.pas

@@ -32,6 +32,9 @@ interface
 
 
     uses
+{$ifdef hasamiga}
+      exec,
+{$endif}
       SysUtils,
       systems,globtype,globals,aasmbase,aasmtai,aasmdata,ogbase,finput;
 
@@ -297,7 +300,7 @@ Implementation
         var
           dir : TRawByteSearchRec;
         begin
-          if findfirst(s+source_info.dirsep+'*'+AExt,faAnyFile,dir) = 0 then
+          if findfirst(FixPath(s,false)+'*'+AExt,faAnyFile,dir) = 0 then
             begin
               repeat
                 DeleteFile(s+source_info.dirsep+dir.name);
@@ -606,13 +609,36 @@ Implementation
            Replace(result,'$NOWARN','')
          else
            Replace(result,'$NOWARN','-W');
+         Replace(result,'$EXTRAOPT',asmextraopt);
       end;
 
 
     procedure TExternalAssembler.AsmCreate(Aplace:tcutplace);
+{$ifdef hasamiga}
+      var
+        tempFileName: TPathStr;
+{$endif}
       begin
         if SmartAsm then
          NextSmartName(Aplace);
+{$ifdef hasamiga}
+        { on Amiga/MorphOS try to redirect .s files to the T: assign, which is
+          for temp files, and usually (default setting) located in the RAM: drive. 
+          This highly improves assembling speed for complex projects like the 
+          compiler itself, especially on hardware with slow disk I/O.
+          Consider this as a poor man's pipe on Amiga, because real pipe handling 
+          would be much more complex and error prone to implement. (KB) }
+        if (([cs_asm_extern,cs_asm_leave,cs_link_on_target] * current_settings.globalswitches) = []) then
+         begin
+          { try to have an unique name for the .s file }
+          tempFileName:=HexStr(FindTask(nil))+ExtractFileName(AsmFileName);
+{$ifndef morphos}
+          { old Amiga RAM: handler only allows filenames up to 30 char }
+          if Length(tempFileName) < 30 then
+{$endif}
+          AsmFileName:='T:'+tempFileName;
+         end;
+{$endif}
 {$ifdef hasunix}
         if DoPipe then
          begin
@@ -1357,6 +1383,8 @@ Implementation
         relative_reloc: boolean;
       begin
         fillchar(zerobuf,sizeof(zerobuf),0);
+        fillchar(objsym,sizeof(objsym),0);
+        fillchar(objsymend,sizeof(objsymend),0);
         { main loop }
         while assigned(hp) do
          begin
@@ -1460,6 +1488,8 @@ Implementation
                        { Required for DWARF2 support under Windows }
                        ObjData.writereloc(Tai_const(hp).symofs,sizeof(longint),Objdata.SymbolRef(tai_const(hp).sym),RELOC_SECREL32);
                      end;
+                   aitconst_gotoff_symbol:
+                     ObjData.writereloc(Tai_const(hp).symofs,sizeof(longint),Objdata.SymbolRef(tai_const(hp).sym),RELOC_GOTOFF);
                    aitconst_uleb128bit,
                    aitconst_sleb128bit :
                      begin

+ 7 - 1
compiler/avr/aasmcpu.pas

@@ -246,12 +246,16 @@ implementation
       begin
         result:=operand_read;
         case opcode of
+          A_CLR,
+          A_MOV, A_MOVW:
+           if opnr=0 then
+             result:=operand_write;
           A_CP,A_CPC,A_CPI,A_PUSH :
             ;
           else
             begin
               if opnr=0 then
-                result:=operand_write;
+                result:=operand_readwrite;
             end;
         end;
       end;
@@ -416,6 +420,8 @@ implementation
                       end;
                     ait_align:
                       inc(CurrOffset,tai_align(curtai).aligntype);
+                    ait_weak,
+                    ait_set,
                     ait_marker:
                       ;
                     ait_label:

+ 1 - 1
compiler/avr/agavrgas.pas

@@ -203,7 +203,7 @@ unit agavrgas;
 
             idtxt  : 'AS';
             asmbin : 'as';
-            asmcmd : '-o $OBJ $ASM';
+            asmcmd : '-o $OBJ $EXTRAOPT $ASM';
             supported_targets : [system_avr_embedded];
             flags : [af_needar,af_smartlink_sections];
             labelprefix : '.L';

+ 233 - 14
compiler/avr/aoptcpu.pas

@@ -28,10 +28,12 @@ Unit aoptcpu;
 
 Interface
 
-uses cpubase, aasmtai, aopt, aoptcpub;
+uses cpubase, cgbase, aasmtai, aopt, aoptcpub;
 
 Type
   TCpuAsmOptimizer = class(TAsmOptimizer)
+    Function GetNextInstructionUsingReg(Current: tai; Var Next: tai;reg : TRegister): Boolean;
+
     { uses the same constructor as TAopObj }
     function PeepHoleOptPass1Cpu(var p: tai): boolean; override;
     procedure PeepHoleOptPass2;override;
@@ -40,7 +42,9 @@ Type
 Implementation
 
   uses
-    aasmbase,aasmcpu,cgbase;
+    cpuinfo,
+    aasmbase,aasmcpu,
+    globals,globtype;
 
   function CanBeCond(p : tai) : boolean;
     begin
@@ -48,40 +52,255 @@ Implementation
     end;
 
 
+  function TCpuAsmOptimizer.GetNextInstructionUsingReg(Current: tai;
+    var Next: tai; reg: TRegister): Boolean;
+    begin
+      Next:=Current;
+      repeat
+        Result:=GetNextInstruction(Next,Next);
+      until not(cs_opt_level3 in current_settings.optimizerswitches) or not(Result) or (Next.typ<>ait_instruction) or (RegInInstruction(reg,Next)) or
+        (is_calljmp(taicpu(Next).opcode));
+    end;
+
+
   function TCpuAsmOptimizer.PeepHoleOptPass1Cpu(var p: tai): boolean;
     var
-      next1: tai;
+      hp1,hp2,hp3: tai;
+      alloc, dealloc: tai_regalloc;
+      i: integer;
     begin
       result := false;
       case p.typ of
         ait_instruction:
           begin
             case taicpu(p).opcode of
+              A_LDI:
+                begin
+                  { turn
+                    ldi reg0, imm
+                    cp reg1, reg0
+                    dealloc reg0
+                    into
+                    cpi reg1, imm
+                  }
+                  if (taicpu(p).ops=2) and
+                     (taicpu(p).oper[0]^.typ=top_reg) and
+                     (taicpu(p).oper[1]^.typ=top_const) and
+                     GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+                     (not RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp1)) and
+                     (taicpu(hp1).opcode=A_CP) and
+                     (taicpu(hp1).ops=2) and
+                     (taicpu(hp1).oper[1]^.typ=top_reg) and
+                     (getsupreg(taicpu(hp1).oper[0]^.reg) in [16..31]) and
+                     (taicpu(hp1).oper[1]^.reg=taicpu(p).oper[0]^.reg) and
+                     assigned(FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) then
+                    begin
+                      taicpu(hp1).opcode:=A_CPI;
+                      taicpu(hp1).loadconst(1, taicpu(p).oper[1]^.val);
+
+                      alloc:=FindRegAllocBackward(taicpu(p).oper[0]^.reg,tai(p.Previous));
+                      dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next));
+
+                      if assigned(alloc) and assigned(dealloc) then
+                        begin
+                          asml.Remove(alloc);
+                          alloc.Free;
+                          asml.Remove(dealloc);
+                          dealloc.Free;
+                        end;
+
+                      GetNextInstruction(p,hp1);
+                      asml.Remove(p);
+                      p.Free;
+                      p:=hp1;
+
+                      result:=true;
+                    end;
+                end;
+              A_CLR:
+                begin
+                  { turn the common
+                    clr rX
+                    mov/ld rX, rY
+                    into
+                    mov/ld rX, rY
+                  }
+                  if (taicpu(p).ops=1) and
+                     (taicpu(p).oper[0]^.typ=top_reg) and
+                     GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+                     (not RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp1)) and
+                     (hp1.typ=ait_instruction) and
+                     (taicpu(hp1).opcode in [A_MOV,A_LD]) and
+                     (taicpu(hp1).ops>0) and
+                     (taicpu(hp1).oper[0]^.typ=top_reg) and
+                     (taicpu(hp1).oper[0]^.reg=taicpu(p).oper[0]^.reg) then
+                    begin
+                      asml.Remove(p);
+                      p.Free;
+                      p:=hp1;
+                      result:=true;
+                    end
+                  { turn
+                    clr rX
+                    ...
+                    adc rY, rX
+                    into
+                    ...
+                    adc rY, r1
+                  }
+                  else if (taicpu(p).ops=1) and
+                     (taicpu(p).oper[0]^.typ=top_reg) and
+                     GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+                     (not RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp1)) and
+                     (hp1.typ=ait_instruction) and
+                     (taicpu(hp1).opcode in [A_ADC,A_SBC]) and
+                     (taicpu(hp1).ops=2) and
+                     (taicpu(hp1).oper[1]^.typ=top_reg) and
+                     (taicpu(hp1).oper[1]^.reg=taicpu(p).oper[0]^.reg) and
+                     (taicpu(hp1).oper[0]^.reg<>taicpu(p).oper[0]^.reg) and
+                     assigned(FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) then
+                    begin
+                      taicpu(hp1).oper[1]^.reg:=NR_R1;
+
+                      alloc:=FindRegAllocBackward(taicpu(p).oper[0]^.reg,tai(p.Previous));
+                      dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next));
+
+                      if assigned(alloc) and assigned(dealloc) then
+                        begin
+                          asml.Remove(alloc);
+                          alloc.Free;
+                          asml.Remove(dealloc);
+                          dealloc.Free;
+                        end;
+
+                      GetNextInstruction(p,hp1);
+                      asml.Remove(p);
+                      p.free;
+                      p:=hp1;
+
+                      result:=true;
+                    end;
+                end;
+              A_PUSH:
+                begin
+                  { turn
+                    push reg0
+                    push reg1
+                    pop reg3
+                    pop reg2
+                    into
+                    movw reg2,reg0
+                  }
+                  if (taicpu(p).ops=1) and
+                     (taicpu(p).oper[0]^.typ=top_reg) and
+                     GetNextInstruction(p,hp1) and
+                     (hp1.typ=ait_instruction) and
+                     (taicpu(hp1).opcode=A_PUSH) and
+                     (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(p).oper[0]^.reg)+1) and
+                     ((getsupreg(taicpu(p).oper[0]^.reg) mod 2)=0) and
+
+                     GetNextInstruction(hp1,hp2) and
+                     (hp2.typ=ait_instruction) and
+                     (taicpu(hp2).opcode=A_POP) and
+
+                     GetNextInstruction(hp2,hp3) and
+                     (hp3.typ=ait_instruction) and
+                     (taicpu(hp3).opcode=A_POP) and
+                     (getsupreg(taicpu(hp2).oper[0]^.reg)=getsupreg(taicpu(hp3).oper[0]^.reg)+1) and
+                     ((getsupreg(taicpu(hp3).oper[0]^.reg) mod 2)=0) then
+                    begin
+                      taicpu(p).ops:=2;
+                      taicpu(p).opcode:=A_MOVW;
+
+                      taicpu(p).loadreg(1, taicpu(p).oper[0]^.reg);
+                      taicpu(p).loadreg(0, taicpu(hp3).oper[0]^.reg);
+
+                      asml.Remove(hp1);
+                      hp1.Free;
+                      asml.Remove(hp2);
+                      hp2.Free;
+                      asml.Remove(hp3);
+                      hp3.Free;
+
+                      result:=true;
+                    end;
+                end;
               A_MOV:
                 begin
+                  { turn
+                    mov reg0, reg1
+                    push reg0
+                    dealloc reg0
+                    into
+                    push reg1
+                  }
+                  if (taicpu(p).ops=2) and
+                     (taicpu(p).oper[0]^.typ = top_reg) and
+                     (taicpu(p).oper[1]^.typ = top_reg) and
+                     GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
+                     (not RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp1)) and
+                     (hp1.typ = ait_instruction) and
+                     (taicpu(hp1).opcode in [A_PUSH,A_MOV,A_CP,A_CPC,A_ADD,A_SUB,A_EOR,A_AND,A_OR]) and
+                     RegInInstruction(taicpu(p).oper[0]^.reg, hp1) and
+                     (not RegModifiedByInstruction(taicpu(p).oper[0]^.reg, hp1)) and
+                     {(taicpu(hp1).ops=1) and
+                     (taicpu(hp1).oper[0]^.typ = top_reg) and
+                     (taicpu(hp1).oper[0]^.reg=taicpu(p).oper[0]^.reg) and  }
+                     assigned(FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) then
+                    begin
+                      for i := 0 to taicpu(hp1).ops-1 do
+                        if taicpu(hp1).oper[i]^.typ=top_reg then
+                          if taicpu(hp1).oper[i]^.reg=taicpu(p).oper[0]^.reg then
+                            taicpu(hp1).oper[i]^.reg:=taicpu(p).oper[1]^.reg;
+
+                      alloc:=FindRegAllocBackward(taicpu(p).oper[0]^.reg,tai(p.Previous));
+                      dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next));
+
+                      if assigned(alloc) and assigned(dealloc) then
+                        begin
+                          asml.Remove(alloc);
+                          alloc.Free;
+                          asml.Remove(dealloc);
+                          dealloc.Free;
+                        end;
+
+                      GetNextInstruction(p,hp1);
+                      asml.Remove(p);
+                      p.free;
+                      p:=hp1;
+                      result:=true;
+                    end
                   { fold
                     mov reg2,reg0
                     mov reg3,reg1
                     to
                     movw reg2,reg0
                   }
-                  if (taicpu(p).ops=2) and
+                  else if (CPUAVR_HAS_MOVW in cpu_capabilities[current_settings.cputype]) and
+                     (taicpu(p).ops=2) and
                      (taicpu(p).oper[0]^.typ = top_reg) and
                      (taicpu(p).oper[1]^.typ = top_reg) and
-                     getnextinstruction(p,next1) and
-                     (next1.typ = ait_instruction) and
-                     (taicpu(next1).opcode = A_MOV) and
-                     (taicpu(next1).ops=2) and
-                     (taicpu(next1).oper[0]^.typ = top_reg) and
-                     (taicpu(next1).oper[1]^.typ = top_reg) and
-                     (getsupreg(taicpu(next1).oper[0]^.reg)=getsupreg(taicpu(p).oper[0]^.reg)+1) and
+                     getnextinstruction(p,hp1) and
+                     (hp1.typ = ait_instruction) and
+                     (taicpu(hp1).opcode = A_MOV) and
+                     (taicpu(hp1).ops=2) and
+                     (taicpu(hp1).oper[0]^.typ = top_reg) and
+                     (taicpu(hp1).oper[1]^.typ = top_reg) and
+                     (getsupreg(taicpu(hp1).oper[0]^.reg)=getsupreg(taicpu(p).oper[0]^.reg)+1) and
                      ((getsupreg(taicpu(p).oper[0]^.reg) mod 2)=0) and
                      ((getsupreg(taicpu(p).oper[1]^.reg) mod 2)=0) and
-                     (getsupreg(taicpu(next1).oper[1]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)+1) then
+                     (getsupreg(taicpu(hp1).oper[1]^.reg)=getsupreg(taicpu(p).oper[1]^.reg)+1) then
                     begin
+                      alloc:=FindRegAllocBackward(taicpu(hp1).oper[0]^.reg,tai(hp1.Previous));
+                      if assigned(alloc) then
+                        begin
+                          asml.Remove(alloc);
+                          asml.InsertBefore(alloc,p);
+                        end;
+
                       taicpu(p).opcode:=A_MOVW;
-                      asml.remove(next1);
-                      next1.free;
+                      asml.remove(hp1);
+                      hp1.free;
                       result := true;
                     end;
                 end;

+ 26 - 8
compiler/avr/aoptcpub.pas

@@ -36,7 +36,10 @@ Unit aoptcpub; { Assembler OPTimizer CPU specific Base }
 Interface
 
 Uses
-  cpubase,aasmcpu,AOptBase;
+  cpubase,
+  cgbase,
+  aasmcpu,aasmtai,
+  AOptBase;
 
 Type
 
@@ -58,6 +61,7 @@ Type
 { ************************************************************************* }
 
   TAoptBaseCpu = class(TAoptBase)
+    function RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean; override;
   End;
 
 
@@ -103,12 +107,26 @@ Implementation
 { ************************************************************************* }
 { **************************** TCondRegs ********************************** }
 { ************************************************************************* }
-Constructor TCondRegs.init;
-Begin
-End;
-
-Destructor TCondRegs.Done; {$ifdef inl} inline; {$endif inl}
-Begin
-End;
+  Constructor TCondRegs.init;
+    Begin
+    End;
+
+  Destructor TCondRegs.Done; {$ifdef inl} inline; {$endif inl}
+    Begin
+    End;
+
+
+  function TAoptBaseCpu.RegModifiedByInstruction(Reg: TRegister; p1: tai): boolean;
+    var
+      i : Longint;
+    begin
+      result:=false;
+      for i:=0 to taicpu(p1).ops-1 do
+        if (taicpu(p1).oper[i]^.typ=top_reg) and (taicpu(p1).oper[i]^.reg=Reg) and (taicpu(p1).spilling_get_operation_type(i) in [operand_write,operand_readwrite]) then
+          begin
+            result:=true;
+            exit;
+          end;
+    end;
 
 End.

+ 173 - 62
compiler/avr/cgcpu.pas

@@ -50,10 +50,10 @@ unit cgcpu;
         procedure a_load_const_cgpara(list : TAsmList;size : tcgsize;a : tcgint;const paraloc : TCGPara);override;
         procedure a_load_ref_cgpara(list : TAsmList;size : tcgsize;const r : treference;const paraloc : TCGPara);override;
         procedure a_loadaddr_ref_cgpara(list : TAsmList;const r : treference;const paraloc : TCGPara);override;
+        procedure a_load_reg_cgpara(list : TAsmList; size : tcgsize;r : tregister; const cgpara : tcgpara);override;
 
         procedure a_call_name(list : TAsmList;const s : string; weak: boolean);override;
         procedure a_call_reg(list : TAsmList;reg: tregister);override;
-        procedure a_call_ref(list : TAsmList;ref: treference);override;
 
         procedure a_op_const_reg(list : TAsmList; Op: TOpCG; size: TCGSize; a: tcgint; reg: TRegister); override;
         procedure a_op_reg_reg(list: TAsmList; Op: TOpCG; size: TCGSize; src, dst : TRegister); override;
@@ -205,25 +205,138 @@ unit cgcpu;
       end;
 
 
+    procedure tcgavr.a_load_reg_cgpara(list : TAsmList;size : tcgsize;r : tregister;const cgpara : tcgpara);
+
+      procedure load_para_loc(r : TRegister;paraloc : PCGParaLocation);
+        var
+          ref : treference;
+        begin
+          paramanager.allocparaloc(list,paraloc);
+          case paraloc^.loc of
+             LOC_REGISTER,LOC_CREGISTER:
+               a_load_reg_reg(list,paraloc^.size,paraloc^.size,r,paraloc^.register);
+             LOC_REFERENCE,LOC_CREFERENCE:
+               begin
+                  reference_reset_base(ref,paraloc^.reference.index,paraloc^.reference.offset,2);
+                  a_load_reg_ref(list,paraloc^.size,paraloc^.size,r,ref);
+               end;
+             else
+               internalerror(2002071004);
+          end;
+        end;
+
+      var
+        i, i2 : longint;
+        hp : PCGParaLocation;
+
+      begin
+{        if use_push(cgpara) then
+          begin
+            if tcgsize2size[cgpara.Size] > 2 then
+              begin
+                if tcgsize2size[cgpara.Size] <> 4 then
+                  internalerror(2013031101);
+                if cgpara.location^.Next = nil then
+                  begin
+                    if tcgsize2size[cgpara.location^.size] <> 4 then
+                      internalerror(2013031101);
+                  end
+                else
+                  begin
+                    if tcgsize2size[cgpara.location^.size] <> 2 then
+                      internalerror(2013031101);
+                    if tcgsize2size[cgpara.location^.Next^.size] <> 2 then
+                      internalerror(2013031101);
+                    if cgpara.location^.Next^.Next <> nil then
+                      internalerror(2013031101);
+                  end;
+
+                if tcgsize2size[cgpara.size]>cgpara.alignment then
+                  pushsize:=cgpara.size
+                else
+                  pushsize:=int_cgsize(cgpara.alignment);
+                pushsize2 := int_cgsize(tcgsize2size[pushsize] - 2);
+                list.concat(taicpu.op_reg(A_PUSH,TCgsize2opsize[pushsize2],makeregsize(list,GetNextReg(r),pushsize2)));
+                list.concat(taicpu.op_reg(A_PUSH,S_W,makeregsize(list,r,OS_16)));
+              end
+            else
+              begin
+                cgpara.check_simple_location;
+                if tcgsize2size[cgpara.location^.size]>cgpara.alignment then
+                  pushsize:=cgpara.location^.size
+                else
+                  pushsize:=int_cgsize(cgpara.alignment);
+                list.concat(taicpu.op_reg(A_PUSH,TCgsize2opsize[pushsize],makeregsize(list,r,pushsize)));
+              end;
+
+          end
+        else }
+          begin
+            if not(tcgsize2size[cgpara.Size] in [1..4]) then
+              internalerror(2014011101);
+
+            hp:=cgpara.location;
+
+            i:=0;
+            while i<tcgsize2size[cgpara.Size] do
+              begin
+                if not(assigned(hp)) then
+                  internalerror(2014011102);
+
+                inc(i, tcgsize2size[hp^.Size]);
+
+                if hp^.Loc=LOC_REGISTER then
+                  begin
+                    load_para_loc(r,hp);
+                    hp:=hp^.Next;
+                    r:=GetNextReg(r);
+                  end
+                else
+                  begin
+                    load_para_loc(r,hp);
+
+                    for i2:=1 to tcgsize2size[hp^.Size] do
+                      r:=GetNextReg(r);
+
+                    hp:=hp^.Next;
+                  end;
+              end;
+            if assigned(hp) then
+              internalerror(2014011103);
+          end;
+      end;
+
+
     procedure tcgavr.a_load_const_cgpara(list : TAsmList;size : tcgsize;a : tcgint;const paraloc : TCGPara);
       var
-        ref: treference;
+        i : longint;
+        hp : PCGParaLocation;
       begin
-        paraloc.check_simple_location;
-        paramanager.allocparaloc(list,paraloc.location);
-        case paraloc.location^.loc of
-          LOC_REGISTER,LOC_CREGISTER:
-            a_load_const_reg(list,size,a,paraloc.location^.register);
-          LOC_REFERENCE:
-            begin
-               reference_reset(ref,paraloc.alignment);
-               ref.base:=paraloc.location^.reference.index;
-               ref.offset:=paraloc.location^.reference.offset;
-               a_load_const_ref(list,size,a,ref);
+        if not(tcgsize2size[paraloc.Size] in [1..4]) then
+          internalerror(2014011101);
+
+        hp:=paraloc.location;
+
+        for i:=1 to tcgsize2size[paraloc.Size] do
+          begin
+            if not(assigned(hp)) or
+              (tcgsize2size[hp^.size]<>1) or
+              (hp^.shiftval<>0) then
+              internalerror(2014011105);
+             case hp^.loc of
+               LOC_REGISTER,LOC_CREGISTER:
+                 a_load_const_reg(list,hp^.size,(a shr (i-1)) and $ff,hp^.register);
+               LOC_REFERENCE,LOC_CREFERENCE:
+                 begin
+                   list.concat(taicpu.op_const(A_PUSH,(a shr (i-1)) and $ff));
+                 end;
+               else
+                 internalerror(2002071004);
             end;
-          else
-            internalerror(2002081101);
-        end;
+            hp:=hp^.Next;
+          end;
+        if assigned(hp) then
+          internalerror(2014011104);
       end;
 
 
@@ -271,26 +384,11 @@ unit cgcpu;
 
     procedure tcgavr.a_loadaddr_ref_cgpara(list : TAsmList;const r : treference;const paraloc : TCGPara);
       var
-        ref: treference;
         tmpreg: tregister;
       begin
-        paraloc.check_simple_location;
-        paramanager.allocparaloc(list,paraloc.location);
-        case paraloc.location^.loc of
-          LOC_REGISTER,LOC_CREGISTER:
-            a_loadaddr_ref_reg(list,r,paraloc.location^.register);
-          LOC_REFERENCE:
-            begin
-              reference_reset(ref,paraloc.alignment);
-              ref.base := paraloc.location^.reference.index;
-              ref.offset := paraloc.location^.reference.offset;
-              tmpreg := getintregister(list,OS_ADDR);
-              a_loadaddr_ref_reg(list,r,tmpreg);
-              a_load_reg_ref(list,OS_ADDR,OS_ADDR,tmpreg,ref);
-            end;
-          else
-            internalerror(2002080701);
-        end;
+        tmpreg:=getaddressregister(list);
+        a_loadaddr_ref_reg(list,r,tmpreg);
+        a_load_reg_cgpara(list,OS_ADDR,tmpreg,paraloc);
       end;
 
 
@@ -321,19 +419,6 @@ unit cgcpu;
       end;
 
 
-    procedure tcgavr.a_call_ref(list : TAsmList;ref: treference);
-      begin
-        a_reg_alloc(list,NR_ZLO);
-        a_reg_alloc(list,NR_ZHI);
-        a_load_ref_reg(list,OS_ADDR,OS_ADDR,ref,NR_ZLO);
-        list.concat(taicpu.op_none(A_ICALL));
-        a_reg_dealloc(list,NR_ZLO);
-        a_reg_dealloc(list,NR_ZHI);
-
-        include(current_procinfo.flags,pi_do_call);
-      end;
-
-
      procedure tcgavr.a_op_const_reg(list : TAsmList; Op: TOpCG; size: TCGSize; a: tcgint; reg: TRegister);
        begin
          if not(size in [OS_S8,OS_8,OS_S16,OS_16,OS_S32,OS_32]) then
@@ -414,7 +499,15 @@ unit cgcpu;
            OP_NEG:
              begin
                if src<>dst then
-                 a_load_reg_reg(list,size,size,src,dst);
+                 begin
+                   if size in [OS_S64,OS_64] then
+                     begin
+                       a_load_reg_reg(list,OS_32,OS_32,src,dst);
+                       a_load_reg_reg(list,OS_32,OS_32,srchi,dsthi);
+                     end
+                   else
+                     a_load_reg_reg(list,size,size,src,dst);
+                 end;
 
                if size in [OS_S16,OS_16,OS_S32,OS_32,OS_S64,OS_64] then
                  begin
@@ -428,7 +521,7 @@ unit cgcpu;
                    tmpreg:=GetNextReg(dst);
                    for i:=2 to tcgsize2size[size] do
                      begin
-                       list.concat(taicpu.op_reg_const(A_SBCI,dst,-1));
+                       list.concat(taicpu.op_reg_const(A_SBCI,tmpreg,-1));
                        NextTmp;
                    end;
                  end;
@@ -488,7 +581,7 @@ unit cgcpu;
                current_asmdata.getjumplabel(l2);
                countreg:=getintregister(list,OS_8);
                a_load_reg_reg(list,size,OS_8,src,countreg);
-               list.concat(taicpu.op_reg_const(A_CP,countreg,0));
+               list.concat(taicpu.op_reg_const(A_CPI,countreg,0));
                a_jmp_flags(list,F_EQ,l2);
                cg.a_label(list,l1);
                case op of
@@ -606,7 +699,7 @@ unit cgcpu;
              end;
            OP_SUB:
              begin
-               list.concat(taicpu.op_reg_const(A_SUBI,reg,a));
+               list.concat(taicpu.op_reg_const(A_SUBI,reg,a and mask));
                if size in [OS_S16,OS_16,OS_S32,OS_32,OS_S64,OS_64] then
                  begin
                    for i:=2 to tcgsize2size[size] do
@@ -618,6 +711,20 @@ unit cgcpu;
                      end;
                  end;
              end;
+           {OP_ADD:
+             begin
+               list.concat(taicpu.op_reg_const(A_SUBI,reg,(-a) and mask));
+               if size in [OS_S16,OS_16,OS_S32,OS_32,OS_S64,OS_64] then
+                 begin
+                   for i:=2 to tcgsize2size[size] do
+                     begin
+                       NextReg;
+                       mask:=mask shl 8;
+                       inc(shift,8);
+                       list.concat(taicpu.op_reg_const(A_ADC,reg,(a and mask) shr shift));
+                     end;
+                 end;
+             end; }
          else
            begin
              if size in [OS_64,OS_S64] then
@@ -716,11 +823,11 @@ unit cgcpu;
         else if (ref.base<>NR_NO) and (ref.index<>NR_NO) then
           begin
             maybegetcpuregister(list,tmpreg);
-            emit_mov(list,tmpreg,ref.index);
+            emit_mov(list,tmpreg,ref.base);
             maybegetcpuregister(list,GetNextReg(tmpreg));
-            emit_mov(list,GetNextReg(tmpreg),GetNextReg(ref.index));
-            list.concat(taicpu.op_reg_reg(A_ADD,tmpreg,ref.base));
-            list.concat(taicpu.op_reg_reg(A_ADC,GetNextReg(tmpreg),GetNextReg(ref.base)));
+            emit_mov(list,GetNextReg(tmpreg),GetNextReg(ref.base));
+            list.concat(taicpu.op_reg_reg(A_ADD,tmpreg,ref.index));
+            list.concat(taicpu.op_reg_reg(A_ADC,GetNextReg(tmpreg),GetNextReg(ref.index)));
             ref.base:=tmpreg;
             ref.index:=NR_NO;
           end
@@ -1175,6 +1282,7 @@ unit cgcpu;
       begin
         if a=0 then
           begin
+            swapped:=false;
             { swap parameters? }
             case cmp_op of
               OC_GT:
@@ -1195,7 +1303,7 @@ unit cgcpu;
               OC_A:
                 begin
                   swapped:=true;
-                  cmp_op:=OC_A;
+                  cmp_op:=OC_B;
                 end;
             end;
 
@@ -1227,6 +1335,7 @@ unit cgcpu;
         tmpreg : tregister;
         i : byte;
       begin
+        swapped:=false;
         { swap parameters? }
         case cmp_op of
           OC_GT:
@@ -1247,7 +1356,7 @@ unit cgcpu;
           OC_A:
             begin
               swapped:=true;
-              cmp_op:=OC_A;
+              cmp_op:=OC_B;
             end;
         end;
         if swapped then
@@ -1256,13 +1365,13 @@ unit cgcpu;
             reg1:=reg2;
             reg2:=tmpreg;
           end;
-        list.concat(taicpu.op_reg_reg(A_CP,reg1,reg2));
+        list.concat(taicpu.op_reg_reg(A_CP,reg2,reg1));
 
         for i:=2 to tcgsize2size[size] do
           begin
             reg1:=GetNextReg(reg1);
             reg2:=GetNextReg(reg2);
-            list.concat(taicpu.op_reg_reg(A_CPC,reg1,reg2));
+            list.concat(taicpu.op_reg_reg(A_CPC,reg2,reg1));
           end;
 
         a_jmp_cond(list,cmp_op,l);
@@ -1338,7 +1447,7 @@ unit cgcpu;
         case value of
           0:
             ;
-          -14..-1:
+          {-14..-1:
             begin
               if ((-value) mod 2)<>0 then
                 list.concat(taicpu.op_reg(A_PUSH,NR_R0));
@@ -1349,7 +1458,7 @@ unit cgcpu;
             begin
               for i:=1 to value do
                 list.concat(taicpu.op_reg(A_POP,NR_R0));
-            end;
+            end;}
           else
             begin
               list.concat(taicpu.op_reg_const(A_SUBI,NR_R28,lo(word(-value))));
@@ -1440,6 +1549,8 @@ unit cgcpu;
                 LocalSize:=current_procinfo.calc_stackframe_size;
                 a_adjust_sp(list,LocalSize);
                 regs:=rg[R_INTREGISTER].used_in_proc-paramanager.get_volatile_registers_int(pocall_stdcall);
+                if current_procinfo.framepointer<>NR_STACK_POINTER_REG then
+                  regs:=regs+[RS_R28,RS_R29];
 
                 for reg:=RS_R0 to RS_R31 do
                   if reg in regs then
@@ -1780,7 +1891,7 @@ unit cgcpu;
 
     procedure tcgavr.g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);
       begin
-        internalerror(2011021324);
+        //internalerror(2011021324);
       end;
 
 

+ 12 - 4
compiler/avr/cpubase.pas

@@ -235,11 +235,11 @@ unit cpubase;
       }
       NR_PIC_OFFSET_REG = NR_R9;
       { Results are returned in this register (32-bit values) }
-      NR_FUNCTION_RETURN_REG = NR_R0;
-      RS_FUNCTION_RETURN_REG = RS_R0;
+      NR_FUNCTION_RETURN_REG = NR_R24;
+      RS_FUNCTION_RETURN_REG = RS_R24;
       { Low part of 64bit return value }
-      NR_FUNCTION_RETURN64_LOW_REG = NR_R0;
-      RS_FUNCTION_RETURN64_LOW_REG = RS_R0;
+      NR_FUNCTION_RETURN64_LOW_REG = NR_R22;
+      RS_FUNCTION_RETURN64_LOW_REG = RS_R22;
       { High part of 64bit return value }
       NR_FUNCTION_RETURN64_HIGH_REG = NR_R1;
       RS_FUNCTION_RETURN64_HIGH_REG = RS_R1;
@@ -323,6 +323,8 @@ unit cpubase;
     { returns the register with the offset of ofs of a continuous set of register starting with r and being continued with rhi }
     function GetOffsetReg64(const r,rhi: TRegister;ofs : shortint): TRegister;
 
+    function is_calljmp(o:tasmop):boolean;{$ifdef USEINLINE}inline;{$endif USEINLINE}
+
   implementation
 
     uses
@@ -476,4 +478,10 @@ unit cpubase;
       end;
 
 
+    function is_calljmp(o:tasmop):boolean;{$ifdef USEINLINE}inline;{$endif USEINLINE}
+      begin
+        is_calljmp:= o in jmp_instructions;
+      end;
+
+
 end.

+ 10 - 10
compiler/avr/cpuinfo.pas

@@ -194,16 +194,16 @@ Const
  const
    cpu_capabilities : array[tcputype] of set of tcpuflags =
      ( { cpu_none } [],
-       { cpu_avr1 } [],
-       { cpu_avr2 } [],
-       { cpu_avr25 } [],
-       { cpu_avr3 } [],
-       { cpu_avr31 } [],
-       { cpu_avr35 } [],
-       { cpu_avr4 } [],
-       { cpu_avr5 } [],
-       { cpu_avr51 } [],
-       { cpu_avr6 } []
+       { cpu_avr1 } [CPUAVR_2_BYTE_PC],
+       { cpu_avr2 } [CPUAVR_2_BYTE_PC],
+       { cpu_avr25 } [CPUAVR_HAS_MOVW,CPUAVR_HAS_LPMX,CPUAVR_2_BYTE_PC],
+       { cpu_avr3 } [CPUAVR_HAS_JMP_CALL,CPUAVR_2_BYTE_PC],
+       { cpu_avr31 } [CPUAVR_HAS_JMP_CALL,CPUAVR_HAS_RAMPZ,CPUAVR_HAS_ELPM,CPUAVR_2_BYTE_PC],
+       { cpu_avr35 } [CPUAVR_HAS_JMP_CALL,CPUAVR_HAS_MOVW,CPUAVR_HAS_LPMX,CPUAVR_2_BYTE_PC],
+       { cpu_avr4 } [CPUAVR_HAS_MOVW,CPUAVR_HAS_LPMX,CPUAVR_HAS_MUL,CPUAVR_2_BYTE_PC],
+       { cpu_avr5 } [CPUAVR_HAS_JMP_CALL,CPUAVR_HAS_MOVW,CPUAVR_HAS_LPMX,CPUAVR_HAS_MUL,CPUAVR_2_BYTE_PC],
+       { cpu_avr51 } [CPUAVR_HAS_JMP_CALL,CPUAVR_HAS_MOVW,CPUAVR_HAS_LPMX,CPUAVR_HAS_MUL,CPUAVR_HAS_RAMPZ,CPUAVR_HAS_ELPM,CPUAVR_HAS_ELPMX,CPUAVR_2_BYTE_PC],
+       { cpu_avr6 } [CPUAVR_HAS_JMP_CALL,CPUAVR_HAS_MOVW,CPUAVR_HAS_LPMX,CPUAVR_HAS_MUL,CPUAVR_HAS_RAMPZ,CPUAVR_HAS_ELPM,CPUAVR_HAS_ELPMX,CPUAVR_3_BYTE_PC]
      );
 
 Implementation

+ 60 - 45
compiler/avr/cpupara.pas

@@ -38,7 +38,6 @@ unit cpupara;
           function get_volatile_registers_fpu(calloption : tproccalloption):tcpuregisterset;override;
           function push_addr_param(varspez:tvarspez;def : tdef;calloption : tproccalloption) : boolean;override;
           function ret_in_param(def:tdef;pd:tabstractprocdef):boolean;override;
-          procedure getintparaloc(pd : tabstractprocdef; nr : longint; var cgpara : tcgpara);override;
           function create_paraloc_info(p : tabstractprocdef; side: tcallercallee):longint;override;
           function create_varargs_paraloc_info(p : tabstractprocdef; varargspara:tvarargsparalist):longint;override;
           function  get_funcretloc(p : tabstractprocdef; side: tcallercallee; forcetempdef: tdef): tcgpara;override;
@@ -68,45 +67,6 @@ unit cpupara;
       end;
 
 
-    procedure tcpuparamanager.getintparaloc(pd : tabstractprocdef; nr : longint; var cgpara : tcgpara);
-      var
-        paraloc : pcgparalocation;
-        psym: tparavarsym;
-        pdef: tdef;
-      begin
-        if nr<1 then
-          internalerror(2002070801);
-        psym:=tparavarsym(pd.paras[nr-1]);
-        pdef:=psym.vardef;
-        if push_addr_param(psym.varspez,pdef,pd.proccalloption) then
-          pdef:=getpointerdef(pdef);
-        cgpara.reset;
-        cgpara.size:=def_cgsize(pdef);
-        cgpara.intsize:=tcgsize2size[cgpara.size];
-        cgpara.alignment:=std_param_align;
-        cgpara.def:=pdef;
-        paraloc:=cgpara.add_location;
-        with paraloc^ do
-          begin
-            size:=def_cgsize(pdef);
-            def:=pdef;
-            { the four first parameters are passed into registers }
-            if nr<=9 then
-              begin
-                loc:=LOC_REGISTER;
-                register:=newreg(R_INTREGISTER,RS_R25-(nr-1)*2,R_SUBWHOLE);
-              end
-            else
-              begin
-                { the other parameters are passed on the stack }
-                loc:=LOC_REFERENCE;
-                reference.index:=NR_STACK_POINTER_REG;
-                reference.offset:=(nr-10)*2;
-              end;
-          end;
-      end;
-
-
     function getparaloc(calloption : tproccalloption; p : tdef) : tcgloc;
       begin
          { Later, the LOC_REFERENCE is in most cases changed into LOC_REGISTER
@@ -181,6 +141,8 @@ unit cpupara;
             result:=not is_smallset(def);
           stringdef :
             result:=tstringdef(def).stringtype in [st_shortstring,st_longstring];
+        else
+          result:=def.size>4;
         end;
       end;
 
@@ -202,7 +164,10 @@ unit cpupara;
             result:=not(def.size in [1,2,4]);
           }
           else
-            result:=inherited ret_in_param(def,pd);
+            if (def.size > 4) then
+              result:=true
+            else
+              result:=inherited ret_in_param(def,pd);
         end;
       end;
 
@@ -481,7 +446,57 @@ unit cpupara;
           { Return in register }
         else
           begin
-            if retcgsize in [OS_64,OS_S64] then
+            case retcgsize of
+              OS_32,OS_S32:
+                begin
+                  paraloc^.loc:=LOC_REGISTER;
+                  paraloc^.register:=NR_R22;
+                  paraloc^.size:=OS_8;
+                  paraloc^.def:=u8inttype;
+
+                  paraloc:=result.add_location;
+                  paraloc^.loc:=LOC_REGISTER;
+                  paraloc^.register:=NR_R23;
+                  paraloc^.size:=OS_8;
+                  paraloc^.def:=u8inttype;
+
+                  paraloc:=result.add_location;
+                  paraloc^.loc:=LOC_REGISTER;
+                  paraloc^.register:=NR_R24;
+                  paraloc^.size:=OS_8;
+                  paraloc^.def:=u8inttype;
+
+                  paraloc:=result.add_location;
+                  paraloc^.loc:=LOC_REGISTER;
+                  paraloc^.register:=NR_R25;
+                  paraloc^.size:=OS_8;
+                  paraloc^.def:=u8inttype;
+                end;
+              OS_16,OS_S16:
+                begin
+                  paraloc^.loc:=LOC_REGISTER;
+                  paraloc^.register:=NR_R24;
+                  paraloc^.size:=OS_8;
+                  paraloc^.def:=u8inttype;
+
+                  paraloc:=result.add_location;
+                  paraloc^.loc:=LOC_REGISTER;
+                  paraloc^.register:=NR_R25;
+                  paraloc^.size:=OS_8;
+                  paraloc^.def:=u8inttype;
+                end;
+              OS_8,OS_S8:
+                begin
+                  paraloc^.loc:=LOC_REGISTER;
+                  paraloc^.register:=NR_R24;
+                  paraloc^.size:=OS_8;
+                  paraloc^.def:=u8inttype;
+                end;
+              else
+                internalerror(2014030101);
+            end;
+
+            {if retcgsize in [OS_64,OS_S64] then
               begin
                 paraloc^.loc:=LOC_REGISTER;
                 paraloc^.register:=NR_FUNCTION_RESULT64_LOW_REG;
@@ -497,9 +512,9 @@ unit cpupara;
               begin
                 paraloc^.loc:=LOC_REGISTER;
                 paraloc^.register:=NR_FUNCTION_RETURN_REG;
-                paraloc^.size:=OS_32;
-                paraloc^.def:=u32inttype;
-              end;
+                paraloc^.size:=OS_INT;
+                paraloc^.def:=u16inttype;
+              end;}
           end;
       end;
 

+ 1 - 1
compiler/avr/cpupi.pas

@@ -63,7 +63,7 @@ unit cpupi;
     function tavrprocinfo.calc_stackframe_size:longint;
       begin
         maxpushedparasize:=align(maxpushedparasize,max(current_settings.alignment.localalignmin,4));
-        result:=0;
+        result:=Align(tg.direction*tg.lasttemp,max(current_settings.alignment.localalignmin,4))+maxpushedparasize;
       end;
 
 

+ 24 - 10
compiler/avr/raavrgas.pas

@@ -69,14 +69,20 @@ Unit raavrgas;
           name : string[2];
           reg : tregister;
         end;
-{
+
       const
-        extraregs : array[0..19] of treg2str = (
-          (name: 'X'; reg : NR_Z),
-          (name: 'Y'; reg : NR_R1),
-          (name: 'Z'; reg : NR_R2),
+        extraregs : array[0..8] of treg2str = (
+          (name: 'X'; reg : NR_R26),
+          (name: 'XL'; reg : NR_R26),
+          (name: 'XH'; reg : NR_R27),
+          (name: 'Y'; reg : NR_R28),
+          (name: 'YL'; reg : NR_R28),
+          (name: 'YH'; reg : NR_R29),
+          (name: 'Z'; reg : NR_R30),
+          (name: 'ZL'; reg : NR_R30),
+          (name: 'ZH'; reg : NR_R31)
         );
-}
+
       var
         i : longint;
 
@@ -85,9 +91,9 @@ Unit raavrgas;
         { reg found?
           possible aliases are always 2 char
         }
-        if result or (length(s)<>2) then
+        if result or (not (length(s) in [1,2])) then
           exit;
-{
+
         for i:=low(extraregs) to high(extraregs) do
           begin
             if s=extraregs[i].name then
@@ -98,7 +104,6 @@ Unit raavrgas;
                 exit;
               end;
           end;
-}
       end;
 
 
@@ -480,7 +485,16 @@ Unit raavrgas;
               { save the type of register used. }
               tempreg:=actasmregister;
               Consume(AS_REGISTER);
-              if (actasmtoken in [AS_END,AS_SEPARATOR,AS_COMMA]) then
+              if (actasmtoken=AS_PLUS) then
+                begin
+                  oper.opr.typ:=OPR_REFERENCE;
+
+                  reference_reset_base(oper.opr.ref,tempreg,0,1);
+                  oper.opr.ref.addressmode:=AM_POSTINCREMENT;
+
+                  consume(AS_PLUS);
+                end
+              else if (actasmtoken in [AS_END,AS_SEPARATOR,AS_COMMA]) then
                 Begin
                   if not (oper.opr.typ in [OPR_NONE,OPR_REGISTER]) then
                     Message(asmr_e_invalid_operand_type);

+ 6 - 6
compiler/cclasses.pas

@@ -87,13 +87,13 @@ type
     procedure Put(Index: Integer; Item: Pointer);
     procedure SetCapacity(NewCapacity: Integer);
     procedure SetCount(NewCount: Integer);
-    Procedure RaiseIndexError(Index : Integer);
+    Procedure RaiseIndexError(Index : Integer);{$ifndef VER2_6}noreturn;{$endif VER2_6}
   public
     destructor Destroy; override;
     function Add(Item: Pointer): Integer;
     procedure Clear;
     procedure Delete(Index: Integer);
-    class procedure Error(const Msg: string; Data: PtrInt);
+    class procedure Error(const Msg: string; Data: PtrInt);{$ifndef VER2_6}noreturn;{$endif VER2_6}
     procedure Exchange(Index1, Index2: Integer);
     function Expand: TFPList;
     function Extract(item: Pointer): Pointer;
@@ -224,7 +224,7 @@ type
     function HashOfIndex(Index: Integer): LongWord;
     function GetNextCollision(Index: Integer): Integer;
     procedure Delete(Index: Integer);
-    class procedure Error(const Msg: string; Data: PtrInt);
+    class procedure Error(const Msg: string; Data: PtrInt);{$ifndef VER2_6}noreturn;{$endif VER2_6}
     function Expand: TFPHashList;
     function Extract(item: Pointer): Pointer;
     function IndexOf(Item: Pointer): Integer;
@@ -649,7 +649,7 @@ implementation
                TFPObjectList (Copied from rtl/objpas/classes/lists.inc)
 *****************************************************************************}
 
-procedure TFPList.RaiseIndexError(Index : Integer);
+procedure TFPList.RaiseIndexError(Index : Integer);{$ifndef VER2_6}noreturn;{$endif VER2_6}
 begin
   Error(SListIndexError, Index);
 end;
@@ -745,7 +745,7 @@ begin
   end;
 end;
 
-class procedure TFPList.Error(const Msg: string; Data: PtrInt);
+class procedure TFPList.Error(const Msg: string; Data: PtrInt);{$ifndef VER2_6}noreturn;{$endif VER2_6}
 begin
   Raise EListError.CreateFmt(Msg,[Data]) at get_caller_addr(get_frame), get_caller_frame(get_frame);
 end;
@@ -1443,7 +1443,7 @@ begin
     Self.Delete(Result);
 end;
 
-class procedure TFPHashList.Error(const Msg: string; Data: PtrInt);
+class procedure TFPHashList.Error(const Msg: string; Data: PtrInt);{$ifndef VER2_6}noreturn;{$endif VER2_6}
 begin
   Raise EListError.CreateFmt(Msg,[Data])  at get_caller_addr(get_frame), get_caller_frame(get_frame);
 end;

+ 30 - 3
compiler/cfileutl.pas

@@ -764,6 +764,7 @@ end;
   begin
     oldpos := 1;
     slashPos := Pos('/', path);
+    TranslatePathToMac:='';
     if (slashPos <> 0) then   {its a unix path}
       begin
         if slashPos = 1 then
@@ -1479,6 +1480,7 @@ end;
         inquotes:=false;
         result:='';
         i:=1;
+        temp:='';
         while i<=length(QuotedStr) do
           begin
             case QuotedStr[i] of
@@ -1521,6 +1523,10 @@ end;
       var
         quote_script: tscripttype;
       begin
+
+        if do_checkverbosity(V_Executable) then
+          do_comment(V_Executable,'Executing "'+Path+'" with command line "'+
+            ComLine+'"');
         if (cs_link_on_target in current_settings.globalswitches) then
           quote_script:=target_info.script
         else
@@ -1533,7 +1539,21 @@ end;
 
 
     function RequotedExecuteProcess(const Path: AnsiString; const ComLine: array of AnsiString; Flags: TExecuteFlags): Longint;
+      var
+        i : longint;
+        st : string;
       begin
+        if do_checkverbosity(V_Executable) then
+          begin
+            if high(ComLine)=0 then
+              st:=''
+            else
+              st:=ComLine[1];
+            for i:=2 to high(ComLine) do
+              st:=st+' '+ComLine[i];
+            do_comment(V_Executable,'Executing "'+Path+'" with command line "'+
+              st+'"');
+          end;
         result:=sysutils.ExecuteProcess(Path,ComLine,Flags);
       end;
 
@@ -1543,21 +1563,28 @@ end;
         expansion under linux }
 {$ifdef hasunix}
       begin
+        if do_checkverbosity(V_Used) then
+          do_comment(V_Executable,'Executing "'+Command+'" with fpSystem call');
         result := Unix.fpsystem(command);
       end;
 {$else hasunix}
-  {$ifdef amigashell}
+  {$ifdef hasamiga}
       begin
+        if do_checkverbosity(V_Used) then
+          do_comment(V_Executable,'Executing "'+Command+'" using RequotedExecuteProcess');
         result := RequotedExecuteProcess('',command);
       end;
-  {$else amigashell}
+  {$else hasamiga}
       var
         comspec : string;
       begin
         comspec:=GetEnvironmentVariable('COMSPEC');
+        if do_checkverbosity(V_Used) then
+          do_comment(V_Executable,'Executing "'+Command+'" using comspec "'
+            +ComSpec+'"');
         result := RequotedExecuteProcess(comspec,' /C '+command);
       end;
-   {$endif amigashell}
+   {$endif hasamiga}
 {$endif hasunix}
 
 

+ 5 - 1
compiler/cg64f32.pas

@@ -861,7 +861,11 @@ unit cg64f32;
                begin
                  current_asmdata.getjumplabel(neglabel);
                  cg.a_cmp_const_reg_label(list,OS_32,OC_EQ,-1,hreg,neglabel);
-               end;
+               end
+             else
+               { we do not have dynamic dfa, so avoid a warning below about the unused
+                 neglabel }
+               neglabel:=nil;
              { For all other values we have a range check error }
              cg.a_call_name(list,'fpc_rangeerror',false);
 

+ 48 - 24
compiler/cgobj.pas

@@ -228,7 +228,6 @@ unit cgobj;
           }
           procedure a_call_name(list : TAsmList;const s : string; weak: boolean);virtual; abstract;
           procedure a_call_reg(list : TAsmList;reg : tregister);virtual; abstract;
-          procedure a_call_ref(list : TAsmList;ref : treference);virtual;
           { same as a_call_name, might be overridden on certain architectures to emit
             static calls without usage of a got trampoline }
           procedure a_call_name_static(list : TAsmList;const s : string);virtual;
@@ -337,11 +336,12 @@ unit cgobj;
              to emit, and the constant value to emit. This function can opcode OP_NONE to
              remove the opcode and OP_MOVE to replace it with a simple load
 
+             @param(size Size of the operand in constant)
              @param(op The opcode to emit, returns the opcode which must be emitted)
              @param(a  The constant which should be emitted, returns the constant which must
                     be emitted)
           }
-          procedure optimize_op_const(var op: topcg; var a : tcgint);virtual;
+          procedure optimize_op_const(size: TCGSize; var op: topcg; var a : tcgint);virtual;
 
          {#
              This routine is used in exception management nodes. It should
@@ -765,7 +765,7 @@ implementation
           No IE can be generated, because the VMT is written
           without a valid rg[] }
         if assigned(rg[rt]) then
-          rg[rt].add_reg_instruction(instr,r,cg.executionweight);
+          rg[rt].add_reg_instruction(instr,r,executionweight);
       end;
 
 
@@ -1137,7 +1137,7 @@ implementation
                end;
              end;
            LOC_FPUREGISTER :
-             cg.a_loadfpu_reg_ref(list,paraloc.size,paraloc.size,paraloc.register,ref);
+             a_loadfpu_reg_ref(list,paraloc.size,paraloc.size,paraloc.register,ref);
            LOC_REFERENCE :
              begin
                reference_reset_base(href,paraloc.reference.index,paraloc.reference.offset,align);
@@ -1293,6 +1293,7 @@ implementation
         tmpreg,
         tmpreg2 : tregister;
         i : longint;
+        hisize : tcgsize;
       begin
         if ref.alignment in [1,2] then
           begin
@@ -1305,14 +1306,18 @@ implementation
                   a_load_ref_reg(list,fromsize,tosize,tmpref,register)
                 else
                   begin
+                    if FromSize=OS_16 then
+                      hisize:=OS_8
+                    else
+                      hisize:=OS_S8;
                     { first load in tmpreg, because the target register }
                     { may be used in ref as well                        }
                     if target_info.endian=endian_little then
                       inc(tmpref.offset);
                     tmpreg:=getintregister(list,OS_8);
-                    a_load_ref_reg(list,OS_8,OS_8,tmpref,tmpreg);
-                    tmpreg:=makeregsize(list,tmpreg,OS_16);
-                    a_op_const_reg(list,OP_SHL,OS_16,8,tmpreg);
+                    a_load_ref_reg(list,hisize,hisize,tmpref,tmpreg);
+                    tmpreg:=makeregsize(list,tmpreg,FromSize);
+                    a_op_const_reg(list,OP_SHL,FromSize,8,tmpreg);
                     if target_info.endian=endian_little then
                       dec(tmpref.offset)
                     else
@@ -1444,10 +1449,39 @@ implementation
       end;
 
 
-    procedure tcg.optimize_op_const(var op: topcg; var a : tcgint);
+    procedure tcg.optimize_op_const(size: TCGSize; var op: topcg; var a : tcgint);
       var
         powerval : longint;
+        signext_a, zeroext_a: tcgint;
       begin
+        case size of
+          OS_64,OS_S64:
+            begin
+              signext_a:=int64(a);
+              zeroext_a:=int64(a);
+            end;
+          OS_32,OS_S32:
+            begin
+              signext_a:=longint(a);
+              zeroext_a:=dword(a);
+            end;
+          OS_16,OS_S16:
+            begin
+              signext_a:=smallint(a);
+              zeroext_a:=word(a);
+            end;
+          OS_8,OS_S8:
+            begin
+              signext_a:=shortint(a);
+              zeroext_a:=byte(a);
+            end
+          else
+            begin
+              { Should we internalerror() here instead? }
+              signext_a:=a;
+              zeroext_a:=a;
+            end;
+        end;
         case op of
           OP_OR :
             begin
@@ -1456,13 +1490,13 @@ implementation
                 op:=OP_NONE
               else
               { or with max returns max }
-                if a = -1 then
+                if signext_a = -1 then
                   op:=OP_MOVE;
             end;
           OP_AND :
             begin
               { and with max returns same result }
-              if (a = -1) then
+              if (signext_a = -1) then
                 op:=OP_NONE
               else
               { and with 0 returns 0 }
@@ -1474,7 +1508,7 @@ implementation
               { division by 1 returns result }
               if a = 1 then
                 op:=OP_NONE
-              else if ispowerof2(int64(a), powerval) and not(cs_check_overflow in current_settings.localswitches) then
+              else if ispowerof2(int64(zeroext_a), powerval) and not(cs_check_overflow in current_settings.localswitches) then
                 begin
                   a := powerval;
                   op:= OP_SHR;
@@ -1492,7 +1526,7 @@ implementation
                else
                  if a=0 then
                    op:=OP_MOVE
-               else if ispowerof2(int64(a), powerval) and not(cs_check_overflow in current_settings.localswitches)  then
+               else if ispowerof2(int64(zeroext_a), powerval) and not(cs_check_overflow in current_settings.localswitches)  then
                  begin
                    a := powerval;
                    op:= OP_SHL;
@@ -2058,7 +2092,7 @@ implementation
            (tcgsize2size[tosize]<>4) then
           internalerror(2009112504);
         tg.gettemp(list,8,8,tt_normal,tmpref);
-        cg.a_loadmm_reg_ref(list,fromsize,fromsize,mmreg,tmpref,shuffle);
+        a_loadmm_reg_ref(list,fromsize,fromsize,mmreg,tmpref,shuffle);
         a_load_ref_reg(list,tosize,tosize,tmpref,intreg);
         tg.ungettemp(list,tmpref);
       end;
@@ -2334,7 +2368,7 @@ implementation
 
     procedure tcg.g_exception_reason_load(list : TAsmList; const href : treference);
       begin
-        cg.a_reg_alloc(list,NR_FUNCTION_RESULT_REG);
+        a_reg_alloc(list,NR_FUNCTION_RESULT_REG);
         a_load_ref_reg(list, OS_INT, OS_INT, href, NR_FUNCTION_RESULT_REG);
       end;
 
@@ -2385,16 +2419,6 @@ implementation
       end;
 
 
-    procedure tcg.a_call_ref(list : TAsmList;ref: treference);
-      var
-        tempreg : TRegister;
-      begin
-        tempreg := getintregister(list, OS_ADDR);
-        a_load_ref_reg(list,OS_ADDR,OS_ADDR,ref,tempreg);
-        a_call_reg(list,tempreg);
-      end;
-
-
    function tcg.g_indirect_sym_load(list:TAsmList;const symname: string; const flags: tindsymflags): tregister;
       var
         l: tasmsymbol;

+ 62 - 2
compiler/cgutils.pas

@@ -52,9 +52,9 @@ unit cgutils;
          offset      : asizeint;
          symbol,
          relsymbol   : tasmsymbol;
-{$if defined(x86) or defined(m68k)}
+{$if defined(x86)}
          segment,
-{$endif defined(x86) or defined(m68k)}
+{$endif defined(x86)}
          base,
          index       : tregister;
          refaddr     : trefaddr;
@@ -174,6 +174,7 @@ unit cgutils;
     procedure location_reset_ref(var l : tlocation;lt:TCGRefLoc;lsize:TCGSize; alignment: longint);
     procedure location_copy(var destloc:tlocation; const sourceloc : tlocation);
     procedure location_swap(var destloc,sourceloc : tlocation);
+    function location_reg2string(const locreg: tlocation): string;
 
     { returns r with the given alignment }
     function setalignment(const r : treference;b : byte) : treference;
@@ -271,5 +272,64 @@ uses
       end;
 
 
+    function location_reg2string(const locreg: tlocation): string;
+      begin
+        if not (locreg.loc in [LOC_REGISTER,LOC_CREGISTER,
+            LOC_MMXREGISTER,LOC_CMMXREGISTER,
+            LOC_MMREGISTER,LOC_CMMREGISTER,
+            LOC_FPUREGISTER,LOC_CFPUREGISTER]) then
+          internalerror(2013122301);
+
+        if locreg.loc in [LOC_REGISTER,LOC_CREGISTER] then
+          begin
+            case locreg.size of
+{$if defined(cpu64bitalu)}
+              OS_128,OS_S128:
+                result:=std_regname(locreg.registerhi)+':'+std_regname(locreg.register);
+{$elseif defined(cpu32bitalu)}
+              OS_64,OS_S64:
+                result:=std_regname(locreg.registerhi)+':'+std_regname(locreg.register);
+{$elseif defined(cpu16bitalu)}
+              OS_64,OS_S64:
+                if getsupreg(locreg.register)<first_int_imreg then
+                  result:='??:'+std_regname(locreg.registerhi)+':??:'+std_regname(locreg.register)
+                else
+                  result:=std_regname(GetNextReg(locreg.registerhi))+':'+std_regname(locreg.registerhi)+':'+std_regname(GetNextReg(locreg.register))+':'+std_regname(locreg.register);
+              OS_32,OS_S32:
+                if getsupreg(locreg.register)<first_int_imreg then
+                  result:='??:'+std_regname(locreg.register)
+                else
+                  result:=std_regname(GetNextReg(locreg.register))+':'+std_regname(locreg.register);
+{$elseif defined(cpu8bitalu)}
+              OS_64,OS_S64:
+                if getsupreg(locreg.register)<first_int_imreg then
+                  result:='??:??:??:'+std_regname(locreg.registerhi)+':??:??:??:'+std_regname(locreg.register)
+                else
+                  result:=std_regname(GetNextReg(GetNextReg(GetNextReg(locreg.registerhi))))+':'+std_regname(GetNextReg(GetNextReg(locreg.registerhi)))+':'+std_regname(GetNextReg(locreg.registerhi))+':'+std_regname(locreg.registerhi)+':'+std_regname(GetNextReg(GetNextReg(GetNextReg(locreg.register))))+':'+std_regname(GetNextReg(GetNextReg(locreg.register)))+':'+std_regname(GetNextReg(locreg.register))+':'+std_regname(locreg.register);
+              OS_32,OS_S32:
+                if getsupreg(locreg.register)<first_int_imreg then
+                  result:='??:??:??:'+std_regname(locreg.register)
+                else
+                  result:=std_regname(GetNextReg(GetNextReg(GetNextReg(locreg.register))))+':'+std_regname(GetNextReg(GetNextReg(locreg.register)))+':'+std_regname(GetNextReg(locreg.register))+':'+std_regname(locreg.register);
+              OS_16,OS_S16:
+                if getsupreg(locreg.register)<first_int_imreg then
+                  result:='??:'+std_regname(locreg.register)
+                else
+                  result:=std_regname(GetNextReg(locreg.register))+':'+std_regname(locreg.register);
+{$endif}
+              else
+                result:=std_regname(locreg.register);
+            end;
+          end
+        else
+          begin
+            if locreg.registerhi<>NR_NO then
+              result:=std_regname(locreg.registerhi)+':'+std_regname(locreg.register)
+            else
+              result:=std_regname(locreg.register);
+          end;
+      end;
+
+
 end.
 

+ 1 - 4
compiler/crefs.pas

@@ -50,11 +50,8 @@ begin
 end;
 
 Function TRefItem.GetCopy : TLinkedListItem;
-var
-  NR : TRefItem;
 begin
-  NR.Create(RefInfo);
-  GetCopy:=NR;
+  Result:=TRefItem.Create(RefInfo);
 end;
 
 procedure TRefLinkedList.WriteToPPU;

+ 45 - 61
compiler/cresstr.pas

@@ -32,7 +32,7 @@ implementation
 
 uses
    SysUtils,
-   cclasses,
+   cclasses,widestr,
    cutils,globtype,globals,systems,
    symconst,symtype,symdef,symsym,
    verbose,fmodule,ppu,
@@ -64,7 +64,7 @@ uses
         constructor Create;
         destructor  Destroy;override;
         procedure CreateResourceStringData;
-        Procedure WriteResourceFile;
+        procedure WriteRSJFile;
         procedure RegisterResourceStrings;
       end;
 
@@ -214,89 +214,73 @@ uses
           current_asmdata.asmlists[al_resourcestrings].concat(Tai_const.create_sym(endsymlab));
       end;
 
-
-    Procedure Tresourcestrings.WriteResourceFile;
-      Type
-        TMode = (quoted,unquoted);
+    procedure Tresourcestrings.WriteRSJFile;
       Var
-        F : Text;
-        Mode : TMode;
-        R : TResourceStringItem;
-        C : char;
-        Col,i : longint;
-        ResFileName : string;
-
-        Procedure Add(Const S : String);
-        begin
-          Write(F,S);
-          inc(Col,length(s));
-        end;
-
+        F: Text;
+        R: TResourceStringItem;
+        ResFileName: string;
+        I: Integer;
+        C: tcompilerwidechar;
+        W: pcompilerwidestring;
       begin
-        ResFileName:=ChangeFileExt(current_module.ppufilename,'.rst');
+        ResFileName:=ChangeFileExt(current_module.ppufilename,'.rsj');
         message1 (general_i_writingresourcefile,ExtractFileName(ResFileName));
         Assign(F,ResFileName);
         {$push}{$i-}
         Rewrite(f);
         {$pop}
-        If IOresult<>0 then
+        if IOresult<>0 then
           begin
             message1(general_e_errorwritingresourcefile,ResFileName);
             exit;
           end;
+        writeln(f,'{"version":1,"strings":[');
         R:=TResourceStringItem(List.First);
         while assigned(R) do
           begin
-            writeln(f);
-            Writeln(f,'# hash value = ',R.Hash);
-            col:=0;
-            Add(R.Name+'=');
-            Mode:=unquoted;
-            For I:=0 to R.Len-1 do
-             begin
-               C:=R.Value[i];
-               If (ord(C)>31) and (Ord(c)<=128) and (c<>'''') then
-                begin
-                  If mode=Quoted then
-                   Add(c)
+            write(f, '{"hash":',R.Hash,',"name":"',R.Name,'","value":"');
+            initwidestring(W);
+            ascii2unicode(R.Value,R.Len,current_settings.sourcecodepage,W);
+            for I := 0 to W^.len - 1 do
+              begin
+                C := W^.Data[I];
+                case C of
+                  Ord('"'), Ord('\'), Ord('/'):
+                    write(f, '\', Chr(C));
+                  8:
+                    write(f, '\b');
+                  9:
+                    write(f, '\t');
+                  10:
+                    write(f, '\n');
+                  13:
+                    write(f, '\r');
+                  12:
+                    write(f, '\f');
                   else
-                   begin
-                     Add(''''+c);
-                     mode:=quoted
-                   end;
-                end
-               else
-                begin
-                  If Mode=quoted then
-                   begin
-                     Add('''');
-                     mode:=unquoted;
-                   end;
-                  Add('#'+tostr(ord(c)));
-                end;
-               If Col>72 then
-                begin
-                  if mode=quoted then
-                   Write (F,'''');
-                  Writeln(F,'+');
-                  Col:=0;
-                  Mode:=unQuoted;
+                  if (C < 32) or (C > 127) then
+                    write(f,'\u',hexStr(Longint(C), 4))
+                  else
+                    write(f,Chr(C));
                 end;
-             end;
-            if mode=quoted then
-             writeln (f,'''');
-            Writeln(f);
+              end;
+            donewidestring(W);
+            write(f,'"}');
             R:=TResourceStringItem(R.Next);
+            if assigned(R) then
+              writeln(f,',')
+            else
+              writeln(f);
           end;
+        writeln(f,']}');
         close(f);
       end;
 
-
     procedure Tresourcestrings.ConstSym_Register(p:TObject;arg:pointer);
       begin
         if (tsym(p).typ=constsym) and
            (tconstsym(p).consttyp=constresourcestring) then
-          List.Concat(tResourceStringItem.Create(TConstsym(p)));
+          List.Concat(TResourceStringItem.Create(TConstsym(p)));
       end;
 
 
@@ -318,7 +302,7 @@ uses
           begin
             current_module.flags:=current_module.flags or uf_has_resourcestrings;
             resstrs.CreateResourceStringData;
-            resstrs.WriteResourceFile;
+            resstrs.WriteRSJFile;
           end;
         resstrs.Free;
       end;

+ 9 - 1
compiler/cstreams.pas

@@ -450,6 +450,11 @@ begin
           l:=0;
          System.Seek(FHandle,l);
        end;
+     else
+       begin
+         CStreamError:=103;
+         l:=Offset;
+       end;
    end;
   {$pop}
   CStreamError:=IOResult;
@@ -549,7 +554,10 @@ begin
     Result:=FMemory
   else
     If NewCapacity=0 then
-      FreeMem (FMemory,Fcapacity)
+      begin
+        FreeMem (FMemory,Fcapacity);
+        Result:=nil;
+      end
     else
       begin
       GetMem (Result,NewCapacity);

+ 5 - 1
compiler/dbgdwarf.pas

@@ -1806,7 +1806,7 @@ implementation
           append_labelentry_ref(DW_AT_type,def_dwarf_lab(cansichartype));
           finish_entry;
           append_entry(DW_TAG_subrange_type,false,[
-            DW_AT_lower_bound,DW_FORM_udata,0,
+            DW_AT_lower_bound,DW_FORM_udata,1,
             DW_AT_upper_bound,DW_FORM_udata,qword(slen)
             ]);
           append_labelentry_ref(DW_AT_type,def_dwarf_lab(lendef));
@@ -2330,6 +2330,8 @@ implementation
         has_high_reg : boolean;
         dreg,dreghigh : byte;
       begin
+        blocksize:=0;
+        dreghigh:=0;
         { external symbols can't be resolved at link time, so we
           can't generate stabs for them
 
@@ -2886,6 +2888,8 @@ implementation
               templist.free;
               exit;
             end;
+          else
+            internalerror(2013120111);
         end;
 
         append_entry(DW_TAG_variable,false,[

+ 1 - 0
compiler/dbgstabs.pas

@@ -241,6 +241,7 @@ implementation
       len:=0;
       varcounter:=0;
       varptr:=@varvaluedata[0];
+      varvalues[0]:=nil;
       while i<=length(s) do
         begin
           if (s[i]='$') and (i<length(s)) then

+ 1 - 0
compiler/dbgstabx.pas

@@ -274,6 +274,7 @@ implementation
       if vo_is_external in sym.varoptions then
         exit;
       ismem:=not(sym.localloc.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_MMREGISTER,LOC_CMMREGISTER,LOC_FPUREGISTER,LOC_CFPUREGISTER]);
+      isglobal:=false;
       if ismem then
         isglobal:=current_asmdata.RefAsmSymbol(sym.mangledname).bind=AB_GLOBAL;
 

+ 1 - 0
compiler/dirparse.pas

@@ -255,6 +255,7 @@ implementation
         opt   : ttargetswitch;
       begin
         result:=true;
+        value:='';
         repeat
           tok:=GetToken(s,',');
           if tok='' then

+ 3 - 0
compiler/fpcdefs.inc

@@ -74,6 +74,7 @@
   {$define cpurox}
   {$define SUPPORT_SAFECALL}
   {$define SUPPORT_GET_FRAME}
+  {$define cpucapabilities}
 {$endif i386}
 
 {$ifdef x86_64}
@@ -89,6 +90,7 @@
   {$define cpurefshaveindexreg}
   {$define SUPPORT_SAFECALL}
   {$define SUPPORT_GET_FRAME}
+  {$define cpucapabilities}
 {$endif x86_64}
 
 {$ifdef ia64}
@@ -222,6 +224,7 @@
   {$define cpurefshaveindexreg}
   {$define fpc_compiler_has_fixup_jmps}
   {$define SUPPORT_GET_FRAME}
+  {$define SUPPORT_SAFECALL}
 {$endif mips}
 
 {$ifdef jvm}

+ 14 - 8
compiler/globals.pas

@@ -228,6 +228,9 @@ interface
        wpofeedbackinput,
        wpofeedbackoutput : TPathStr;
 
+       { external assembler extra option }
+       asmextraopt       : string;
+
        { things specified with parameters }
        paratarget        : tsystem;
        paratargetdbg     : tdbg;
@@ -253,8 +256,11 @@ interface
        do_build,
        do_release,
        do_make       : boolean;
+       { Path to ppc }
+       exepath       : TPathStr;
+       { Path to unicode charmap/collation binaries }
+       unicodepath   : TPathStr;
        { path for searching units, different paths can be seperated by ; }
-       exepath            : TPathStr;  { Path to ppc }
        librarysearchpath,
        unitsearchpath,
        objectsearchpath,
@@ -324,7 +330,7 @@ interface
      { parameter switches }
        debugstop : boolean;
 {$EndIf EXTDEBUG}
-       { Application type (platform specific) 
+       { Application type (platform specific)
          see globtype.pas for description }
        apptype : tapptype;
 
@@ -560,11 +566,7 @@ implementation
       macutils,
 {$endif}
 {$ifdef mswindows}
-{$ifdef VER2_4}
-      cwindirs,
-{$else VER2_4}
       windirs,
-{$endif VER2_4}
 {$endif}
       comphook;
 
@@ -916,7 +918,7 @@ implementation
         {$undef GETENVOK}
       {$else}
         GetEnvPchar:=StrPNew(GetEnvironmentVariable(envname));
-        if (length(GetEnvPChar)=0) then 
+        if (length(GetEnvPChar)=0) then
           begin
             FreeEnvPChar(GetEnvPChar);
             GetEnvPChar:=nil;
@@ -1107,7 +1109,9 @@ implementation
              (abiinfo[t].name=hs) then
             begin
               a:=t;
-              result:=true;
+              { abi_old_win32_gnu is a win32 i386 specific "feature" }
+              if (t<>abi_old_win32_gnu) or (target_info.system=system_i386_win32) then
+                result:=true;
               break;
             end;
       end;
@@ -1299,6 +1303,7 @@ implementation
 {$endif need_path_search}
      begin
        localexepath:=GetEnvironmentVariable('PPC_EXEC_PATH');
+       exeName := '';
        if localexepath='' then
          begin
            exeName := FixFileName(system.paramstr(0));
@@ -1364,6 +1369,7 @@ implementation
         sysrootpath:='';
 
         { Search Paths }
+        unicodepath:='';
         librarysearchpath:=TSearchPathList.Create;
         unitsearchpath:=TSearchPathList.Create;
         includesearchpath:=TSearchPathList.Create;

+ 9 - 6
compiler/globtype.pas

@@ -263,7 +263,7 @@ interface
          cs_opt_level1,cs_opt_level2,cs_opt_level3,
          cs_opt_regvar,cs_opt_uncertain,cs_opt_size,cs_opt_stackframe,
          cs_opt_peephole,cs_opt_asmcse,cs_opt_loopunroll,cs_opt_tailrecursion,cs_opt_nodecse,
-         cs_opt_nodedfa,cs_opt_loopstrength,cs_opt_scheduler,cs_opt_autoinline,cs_useebp,
+         cs_opt_nodedfa,cs_opt_loopstrength,cs_opt_scheduler,cs_opt_autoinline,cs_useebp,cs_userbp,
          cs_opt_reorder_fields,cs_opt_fastmath,
          { Allow removing expressions whose result is not used, even when this
            can change program behaviour (range check errors disappear,
@@ -308,7 +308,7 @@ interface
          'LEVEL1','LEVEL2','LEVEL3',
          'REGVAR','UNCERTAIN','SIZE','STACKFRAME',
          'PEEPHOLE','ASMCSE','LOOPUNROLL','TAILREC','CSE',
-         'DFA','STRENGTH','SCHEDULE','AUTOINLINE','USEEBP',
+         'DFA','STRENGTH','SCHEDULE','AUTOINLINE','USEEBP','USERBP',
          'ORDERFIELDS','FASTMATH','DEADVALUES','REMOVEEMPTYPROCS',
          'CONSTPROP',
          'DEADSTORE'
@@ -336,7 +336,7 @@ interface
        { switches being applied to all CPUs at the given level }
        genericlevel1optimizerswitches = [cs_opt_level1];
        genericlevel2optimizerswitches = [cs_opt_level2,cs_opt_remove_emtpy_proc];
-       genericlevel3optimizerswitches = [cs_opt_level3,cs_opt_constant_propagate];
+       genericlevel3optimizerswitches = [cs_opt_level3,cs_opt_constant_propagate,cs_opt_nodedfa];
        genericlevel4optimizerswitches = [cs_opt_reorder_fields,cs_opt_dead_values,cs_opt_fastmath];
 
        { whole program optimizations whose information generation requires
@@ -389,8 +389,10 @@ interface
          m_final_fields,        { allows declaring fields as "final", which means they must be initialised
                                   in the (class) constructor and are constant from then on (same as final
                                   fields in Java) }
-         m_default_unicodestring { makes the default string type in $h+ mode unicodestring rather than
-                                   ansistring; similarly, char becomes unicodechar rather than ansichar }
+         m_default_unicodestring, { makes the default string type in $h+ mode unicodestring rather than
+                                    ansistring; similarly, char becomes unicodechar rather than ansichar }
+         m_type_helpers         { allows the declaration of "type helper" (non-Delphi) or "record helper"
+                                  (Delphi) for primitive types }
        );
        tmodeswitches = set of tmodeswitch;
 
@@ -554,7 +556,8 @@ interface
          'ISOUNARYMINUS',
          'SYSTEMCODEPAGE',
          'FINALFIELDS',
-         'UNICODESTRINGS');
+         'UNICODESTRINGS',
+         'TYPEHELPERS');
 
 
      type

+ 15 - 10
compiler/hlcg2ll.pas

@@ -151,7 +151,6 @@ unit hlcg2ll;
 
           function a_call_name(list : TAsmList;pd : tprocdef;const s : TSymStr; forceresdef: tdef; weak: boolean): tcgpara;override;
           procedure a_call_reg(list : TAsmList;pd : tabstractprocdef;reg : tregister);override;
-          procedure a_call_ref(list : TAsmList;pd : tabstractprocdef;const ref : treference);override;
           { same as a_call_name, might be overridden on certain architectures to emit
             static calls without usage of a got trampoline }
           function a_call_name_static(list : TAsmList;pd : tprocdef;const s : TSymStr; forceresdef: tdef): tcgpara;override;
@@ -462,11 +461,6 @@ implementation
       cg.a_call_reg(list,reg);
     end;
 
-  procedure thlcg2ll.a_call_ref(list: TAsmList; pd: tabstractprocdef; const ref: treference);
-    begin
-      cg.a_call_ref(list,ref);
-    end;
-
   function thlcg2ll.a_call_name_static(list: TAsmList; pd: tprocdef; const s: TSymStr; forceresdef: tdef): tcgpara;
     begin
       cg.a_call_name_static(list,s);
@@ -1021,7 +1015,7 @@ implementation
       hl: tasmlabel;
       oldloc : tlocation;
       const_location: boolean;
-      dst_cgsize: tcgsize;
+      dst_cgsize,tmpsize: tcgsize;
     begin
       oldloc:=l;
       dst_cgsize:=def_cgsize(dst_size);
@@ -1051,7 +1045,7 @@ implementation
 {$ifdef cpuflags}
               LOC_FLAGS :
                 begin
-                  cg.g_flags2reg(list,OS_INT,l.resflags,hregister);
+                  cg.g_flags2reg(list,OS_32,l.resflags,hregister);
                   cg.a_reg_dealloc(list,NR_DEFAULTFLAGS);
                 end;
 {$endif cpuflags}
@@ -1064,6 +1058,9 @@ implementation
                   cg.a_label(list,current_procinfo.CurrFalseLabel);
                   cg.a_load_const_reg(list,OS_INT,0,hregister);
                   cg.a_label(list,hl);
+{$if defined(cpu8bitalu) or defined(cpu16bitalu)}
+                  cg.a_load_reg_reg(list,OS_INT,OS_32,hregister,hregister);
+{$endif}
                 end;
               else
                 a_load_loc_reg(list,src_size,u32inttype,l,hregister);
@@ -1149,13 +1146,21 @@ implementation
 {$endif cpuflags}
             LOC_JUMP :
               begin
+                tmpsize:=dst_cgsize;
+{$if defined(cpu8bitalu) or defined(cpu16bitalu)}
+                if TCGSize2Size[dst_cgsize]>TCGSize2Size[OS_INT] then
+                  tmpsize:=OS_INT;
+{$endif}
                 cg.a_label(list,current_procinfo.CurrTrueLabel);
-                cg.a_load_const_reg(list,dst_cgsize,1,hregister);
+                cg.a_load_const_reg(list,tmpsize,1,hregister);
                 current_asmdata.getjumplabel(hl);
                 cg.a_jmp_always(list,hl);
                 cg.a_label(list,current_procinfo.CurrFalseLabel);
-                cg.a_load_const_reg(list,dst_cgsize,0,hregister);
+                cg.a_load_const_reg(list,tmpsize,0,hregister);
                 cg.a_label(list,hl);
+{$if defined(cpu8bitalu) or defined(cpu16bitalu)}
+                cg.a_load_reg_reg(list,tmpsize,dst_cgsize,hregister,hregister);
+{$endif}
               end;
             else
               begin

+ 16 - 29
compiler/hlcgobj.pas

@@ -195,7 +195,6 @@ unit hlcgobj;
           }
           function a_call_name(list : TAsmList;pd : tprocdef;const s : TSymStr; forceresdef: tdef; weak: boolean): tcgpara;virtual;abstract;
           procedure a_call_reg(list : TAsmList;pd : tabstractprocdef;reg : tregister);virtual;abstract;
-          procedure a_call_ref(list : TAsmList;pd : tabstractprocdef;const ref : treference);virtual;
           { same as a_call_name, might be overridden on certain architectures to emit
             static calls without usage of a got trampoline }
           function a_call_name_static(list : TAsmList;pd : tprocdef;const s : TSymStr; forceresdef: tdef): tcgpara;virtual;
@@ -987,22 +986,6 @@ implementation
          end;
     end;
 
-  procedure thlcgobj.a_call_ref(list: TAsmList; pd: tabstractprocdef; const ref: treference);
-    var
-      reg: tregister;
-      size: tdef;
-    begin
-      { the loaded data is always a pointer to a procdef. A procvardef is
-        implicitly a pointer already, but a procdef isn't -> create one }
-      if pd.typ=procvardef then
-        size:=pd
-      else
-        size:=getpointerdef(pd);
-      reg:=getaddressregister(list,size);
-      a_load_ref_reg(list,size,size,ref,reg);
-      a_call_reg(list,pd,reg);
-    end;
-
   function thlcgobj.a_call_name_static(list: TAsmList; pd: tprocdef; const s: TSymStr; forceresdef: tdef): tcgpara;
     begin
       result:=a_call_name(list,pd,s,forceresdef,false);
@@ -1562,6 +1545,8 @@ implementation
                   tmpreg:=getintregister(list,locsize);
                   a_load_const_reg(list,locsize,loc.value,tmpreg);
                 end;
+              else
+                internalerror(2013112909);
             end;
             a_bit_test_reg_reg_reg(list,bitnumbersize,locsize,destsize,bitnumber,tmpreg,destreg);
           end;
@@ -2229,7 +2214,9 @@ implementation
       tmpreg: tregister;
       subsetregdef: torddef;
       stopbit: byte;
+
     begin
+      tmpreg:=NR_NO;
       subsetregdef:=cgsize_orddef(sreg.subsetregsize);
       stopbit:=sreg.startbit+sreg.bitlen;
       // on x86(64), 1 shl 32(64) = 1 instead of 0
@@ -2245,7 +2232,10 @@ implementation
            if (slopt<>SL_REGNOSRCMASK) then
             a_op_const_reg(list,OP_AND,subsetregdef,tcgint(not(bitmask)),tmpreg);
         end;
-      if (slopt<>SL_SETMAX) then
+      if (slopt<>SL_SETMAX) and
+      { the "and" is not needed if the whole register is modified (except for SL_SETZERO),
+        because later on we do a move in this case instead of an or }
+        ((sreg.bitlen<>AIntBits) or (slopt=SL_SETZERO)) then
         a_op_const_reg(list,OP_AND,subsetregdef,tcgint(bitmask),sreg.subsetreg);
 
       case slopt of
@@ -2257,8 +2247,11 @@ implementation
               sreg.subsetreg)
           else
             a_load_const_reg(list,subsetregdef,-1,sreg.subsetreg);
+        { if the whole register is modified, no "or" is needed }
+        else if sreg.bitlen=AIntBits then
+          a_load_reg_reg(list,subsetregdef,subsetregdef,tmpreg,sreg.subsetreg)
         else
-          a_op_reg_reg(list,OP_OR,subsetregdef,tmpreg,sreg.subsetreg);
+          a_op_reg_reg(list,OP_OR,subsetregdef,tmpreg,sreg.subsetreg)
        end;
     end;
 
@@ -3565,6 +3558,8 @@ implementation
           toreg:=getaddressregister(list,regsize);
         R_FPUREGISTER:
           toreg:=getfpuregister(list,regsize);
+        else
+          internalerror(2013112910);
       end;
       a_load_reg_reg(list,regsize,regsize,fromreg,toreg);
     end;
@@ -3580,6 +3575,8 @@ implementation
             toreg:=getaddressregister(list,regsize);
           R_FPUREGISTER:
             toreg:=getfpuregister(list,regsize);
+        else
+          internalerror(2013112915);
         end;
         a_load_reg_reg(list,regsize,regsize,fromreg,toreg);
       end;
@@ -3878,12 +3875,6 @@ implementation
     end;
 
 
-  function use_ent : boolean;
-    begin
-	  use_ent := (target_info.system in [system_mipsel_linux,system_mipseb_linux])
-	             or (target_info.cpu=cpu_alpha);
-    end;
-
   procedure thlcgobj.gen_proc_symbol(list: TAsmList);
     var
       item,
@@ -3917,15 +3908,11 @@ implementation
           previtem:=item;
           item := TCmdStrListItem(item.next);
         end;
-      if (use_ent) then
-        list.concat(Tai_directive.create(asd_ent,current_procinfo.procdef.mangledname));
       current_procinfo.procdef.procstarttai:=tai(list.last);
     end;
 
   procedure thlcgobj.gen_proc_symbol_end(list: TAsmList);
     begin
-      if (use_ent) then
-        list.concat(Tai_directive.create(asd_ent_end,current_procinfo.procdef.mangledname));
       list.concat(Tai_symbol_end.Createname(current_procinfo.procdef.mangledname));
 
       current_procinfo.procdef.procendtai:=tai(list.last);

+ 9 - 2
compiler/htypechk.pas

@@ -870,6 +870,8 @@ implementation
       begin
         isbinaryoverloaded:=false;
         operpd:=nil;
+        ppn:=nil;
+
         { load easier access variables }
         ld:=tbinarynode(t).left.resultdef;
         rd:=tbinarynode(t).right.resultdef;
@@ -972,6 +974,8 @@ implementation
     { marks an lvalue as "unregable" }
     procedure make_not_regable_intern(p : tnode; how: tregableinfoflags; records_only: boolean);
       begin
+        if ra_addr_taken in how then
+          include(p.flags,nf_address_taken);
         repeat
           case p.nodetype of
             subscriptn:
@@ -1153,6 +1157,9 @@ implementation
              vecn:
                begin
                  set_varstate(tbinarynode(p).right,vs_read,[vsf_must_be_valid]);
+                 { dyn. arrays and dyn. strings are read }
+                 if is_implicit_array_pointer(tunarynode(p).left.resultdef) then
+                   newstate:=vs_read;
                  if (newstate in [vs_read,vs_readwritten]) or
                     not(tunarynode(p).left.resultdef.typ in [stringdef,arraydef]) then
                    include(varstateflags,vsf_must_be_valid)
@@ -1510,8 +1517,8 @@ implementation
                      begin
                        { pointer -> array conversion is done then we need to see it
                          as a deref, because a ^ is then not required anymore }
-                       if (ttypeconvnode(hp).left.resultdef.typ=pointerdef) then
-                        gotderef:=true;
+                       if ttypeconvnode(hp).convtype=tc_pointer_2_array then
+                         gotderef:=true;
                      end;
                  end;
                  hp:=ttypeconvnode(hp).left;

+ 16 - 10
compiler/i386/aopt386.pas

@@ -64,13 +64,16 @@ Begin
    { or nil                                                                }
      While Assigned(BlockStart) Do
        Begin
-         if pass = 0 then
-           PrePeepHoleOpts(AsmL, BlockStart, BlockEnd);
-        { Peephole optimizations }
-         PeepHoleOptPass1(AsmL, BlockStart, BlockEnd);
-        { Only perform them twice in the first pass }
-         if pass = 0 then
-           PeepHoleOptPass1(AsmL, BlockStart, BlockEnd);
+         if (cs_opt_peephole in current_settings.optimizerswitches) then
+           begin
+            if (pass = 0) then
+              PrePeepHoleOpts(AsmL, BlockStart, BlockEnd);
+              { Peephole optimizations }
+               PeepHoleOptPass1(AsmL, BlockStart, BlockEnd);
+              { Only perform them twice in the first pass }
+               if pass = 0 then
+                 PeepHoleOptPass1(AsmL, BlockStart, BlockEnd);
+           end;
         { Data flow analyzer }
          If (cs_opt_asmcse in current_settings.optimizerswitches) Then
            begin
@@ -79,9 +82,12 @@ Begin
                changed := CSE(asmL, blockStart, blockEnd, pass) or changed;
            end;
         { More peephole optimizations }
-         PeepHoleOptPass2(AsmL, BlockStart, BlockEnd);
-         if lastLoop then
-           PostPeepHoleOpts(AsmL, BlockStart, BlockEnd);
+         if (cs_opt_peephole in current_settings.optimizerswitches) then
+           begin
+             PeepHoleOptPass2(AsmL, BlockStart, BlockEnd);
+             if lastLoop then
+               PostPeepHoleOpts(AsmL, BlockStart, BlockEnd);
+           end;
 
         { Free memory }
         dfa.clear;

+ 17 - 22
compiler/i386/cgcpu.pas

@@ -294,17 +294,15 @@ unit cgcpu;
 
     procedure tcg386.g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean);
 
-      procedure increase_fp(a : tcgint);
+      procedure increase_sp(a : tcgint);
         var
           href : treference;
         begin
-          reference_reset_base(href,current_procinfo.framepointer,a,0);
+          reference_reset_base(href,NR_STACK_POINTER_REG,a,0);
           { normally, lea is a better choice than an add }
-          list.concat(Taicpu.op_ref_reg(A_LEA,TCGSize2OpSize[OS_ADDR],href,current_procinfo.framepointer));
+          list.concat(Taicpu.op_ref_reg(A_LEA,TCGSize2OpSize[OS_ADDR],href,NR_STACK_POINTER_REG));
         end;
 
-      var
-        stacksize : longint;
       begin
         { MMX needs to call EMMS }
         if assigned(rg[R_MMXREGISTER]) and
@@ -314,20 +312,15 @@ unit cgcpu;
         { remove stackframe }
         if not nostackframe then
           begin
-            if current_procinfo.framepointer=NR_STACK_POINTER_REG then
+            if (current_procinfo.framepointer=NR_STACK_POINTER_REG) or
+               (current_procinfo.procdef.proctypeoption=potype_exceptfilter) then
               begin
-                stacksize:=current_procinfo.calc_stackframe_size;
-                if (target_info.stackalign>4) and
-                   ((stacksize <> 0) or
-                    (pi_do_call in current_procinfo.flags) or
-                    { can't detect if a call in this case -> use nostackframe }
-                    { if you (think you) know what you are doing              }
-                    (po_assembler in current_procinfo.procdef.procoptions)) then
-                  stacksize := align(stacksize+sizeof(aint),target_info.stackalign) - sizeof(aint);
-                if stacksize<>0 then
-                  increase_fp(stacksize);
+                if current_procinfo.final_localsize<>0 then
+                  increase_sp(current_procinfo.final_localsize);
                 if (not paramanager.use_fixed_stack) then
                   internal_restore_regs(list,true);
+                if (current_procinfo.procdef.proctypeoption=potype_exceptfilter) then
+                  list.concat(Taicpu.op_reg(A_POP,tcgsize2opsize[OS_ADDR],NR_FRAME_POINTER_REG));
               end
             else
               begin
@@ -387,7 +380,8 @@ unit cgcpu;
            { but not on win32 }
            { and not for safecall with hidden exceptions, because the result }
            { wich contains the exception is passed in EAX }
-           if (target_info.system <> system_i386_win32) and
+           if ((target_info.system <> system_i386_win32) or
+               (target_info.abi=abi_old_win32_gnu)) and
               not ((current_procinfo.procdef.proccalloption = pocall_safecall) and
                (tf_safecall_exceptions in target_info.flags)) and
               paramanager.ret_in_param(current_procinfo.procdef.returndef,
@@ -561,7 +555,7 @@ unit cgcpu;
       begin
         if not paramanager.use_fixed_stack then
           begin
-            cg.a_reg_alloc(list,NR_FUNCTION_RESULT_REG);
+            a_reg_alloc(list,NR_FUNCTION_RESULT_REG);
             list.concat(Taicpu.op_reg(A_POP,tcgsize2opsize[OS_INT],NR_FUNCTION_RESULT_REG))
           end
         else
@@ -590,7 +584,7 @@ unit cgcpu;
                (current_settings.optimizecputype in [cpu_Pentium2,cpu_Pentium3,cpu_Pentium4]) } then
               begin
                 current_module.requires_ebx_pic_helper:=true;
-                cg.a_call_name_static(list,'fpc_geteipasebx');
+                a_call_name_static(list,'fpc_geteipasebx');
               end
             else
               begin
@@ -654,6 +648,7 @@ unit cgcpu;
           for i:=0 to procdef.paras.count-1 do
            begin
              hp:=tparavarsym(procdef.paras[i]);
+             procdef.init_paraloc_info(calleeside);
              paraloc:=hp.paraloc[calleeside].Location;
              while paraloc<>nil do
                begin
@@ -679,7 +674,7 @@ unit cgcpu;
               else
                 selfoffsetfromsp:=sizeof(aint);
               reference_reset_base(href,NR_ESP,selfoffsetfromsp+offs,4);
-              cg.a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,NR_EAX);
+              a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,NR_EAX);
             end;
         end;
 
@@ -689,7 +684,7 @@ unit cgcpu;
         begin
           { mov  0(%eax),%reg ; load vmt}
           reference_reset_base(href,NR_EAX,0,4);
-          cg.a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,reg);
+          a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,reg);
         end;
 
       procedure op_onregmethodaddr(op: TAsmOp; reg: tregister);
@@ -712,7 +707,7 @@ unit cgcpu;
             Internalerror(200006139);
           { mov vmtoffs(%eax),%eax ; method offs }
           reference_reset_base(href,NR_EAX,tobjectdef(procdef.struct).vmtmethodoffset(procdef.extnumber),4);
-          cg.a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,NR_EAX);
+          a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,NR_EAX);
         end;
 
 

+ 2 - 0
compiler/i386/cpuelf.pas

@@ -110,6 +110,8 @@ implementation
           result:=R_386_GOTPC;
         RELOC_PLT32 :
           result:=R_386_PLT32;
+        RELOC_GOTOFF:
+          result:=R_386_GOTOFF;
       else
         result:=0;
         InternalError(2012082301);

+ 34 - 2
compiler/i386/cpuinfo.pas

@@ -46,7 +46,10 @@ Type
        cpu_Pentium2,
        cpu_Pentium3,
        cpu_Pentium4,
-       cpu_PentiumM
+       cpu_PentiumM,
+       cpu_core_i,
+       cpu_core_avx,
+       cpu_core_avx2
       );
 
    tfputype =
@@ -85,7 +88,10 @@ Const
      'PENTIUM2',
      'PENTIUM3',
      'PENTIUM4',
-     'PENTIUMM'
+     'PENTIUMM',
+     'COREI',
+     'COREAVX',
+     'COREAVX2'
    );
 
    fputypestr : array[tfputype] of string[6] = ('',
@@ -123,6 +129,32 @@ Const
    level3optimizerswitches = genericlevel3optimizerswitches + level2optimizerswitches + [{,cs_opt_loopunroll}];
    level4optimizerswitches = genericlevel4optimizerswitches + level3optimizerswitches + [cs_useebp];
 
+type
+   tcpuflags =
+      (CPUX86_HAS_SSEUNIT,
+       CPUX86_HAS_BMI1,
+       CPUX86_HAS_BMI2,
+       CPUX86_HAS_POPCNT,
+       CPUX86_HAS_AVXUNIT,
+       CPUX86_HAS_LZCNT,
+       CPUX86_HAS_MOVBE
+      );
+
+ const
+   cpu_capabilities : array[tcputype] of set of tcpuflags = (
+     { cpu_none      } [],
+     { cpu_386       } [],
+     { cpu_Pentium   } [],
+     { cpu_Pentium2  } [],
+     { cpu_Pentium3  } [CPUX86_HAS_SSEUNIT],
+     { cpu_Pentium4  } [CPUX86_HAS_SSEUNIT],
+     { cpu_PentiumM  } [CPUX86_HAS_SSEUNIT],
+     { cpu_core_i    } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT],
+     { cpu_core_avx  } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT],
+     { cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE]
+   );
+
+
 Implementation
 
 end.

+ 3 - 0
compiler/i386/cpunode.pas

@@ -54,6 +54,9 @@ unit cpunode;
        n386mem,
        n386set,
        n386inl,
+{$ifdef TEST_WIN32_SEH}
+       n386flw,
+{$endif TEST_WIN32_SEH}
        n386mat
        ;
 

+ 0 - 53
compiler/i386/cpupara.pas

@@ -40,12 +40,6 @@ unit cpupara;
           function get_volatile_registers_int(calloption : tproccalloption):tcpuregisterset;override;
           function get_volatile_registers_fpu(calloption : tproccalloption):tcpuregisterset;override;
           function get_volatile_registers_mm(calloption : tproccalloption):tcpuregisterset;override;
-          { Returns the location for the nr-st 32 Bit int parameter
-            if every parameter before is an 32 Bit int parameter as well
-            and if the calling conventions for the helper routines of the
-            rtl are used.
-          }
-          procedure getintparaloc(pd : tabstractprocdef; nr : longint; var cgpara : tcgpara);override;
           function create_paraloc_info(p : tabstractprocdef; side: tcallercallee):longint;override;
           function create_varargs_paraloc_info(p : tabstractprocdef; varargspara:tvarargsparalist):longint;override;
           procedure createtempparaloc(list: TAsmList;calloption : tproccalloption;parasym : tparavarsym;can_use_final_stack_loc : boolean;var cgpara:TCGPara);override;
@@ -274,53 +268,6 @@ unit cpupara;
       end;
 
 
-    procedure tcpuparamanager.getintparaloc(pd : tabstractprocdef; nr : longint; var cgpara : tcgpara);
-      var
-        paraloc : pcgparalocation;
-        psym: tparavarsym;
-        pdef: tdef;
-      begin
-        psym:=tparavarsym(pd.paras[nr-1]);
-        pdef:=psym.vardef;
-        if push_addr_param(psym.varspez,pdef,pd.proccalloption) then
-          pdef:=getpointerdef(pdef);
-        cgpara.reset;
-        cgpara.size:=def_cgsize(pdef);
-        cgpara.intsize:=tcgsize2size[cgpara.size];
-        cgpara.alignment:=get_para_align(pd.proccalloption);
-        cgpara.def:=pdef;
-        paraloc:=cgpara.add_location;
-        with paraloc^ do
-         begin
-           size:=def_cgsize(pdef);
-           def:=pdef;
-           if pd.proccalloption=pocall_register then
-             begin
-               if (nr<=length(parasupregs)) then
-                 begin
-                   if nr=0 then
-                     internalerror(200309271);
-                   loc:=LOC_REGISTER;
-                   register:=newreg(R_INTREGISTER,parasupregs[nr-1],R_SUBWHOLE);
-                 end
-               else
-                 begin
-                   loc:=LOC_REFERENCE;
-                   reference.index:=NR_STACK_POINTER_REG;
-                   { the previous parameters didn't take up room in memory }
-                   reference.offset:=sizeof(aint)*(nr-length(parasupregs)-1)
-                 end;
-             end
-           else
-             begin
-               loc:=LOC_REFERENCE;
-               reference.index:=NR_STACK_POINTER_REG;
-               reference.offset:=sizeof(aint)*nr;
-             end;
-          end;
-      end;
-
-
     function  tcpuparamanager.get_funcretloc(p : tabstractprocdef; side: tcallercallee; forcetempdef: tdef): TCGPara;
       var
         retcgsize  : tcgsize;

+ 17 - 7
compiler/i386/csopt386.pas

@@ -353,6 +353,7 @@ var
 
   function getNextRegToTest(var prev: tai; currentReg: tsuperregister): tsuperregister;
   begin
+    getNextRegToTest := RS_INVALID;
     if not checkingPrevSequences then
       begin
         if (currentreg = RS_INVALID) then
@@ -423,9 +424,13 @@ begin {CheckSequence}
   TmpResult := False;
   FillChar(OrgRegInfo, Sizeof(OrgRegInfo), 0);
   FillChar(startRegInfo, sizeof(startRegInfo), 0);
+  FillChar(HighRegInfo, sizeof(HighRegInfo), 0);
+  FillChar(prevreginfo, sizeof(prevreginfo), 0);
   OrgRegFound := 0;
   HighFound := 0;
   OrgRegResult := False;
+  highPrev := nil;
+  orgPrev := nil;
   with startRegInfo do
     begin
       newRegsEncountered := [RS_EBP, RS_ESP];
@@ -1074,6 +1079,7 @@ var
   prev: tai;
   newOrgRegRState, newOrgRegWState: byte;
 begin
+  newOrgRegwState := 0;
   if getLastInstruction(hp,prev) then
     with ptaiprop(prev.optinfo)^ do
       begin
@@ -1429,6 +1435,7 @@ var
 
 begin
   replacereg := false;
+  readStateChanged := false;
   if canreplacereg(orgsupreg,newsupreg,p,orgregcanbemodified,newregmodified, orgregread, removelast,endp) then
     begin
 {$ifdef replaceregdebug}
@@ -1595,12 +1602,12 @@ var
   regcounter: tsuperregister;
   optimizable: boolean;
 begin
+  memtoreg := NR_NO;
+
   if not getlastinstruction(t,hp) or
      not issimplememloc(ref) then
-    begin
-      memtoreg := NR_NO;
-      exit;
-    end;
+    exit;
+
   p := ptaiprop(hp.optinfo);
   optimizable := false;
   for regcounter := RS_EAX to RS_EDI do
@@ -1832,15 +1839,18 @@ procedure doCSE(asml: TAsmList; First, Last: tai; findPrevSeqs, doSubOpts: boole
  removed immediately because sometimes an instruction needs to be checked in
  two different sequences}
 var cnt, cnt2, {cnt3,} orgNrofMods: longint;
-    p, hp1, hp2, prevSeq: tai;
-    hp4: tai;
-    hp5 : tai;
+    p, hp1, hp2, hp4, hp5, prevSeq: tai;
     reginfo: toptreginfo;
     memreg: tregister;
     regcounter: tsuperregister;
 begin
   p := First;
   SkipHead(p);
+  hp1 := nil;
+  hp2 := nil;
+  hp4 := nil;
+  hp5 := nil;
+  cnt := 0;
   while (p <> Last) do
     begin
       case p.typ of

+ 1 - 0
compiler/i386/daopt386.pas

@@ -901,6 +901,7 @@ var
   Cnt: Word;
 begin
   TmpResult := False;
+  Result := False;
   if supreg = RS_INVALID then
     exit;
   if (p1.typ = ait_instruction) then

+ 1 - 0
compiler/i386/i386att.inc

@@ -946,6 +946,7 @@
 'vzeroupper',
 'andn',
 'bextr',
+'tzcnt',
 'rorx',
 'sarx',
 'shlx',

+ 1 - 0
compiler/i386/i386atts.inc

@@ -963,5 +963,6 @@ attsufNONE,
 attsufNONE,
 attsufNONE,
 attsufNONE,
+attsufNONE,
 attsufNONE
 );

+ 1 - 0
compiler/i386/i386int.inc

@@ -946,6 +946,7 @@
 'vzeroupper',
 'andn',
 'bextr',
+'tzcnt',
 'rorx',
 'sarx',
 'shlx',

+ 1 - 1
compiler/i386/i386nop.inc

@@ -1,2 +1,2 @@
 { don't edit, this file is generated from x86ins.dat }
-1657;
+1658;

+ 1 - 0
compiler/i386/i386op.inc

@@ -946,6 +946,7 @@ A_VZEROALL,
 A_VZEROUPPER,
 A_ANDN,
 A_BEXTR,
+A_TZCNT,
 A_RORX,
 A_SARX,
 A_SHLX,

+ 79 - 78
compiler/i386/i386prop.inc

@@ -224,8 +224,8 @@
 (Ch: (Ch_Mop2, Ch_Rop1, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Mop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Mop2, Ch_Rop1, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -389,7 +389,7 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_ROp1, Ch_WOp2, Ch_RFLAGS)),
+(Ch: (Ch_ROp1, Ch_RWOp2, Ch_RFLAGS)),
 (Ch: (Ch_RFLAGS, Ch_None, Ch_None)),
 (Ch: (Ch_RFLAGS, Ch_WOp1, Ch_None)),
 (Ch: (Ch_Mop2, Ch_Rop1, Ch_None)),
@@ -489,8 +489,8 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Wop2, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Wop2, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -570,7 +570,7 @@
 (Ch: (Ch_Mop2, Ch_Rop1, Ch_None)),
 (Ch: (Ch_Mop2, Ch_Rop1, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Wop2, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -730,6 +730,10 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -753,6 +757,12 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -761,28 +771,20 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
@@ -796,7 +798,15 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -859,6 +869,14 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -867,6 +885,24 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -876,15 +912,26 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -893,63 +940,17 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_Wop1, Ch_Rop2, Ch_Rop3)),
-(Ch: (Ch_Wop1, Ch_Rop2, Ch_Rop3)),
-(Ch: (Ch_Wop1, Ch_Rop2, Ch_None)),
-(Ch: (Ch_Wop1, Ch_Rop2, Ch_Rop3)),
-(Ch: (Ch_Wop1, Ch_Rop2, Ch_Rop3)),
-(Ch: (Ch_Wop1, Ch_Rop2, Ch_Rop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Wop2, Ch_WFlags, Ch_Rop1)),
+(Ch: (Ch_Rop1, Ch_Wop2, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_Mop2, Ch_Rop1, Ch_None)),

+ 7 - 0
compiler/i386/i386tab.inc

@@ -11571,6 +11571,13 @@
     code    : #242#249#1#247#62#72;
     flags   : if_bmi1
   ),
+  (
+    opcode  : A_TZCNT;
+    ops     : 2;
+    optypes : (ot_reg16 or ot_bits32 or ot_bits64,ot_rm_gpr,ot_none,ot_none);
+    code    : #208#219#2#15#188#72;
+    flags   : if_bmi1 or if_sm
+  ),
   (
     opcode  : A_RORX;
     ops     : 3;

+ 146 - 21
compiler/i386/n386add.pas

@@ -31,10 +31,14 @@ interface
     type
        ti386addnode = class(tx86addnode)
          function use_generic_mul32to64: boolean; override;
+         function use_generic_mul64bit: boolean; override;
          procedure second_addordinal; override;
          procedure second_add64bit;override;
          procedure second_cmp64bit;override;
          procedure second_mul(unsigned: boolean);
+         procedure second_mul64bit;
+       protected
+         procedure set_mul_result_location;
        end;
 
   implementation
@@ -58,6 +62,12 @@ interface
       result := False;
     end;
 
+    function ti386addnode.use_generic_mul64bit: boolean;
+    begin
+      result:=(cs_check_overflow in current_settings.localswitches) or
+        (cs_opt_size in current_settings.optimizerswitches);
+    end;
+
     { handles all unsigned multiplications, and 32->64 bit signed ones.
       32bit-only signed mul is handled by generic codegen }
     procedure ti386addnode.second_addordinal;
@@ -66,6 +76,11 @@ interface
     begin
       unsigned:=not(is_signed(left.resultdef)) or
                 not(is_signed(right.resultdef));
+      { use IMUL instead of MUL in case overflow checking is off and we're
+        doing a 32->32-bit multiplication }
+      if not (cs_check_overflow in current_settings.localswitches) and
+         not is_64bit(resultdef) then
+        unsigned:=false;
       if (nodetype=muln) and (unsigned or is_64bit(resultdef)) then
         second_mul(unsigned)
       else
@@ -117,6 +132,11 @@ interface
             op:=OP_OR;
           andn:
             op:=OP_AND;
+          muln:
+            begin
+              second_mul64bit;
+              exit;
+            end
           else
             begin
               { everything should be handled in pass_1 (JM) }
@@ -366,6 +386,32 @@ interface
                                 x86 MUL
 *****************************************************************************}
 
+    procedure ti386addnode.set_mul_result_location;
+    begin
+      location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
+      {Free EAX,EDX}
+      cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
+      if is_64bit(resultdef) then
+      begin
+        {Allocate a couple of registers and store EDX:EAX into it}
+        location.register64.reghi := cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
+        cg.a_load_reg_reg(current_asmdata.CurrAsmList, OS_INT, OS_INT, NR_EDX, location.register64.reghi);
+        cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
+        location.register64.reglo := cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
+        cg.a_load_reg_reg(current_asmdata.CurrAsmList, OS_INT, OS_INT, NR_EAX, location.register64.reglo);
+      end
+      else
+      begin
+        {Allocate a new register and store the result in EAX in it.}
+        location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
+        cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
+        cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_EAX,location.register);
+      end;
+      location_freetemp(current_asmdata.CurrAsmList,left.location);
+      location_freetemp(current_asmdata.CurrAsmList,right.location);
+    end;
+
+
     procedure ti386addnode.second_mul(unsigned: boolean);
 
     var reg:Tregister;
@@ -379,8 +425,6 @@ interface
     begin
       pass_left_right;
 
-      {The location.register will be filled in later (JM)}
-      location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
       { Mul supports registers and references, so if not register/reference,
         load the location into a register.
         The variant of IMUL which is capable of doing 32->64 bits has the same restrictions. }
@@ -418,26 +462,107 @@ interface
           cg.a_call_name(current_asmdata.CurrAsmList,'FPC_OVERFLOW',false);
           cg.a_label(current_asmdata.CurrAsmList,hl4);
         end;
-      {Free EAX,EDX}
-      cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
-      if is_64bit(resultdef) then
-      begin
-        {Allocate a couple of registers and store EDX:EAX into it}
-        location.register64.reghi := cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
-        cg.a_load_reg_reg(current_asmdata.CurrAsmList, OS_INT, OS_INT, NR_EDX, location.register64.reghi);
-        cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
-        location.register64.reglo := cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
-        cg.a_load_reg_reg(current_asmdata.CurrAsmList, OS_INT, OS_INT, NR_EAX, location.register64.reglo);
-      end
+      set_mul_result_location;
+    end;
+
+
+    procedure ti386addnode.second_mul64bit;
+    var
+      list: TAsmList;
+      hreg1,hreg2: tregister;
+    begin
+      { 64x64 multiplication yields 128-bit result, but we're only
+        interested in its lower 64 bits. This lower part is independent
+        of operand signs, and so is the generated code. }
+      { pass_left_right already called from second_add64bit }
+      list:=current_asmdata.CurrAsmList;
+      if left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE] then
+        tcgx86(cg).make_simple_ref(list,left.location.reference);
+      if right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE] then
+        tcgx86(cg).make_simple_ref(list,right.location.reference);
+
+      { calculate 32-bit terms lo(right)*hi(left) and hi(left)*lo(right) }
+      if (right.location.loc=LOC_CONSTANT) then
+        begin
+          { Omit zero terms, if any }
+          hreg1:=NR_NO;
+          hreg2:=NR_NO;
+          if lo(right.location.value64)<>0 then
+            hreg1:=cg.getintregister(list,OS_INT);
+          if hi(right.location.value64)<>0 then
+            hreg2:=cg.getintregister(list,OS_INT);
+
+          { Take advantage of 3-operand form of IMUL }
+          case left.location.loc of
+            LOC_REGISTER,LOC_CREGISTER:
+              begin
+                if hreg1<>NR_NO then
+                  emit_const_reg_reg(A_IMUL,S_L,longint(lo(right.location.value64)),left.location.register64.reghi,hreg1);
+                if hreg2<>NR_NO then
+                  emit_const_reg_reg(A_IMUL,S_L,longint(hi(right.location.value64)),left.location.register64.reglo,hreg2);
+              end;
+            LOC_REFERENCE,LOC_CREFERENCE:
+              begin
+                if hreg2<>NR_NO then
+                  list.concat(taicpu.op_const_ref_reg(A_IMUL,S_L,longint(hi(right.location.value64)),left.location.reference,hreg2));
+                inc(left.location.reference.offset,4);
+                if hreg1<>NR_NO then
+                  list.concat(taicpu.op_const_ref_reg(A_IMUL,S_L,longint(lo(right.location.value64)),left.location.reference,hreg1));
+                dec(left.location.reference.offset,4);
+              end;
+          else
+            InternalError(2014011602);
+          end;
+        end
       else
-      begin
-        {Allocate a new register and store the result in EAX in it.}
-        location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
-        cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
-        cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_EAX,location.register);
-      end;
-      location_freetemp(current_asmdata.CurrAsmList,left.location);
-      location_freetemp(current_asmdata.CurrAsmList,right.location);
+        begin
+          hreg1:=cg.getintregister(list,OS_INT);
+          hreg2:=cg.getintregister(list,OS_INT);
+          cg64.a_load64low_loc_reg(list,left.location,hreg1);
+          cg64.a_load64high_loc_reg(list,left.location,hreg2);
+          case right.location.loc of
+            LOC_REGISTER,LOC_CREGISTER:
+              begin
+                emit_reg_reg(A_IMUL,S_L,right.location.register64.reghi,hreg1);
+                emit_reg_reg(A_IMUL,S_L,right.location.register64.reglo,hreg2);
+              end;
+            LOC_REFERENCE,LOC_CREFERENCE:
+              begin
+                emit_ref_reg(A_IMUL,S_L,right.location.reference,hreg2);
+                inc(right.location.reference.offset,4);
+                emit_ref_reg(A_IMUL,S_L,right.location.reference,hreg1);
+                dec(right.location.reference.offset,4);
+              end;
+          else
+            InternalError(2014011603);
+          end;
+        end;
+      { add hi*lo and lo*hi terms together }
+      if (hreg1<>NR_NO) and (hreg2<>NR_NO) then
+        emit_reg_reg(A_ADD,S_L,hreg2,hreg1);
+
+      { load lo(right) into EAX }
+      cg.getcpuregister(list,NR_EAX);
+      cg64.a_load64low_loc_reg(list,right.location,NR_EAX);
+
+      { multiply EAX by lo(left), producing 64-bit value in EDX:EAX }
+      cg.getcpuregister(list,NR_EDX);
+      if (left.location.loc in [LOC_REGISTER,LOC_CREGISTER]) then
+        emit_reg(A_MUL,S_L,left.location.register64.reglo)
+      else if (left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
+        emit_ref(A_MUL,S_L,left.location.reference)
+      else
+        InternalError(2014011604);
+      { add previously calculated terms to the high half }
+      if (hreg1<>NR_NO) then
+        emit_reg_reg(A_ADD,S_L,hreg1,NR_EDX)
+      else if (hreg2<>NR_NO) then
+        emit_reg_reg(A_ADD,S_L,hreg2,NR_EDX)
+      else
+        InternalError(2014011604);
+
+      { Result is now in EDX:EAX. Copy it to virtual registers. }
+      set_mul_result_location;
     end;
 
 

+ 2 - 1
compiler/i386/n386cal.pas

@@ -93,7 +93,8 @@ implementation
           it is always the first parameter (apart from hidden parentfp,
           but this one is never put into a register (vs_nonregable set)
           so funcret is always in EAX for register calling }
-        if (target_info.system = system_i386_win32) and
+        if ((target_info.system = system_i386_win32) and
+            not (target_info.abi=abi_old_win32_gnu)) and
             paramanager.ret_in_param(procdefinition.returndef,procdefinition) and
             not ((procdefinition.proccalloption=pocall_register) or
                  ((procdefinition.proccalloption=pocall_internproc) and

+ 685 - 0
compiler/i386/n386flw.pas

@@ -0,0 +1,685 @@
+{
+    Copyright (c) 2011 by Free Pascal development team
+
+    Generate Win32-specific exception handling code
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ ****************************************************************************
+}
+unit n386flw;
+
+{$i fpcdefs.inc}
+
+interface
+
+  uses
+    node,nflw,ncgflw,psub;
+
+  type
+    ti386raisenode=class(tcgraisenode)
+      function pass_1 : tnode;override;
+    end;
+
+    ti386onnode=class(tcgonnode)
+      procedure pass_generate_code;override;
+    end;
+
+    ti386tryexceptnode=class(tcgtryexceptnode)
+      procedure pass_generate_code;override;
+    end;
+
+    ti386tryfinallynode=class(tcgtryfinallynode)
+      finalizepi: tcgprocinfo;
+      constructor create(l,r:TNode);override;
+      constructor create_implicit(l,r,_t1:TNode);override;
+      function pass_1: tnode;override;
+      function simplify(forinline: boolean): tnode;override;
+      procedure pass_generate_code;override;
+    end;
+
+implementation
+
+  uses
+    cutils,globtype,globals,verbose,systems,
+    nbas,ncal,nmem,nutils,
+    symconst,symbase,symtable,symsym,symdef,
+    cgbase,cgobj,cgcpu,cgutils,tgobj,
+    cpubase,htypechk,
+    parabase,paramgr,pdecsub,pass_1,pass_2,ncgutil,cga,
+    aasmbase,aasmtai,aasmdata,aasmcpu,procinfo,cpupi;
+
+  var
+    endexceptlabel: tasmlabel;
+
+
+{ ti386raisenode }
+
+function ti386raisenode.pass_1 : tnode;
+  var
+    statements : tstatementnode;
+    raisenode : tcallnode;
+  begin
+    { difference from generic code is that address stack is not popped on reraise }
+    if (target_info.system<>system_i386_win32) or assigned(left) then
+      result:=inherited pass_1
+    else
+      begin
+        result:=internalstatements(statements);
+        raisenode:=ccallnode.createintern('fpc_reraise',nil);
+        include(raisenode.callnodeflags,cnf_call_never_returns);
+        addstatement(statements,raisenode);
+      end;
+end;
+
+{ ti386onnode }
+
+procedure ti386onnode.pass_generate_code;
+  var
+    oldflowcontrol : tflowcontrol;
+    exceptvarsym : tlocalvarsym;
+  begin
+    if (target_info.system<>system_i386_win32) then
+      begin
+        inherited pass_generate_code;
+        exit;
+      end;
+
+    location_reset(location,LOC_VOID,OS_NO);
+
+    oldflowcontrol:=flowcontrol;
+    flowcontrol:=flowcontrol*[fc_unwind]+[fc_inflowcontrol];
+
+    { RTL will put exceptobject into EAX when jumping here }
+    cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_FUNCTION_RESULT_REG);
+
+    { Retrieve exception variable }
+    if assigned(excepTSymtable) then
+      exceptvarsym:=tlocalvarsym(excepTSymtable.SymList[0])
+    else
+      exceptvarsym:=nil;
+
+    if assigned(exceptvarsym) then
+      begin
+        exceptvarsym.localloc.loc:=LOC_REFERENCE;
+        exceptvarsym.localloc.size:=OS_ADDR;
+        tg.GetLocal(current_asmdata.CurrAsmList,sizeof(pint),voidpointertype,exceptvarsym.localloc.reference);
+        cg.a_load_reg_ref(current_asmdata.CurrAsmList,OS_ADDR,OS_ADDR,NR_FUNCTION_RESULT_REG,exceptvarsym.localloc.reference);
+      end;
+    cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_FUNCTION_RESULT_REG);
+
+    if assigned(right) then
+      secondpass(right);
+
+    { deallocate exception symbol }
+    if assigned(exceptvarsym) then
+      begin
+        tg.UngetLocal(current_asmdata.CurrAsmList,exceptvarsym.localloc.reference);
+        exceptvarsym.localloc.loc:=LOC_INVALID;
+      end;
+    cg.g_call(current_asmdata.CurrAsmList,'FPC_DONEEXCEPTION');
+    cg.a_jmp_always(current_asmdata.CurrAsmList,endexceptlabel);
+
+    flowcontrol:=oldflowcontrol+(flowcontrol-[fc_inflowcontrol]);
+  end;
+
+{ ti386tryfinallynode }
+
+function reset_regvars(var n: tnode; arg: pointer): foreachnoderesult;
+  begin
+    case n.nodetype of
+      temprefn:
+        make_not_regable(n,[]);
+      calln:
+        include(tprocinfo(arg).flags,pi_do_call);
+    end;
+    result:=fen_true;
+  end;
+
+function copy_parasize(var n: tnode; arg: pointer): foreachnoderesult;
+  begin
+    case n.nodetype of
+      calln:
+        tcgprocinfo(arg).allocate_push_parasize(tcallnode(n).pushed_parasize);
+    end;
+    result:=fen_true;
+  end;
+
+constructor ti386tryfinallynode.create(l, r: TNode);
+  begin
+    inherited create(l,r);
+    if (target_info.system<>system_i386_win32) or (
+      { Don't create child procedures for generic methods, their nested-like
+        behavior causes compilation errors because real nested procedures
+        aren't allowed for generics. Not creating them doesn't harm because
+        generic node tree is discarded without generating code. }
+        assigned(current_procinfo.procdef.struct) and
+        (df_generic in current_procinfo.procdef.struct.defoptions)
+      ) then
+      exit;
+    finalizepi:=tcgprocinfo(cprocinfo.create(current_procinfo));
+    finalizepi.force_nested;
+    finalizepi.procdef:=create_finalizer_procdef;
+    finalizepi.entrypos:=r.fileinfo;
+    finalizepi.entryswitches:=r.localswitches;
+    finalizepi.exitpos:=current_filepos; // last_endtoken_pos?
+    finalizepi.exitswitches:=current_settings.localswitches;
+    { Regvar optimization for symbols is suppressed when using exceptions, but
+      temps may be still placed into registers. This must be fixed. }
+    foreachnodestatic(r,@reset_regvars,finalizepi);
+    include(finalizepi.flags,pi_has_assembler_block);
+    include(finalizepi.flags,pi_do_call);
+    include(finalizepi.flags,pi_uses_exceptions);
+  end;
+
+constructor ti386tryfinallynode.create_implicit(l, r, _t1: TNode);
+  begin
+    inherited create_implicit(l, r, _t1);
+    if (target_info.system<>system_i386_win32) then
+      exit;
+
+    { safecall procedures can handle implicit finalization as part of "except" flow }
+    if implicitframe and (current_procinfo.procdef.proccalloption=pocall_safecall) then
+      exit;
+
+    if assigned(current_procinfo.procdef.struct) and
+      (df_generic in current_procinfo.procdef.struct.defoptions) then
+      InternalError(2013012501);
+
+    finalizepi:=tcgprocinfo(cprocinfo.create(current_procinfo));
+    finalizepi.force_nested;
+    finalizepi.procdef:=create_finalizer_procdef;
+    finalizepi.entrypos:=current_filepos;
+    finalizepi.exitpos:=current_filepos; // last_endtoken_pos?
+    finalizepi.entryswitches:=r.localswitches;
+    finalizepi.exitswitches:=current_settings.localswitches;
+    include(finalizepi.flags,pi_has_assembler_block);
+    include(finalizepi.flags,pi_do_call);
+    include(finalizepi.flags,pi_uses_exceptions);
+  end;
+
+
+function ti386tryfinallynode.pass_1: tnode;
+  var
+    selfsym: tparavarsym;
+  begin
+    result:=inherited pass_1;
+    if (target_info.system=system_i386_win32) then
+      begin
+        { safecall method will access 'self' from except block -> make it non-regable }
+        if implicitframe and (current_procinfo.procdef.proccalloption=pocall_safecall) and
+          is_class(current_procinfo.procdef.struct) then
+          begin
+            selfsym:=tparavarsym(current_procinfo.procdef.parast.Find('self'));
+            if (selfsym=nil) or (selfsym.typ<>paravarsym) then
+              InternalError(2011123101);
+            selfsym.varregable:=vr_none;
+          end;
+      end;
+  end;
+
+
+function ti386tryfinallynode.simplify(forinline: boolean): tnode;
+  begin
+    result:=inherited simplify(forinline);
+    if (target_info.system<>system_i386_win32) then
+      exit;
+
+    if (result=nil) and assigned(finalizepi) then
+      begin
+        finalizepi.code:=right;
+        foreachnodestatic(right,@copy_parasize,finalizepi);
+        right:=ccallnode.create(nil,tprocsym(finalizepi.procdef.procsym),nil,nil,[]);
+        firstpass(right);
+        { For implicit frames, no actual code is available at this time,
+          it is added later in assembler form. So store the nested procinfo
+          for later use. }
+        if implicitframe then
+          begin
+            current_procinfo.finalize_procinfo:=finalizepi;
+            { don't leave dangling pointer }
+            tcgprocinfo(current_procinfo).final_asmnode:=nil;
+          end;
+      end;
+  end;
+
+
+procedure emit_scope_start(handler,data: TAsmSymbol);
+  var
+    href: treference;
+    hreg: tregister;
+  begin
+    hreg:=cg.getintregister(current_asmdata.CurrAsmList,OS_ADDR);
+    reference_reset_base(href,hreg,0,sizeof(pint));
+    href.segment:=NR_FS;
+    emit_reg_reg(A_XOR,S_L,hreg,hreg);
+    emit_sym(A_PUSH,S_L,data);
+    emit_reg(A_PUSH,S_L,NR_FRAME_POINTER_REG);
+    emit_sym(A_PUSH,S_L,handler);
+    emit_ref(A_PUSH,S_L,href);
+    emit_reg_ref(A_MOV,S_L,NR_ESP,href);
+  end;
+
+procedure emit_scope_end;
+  var
+    href: treference;
+    hreg,hreg2: tregister;
+  begin
+    hreg:=cg.getintregister(current_asmdata.CurrAsmList,OS_ADDR);
+    hreg2:=cg.getintregister(current_asmdata.CurrAsmList,OS_ADDR);
+    reference_reset_base(href,hreg,0,sizeof(pint));
+    href.segment:=NR_FS;
+    emit_reg_reg(A_XOR,S_L,hreg,hreg);
+    emit_reg(A_POP,S_L,hreg2);
+    emit_const_reg(A_ADD,S_L,3*sizeof(pint),NR_ESP);
+    emit_reg_ref(A_MOV,S_L,hreg2,href);
+  end;
+
+procedure ti386tryfinallynode.pass_generate_code;
+  var
+    finallylabel,
+    exceptlabel,
+    safecalllabel,
+    endfinallylabel,
+    exitfinallylabel,
+    continuefinallylabel,
+    breakfinallylabel,
+    oldCurrExitLabel,
+    oldContinueLabel,
+    oldBreakLabel : tasmlabel;
+    oldflowcontrol,tryflowcontrol : tflowcontrol;
+    is_safecall: boolean;
+    hreg: tregister;
+  begin
+    if (target_info.system<>system_i386_win32) then
+      begin
+        inherited pass_generate_code;
+        exit;
+      end;
+    location_reset(location,LOC_VOID,OS_NO);
+    tryflowcontrol:=[];
+    oldBreakLabel:=nil;
+    oldContinueLabel:=nil;
+    continuefinallylabel:=nil;
+    breakfinallylabel:=nil;
+    exceptlabel:=nil;
+    safecalllabel:=nil;
+    is_safecall:=implicitframe and (current_procinfo.procdef.proccalloption=pocall_safecall);
+
+    { check if child nodes do a break/continue/exit }
+    oldflowcontrol:=flowcontrol;
+    flowcontrol:=[fc_inflowcontrol];
+    current_asmdata.getjumplabel(finallylabel);
+    current_asmdata.getjumplabel(endfinallylabel);
+
+    { the finally block must catch break, continue and exit }
+    { statements                                            }
+    oldCurrExitLabel:=current_procinfo.CurrExitLabel;
+    if implicitframe then
+      exitfinallylabel:=finallylabel
+    else
+      current_asmdata.getjumplabel(exitfinallylabel);
+    current_procinfo.CurrExitLabel:=exitfinallylabel;
+    if assigned(current_procinfo.CurrBreakLabel) then
+      begin
+        oldContinueLabel:=current_procinfo.CurrContinueLabel;
+        oldBreakLabel:=current_procinfo.CurrBreakLabel;
+        if implicitframe then
+          begin
+            breakfinallylabel:=finallylabel;
+            continuefinallylabel:=finallylabel;
+          end
+        else
+          begin
+            current_asmdata.getjumplabel(breakfinallylabel);
+            current_asmdata.getjumplabel(continuefinallylabel);
+          end;
+        current_procinfo.CurrContinueLabel:=continuefinallylabel;
+        current_procinfo.CurrBreakLabel:=breakfinallylabel;
+      end;
+
+    { Start of scope }
+    if is_safecall then
+      begin
+        with cg.rg[R_INTREGISTER] do
+          used_in_proc:=used_in_proc+[RS_EBX,RS_ESI,RS_EDI];
+
+        current_asmdata.getjumplabel(exceptlabel);
+        emit_scope_start(
+          current_asmdata.RefAsmSymbol('__FPC_except_safecall'),
+          exceptlabel
+        );
+      end
+    else
+      emit_scope_start(
+        current_asmdata.RefAsmSymbol('__FPC_finally_handler'),
+        current_asmdata.RefAsmSymbol(finalizepi.procdef.mangledname)
+      );
+
+    { try code }
+    if assigned(left) then
+      begin
+        secondpass(left);
+        tryflowcontrol:=flowcontrol;
+        if codegenerror then
+          exit;
+      end;
+
+    { don't generate line info for internal cleanup }
+    current_asmdata.CurrAsmList.concat(tai_marker.create(mark_NoLineInfoStart));
+
+    cg.a_label(current_asmdata.CurrAsmList,finallylabel);
+    emit_scope_end;
+    if is_safecall then
+      begin
+        current_asmdata.getjumplabel(safecalllabel);
+        hreg:=cg.GetIntRegister(current_asmdata.CurrAsmList,OS_INT);
+        cg.a_load_const_reg(current_asmdata.CurrAsmList,OS_INT,0,hreg);
+        cg.a_jmp_always(current_asmdata.CurrAsmList,safecalllabel);
+        { RTL handler will jump here on exception }
+        cg.a_label(current_asmdata.CurrAsmList,exceptlabel);
+        handle_safecall_exception;
+        cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_FUNCTION_RESULT_REG,hreg);
+        cg.a_label(current_asmdata.CurrAsmList,safecalllabel);
+      end;
+
+    { end cleanup }
+    current_asmdata.CurrAsmList.concat(tai_marker.create(mark_NoLineInfoEnd));
+
+    { generate finally code as a separate procedure }
+    { !!! this resets flowcontrol, how to check flow away? }
+    if not implicitframe then
+      tcgprocinfo(current_procinfo).generate_exceptfilter(finalizepi);
+
+    flowcontrol:=[fc_inflowcontrol];
+    { right is a call to finalizer procedure }
+    secondpass(right);
+
+    { goto is allowed if it stays inside the finally block,
+      this is checked using the exception block number }
+    if (flowcontrol-[fc_gotolabel])<>[fc_inflowcontrol] then
+      CGMessage(cg_e_control_flow_outside_finally);
+    if codegenerror then
+      exit;
+
+    { don't generate line info for internal cleanup }
+    current_asmdata.CurrAsmList.concat(tai_marker.create(mark_NoLineInfoStart));
+
+    if not implicitframe then
+      begin
+        if tryflowcontrol*[fc_exit,fc_break,fc_continue]<>[] then
+          cg.a_jmp_always(current_asmdata.CurrAsmList,endfinallylabel);
+        { do some magic for exit,break,continue in the try block }
+        if fc_exit in tryflowcontrol then
+          begin
+            cg.a_label(current_asmdata.CurrAsmList,exitfinallylabel);
+            cg.g_call(current_asmdata.CurrAsmList,'_FPC_leave');
+            cg.a_jmp_always(current_asmdata.CurrAsmList,oldCurrExitLabel);
+          end;
+        if fc_break in tryflowcontrol then
+          begin
+            cg.a_label(current_asmdata.CurrAsmList,breakfinallylabel);
+            cg.g_call(current_asmdata.CurrAsmList,'_FPC_leave');
+            cg.a_jmp_always(current_asmdata.CurrAsmList,oldBreakLabel);
+          end;
+        if fc_continue in tryflowcontrol then
+          begin
+            cg.a_label(current_asmdata.CurrAsmList,continuefinallylabel);
+            cg.g_call(current_asmdata.CurrAsmList,'_FPC_leave');
+            cg.a_jmp_always(current_asmdata.CurrAsmList,oldContinueLabel);
+          end;
+      end;
+    if is_safecall then
+      cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,hreg,NR_FUNCTION_RETURN_REG);
+    cg.a_label(current_asmdata.CurrAsmList,endfinallylabel);
+
+    { end cleanup }
+    current_asmdata.CurrAsmList.concat(tai_marker.create(mark_NoLineInfoEnd));
+
+    current_procinfo.CurrExitLabel:=oldCurrExitLabel;
+    if assigned(current_procinfo.CurrBreakLabel) then
+      begin
+        current_procinfo.CurrContinueLabel:=oldContinueLabel;
+        current_procinfo.CurrBreakLabel:=oldBreakLabel;
+      end;
+    flowcontrol:=oldflowcontrol+(tryflowcontrol-[fc_inflowcontrol]);
+  end;
+
+{ ti386tryexceptnode }
+
+procedure ti386tryexceptnode.pass_generate_code;
+  var
+    exceptlabel,oldendexceptlabel,
+    lastonlabel,
+    exitexceptlabel,
+    continueexceptlabel,
+    breakexceptlabel,
+    exittrylabel,
+    continuetrylabel,
+    breaktrylabel,
+    oldCurrExitLabel,
+    oldContinueLabel,
+    oldBreakLabel : tasmlabel;
+    onlabel,
+    filterlabel: tasmlabel;
+    oldflowcontrol,tryflowcontrol,
+    exceptflowcontrol : tflowcontrol;
+    hnode : tnode;
+    hlist : tasmlist;
+    onnodecount : tai_const;
+  label
+    errorexit;
+  begin
+    if (target_info.system<>system_i386_win32) then
+      begin
+        inherited pass_generate_code;
+        exit;
+      end;
+    location_reset(location,LOC_VOID,OS_NO);
+
+    oldflowcontrol:=flowcontrol;
+    flowcontrol:=[fc_inflowcontrol];
+    { this can be called recursivly }
+    oldBreakLabel:=nil;
+    oldContinueLabel:=nil;
+    oldendexceptlabel:=endexceptlabel;
+
+    { Win32 SEH unwinding does not preserve registers. Indicate that they are
+      going to be destroyed. }
+    cg.alloccpuregisters(current_asmdata.CurrAsmList,R_INTREGISTER,[RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]);
+    cg.dealloccpuregisters(current_asmdata.CurrAsmList,R_INTREGISTER,[RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]);
+
+    { save the old labels for control flow statements }
+    oldCurrExitLabel:=current_procinfo.CurrExitLabel;
+    if assigned(current_procinfo.CurrBreakLabel) then
+      begin
+        oldContinueLabel:=current_procinfo.CurrContinueLabel;
+        oldBreakLabel:=current_procinfo.CurrBreakLabel;
+      end;
+
+    { get new labels for the control flow statements }
+    current_asmdata.getjumplabel(exittrylabel);
+    current_asmdata.getjumplabel(exitexceptlabel);
+    if assigned(current_procinfo.CurrBreakLabel) then
+      begin
+        current_asmdata.getjumplabel(breaktrylabel);
+        current_asmdata.getjumplabel(continuetrylabel);
+        current_asmdata.getjumplabel(breakexceptlabel);
+        current_asmdata.getjumplabel(continueexceptlabel);
+      end;
+
+    current_asmdata.getjumplabel(exceptlabel);
+    current_asmdata.getjumplabel(endexceptlabel);
+    current_asmdata.getjumplabel(lastonlabel);
+    filterlabel:=nil;
+
+    { start of scope }
+    if assigned(right) then
+      begin
+        current_asmdata.getdatalabel(filterlabel);
+        emit_scope_start(
+          current_asmdata.RefAsmSymbol('__FPC_on_handler'),
+          filterlabel);
+      end
+    else
+      emit_scope_start(
+        current_asmdata.RefAsmSymbol('__FPC_except_handler'),
+        exceptlabel);
+
+    { set control flow labels for the try block }
+    current_procinfo.CurrExitLabel:=exittrylabel;
+    if assigned(oldBreakLabel) then
+      begin
+        current_procinfo.CurrContinueLabel:=continuetrylabel;
+        current_procinfo.CurrBreakLabel:=breaktrylabel;
+      end;
+
+    secondpass(left);
+    tryflowcontrol:=flowcontrol;
+    if codegenerror then
+      goto errorexit;
+
+    emit_scope_end;
+    { jump over except handlers }
+    cg.a_jmp_always(current_asmdata.CurrAsmList,endexceptlabel);
+
+    if fc_exit in tryflowcontrol then
+      begin
+        cg.a_label(current_asmdata.CurrAsmList,exittrylabel);
+        emit_scope_end;
+        cg.a_jmp_always(current_asmdata.CurrAsmList,oldCurrExitLabel);
+      end;
+    if fc_break in tryflowcontrol then
+      begin
+        cg.a_label(current_asmdata.CurrAsmList,breaktrylabel);
+        emit_scope_end;
+        cg.a_jmp_always(current_asmdata.CurrAsmList,oldBreakLabel);
+      end;
+    if fc_continue in tryflowcontrol then
+      begin
+        cg.a_label(current_asmdata.CurrAsmList,continuetrylabel);
+        emit_scope_end;
+        cg.a_jmp_always(current_asmdata.CurrAsmList,oldContinueLabel);
+      end;
+
+    { target for catch-all handler }
+    cg.a_label(current_asmdata.CurrAsmList,exceptlabel);
+
+    { set control flow labels for the except block }
+    { and the on statements                        }
+    current_procinfo.CurrExitLabel:=exitexceptlabel;
+    if assigned(oldBreakLabel) then
+      begin
+        current_procinfo.CurrContinueLabel:=continueexceptlabel;
+        current_procinfo.CurrBreakLabel:=breakexceptlabel;
+      end;
+
+    flowcontrol:=[fc_inflowcontrol];
+    { on statements }
+    if assigned(right) then
+      begin
+        { emit filter table to a temporary asmlist }
+        hlist:=TAsmList.Create;
+        new_section(hlist,sec_rodata,filterlabel.name,4);
+        cg.a_label(hlist,filterlabel);
+        onnodecount:=tai_const.create_32bit(0);
+        hlist.concat(onnodecount);
+
+        hnode:=right;
+        while assigned(hnode) do
+          begin
+            if hnode.nodetype<>onn then
+              InternalError(2011103101);
+            { TODO: make it done without using global label }
+            current_asmdata.getglobaljumplabel(onlabel);
+            hlist.concat(tai_const.create_sym(current_asmdata.RefAsmSymbol(tonnode(hnode).excepttype.vmt_mangledname,AT_DATA)));
+            hlist.concat(tai_const.create_sym(onlabel));
+            cg.a_label(current_asmdata.CurrAsmList,onlabel);
+            secondpass(hnode);
+            inc(onnodecount.value);
+            hnode:=tonnode(hnode).left;
+          end;
+        { add 'else' node to the filter list, too }
+        if assigned(t1) then
+          begin
+            hlist.concat(tai_const.create_32bit(-1));
+            hlist.concat(tai_const.create_sym(lastonlabel));
+            inc(onnodecount.value);
+          end;
+        { now move filter table to permanent list all at once }
+        maybe_new_object_file(current_asmdata.asmlists[al_typedconsts]);
+        current_asmdata.asmlists[al_typedconsts].concatlist(hlist);
+        hlist.free;
+      end;
+
+    cg.a_label(current_asmdata.CurrAsmList,lastonlabel);
+    if assigned(t1) then
+      begin
+        { here we don't have to reset flowcontrol           }
+        { the default and on flowcontrols are handled equal }
+        secondpass(t1);
+        cg.g_call(current_asmdata.CurrAsmList,'FPC_DONEEXCEPTION');
+        if (flowcontrol*[fc_exit,fc_break,fc_continue]<>[]) then
+          cg.a_jmp_always(current_asmdata.CurrAsmList,endexceptlabel);
+      end;
+    exceptflowcontrol:=flowcontrol;
+
+    if fc_exit in exceptflowcontrol then
+      begin
+        { do some magic for exit in the try block }
+        cg.a_label(current_asmdata.CurrAsmList,exitexceptlabel);
+        cg.g_call(current_asmdata.CurrAsmList,'FPC_DONEEXCEPTION');
+        cg.a_jmp_always(current_asmdata.CurrAsmList,oldCurrExitLabel);
+      end;
+
+    if fc_break in exceptflowcontrol then
+      begin
+        cg.a_label(current_asmdata.CurrAsmList,breakexceptlabel);
+        cg.g_call(current_asmdata.CurrAsmList,'FPC_DONEEXCEPTION');
+        cg.a_jmp_always(current_asmdata.CurrAsmList,oldBreakLabel);
+      end;
+
+    if fc_continue in exceptflowcontrol then
+      begin
+        cg.a_label(current_asmdata.CurrAsmList,continueexceptlabel);
+        cg.g_call(current_asmdata.CurrAsmList,'FPC_DONEEXCEPTION');
+        cg.a_jmp_always(current_asmdata.CurrAsmList,oldContinueLabel);
+      end;
+
+    cg.a_label(current_asmdata.CurrAsmList,endexceptlabel);
+
+errorexit:
+    { restore all saved labels }
+    endexceptlabel:=oldendexceptlabel;
+
+    { restore the control flow labels }
+    current_procinfo.CurrExitLabel:=oldCurrExitLabel;
+    if assigned(oldBreakLabel) then
+      begin
+        current_procinfo.CurrContinueLabel:=oldContinueLabel;
+        current_procinfo.CurrBreakLabel:=oldBreakLabel;
+      end;
+
+    { return all used control flow statements }
+    flowcontrol:=oldflowcontrol+(exceptflowcontrol +
+      tryflowcontrol - [fc_inflowcontrol]);
+  end;
+
+initialization
+  craisenode:=ti386raisenode;
+  connode:=ti386onnode;
+  ctryexceptnode:=ti386tryexceptnode;
+  ctryfinallynode:=ti386tryfinallynode;
+end.

+ 5 - 3
compiler/i386/popt386.pas

@@ -320,7 +320,7 @@ begin
                          12: begin
                             {imul 12, reg1, reg2 to
                                lea (,reg1,4), reg2
-                               lea (,reg1,8) reg2
+                               lea (reg2,reg1,8), reg2
                              imul 12, reg1 to
                                lea (reg1,reg1,2), reg1
                                lea (,reg1,4), reg1}
@@ -853,6 +853,8 @@ begin
                         S_B: v:=$80;
                         S_W: v:=$8000;
                         S_L: v:=aint($80000000);
+                        else
+                          internalerror(2013112905);
                       end;
                       if (taicpu(p).oper[0]^.typ=Top_const) and
                          (taicpu(p).oper[0]^.val=v) and
@@ -1399,6 +1401,7 @@ begin
                                   taicpu(hp1).loadReg(0,taicpu(hp1).oper[1]^.reg);
                                   taicpu(hp1).loadRef(1,taicpu(p).oper[1]^.ref^);
                                   taicpu(p).loadReg(1,taicpu(hp1).oper[0]^.reg);
+                                  taicpu(hp1).fileinfo := taicpu(p).fileinfo;
                                 end
                         end;
                       if GetNextInstruction(p, hp1) and
@@ -1973,10 +1976,9 @@ procedure PeepHoleOptPass2(asml: TAsmList; BlockStart, BlockEnd: tai);
     end;
 
 var
-  p,hp1,hp2: tai;
+  p,hp1,hp2,hp3: tai;
   l : longint;
   condition : tasmcond;
-  hp3: tai;
   UsedRegs, TmpUsedRegs: TRegSet;
   carryadd_opcode: Tasmop;
 

+ 1 - 0
compiler/i386/rropt386.pas

@@ -206,6 +206,7 @@ begin
   sequenceEnd := false;
   reg1Modified := false;
   reg2Modified := false;
+  switchLast := false;
   endP := start;
   while tmpResult and not sequenceEnd do
     begin

+ 445 - 105
compiler/i8086/cgcpu.pas

@@ -49,8 +49,6 @@ unit cgcpu;
         procedure a_call_name_static_far(list : TAsmList;const s : string);
         procedure a_call_reg(list : TAsmList;reg : tregister);override;
         procedure a_call_reg_far(list : TAsmList;reg : tregister);
-        procedure a_call_ref(list : TAsmList;ref : treference);override;
-        procedure a_call_ref_far(list : TAsmList;ref : treference);
 
         procedure a_op_const_reg(list : TAsmList; Op: TOpCG; size: TCGSize; a: tcgint; reg: TRegister); override;
         procedure a_op_const_ref(list : TAsmList; Op: TOpCG; size: TCGSize; a: tcgint; const ref: TReference); override;
@@ -73,6 +71,18 @@ unit cgcpu;
         procedure a_load_ref_reg(list : TAsmList;fromsize,tosize: tcgsize;const ref : treference;reg : tregister);override;
         procedure a_load_reg_reg(list : TAsmList;fromsize,tosize: tcgsize;reg1,reg2 : tregister);override;
 
+        {  comparison operations }
+        procedure a_cmp_const_reg_label(list : TAsmList;size : tcgsize;cmp_op : topcmp;a : tcgint;reg : tregister;
+          l : tasmlabel);override;
+        procedure a_cmp_const_ref_label(list : TAsmList;size : tcgsize;cmp_op : topcmp;a : tcgint;const ref : treference;
+          l : tasmlabel);override;
+        procedure a_cmp_reg_reg_label(list : TAsmList;size : tcgsize;cmp_op : topcmp;reg1,reg2 : tregister;l : tasmlabel); override;
+        procedure a_cmp_ref_reg_label(list : TAsmList;size : tcgsize;cmp_op : topcmp;const ref: treference; reg : tregister; l : tasmlabel); override;
+        procedure a_cmp_reg_ref_label(list : TAsmList;size : tcgsize;cmp_op : topcmp;reg : tregister; const ref: treference; l : tasmlabel); override;
+
+        procedure gen_cmp32_jmp1(list: TAsmList; cmp_op: topcmp; l_skip, l_target: TAsmLabel);
+        procedure gen_cmp32_jmp2(list: TAsmList; cmp_op: topcmp; l_skip, l_target: TAsmLabel);
+
         procedure g_flags2reg(list: TAsmList; size: TCgSize; const f: tresflags; reg: TRegister);override;
         procedure g_flags2ref(list: TAsmList; size: TCgSize; const f: tresflags; const ref: TReference);override;
 
@@ -89,6 +99,8 @@ unit cgcpu;
         procedure g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);override;
 
         procedure get_32bit_ops(op: TOpCG; out op1,op2: TAsmOp);
+
+        procedure add_move_instruction(instr:Taicpu);override;
      end;
 
       tcg64f8086 = class(tcg64f32)
@@ -123,12 +135,11 @@ unit cgcpu;
     procedure tcg8086.init_register_allocators;
       begin
         inherited init_register_allocators;
-        if not(target_info.system in [system_i386_darwin,system_i386_iphonesim]) and
-           (cs_create_pic in current_settings.moduleswitches) then
-          rg[R_INTREGISTER]:=trgcpu.create(R_INTREGISTER,R_SUBWHOLE,[RS_AX,RS_DX,RS_CX,RS_SI,RS_DI],first_int_imreg,[RS_BP])
+        if cs_create_pic in current_settings.moduleswitches then
+          rg[R_INTREGISTER]:=trgintcpu.create(R_INTREGISTER,R_SUBWHOLE,[RS_AX,RS_DX,RS_CX,RS_SI,RS_DI],first_int_imreg,[RS_BP])
         else
           if (cs_useebp in current_settings.optimizerswitches) and assigned(current_procinfo) and (current_procinfo.framepointer<>NR_BP) then
-            rg[R_INTREGISTER]:=trgcpu.create(R_INTREGISTER,R_SUBWHOLE,[RS_AX,RS_DX,RS_CX,RS_BX,RS_SI,RS_DI,RS_BP],first_int_imreg,[])
+            rg[R_INTREGISTER]:=trgintcpu.create(R_INTREGISTER,R_SUBWHOLE,[RS_AX,RS_DX,RS_CX,RS_BX,RS_SI,RS_DI,RS_BP],first_int_imreg,[])
           else
             rg[R_INTREGISTER]:=trgintcpu.create(R_INTREGISTER,R_SUBWHOLE,[RS_AX,RS_DX,RS_CX,RS_BX,RS_SI,RS_DI],first_int_imreg,[RS_BP]);
         rg[R_MMXREGISTER]:=trgcpu.create(R_MMXREGISTER,R_SUBNONE,[RS_XMM0,RS_XMM1,RS_XMM2,RS_XMM3,RS_XMM4,RS_XMM5,RS_XMM6,RS_XMM7],first_mm_imreg,[]);
@@ -240,27 +251,12 @@ unit cgcpu;
         a_load_reg_ref(list,OS_32,OS_32,reg,href);
         cg.getcpuregister(list,NR_BX);
         cg.getcpuregister(list,NR_SI);
-        a_call_ref_far(list,href);
+        href.refaddr:=addr_far_ref;
+        list.concat(taicpu.op_ref(A_CALL,S_NO,href));
         tg.ungettemp(list,href);
       end;
 
 
-    procedure tcg8086.a_call_ref(list: TAsmList; ref: treference);
-      begin
-        if current_settings.x86memorymodel in x86_far_code_models then
-          a_call_ref_far(list,ref)
-        else
-          a_call_ref_near(list,ref);
-      end;
-
-
-    procedure tcg8086.a_call_ref_far(list: TAsmList; ref: treference);
-      begin
-        ref.refaddr:=addr_far_ref;
-        list.concat(taicpu.op_ref(A_CALL,S_NO,ref));
-      end;
-
-
     procedure tcg8086.a_op_const_reg(list: TAsmList; Op: TOpCG; size: TCGSize;
       a: tcgint; reg: TRegister);
       var
@@ -272,7 +268,7 @@ unit cgcpu;
         use_loop: Boolean;
         i: Integer;
       begin
-        optimize_op_const(op, a);
+        optimize_op_const(size, op, a);
         check_register_size(size,reg);
 
         if size in [OS_64, OS_S64] then
@@ -296,7 +292,8 @@ unit cgcpu;
                   { Optimization when the low 16-bits of the constant are 0 }
                   if aint(a and $FFFF) = 0 then
                     begin
-                      list.concat(taicpu.op_const_reg(op1,S_W,aint(a shr 16),GetNextReg(reg)));
+                      { use a_op_const_reg to allow the use of inc/dec }
+                      a_op_const_reg(list,op,OS_16,aint(a shr 16),GetNextReg(reg));
                     end
                   else
                     begin
@@ -306,27 +303,61 @@ unit cgcpu;
                 end;
               OP_AND, OP_OR, OP_XOR:
                 begin
-                  if longword(a) = high(longword) then
+                  { low word operation }
+                  if aint(a and $FFFF) = aint(0) then
                     begin
                       case op of
                         OP_AND:
-                          exit;
+                          a_load_const_reg(list,OS_16,aint(0),reg);
+                        OP_OR,OP_XOR:
+                          {do nothing};
+                        else
+                          InternalError(2013100701);
+                      end;
+                    end
+                  else if aint(a and $FFFF) = aint($FFFF) then
+                    begin
+                      case op of
+                        OP_AND:
+                          {do nothing};
                         OP_OR:
-                          a_load_const_reg(list,size,high(longword),reg);
+                          a_load_const_reg(list,OS_16,aint($FFFF),reg);
                         OP_XOR:
-                          begin
-                            list.concat(taicpu.op_reg(A_NOT,S_W,reg));
-                            list.concat(taicpu.op_reg(A_NOT,S_W,GetNextReg(reg)));
-                          end;
+                          list.concat(taicpu.op_reg(A_NOT,S_W,reg));
                         else
                           InternalError(2013100701);
-                      end
+                      end;
                     end
                   else
-                  begin
                     a_op_const_reg(list,op,OS_16,aint(a and $FFFF),reg);
+
+                  { high word operation }
+                  if aint(a shr 16) = aint(0) then
+                    begin
+                      case op of
+                        OP_AND:
+                          a_load_const_reg(list,OS_16,aint(0),GetNextReg(reg));
+                        OP_OR,OP_XOR:
+                          {do nothing};
+                        else
+                          InternalError(2013100701);
+                      end;
+                    end
+                  else if aint(a shr 16) = aint($FFFF) then
+                    begin
+                      case op of
+                        OP_AND:
+                          {do nothing};
+                        OP_OR:
+                          a_load_const_reg(list,OS_16,aint($FFFF),GetNextReg(reg));
+                        OP_XOR:
+                          list.concat(taicpu.op_reg(A_NOT,S_W,GetNextReg(reg)));
+                        else
+                          InternalError(2013100701);
+                      end;
+                    end
+                  else
                     a_op_const_reg(list,op,OS_16,aint(a shr 16),GetNextReg(reg));
-                  end;
                 end;
               OP_SHR,OP_SHL,OP_SAR:
                 begin
@@ -448,13 +479,6 @@ unit cgcpu;
             { 8086 doesn't support 'imul reg,const', so we handle it here }
             if (current_settings.cputype<cpu_186) and (op in [OP_MUL,OP_IMUL]) then
               begin
-                { TODO: also enable the SHL optimization below }
-    {            if not(cs_check_overflow in current_settings.localswitches) and
-                   ispowerof2(int64(a),power) then
-                  begin
-                    list.concat(taicpu.op_const_reg(A_SHL,TCgSize2OpSize[size],power,reg));
-                    exit;
-                  end;}
                 if op = OP_IMUL then
                   begin
                     if size in [OS_16,OS_S16] then
@@ -470,15 +494,17 @@ unit cgcpu;
                     a_load_const_reg(list,size,a,ax_subreg);
                     if size in [OS_16,OS_S16] then
                       getcpuregister(list,NR_DX);
-                    list.concat(taicpu.op_reg(A_IMUL,TCgSize2OpSize[size],reg));
+                    { prefer MUL over IMUL when overflow checking is off, }
+                    { because it's faster on the 8086 & 8088              }
+                    if not (cs_check_overflow in current_settings.localswitches) then
+                      list.concat(taicpu.op_reg(A_MUL,TCgSize2OpSize[size],reg))
+                    else
+                      list.concat(taicpu.op_reg(A_IMUL,TCgSize2OpSize[size],reg));
                     if size in [OS_16,OS_S16] then
                       ungetcpuregister(list,NR_DX);
                     a_load_reg_reg(list,size,size,ax_subreg,reg);
 
                     ungetcpuregister(list,NR_AX);
-
-                    { TODO: implement overflow checking? }
-
                     exit;
                   end
                 else
@@ -497,7 +523,7 @@ unit cgcpu;
         tmpref: treference;
         op1,op2: TAsmOp;
       begin
-        optimize_op_const(op, a);
+        optimize_op_const(size, op, a);
         tmpref:=ref;
         make_simple_ref(list,tmpref);
 
@@ -522,7 +548,8 @@ unit cgcpu;
                   if aint(a and $FFFF) = 0 then
                     begin
                       inc(tmpref.offset, 2);
-                      list.concat(taicpu.op_const_ref(op1,S_W,aint(a shr 16),tmpref));
+                      { use a_op_const_ref to allow the use of inc/dec }
+                      a_op_const_ref(list,op,OS_16,aint(a shr 16),tmpref);
                     end
                   else
                     begin
@@ -533,29 +560,62 @@ unit cgcpu;
                 end;
               OP_AND, OP_OR, OP_XOR:
                 begin
-                  if longword(a) = high(longword) then
+                  { low word operation }
+                  if aint(a and $FFFF) = aint(0) then
+                    begin
+                      case op of
+                        OP_AND:
+                          a_load_const_ref(list,OS_16,aint(0),ref);
+                        OP_OR,OP_XOR:
+                          {do nothing};
+                        else
+                          InternalError(2013100701);
+                      end;
+                    end
+                  else if aint(a and $FFFF) = aint($FFFF) then
                     begin
                       case op of
                         OP_AND:
-                          exit;
+                          {do nothing};
                         OP_OR:
-                          a_load_const_ref(list,size,high(longword),tmpref);
+                          a_load_const_ref(list,OS_16,aint($FFFF),tmpref);
                         OP_XOR:
-                          begin
-                            list.concat(taicpu.op_ref(A_NOT,S_W,tmpref));
-                            inc(tmpref.offset, 2);
-                            list.concat(taicpu.op_ref(A_NOT,S_W,tmpref));
-                          end;
+                          list.concat(taicpu.op_ref(A_NOT,S_W,tmpref));
                         else
                           InternalError(2013100701);
-                      end
+                      end;
                     end
                   else
-                  begin
                     a_op_const_ref(list,op,OS_16,aint(a and $FFFF),tmpref);
-                    inc(tmpref.offset, 2);
+
+                  { high word operation }
+                  inc(tmpref.offset, 2);
+                  if aint(a shr 16) = aint(0) then
+                    begin
+                      case op of
+                        OP_AND:
+                          a_load_const_ref(list,OS_16,aint(0),tmpref);
+                        OP_OR,OP_XOR:
+                          {do nothing};
+                        else
+                          InternalError(2013100701);
+                      end;
+                    end
+                  else if aint(a shr 16) = aint($FFFF) then
+                    begin
+                      case op of
+                        OP_AND:
+                          {do nothing};
+                        OP_OR:
+                          a_load_const_ref(list,OS_16,aint($FFFF),tmpref);
+                        OP_XOR:
+                          list.concat(taicpu.op_ref(A_NOT,S_W,tmpref));
+                        else
+                          InternalError(2013100701);
+                      end;
+                    end
+                  else
                     a_op_const_ref(list,op,OS_16,aint(a shr 16),tmpref);
-                  end;
                 end;
               else
                 internalerror(2013050802);
@@ -1050,7 +1110,6 @@ unit cgcpu;
 
     procedure tcg8086.a_load_reg_ref(list : TAsmList;fromsize,tosize: tcgsize; reg : tregister;const ref : treference);
       var
-        tmpsize : tcgsize;
         tmpreg  : tregister;
         tmpref  : treference;
       begin
@@ -1066,15 +1125,12 @@ unit cgcpu;
               internalerror(2013030310);
           OS_16,OS_S16:
             case fromsize of
-              OS_8:
+              OS_8,OS_S8:
                 begin
-                  reg := makeregsize(list, reg, OS_16);
-                  setsubreg(reg, R_SUBH);
-                  list.concat(taicpu.op_const_reg(A_MOV, S_B, 0, reg));
-                  setsubreg(reg, R_SUBW);
-                  list.concat(taicpu.op_reg_ref(A_MOV, S_W, reg, tmpref));
+                  tmpreg:=getintregister(list,tosize);
+                  a_load_reg_reg(list,fromsize,tosize,reg,tmpreg);
+                  a_load_reg_ref(list,tosize,tosize,tmpreg,tmpref);
                 end;
-              OS_S8: internalerror(2013052503);  { TODO }
               OS_16,OS_S16:
                 begin
                   list.concat(taicpu.op_reg_ref(A_MOV, S_W, reg, tmpref));
@@ -1084,26 +1140,18 @@ unit cgcpu;
             end;
           OS_32,OS_S32:
             case fromsize of
-              OS_8:
+              OS_8,OS_S8,OS_S16:
                 begin
-                  reg := makeregsize(list, reg, OS_16);
-                  setsubreg(reg, R_SUBH);
-                  list.concat(taicpu.op_const_reg(A_MOV, S_B, 0, reg));
-                  setsubreg(reg, R_SUBW);
-                  list.concat(taicpu.op_reg_ref(A_MOV, S_W, reg, tmpref));
-                  inc(tmpref.offset, 2);
-                  list.concat(taicpu.op_const_ref(A_MOV, S_W, 0, tmpref));
+                  tmpreg:=getintregister(list,tosize);
+                  a_load_reg_reg(list,fromsize,tosize,reg,tmpreg);
+                  a_load_reg_ref(list,tosize,tosize,tmpreg,tmpref);
                 end;
-              OS_S8:
-                internalerror(2013052501);  { TODO }
               OS_16:
                 begin
                   list.concat(taicpu.op_reg_ref(A_MOV, S_W, reg, tmpref));
                   inc(tmpref.offset, 2);
                   list.concat(taicpu.op_const_ref(A_MOV, S_W, 0, tmpref));
                 end;
-              OS_S16:
-                internalerror(2013052502);  { TODO }
               OS_32,OS_S32:
                 begin
                   list.concat(taicpu.op_reg_ref(A_MOV, S_W, reg, tmpref));
@@ -1152,9 +1200,11 @@ unit cgcpu;
             case fromsize of
               OS_8:
                 begin
-                  list.concat(taicpu.op_const_reg(A_MOV, S_W, 0, reg));
                   reg := makeregsize(list, reg, OS_8);
                   list.concat(taicpu.op_ref_reg(A_MOV, S_B, tmpref, reg));
+                  setsubreg(reg, R_SUBH);
+                  list.concat(taicpu.op_const_reg(A_MOV, S_B, 0, reg));
+                  makeregsize(list, reg, OS_16);
                 end;
               OS_S8:
                 begin
@@ -1174,9 +1224,11 @@ unit cgcpu;
               OS_8:
                 begin
                   list.concat(taicpu.op_const_reg(A_MOV,S_W,0,GetNextReg(reg)));
-                  list.concat(taicpu.op_const_reg(A_MOV, S_W, 0, reg));
                   reg := makeregsize(list, reg, OS_8);
                   list.concat(taicpu.op_ref_reg(A_MOV, S_B, tmpref, reg));
+                  setsubreg(reg, R_SUBH);
+                  list.concat(taicpu.op_const_reg(A_MOV, S_B, 0, reg));
+                  makeregsize(list, reg, OS_16);
                 end;
               OS_S8:
                 begin
@@ -1244,12 +1296,15 @@ unit cgcpu;
             fromsize:=tosize;
           end;
 
-        if (reg1<>reg2) then
+        if (reg1<>reg2) or (fromsize<>tosize) then
           begin
             case tosize of
               OS_8,OS_S8:
                 if fromsize in [OS_8,OS_S8] then
-                  add_mov(taicpu.op_reg_reg(A_MOV, S_B, reg1, reg2))
+                  begin
+                    if reg1<>reg2 then
+                      add_mov(taicpu.op_reg_reg(A_MOV, S_B, reg1, reg2));
+                  end
                 else
                   internalerror(2013030210);
               OS_16,OS_S16:
@@ -1257,9 +1312,11 @@ unit cgcpu;
                   OS_8:
                     begin
                       reg2 := makeregsize(list, reg2, OS_8);
-                      add_mov(taicpu.op_reg_reg(A_MOV, S_B, reg1, reg2));
+                      if reg1<>reg2 then
+                        add_mov(taicpu.op_reg_reg(A_MOV, S_B, reg1, reg2));
                       setsubreg(reg2,R_SUBH);
                       list.concat(taicpu.op_const_reg(A_MOV, S_B, 0, reg2));
+                      makeregsize(list, reg2, OS_16);
                     end;
                   OS_S8:
                     begin
@@ -1270,7 +1327,10 @@ unit cgcpu;
                       ungetcpuregister(list, NR_AX);
                     end;
                   OS_16,OS_S16:
-                    add_mov(taicpu.op_reg_reg(A_MOV, S_W, reg1, reg2));
+                    begin
+                      if reg1<>reg2 then
+                        add_mov(taicpu.op_reg_reg(A_MOV, S_W, reg1, reg2));
+                    end
                   else
                     internalerror(2013030212);
                 end;
@@ -1280,9 +1340,11 @@ unit cgcpu;
                     begin
                       list.concat(taicpu.op_const_reg(A_MOV, S_W, 0, GetNextReg(reg2)));
                       reg2 := makeregsize(list, reg2, OS_8);
-                      add_mov(taicpu.op_reg_reg(A_MOV, S_B, reg1, reg2));
+                      if reg1<>reg2 then
+                        add_mov(taicpu.op_reg_reg(A_MOV, S_B, reg1, reg2));
                       setsubreg(reg2,R_SUBH);
                       list.concat(taicpu.op_const_reg(A_MOV, S_B, 0, reg2));
+                      makeregsize(list, reg2, OS_16);
                     end;
                   OS_S8:
                     begin
@@ -1298,7 +1360,8 @@ unit cgcpu;
                     end;
                   OS_16:
                     begin
-                      add_mov(taicpu.op_reg_reg(A_MOV, S_W, reg1, reg2));
+                      if reg1<>reg2 then
+                        add_mov(taicpu.op_reg_reg(A_MOV, S_W, reg1, reg2));
                       list.concat(taicpu.op_const_reg(A_MOV,S_W,0,GetNextReg(reg2)));
                     end;
                   OS_S16:
@@ -1307,15 +1370,19 @@ unit cgcpu;
                       add_mov(taicpu.op_reg_reg(A_MOV, S_W, reg1, NR_AX));
                       getcpuregister(list, NR_DX);
                       list.concat(taicpu.op_none(A_CWD));
-                      add_mov(taicpu.op_reg_reg(A_MOV, S_W, NR_AX, reg2));
+                      if reg1<>reg2 then
+                        add_mov(taicpu.op_reg_reg(A_MOV, S_W, NR_AX, reg2));
                       ungetcpuregister(list, NR_AX);
                       add_mov(taicpu.op_reg_reg(A_MOV, S_W, NR_DX, GetNextReg(reg2)));
                       ungetcpuregister(list, NR_DX);
                     end;
                   OS_32,OS_S32:
                     begin
-                      add_mov(taicpu.op_reg_reg(A_MOV, S_W, reg1, reg2));
-                      add_mov(taicpu.op_reg_reg(A_MOV, S_W, GetNextReg(reg1), GetNextReg(reg2)));
+                      if reg1<>reg2 then
+                        begin
+                          add_mov(taicpu.op_reg_reg(A_MOV, S_W, reg1, reg2));
+                          add_mov(taicpu.op_reg_reg(A_MOV, S_W, GetNextReg(reg1), GetNextReg(reg2)));
+                        end;
                     end;
                   else
                     internalerror(2013030213);
@@ -1327,19 +1394,208 @@ unit cgcpu;
       end;
 
 
+    procedure tcg8086.a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);
+      var
+        hl_skip: TAsmLabel;
+      begin
+        if size in [OS_32, OS_S32] then
+          begin
+            if (longint(a shr 16) = 0) then
+              list.concat(taicpu.op_reg_reg(A_TEST,S_W,GetNextReg(reg),GetNextReg(reg)))
+            else
+              list.concat(taicpu.op_const_reg(A_CMP,S_W,longint(a shr 16),GetNextReg(reg)));
+            current_asmdata.getjumplabel(hl_skip);
+            gen_cmp32_jmp1(list, cmp_op, hl_skip, l);
+
+            if (longint(a and $ffff) = 0) then
+              list.concat(taicpu.op_reg_reg(A_TEST,S_W,reg,reg))
+            else
+              list.concat(taicpu.op_const_reg(A_CMP,S_W,longint(a and $ffff),reg));
+            gen_cmp32_jmp2(list, cmp_op, hl_skip, l);
+            a_label(list,hl_skip);
+          end
+        else
+          inherited a_cmp_const_reg_label(list, size, cmp_op, a, reg, l);
+      end;
+
+
+    procedure tcg8086.a_cmp_const_ref_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; const ref: treference; l: tasmlabel);
+      var
+        tmpref: treference;
+        hl_skip: TAsmLabel;
+      begin
+        if size in [OS_32, OS_S32] then
+          begin
+            tmpref:=ref;
+            make_simple_ref(list,tmpref);
+            inc(tmpref.offset,2);
+            list.concat(taicpu.op_const_ref(A_CMP,S_W,longint(a shr 16),tmpref));
+            current_asmdata.getjumplabel(hl_skip);
+            gen_cmp32_jmp1(list, cmp_op, hl_skip, l);
+            dec(tmpref.offset,2);
+            list.concat(taicpu.op_const_ref(A_CMP,S_W,longint(a and $ffff),tmpref));
+            gen_cmp32_jmp2(list, cmp_op, hl_skip, l);
+            a_label(list,hl_skip);
+          end
+        else
+          inherited a_cmp_const_ref_label(list, size, cmp_op, a, ref, l);
+      end;
+
+
+    procedure tcg8086.a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1, reg2: tregister; l: tasmlabel);
+      var
+        hl_skip: TAsmLabel;
+      begin
+        if size in [OS_32, OS_S32] then
+          begin
+            check_register_size(size,reg1);
+            check_register_size(size,reg2);
+            list.concat(taicpu.op_reg_reg(A_CMP,S_W,GetNextReg(reg1),GetNextReg(reg2)));
+            current_asmdata.getjumplabel(hl_skip);
+            gen_cmp32_jmp1(list, cmp_op, hl_skip, l);
+            list.concat(taicpu.op_reg_reg(A_CMP,S_W,reg1,reg2));
+            gen_cmp32_jmp2(list, cmp_op, hl_skip, l);
+            a_label(list,hl_skip);
+          end
+        else
+          inherited a_cmp_reg_reg_label(list, size, cmp_op, reg1, reg2, l);
+      end;
+
+
+    procedure tcg8086.a_cmp_ref_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; const ref: treference; reg: tregister; l: tasmlabel);
+      var
+        tmpref: treference;
+        hl_skip: TAsmLabel;
+      begin
+        if size in [OS_32, OS_S32] then
+          begin
+            tmpref:=ref;
+            make_simple_ref(list,tmpref);
+            check_register_size(size,reg);
+            inc(tmpref.offset,2);
+            list.concat(taicpu.op_ref_reg(A_CMP,S_W,tmpref,GetNextReg(reg)));
+            current_asmdata.getjumplabel(hl_skip);
+            gen_cmp32_jmp1(list, cmp_op, hl_skip, l);
+            dec(tmpref.offset,2);
+            list.concat(taicpu.op_ref_reg(A_CMP,S_W,tmpref,reg));
+            gen_cmp32_jmp2(list, cmp_op, hl_skip, l);
+            a_label(list,hl_skip);
+          end
+        else
+          inherited a_cmp_ref_reg_label(list, size, cmp_op, ref, reg, l);
+      end;
+
+
+    procedure tcg8086.a_cmp_reg_ref_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg: tregister; const ref: treference; l: tasmlabel);
+      var
+        tmpref: treference;
+        hl_skip: TAsmLabel;
+      begin
+        if size in [OS_32, OS_S32] then
+          begin
+            tmpref:=ref;
+            make_simple_ref(list,tmpref);
+            check_register_size(size,reg);
+            inc(tmpref.offset,2);
+            list.concat(taicpu.op_reg_ref(A_CMP,S_W,GetNextReg(reg),tmpref));
+            current_asmdata.getjumplabel(hl_skip);
+            gen_cmp32_jmp1(list, cmp_op, hl_skip, l);
+            dec(tmpref.offset,2);
+            list.concat(taicpu.op_reg_ref(A_CMP,S_W,reg,tmpref));
+            gen_cmp32_jmp2(list, cmp_op, hl_skip, l);
+            a_label(list,hl_skip);
+          end
+        else
+          inherited a_cmp_reg_ref_label(list, size, cmp_op, reg, ref, l);
+      end;
+
+
+    procedure tcg8086.gen_cmp32_jmp1(list: TAsmList; cmp_op: topcmp; l_skip, l_target: TAsmLabel);
+      begin
+        case cmp_op of
+          OC_EQ:
+            a_jmp_cond(list, OC_NE, l_skip);
+          OC_NE:
+            a_jmp_cond(list, OC_NE, l_target);
+          OC_GT,OC_GTE:
+            begin
+              a_jmp_cond(list, OC_GT, l_target);
+              a_jmp_cond(list, OC_LT, l_skip);
+            end;
+          OC_LT,OC_LTE:
+            begin
+              a_jmp_cond(list, OC_LT, l_target);
+              a_jmp_cond(list, OC_GT, l_skip);
+            end;
+          OC_B,OC_BE:
+            begin
+              a_jmp_cond(list, OC_B, l_target);
+              a_jmp_cond(list, OC_A, l_skip);
+            end;
+          OC_A,OC_AE:
+            begin
+              a_jmp_cond(list, OC_A, l_target);
+              a_jmp_cond(list, OC_B, l_skip);
+            end;
+          else
+            internalerror(2014010305);
+        end;
+      end;
+
+    procedure tcg8086.gen_cmp32_jmp2(list: TAsmList; cmp_op: topcmp; l_skip, l_target: TAsmLabel);
+      begin
+        case cmp_op of
+          OC_EQ:
+            a_jmp_cond(list, OC_EQ, l_target);
+          OC_GT:
+            a_jmp_cond(list, OC_A, l_target);
+          OC_LT:
+            a_jmp_cond(list, OC_B, l_target);
+          OC_GTE:
+            a_jmp_cond(list, OC_AE, l_target);
+          OC_LTE:
+            a_jmp_cond(list, OC_BE, l_target);
+          OC_NE:
+            a_jmp_cond(list, OC_NE, l_target);
+          OC_BE:
+            a_jmp_cond(list, OC_BE, l_target);
+          OC_B:
+            a_jmp_cond(list, OC_B, l_target);
+          OC_AE:
+            a_jmp_cond(list, OC_AE, l_target);
+          OC_A:
+            a_jmp_cond(list, OC_A, l_target);
+          else
+            internalerror(2014010306);
+        end;
+      end;
+
+
     procedure tcg8086.g_flags2reg(list: TAsmList; size: TCgSize; const f: tresflags; reg: TRegister);
       var
         ai : taicpu;
-        hreg, hreg16 : tregister;
+        hreg16 : tregister;
         hl_skip: TAsmLabel;
         invf: TResFlags;
+        tmpsize: TCgSize;
       begin
-        hreg:=makeregsize(list,reg,OS_8);
-
         invf := f;
         inverse_flags(invf);
 
-        list.concat(Taicpu.op_const_reg(A_MOV, S_B, 0, hreg));
+        case size of
+          OS_8,OS_S8:
+            begin
+              tmpsize:=OS_8;
+              list.concat(Taicpu.op_const_reg(A_MOV, S_B, 0, reg));
+            end;
+          OS_16,OS_S16,OS_32,OS_S32:
+            begin
+              tmpsize:=OS_16;
+              list.concat(Taicpu.op_const_reg(A_MOV, S_W, 0, reg));
+            end;
+          else
+            internalerror(2013123101);
+        end;
 
         current_asmdata.getjumplabel(hl_skip);
         ai:=Taicpu.Op_Sym(A_Jcc,S_NO,hl_skip);
@@ -1348,23 +1604,39 @@ unit cgcpu;
         list.concat(ai);
 
         { 16-bit INC is shorter than 8-bit }
-        hreg16:=makeregsize(list,hreg,OS_16);
+        hreg16:=makeregsize(list,reg,OS_16);
         list.concat(Taicpu.op_reg(A_INC, S_W, hreg16));
+        makeregsize(list,hreg16,tmpsize);
 
         a_label(list,hl_skip);
 
-        if reg<>hreg then
-          a_load_reg_reg(list,OS_8,size,hreg,reg);
+        a_load_reg_reg(list,tmpsize,size,reg,reg);
       end;
 
 
     procedure tcg8086.g_flags2ref(list: TAsmList; size: TCgSize; const f: tresflags; const ref: TReference);
       var
         tmpreg : tregister;
+        tmpregsize: TCgSize;
+        tmpref: treference;
       begin
-        tmpreg:=getintregister(list,size);
-        g_flags2reg(list,size,f,tmpreg);
-        a_load_reg_ref(list,size,size,tmpreg,ref);
+        if size in [OS_8,OS_S8,OS_16,OS_S16] then
+          tmpregsize:=size
+        else
+          tmpregsize:=OS_16;
+        tmpreg:=getintregister(list,tmpregsize);
+        g_flags2reg(list,tmpregsize,f,tmpreg);
+
+        tmpref:=ref;
+        make_simple_ref(list,tmpref);
+        if size in [OS_64,OS_S64] then
+          begin
+            a_load_reg_ref(list,tmpregsize,OS_32,tmpreg,tmpref);
+            inc(tmpref.offset,4);
+            a_load_const_ref(list,OS_32,0,tmpref);
+          end
+        else
+          a_load_reg_ref(list,tmpregsize,size,tmpreg,tmpref);
       end;
 
 
@@ -1630,6 +1902,72 @@ unit cgcpu;
       end;
 
 
+    procedure tcg8086.add_move_instruction(instr: Taicpu);
+      begin
+        { HACK: when regvars are on, don't notify the register allocator of any
+          direct moves to BX, so it doesn't try to coalesce them. Currently,
+          direct moves to BX are only used when returning an int64 value in
+          AX:BX:CX:DX. This hack fixes a common issue with functions, returning
+          int64, for example:
+
+        function RandomFrom(const AValues: array of Int64): Int64;
+          begin
+            result:=AValues[random(High(AValues)+1)];
+          end;
+
+    	push	bp
+    	mov	bp,sp
+; Var AValues located in register ireg20w
+; Var $highAVALUES located in register ireg21w
+; Var $result located in register ireg33w:ireg32w:ireg31w:ireg30w
+    	mov	ireg20w,word [bp+6]
+    	mov	ireg21w,word [bp+4]
+; [3] result:=AValues[random(High(AValues)+1)];
+    	mov	ireg22w,ireg21w
+    	inc	ireg22w
+    	mov	ax,ireg22w
+    	cwd
+    	mov	ireg23w,ax
+    	mov	ireg24w,dx
+    	push	ireg24w
+    	push	ireg23w
+    	call	SYSTEM_$$_RANDOM$LONGINT$$LONGINT
+    	mov	ireg25w,ax
+    	mov	ireg26w,dx
+    	mov	ireg27w,ireg25w
+    	mov	ireg28w,ireg27w
+    	mov	ireg29w,ireg28w
+    	mov	cl,3
+    	shl	ireg29w,cl
+; Var $result located in register ireg32w:ireg30w
+    	mov	ireg30w,word [ireg20w+ireg29w]
+    	mov	ireg31w,word [ireg20w+ireg29w+2]
+    	mov	ireg32w,word [ireg20w+ireg29w+4]  ; problematic section start
+    	mov	ireg33w,word [ireg20w+ireg29w+6]
+; [4] end;
+    	mov	bx,ireg32w  ; problematic section end
+    	mov	ax,ireg33w
+    	mov	dx,ireg30w
+    	mov	cx,ireg31w
+    	mov	sp,bp
+    	pop	bp
+    	ret	4
+
+        the problem arises, because the register allocator tries to coalesce
+          mov bx,ireg32w
+        however, in the references [ireg20w+ireg29w+const], due to the
+        constraints of i8086, ireg20w can only be BX (or BP, which isn't available
+        to the register allocator, because it's used as a base pointer) }
+
+        if (cs_opt_regvar in current_settings.optimizerswitches) and
+           (instr.opcode=A_MOV) and (instr.ops=2) and
+           (instr.oper[1]^.typ=top_reg) and (getsupreg(instr.oper[1]^.reg)=RS_BX) then
+          exit
+        else
+          inherited add_move_instruction(instr);
+      end;
+
+
     procedure tcg8086.g_adjust_self_value(list:TAsmList;procdef: tprocdef;ioffset: tcgint);
       var
         hsym : tsym;
@@ -1975,12 +2313,13 @@ unit cgcpu;
             end;
           OP_ADD, OP_SUB:
             begin
-              // can't use a_op_const_ref because this may use dec/inc
               get_64bit_ops(op,op1,op2);
               if (value and $ffffffffffff) = 0 then
                 begin
-                  list.concat(taicpu.op_const_reg(op1,S_W,aint((value shr 48) and $ffff),GetNextReg(reg.reghi)));
+                  { use a_op_const_reg to allow the use of inc/dec }
+                  cg.a_op_const_reg(list,op,OS_16,aint((value shr 48) and $ffff),GetNextReg(reg.reghi));
                 end
+              // can't use a_op_const_ref because this may use dec/inc
               else if (value and $ffffffff) = 0 then
                 begin
                   list.concat(taicpu.op_const_reg(op1,S_W,aint((value shr 32) and $ffff),reg.reghi));
@@ -2023,12 +2362,13 @@ unit cgcpu;
           OP_ADD, OP_SUB:
             begin
               get_64bit_ops(op,op1,op2);
-              // can't use a_op_const_ref because this may use dec/inc
               if (value and $ffffffffffff) = 0 then
                 begin
                   inc(tempref.offset,6);
-                  list.concat(taicpu.op_const_ref(op1,S_W,aint((value shr 48) and $ffff),tempref));
+                  { use a_op_const_ref to allow the use of inc/dec }
+                  cg.a_op_const_ref(list,op,OS_16,aint((value shr 48) and $ffff),tempref);
                 end
+              // can't use a_op_const_ref because this may use dec/inc
               else if (value and $ffffffff) = 0 then
                 begin
                   inc(tempref.offset,4);

+ 1 - 1
compiler/i8086/cpuinfo.pas

@@ -125,7 +125,7 @@ Const
 
    level1optimizerswitches = genericlevel1optimizerswitches + [cs_opt_peephole];
    level2optimizerswitches = genericlevel2optimizerswitches + level1optimizerswitches +
-     [{cs_opt_regvar,}cs_opt_stackframe,cs_opt_tailrecursion,cs_opt_nodecse];
+     [{cs_opt_regvar,}cs_opt_stackframe,cs_opt_tailrecursion{,cs_opt_nodecse}];
    level3optimizerswitches = genericlevel3optimizerswitches + level2optimizerswitches + [{,cs_opt_loopunroll}];
    level4optimizerswitches = genericlevel4optimizerswitches + level3optimizerswitches + [cs_useebp];
 

+ 1 - 0
compiler/i8086/i8086att.inc

@@ -946,6 +946,7 @@
 'vzeroupper',
 'andn',
 'bextr',
+'tzcnt',
 'rorx',
 'sarx',
 'shlx',

+ 1 - 0
compiler/i8086/i8086atts.inc

@@ -963,5 +963,6 @@ attsufNONE,
 attsufNONE,
 attsufNONE,
 attsufNONE,
+attsufNONE,
 attsufNONE
 );

+ 1 - 0
compiler/i8086/i8086int.inc

@@ -946,6 +946,7 @@
 'vzeroupper',
 'andn',
 'bextr',
+'tzcnt',
 'rorx',
 'sarx',
 'shlx',

+ 1 - 1
compiler/i8086/i8086nop.inc

@@ -1,2 +1,2 @@
 { don't edit, this file is generated from x86ins.dat }
-1685;
+1686;

+ 1 - 0
compiler/i8086/i8086op.inc

@@ -946,6 +946,7 @@ A_VZEROALL,
 A_VZEROUPPER,
 A_ANDN,
 A_BEXTR,
+A_TZCNT,
 A_RORX,
 A_SARX,
 A_SHLX,

+ 79 - 78
compiler/i8086/i8086prop.inc

@@ -224,8 +224,8 @@
 (Ch: (Ch_Mop2, Ch_Rop1, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Mop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Mop2, Ch_Rop1, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -389,7 +389,7 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_ROp1, Ch_WOp2, Ch_RFLAGS)),
+(Ch: (Ch_ROp1, Ch_RWOp2, Ch_RFLAGS)),
 (Ch: (Ch_RFLAGS, Ch_None, Ch_None)),
 (Ch: (Ch_RFLAGS, Ch_WOp1, Ch_None)),
 (Ch: (Ch_Mop2, Ch_Rop1, Ch_None)),
@@ -489,8 +489,8 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Wop2, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Wop2, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -570,7 +570,7 @@
 (Ch: (Ch_Mop2, Ch_Rop1, Ch_None)),
 (Ch: (Ch_Mop2, Ch_Rop1, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Wop2, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -730,6 +730,10 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -753,6 +757,12 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -761,28 +771,20 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
@@ -796,7 +798,15 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -859,6 +869,14 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -867,6 +885,24 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -876,15 +912,26 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop2, Ch_Rop1, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -893,63 +940,17 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_Wop1, Ch_Rop2, Ch_Rop3)),
-(Ch: (Ch_Wop1, Ch_Rop2, Ch_Rop3)),
-(Ch: (Ch_Wop1, Ch_Rop2, Ch_None)),
-(Ch: (Ch_Wop1, Ch_Rop2, Ch_Rop3)),
-(Ch: (Ch_Wop1, Ch_Rop2, Ch_Rop3)),
-(Ch: (Ch_Wop1, Ch_Rop2, Ch_Rop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Wop2, Ch_WFlags, Ch_Rop1)),
+(Ch: (Ch_Rop1, Ch_Wop2, Ch_None)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
+(Ch: (Ch_Rop1, Ch_Rop2, Ch_Wop3)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_Mop2, Ch_Rop1, Ch_None)),

+ 7 - 0
compiler/i8086/i8086tab.inc

@@ -11571,6 +11571,13 @@
     code    : #242#249#1#247#62#72;
     flags   : if_bmi1
   ),
+  (
+    opcode  : A_TZCNT;
+    ops     : 2;
+    optypes : (ot_reg16 or ot_bits32 or ot_bits64,ot_rm_gpr,ot_none,ot_none);
+    code    : #208#219#2#15#188#72;
+    flags   : if_bmi1 or if_sm
+  ),
   (
     opcode  : A_RORX;
     ops     : 3;

+ 10 - 11
compiler/i8086/n8086add.pas

@@ -596,15 +596,6 @@ interface
 
     procedure ti8086addnode.second_mul(unsigned: boolean);
 
-      procedure add_mov(instr: Taicpu);
-        begin
-          { Notify the register allocator that we have written a move instruction so
-            it can try to eliminate it. }
-          if (instr.oper[0]^.reg<>current_procinfo.framepointer) and (instr.oper[0]^.reg<>NR_STACK_POINTER_REG) then
-            tcgx86(cg).add_move_instruction(instr);
-          current_asmdata.CurrAsmList.concat(instr);
-        end;
-
     var reg:Tregister;
         ref:Treference;
         use_ref:boolean;
@@ -616,6 +607,13 @@ interface
     begin
       pass_left_right;
 
+      { MUL is faster than IMUL on the 8086 & 8088 (and equal in speed on 286+),
+        but it's only safe to use in place of IMUL when overflow checking is off
+        and we're doing a 16-bit>16-bit multiplication }
+      if not (cs_check_overflow in current_settings.localswitches) and
+        (not is_32bitint(resultdef)) then
+        unsigned:=true;
+
       {The location.register will be filled in later (JM)}
       location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
       { Mul supports registers and references, so if not register/reference,
@@ -661,8 +659,9 @@ interface
         {Allocate an imaginary 32-bit register, which consists of a pair of
          16-bit registers and store DX:AX into it}
         location.register := cg.getintregister(current_asmdata.CurrAsmList,OS_32);
-        add_mov(Taicpu.Op_reg_reg(A_MOV,S_W,NR_AX,location.register));
-        add_mov(Taicpu.Op_reg_reg(A_MOV,S_W,NR_DX,GetNextReg(location.register)));
+        cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_DX,GetNextReg(location.register));
+        cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_AX);
+        cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_AX,location.register);
       end
       else
       begin

+ 32 - 3
compiler/i8086/n8086cal.pas

@@ -28,13 +28,15 @@ interface
 { $define AnsiStrRef}
 
     uses
-      nx86cal;
+      nx86cal,cgutils;
 
     type
        ti8086callnode = class(tx86callnode)
        protected
           procedure pop_parasize(pop_size:longint);override;
           procedure extra_interrupt_code;override;
+          procedure extra_call_ref_code(var ref: treference);override;
+          procedure do_call_ref(ref: treference);override;
        end;
 
 
@@ -43,11 +45,11 @@ implementation
     uses
       globtype,systems,
       cutils,verbose,globals,
-      cgbase,cgutils,
+      cgbase,
       cpubase,paramgr,
       aasmtai,aasmdata,aasmcpu,
       ncal,nbas,nmem,nld,ncnv,
-      cga,cgobj,cpuinfo;
+      cga,cgobj,cgx86,cpuinfo;
 
 
 {*****************************************************************************
@@ -92,6 +94,33 @@ implementation
       end;
 
 
+    procedure ti8086callnode.extra_call_ref_code(var ref: treference);
+      begin
+        if ref.base<>NR_NO then
+          begin
+            cg.getcpuregister(current_asmdata.CurrAsmList,NR_BX);
+            cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_16,OS_16,ref.base,NR_BX);
+            ref.base:=NR_BX;
+            cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_BX);
+          end;
+        if ref.index<>NR_NO then
+          begin
+            cg.getcpuregister(current_asmdata.CurrAsmList,NR_SI);
+            cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_16,OS_16,ref.index,NR_SI);
+            ref.index:=NR_SI;
+            cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_SI);
+          end;
+      end;
+
+
+    procedure ti8086callnode.do_call_ref(ref: treference);
+      begin
+        if current_settings.x86memorymodel in x86_far_code_models then
+          ref.refaddr:=addr_far_ref;
+        current_asmdata.CurrAsmList.concat(taicpu.op_ref(A_CALL,S_NO,ref));
+      end;
+
+
 begin
    ccallnode:=ti8086callnode;
 end.

+ 17 - 1
compiler/i8086/n8086inl.pas

@@ -36,6 +36,7 @@ interface
          function typecheck_seg: tnode; override;
          function first_seg: tnode; override;
          procedure second_seg; override;
+         procedure second_get_frame;override;
        end;
 
 implementation
@@ -54,7 +55,7 @@ implementation
     nbas,ncon,ncal,ncnv,nld,ncgutil,
     tgobj,
     cga,cgutils,cgx86,cgobj,hlcgobj,
-    htypechk;
+    htypechk,procinfo;
 
      function ti8086inlinenode.typecheck_seg: tnode;
        begin
@@ -75,6 +76,21 @@ implementation
          current_asmdata.CurrAsmList.Concat(Taicpu.op_reg_reg(A_MOV,S_W,NR_DS,location.register));
        end;
 
+     procedure ti8086inlinenode.second_get_frame;
+       begin
+         if current_settings.x86memorymodel in x86_far_data_models then
+           begin
+             if current_procinfo.framepointer=NR_STACK_POINTER_REG then
+               internalerror(2014030201);
+             location_reset(location,LOC_REGISTER,OS_32);
+             location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_32);
+             emit_reg_reg(A_MOV,S_W,current_procinfo.framepointer,location.register);
+             current_asmdata.CurrAsmList.Concat(Taicpu.op_reg_reg(A_MOV,S_W,NR_SS,GetNextReg(location.register)));
+           end
+         else
+           inherited second_get_frame;
+       end;
+
 begin
    cinlinenode:=ti8086inlinenode;
 end.

+ 3 - 3
compiler/jvm/cpubase.pas

@@ -142,13 +142,13 @@ uses
       maxfpuvarregs = 1;
 
       { Integer Super registers first and last }
-      first_int_imreg = 10;
+      first_int_imreg = 2;
 
       { Float Super register first and last }
-      first_fpu_imreg     = 10;
+      first_fpu_imreg     = 2;
 
       { MM Super register first and last }
-      first_mm_imreg     = 10;
+      first_mm_imreg     = 2;
 
       regnumber_table : array[tregisterindex] of tregister = (
         {$i rjvmnum.inc}

+ 4 - 2
compiler/jvm/hlcgcpu.pas

@@ -271,7 +271,7 @@ implementation
       if (fevalstackheight>fmaxevalstackheight) then
         fmaxevalstackheight:=fevalstackheight;
       if cs_asm_regalloc in current_settings.globalswitches then
-        list.concat(tai_comment.Create(strpnew('allocated '+tostr(slots)+', stack height = '+tostr(fevalstackheight))));
+        list.concat(tai_comment.Create(strpnew('    allocated '+tostr(slots)+', stack height = '+tostr(fevalstackheight))));
     end;
 
   procedure thlcgjvm.decstack(list: TAsmList;slots: longint);
@@ -1711,7 +1711,9 @@ implementation
               if tprocsym(sym).procdeflist.Count<>1 then
                 internalerror(2011071713);
               pd:=tprocdef(tprocsym(sym).procdeflist[0]);
-            end;
+            end
+          else
+            internalerror(2013113008);
           a_load_ref_stack(list,java_jlobject,ref,prepare_stack_for_ref(list,ref,false));
           a_call_name(list,pd,pd.mangledname,nil,false);
           { parameter removed, no result }

+ 4 - 1
compiler/jvm/jvmdef.pas

@@ -282,7 +282,10 @@ implementation
                 s64real:
                   c:='D';
                 else
-                  result:=false;
+                  begin
+                    result:=false;
+                    c:=' ';
+                  end;
               end;
               encodedstr:=encodedstr+c;
             end;

+ 2 - 29
compiler/jvm/njvmadd.pas

@@ -38,8 +38,6 @@ interface
        protected
           function jvm_first_addset: tnode;
 
-          function cmpnode2topcmp(unsigned: boolean): TOpCmp;
-
           procedure second_generic_compare(unsigned: boolean);
 
           procedure pass_left_right;override;
@@ -215,6 +213,8 @@ interface
                       end;
                       procname:='CONTAINSALL'
                     end;
+                  else
+                    internalerror(2013120114);
                 end;
               result:=ccallnode.createinternmethod(left,procname,ccallparanode.create(right,nil));
               { for an unequaln, we have to negate the result of equals }
@@ -333,33 +333,6 @@ interface
       end;
 
 
-    function tjvmaddnode.cmpnode2topcmp(unsigned: boolean): TOpCmp;
-      begin
-        if not unsigned then
-          case nodetype of
-            gtn: result:=OC_GT;
-            gten: result:=OC_GTE;
-            ltn: result:=OC_LT;
-            lten: result:=OC_LTE;
-            equaln: result:=OC_EQ;
-            unequaln: result:=OC_NE;
-            else
-              internalerror(2011010412);
-          end
-        else
-        case nodetype of
-          gtn: result:=OC_A;
-          gten: result:=OC_AE;
-          ltn: result:=OC_B;
-          lten: result:=OC_BE;
-          equaln: result:=OC_EQ;
-          unequaln: result:=OC_NE;
-          else
-            internalerror(2011010412);
-        end;
-      end;
-
-
     procedure tjvmaddnode.second_generic_compare(unsigned: boolean);
       var
         cmpop: TOpCmp;

+ 10 - 5
compiler/jvm/njvmmem.pas

@@ -453,6 +453,14 @@ implementation
              object instances (since that's what they are in Java) }
            right.resultdef:=s32inttype;
            right.location.size:=OS_S32;
+          end
+        else if (right.location.loc<>LOC_CONSTANT) and
+                ((right.resultdef.typ<>orddef) or
+                 (torddef(right.resultdef).ordtype<>s32bit)) then
+          begin
+            { Java array indices are always 32 bit signed integers }
+            hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,s32inttype,true);
+            right.resultdef:=s32inttype;
           end;
 
         { adjust index if necessary }
@@ -462,11 +470,8 @@ implementation
           begin
             thlcgjvm(hlcg).a_load_loc_stack(current_asmdata.CurrAsmList,right.resultdef,right.location);
             thlcgjvm(hlcg).a_op_const_stack(current_asmdata.CurrAsmList,OP_SUB,right.resultdef,tarraydef(left.resultdef).lowrange);
-            if right.location.loc<>LOC_REGISTER then
-              begin
-                location_reset(right.location,LOC_REGISTER,def_cgsize(right.resultdef));
-                right.location.register:=hlcg.getintregister(current_asmdata.CurrAsmList,right.resultdef);
-              end;
+            location_reset(right.location,LOC_REGISTER,def_cgsize(right.resultdef));
+            right.location.register:=hlcg.getintregister(current_asmdata.CurrAsmList,right.resultdef);
             thlcgjvm(hlcg).a_load_stack_reg(current_asmdata.CurrAsmList,right.resultdef,right.location.register);
           end;
 

+ 3 - 0
compiler/jvm/pjvm.pas

@@ -80,6 +80,7 @@ implementation
         sstate: tscannerstate;
         needclassconstructor: boolean;
       begin
+        ps:=nil;
         { if there is at least one constructor for a class, do nothing (for
            records, we'll always also need a parameterless constructor) }
         if not is_javaclass(obj) or
@@ -733,6 +734,8 @@ implementation
         conststr: ansistring;
         first: boolean;
       begin
+        result:=nil;
+        esym:=nil;
         case csym.constdef.typ of
           enumdef:
             begin

+ 25 - 6
compiler/link.pas

@@ -566,7 +566,7 @@ Implementation
     Function TLinker.MakeStaticLibrary:boolean;
       begin
         MakeStaticLibrary:=false;
-        Message(exec_e_dll_not_supported);
+        Message(exec_e_static_lib_not_supported);
       end;
 
 
@@ -787,7 +787,8 @@ Implementation
         binstr := FindUtil(utilsprefix + binstr);
 
 
-        scripted_ar:=target_ar.id=ar_gnu_ar_scripted;
+        scripted_ar:=(target_ar.id=ar_gnu_ar_scripted) or
+                     (target_ar.id=ar_watcom_wlib_omf_scripted);
 
         if scripted_ar then
           begin
@@ -796,15 +797,25 @@ Implementation
             Assign(script, scriptfile);
             Rewrite(script);
             try
-              writeln(script, 'CREATE ' + current_module.staticlibfilename);
+              if (target_ar.id=ar_gnu_ar_scripted) then
+                writeln(script, 'CREATE ' + current_module.staticlibfilename)
+              else { wlib case }
+                writeln(script,'-q -fo -c '+
+                  maybequoted(current_module.staticlibfilename));
               current := TCmdStrListItem(SmartLinkOFiles.First);
               while current <> nil do
                 begin
-                  writeln(script, 'ADDMOD ' + current.str);
+                  if (target_ar.id=ar_gnu_ar_scripted) then
+                  writeln(script, 'ADDMOD ' + current.str)
+                  else
+                    writeln(script,'+' + current.str);
                   current := TCmdStrListItem(current.next);
                 end;
-              writeln(script, 'SAVE');
-              writeln(script, 'END');
+              if (target_ar.id=ar_gnu_ar_scripted) then
+                begin
+                  writeln(script, 'SAVE');
+                  writeln(script, 'END');
+                end;
             finally
               Close(script);
             end;
@@ -1585,10 +1596,18 @@ Implementation
             arfinishcmd : ''
           );
 
+      ar_watcom_wlib_omf_scripted_info : tarinfo =
+          (
+            id    : ar_watcom_wlib_omf_scripted;
+            arcmd : 'wlib @$SCRIPT';
+            arfinishcmd : ''
+          );
+
 
 initialization
   RegisterAr(ar_gnu_ar_info);
   RegisterAr(ar_gnu_ar_scripted_info);
   RegisterAr(ar_gnu_gar_info);
   RegisterAr(ar_watcom_wlib_omf_info);
+  RegisterAr(ar_watcom_wlib_omf_scripted_info);
 end.

+ 1 - 1
compiler/m68k/ag68kgas.pas

@@ -319,7 +319,7 @@ interface
             id     : as_gas;
             idtxt  : 'AS';
             asmbin : 'as';
-            asmcmd : '$ARCH -o $OBJ $ASM';
+            asmcmd : '$ARCH -o $OBJ $EXTRAOPT $ASM';
             supported_targets : [system_m68k_Amiga,system_m68k_Atari,system_m68k_Mac,system_m68k_linux,system_m68k_PalmOS,system_m68k_netbsd,system_m68k_openbsd,system_m68k_embedded];
             flags : [af_needar,af_smartlink_sections];
             labelprefix : '.L';

+ 399 - 242
compiler/m68k/cgcpu.pas

@@ -42,7 +42,7 @@ unit cgcpu;
         procedure a_load_reg_cgpara(list : TAsmList;size : tcgsize;r : tregister;const cgpara : tcgpara);override;
         procedure a_load_const_cgpara(list : TAsmList;size : tcgsize;a : tcgint;const cgpara : tcgpara);override;
         procedure a_load_ref_cgpara(list : TAsmList;size : tcgsize;const r : treference;const cgpara : tcgpara);override;
-        procedure a_loadaddr_ref_cgpara(list : TAsmList;const r : treference;const cgpara : tcgpara);override;
+        //procedure a_loadaddr_ref_cgpara(list : TAsmList;const r : treference;const cgpara : tcgpara);override;
 
         procedure a_call_name(list : TAsmList;const s : string; weak: boolean);override;
         procedure a_call_reg(list : TAsmList;reg : tregister);override;
@@ -67,11 +67,11 @@ unit cgcpu;
         procedure a_loadmm_reg_cgpara(list: TAsmList; size: tcgsize; reg: tregister;const locpara : TCGPara;shuffle : pmmshuffle); override;
 
         procedure a_op_const_reg(list : TAsmList; Op: TOpCG; size: tcgsize; a: tcgint; reg: TRegister); override;
-        //procedure a_op_const_ref(list : TAsmList; Op: TOpCG; size: TCGSize; a: tcgint; const ref: TReference); override;
+        procedure a_op_const_ref(list : TAsmList; Op: TOpCG; size: TCGSize; a: tcgint; const ref: TReference); override;
         procedure a_op_reg_reg(list : TAsmList; Op: TOpCG; size: TCGSize; reg1, reg2: TRegister); override;
+        procedure a_op_reg_ref(list : TAsmList; Op: TOpCG; size: TCGSize; reg: TRegister; const ref: TReference); override;
 
-        procedure a_cmp_const_reg_label(list : TAsmList;size : tcgsize;cmp_op : topcmp;a : tcgint;reg : tregister;
-          l : tasmlabel);override;
+        procedure a_cmp_const_reg_label(list : TAsmList;size : tcgsize;cmp_op : topcmp;a : tcgint;reg : tregister; l : tasmlabel);override;
         procedure a_cmp_reg_reg_label(list : TAsmList;size : tcgsize;cmp_op : topcmp;reg1,reg2 : tregister;l : tasmlabel); override;
         procedure a_jmp_name(list : TAsmList;const s : string); override;
         procedure a_jmp_always(list : TAsmList;l: tasmlabel); override;
@@ -85,8 +85,8 @@ unit cgcpu;
         procedure g_proc_entry(list : TAsmList;localsize : longint;nostackframe:boolean);override;
         procedure g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean);override;
 
-//        procedure g_restore_frame_pointer(list : TAsmList);override;
-//        procedure g_return_from_proc(list : TAsmList;parasize : tcgint);override;
+        procedure g_save_registers(list:TAsmList);override;
+        procedure g_restore_registers(list:TAsmList);override;
 
         procedure g_adjust_self_value(list:TAsmList;procdef:tprocdef;ioffset:tcgint);override;
         procedure g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);override;
@@ -121,6 +121,7 @@ unit cgcpu;
         is 32K
      }
      function isvalidrefoffset(const ref: treference): boolean;
+     function isvalidreference(const ref: treference): boolean;
 
     procedure create_codegen;
 
@@ -156,6 +157,28 @@ unit cgcpu;
        A_NONE
       );
 
+      { opcode with extend bits table lookup, used by 64bit cg }
+      topcg2tasmopx: Array[topcg] of tasmop =
+      (
+       A_NONE,
+       A_NONE,
+       A_ADDX,
+       A_NONE,
+       A_NONE,
+       A_NONE,
+       A_NONE,
+       A_NONE,
+       A_NEGX,
+       A_NONE,
+       A_NONE,
+       A_NONE,
+       A_NONE,
+       A_NONE,
+       A_SUBX,
+       A_NONE,
+       A_NONE,
+       A_NONE
+      );
 
       TOpCmp2AsmCond: Array[topcmp] of TAsmCond =
       (
@@ -172,14 +195,26 @@ unit cgcpu;
        C_HI
       );
 
+     function isvalidreference(const ref: treference): boolean;
+       begin
+         isvalidreference:=isvalidrefoffset(ref) and
+
+           { don't try to generate addressing with symbol and base reg and offset
+             it might fail in linking stage if the symbol is more than 32k away (KB) }
+           not (assigned(ref.symbol) and (ref.base <> NR_NO) and (ref.offset <> 0)) and
+
+           { coldfire and 68000 cannot handle non-addressregs as bases }
+           not ((current_settings.cputype in cpu_coldfire+[cpu_mc68000]) and
+                not isaddressregister(ref.base));
+       end;
 
      function isvalidrefoffset(const ref: treference): boolean;
       begin
          isvalidrefoffset := true;
          if ref.index <> NR_NO then
            begin
-             if ref.base <> NR_NO then
-                internalerror(2002081401);
+//             if ref.base <> NR_NO then
+//                internalerror(2002081401);
              if (ref.offset < low(shortint)) or (ref.offset > high(shortint)) then
                 isvalidrefoffset := false
            end
@@ -364,7 +399,7 @@ unit cgcpu;
           inherited a_load_ref_cgpara(list,size,r,cgpara);
       end;
 
-
+{
     procedure tcg68k.a_loadaddr_ref_cgpara(list : TAsmList;const r : treference;const cgpara : tcgpara);
       var
         tmpreg : tregister;
@@ -403,7 +438,7 @@ unit cgcpu;
               inherited a_loadaddr_ref_cgpara(list,r,cgpara);
           end;
       end;
-
+}
 
     function tcg68k.fixref(list: TAsmList; var ref: treference): boolean;
        var
@@ -705,11 +740,24 @@ unit cgcpu;
 
 
     procedure tcg68k.a_load_const_reg(list : TAsmList;size : tcgsize;a : tcgint;register : tregister);
+      var
+        opsize: topsize;
       begin
+        opsize:=tcgsize2opsize[size];
+
         if isaddressregister(register) then
-         begin
-           list.concat(taicpu.op_const_reg(A_MOVE,S_L,longint(a),register))
-         end
+          begin
+            { an m68k manual I have recommends SUB Ax,Ax to be used instead of CLR for address regs }
+            if a = 0 then
+              list.concat(taicpu.op_reg_reg(A_SUB,S_L,register,register))
+            else
+              { ISA B/C Coldfire has MOV3Q which can move -1 or 1..7 to any reg }
+              if (current_settings.cputype in [cpu_isa_b,cpu_isa_c]) and 
+                 ((longint(a) = -1) or ((longint(a) > 0) and (longint(a) < 8))) then
+                list.concat(taicpu.op_const_reg(A_MOV3Q,S_L,longint(a),register))
+              else
+                list.concat(taicpu.op_const_reg(A_MOVE,S_L,longint(a),register));
+          end
         else
         if a = 0 then
            list.concat(taicpu.op_reg(A_CLR,S_L,register))
@@ -719,15 +767,28 @@ unit cgcpu;
               list.concat(taicpu.op_const_reg(A_MOVEQ,S_L,longint(a),register))
            else
              begin
-               { clear the register first, for unsigned and positive values, so
-                  we don't need to zero extend after }
-               if (size in [OS_16,OS_8]) or
-                  ((size in [OS_S16,OS_S8]) and (a > 0)) then
-                 list.concat(taicpu.op_reg(A_CLR,S_L,register));
-               list.concat(taicpu.op_const_reg(A_MOVE,tcgsize2opsize[size],longint(a),register));
-               { only sign extend if we need to, zero extension is not necessary because the CLR.L above }
-               if (size in [OS_S16,OS_S8]) and (a < 0) then
-                 sign_extend(list,size,register);
+               { ISA B/C Coldfire has sign extend/zero extend moves }
+               if (current_settings.cputype in [cpu_isa_b,cpu_isa_c]) and 
+                  (size in [OS_16, OS_8, OS_S16, OS_S8]) and 
+                  ((longint(a) >= low(smallint)) and (longint(a) <= high(smallint))) then
+                 begin
+                   if size in [OS_16, OS_8] then
+                     list.concat(taicpu.op_const_reg(A_MVZ,opsize,longint(a),register))
+                   else
+                     list.concat(taicpu.op_const_reg(A_MVS,opsize,longint(a),register));
+                 end
+               else
+                 begin
+                   { clear the register first, for unsigned and positive values, so
+                     we don't need to zero extend after }
+                   if (size in [OS_16,OS_8]) or
+                      ((size in [OS_S16,OS_S8]) and (a > 0)) then
+                     list.concat(taicpu.op_reg(A_CLR,S_L,register));
+                   list.concat(taicpu.op_const_reg(A_MOVE,opsize,longint(a),register));
+                   { only sign extend if we need to, zero extension is not necessary because the CLR.L above }
+                   if (size in [OS_S16,OS_S8]) and (a < 0) then
+                     sign_extend(list,size,register);
+                 end;
              end;
          end;
       end;
@@ -1007,7 +1068,7 @@ unit cgcpu;
        instr : taicpu;
        paraloc1,paraloc2,paraloc3 : tcgpara;
       begin
-        optimize_op_const(op, a);
+        optimize_op_const(size, op, a);
         opcode := topcg2tasmop[op];
         case op of
           OP_NONE :
@@ -1111,15 +1172,25 @@ unit cgcpu;
          end;
       end;
 
-{
+
     procedure tcg68k.a_op_const_ref(list : TAsmList; Op: TOpCG; size: TCGSize; a: tcgint; const ref: TReference);
       var
         opcode: tasmop;
+        opsize : topsize;
       begin
-        writeln('a_op_const_ref');
-
-        optimize_op_const(op, a);
+        optimize_op_const(size, op, a);
         opcode := topcg2tasmop[op];
+        opsize := TCGSize2OpSize[size];
+
+        { on ColdFire all arithmetic operations are only possible on 32bit }
+        if not isvalidreference(ref) or
+           ((current_settings.cputype in cpu_coldfire) and (opsize <> S_L)
+           and not (op in [OP_NONE,OP_MOVE])) then
+          begin
+            inherited;
+            exit;
+          end;
+
         case op of
           OP_NONE :
             begin
@@ -1130,13 +1201,32 @@ unit cgcpu;
               { Optimized, replaced with a simple load }
               a_load_const_ref(list,size,a,ref);
             end;
-          else
+          OP_ADD,
+          OP_SUB :
             begin
-              internalerror(2007010101);
+              { add/sub works the same way, so have it unified here }
+              if (a >= 1) and (a <= 8) then
+                begin
+                  if (op = OP_ADD) then
+                    opcode:=A_ADDQ
+                  else
+                    opcode:=A_SUBQ;
+                  list.concat(taicpu.op_const_ref(opcode, opsize, a, ref));
+                end
+              else
+                if current_settings.cputype = cpu_mc68000 then
+                  list.concat(taicpu.op_const_ref(opcode, opsize, a, ref))
+                else
+                  { on ColdFire, ADDI/SUBI cannot act on memory
+                    so we can only go through a register }
+                  inherited;
             end;
+          else begin
+//            list.concat(tai_comment.create(strpnew('a_op_const_ref inherited')));
+            inherited;
+          end;
         end;
       end;
-}
 
     procedure tcg68k.a_op_reg_reg(list : TAsmList; Op: TOpCG; size: TCGSize; reg1, reg2: TRegister);
       var
@@ -1246,6 +1336,36 @@ unit cgcpu;
       end;
 
 
+    procedure tcg68k.a_op_reg_ref(list : TAsmList; Op: TOpCG; size: TCGSize; reg: TRegister; const ref: TReference);
+      var
+        opcode : tasmop;
+        opsize : topsize;
+      begin
+        opcode := topcg2tasmop[op];
+        opsize := TCGSize2OpSize[size];
+
+        { on ColdFire all arithmetic operations are only possible on 32bit 
+          and addressing modes are limited }
+        if not isvalidreference(ref) or
+           ((current_settings.cputype in cpu_coldfire) and (opsize <> S_L)) then
+          begin
+            inherited;
+            exit;
+          end;
+
+        case op of
+          OP_ADD,
+          OP_SUB :
+            begin
+              { add/sub works the same way, so have it unified here }
+              list.concat(taicpu.op_reg_ref(opcode, opsize, reg, ref));
+            end;
+          else begin
+//            list.concat(tai_comment.create(strpnew('a_op_reg_ref inherited')));
+            inherited;
+          end;
+        end;
+      end;
 
     procedure tcg68k.a_cmp_const_reg_label(list : TAsmList;size : tcgsize;cmp_op : topcmp;a : tcgint;reg : tregister;
             l : tasmlabel);
@@ -1572,65 +1692,57 @@ unit cgcpu;
           end;
       end;
 
-{    procedure tcg68k.g_restore_frame_pointer(list : TAsmList);
-      var
-        r:Tregister;
-      begin
-        r:=NR_FRAME_POINTER_REG;
-        list.concat(taicpu.op_reg(A_UNLK,S_NO,r));
-      end;
-}
-
     procedure tcg68k.g_proc_exit(list : TAsmList; parasize: longint; nostackframe: boolean);
       var
         r,hregister : TRegister;
-        localsize: tcgint;
         spr : TRegister;
         fpr : TRegister;
         ref : TReference;
       begin
         if not nostackframe then
           begin
-            localsize := current_procinfo.calc_stackframe_size;
             list.concat(taicpu.op_reg(A_UNLK,S_NO,NR_FRAME_POINTER_REG));
-	    parasize := parasize - target_info.first_parm_offset; { i'm still not 100% confident that this is
-	                                                            correct here, but at least it looks less
-								    hacky, and makes some sense (KB) }
-            if (parasize<>0) then
+            parasize := parasize - target_info.first_parm_offset; { i'm still not 100% confident that this is
+                                                                    correct here, but at least it looks less
+                                                                    hacky, and makes some sense (KB) }
+
+            { if parasize is less than zero here, we probably have a cdecl function.
+              According to the info here: http://www.makestuff.eu/wordpress/gcc-68000-abi/
+              68k GCC uses two different methods to free the stack, depending if the target
+              architecture supports RTD or not, and one does callee side, the other does
+              caller side free, which looks like a PITA to support. We have to figure this 
+              out later. More info welcomed. (KB) }
+
+            if (parasize > 0) then
               begin
-                { only 68020+ supports RTD, so this needs another code path
-                  for 68000 and Coldfire (KB) }
-{ TODO: 68020+ only code generation, without fallback}
                 if current_settings.cputype=cpu_mc68020 then
                   list.concat(taicpu.op_const(A_RTD,S_NO,parasize))
                 else
                   begin
-
                     { We must pull the PC Counter from the stack, before  }
                     { restoring the stack pointer, otherwise the PC would }
                     { point to nowhere!                                   }
 
-                    { save the PC counter (pop it from the stack)         }
-                    { use A0 for this which is defined as a scratch       }
-                    { register                                            }
+                    { Instead of doing a slow copy of the return address while trying    }
+                    { to feed it to the RTS instruction, load the PC to A0 (scratch reg) }
+                    { then free up the stack allocated for paras, then use a JMP (A0) to }
+                    { return to the caller with the paras freed. (KB) }
+
                     hregister:=NR_A0;
                     cg.a_reg_alloc(list,hregister);
                     reference_reset_base(ref,NR_STACK_POINTER_REG,0,4);
                     ref.direction:=dir_inc;
                     list.concat(taicpu.op_ref_reg(A_MOVE,S_L,ref,hregister));
-                    { can we do a quick addition ... }
+
                     r:=NR_SP;
+                    { can we do a quick addition ... }
                     if (parasize > 0) and (parasize < 9) then
                        list.concat(taicpu.op_const_reg(A_ADDQ,S_L,parasize,r))
                     else { nope ... }
                        list.concat(taicpu.op_const_reg(A_ADD,S_L,parasize,r));
 
-                    { restore the PC counter (push it on the stack)       }
-                    reference_reset_base(ref,NR_STACK_POINTER_REG,0,4);
-                    ref.direction:=dir_dec;
-                    cg.a_reg_alloc(list,hregister);
-                    list.concat(taicpu.op_reg_ref(A_MOVE,S_L,hregister,ref));
-                    list.concat(taicpu.op_none(A_RTS,S_NO));
+                    reference_reset_base(ref,hregister,0,4);
+                    list.concat(taicpu.op_ref(A_JMP,S_NO,ref));
                   end;
               end
             else
@@ -1641,9 +1753,9 @@ unit cgcpu;
             list.concat(taicpu.op_none(A_RTS,S_NO));
           end;
 
-//         writeln('g_proc_exit');
          { Routines with the poclearstack flag set use only a ret.
-           also  routines with parasize=0     }
+           also  routines with parasize=0 }
+         { TODO: figure out if these are still relevant to us (KB) }
            (*
          if current_procinfo.procdef.proccalloption in clearstack_pocalls then
            begin
@@ -1658,41 +1770,115 @@ unit cgcpu;
              list.concat(taicpu.op_none(A_RTS,S_NO));
            end
          else
-           begin
-            { return with immediate size possible here
-              signed!
-              RTD is not supported on the coldfire     }
-            if (current_settings.cputype=cpu_MC68020) and (parasize<$7FFF) then
-                list.concat(taicpu.op_const(A_RTD,S_NO,parasize))
-            { manually restore the stack }
+           *)
+      end;
+
+
+    procedure tcg68k.g_save_registers(list:TAsmList);
+      var
+        dataregs: tcpuregisterset;
+        addrregs: tcpuregisterset;
+        href : treference;
+        hreg : tregister;
+        size : longint;
+        r : integer;
+      begin
+        { The code generated by the section below, particularly the movem.l
+          instruction is known to cause an issue when compiled by some GNU 
+          assembler versions (I had it with 2.17, while 2.24 seems OK.) 
+          when you run into this problem, just call inherited here instead
+          to skip the movem.l generation. But better just use working GNU
+          AS version instead. (KB) }
+        dataregs:=[];
+        addrregs:=[];
+
+        { calculate temp. size }
+        size:=0;
+        for r:=low(saved_standard_registers) to high(saved_standard_registers) do
+          if saved_standard_registers[r] in rg[R_INTREGISTER].used_in_proc then
+            begin
+              hreg:=newreg(R_INTREGISTER,saved_address_registers[r],R_SUBWHOLE);
+              inc(size,sizeof(aint));
+              dataregs:=dataregs + [saved_standard_registers[r]];
+            end;
+        if uses_registers(R_ADDRESSREGISTER) then
+          for r:=low(saved_address_registers) to high(saved_address_registers) do
+            if saved_address_registers[r] in rg[R_ADDRESSREGISTER].used_in_proc then
+              begin
+                hreg:=newreg(R_ADDRESSREGISTER,saved_address_registers[r],R_SUBWHOLE);
+                inc(size,sizeof(aint));
+                addrregs:=addrregs + [saved_address_registers[r]];
+              end;
+
+        { 68k has no MM registers }
+        if uses_registers(R_MMREGISTER) then
+          internalerror(2014030201);
+
+        if size>0 then
+          begin
+            tg.GetTemp(list,size,sizeof(aint),tt_noreuse,current_procinfo.save_regs_ref);
+            include(current_procinfo.flags,pi_has_saved_regs);
+
+            { Copy registers to temp }
+            href:=current_procinfo.save_regs_ref;
+            if size = sizeof(aint) then
+              a_load_reg_ref(list, OS_32, OS_32, hreg, href)
             else
+              list.concat(taicpu.op_regset_ref(A_MOVEM,S_L,dataregs,addrregs,href));
+          end;
+      end;
+
+
+    procedure tcg68k.g_restore_registers(list:TAsmList);
+      var
+        dataregs: tcpuregisterset;
+        addrregs: tcpuregisterset;
+        href    : treference;
+        r       : integer;
+        hreg    : tregister;
+        size    : longint;
+      begin
+        { see the remark about buggy GNU AS versions in g_save_registers() (KB) }
+        dataregs:=[];
+        addrregs:=[];
+
+        if not(pi_has_saved_regs in current_procinfo.flags) then
+          exit;
+        { Copy registers from temp }
+        size:=0;
+        for r:=low(saved_standard_registers) to high(saved_standard_registers) do
+          if saved_standard_registers[r] in rg[R_INTREGISTER].used_in_proc then
+            begin
+              inc(size,sizeof(aint));
+              hreg:=newreg(R_INTREGISTER,saved_standard_registers[r],R_SUBWHOLE);
+              { Allocate register so the optimizer does not remove the load }
+              a_reg_alloc(list,hreg);
+              dataregs:=dataregs + [saved_standard_registers[r]];
+            end;
+
+        if uses_registers(R_ADDRESSREGISTER) then
+          for r:=low(saved_address_registers) to high(saved_address_registers) do
+            if saved_address_registers[r] in rg[R_ADDRESSREGISTER].used_in_proc then
               begin
-                { We must pull the PC Counter from the stack, before  }
-                { restoring the stack pointer, otherwise the PC would }
-                { point to nowhere!                                   }
-
-                { save the PC counter (pop it from the stack)         }
-                hregister:=NR_A3;
-                cg.a_reg_alloc(list,hregister);
-                reference_reset_base(ref,NR_STACK_POINTER_REG,0);
-                ref.direction:=dir_inc;
-                list.concat(taicpu.op_ref_reg(A_MOVE,S_L,ref,hregister));
-                { can we do a quick addition ... }
-                r:=NR_SP;
-                if (parasize > 0) and (parasize < 9) then
-                   list.concat(taicpu.op_const_reg(A_ADDQ,S_L,parasize,r))
-                else { nope ... }
-                   list.concat(taicpu.op_const_reg(A_ADD,S_L,parasize,r));
-
-                { restore the PC counter (push it on the stack)       }
-                reference_reset_base(ref,NR_STACK_POINTER_REG,0);
-                ref.direction:=dir_dec;
-                cg.a_reg_alloc(list,hregister);
-                list.concat(taicpu.op_reg_ref(A_MOVE,S_L,hregister,ref));
-                list.concat(taicpu.op_none(A_RTS,S_NO));
-               end;
-           end;
-           *)
+                inc(size,sizeof(aint));
+                hreg:=newreg(R_ADDRESSREGISTER,saved_address_registers[r],R_SUBWHOLE);
+                { Allocate register so the optimizer does not remove the load }
+                a_reg_alloc(list,hreg);
+                addrregs:=addrregs + [saved_address_registers[r]];
+              end;
+
+        { 68k has no MM registers }
+        if uses_registers(R_MMREGISTER) then
+          internalerror(2014030202);
+
+        { Restore registers from temp }
+        href:=current_procinfo.save_regs_ref;
+        if size = sizeof(aint) then
+          a_load_ref_reg(list, OS_32, OS_32, href, hreg)
+        else
+          list.concat(taicpu.op_ref_regset(A_MOVEM,S_L,href,dataregs,addrregs));
+
+        tg.UnGetTemp(list,current_procinfo.save_regs_ref);
       end;
 
 
@@ -1909,156 +2095,127 @@ unit cgcpu;
 {****************************************************************************}
 {                               TCG64F68K                                    }
 {****************************************************************************}
- procedure tcg64f68k.a_op64_reg_reg(list : TAsmList;op:TOpCG;size: tcgsize; regsrc,regdst : tregister64);
-  var
-   hreg1, hreg2 : tregister;
-   opcode : tasmop;
-   instr : taicpu;
-  begin
-//    writeln('a_op64_reg_reg');
-    opcode := topcg2tasmop[op];
-    case op of
-      OP_ADD :
-         begin
-            { if one of these three registers is an address
-              register, we'll really get into problems!
-            }
-            if isaddressregister(regdst.reglo) or
-               isaddressregister(regdst.reghi) or
-               isaddressregister(regsrc.reghi) then
-                 internalerror(20020817);
-            list.concat(taicpu.op_reg_reg(A_ADD,S_L,regsrc.reglo,regdst.reglo));
-            list.concat(taicpu.op_reg_reg(A_ADDX,S_L,regsrc.reghi,regdst.reghi));
-         end;
-      OP_AND,OP_OR :
-          begin
-            { at least one of the registers must be a data register }
-            if (isaddressregister(regdst.reglo) and
-                isaddressregister(regsrc.reglo)) or
-               (isaddressregister(regsrc.reghi) and
-                isaddressregister(regdst.reghi))
-               then
-                 internalerror(20020817);
-            cg.a_op_reg_reg(list,op,OS_32,regsrc.reglo,regdst.reglo);
-            cg.a_op_reg_reg(list,op,OS_32,regsrc.reghi,regdst.reghi);
-          end;
-      { this is handled in 1st pass for 32-bit cpu's (helper call) }
-      OP_IDIV,OP_DIV,
-      OP_IMUL,OP_MUL: internalerror(2002081701);
-      { this is also handled in 1st pass for 32-bit cpu's (helper call) }
-      OP_SAR,OP_SHL,OP_SHR: internalerror(2002081702);
-      OP_SUB:
-         begin
-            { if one of these three registers is an address
-              register, we'll really get into problems!
-            }
-            if isaddressregister(regdst.reglo) or
-               isaddressregister(regdst.reghi) or
-               isaddressregister(regsrc.reghi) then
-                 internalerror(20020817);
-            list.concat(taicpu.op_reg_reg(A_SUB,S_L,regsrc.reglo,regdst.reglo));
-            list.concat(taicpu.op_reg_reg(A_SUBX,S_L,regsrc.reghi,regdst.reghi));
-         end;
-      OP_XOR:
-        begin
-            if isaddressregister(regdst.reglo) or
-               isaddressregister(regsrc.reglo) or
-               isaddressregister(regsrc.reghi) or
-               isaddressregister(regdst.reghi) then
-                 internalerror(20020817);
-            list.concat(taicpu.op_reg_reg(A_EOR,S_L,regsrc.reglo,regdst.reglo));
-            list.concat(taicpu.op_reg_reg(A_EOR,S_L,regsrc.reghi,regdst.reghi));
-        end;
-      OP_NEG:
-        begin
-          if isaddressregister(regdst.reglo) or
-              isaddressregister(regdst.reghi) then
-            internalerror(2012110402);
-          instr:=taicpu.op_reg_reg(A_MOVE,S_L,regsrc.reglo,regdst.reglo);
-          cg.add_move_instruction(instr);
-          list.concat(instr);
-          instr:=taicpu.op_reg_reg(A_MOVE,S_L,regsrc.reghi,regdst.reghi);
-          cg.add_move_instruction(instr);
-          list.concat(instr);
-          list.concat(taicpu.op_reg(A_NEG,S_L,regdst.reglo));
-          list.concat(taicpu.op_reg(A_NEGX,S_L,regdst.reghi));
-        end;
-      OP_NOT:
-        begin
-          if isaddressregister(regdst.reglo) or
-              isaddressregister(regdst.reghi) then
-            internalerror(2012110401);
-          instr:=taicpu.op_reg_reg(A_MOVE,S_L,regsrc.reglo,regdst.reglo);
-          cg.add_move_instruction(instr);
-          list.concat(instr);
-          instr:=taicpu.op_reg_reg(A_MOVE,S_L,regsrc.reghi,regdst.reghi);
-          cg.add_move_instruction(instr);
-          list.concat(instr);
-          list.concat(taicpu.op_reg(A_NOT,S_L,regdst.reglo));
-          list.concat(taicpu.op_reg(A_NOT,S_L,regdst.reghi));
-        end;
-    end; { end case }
-  end;
+    procedure tcg64f68k.a_op64_reg_reg(list : TAsmList;op:TOpCG;size: tcgsize; regsrc,regdst : tregister64);
+      var
+        hreg1, hreg2 : tregister;
+        opcode : tasmop;
+        xopcode : tasmop;
+        instr : taicpu;
+      begin
+        opcode := topcg2tasmop[op];
+        xopcode := topcg2tasmopx[op];
 
+        case op of
+          OP_ADD,OP_SUB:
+            begin
+              { if one of these three registers is an address
+              register, we'll really get into problems! }
+              if isaddressregister(regdst.reglo) or
+                 isaddressregister(regdst.reghi) or
+                 isaddressregister(regsrc.reghi) then
+                internalerror(2014030101);
+              list.concat(taicpu.op_reg_reg(opcode,S_L,regsrc.reglo,regdst.reglo));
+              list.concat(taicpu.op_reg_reg(xopcode,S_L,regsrc.reghi,regdst.reghi));
+            end;
+          OP_AND,OP_OR:
+            begin
+              { at least one of the registers must be a data register }
+              if (isaddressregister(regdst.reglo) and
+                  isaddressregister(regsrc.reglo)) or
+                 (isaddressregister(regsrc.reghi) and
+                  isaddressregister(regdst.reghi)) then
+                internalerror(2014030102);
+              cg.a_op_reg_reg(list,op,OS_32,regsrc.reglo,regdst.reglo);
+              cg.a_op_reg_reg(list,op,OS_32,regsrc.reghi,regdst.reghi);
+            end;
+          { this is handled in 1st pass for 32-bit cpu's (helper call) }
+          OP_IDIV,OP_DIV,
+          OP_IMUL,OP_MUL: 
+            internalerror(2002081701);
+          { this is also handled in 1st pass for 32-bit cpu's (helper call) }
+          OP_SAR,OP_SHL,OP_SHR:
+            internalerror(2002081702);
+          OP_XOR:
+            begin
+              if isaddressregister(regdst.reglo) or
+                 isaddressregister(regsrc.reglo) or
+                 isaddressregister(regsrc.reghi) or
+                 isaddressregister(regdst.reghi) then
+                internalerror(2014030103);
+              cg.a_op_reg_reg(list,op,OS_32,regsrc.reglo,regdst.reglo);
+              cg.a_op_reg_reg(list,op,OS_32,regsrc.reghi,regdst.reghi);
+            end;
+          OP_NEG,OP_NOT:
+            begin
+              if isaddressregister(regdst.reglo) or
+                 isaddressregister(regdst.reghi) then
+               internalerror(2014030104);
+              instr:=taicpu.op_reg_reg(A_MOVE,S_L,regsrc.reglo,regdst.reglo);
+              cg.add_move_instruction(instr);
+              list.concat(instr);
+              instr:=taicpu.op_reg_reg(A_MOVE,S_L,regsrc.reghi,regdst.reghi);
+              cg.add_move_instruction(instr);
+              list.concat(instr);
+              if (op = OP_NOT) then
+                xopcode:=opcode;
+              list.concat(taicpu.op_reg(opcode,S_L,regdst.reglo));
+              list.concat(taicpu.op_reg(xopcode,S_L,regdst.reghi));
+            end;
+        end; { end case }
+      end;
 
- procedure tcg64f68k.a_op64_const_reg(list : TAsmList;op:TOpCG;size: tcgsize; value : int64;regdst : tregister64);
-  var
-   lowvalue : cardinal;
-   highvalue : cardinal;
-   hreg : tregister;
-  begin
-//    writeln('a_op64_const_reg');
-    { is it optimized out ? }
-//    if cg.optimize64_op_const_reg(list,op,value,reg) then
-//       exit;
-
-    lowvalue := cardinal(value);
-    highvalue:= value shr 32;
-
-   { the destination registers must be data registers }
-   if  isaddressregister(regdst.reglo) or
-       isaddressregister(regdst.reghi) then
-         internalerror(20020817);
-   case op of
-      OP_ADD :
-         begin
-           hreg:=cg.getintregister(list,OS_INT);
-           list.concat(taicpu.op_const_reg(A_MOVE,S_L,highvalue,hreg));
-           list.concat(taicpu.op_const_reg(A_ADD,S_L,lowvalue,regdst.reglo));
-           list.concat(taicpu.op_reg_reg(A_ADDX,S_L,hreg,regdst.reghi));
-         end;
-      OP_AND :
-          begin
-            list.concat(taicpu.op_const_reg(A_AND,S_L,lowvalue,regdst.reglo));
-            list.concat(taicpu.op_const_reg(A_AND,S_L,highvalue,regdst.reghi));
-          end;
-      OP_OR :
-          begin
-            list.concat(taicpu.op_const_reg(A_OR,S_L,lowvalue,regdst.reglo));
-            list.concat(taicpu.op_const_reg(A_OR,S_L,highvalue,regdst.reghi));
-          end;
-      { this is handled in 1st pass for 32-bit cpus (helper call) }
-      OP_IDIV,OP_DIV,
-      OP_IMUL,OP_MUL: internalerror(2002081701);
-      { this is also handled in 1st pass for 32-bit cpus (helper call) }
-      OP_SAR,OP_SHL,OP_SHR: internalerror(2002081702);
-      OP_SUB:
-         begin
-           hreg:=cg.getintregister(list,OS_INT);
-           list.concat(taicpu.op_const_reg(A_MOVE,S_L,highvalue,hreg));
-           list.concat(taicpu.op_const_reg(A_SUB,S_L,lowvalue,regdst.reglo));
-           list.concat(taicpu.op_reg_reg(A_SUBX,S_L,hreg,regdst.reghi));
-         end;
-      OP_XOR:
-        begin
-            list.concat(taicpu.op_const_reg(A_EOR,S_L,lowvalue,regdst.reglo));
-            list.concat(taicpu.op_const_reg(A_EOR,S_L,highvalue,regdst.reghi));
-        end;
-      { these should have been handled already by earlier passes }
-      OP_NOT, OP_NEG:
-        internalerror(2012110403);
-    end; { end case }
-  end;
+
+    procedure tcg64f68k.a_op64_const_reg(list : TAsmList;op:TOpCG;size: tcgsize; value : int64;regdst : tregister64);
+      var
+        lowvalue : cardinal;
+        highvalue : cardinal;
+        opcode : tasmop;
+        xopcode : tasmop;
+        hreg : tregister;
+      begin
+        { is it optimized out ? }
+        { optimize64_op_const_reg doesn't seem to be used in any cg64f32 right now. why? (KB) }
+        { if cg.optimize64_op_const_reg(list,op,value,reg) then
+            exit; }
+
+        lowvalue := cardinal(value);
+        highvalue := value shr 32;
+
+        opcode := topcg2tasmop[op];
+        xopcode := topcg2tasmopx[op];
+
+        { the destination registers must be data registers }
+        if isaddressregister(regdst.reglo) or
+           isaddressregister(regdst.reghi) then
+          internalerror(2014030105);
+        case op of
+          OP_ADD,OP_SUB:
+            begin
+              hreg:=cg.getintregister(list,OS_INT);
+              { cg.a_load_const_reg provides optimized loading to register for special cases }
+              cg.a_load_const_reg(list,OS_S32,longint(highvalue),hreg);
+              { don't use cg.a_op_const_reg() here, because a possible optimized
+                ADDQ/SUBQ wouldn't set the eXtend bit }
+              list.concat(taicpu.op_const_reg(opcode,S_L,lowvalue,regdst.reglo));
+              list.concat(taicpu.op_reg_reg(xopcode,S_L,hreg,regdst.reghi));
+            end;
+          OP_AND,OP_OR,OP_XOR:
+            begin
+              cg.a_op_const_reg(list,op,OS_S32,longint(lowvalue),regdst.reglo);
+              cg.a_op_const_reg(list,op,OS_S32,longint(highvalue),regdst.reghi);
+            end;
+          { this is handled in 1st pass for 32-bit cpus (helper call) }
+          OP_IDIV,OP_DIV,
+          OP_IMUL,OP_MUL:
+            internalerror(2002081701);
+          { this is also handled in 1st pass for 32-bit cpus (helper call) }
+          OP_SAR,OP_SHL,OP_SHR: 
+            internalerror(2002081702);
+          { these should have been handled already by earlier passes }
+          OP_NOT,OP_NEG:
+            internalerror(2012110403);
+        end; { end case }
+      end;
 
 
 procedure create_codegen;

+ 1 - 1
compiler/m68k/cpubase.pas

@@ -152,7 +152,7 @@ unit cpubase;
       { registers which may be destroyed by calls }
       VOLATILE_INTREGISTERS = [RS_D0,RS_D1];
       VOLATILE_FPUREGISTERS = [];
-      VOLATILE_ADDRESSREGISTER = [RS_A0,RS_A1];
+      VOLATILE_ADDRESSREGISTERS = [RS_A0,RS_A1];
 
     type
       totherregisterset = set of tregisterindex;

+ 25 - 91
compiler/m68k/cpupara.pas

@@ -52,6 +52,7 @@ unit cpupara;
           function get_volatile_registers_int(calloption:tproccalloption):tcpuregisterset;override;
           function get_volatile_registers_address(calloption:tproccalloption):tcpuregisterset;override;
          private
+          function parse_loc_string_to_register(var locreg: tregister; const s : string): boolean;
           procedure init_values(var curintreg, curfloatreg: tsuperregister; var cur_stack_offset: aword);
           function create_paraloc_info_intern(p : tabstractprocdef; side: tcallercallee; paras: tparalist;
                                                var curintreg, curfloatreg: tsuperregister; var cur_stack_offset: aword):longint;
@@ -66,6 +67,21 @@ unit cpupara;
        cpuinfo,
        defutil;
 
+
+    function tcpuparamanager.get_volatile_registers_int(calloption:tproccalloption):tcpuregisterset;
+      begin
+        { d0 and d1 are considered volatile }
+        Result:=VOLATILE_INTREGISTERS;
+      end;
+
+
+    function tcpuparamanager.get_volatile_registers_address(calloption:tproccalloption):tcpuregisterset;
+      begin
+        { a0 and a1 are considered volatile }
+        Result:=VOLATILE_ADDRESSREGISTERS;
+      end;
+
+
     procedure tcpuparamanager.getintparaloc(pd : tabstractprocdef; nr : longint; var cgpara : tcgpara);
       var
         paraloc : pcgparalocation;
@@ -445,71 +461,22 @@ unit cpupara;
       end;
 }
 
+    function tcpuparamanager.parse_loc_string_to_register(var locreg: tregister; const s : string): boolean;
+      begin
+        locreg:=std_regnum_search(lowercase(s));
+        result:=(locreg <> NR_NO) and (locreg <> NR_SP);
+      end;
+
     function tcpuparamanager.parsefuncretloc(p : tabstractprocdef; const s : string) : boolean;
       begin
-        result:=false;
         case target_info.system of
           system_m68k_amiga:
-            begin
-              if s='D0' then
-                p.exp_funcretloc:=NR_D0
-              else if s='D1' then
-                p.exp_funcretloc:=NR_D1
-              else if s='D2' then
-                p.exp_funcretloc:=NR_D2
-              else if s='D3' then
-                p.exp_funcretloc:=NR_D3
-              else if s='D4' then
-                p.exp_funcretloc:=NR_D4
-              else if s='D5' then
-                p.exp_funcretloc:=NR_D5
-              else if s='D6' then
-                p.exp_funcretloc:=NR_D6
-              else if s='D7' then
-                p.exp_funcretloc:=NR_D7
-              else if s='A0' then
-                p.exp_funcretloc:=NR_A0
-              else if s='A1' then
-                p.exp_funcretloc:=NR_A1
-              else if s='A2' then
-                p.exp_funcretloc:=NR_A2
-              else if s='A3' then
-                p.exp_funcretloc:=NR_A3
-              else if s='A4' then
-                p.exp_funcretloc:=NR_A4
-              else if s='A5' then
-                p.exp_funcretloc:=NR_A5
-              { 'A6' is problematic, since it's the frame pointer in fpc,
-                so it should be saved before a call! }
-              else if s='A6' then
-                p.exp_funcretloc:=NR_A6
-              { 'A7' is the stack pointer on 68k, can't be overwritten by API calls }
-              else
-                p.exp_funcretloc:=NR_NO;
-
-              if p.exp_funcretloc<>NR_NO then result:=true;
-            end;
+            result:=parse_loc_string_to_register(p.exp_funcretloc, s);
           else
             internalerror(2005121801);
         end;
       end;
 
-    function tcpuparamanager.get_volatile_registers_int(calloption:tproccalloption):tcpuregisterset;
-      begin
-        { d0 and d1 are considered volatile (ToDo: results in "procedure too
-          complex when compiling unicodedata.pas) }
-        //Result:=[RS_D0,RS_D1];
-        Result:=[RS_D0..RS_D7];
-      end;
-
-
-    function tcpuparamanager.get_volatile_registers_address(calloption:tproccalloption):tcpuregisterset;
-      begin
-        { a0 and a1 are considered volatile }
-        Result:=[RS_A0,RS_A1];
-      end;
-
-
     function tcpuparamanager.parseparaloc(p : tparavarsym;const s : string) : boolean;
       var
         paraloc : pcgparalocation;
@@ -523,41 +490,8 @@ unit cpupara;
               paraloc^.loc:=LOC_REGISTER;
               paraloc^.size:=def_cgsize(p.vardef);
               paraloc^.def:=p.vardef;
-              { pattern is always uppercase'd }
-              if s='D0' then
-                paraloc^.register:=NR_D0
-              else if s='D1' then
-                paraloc^.register:=NR_D1
-              else if s='D2' then
-                paraloc^.register:=NR_D2
-              else if s='D3' then
-                paraloc^.register:=NR_D3
-              else if s='D4' then
-                paraloc^.register:=NR_D4
-              else if s='D5' then
-                paraloc^.register:=NR_D5
-              else if s='D6' then
-                paraloc^.register:=NR_D6
-              else if s='D7' then
-                paraloc^.register:=NR_D7
-              else if s='A0' then
-                paraloc^.register:=NR_A0
-              else if s='A1' then
-                paraloc^.register:=NR_A1
-              else if s='A2' then
-                paraloc^.register:=NR_A2
-              else if s='A3' then
-                paraloc^.register:=NR_A3
-              else if s='A4' then
-                paraloc^.register:=NR_A4
-              else if s='A5' then
-                paraloc^.register:=NR_A5
-              { 'A6' is problematic, since it's the frame pointer in fpc,
-                so it should be saved before a call! }
-              else if s='A6' then
-                paraloc^.register:=NR_A6
-              { 'A7' is the stack pointer on 68k, can't be overwritten by API calls }
-              else
+
+              if not parse_loc_string_to_register(paraloc^.register, s) then
                 exit;
 
               { copy to callee side }

+ 12 - 12
compiler/m68k/n68kadd.pas

@@ -77,7 +77,7 @@ implementation
         tmpreg:=cg.getintregister(current_asmdata.currasmlist,OS_INT);
 
         { load the value for "false" }
-        current_asmdata.currasmlist.concat(taicpu.op_const_reg(A_MOVE,S_L,0,tmpreg));
+        cg.a_load_const_reg(current_asmdata.currasmlist,OS_INT,0,tmpreg);
 
         current_asmdata.getjumplabel(labelcmp64_1);
         current_asmdata.getjumplabel(labelcmp64_2);
@@ -102,7 +102,7 @@ implementation
         cg.a_label(current_asmdata.currasmlist,labelcmp64_2);
 
         { load the value for "true" }
-        current_asmdata.currasmlist.concat(taicpu.op_const_reg(A_MOVE,S_L,1,tmpreg));
+        cg.a_load_const_reg(current_asmdata.currasmlist,OS_INT,1,tmpreg);
 
         cg.a_label(current_asmdata.currasmlist,labelcmp64_1);
         result:=tmpreg;
@@ -116,7 +116,7 @@ implementation
         tmpreg:=cg.getintregister(current_asmdata.currasmlist,OS_INT);
 
         { load the value for "false" }
-        current_asmdata.currasmlist.concat(taicpu.op_const_reg(A_MOVE,S_L,0,tmpreg));
+        cg.a_load_const_reg(current_asmdata.currasmlist,OS_INT,0,tmpreg);
 
         current_asmdata.getjumplabel(labelcmp64_1);
         current_asmdata.getjumplabel(labelcmp64_2);
@@ -135,7 +135,7 @@ implementation
         cg.a_label(current_asmdata.currasmlist,labelcmp64_2);
 
         { load the value for "true" }
-        current_asmdata.currasmlist.concat(taicpu.op_const_reg(A_MOVE,S_L,1,tmpreg));
+        cg.a_load_const_reg(current_asmdata.currasmlist,OS_INT,1,tmpreg);
 
         cg.a_label(current_asmdata.currasmlist,labelcmp64_1);
         result:=tmpreg;
@@ -150,7 +150,7 @@ implementation
         current_asmdata.getjumplabel(labelcmp64);
 
         { load the value for "false" }
-        current_asmdata.currasmlist.concat(taicpu.op_const_reg(A_MOVE,S_L,0,tmpreg));
+        cg.a_load_const_reg(current_asmdata.currasmlist,OS_INT,0,tmpreg);
 
         { is the high order longword equal? }
         current_asmdata.currasmlist.concat(taicpu.op_reg_reg(A_CMP,S_L,left_reg.reghi,right_reg.reghi));
@@ -161,7 +161,7 @@ implementation
         current_asmdata.currasmlist.concat(taicpu.op_cond_sym(A_BXX,C_NE,S_L,labelcmp64));
 
         { load the value for "true" }
-        current_asmdata.currasmlist.concat(taicpu.op_const_reg(A_MOVE,S_L,1,tmpreg));
+        cg.a_load_const_reg(current_asmdata.currasmlist,OS_INT,1,tmpreg);
 
         cg.a_label(current_asmdata.currasmlist,labelcmp64);
         result:=tmpreg;
@@ -176,7 +176,7 @@ implementation
         current_asmdata.getjumplabel(labelcmp64);
 
         { load the value for "true" }
-        current_asmdata.currasmlist.concat(taicpu.op_const_reg(A_MOVE,S_L,1,tmpreg));
+        cg.a_load_const_reg(current_asmdata.currasmlist,OS_INT,1,tmpreg);
 
         { is the high order longword equal? }
         current_asmdata.currasmlist.concat(taicpu.op_reg_reg(A_CMP,S_L,left_reg.reghi,right_reg.reghi));
@@ -187,7 +187,7 @@ implementation
         current_asmdata.currasmlist.concat(taicpu.op_cond_sym(A_BXX,C_NE,S_L,labelcmp64));
 
         { load the value for "false" }
-        current_asmdata.currasmlist.concat(taicpu.op_const_reg(A_MOVE,S_L,0,tmpreg));
+        cg.a_load_const_reg(current_asmdata.currasmlist,OS_INT,0,tmpreg);
 
         cg.a_label(current_asmdata.currasmlist,labelcmp64);
         result:=tmpreg;
@@ -201,7 +201,7 @@ implementation
         tmpreg:=cg.getintregister(current_asmdata.currasmlist,OS_INT);
 
         { load the value for "false" }
-        current_asmdata.currasmlist.concat(taicpu.op_const_reg(A_MOVE,S_L,0,tmpreg));
+        cg.a_load_const_reg(current_asmdata.currasmlist,OS_INT,0,tmpreg);
 
         current_asmdata.getjumplabel(labelcmp64_1);
         current_asmdata.getjumplabel(labelcmp64_2);
@@ -226,7 +226,7 @@ implementation
         cg.a_label(current_asmdata.currasmlist,labelcmp64_2);
 
         { load the value for "true" }
-        current_asmdata.currasmlist.concat(taicpu.op_const_reg(A_MOVE,S_L,1,tmpreg));
+        cg.a_load_const_reg(current_asmdata.currasmlist,OS_INT,1,tmpreg);
 
         cg.a_label(current_asmdata.currasmlist,labelcmp64_1);
         result:=tmpreg;
@@ -240,7 +240,7 @@ implementation
         tmpreg:=cg.getintregister(current_asmdata.currasmlist,OS_INT);
 
         { load the value for "false" }
-        current_asmdata.currasmlist.concat(taicpu.op_const_reg(A_MOVE,S_L,0,tmpreg));
+        cg.a_load_const_reg(current_asmdata.currasmlist,OS_INT,0,tmpreg);
 
         current_asmdata.getjumplabel(labelcmp64_1);
         current_asmdata.getjumplabel(labelcmp64_2);
@@ -259,7 +259,7 @@ implementation
         cg.a_label(current_asmdata.currasmlist,labelcmp64_2);
 
         { load the value for "true" }
-        current_asmdata.currasmlist.concat(taicpu.op_const_reg(A_MOVE,S_L,1,tmpreg));
+        cg.a_load_const_reg(current_asmdata.currasmlist,OS_INT,1,tmpreg);
 
         cg.a_label(current_asmdata.currasmlist,labelcmp64_1);
         result:=tmpreg;

+ 27 - 36
compiler/m68k/n68kcnv.pas

@@ -34,7 +34,6 @@ interface
           function first_int_to_real: tnode; override;
           procedure second_int_to_real;override;
           procedure second_int_to_bool;override;
-//          procedure pass_generate_code;override;
        end;
 
 implementation
@@ -192,7 +191,9 @@ implementation
               exit;
            end;
 
-         location_reset(location,LOC_REGISTER,def_cgsize(left.resultdef));
+         resflags:=F_NE;
+
+         newsize:=def_cgsize(resultdef);
          opsize := def_cgsize(left.resultdef);
          case left.location.loc of
             LOC_CREFERENCE,LOC_REFERENCE :
@@ -229,12 +230,8 @@ implementation
                          cg.a_load_ref_reg(current_asmdata.CurrAsmList,opsize,opsize,
                             left.location.reference,hreg2);
                          current_asmdata.CurrAsmList.concat(taicpu.op_reg(A_TST,TCGSize2OpSize[opsize],hreg2));
-    //                     cg.ungetcpuregister(current_asmdata.CurrAsmList,hreg2);
                       end;
-    //                reference_release(current_asmdata.CurrAsmList,left.location.reference);
                   end;
-                resflags:=F_NE;
-                hreg1:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
               end;
             LOC_REGISTER,LOC_CREGISTER :
               begin
@@ -249,20 +246,16 @@ implementation
                   begin
                     hreg2:=left.location.register;
                     current_asmdata.CurrAsmList.concat(taicpu.op_reg(A_TST,TCGSize2OpSize[opsize],hreg2));
-    //                cg.ungetcpuregister(current_asmdata.CurrAsmList,hreg2);
                   end;
-                hreg1:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
-                resflags:=F_NE;
               end;
             LOC_FLAGS :
               begin
-                hreg1:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                 resflags:=left.location.resflags;
               end;
             LOC_JUMP :
               begin
                 { for now blindly copied from nx86cnv }
-                location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
+                location_reset(location,LOC_REGISTER,newsize);
                 location.register:=cg.getintregister(current_asmdata.CurrAsmList,location.size);
                 current_asmdata.getjumplabel(hlabel);
                 cg.a_label(current_asmdata.CurrAsmList,current_procinfo.CurrTrueLabel);
@@ -280,36 +273,34 @@ implementation
          end;
          if left.location.loc<>LOC_JUMP then
            begin
-             cg.g_flags2reg(current_asmdata.CurrAsmList,location.size,resflags,hreg1);
-             if (is_cbool(resultdef)) then
-               cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_NEG,location.size,hreg1,hreg1);
-             location.register := hreg1;
+             location_reset(location,LOC_REGISTER,newsize);
+             if newsize in [OS_64,OS_S64] then
+               begin
+                 hreg2:=cg.getintregister(current_asmdata.CurrAsmList,OS_32);
+                 cg.g_flags2reg(current_asmdata.CurrAsmList,OS_32,resflags,hreg2);
+                 if (is_cbool(resultdef)) then
+                   cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_NEG,OS_32,hreg2,hreg2);
+                 location.register64.reglo:=hreg2;
+                 location.register64.reghi:=cg.getintregister(current_asmdata.CurrAsmList,OS_32);
+                 if (is_cbool(resultdef)) then
+                   { reglo is either 0 or -1 -> reghi has to become the same }
+                   cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_32,OS_32,location.register64.reglo,location.register64.reghi)
+                 else
+                   { unsigned }
+                   cg.a_load_const_reg(current_asmdata.CurrAsmList,OS_32,0,location.register64.reghi);
+               end
+            else
+              begin
+                location.register:=cg.getintregister(current_asmdata.CurrAsmList,newsize);
+                cg.g_flags2reg(current_asmdata.CurrAsmList,newsize,resflags,location.register);
+                if (is_cbool(resultdef)) then
+                  cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_NEG,newsize,location.register,location.register);
+              end
            end;
          current_procinfo.CurrTrueLabel:=oldTrueLabel;
          current_procinfo.CurrFalseLabel:=oldFalseLabel;
       end;
 
-{
-    procedure tm68ktypeconvnode.pass_generate_code;
-{$ifdef TESTOBJEXT2}
-      var
-         r : preference;
-         nillabel : plabel;
-{$endif TESTOBJEXT2}
-      begin
-         { this isn't good coding, I think tc_bool_2_int, shouldn't be }
-         { type conversion (FK)                                 }
-
-         if not(convtype in [tc_bool_2_int,tc_bool_2_bool]) then
-           begin
-              secondpass(left);
-              location_copy(location,left.location);
-              if codegenerror then
-               exit;
-           end;
-         second_call_helper(convtype);
-      end;
-}
 
 begin
    ctypeconvnode:=tm68ktypeconvnode;

+ 65 - 11
compiler/mips/aasmcpu.pas

@@ -469,11 +469,12 @@ procedure fixup_jmps(list: TAsmList);
   var
     p,pdelayslot: tai;
     newcomment: tai_comment;
-    newjmp,newnoop: taicpu;
+    newins,newjmp,newnoop: taicpu;
     labelpositions: TFPList;
     instrpos: ptrint;
     l: tasmlabel;
     inserted_something: boolean;
+    href: treference;
   begin
     // if certainly not enough instructions to cause an overflow, dont bother
     if (list.count <= (high(smallint) div 4)) then
@@ -541,11 +542,36 @@ procedure fixup_jmps(list: TAsmList);
                        (ptruint(abs(ptrint(labelpositions[tasmlabel(taicpu(p).oper[0]^.ref^.symbol).labelnr]-instrpos)) - (low(smallint) div 4)) > ptruint((high(smallint) - low(smallint)) div 4)) then
 {$pop}
                       begin
-                        { This is not PIC safe }
-                        taicpu(p).opcode:=A_J;
-                        newcomment:=tai_comment.create(strpnew('fixup_jmps, A_BA changed into A_J'));
-                        list.insertbefore(newcomment,p);
-                      end;
+                        if (cs_create_pic in current_settings.moduleswitches) then
+                          begin
+                            newcomment:=tai_comment.create(strpnew('fixup_jmps, A_BA changed into PIC sequence'));
+                            list.insertbefore(newcomment,p);
+                            href:=taicpu(p).oper[0]^.ref^;
+                            href.refaddr:=addr_pic;
+                            href.base:=NR_GP;
+                            newins:=taicpu.op_reg_ref(A_LW,NR_PIC_FUNC,href);
+                            newins.fileinfo:=taicpu(p).fileinfo;
+                            list.insertbefore(newins,p);
+                            inc(instrpos,2);
+                            if (href.symbol.bind=AB_LOCAL) then
+                              begin
+                                href.refaddr:=addr_low;
+                                href.base:=NR_NO;
+                                newins:=taicpu.op_reg_reg_ref(A_ADDIU,NR_PIC_FUNC,NR_PIC_FUNC,href);
+                                newins.fileinfo:=taicpu(p).fileinfo;
+                                list.insertbefore(newins,p);
+                                inc(instrpos,2);
+                              end;
+                            taicpu(p).opcode:=A_JR;
+                            taicpu(p).loadreg(0,NR_PIC_FUNC);
+                          end
+                        else
+                          begin
+                            taicpu(p).opcode:=A_J;
+                            newcomment:=tai_comment.create(strpnew('fixup_jmps, A_BA changed into A_J'));
+                            list.insertbefore(newcomment,p);
+                          end;
+                       end;
                   A_BC:
                     if (taicpu(p).ops=3) and (taicpu(p).oper[2]^.typ = top_ref) and
                        assigned(taicpu(p).oper[2]^.ref^.symbol) and
@@ -574,11 +600,39 @@ procedure fixup_jmps(list: TAsmList);
                         // add a new unconditional jump between this jump and the label
                         newcomment:=tai_comment.create(strpnew('fixup_jmps, A_BXX changed into A_BNOTXX label;A_J;label:'));
                         list.insertbefore(newcomment,p);
-                        newjmp := taicpu.op_sym(A_J,taicpu(p).oper[2]^.ref^.symbol);
-                        newjmp.is_jmp := true;
-                        newjmp.fileinfo := taicpu(p).fileinfo;
-                        list.insertafter(newjmp,pdelayslot);
-                        inc(instrpos,2);
+                        if (cs_create_pic in current_settings.moduleswitches) then
+                          begin
+                            reference_reset_symbol(href,taicpu(p).oper[2]^.ref^.symbol,0,sizeof(pint));
+                            href.refaddr:=addr_pic;
+                            href.base:=NR_GP;
+                            newins:=taicpu.op_reg_ref(A_LW,NR_PIC_FUNC,href);
+                            newins.fileinfo:=taicpu(p).fileinfo;
+                            list.insertafter(newins,pdelayslot);
+                            pdelayslot:=newins;
+                            inc(instrpos,2);
+                            if (href.symbol.bind=AB_LOCAL) then
+                              begin
+                                href.base:=NR_NO;
+                                href.refaddr:=addr_low;
+                                newins:=taicpu.op_reg_reg_ref(A_ADDIU,NR_PIC_FUNC,NR_PIC_FUNC,href);
+                                newins.fileinfo:=taicpu(p).fileinfo;
+                                list.insertafter(newins,pdelayslot);
+                                pdelayslot:=newins;
+                                inc(instrpos,2);
+                              end;
+                            newjmp:=taicpu.op_reg(A_JR,NR_PIC_FUNC);
+                            newjmp.fileinfo:=taicpu(p).fileinfo;
+                            list.insertafter(newjmp,pdelayslot);
+                            inc(instrpos,2);
+                          end
+                        else
+                          begin
+                            newjmp := taicpu.op_sym(A_J,taicpu(p).oper[2]^.ref^.symbol);
+                            newjmp.is_jmp := true;
+                            newjmp.fileinfo := taicpu(p).fileinfo;
+                            list.insertafter(newjmp,pdelayslot);
+                            inc(instrpos,2);
+                          end;
                         { Add a delay slot for new A_J instruction }
                         newnoop:=taicpu.op_none(A_NOP);
                         newnoop.fileinfo := taicpu(p).fileinfo;

+ 19 - 8
compiler/mips/aoptcpu.pas

@@ -32,7 +32,7 @@ unit aoptcpu;
 
     Type
       TCpuAsmOptimizer = class(TAsmOptimizer)
-        function TryRemoveMov(var p: tai): boolean;
+        function TryRemoveMov(var p: tai; opcode: TAsmOp): boolean;
         function PeepHoleOptPass1Cpu(var p: tai): boolean; override;
       End;
 
@@ -65,21 +65,22 @@ unit aoptcpu;
     end;
 
 
-  function TCpuAsmOptimizer.TryRemoveMov(var p: tai): boolean;
+  function TCpuAsmOptimizer.TryRemoveMov(var p: tai; opcode: TAsmOp): boolean;
     var
       next,hp1: tai;
       alloc,dealloc: tai_regalloc;
     begin
       { Fold
-          op   $reg1,...
-          move $reg2,$reg1
+          op      $reg1,...
+          opcode  $reg2,$reg1
           dealloc $reg1
         into
           op   $reg2,...
+        opcode may be A_MOVE, A_MOV_s, A_MOV_d, etc.
       }
       result:=false;
       if GetNextInstruction(p,next) and
-         MatchInstruction(next,A_MOVE) and
+         MatchInstruction(next,opcode) and
          MatchOperand(taicpu(next).oper[1]^,taicpu(p).oper[0]^.reg) then
         begin
           dealloc:=FindRegDealloc(taicpu(p).oper[0]^.reg,tai(next.Next));
@@ -155,7 +156,7 @@ unit aoptcpu;
                       next.free;
                     end
                   else
-                    TryRemoveMov(p);
+                    TryRemoveMov(p,A_MOVE);
                 end;
 
               A_ANDI:
@@ -185,7 +186,7 @@ unit aoptcpu;
                       next2.free;
                     end
                   else
-                    TryRemoveMov(p);
+                    TryRemoveMov(p,A_MOVE);
                 end;
 
               A_ADD,A_ADDU,
@@ -195,7 +196,17 @@ unit aoptcpu;
               A_SRLV,
               A_SLL,A_SLLV,
               A_AND,A_OR,A_XOR,A_ORI,A_XORI:
-                TryRemoveMov(p);
+                TryRemoveMov(p,A_MOVE);
+
+              A_ADD_s, A_SUB_s, A_MUL_s, A_DIV_s,
+              A_ABS_s, A_NEG_s, A_SQRT_s,
+              A_CVT_s_w, A_CVT_s_l, A_CVT_s_d:
+                TryRemoveMov(p,A_MOV_s);
+
+              A_ADD_d, A_SUB_d, A_MUL_d, A_DIV_d,
+              A_ABS_d, A_NEG_d, A_SQRT_d,
+              A_CVT_d_w, A_CVT_d_l, A_CVT_d_s:
+                TryRemoveMov(p,A_MOV_d);
             end;
           end;
       end;

+ 16 - 17
compiler/mips/cgcpu.pas

@@ -749,7 +749,7 @@ const
 
 procedure TCGMIPS.a_op_const_reg(list: tasmlist; Op: TOpCG; size: tcgsize; a: tcgint; reg: TRegister);
 begin
-  optimize_op_const(op,a);
+  optimize_op_const(size,op,a);
   case op of
     OP_NONE:
       exit;
@@ -796,8 +796,6 @@ end;
 
 
 procedure TCGMIPS.a_op_reg_reg_reg(list: tasmlist; op: TOpCg; size: tcgsize; src1, src2, dst: tregister);
-var
-  hreg: tregister;
 begin
   if (TOpcg2AsmOp[op]=A_NONE) then
     InternalError(2013070305);
@@ -805,11 +803,10 @@ begin
     begin
       if (size in [OS_S8,OS_S16]) then
         begin
-          { Shift left by 16/24 bits and increase amount of right shift by same value }
+          { Sign-extend before shiting }
           list.concat(taicpu.op_reg_reg_const(A_SLL, dst, src2, 32-(tcgsize2size[size]*8)));
-          hreg:=GetIntRegister(list,OS_INT);
-          a_op_const_reg_reg(list,OP_ADD,OS_INT,32-(tcgsize2size[size]*8),src1,dst);
-          src1:=hreg;
+          list.concat(taicpu.op_reg_reg_const(A_SRA, dst, dst, 32-(tcgsize2size[size]*8)));
+          src2:=dst;
         end
       else if not (size in [OS_32,OS_S32]) then
         InternalError(2013070306);
@@ -826,7 +823,7 @@ var
   asmop: TAsmOp;
 begin
   ovloc.loc := LOC_VOID;
-  optimize_op_const(op,a);
+  optimize_op_const(size,op,a);
   signed:=(size in [OS_S8,OS_S16,OS_S32]);
   if (setflags and (not signed) and (src=dst) and (op in [OP_ADD,OP_SUB])) then
     hreg:=GetIntRegister(list,OS_INT)
@@ -1198,6 +1195,7 @@ var
   helplist : TAsmList;
   largeoffs : boolean;
 begin
+  list.concat(tai_directive.create(asd_ent,current_procinfo.procdef.mangledname));
   a_reg_alloc(list,NR_STACK_POINTER_REG);
 
   if nostackframe then
@@ -1399,6 +1397,7 @@ begin
        list.concat(Taicpu.op_none(A_P_SET_MACRO));
        list.concat(Taicpu.op_none(A_P_SET_REORDER));
     end;
+  list.concat(tai_directive.create(asd_ent_end,current_procinfo.procdef.mangledname));
 end;
 
 
@@ -1565,8 +1564,8 @@ begin
     { generate a loop }
     if len > 4 then
     begin
-      countreg := cg.GetIntRegister(list, OS_INT);
-      tmpreg1  := cg.GetIntRegister(list, OS_INT);
+      countreg := GetIntRegister(list, OS_INT);
+      tmpreg1  := GetIntRegister(list, OS_INT);
       a_load_const_reg(list, OS_INT, len, countreg);
       current_asmdata.getjumplabel(lab);
       a_label(list, lab);
@@ -1580,7 +1579,7 @@ begin
     else
     begin
       { unrolled loop }
-      tmpreg1 := cg.GetIntRegister(list, OS_INT);
+      tmpreg1 := GetIntRegister(list, OS_INT);
       for i := 1 to len do
       begin
         list.concat(taicpu.op_reg_ref(A_LBU, tmpreg1, src));
@@ -1761,7 +1760,6 @@ var
   tmpref: treference;
   tmpreg: tregister;
 begin
-  { Override this function to prevent loading the reference twice }
   if target_info.endian = endian_big then
     begin
       tmpreg := reg.reglo;
@@ -1769,9 +1767,10 @@ begin
       reg.reghi := tmpreg;
     end;
   tmpref := ref;
-  cg.a_load_reg_ref(list, OS_S32, OS_S32, reg.reglo, tmpref);
+  tcgmips(cg).make_simple_ref(list,tmpref);
+  list.concat(taicpu.op_reg_ref(A_SW,reg.reglo,tmpref));
   Inc(tmpref.offset, 4);
-  cg.a_load_reg_ref(list, OS_S32, OS_S32, reg.reghi, tmpref);
+  list.concat(taicpu.op_reg_ref(A_SW,reg.reghi,tmpref));
 end;
 
 
@@ -1780,7 +1779,6 @@ var
   tmpref: treference;
   tmpreg: tregister;
 begin
-  { Override this function to prevent loading the reference twice }
   if target_info.endian = endian_big then
     begin
       tmpreg := reg.reglo;
@@ -1788,9 +1786,10 @@ begin
       reg.reghi := tmpreg;
     end;
   tmpref := ref;
-  cg.a_load_ref_reg(list, OS_S32, OS_S32, tmpref, reg.reglo);
+  tcgmips(cg).make_simple_ref(list,tmpref);
+  list.concat(taicpu.op_reg_ref(A_LW,reg.reglo,tmpref));
   Inc(tmpref.offset, 4);
-  cg.a_load_ref_reg(list, OS_S32, OS_S32, tmpref, reg.reghi);
+  list.concat(taicpu.op_reg_ref(A_LW,reg.reghi,tmpref));
 end;
 
 

+ 4 - 4
compiler/mips/cpugas.pas

@@ -42,7 +42,7 @@ unit cpugas;
       end;
 
     const
-      use_std_regnames : boolean = 
+      use_std_regnames : boolean =
       {$ifndef USE_MIPS_GAS_REGS}
       true;
       {$else}
@@ -379,8 +379,8 @@ unit cpugas;
         id: as_gas;
         idtxt: 'AS';
         asmbin: 'as';
-        asmcmd: '$ABI $ARCH $NOWARN -EL $PIC -o $OBJ $ASM';
-        supported_targets: [system_mipsel_linux];
+        asmcmd: '$ABI $ARCH $NOWARN -EL $PIC -o $OBJ $EXTRAOPT $ASM';
+        supported_targets: [system_mipsel_linux,system_mipsel_android];
         flags: [ af_needar, af_smartlink_sections];
         labelprefix: '.L';
         comment: '# ';
@@ -391,7 +391,7 @@ unit cpugas;
         id: as_gas;
         idtxt: 'AS';
         asmbin: 'as';
-        asmcmd: '$ABI $ARCH $NOWARN -EB $PIC -o $OBJ $ASM';
+        asmcmd: '$ABI $ARCH $NOWARN -EB $PIC -o $OBJ $EXTRAOPT $ASM';
         supported_targets: [system_mipseb_linux];
         flags: [ af_needar, af_smartlink_sections];
         labelprefix: '.L';

+ 2 - 1
compiler/mips/cpuinfo.pas

@@ -1,7 +1,7 @@
 {
     Copyright (c) 1998-2002 by the Free Pascal development team
 
-    Basic Processor information for the ARM
+    Basic Processor information for the MIPS
 
     See the file COPYING.FPC, included in this distribution,
     for details about the copyright.
@@ -63,6 +63,7 @@ Const
    supported_calling_conventions : tproccalloptions = [
      pocall_internproc,
      pocall_stdcall,
+     pocall_safecall,
      { same as stdcall only different name mangling }
      pocall_cdecl,
      { same as stdcall only different name mangling }

برخی فایل ها در این مقایسه diff نمایش داده نمی شوند زیرا تعداد فایل ها بسیار زیاد است