2 years ago · 35345fe145
--- a/rtl/i386/i386.inc
+++ b/rtl/i386/i386.inc
@@ -593,14 +593,19 @@ end;
 
				 procedure FillQWord_SSE2(var x;count:SizeInt;value:QWord);assembler;nostackframe;
			
 
				 { eax = x, edx = count, [esp + 4] = value }
			
 
				 asm
			
 
				-        cmp     $1, %edx
			
 
				-        jle     .LOneOrLess
			
 
				         cmp     $4, %edx
			
 
				-        jle     .L2to4
			
 
				+        jle     .L4OrLess
			
 
				         movq    4(%esp), %xmm0
			
 
				         punpcklqdq %xmm0, %xmm0
			
 
				+        { On entry the stack holds 12 bytes:
			
 
				+          [esp] = return address, [esp + 4] = value (8 bytes; already loaded into xmm0, no longer needed).
			
 
				+          Shrink it to the 8-byte layout expected by FillXxxx_MoreThanTwoXMMs:
			
 
				+          [esp] = saved esi, [esp + 4] = return address. }
			
 
				+        mov     (%esp), %ecx
			
 
				+        add     $4, %esp
			
 
				+        mov     %esi, (%esp)
			
 
				+        mov     %ecx, 4(%esp)
			
 
				         shl     $3, %edx
			
 
				-        push    %esi
			
 
				         movdqu  %xmm0, (%eax)
			
 
				         movdqa  %xmm0, %xmm1
			
 
				         test    $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
			
@@ -618,17 +623,12 @@ asm
 
				         por     %xmm2, %xmm1
			
 
				         jmp     FillXxxx_MoreThanTwoXMMs
			
 
				 
			
 
				-.LOneOrLess:
			
 
				+.L4OrLess: { count <= 4. Doing this with 64-bit half-XMM MOVQs would be a lot simpler but is 2x slower (Coffee Lake). }
			
 
				+        cmp     $1, %edx
			
 
				         jl      .LQuit
			
 
				         mov     4(%esp), %ecx
			
 
				         mov     %ecx, (%eax)
			
 
				-        mov     8(%esp), %ecx
			
 
				-        mov     %ecx, 4(%eax)
			
 
				-.LQuit:
			
 
				-        ret     $8
			
 
				-.L2to4:
			
 
				-        mov     4(%esp), %ecx
			
 
				-        mov     %ecx, (%eax)
			
 
				+        je      .LSecondHalfOf1
			
 
				         mov     %ecx, 8(%eax)
			
 
				         mov     %ecx, -16(%eax,%edx,8)
			
 
				         mov     %ecx, -8(%eax,%edx,8)
			
@@ -637,6 +637,11 @@ asm
 
				         mov     %ecx, 12(%eax)
			
 
				         mov     %ecx, -12(%eax,%edx,8)
			
 
				         mov     %ecx, -4(%eax,%edx,8)
			
 
				+.LQuit:
			
 
				+        ret     $8
			
 
				+.LSecondHalfOf1:
			
 
				+        mov     8(%esp), %ecx
			
 
				+        mov     %ecx, 4(%eax)
			
 
				 end;
			
 
				 
			
 
				 procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword); forward;