{
    This file is part of the Free Pascal run time library.
    Copyright (c) 1999-2000 by the Free Pascal development team.

    Processor dependent implementation for the system unit for
    intel i386+

    See the file COPYING.FPC, included in this distribution,
    for details about the copyright.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 **********************************************************************}

{$if not(defined(VER3_0)) and defined(linux)}
  {$define FPC_SYSTEM_STACKALIGNMENT16}
{$endif not(defined(VER3_0)) and defined(linux)}

{****************************************************************************
                               Primitives
****************************************************************************}
var
  os_supports_sse : boolean;
  { this variable is set to true, if currently an sse check is executed and no sig ill should be generated }
  sse_check : boolean;
  fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
  fpc_cpucodeinit_performed : boolean; { Code before fpc_cpucodeinit can call certain dispatched functions, such as Move. }

{$asmmode ATT}

function cpuid_support : boolean;assembler;nostackframe;
  {
    Check if the ID-flag can be changed, if changed then CpuID is supported.
    Tested under go32v1 and Linux on c6x86 with CpuID enabled and disabled (PFV)

    The result is produced in al: 1 when bit 0x200000 of EFLAGS reads back
    changed after an attempt to toggle it, 0 otherwise. The EFLAGS value saved
    at entry is popped back before returning.
  }
asm
    pushfl                      { keep the entry EFLAGS on the stack }
    movl (%esp),%eax
    xorl $0x200000,%eax         { flip bit 0x200000 in a copy }
    pushl %eax
    popfl                       { try to load the modified copy into EFLAGS }
    pushfl
    popl %eax                   { eax = EFLAGS as actually stored }
    xorl (%esp),%eax            { eax = bits that differ from the entry value }
    popfl                       { restore the entry EFLAGS }
    testl $0x200000,%eax
    setnz %al                   { al = 1 if the bit could be changed }
end;

{$define FPC_SYSTEM_HAS_FPC_CPUINIT}
{ Deliberately empty; the comment in the body explains that the detection is
  left to fpc_cpucodeinit. }
procedure fpc_cpuinit;
  begin
    { because of the brain dead sse detection on x86, this test is post poned to fpc_cpucodeinit which
      must be implemented OS dependend (FK)
    has_sse_support:=sse_support;
    has_mmx_support:=mmx_support;
    }
  end;

{$ifndef darwin}
{ Loads the dword at the top of the stack into ebx. The routine has no stack
  frame, so on entry that dword is the return address pushed by the call. }
procedure fpc_geteipasebx; [public, alias: 'fpc_geteipasebx'];assembler; nostackframe;
asm
    movl (%esp),%ebx
end;

{ Same as fpc_geteipasebx, but the value is placed in ecx. }
procedure fpc_geteipasecx; [public, alias: 'fpc_geteipasecx'];assembler; nostackframe;
asm
    movl (%esp),%ecx
end;
{$endif}

{$if not defined(FPC_SYSTEM_HAS_MOVE) and not defined(OLD_ASSEMBLER) and not defined(darwin)}
{$i fastmove.inc}
{$endif}

{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}

{ Fallback Move, compiled only when FPC_SYSTEM_HAS_MOVE is still undefined at
  this point. The visible prologue saves edi/esi in locals, moves eax to esi,
  edx to edi and ecx to edx, leaves early for count <= 0 or when both pointers
  are equal, and otherwise picks a copy direction from the carry of
  (edi - esi). }
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;
var
  saveesi,saveedi : longint;
asm
    movl %edi,saveedi
    movl %esi,saveesi
    movl %eax,%esi
    movl %edx,%edi
    movl %ecx,%edx
    movl %edi,%eax
    { check for zero or negative count }
    cmpl $0,%edx
    jle .LMoveEnd
    { Check for back or forward }
    sub %esi,%eax
    jz .LMoveEnd { Do nothing when source=dest }
    { NOTE(review): the comment that starts on the next line appears to be damaged in this copy.
      Everything between "dest" and "= 8" seems to be missing - presumably the rest of Move and
      the header of the rep-stos fill helper whose asm body follows. Restore it from the
      upstream file before building; the text below is left exactly as found. }
    jc .LFMove { Do forward, dest= 8 (preferably >= FillXxxx_RepStosThreshold_(No)ERMS, depending on fast_large_repmovstosb). }
asm
{$ifdef FPC_ENABLED_CLD}
    cld
{$endif FPC_ENABLED_CLD}
    mov %ecx, (%eax) { Write first 4 bytes unaligned. }
    push %ecx { pattern }
    push %edi
    mov %eax, %edi { Move x to edi, as expected by ‘rep stosl’. }
    xchg %eax, %ecx { now eax = pattern (as expected by ‘rep stosl’) and ecx = x (to rotate the pattern by its misalignment) }
    shl $3, %ecx { ecx = misalignment of x in bits. }
    rol %cl, %eax { misalign the pattern; no-op for FillChar, but handles misaligned cases of FillWord+. }
    add %edi, %edx { edx = x end }
    lea -1(%edx), %ecx { ecx = x end - 1.
}
    add $4, %edi
    and $-4, %edi { edi = 4-byte aligned pointer strictly to the right of the start. }
    and $-4, %ecx { ecx = 4-byte aligned pointer strictly to the left of the end. }
    sub %edi, %ecx { ecx = byte count between them. }
    shr $2, %ecx { ecx = uint32 count, as expected by ‘rep stosl’. }
    rep stosl
    pop %edi
    pop %ecx { ecx = the pattern as it was pushed, i.e. before it was rotated }
    mov %ecx, -4(%edx) { Write last 4 bytes unaligned. }
end;
{$endif FillChar/Word/DWord required.}

label
  FillXxxx_MoreThanTwoXMMs;

{ Fills edx bytes at eax with the 32-bit pattern in ecx using 16-byte SSE2 stores.
  The first and the last 16 bytes are always written with unaligned stores of the
  unrotated pattern. When more than 32 bytes are requested, the interior is
  written with aligned stores of the pattern rotated by the misalignment of x:
  up to two head vectors, a 64-bytes-per-iteration loop, and up to four tail
  vectors. From NtThreshold bytes upwards the loop uses movntdq followed by
  sfence instead of movdqa.
  FillXxxx_MoreThanTwoXMMs is a global label inside this routine; other fill
  routines jump to it with xmm0/xmm1 already prepared. }
procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
const
  NtThreshold = 4 * 1024 * 1024;
asm
    movd %ecx, %xmm0
    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%eax)
    movdqu %xmm0, -16(%eax,%edx)
    cmp $32, %edx
    ja .LMoreThanTwoVectors
    ret { 16..32 bytes are fully covered by the two stores above }
    .byte 102,102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }

    { x can start and end misaligned on the vector boundary:
      x = ~~][H1][H2][...][T2][T1]~
          [UH]                  [UT]
      UH/UT stands for “unaligned head/tail”, both have 1~16 bytes. }

.LMoreThanTwoVectors:
    push %esi
    mov %ecx, %esi { esi = pattern }
    mov %eax, %ecx
    shl $3, %ecx { ecx = misalignment of x in bits }
    rol %cl, %esi { misalign the pattern }
    movd %esi, %xmm1
    pshufd $0, %xmm1, %xmm1
    pop %esi

    { FillChar (to skip the misaligning above) and FillQWord jump here.
      eax — x, edx — byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes, first and last 16 bytes written. }
FillXxxx_MoreThanTwoXMMs:
    lea -65(%eax,%edx), %ecx
    and $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
    and $-16, %eax { align eax to the LEFT (so needs to be offset by an additional +16 for a while). }
    movdqa %xmm1, 16(%eax) { Write H1. }
    cmp $49, %edx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
    jle .LOneAlignedTailWrite
    movdqa %xmm1, 32(%eax) { Write H2. }
    cmp $81, %edx { 50~81 bytes might contain 2~4 heads+tails; write as H1–2 and T2–1.
}
    jle .LTwoAlignedTailWrites
    cmp $113, %edx { 82~113 bytes might contain 4~6 heads+tails; write as H1–2 and T4–1. }
    jle .LFourAlignedTailWrites

    add $48, %eax { eax = first address handled by the 64-byte loop (H1 and H2 are already written) }
    cmp $NtThreshold, %edx
    jae .L64xNT_Body

    .balign 16 { no-op }
.L64x_Body:
    movdqa %xmm1, (%eax)
    movdqa %xmm1, 16(%eax)
    movdqa %xmm1, 32(%eax)
    movdqa %xmm1, 48(%eax)
    add $64, %eax
    cmp %ecx, %eax
    jb .L64x_Body
    { The labels below fall through into each other, so entering at an earlier
      label writes all the later tail vectors as well. }
.LFourAlignedTailWrites:
    movdqa %xmm1, (%ecx) { T4 }
    movdqa %xmm1, 16(%ecx) { T3 }
.LTwoAlignedTailWrites:
    movdqa %xmm1, 32(%ecx) { T2 }
.LOneAlignedTailWrite:
    movdqa %xmm1, 48(%ecx) { T1 }
    ret

    .balign 16
.L64xNT_Body:
    { Same loop as .L64x_Body, but with movntdq stores. }
    movntdq %xmm1, (%eax)
    movntdq %xmm1, 16(%eax)
    movntdq %xmm1, 32(%eax)
    movntdq %xmm1, 48(%eax)
    add $64, %eax
    cmp %ecx, %eax
    jb .L64xNT_Body
    sfence
    jmp .LFourAlignedTailWrites
end;

{$if not defined(FPC_SYSTEM_HAS_FILLCHAR) or not defined(FPC_SYSTEM_HAS_FILLWORD) or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
{ Fills edx bytes at eax with the 32-bit pattern in ecx using only 32-bit
  integer stores. The first and the last dword are written at their exact
  (possibly unaligned) addresses; the interior is written to 4-byte aligned
  addresses, 8 bytes per loop iteration, with the pattern rotated by the
  misalignment of x, followed by two more dword stores at the loop bound. }
procedure FillXxxx_U32Pattern_Plain_16OrMore; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count >= 12 (preferably >= 16). }
asm
    mov %ecx, (%eax) { Write first 4 bytes. }
    lea -9(%eax,%edx), %edx
    mov %ecx, 5(%edx) { Write last 4 bytes. }
    and $-4, %edx { edx = loop bound. }

    push %esi
    mov %ecx, %esi { esi = pattern }
    mov %eax, %ecx
    shl $3, %ecx { ecx = misalignment of x in bits }
    rol %cl, %esi { misalign the pattern }
    add $4, %eax
    and $-4, %eax { eax = first 4-byte aligned address after the start }
    .balign 16
.L8xLoop:
    mov %esi, (%eax)
    mov %esi, 4(%eax)
    add $8, %eax
    cmp %edx, %eax
    jb .L8xLoop
    mov %esi, (%edx)
    mov %esi, 4(%edx)
    pop %esi
end;

{ Fills 4 to 16 bytes without a loop, using overlapping dword stores: the first
  and the last dword are always written; for more than 8 bytes the second and
  the second-to-last dword are written too. The pattern is stored as is. }
procedure FillXxxx_U32Pattern_Ladder_4to16; assembler; nostackframe;
{ eax — x, ecx — uint32 pattern, edx — byte count, 4 <= edx <= 16.
}
asm
    mov %ecx, (%eax)
    cmp $8, %edx
    jle .LLast4
    mov %ecx, 4(%eax)
    mov %ecx, -8(%eax,%edx)
.LLast4:
    mov %ecx, -4(%eax,%edx)
end;
{$endif FillChar/Word/DWord required.}
{$endif FillChar/Word/DWord/QWord required.}

{$ifndef FPC_SYSTEM_HAS_FILLCHAR}
{$define FPC_SYSTEM_HAS_FILLCHAR}
{ Byte fill for very small counts. Does nothing when edx <= 0; otherwise stores
  cl to the first byte, to the last byte and to the byte at index (edx shr 1),
  which together cover every count from 1 to 3. }
procedure FillChar_3OrLess; assembler; nostackframe;
{ eax — x, cl — value, edx — byte count, Low(int32) <= edx <= 3. }
asm
    test %edx, %edx
    jle .LQuit
    mov %cl, (%eax)
    mov %cl, -1(%eax,%edx)
    shr $1, %edx
    mov %cl, (%eax,%edx)
.LQuit:
end;

{ FillChar without SSE2. In the code below eax is the destination, edx the
  count and cl the value. count <= 3 goes to FillChar_3OrLess; otherwise the
  byte is replicated into all four bytes of ecx and control is transferred to
  the 4..16-byte ladder or to the plain 32-bit fill. All transfers are jumps,
  so the target returns directly to the caller. }
procedure FillChar_Plain(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
    cmp $3, %edx
    jle FillChar_3OrLess
    movzbl %cl, %ecx
    imul $0x01010101, %ecx { replicate the byte into all four bytes of ecx }
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    jmp FillXxxx_U32Pattern_Plain_16OrMore
end;

{ FillChar using SSE2. Same small-count handling as FillChar_Plain. Counts of at
  least FillXxxx_RepStosThreshold_NoERMS (declared outside this excerpt) go to
  the rep stos routine. Otherwise the first and last 16 bytes are written
  unaligned; for more than 32 bytes xmm1 is set to the same pattern (a
  replicated byte needs no rotation) and control jumps to
  FillXxxx_MoreThanTwoXMMs. For 17..32 bytes the routine simply ends after the
  two stores. }
procedure FillChar_SSE2(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
    cmp $3, %edx
    jle FillChar_3OrLess
    movzbl %cl, %ecx
    imul $0x01010101, %ecx
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
    jae FillXxxx_U32Pattern_RepStos_8OrMore

    movd %ecx, %xmm0
    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%eax)
    movdqu %xmm0, -16(%eax,%edx)
    movdqa %xmm0, %xmm1 { xmm1 = pattern for aligned writes, identical here }
    cmp $32, %edx
    ja FillXxxx_MoreThanTwoXMMs
end;

{ Identical to FillChar_SSE2 except that the switch to the rep stos routine
  happens at FillXxxx_RepStosThreshold_ERMS (declared outside this excerpt). }
procedure FillChar_SSE2_ERMS(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
    cmp $3, %edx
    jle FillChar_3OrLess
    movzbl %cl, %ecx
    imul $0x01010101, %ecx
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    cmp $FillXxxx_RepStosThreshold_ERMS, %edx
    jae FillXxxx_U32Pattern_RepStos_8OrMore

    movd %ecx, %xmm0
    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%eax)
    movdqu %xmm0, -16(%eax,%edx)
    movdqa %xmm0, %xmm1
    cmp $32, %edx
    ja FillXxxx_MoreThanTwoXMMs
end;

procedure FillChar_Dispatch(var x;count:SizeInt;value:byte); forward;

var
  { Current FillChar implementation; starts out pointing at the dispatcher. }
  FillChar_Impl: procedure(var x;count:SizeInt;value:byte) = @FillChar_Dispatch;

{ Initial target of FillChar_Impl. While fpc_cpucodeinit_performed is false it
  forwards the call to FillChar_Plain and leaves FillChar_Impl untouched.
  Afterwards it rebinds FillChar_Impl to the ERMS, SSE2 or plain variant
  (checked in that order via fast_large_repmovstosb and has_sse2_support) and
  forwards the current call through the new binding. }
procedure FillChar_Dispatch(var x;count:SizeInt;value:byte);
begin
  if not fpc_cpucodeinit_performed
then
    begin
      { CPU features are not known yet: use the plain variant and do not rebind. }
      FillChar_Plain(x, count, value);
      exit;
    end;
  if fast_large_repmovstosb then
    FillChar_Impl := @FillChar_SSE2_ERMS
  else if has_sse2_support then
    FillChar_Impl := @FillChar_SSE2
  else
    FillChar_Impl := @FillChar_Plain;
  FillChar_Impl(x, count, value);
end;

{ Public entry point: calls whatever FillChar_Impl currently points to. }
procedure FillChar(var x;count:SizeInt;value:byte);
begin
  FillChar_Impl(x, count, value);
end;
{$endif FPC_SYSTEM_HAS_FILLCHAR}

{$ifndef FPC_SYSTEM_HAS_FILLWORD}
{$define FPC_SYSTEM_HAS_FILLWORD}
{ Word fill for very small counts; eax is the destination, cx the value and edx
  the count in words. Does nothing when edx <= 0; otherwise stores cx to the
  first word, the last word and the word at index (edx shr 1), which covers
  every count from 1 to 3. }
procedure FillWord_3OrLess; assembler; nostackframe;
asm
    test %edx, %edx
    jle .LQuit
    mov %cx, (%eax)
    mov %cx, -2(%eax,%edx,2)
    shr $1, %edx
    mov %cx, (%eax,%edx,2)
.LQuit:
end;

{ FillWord without SSE2. count <= 3 goes to FillWord_3OrLess; otherwise the
  count is converted to bytes, the word is replicated into both halves of ecx,
  and control jumps to the 4..16-byte ladder or to the plain 32-bit fill. }
procedure FillWord_Plain(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
    cmp $3, %edx
    jle FillWord_3OrLess
    shl $1, %edx { words -> bytes }
    movzwl %cx, %ecx
    imul $0x00010001, %ecx { replicate the word into both halves of ecx }
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    jmp FillXxxx_U32Pattern_Plain_16OrMore
end;

{ FillWord using SSE2: as FillWord_Plain up to 16 bytes, then the SSE2 fill
  below FillXxxx_RepStosThreshold_NoERMS bytes and the rep stos routine from
  that size upwards. }
procedure FillWord_SSE2(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
    cmp $3, %edx
    jle FillWord_3OrLess
    shl $1, %edx
    movzwl %cx, %ecx
    imul $0x00010001, %ecx
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
    jb FillXxxx_U32Pattern_SSE2_16OrMore
    jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;

{ Identical to FillWord_SSE2 except that the size limit for the SSE2 fill is
  FillXxxx_RepStosThreshold_ERMS. }
procedure FillWord_SSE2_ERMS(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
    cmp $3, %edx
    jle FillWord_3OrLess
    shl $1, %edx
    movzwl %cx, %ecx
    imul $0x00010001, %ecx
    cmp $16, %edx
    jbe FillXxxx_U32Pattern_Ladder_4to16
    cmp $FillXxxx_RepStosThreshold_ERMS, %edx
    jb FillXxxx_U32Pattern_SSE2_16OrMore
    jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;

procedure FillWord_Dispatch(var x;count:SizeInt;value:word); forward;

var
  { Current FillWord implementation; starts out pointing at the dispatcher. }
  FillWord_Impl: procedure(var x;count:SizeInt;value:word) = @FillWord_Dispatch;

{ Initial target of FillWord_Impl. While fpc_cpucodeinit_performed is false it
  forwards to FillWord_Plain without rebinding; afterwards it rebinds
  FillWord_Impl to the ERMS, SSE2 or plain variant and forwards the call. }
procedure FillWord_Dispatch(var x;count:SizeInt;value:word);
begin
  if not fpc_cpucodeinit_performed then
    begin
      FillWord_Plain(x, count, value);
      exit;
    end;
  if fast_large_repmovstosb then
    FillWord_Impl := @FillWord_SSE2_ERMS
  else if has_sse2_support
then
    FillWord_Impl := @FillWord_SSE2
  else
    FillWord_Impl := @FillWord_Plain;
  FillWord_Impl(x, count, value);
end;

{ Public entry point: calls whatever FillWord_Impl currently points to. }
procedure FillWord(var x;count:SizeInt;value:word);
begin
  FillWord_Impl(x, count, value);
end;
{$endif FPC_SYSTEM_HAS_FILLWORD}

{$ifndef FPC_SYSTEM_HAS_FILLDWORD}
{$define FPC_SYSTEM_HAS_FILLDWORD}
{ Dword fill for very small counts; eax is the destination, ecx the value and
  edx the count in dwords. Nothing is written for edx < 1, one dword for
  edx = 1, and for larger counts the first two and the last two dwords are
  written, which overlap as needed to cover 2 to 4 elements. }
procedure FillDWord_4OrLess; assembler; nostackframe;
asm
    cmp $1, %edx
    jl .LQuit
    mov %ecx, (%eax)
    je .LQuit { flags are still those of the cmp above: count = 1 is done }
    mov %ecx, 4(%eax)
    mov %ecx, -8(%eax,%edx,4)
    mov %ecx, -4(%eax,%edx,4)
.LQuit:
end;

{ FillDWord without SSE2: count <= 4 goes to FillDWord_4OrLess, anything larger
  is converted to a byte count and handed to the plain 32-bit fill. }
procedure FillDWord_Plain(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
    cmp $4, %edx
    jle FillDWord_4OrLess
    shl $2, %edx { dwords -> bytes }
    jmp FillXxxx_U32Pattern_Plain_16OrMore
end;

{ FillDWord using SSE2: count <= 4 goes to FillDWord_4OrLess; larger fills use
  the SSE2 routine below FillXxxx_RepStosThreshold_NoERMS bytes and the rep stos
  routine from that size upwards. }
procedure FillDWord_SSE2(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
    cmp $4, %edx
    jle FillDWord_4OrLess
    shl $2, %edx
    cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
    jb FillXxxx_U32Pattern_SSE2_16OrMore
    jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;

{ Identical to FillDWord_SSE2 except that the size limit for the SSE2 fill is
  FillXxxx_RepStosThreshold_ERMS. }
procedure FillDWord_SSE2_ERMS(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
    cmp $4, %edx
    jle FillDWord_4OrLess
    shl $2, %edx
    cmp $FillXxxx_RepStosThreshold_ERMS, %edx
    jb FillXxxx_U32Pattern_SSE2_16OrMore
    jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;

procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword); forward;

var
  { Current FillDWord implementation; starts out pointing at the dispatcher. }
  FillDWord_Impl: procedure(var x;count:SizeInt;value:dword) = @FillDWord_Dispatch;

{ Initial target of FillDWord_Impl. While fpc_cpucodeinit_performed is false it
  forwards to FillDWord_Plain without rebinding; afterwards it rebinds
  FillDWord_Impl to the ERMS, SSE2 or plain variant and forwards the call. }
procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword);
begin
  if not fpc_cpucodeinit_performed then
    begin
      FillDWord_Plain(x, count, value);
      exit;
    end;
  if fast_large_repmovstosb then
    FillDWord_Impl := @FillDWord_SSE2_ERMS
  else if has_sse2_support then
    FillDWord_Impl := @FillDWord_SSE2
  else
    FillDWord_Impl := @FillDWord_Plain;
  FillDWord_Impl(x, count, value);
end;

{ Public entry point: calls whatever FillDWord_Impl currently points to. }
procedure FillDWord(var x;count:SizeInt;value:dword);
begin
  FillDWord_Impl(x, count, value);
end;
{$endif FPC_SYSTEM_HAS_FILLDWORD}

{$ifndef FPC_SYSTEM_HAS_FILLQWORD}
{$define FPC_SYSTEM_HAS_FILLQWORD}
{ FillQWord without SSE2: a simple loop that stores the low and the high dword
  of value once per element. Does nothing for count <= 0. }
procedure FillQWord_Plain(var
x;count:SizeInt;value:QWord);assembler;nostackframe;
{ eax = x, edx = count, [esp + 4] = value }
asm
    test %edx, %edx { Don't care about clever things like separate small branches or aligning writes by misaligning the pattern, the function is dead. }
    jle .LQuit
    push %esi
    { The stack offsets below are written as 4+N to account for the push above. }
    mov 4+4(%esp), %esi { esi = value[0:31] }
    mov 4+8(%esp), %ecx { ecx = value[32:63] }
    .balign 16
.LLoop:
    mov %esi, (%eax)
    mov %ecx, 4(%eax)
    add $8, %eax
    sub $1, %edx
    jnz .LLoop
    pop %esi
.LQuit:
end;

{ FillQWord using SSE2. Counts up to 4 are handled at .L4OrLess with 32-bit
  stores. For larger counts the value is duplicated into both halves of xmm0,
  the 8-byte value argument is removed from the stack so that only the return
  address remains, the first and last 16 bytes are written unaligned, and
  control jumps to FillXxxx_MoreThanTwoXMMs with xmm1 holding the pattern for
  aligned stores: equal to xmm0 when x is 8-byte aligned, otherwise each 64-bit
  lane rotated left by the misalignment of x in bits. }
procedure FillQWord_SSE2(var x;count:SizeInt;value:QWord);assembler;nostackframe;
{ eax = x, edx = count, [esp + 4] = value }
asm
    cmp $4, %edx
    jle .L4OrLess
    movq 4(%esp), %xmm0
    punpcklqdq %xmm0, %xmm0
    { Stack is 12 bytes:
      [esp] = return address, [esp + 4] = value (not required anymore).
      Convert to 4 bytes expected by FillXxxx_MoreThanTwoXMMs:
      [esp] = return address. }
    mov (%esp), %ecx
    add $8, %esp
    mov %ecx, (%esp)
    shl $3, %edx { qwords -> bytes }
    movdqu %xmm0, (%eax)
    movdqu %xmm0, -16(%eax,%edx)
    movdqa %xmm0, %xmm1
    test $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
    jz FillXxxx_MoreThanTwoXMMs
    mov %eax, %ecx { Misalign uint64s in xmm0 by the misalignment of x into xmm1. }
    shl $3, %ecx
    and $63, %ecx { ecx = misalignment of x within 8 bytes, in bits }
    movd %ecx, %xmm3
    psllq %xmm3, %xmm1
    neg %ecx { Can also do not ecx; and $63, %ecx; inc ecx to support ecx = 0, as SSE shifts correctly handle count >= bitsizeof. }
    and $63, %ecx { But in the definitely unaligned case, ecx = 0 is impossible. }
    movd %ecx, %xmm3
    movdqa %xmm0, %xmm2
    psrlq %xmm3, %xmm2
    por %xmm2, %xmm1 { xmm1 = left-shifted part or right-shifted part, i.e. the rotated pattern }
    jmp FillXxxx_MoreThanTwoXMMs

.L4OrLess: { Doing this with 64-bit half-XMM MOVQs is a lot simpler but 2x slower (Coffee Lake).
:\ }
    cmp $1, %edx
    jl .LQuit
    mov 4(%esp), %ecx { ecx = low dword of value }
    mov %ecx, (%eax)
    je .LSecondHalfOf1 { flags are still those of the cmp above: count = 1 }
    { count is 2..4: write the low dwords of the first two and the last two
      elements, then the high dwords of the same four slots. }
    mov %ecx, 8(%eax)
    mov %ecx, -16(%eax,%edx,8)
    mov %ecx, -8(%eax,%edx,8)
    mov 8(%esp), %ecx { ecx = high dword of value }
    mov %ecx, 4(%eax)
    mov %ecx, 12(%eax)
    mov %ecx, -12(%eax,%edx,8)
    mov %ecx, -4(%eax,%edx,8)
.LQuit:
    ret $8
.LSecondHalfOf1:
    mov 8(%esp), %ecx
    mov %ecx, 4(%eax)
end;

procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword); forward;

var
  { Current FillQWord implementation; starts out pointing at the dispatcher. }
  FillQWord_Impl: procedure(var x;count:SizeInt;value:qword) = @FillQWord_Dispatch;

{ Initial target of FillQWord_Impl. While fpc_cpucodeinit_performed is false it
  forwards to FillQWord_Plain without rebinding; afterwards it rebinds
  FillQWord_Impl to the SSE2 or the plain variant depending on
  has_sse2_support and forwards the call. }
procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword);
begin
  if not fpc_cpucodeinit_performed then
    begin
      FillQWord_Plain(x, count, value);
      exit;
    end;
  if has_sse2_support then
    FillQWord_Impl := @FillQWord_SSE2
  else
    FillQWord_Impl := @FillQWord_Plain;
  FillQWord_Impl(x, count, value);
end;

{ Public entry point: calls whatever FillQWord_Impl currently points to. }
procedure FillQWord(var x;count:SizeInt;value:qword);
begin
  FillQWord_Impl(x, count, value);
end;
{$endif FPC_SYSTEM_HAS_FILLQWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
{$define FPC_SYSTEM_HAS_INDEXBYTE}
{ IndexByte without SSE2. In the code below eax is buf, edx is len and cl is b.
  Returns in eax the offset from buf of the first byte equal to b, or -1 when
  none is found. Buffers shorter than 4 bytes are scanned bytewise; longer ones
  are first advanced bytewise to a 4-byte boundary and then scanned a dword at a
  time (16 bytes per iteration of the main loop) with a bit trick that flags
  zero bytes in (dword xor pattern). }
function IndexByte_Plain(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
asm
    push %esi
    push %edi
    push %eax { save initial value of 'buf' }

    cmp $4,%edx { less than 4 bytes, just test byte by byte.
}
    jb .Ltail

    mov %cl,%ch { prepare pattern }
    movzwl %cx,%esi
    shl $16,%ecx
    or %esi,%ecx { ecx = b replicated into all four bytes }

.Lalignloop:
    test $3,%al { align to 4 bytes if necessary }
    je .Laligned
    cmp %cl,(%eax)
    je .Lexit
    inc %eax
    dec %edx
    jmp .Lalignloop

    .balign 16 { Main loop, unrolled 4 times for speed }

.Lloop:
    mov (%eax),%esi { load dword }
    xor %ecx,%esi { XOR with pattern, bytes equal to target are now 0 }
    lea -0x01010101(%esi),%edi
    xor %esi,%edi { (x-0x01010101) xor x }
    not %esi
    and $0x80808080,%esi
    and %edi,%esi { ((x-0x01010101) xor x) and (not x) and 0x80808080 }
    jnz .Lfound { one of the bytes matches }

    { The next three groups repeat the same test for the dwords at offsets 4, 8
      and 12; their exit labels add the missing offset to eax. }
    mov 4(%eax),%esi
    xor %ecx,%esi
    lea -0x01010101(%esi),%edi
    xor %esi,%edi
    not %esi
    and $0x80808080,%esi
    and %edi,%esi
    jnz .Lfound4

    mov 8(%eax),%esi
    xor %ecx,%esi
    lea -0x01010101(%esi),%edi
    xor %esi,%edi
    not %esi
    and $0x80808080,%esi
    and %edi,%esi
    jnz .Lfound8

    mov 12(%eax),%esi
    xor %ecx,%esi
    lea -0x01010101(%esi),%edi
    xor %esi,%edi
    not %esi
    and $0x80808080,%esi
    and %edi,%esi
    jnz .Lfound12

    add $16,%eax
.Laligned:
    sub $16,%edx
    jae .Lloop { Still more than 16 bytes remaining }

    { Process remaining bytes (<16 left at this point) }
    { length is offset by -16 at this point }
.Lloop2:
    cmp $4-16,%edx { < 4 bytes left?
}
    jb .Ltail
    mov (%eax),%esi
    xor %ecx,%esi
    lea -0x01010101(%esi),%edi
    xor %esi,%edi
    not %esi
    and $0x80808080,%esi
    and %edi,%esi
    jne .Lfound
    add $4,%eax
    sub $4,%edx
    jmp .Lloop2

.Ltail: { Less than 4 bytes remaining, check one by one }
    and $3, %edx
    jz .Lnotfound
.Lloop3:
    cmp %cl,(%eax)
    je .Lexit
    inc %eax
    dec %edx
    jnz .Lloop3
.Lnotfound:
    or $-1,%eax
    jmp .Lexit1

    { add missing source pointer increments }
.Lfound12:
    add $4,%eax
.Lfound8:
    add $4,%eax
.Lfound4:
    add $4,%eax

.Lfound:
    { esi has a bit set in each byte lane that matched; advance eax to the
      lowest such lane. }
    test $0xff,%esi
    jnz .Lexit
    inc %eax

    test $0xff00,%esi
    jnz .Lexit
    inc %eax

    test $0xff0000,%esi
    jnz .Lexit
    inc %eax

.Lexit:
    sub (%esp),%eax { convert the pointer into an offset from the saved 'buf' }
.Lexit1:
    pop %ecx { removes initial 'buf' value }
    pop %edi
    pop %esi
end;

{ IndexByte using SSE2. In the code below eax is buf, edx is len and the low
  byte of ecx is b. Returns in eax the offset of the first byte equal to b, or
  -1. The buffer is read in aligned 16-byte blocks, starting with the block
  that contains buf; match bits belonging to bytes before buf are cleared in
  that first block. Because whole blocks are compared, a match position is
  checked against len before it is returned. }
function IndexByte_SSE2(const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
asm
    test %edx, %edx
    jz .Lnotfound { exit if len=0 }

    push %ebx
    movd %ecx, %xmm1
    lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
    punpcklbw %xmm1, %xmm1
    and $-0x10, %ecx { first aligned address after buf }
    punpcklbw %xmm1, %xmm1
    pshufd $0, %xmm1, %xmm1 { xmm1 = b in all 16 bytes }

    movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
    sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }

    pcmpeqb %xmm1, %xmm0 { compare with pattern and get bitmask }
    pmovmskb %xmm0, %ebx

    shl %cl, %ebx { shift valid bits into high word }
    and $0xffff0000, %ebx { clear low word containing invalid bits }
    shr %cl, %ebx { shift back }
    jz .Lcontinue
.Lmatch:
    bsf %ebx, %ebx
    lea -16(%ecx,%ebx), %eax { eax = offset of the match from buf }
    pop %ebx
    cmp %eax, %edx { check against the buffer length }
    jbe .Lnotfound
    ret

    .balign 16
.Lloop:
    movdqa (%eax,%ecx), %xmm0 { eax and ecx may have any values, }
    add $16, %ecx { but their sum is evenly divisible by 16.
}
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    test %ebx, %ebx
    jnz .Lmatch
.Lcontinue:
    cmp %ecx, %edx { continue while len is above the offset of the next block }
    ja .Lloop
    pop %ebx
.Lnotfound:
    or $-1, %eax
end;

function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt; forward;

var
  { Current IndexByte implementation; starts out pointing at the dispatcher. }
  IndexByte_Impl: function(const buf;len:SizeInt;b:byte):SizeInt = @IndexByte_Dispatch;

{ Initial target of IndexByte_Impl. While fpc_cpucodeinit_performed is false it
  returns the result of IndexByte_Plain without rebinding; afterwards it rebinds
  IndexByte_Impl to the SSE2 or the plain variant depending on has_sse2_support
  and forwards the call. }
function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(IndexByte_Plain(buf,len,b));
  if has_sse2_support then
    IndexByte_Impl:=@IndexByte_SSE2
  else
    IndexByte_Impl:=@IndexByte_Plain;
  result:=IndexByte_Impl(buf,len,b);
end;

{ Public entry point: calls whatever IndexByte_Impl currently points to. }
function IndexByte(const buf;len:SizeInt;b:byte):SizeInt;
begin
  result:=IndexByte_Impl(buf,len,b);
end;
{$endif FPC_SYSTEM_HAS_INDEXBYTE}

{$ifndef FPC_SYSTEM_HAS_INDEXWORD}
{$define FPC_SYSTEM_HAS_INDEXWORD}
{ IndexWord without SSE2. In the code below eax is buf, edx is len in words and
  cx is b. A word-at-a-time loop; returns in eax the index in words of the
  first element equal to b, or -1 when len is 0 or nothing matches. }
function IndexWord_Plain(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
asm
    test %edx, %edx
    jz .LNotFound
    push %eax { remember the start of the buffer }
.LWordwise_Body: { Loop does not cross cache line if the function entry is aligned on 16 bytes.
}
    cmp %cx, (%eax)
    je .LFound
    add $2, %eax
    dec %edx
    jnz .LWordwise_Body
    pop %edx { discard the saved start of the buffer }
.LNotFound:
    or $-1, %eax
    ret

.LFound:
    pop %edx { edx = start of the buffer }
    sub %edx, %eax
    shr $1, %eax { byte offset -> word index }
end;

{ IndexWord using SSE2. In the code below eax is buf, edx is len in words and
  the low word of ecx is b. Returns in eax the word index of the first match
  or -1. Memory is read in aligned 16-byte blocks, so matches are checked
  against len before being returned.
  Two algorithms are used. If buf is word-aligned, blocks are compared wordwise
  (pcmpeqw). If buf is at an odd address, elements straddle the aligned word
  lanes, so blocks are compared bytewise against the byte-swapped pattern and a
  hit requires two adjacent matching bytes; the last byte of each block is
  carried over in esi to be paired with the first byte of the next block. }
function IndexWord_SSE2(const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
asm
    test %edx, %edx { exit if len=0 }
    je .Lnotfound
    push %ebx
    movd %ecx, %xmm1
    punpcklwd %xmm1, %xmm1
    pshufd $0, %xmm1, %xmm1 { xmm1 = b in all 8 words }
    lea 16(%eax), %ecx
    and $-16, %ecx
    movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 14 bytes before target) }
    sub %eax, %ecx

    test $1, %eax { if buffer isn't aligned to word boundary, }
    jnz .Lunaligned { use a different algorithm }

    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx

    { Clear the mask bits of bytes that lie before buf. }
    shl %cl, %ebx
    and $0xffff0000, %ebx
    shr %cl, %ebx
    shr $1, %ecx { ecx = number of valid words (the valid byte count halved) }
    test %ebx, %ebx
    jz .Lcontinue
.Lmatch:
    bsf %ebx, %ebx
    shr $1, %ebx { in words }
    lea -8(%ecx,%ebx), %eax
    pop %ebx
    cmp %eax, %edx
    jbe .Lnotfound { if match is after the specified length, ignore it }
    ret

    .balign 16
.Lloop:
    movdqa (%eax,%ecx,2), %xmm0
    add $8, %ecx
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    test %ebx, %ebx
    jnz .Lmatch
.Lcontinue:
    cmp %ecx, %edx
    ja .Lloop
    pop %ebx
.Lnotfound:
    or $-1, %eax
    ret

.Lunaligned:
    push %esi
    movdqa %xmm1, %xmm2 { (mis)align the pattern (in this particular case: }
    psllw $8, %xmm1 { swap bytes of each word of pattern) }
    psrlw $8, %xmm2
    por %xmm2, %xmm1

    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    shl %cl, %ebx
    and $0xffff0000, %ebx
    shr %cl, %ebx

    xor %esi, %esi { nothing to merge yet }
    add %edx, %edx { length words -> bytes }
    jmp .Lcontinue_u

    .balign 16
.Lloop_u:
    movdqa (%eax,%ecx), %xmm0
    add $16, %ecx
    pcmpeqb %xmm1, %xmm0 { compare by bytes }
    shr $16, %esi { bit 16 shifts into 0 }
    pmovmskb %xmm0, %ebx
.Lcontinue_u:
    shl $1, %ebx { 15:0 -> 16:1 }
    or %esi, %ebx { merge bit 0 from previous round }
    mov %ebx, %esi
    shr $1, %ebx { now AND together adjacent pairs of bits }
    and %esi, %ebx
    and $0x5555, %ebx { also reset odd bits }
    jnz .Lmatch_u
    cmp %ecx, %edx
    ja .Lloop_u

.Lnotfound_u:
    pop %esi
    pop %ebx
    or $-1, %eax
ret
.Lmatch_u:
    bsf %ebx, %ebx
    lea -16(%ecx,%ebx), %eax { eax = byte offset of the match from buf }
    cmp %eax, %edx
    jbe .Lnotfound_u { if match is after the specified length, ignore it }
    sar $1, %eax { in words }
    pop %esi
    pop %ebx
end;

function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt; forward;

var
  { Current IndexWord implementation; starts out pointing at the dispatcher. }
  IndexWord_Impl: function(const buf;len:SizeInt;b:word):SizeInt = @IndexWord_Dispatch;

{ Initial target of IndexWord_Impl. While fpc_cpucodeinit_performed is false it
  returns the result of IndexWord_Plain without rebinding; afterwards it rebinds
  IndexWord_Impl to the SSE2 or the plain variant depending on has_sse2_support
  and forwards the call. }
function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(IndexWord_Plain(buf,len,b));
  if has_sse2_support then
    IndexWord_Impl:=@IndexWord_SSE2
  else
    IndexWord_Impl:=@IndexWord_Plain;
  result:=IndexWord_Impl(buf,len,b);
end;

{ Public entry point: calls whatever IndexWord_Impl currently points to. }
function IndexWord(const buf;len:SizeInt;b:word):SizeInt; inline;
begin
  result:=IndexWord_Impl(buf,len,b);
end;
{$endif FPC_SYSTEM_HAS_INDEXWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXDWORD}
{$define FPC_SYSTEM_HAS_INDEXDWORD}
{ IndexDWord without SSE2. In the code below eax is buf, edx is len in dwords
  and ecx is b. A dword-at-a-time loop; returns in eax the index in dwords of
  the first element equal to b, or -1. The pointer is biased by -4 up front so
  that the loop can start with its increment. }
function IndexDWord_Plain(Const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
asm
    push %eax { remember the start of the buffer }
    sub $4, %eax
.LDWordwise_Next: { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
    add $4, %eax
    sub $1, %edx
    jb .LNotFound { the count went below zero: all elements were checked }
    cmp %ecx, (%eax)
    jne .LDWordwise_Next
    pop %edx
    sub %edx, %eax
    shr $2, %eax { byte offset -> dword index }
    ret

.LNotFound:
    pop %edx
    mov $-1, %eax
end;

{ IndexDWord using SSE2. In the code below eax is buf, edx is len in dwords and
  ecx is b. Returns in eax the dword index of the first match or -1.
  For len <= 4 the elements are compared one at a time. For longer buffers four
  dwords are compared per iteration with unaligned loads, and the final vector
  is loaded so that it ends exactly at the end of the buffer, which may overlap
  elements that were already checked. }
function IndexDWord_SSE2(const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
asm
    push %eax { remember the start of the buffer }
    sub $4, %edx
    jle .LDwordwise_Prepare
    movd %ecx, %xmm1
    pshufd $0, %xmm1, %xmm1 { xmm1 = b in all 4 dwords }
    .balign 16 { 1-byte NOP. }
.L4x_Body:
    movdqu (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ecx
    test %ecx, %ecx
    jnz .LFoundAtMask
    add $16, %eax
    sub $4, %edx
    jg .L4x_Body

    lea (%eax,%edx,4), %eax { edx <= 0 here: step back so that the last vector ends at the buffer end }
    movdqu (%eax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %ecx
    test %ecx, %ecx
    jz .LNothing
.LFoundAtMask:
    bsf %ecx, %ecx { byte offset of the first matching dword within the vector }
    add %ecx, %eax
.LFoundAtEax:
    pop %edx
    sub %edx, %eax
    shr $2, %eax
    ret
    nop { Turns .balign 16 before .LDwordwise_Body into a no-op.
}
.LDwordwise_Prepare:
    add $3, %edx { edx = len - 1 }
    cmp $-1, %edx
    je .LNothing { len = 0 }
    .balign 16 { no-op }
.LDwordwise_Body:
    cmp (%eax), %ecx
    je .LFoundAtEax
    add $4, %eax
    sub $1, %edx
    jae .LDwordwise_Body
.LNothing:
    pop %edx
    or $-1, %eax
end;

function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt; forward;

var
  { Current IndexDWord implementation; starts out pointing at the dispatcher. }
  IndexDWord_Impl: function(const buf;len:SizeInt;b:DWord):SizeInt = @IndexDWord_Dispatch;

{ Initial target of IndexDWord_Impl. While fpc_cpucodeinit_performed is false it
  returns the result of IndexDWord_Plain without rebinding; afterwards it
  rebinds IndexDWord_Impl to the SSE2 or the plain variant depending on
  has_sse2_support and forwards the call. }
function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(IndexDWord_Plain(buf,len,b));
  if has_sse2_support then
    IndexDWord_Impl:=@IndexDWord_SSE2
  else
    IndexDWord_Impl:=@IndexDWord_Plain;
  result:=IndexDWord_Impl(buf,len,b);
end;

{ Public entry point: calls whatever IndexDWord_Impl currently points to. }
function IndexDWord(const buf;len:SizeInt;b:DWord):SizeInt;
begin
  result:=IndexDWord_Impl(buf,len,b);
end;
{$endif FPC_SYSTEM_HAS_INDEXDWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
{$define FPC_SYSTEM_HAS_INDEXQWORD}
{ Returns in eax the index in qwords of the first element of buf equal to b, or
  -1. Each element is compared as two dwords, low half first. The stack slot
  that held the low half of b is reused to remember the original buf, and the
  8-byte argument is removed on the found path with ret $8. }
function IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, [esp+4] = b }
asm
    push %ebx
    mov 8(%esp), %ecx { ecx = b[0:31] }
    mov 12(%esp), %ebx { ebx = b[32:63] }
    mov %eax, 8(%esp) { remember original buf }
    sub $8, %eax
    .balign 16 { no-op }
.LQWordwise_Next:
    add $8, %eax
    sub $1, %edx
    jb .LNotFound
    cmp %ecx, (%eax)
    jne .LQWordwise_Next
    cmp %ebx, 4(%eax)
    jne .LQWordwise_Next
    sub 8(%esp), %eax
    pop %ebx
    shr $3, %eax { byte offset -> qword index }
    ret $8

.LNotFound:
    pop %ebx
    mov $-1, %eax
end;
{$endif FPC_SYSTEM_HAS_INDEXQWORD}

{$ifndef FPC_SYSTEM_HAS_COMPAREBYTE}
{$define FPC_SYSTEM_HAS_COMPAREBYTE}
{ CompareByte without SSE2. Returns 0 when the first len bytes are equal,
  otherwise -1 or +1 according to the unsigned comparison of the first
  differing byte of buf1 against buf2. Lengths up to 3 are compared bytewise;
  longer buffers are compared a dword at a time with buf1 aligned to 4 bytes,
  and a differing dword pair is byte-swapped so that a single unsigned compare
  orders it by its first differing byte. }
function CompareByte_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
    { eax = buf1, edx = buf2, ecx = len }
    push %ebx
    sub %eax, %edx { edx = buf2 - buf1 }
    cmp $3, %ecx
    jle .LBytewise_Prepare

    { Align buf1 on 4 bytes.
}
    mov (%edx,%eax), %ebx
    cmp (%eax), %ebx
    jne .L4xDiffer
    lea -4(%eax,%ecx), %ecx { ecx = buf1 end - (4 + buf1 and -4) = count remaining }
    and $-4, %eax
    sub %eax, %ecx

    .balign 16
.L4x_Next:
    add $4, %eax
    sub $4, %ecx { at .LLast4, ecx is 4 less than remaining bytes }
    jle .LLast4
    mov (%edx,%eax), %ebx
    cmp (%eax), %ebx
    je .L4x_Next
.L4xDiffer:
    { ebx = differing dword of buf2; edx is reused for the dword of buf1. }
    mov (%eax), %edx
{$ifdef CPUX86_HAS_BSWAP}
    bswap %ebx
    bswap %edx
{$else}
    { Same byte reversal done with rotates. }
    rol $8, %bx
    rol $16, %ebx
    rol $8, %bx
    rol $8, %dx
    rol $16, %edx
    rol $8, %dx
{$endif}
    cmp %ebx, %edx
.LDoSbb:
    sbb %eax, %eax { eax = -1 if the compare borrowed, else 0 }
    or $1, %eax { ... which becomes -1 or +1 }
    pop %ebx
    ret

.LLast4:
    add %ecx, %eax { step back so that the last dword ends at the buffer end }
    mov (%edx,%eax), %ebx
    cmp (%eax), %ebx
    jne .L4xDiffer
    xor %eax, %eax
    pop %ebx
    ret

.LBytewise_Prepare:
    sub $1, %ecx
    jb .LNothing
    .balign 16 { no-op }
.LBytewise_Body:
    movzbl (%edx,%eax), %ebx
    cmp %bl, (%eax)
    jne .LDoSbb
    add $1, %eax
    sub $1, %ecx
    jae .LBytewise_Body
.LNothing:
    xor %eax, %eax
    pop %ebx
end;

{ CompareByte using SSE2. Returns 0 when the compared bytes are equal. On a
  mismatch most paths return the difference of the first differing bytes
  (buf1 byte minus buf2 byte); the .L4xDiffer and unbounded paths return
  -1 or +1.
  Paths by length: len = 1 compares a single byte; len = 0 returns 0; a
  negative len compares bytes without a length bound until the first mismatch;
  2..15 bytes over-read one vector from each buffer when neither read can cross
  a 4 KiB page boundary and otherwise fall back to scalar loads; 16 bytes and
  more compare 16-byte vectors, with a 32-bytes-per-iteration loop on aligned
  buf1 for longer inputs. }
function CompareByte_SSE2(const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
asm
    { eax = buf1, edx = buf2, ecx = len }
    cmp $1, %ecx
    jle .L1OrLess

    push %ebx
    cmp $16, %ecx
    jae .LVecOrMore

    { 2 to 15 bytes: check for page cross. Pessimistic variant that has false positives, but uses 1 less register and 2 less instructions. }
    mov %eax, %ebx
    or %edx, %ebx
    and $4095, %ebx
    cmp $4080, %ebx
    ja .LCantOverReadBoth

    { Over-read both as XMMs. }
    movdqu (%eax), %xmm0
    movdqu (%edx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx { Here and below, 2-byte 'inc r16' after 'pmovmskb' can be replaced with 5-byte 'add $1, r16' or 6-byte 'xor $65535, r32'. }
    jz .LNothing
    bsf %ebx, %ebx
    cmp %ecx, %ebx { Ignore garbage beyond 'len'. }
    jae .LNothing
    movzbl (%eax,%ebx), %eax
    movzbl (%edx,%ebx), %edx
    sub %edx, %eax
    pop %ebx
    ret

.LNothing:
    pop %ebx
    xor %eax, %eax
    ret

.LVecOrMore:
    { Compare first vectors. }
    movdqu (%eax), %xmm0
    movdqu (%edx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec0Differs

    sub $32, %ecx { now ecx is len - 32...
mostly just to save bytes on offsets improving .LAligned32xLoop_Body alignment :) }
    jbe .LLastVec

    { Compare second vectors. }
    movdqu 16(%eax), %xmm0
    movdqu 16(%edx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVec1Differs

    { More than four vectors: aligned loop. }
    cmp $32, %ecx
    ja .LAligned32xLoop_Prepare

    { Compare last two vectors. }
    movdqu (%eax,%ecx), %xmm0
    movdqu (%edx,%ecx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVecEm2Differs
.LLastVec:
    movdqu 16(%eax,%ecx), %xmm0
    movdqu 16(%edx,%ecx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ebx
    inc %bx
    jnz .LVecEm1Differs
    pop %ebx
    xor %eax, %eax
    ret

.LVecEm2Differs:
    sub $16, %ecx { the second-to-last vector starts 16 bytes earlier than the last one }
.LVecEm1Differs:
    bsf %ebx, %ebx
    add %ecx, %ebx
    movzbl 16(%eax,%ebx), %eax
    movzbl 16(%edx,%ebx), %edx
    sub %edx, %eax
    pop %ebx
    ret
    nop { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }

.LAligned32xLoop_Prepare:
    lea -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
    sub %eax, %edx { edx = buf2 - buf1 }
    and $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
    sub %eax, %ecx { ecx = count to be handled with loop }
    .balign 16 { No-op. }
.LAligned32xLoop_Body:
    add $32, %eax
    { Compare two XMMs, reduce the result with 'and'. }
    movdqu (%edx,%eax), %xmm0
    pcmpeqb (%eax), %xmm0 { xmm0 = pcmpeqb(buf1, buf2) }
    movdqu 16(%edx,%eax), %xmm1
    pcmpeqb 16(%eax), %xmm1
    pand %xmm0, %xmm1 { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
    pmovmskb %xmm1, %ebx
    inc %bx
    jnz .LAligned32xLoop_TwoVectorsDiffer
    sub $32, %ecx
    ja .LAligned32xLoop_Body

    { Compare last two vectors after the loop by doing one more loop iteration, modified.
}
    lea 32(%eax,%ecx), %eax
    { buf1 is no longer aligned here, so it is loaded with movdqu as well. }
    movdqu (%edx,%eax), %xmm0
    movdqu (%eax), %xmm2
    pcmpeqb %xmm2, %xmm0
    movdqu 16(%edx,%eax), %xmm1
    movdqu 16(%eax), %xmm2
    pcmpeqb %xmm2, %xmm1
    pand %xmm0, %xmm1
    pmovmskb %xmm1, %ebx
    inc %bx
    jnz .LAligned32xLoop_TwoVectorsDiffer
    pop %ebx
    xor %eax, %eax
    ret

.LAligned32xLoop_TwoVectorsDiffer:
    add %eax, %edx { restore edx = buf2 }
    pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
    inc %cx
    jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
    bsf %ecx, %ebx
    movzbl (%eax,%ebx), %eax
    movzbl (%edx,%ebx), %edx
    sub %edx, %eax
    pop %ebx
    ret

.LVec1Differs:
    add $16, %eax
    add $16, %edx
.LVec0Differs:
    bsf %ebx, %ebx
    movzbl (%eax,%ebx), %eax
    movzbl (%edx,%ebx), %edx
    sub %edx, %eax
    pop %ebx
    ret

.LCantOverReadBoth:
    { 2..15 bytes close to a page end: compare with scalar loads that stay
      inside the buffers. 4..15 bytes use up to four possibly overlapping
      dword pairs. }
    cmp $3, %ecx
    jle .L2to3
    push %esi
    mov (%eax), %ebx
    mov (%edx), %esi
    cmp %esi, %ebx
    jne .L4xDiffer
    cmp $8, %ecx
    jbe .LLast4x
    mov 4(%eax), %ebx
    mov 4(%edx), %esi
    cmp %esi, %ebx
    jne .L4xDiffer
    mov -8(%eax,%ecx), %ebx
    mov -8(%edx,%ecx), %esi
    cmp %esi, %ebx
    jne .L4xDiffer
.LLast4x:
    mov -4(%eax,%ecx), %ebx
    mov -4(%edx,%ecx), %esi
    cmp %esi, %ebx
    jne .L4xDiffer
    pop %esi
    pop %ebx
    xor %eax, %eax
    ret

.L4xDiffer:
    { ebx = dword of buf1, esi = dword of buf2; byte-swap both so that one
      unsigned compare orders them by their first differing byte. }
    bswap %ebx
    bswap %esi
    cmp %esi, %ebx
    pop %esi
    sbb %eax, %eax
    or $1, %eax
    pop %ebx
    ret

.L2to3:
    { Pack the first two bytes and the last byte of each buffer into one
      integer, earlier bytes in higher bits, and return the difference of the
      two integers. }
    movzwl (%edx), %ebx
    bswap %ebx
    shr $1, %ebx
    mov -1(%edx,%ecx), %bl
    movzwl (%eax), %edx
    bswap %edx
    shr $1, %edx
    mov -1(%eax,%ecx), %dl
    mov %edx, %eax
    sub %ebx, %eax
    pop %ebx
    ret

.L1OrLess:
    jl .LUnbounded_Prepare { flags from 'cmp $1, %ecx' at entry: len < 1 }
    movzbl (%eax), %eax
    movzbl (%edx), %edx
    sub %edx, %eax
    ret

.LUnbounded_Prepare:
    sub %eax, %edx { edx = buf2 - buf1 }
    test %ecx, %ecx
    jnz .LUnbounded_Body { negative len: compare until the first mismatch }
    xor %eax, %eax
    ret

    .balign 16
.LUnbounded_Next:
    add $1, %eax
.LUnbounded_Body:
    movzbl (%edx,%eax), %ecx
    cmp %cl, (%eax)
    je .LUnbounded_Next
    sbb %eax, %eax
    or $1, %eax
end;

function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;

var
  { Current CompareByte implementation; starts out pointing at the dispatcher. }
  CompareByte_Impl: function(const buf1, buf2;
len: SizeInt): SizeInt = @CompareByte_Dispatch;

{ Initial target of CompareByte_Impl. While fpc_cpucodeinit_performed is false
  it returns the result of CompareByte_Plain without rebinding; afterwards it
  rebinds CompareByte_Impl to the SSE2 or the plain variant depending on
  has_sse2_support and forwards the call. }
function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
begin
  if not fpc_cpucodeinit_performed then
    exit(CompareByte_Plain(buf1, buf2, len));
  if has_sse2_support then
    CompareByte_Impl:=@CompareByte_SSE2
  else
    CompareByte_Impl:=@CompareByte_Plain;
  result:=CompareByte_Impl(buf1, buf2, len);
end;

{ Public entry point: calls whatever CompareByte_Impl currently points to. }
function CompareByte(const buf1, buf2; len: SizeInt): SizeInt;
begin
  result:=CompareByte_Impl(buf1, buf2, len);
end;
{$endif FPC_SYSTEM_HAS_COMPAREBYTE}

{$ifndef FPC_SYSTEM_HAS_COMPAREWORD}
{$define FPC_SYSTEM_HAS_COMPAREWORD}
{ CompareWord without SSE2. In the code below eax is buf1, edx is buf2 and ecx
  is len in words. Returns 0 when all compared words are equal, otherwise -1 or
  +1 according to the unsigned comparison of the first differing word of buf1
  against buf2. Small and out-of-range lengths (see the comment in the body) are
  compared a word at a time; otherwise two words are compared per step, after
  one leading word compare when bit 1 of the buf1 address is set. }
function CompareWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
    push %ebx
    sub %eax, %edx { edx = buf2 - buf1 }
    lea -4(%ecx), %ebx { Go wordwise if ecx <= 3 or ecx > 1073741823 (High(int32) div 2) ==> uint32(ecx - 4) > 1073741819. }
    cmp $1073741819, %ebx
    ja .LWordwise_Prepare
    test $2, %al
    je .LAlignedToPtrUintOrNaturallyMisaligned
    movzwl (%edx,%eax), %ebx
    cmp %bx, (%eax)
    jne .LDoSbb
    add $2, %eax
    sub $1, %ecx
.LAlignedToPtrUintOrNaturallyMisaligned:
    sub $2, %ecx
    .balign 16
.LPtrUintWise_Next:
    mov (%edx,%eax), %ebx
    cmp %ebx, (%eax)
    jne .LPtrUintsDiffer
    add $4, %eax
    sub $2, %ecx
    jg .LPtrUintWise_Next
    lea (%eax,%ecx,2), %eax { ecx <= 0 here: step back so that the last dword ends at the buffer end }
    mov (%edx,%eax), %ebx
    cmp %ebx, (%eax)
    jne .LPtrUintsDiffer
    pop %ebx
    xor %eax, %eax
    ret

.LPtrUintsDiffer:
    { Decide by the low word first; if that is equal, by the high word. }
    cmp %bx, (%eax)
    jne .LDoSbb
    shr $16, %ebx
    cmp %bx, 2(%eax)
.LDoSbb:
    sbb %eax, %eax
    or $1, %eax
    pop %ebx
    ret

    .balign 16
.LWordwise_Body:
    movzwl (%edx,%eax), %ebx
    cmp %bx, (%eax)
    jne .LDoSbb
    add $2, %eax
.LWordwise_Prepare:
    sub $1, %ecx
    jnb .LWordwise_Body
    pop %ebx
    xor %eax, %eax
end;

{ CompareWord using SSE2. In the code below eax is buf1, edx is buf2 and ecx is
  len in words. Returns 0 when all compared words are equal; on a mismatch the
  vector paths return the difference of the first differing words and the
  wordwise path returns -1 or +1. Fewer than 8 words are compared by
  over-reading one vector when neither read can cross a 4 KiB page boundary and
  wordwise otherwise; 8 words and more use 16-byte vectors. }
function CompareWord_SSE2(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
    push %ebx
    sub %eax, %edx { edx = buf2 - buf1 }
    lea -2(%ecx), %ebx { Go wordwise if ecx <= 1 or uint32(ecx) > 1073741823 (High(int32) div 2) ==> uint32(ecx - 2) > 1073741821.
} cmp $1073741821, %ebx ja .LWordwise_Prepare cmp $8, %ecx jge .LVecOrMore lea (%edx,%eax), %ebx or %eax, %ebx and $4095, %ebx cmp $4080, %ebx ja .LWordwise_Prepare movdqu (%edx,%eax), %xmm0 movdqu (%eax), %xmm1 pcmpeqw %xmm1, %xmm0 pmovmskb %xmm0, %ebx inc %bx jz .LNothing shl $1, %ecx { convert to bytes } bsf %ebx, %ebx cmp %ecx, %ebx jb .LSubtractWords .LNothing: pop %ebx xor %eax, %eax ret .balign 16 .LWordwise_Body: movzwl (%edx,%eax), %ebx cmp %bx, (%eax) jne .LDoSbb add $2, %eax .LWordwise_Prepare: sub $1, %ecx jae .LWordwise_Body xor %eax, %eax pop %ebx ret .LDoSbb: sbb %eax, %eax or $1, %eax pop %ebx ret .LVecOrMore: movdqu (%edx,%eax), %xmm0 { Compare first vectors. } movdqu (%eax), %xmm1 pcmpeqw %xmm1, %xmm0 pmovmskb %xmm0, %ebx inc %bx jnz .LVec0Differs shl $1, %ecx { convert to bytes } sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately } jle .LLastVec push %eax { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). } add %eax, %ecx and $-16, %eax { align buf1; +16 is performed by the loop. 
} sub %eax, %ecx .balign 16 .LAligned8xLoop_Body: add $16, %eax movdqu (%edx,%eax), %xmm0 pcmpeqb (%eax), %xmm0 pmovmskb %xmm0, %ebx inc %bx jnz .LAligned8xLoop_VecDiffers sub $16, %ecx ja .LAligned8xLoop_Body pop %ebx { drop original buf1 } .LLastVec: lea 16(%eax,%ecx), %eax { point to the last 16 bytes } movdqu (%edx,%eax), %xmm0 movdqu (%eax), %xmm1 pcmpeqw %xmm1, %xmm0 pmovmskb %xmm0, %ebx inc %bx jnz .LVec0Differs pop %ebx xor %eax, %eax ret .LVec0Differs: bsf %ebx, %ebx .LSubtractWords: add %eax, %edx movzwl (%eax,%ebx), %eax movzwl (%edx,%ebx), %edx sub %edx, %eax pop %ebx ret .LAligned8xLoop_VecDiffers: bsf %ebx, %ebx add %ebx, %eax pop %ecx sub %ecx, %eax and $-2, %eax add %ecx, %eax movzwl (%edx,%eax), %edx movzwl (%eax), %eax sub %edx, %eax pop %ebx end; function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward; var CompareWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareWord_Dispatch; function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; begin if not fpc_cpucodeinit_performed then exit(CompareWord_Plain(buf1, buf2, len)); if has_sse2_support then CompareWord_Impl:=@CompareWord_SSE2 else CompareWord_Impl:=@CompareWord_Plain; result:=CompareWord_Impl(buf1, buf2, len); end; function CompareWord(const buf1, buf2; len: SizeInt): SizeInt; begin result:=CompareWord_Impl(buf1, buf2, len); end; {$endif FPC_SYSTEM_HAS_COMPAREWORD} {$ifndef FPC_SYSTEM_HAS_COMPAREDWORD} {$define FPC_SYSTEM_HAS_COMPAREDWORD} function CompareDWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe; asm sub $1, %ecx jb .LNothing push %ebx sub %eax, %edx .balign 16 .LDwordwise_Body: mov (%edx,%eax), %ebx cmp %ebx, (%eax) jne .LDoSbb add $4, %eax sub $1, %ecx jnb .LDwordwise_Body pop %ebx .LNothing: xor %eax, %eax ret .LDoSbb: pop %ebx sbb %eax, %eax or $1, %eax end; function CompareDWord_SSE2(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe; asm push %ebx sub %eax, %edx { edx = buf2 - buf1 } lea 
-5(%ecx), %ebx { Go dwordwise if ecx <= 4 or ecx > 536870911 (High(int32) div 4) ==> uint32(ecx - 5) > 536870906. } cmp $536870906, %ebx ja .LDwordwise_Prepare shl $2, %ecx { convert to bytes } movdqu (%edx,%eax), %xmm1 { Compare first vectors. } movdqu (%eax), %xmm0 pcmpeqd %xmm1, %xmm0 pmovmskb %xmm0, %ebx inc %bx jnz .LVec0Differs sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately } jle .LLastVec push %eax { save original buf1 to recover uint32 position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). } add %eax, %ecx and $-16, %eax { align buf1; +16 is performed by the loop. } sub %eax, %ecx .balign 16 .LAligned4xLoop_Body: add $16, %eax movdqu (%eax,%edx), %xmm0 pcmpeqb (%eax), %xmm0 pmovmskb %xmm0, %ebx inc %bx jnz .LAligned4xLoop_VecDiffers sub $16, %ecx ja .LAligned4xLoop_Body pop %ebx { drop original buf1 } .LLastVec: lea 16(%eax,%ecx), %eax { point to the last 16 bytes } movdqu (%edx,%eax), %xmm1 movdqu (%eax), %xmm0 pcmpeqd %xmm1, %xmm0 pmovmskb %xmm0, %ebx inc %bx jnz .LVec0Differs pop %ebx xor %eax, %eax ret .LVec0Differs: bsf %ebx, %ebx add %eax, %edx { recover edx = buf2 } mov (%edx,%ebx), %edx cmp %edx, (%eax,%ebx) sbb %eax, %eax or $1, %eax pop %ebx ret .LAligned4xLoop_VecDiffers: bsf %ebx, %ebx add %ebx, %eax pop %ecx sub %ecx, %eax and $-4, %eax add %ecx, %eax mov (%edx,%eax), %edx cmp %edx, (%eax) .LDoSbb: sbb %eax, %eax or $1, %eax pop %ebx ret .balign 16 .LDwordwise_Body: mov (%edx,%eax), %ebx cmp %ebx, (%eax) jne .LDoSbb add $4, %eax .LDwordwise_Prepare: sub $1, %ecx jnb .LDwordwise_Body pop %ebx xor %eax, %eax end; function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward; var CompareDWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareDWord_Dispatch; function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; begin if not fpc_cpucodeinit_performed then exit(CompareDWord_Plain(buf1, buf2, len)); if has_sse2_support then 
CompareDWord_Impl:=@CompareDWord_SSE2 else CompareDWord_Impl:=@CompareDWord_Plain; result:=CompareDWord_Impl(buf1, buf2, len); end; function CompareDWord(const buf1, buf2; len: SizeInt): SizeInt; begin result:=CompareDWord_Impl(buf1, buf2, len); end; {$endif FPC_SYSTEM_HAS_COMPAREDWORD} {$ifndef FPC_SYSTEM_HAS_INDEXCHAR0} {$define FPC_SYSTEM_HAS_INDEXCHAR0} function IndexChar0(Const buf;len:SizeInt;b:AnsiChar):SizeInt; assembler; var saveesi,saveebx : longint; asm movl %esi,saveesi movl %ebx,saveebx // Can't use scasb, or will have to do it twice, think this // is faster for small "len" movl %eax,%esi // Load address movzbl %cl,%ebx // Load searchpattern testl %edx,%edx je .LFound xorl %ecx,%ecx // zero index in Buf xorl %eax,%eax // To make DWord compares possible .balign 4 .LLoop: movb (%esi),%al // Load byte cmpb %al,%bl je .LFound // byte the same? incl %ecx incl %esi cmpl %edx,%ecx // Maximal distance reached? je .LNotFound testl %eax,%eax // Nullchar = end of search? jne .LLoop .LNotFound: movl $-1,%ecx // Not found return -1 .LFound: movl %ecx,%eax movl saveesi,%esi movl saveebx,%ebx end; {$endif FPC_SYSTEM_HAS_INDEXCHAR0} {**************************************************************************** String ****************************************************************************} {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN} {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN} procedure fpc_shortstr_to_shortstr(out res:shortstring; const sstr: shortstring);assembler;[public,alias:'FPC_SHORTSTR_TO_SHORTSTR']; compilerproc; {$ifndef FPC_PROFILE} nostackframe; {$endif} { eax = res, edx = high(res), ecx = sstr } asm {$ifdef FPC_PROFILE} push %eax push %edx push %ecx call mcount pop %ecx pop %edx pop %eax {$endif FPC_PROFILE} cmp (%ecx), %dl { length(sstr) fits into res? 
} jbe .LEdxIsLen { use high(res) if length(sstr) does not fit } movzbl (%ecx), %edx { use length(sstr) } .LEdxIsLen: mov %dl, (%eax) { store length to res[0] } xchg %ecx, %edx { ecx = length = Move count, edx = sstr } xchg %eax, %edx { eax = sstr = Move src, edx = res = Move dest } inc %eax inc %edx {$ifdef FPC_PROFILE} {$ifdef FPC_SYSTEM_STACKALIGNMENT16} lea -8(%esp), %esp {$endif FPC_SYSTEM_STACKALIGNMENT16} call Move {$ifdef FPC_SYSTEM_STACKALIGNMENT16} lea 8(%esp), %esp {$endif FPC_SYSTEM_STACKALIGNMENT16} {$else FPC_PROFILE} jmp Move {$endif FPC_PROFILE} end; procedure fpc_shortstr_assign(len:longint;sstr,dstr:pointer);[public,alias:'FPC_SHORTSTR_ASSIGN']; begin asm {$ifdef FPC_PROFILE} push %eax push %edx push %ecx call mcount pop %ecx pop %edx pop %eax {$endif FPC_PROFILE} pushl %eax pushl %ecx {$ifdef FPC_ENABLED_CLD} cld {$endif FPC_ENABLED_CLD} movl dstr,%edi movl sstr,%esi xorl %eax,%eax movl len,%ecx lodsb cmpl %ecx,%eax jbe .LStrCopy1 movl %ecx,%eax .LStrCopy1: stosb cmpl $7,%eax jl .LStrCopy2 movl %edi,%ecx { Align on 32bits } negl %ecx andl $3,%ecx subl %ecx,%eax rep movsb movl %eax,%ecx andl $3,%eax shrl $2,%ecx rep movsl .LStrCopy2: movl %eax,%ecx rep movsb popl %ecx popl %eax end ['ESI','EDI']; end; {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN} {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE} {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE} function fpc_shortstr_compare(const left,right:shortstring): longint;assembler; [public,alias:'FPC_SHORTSTR_COMPARE']; compilerproc; { eax = left, edx = right } asm {$ifdef FPC_PROFILE} push %eax push %edx push %ecx call mcount pop %ecx pop %edx pop %eax {$endif FPC_PROFILE} push %ebx movzbl (%eax), %ecx { ecx = len(left) } movzbl (%edx), %ebx { ebx = len(right) } cmp %ebx, %ecx {$ifdef CPUX86_HAS_CMOV} cmovg %ebx, %ecx {$else} jle .LEcxIsLen mov %ebx, %ecx .LEcxIsLen: {$endif} push %eax { save left } inc %eax inc %edx { stack is already aligned to 16 bytes if required: return address + push ebp + push ebx + push eax. 
} {$if defined(FPC_PIC) or not declared(CompareByte_Impl)} call CompareByte {$else} call CompareByte_Impl { manually inline CompareByte } {$endif} pop %edx { restore left } test %eax, %eax jnz .LReturn movzbl (%edx), %eax sub %ebx, %eax .LReturn: pop %ebx end; {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE} {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL} {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL} function fpc_shortstr_compare_equal(const left,right:shortstring) : longint;assembler;nostackframe; [public,alias:'FPC_SHORTSTR_COMPARE_EQUAL']; compilerproc; nostackframe; { eax = left, edx = right } asm movzbl (%eax), %ecx cmp (%edx), %cl jne .LNotEqual inc %eax inc %edx {$if defined(FPC_PIC) or not declared(CompareByte_Impl)} jmp CompareByte {$else} jmp CompareByte_Impl { manually inline CompareByte } {$endif} .LNotEqual: or $-1, %eax end; {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL} {$ifndef FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR} {$define FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR} procedure fpc_pchar_to_shortstr(out res : shortstring;p:PAnsiChar);assembler;[public,alias:'FPC_PCHAR_TO_SHORTSTR']; compilerproc; {$ifndef FPC_PROFILE} nostackframe; {$endif} // eax = res, edx = high(res), ecx = p asm {$ifdef FPC_PROFILE} push %eax push %edx push %ecx call mcount pop %ecx pop %edx pop %eax {$endif FPC_PROFILE} test %ecx, %ecx jz .LEmpty push %eax { save res } push %ecx { save p } push %edx { save high(res) } mov %ecx, %eax { eax = IndexByte.buf } { edx is already high(res) = IndexByte.count. Careful: using high(res) instead of -1 limits the scan by high(res) which is a good thing, but assumes that IndexByte is “safe” and won’t read potentially invalid memory past the searched byte even if formally (and wrongly) allowed by ‘count’. Generic and x86 versions are “safe”. } xor %ecx, %ecx { ecx = 0 = IndexByte.value } { Stack is already aligned on 16 bytes if the function is nostackframe: return address + push eax + push ecx + push edx. 
With a stack frame, there is an additional push ebp and need 12 more bytes to align. } {$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)} leal -12(%esp), %esp {$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)} {$if defined(FPC_PIC) or not declared(IndexByte_Impl)} call IndexByte {$else} call IndexByte_Impl { manually inline IndexByte } {$endif} {$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)} leal 12(%esp), %esp {$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)} pop %ecx { ecx = high(res) = Move.len } test %eax, %eax { If IndexByte result (eax) is non-negative (terminator is among first high(res) characters), use it, otherwise keep high(res). } {$ifdef CPUX86_HAS_CMOV} cmovns %eax, %ecx {$else} js .LEcxIsLen mov %eax, %ecx .LEcxIsLen: {$endif} pop %eax { pop p to eax = Move.src } pop %edx { pop res to edx } mov %cl, (%edx) { res[0] := len } inc %edx { res[1] = Move.dst } {$ifdef FPC_PROFILE} {$ifdef FPC_SYSTEM_STACKALIGNMENT16} leal -12(%esp), %esp {$endif FPC_SYSTEM_STACKALIGNMENT16} call Move {$ifdef FPC_SYSTEM_STACKALIGNMENT16} leal 12(%esp), %esp {$endif FPC_SYSTEM_STACKALIGNMENT16} jmp .LReturn {$else FPC_PROFILE} jmp Move { can perform a tail call } {$endif FPC_PROFILE} .LEmpty: movb $0, (%eax) {$ifdef FPC_PROFILE} .LReturn: {$endif} end; {$endif FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR} {$IFNDEF INTERNAL_BACKTRACE} {$define FPC_SYSTEM_HAS_GET_FRAME} function get_frame:pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif} asm movl %ebp,%eax end; {$ENDIF not INTERNAL_BACKTRACE} {$define FPC_SYSTEM_HAS_GET_PC_ADDR} Function Get_pc_addr : Pointer;assembler;nostackframe; asm movl (%esp),%eax end; {$define FPC_SYSTEM_HAS_GET_CALLER_ADDR} function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer; {$if defined(win32)} { Windows has StackTop always properly set } begin if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then Result:=PPointer(framebp+4)^ else 
Result:=nil; end; {$else defined(win32)} nostackframe;assembler; asm orl %eax,%eax jz .Lg_a_null movl 4(%eax),%eax .Lg_a_null: end; {$endif defined(win32)} {$define FPC_SYSTEM_HAS_GET_CALLER_FRAME} function get_caller_frame(framebp:pointer;addr:pointer=nil):pointer; {$if defined(win32)} { Windows has StackTop always properly set } begin if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then Result:=PPointer(framebp)^ else Result:=nil; end; {$else defined(win32)} nostackframe;assembler; asm orl %eax,%eax jz .Lgnf_null movl (%eax),%eax .Lgnf_null: end; {$endif defined(win32)} {$define FPC_SYSTEM_HAS_SPTR} Function Sptr : Pointer;assembler;nostackframe; asm movl %esp,%eax end; {**************************************************************************** Str() ****************************************************************************} {$if defined(disabled) and defined(regcall) } {$define FPC_SYSTEM_HAS_INT_STR_LONGWORD} {$define FPC_SYSTEM_HAS_INT_STR_LONGINT} label str_int_shortcut; procedure int_str(l:longword;out s:shortstring);assembler;nostackframe; asm pushl %esi pushl %edi pushl %ebx mov %edx,%edi xor %edx,%edx jmp str_int_shortcut end; procedure int_str(l:longint;out s:shortstring);assembler;nostackframe; {Optimized for speed, but balanced with size.} const digits:array[0..9] of cardinal=(0,10,100,1000,10000, 100000,1000000,10000000, 100000000,1000000000); asm {$ifdef FPC_PROFILE} push %eax push %edx push %ecx call mcount pop %ecx pop %edx pop %eax {$endif FPC_PROFILE} push %esi push %edi push %ebx movl %edx,%edi { Calculate absolute value and put sign in edx} cltd xorl %edx,%eax subl %edx,%eax negl %edx str_int_shortcut: movl %ecx,%esi {Calculate amount of digits in ecx.} xorl %ecx,%ecx bsrl %eax,%ecx incl %ecx imul $1233,%ecx shr $12,%ecx {$ifdef FPC_PIC} call fpc_geteipasebx {$ifdef darwin} movl digits-.Lpic(%ebx),%ebx {$else} addl $_GLOBAL_OFFSET_TABLE_,%ebx movl digits@GOT(%ebx),%ebx {$endif} cmpl (%ebx,%ecx,4),%eax {$else} cmpl 
digits(,%ecx,4),%eax {$endif} cmc adcl $0,%ecx {Nr. digits ready in ecx.} {Write length & sign.} lea (%edx,%ecx),%ebx movb $45,%bh {movb $'-,%bh Not supported by our ATT reader.} movw %bx,(%edi) addl %edx,%edi subl %edx,%esi {Skip digits beyond string length.} movl %eax,%edx subl %ecx,%esi jae .Lloop_write .balign 4 .Lloop_skip: movl $0xcccccccd,%eax {Divide by 10 using mul+shr} mull %edx shrl $3,%edx decl %ecx jz .Ldone {If (l<0) and (high(s)=1) this jump is taken.} incl %esi jnz .Lloop_skip {Write out digits.} .balign 4 .Lloop_write: movl $0xcccccccd,%eax {Divide by 10 using mul+shr} {Pre-add '0'} leal 48(%edx),%ebx {leal $'0(,%edx),%ebx Not supported by our ATT reader.} mull %edx shrl $3,%edx leal (%edx,%edx,8),%eax {x mod 10 = x-10*(x div 10)} subl %edx,%ebx subl %eax,%ebx movb %bl,(%edi,%ecx) decl %ecx jnz .Lloop_write .Ldone: popl %ebx popl %edi popl %esi end; {$endif} {**************************************************************************** Bounds Check ****************************************************************************} { do a thread-safe inc/dec } {$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT} function cpudeclocked(var l : longint) : boolean;assembler;nostackframe; asm lock decl (%eax) setzb %al end; {$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT} procedure cpuinclocked(var l : longint);assembler;nostackframe; asm lock incl (%eax) end; // inline SMP check and normal lock. // the locked one is so slow, inlining doesn't matter. 
{ Decrements l and returns true if it reached zero. Uses a plain dec while the program is single-threaded
  and the lock-prefixed cpudeclocked otherwise. }
function declocked(var l : longint) : boolean; inline;

begin
  if not ismultithread then
    begin
      dec(l);
      declocked:=l=0;
    end
  else
    declocked:=cpudeclocked(l);
end;

{ Increments l. Uses a plain inc while the program is single-threaded and the lock-prefixed cpuinclocked
  otherwise. }
procedure inclocked(var l : longint); inline;

begin
  if not ismultithread then
    inc(l)
  else
    cpuinclocked(l);
end;


{ Atomically subtracts 1 from Target (eax = address) and returns the new value. }
function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
asm
        movl    $-1,%edx
        lock
        xaddl   %edx, (%eax)    { edx receives the previous value }
        lea     -1(%edx),%eax
end;


{ Atomically adds 1 to Target (eax = address) and returns the new value. }
function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
asm
        movl    $1,%edx
        lock
        xaddl   %edx, (%eax)    { edx receives the previous value }
        lea     1(%edx),%eax
end;


{ Stores Source (edx) into Target (eax = address) and returns the previous value of Target. }
function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
        xchgl   (%eax),%edx
        movl    %edx,%eax
end;


{ Atomically adds Source (edx) to Target (eax = address) and returns the previous value of Target. }
function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
        lock
        xaddl   %edx, (%eax)
        movl    %edx,%eax
end;


{ Atomically replaces Target with NewValue (edx) if Target equals Comperand (ecx).
  Returns the value cmpxchg leaves in eax: the previous contents of Target. }
function InterlockedCompareExchange(var Target: longint; NewValue: longint; Comperand: longint): longint; assembler; nostackframe;
asm
        xchgl   %eax,%ecx       { eax = Comperand, ecx = address of Target, as cmpxchg expects }
        lock
        cmpxchgl %edx, (%ecx)
end;


{ 64-bit compare-and-exchange via lock cmpxchg8b: edx:eax = Comperand, ecx:ebx = NewValue,
  edi = address of Target. The result is left in edx:eax by the instruction. }
function InterlockedCompareExchange64(var Target: int64; NewValue: int64; Comperand: int64): int64; assembler;
asm
        pushl       %ebx
        pushl       %edi
        movl        %eax,%edi
        movl        Comperand+4,%edx
        movl        Comperand+0,%eax
        movl        NewValue+4,%ecx
        movl        NewValue+0,%ebx
        lock cmpxchg8b (%edi)
        pop         %edi
        pop         %ebx
end;



{****************************************************************************
                                  FPU
****************************************************************************}

const
  { Internal constants for use in system unit }
  FPU_Invalid = 1;
  FPU_Denormal = 2;
  FPU_DivisionByZero = 4;
  FPU_Overflow = 8;
  FPU_Underflow = $10;
  FPU_StackUnderflow = $20;
  FPU_StackOverflow = $40;
  FPU_ExceptionMask = $ff;

  MM_Invalid = 1;
  MM_Denormal = 2;
  MM_DivisionByZero = 4;
  MM_Overflow = 8;
  MM_Underflow = $10;
  MM_Precicion = $20;
  MM_ExceptionMask = $3f;

  MM_MaskInvalidOp = %0000000010000000;
  MM_MaskDenorm    = %0000000100000000;
  MM_MaskDivZero   = %0000001000000000;
  MM_MaskOverflow  = %0000010000000000;
  MM_MaskUnderflow = %0000100000000000;
  MM_MaskPrecision = %0001000000000000;


{$define FPC_SYSTEM_HAS_SYSINITFPU}
{ Intentionally empty on this target. }
Procedure SysInitFPU;
  begin
  end;


{$define FPC_SYSTEM_HAS_SYSRESETFPU}
{ Reinitializes the x87 FPU and loads Default8087CW as its control word; when has_sse_support is set,
  also loads DefaultMXCSR into MXCSR. }
Procedure SysResetFPU;
  var
    { these locals are so we don't have to hack pic code in the assembler }
    localmxcsr: dword;
    localfpucw: word;
  begin
    localfpucw:=Default8087CW;
    asm
      fninit
      fwait
      fldcw   localfpucw
    end;
    if has_sse_support then
      begin
        localmxcsr:=DefaultMXCSR;
        asm
          { setup sse exceptions }
{$ifndef OLD_ASSEMBLER}
          ldmxcsr localmxcsr
{$else OLD_ASSEMBLER}
          mov     localmxcsr,%eax
          subl    $4,%esp
          mov     %eax,(%esp)
          //ldmxcsr (%esp)
          .byte   0x0f,0xae,0x14,0x24
          addl    $4,%esp
{$endif OLD_ASSEMBLER}
        end;
      end;
  end;


{ because of the brain dead sse detection on x86, this test is post poned }
{ Detects CPU features with CPUID and fills has_mmx_support, has_sse_support, has_sse2_support,
  has_sse3_support, fast_large_repmovstosb, has_avx_support and has_avx2_support. SSE is additionally
  probed by executing an SSE instruction while sse_check is set. Afterwards the FPU is reset and
  fpc_cpucodeinit_performed is set, which lets the *_Dispatch stubs pick their final implementations. }
procedure fpc_cpucodeinit;
  var
    _eax,_ecx_cpuid1,_edx_cpuid1,_ebx_cpuid7 : longint;
  begin
    if cpuid_support then
      begin
        asm
          movl $1,%eax
          xorl %ecx,%ecx
          cpuid
          movl %edx,_edx_cpuid1
          movl %ecx,_ecx_cpuid1
        end ['ebx'];
        has_mmx_support:=(_edx_cpuid1 and $800000)<>0;
        if ((_edx_cpuid1 and $2000000)<>0) then
          begin
            os_supports_sse:=true;
            sse_check:=true;
            asm
              { force an sse exception if no sse is supported, the exception handler sets
                os_supports_sse to false then }
              { don't change this instruction, the code above depends on its size }
{$ifdef OLD_ASSEMBLER}
              .byte 0x0f,0x28,0xf7
{$else}
              movaps %xmm7, %xmm6
{$endif OLD_ASSEMBLER}
            end;
            sse_check:=false;
            has_sse_support:=os_supports_sse;
          end;
        if has_sse_support then
          begin
            has_sse2_support:=((_edx_cpuid1 and $4000000)<>0);
            { CPUID.1:ECX bit 0 is SSE3; bit 9 ($200), which was tested here before, is SSSE3. }
            has_sse3_support:=((_ecx_cpuid1 and $1)<>0);
            { now avx }
            asm
              xorl %eax,%eax
              cpuid
              movl %eax,_eax
            end ['ebx','ecx','edx']; { cpuid overwrites these implicitly }
            if _eax>=7 then
              begin
                asm
                  movl $7,%eax
                  xorl %ecx,%ecx
                  cpuid
                  movl %ebx,_ebx_cpuid7
                end ['edx']; { cpuid also overwrites edx }
                fast_large_repmovstosb:=_ebx_cpuid7 and (1 shl 9)<>0;
                if (_ecx_cpuid1 and $08000000<>0 {XGETBV support?}) then
                  begin
                    asm
                      xorl %ecx,%ecx
                      .byte   0x0f,0x01,0xd0 { xgetbv }
                      movl %eax,_eax
                    end ['edx']; { xgetbv returns its result in edx:eax }
                    { bits 1 and 2 of the value read by xgetbv must both be set before AVX is reported }
                    if (_eax and 6)=6 then
                      begin
                        has_avx_support:=(_ecx_cpuid1 and $10000000)<>0;
                        has_avx2_support:=(_ebx_cpuid7 and $20)<>0;
                      end;
                  end;
              end;
          end;
      end;

    { don't let libraries influence the FPU cw set by the host program }
    if IsLibrary then
      begin
        Default8087CW:=Get8087CW;
        if has_sse_support then
          DefaultMXCSR:=GetMXCSR;
      end;

    SysResetFPU;
    fpc_cpucodeinit_performed:=true;
  end;


{$if not defined(darwin) and defined(regcall) }
{ darwin requires that the stack is aligned to 16 bytes when calling another function }

{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}

{$define FPC_SYSTEM_HAS_ANSISTR_DECR_REF}
{ Releases one reference to the ansistring S (eax = address of S): S is set to nil, and unless the
  reference count at offset -8 is negative it is decremented (with a LOCK prefix when IsMultiThread is
  set). When the count reaches zero the allocation starting at offset -12 is passed to FPC_FREEMEM. }
Procedure fpc_AnsiStr_Decr_Ref (Var S : Pointer); [Public,Alias:'FPC_ANSISTR_DECR_REF']; compilerproc; nostackframe; assembler;
asm
        movl    (%eax),%edx
        testl   %edx,%edx
        jz      .Lquit
        movl    $0,(%eax)       // s:=nil
        cmpl    $0,-8(%edx)     // exit if refcount<0
        jl      .Lquit
{$ifdef FPC_PIC}
        call    fpc_geteipasecx
        addl    $_GLOBAL_OFFSET_TABLE_,%ecx
        movl    ismultithread@GOT(%ecx),%ecx
        cmpl    $0,(%ecx)
{$else FPC_PIC}
        cmpl    $0,ismultithread
{$endif FPC_PIC}
        je      .Lskiplock
        .byte   0xF0            // LOCK prefix, jumped over if IsMultiThread = false. FPC assembler does not accept disjoint LOCK mnemonic.
.Lskiplock:
        decl    -8(%edx)
        jz      .Lfree
.Lquit:
        ret
.Lfree:
        leal    -12(%edx),%eax  // points to start of allocation
        { freemem is not an assembler leaf function like fpc_geteipasecx, so it
          needs to be called with proper stack alignment }
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
        leal    -12(%esp),%esp
        call    FPC_FREEMEM
        leal    12(%esp),%esp
{$else  FPC_SYSTEM_STACKALIGNMENT16}
        jmp     FPC_FREEMEM     // can perform a tail call
{$endif FPC_SYSTEM_STACKALIGNMENT16}
end;

function fpc_truely_ansistr_unique(Var S : Pointer): Pointer; forward;

{$define FPC_SYSTEM_HAS_ANSISTR_UNIQUE}
{ Returns pointer(S) unchanged if S is nil or its reference count (offset -8) is 1; otherwise returns the
  result of fpc_truely_ansistr_unique(S). }
Function fpc_ansistr_Unique(Var S : Pointer): Pointer; [Public,Alias : 'FPC_ANSISTR_UNIQUE']; compilerproc; nostackframe;assembler;
asm
// Var S located in register
// Var $result located in register
        movl    %eax,%edx
// [437] pointer(result) := pointer(s);
        movl    (%eax),%eax
// [438] If Pointer(S)=Nil then
        testl   %eax,%eax
        je      .Lj4031
.Lj4036:
// [440] if PAnsiRec(Pointer(S)-Firstoff)^.Ref<>1 then
        movl    -8(%eax),%ecx
        cmpl    $1,%ecx
        je      .Lj4038
// [441] result:=fpc_truely_ansistr_unique(s);
        movl    %edx,%eax
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
        leal    -12(%esp),%esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
        call    fpc_truely_ansistr_unique
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
        leal    12(%esp),%esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
.Lj4038:
.Lj4031:
// [442] end;
end;

{$endif FPC_HAS_FEATURE_ANSISTRINGS}

{$endif ndef darwin and defined(regcall) }

{$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
{$define FPC_SYSTEM_HAS_MEM_BARRIER}

{ Read barrier: lfence when SSE2 is assumed at compile time, otherwise a locked no-op add on the stack. }
procedure ReadBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSE2}
  lfence
{$else CPUX86_HAS_SSE2}
  lock
  addl $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;

procedure ReadDependencyBarrier;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
  { reads imply barrier on earlier reads depended on }
end;

{ Full barrier: mfence when SSE2 is assumed at compile time, otherwise a locked no-op add on the stack. }
procedure ReadWriteBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSE2}
  mfence
{$else CPUX86_HAS_SSE2}
  lock
  addl $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;

{ Write barrier: sfence when CPUX86_HAS_SSEUNIT is defined, otherwise no instruction is emitted. }
procedure WriteBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSEUNIT}
  sfence
{$endif CPUX86_HAS_SSEUNIT}
end;

{$endif}

{$ifndef FPC_SYSTEM_HAS_BSF_QWORD}
{$define FPC_SYSTEM_HAS_BSF_QWORD}

{ Index (0..63) of the lowest set bit of AValue, or 255 if AValue is zero.
  AValue is read from the stack: low dword at 4(%esp), high dword at 8(%esp). }
function BsfQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
asm
     bsfl    4(%esp),%eax
     jz     .L1
     ret    $8
.L1:
     bsfl    8(%esp),%eax
     jz     .L2
     add    $32,%eax
     ret    $8
.L2:
     movl    $255,%eax
end;
{$endif FPC_SYSTEM_HAS_BSF_QWORD}


{$ifndef FPC_SYSTEM_HAS_BSR_QWORD}
{$define FPC_SYSTEM_HAS_BSR_QWORD}
{ Index (0..63) of the highest set bit of AValue, or 255 if AValue is zero.
  AValue is read from the stack: low dword at 4(%esp), high dword at 8(%esp). }
function BsrQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
asm
     bsrl    8(%esp),%eax
     jz     .L1
     add    $32,%eax
     ret    $8
.L1:
     bsrl    4(%esp),%eax
     jz     .L2
     ret    $8
.L2:
     movl    $255,%eax
end;
{$endif FPC_SYSTEM_HAS_BSR_QWORD}


{$ifndef FPC_SYSTEM_HAS_SAR_QWORD}
{$define FPC_SYSTEM_HAS_SAR_QWORD}
{ Arithmetic right shift of a 64-bit value. AValue is on the stack (low dword at 4(%esp), high dword at
  8(%esp)), Shift is in al; the result is returned in edx:eax.
  Shifts below 32 use shrd/sar; for Shift >= 32 the high dword is moved to the low half, shifted by the
  5 low bits of Shift, and the high half becomes the sign.
  NOTE(review): Shift is not masked to 6 bits here, so values of 64 and above behave like
  32 + (Shift and 31) - confirm that callers never pass them. }
function fpc_SarInt64(Const AValue : Int64;const Shift : Byte): Int64; [Public,Alias:'FPC_SARINT64']; compilerproc; assembler; nostackframe;
asm
        movl        8(%esp),%edx
        movzbl      %al,%ecx
        cmpb        $32,%al
        jnb         .L1
        movl        4(%esp),%eax
        shrdl       %cl,%edx,%eax
        sarl        %cl,%edx
        ret         $8
.L1:
        movl        %edx,%eax
        sarl        $31,%edx
        sarl        %cl,%eax // uses 5 lower bits of cl.
end;
{$endif FPC_SYSTEM_HAS_SAR_QWORD}