@@ -84,97 +84,95 @@ asm
     mov %rdi, %rcx
 {$endif win64}

-    mov %r8, %rax
-    sub %rdx, %rcx           { rcx = src - dest }
-    jz .Lquit                { exit if src=dest }
-    jnb .L1                  { src>dest => forward move }
-
-    add %rcx, %rax           { rcx is negative => r8+rcx > 0 if regions overlap }
-    jb .Lback                { if no overlap, still do forward move }
-
-.L1:
+    cmp $3, %r8
+    jle .L3OrLess
     cmp $8, %r8
-    jl .Lless8f              { signed compare, negative count not allowed }
-    test $7, %dl
-    je .Ldestaligned
-
-    test $1, %dl             { align dest by moving first 1+2+4 bytes }
-    je .L2f
-    mov (%rcx,%rdx,1),%al
-    dec %r8
-    mov %al, (%rdx)
-    add $1, %rdx
-.L2f:
-    test $2, %dl
-    je .L4f
-    mov (%rcx,%rdx,1),%ax
-    sub $2, %r8
-    mov %ax, (%rdx)
-    add $2, %rdx
-.L4f:
-    test $4, %dl
-    je .Ldestaligned
-    mov (%rcx,%rdx,1),%eax
-    sub $4, %r8
-    mov %eax, (%rdx)
-    add $4, %rdx
-
-.Ldestaligned:
-    mov %r8, %r9
-    shr $5, %r9
-    jne .Lmore32
-
-.Ltail:
-    mov %r8, %r9
-    shr $3, %r9
-    je .Lless8f
+    jle .L4to8
+    cmp $16, %r8
+    jle .L9to16
+    movdqu (%rcx), %xmm4     { First and last 16 bytes, used by both .L33OrMore and the 17–32 branch. }
+    movdqu -16(%rcx,%r8), %xmm5
+    cmp $32, %r8
+    jg .L33OrMore
+    movdqu %xmm4, (%rdx)     { 17–32 bytes }
+    movdqu %xmm5, -16(%rdx,%r8)
+    ret
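+    { Illustrative note: the small-count branches above and below (17–32, .L3OrLess, .L4to8, .L9to16) }
+    { all perform their loads before any store, so they are overlap-safe without going through .Lback. }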

     .balign 16
-.Lloop8f:                    { max. 8 iterations }
-    mov (%rcx,%rdx,1),%rax
-    mov %rax, (%rdx)
-    add $8, %rdx
-    dec %r9
-    jne .Lloop8f
-    and $7, %r8
+.L3OrLess:
+    cmp $1, %r8
+    jl .LZero
+    movzbl (%rcx), %eax
+    je .LOne
+    movzwl -2(%rcx,%r8), %r9d
+    mov %r9w, -2(%rdx,%r8)
+.LOne:
+    mov %al, (%rdx)
+.LZero:
+    ret

-.Lless8f:
-    test %r8, %r8
-    jle .Lquit
+.L4to8:
+    mov (%rcx), %eax
+    mov -4(%rcx,%r8), %r9d
+    mov %eax, (%rdx)
+    mov %r9d, -4(%rdx,%r8)
+    ret

-    .balign 16
-.Lloop1f:
-    mov (%rcx,%rdx,1),%al
-    mov %al,(%rdx)
-    inc %rdx
-    dec %r8
-    jne .Lloop1f
+.L9to16:
+    mov (%rcx), %rax
+    mov -8(%rcx,%r8), %r9
+    mov %rax, (%rdx)
+    mov %r9, -8(%rdx,%r8)
 .Lquit:
-    retq
+    ret
+    .byte 0x90,0x90,0x90     { Turns .balign 16 before .Lloop32f into a no-op. }

+.L33OrMore:
+    sub %rdx, %rcx           { rcx = src - dest }
+    jz .Lquit                { exit if src=dest }
+    jnb .LForward            { src>dest => forward move }

-.Lmore32:
-    cmp $0x2000, %r9         { this limit must be processor-specific (1/2 L2 cache size) }
-    jnae .Lloop32
-    cmp $0x1000, %rcx        { but don't bother bypassing cache if src and dest }
-    jnb .Lntloopf            { are close to each other}
+    mov %r8, %rax
+    add %rcx, %rax           { rcx is negative => r8+rcx > 0 if regions overlap }
+    jb .Lback                { if no overlap, still do forward move }
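+    { E.g. src=0x1000, dest=0x1010 gives rcx = -0x10: count=0x20 makes the add carry }
+    { (overlap => backward move), count=0x08 does not (no overlap => forward move). }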

-    .balign 16
-.Lloop32:
-    add $32,%rdx
-    mov -32(%rcx,%rdx,1),%rax
-    mov -24(%rcx,%rdx,1),%r10
-    mov %rax,-32(%rdx)
-    mov %r10,-24(%rdx)
-    dec %r9
-    mov -16(%rcx,%rdx,1),%rax
-    mov -8(%rcx,%rdx,1),%r10
-    mov %rax,-16(%rdx)
-    mov %r10,-8(%rdx)
-    jne .Lloop32
-
-    and $0x1f, %r8
-    jmpq .Ltail
+.LForward:
+    mov %rdx, %r9            { remember original dest to write first 16 bytes }
+    add %rdx, %r8            { Move dest to the next 16-byte boundary; +16 if already aligned, as the first 16 bytes will be written separately anyway. }
+    add $16, %rdx
+    and $-16, %rdx
+    sub %rdx, %r8
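+    { E.g. dest=0x1005, count=0x40: %rdx becomes 0x1010 and %r8 becomes 0x35, the bytes }
+    { from the aligned dest to the end; the first 16 come from %xmm4 at the very end. }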
+
+.LRestAfterNTf:
+    sub $32, %r8             { During the N× loop, r8 is N bytes less than what actually remains, allowing sub N + jae .LLoop instead of sub N + cmp N + jae .LLoop. }
+    jbe .LPost32f
+    cmp $0x40000, %r8        { this limit must be processor-specific (1/2 L2 cache size) }
+    jae .Lntf                { might jump back right away after more checks, but the branch is taken only on huge moves, so it is better to keep those checks out of the common path... }
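+    { Below, %r8 + 32 = bytes still to copy from (%rdx); the tail store at 16(%rdx,%r8) relies on this. }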
+
+    .balign 16               { no-op }
+.Lloop32f:
+    movdqu (%rcx,%rdx), %xmm0
+    movdqa %xmm0, (%rdx)
+    movdqu 16(%rcx,%rdx), %xmm0
+    movdqa %xmm0, 16(%rdx)
+    add $32, %rdx
+    sub $32, %r8
+    ja .Lloop32f
+
+.LPost32f:                   { +32 fixup not applied after 32× loop, r8 = remaining - 32 here. }
+    cmp $-16, %r8
+    jle .LFirstAndLast16f
+    movdqu (%rcx,%rdx), %xmm0
+    movdqa %xmm0, (%rdx)
+.LFirstAndLast16f:
+    movdqu %xmm5, 16(%rdx,%r8)  { Write first and last 16 bytes after everything else. }
+    movdqu %xmm4, (%r9)         { Important for <16-byte step between src and dest. }
+    ret
+
+.Lntf:
+    cmp $0x1000, %rcx        { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
+    jb .Lloop32f             { (this check is performed here to not stand in the way of smaller counts) }
+    sub $0xFE0, %r8          { r8 = remaining - 0x1000, but 32 was subtracted already, so must subtract only (0x1000 - 32) = 0xFE0. }

 .Lntloopf:
     mov $32, %eax
@@ -185,124 +183,72 @@ asm
     prefetchnta 0x40(%rcx,%rdx,1)
     add $0x80, %rdx
     dec %eax
-    jne .Lpref
+    jnz .Lpref

     sub $0x1000, %rdx
     mov $64, %eax
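+    { movntdq below requires a 16-byte-aligned destination: %rdx was aligned in .LForward and has only moved by multiples of 16 since. }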

     .balign 16
-.Loop64:
+.Lntloop64f:
     add $64, %rdx
-    mov -64(%rcx,%rdx,1), %r9
-    mov -56(%rcx,%rdx,1), %r10
-    movnti %r9, -64(%rdx)
-    movnti %r10, -56(%rdx)
-
-    mov -48(%rcx,%rdx,1), %r9
-    mov -40(%rcx,%rdx,1), %r10
-    movnti %r9, -48(%rdx)
-    movnti %r10, -40(%rdx)
+    movdqu -64(%rcx,%rdx,1), %xmm0
+    movntdq %xmm0, -64(%rdx)
+    movdqu -48(%rcx,%rdx,1), %xmm0
+    movntdq %xmm0, -48(%rdx)
+    movdqu -32(%rcx,%rdx,1), %xmm0
+    movntdq %xmm0, -32(%rdx)
+    movdqu -16(%rcx,%rdx,1), %xmm0
+    movntdq %xmm0, -16(%rdx)
     dec %eax
-    mov -32(%rcx,%rdx,1), %r9
-    mov -24(%rcx,%rdx,1), %r10
-    movnti %r9, -32(%rdx)
-    movnti %r10, -24(%rdx)
-
-    mov -16(%rcx,%rdx,1), %r9
-    mov -8(%rcx,%rdx,1), %r10
-    movnti %r9, -16(%rdx)
-    movnti %r10, -8(%rdx)
-    jne .Loop64
+    jnz .Lntloop64f

     sub $0x1000, %r8
-    cmp $0x1000, %r8
     jae .Lntloopf

     mfence
-    jmpq .Ldestaligned       { go handle remaining bytes }
+    add $0x1000, %r8
+    jmpq .LRestAfterNTf      { go handle remaining bytes }
+    .byte 0x90,0x90,0x90     { Turns .balign 16 before .Lloop32b into a no-op. }

 { backwards move }
 .Lback:
-    add %r8, %rdx            { points to the end of dest }
-    cmp $8, %r8
-    jl .Lless8b              { signed compare, negative count not allowed }
-    test $7, %dl
-    je .Ldestalignedb
-    test $1, %dl
-    je .L2b
-    dec %rdx
-    mov (%rcx,%rdx,1), %al
-    dec %r8
-    mov %al, (%rdx)
-.L2b:
-    test $2, %dl
-    je .L4b
-    sub $2, %rdx
-    mov (%rcx,%rdx,1), %ax
-    sub $2, %r8
-    mov %ax, (%rdx)
-.L4b:
-    test $4, %dl
-    je .Ldestalignedb
-    sub $4, %rdx
-    mov (%rcx,%rdx,1), %eax
-    sub $4, %r8
-    mov %eax, (%rdx)
-
-.Ldestalignedb:
-    mov %r8, %r9
-    shr $5, %r9
-    jne .Lmore32b
-
-.Ltailb:
-    mov %r8, %r9
-    shr $3, %r9
-    je .Lless8b
-
-.Lloop8b:
-    sub $8, %rdx
-    mov (%rcx,%rdx,1), %rax
-    dec %r9
-    mov %rax, (%rdx)
-    jne .Lloop8b
-    and $7, %r8
-
-.Lless8b:
-    test %r8, %r8
-    jle .Lquit2
-
-    .balign 16
-.Lsmallb:
-    dec %rdx
-    mov (%rcx,%rdx,1), %al
-    dec %r8
-    mov %al,(%rdx)
-    jnz .Lsmallb
-.Lquit2:
-    retq
-
-.Lmore32b:
-    cmp $0x2000, %r9
-    jnae .Lloop32b
-    cmp $0xfffffffffffff000,%rcx
-    jb .Lntloopb
-
-    .balign 16
+    lea (%rdx,%r8), %r9      { points to the end of dest; remember to write last 16 bytes }
+    lea -1(%r9), %r8         { move dest to the previous 16-byte boundary... }
+    and $-16, %r8
+    sub %rdx, %r8
+    add %r8, %rdx
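+    { E.g. dest=0x1005, count=0x40: %r9=0x1045, %rdx becomes 0x1040 and %r8 becomes 0x3B, the bytes }
+    { from dest up to the aligned top; the last 16 come from %xmm5 at the very end. }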
+
+.LRestAfterNTb:
+    sub $32, %r8             { As in the forward path: r8 is 32 bytes less than what remains below %rdx. }
+    jbe .LPost32b
+    cmp $0x40000, %r8        { same processor-specific limit as the forward path (1/2 L2 cache size) }
+    jae .Lntb
+
+    .balign 16               { no-op }
 .Lloop32b:
     sub $32, %rdx
-    mov 24(%rcx,%rdx,1), %rax
-    mov 16(%rcx,%rdx,1), %r10
-    mov %rax, 24(%rdx)
-    mov %r10, 16(%rdx)
-    dec %r9
-    mov 8(%rcx,%rdx,1),%rax
-    mov (%rcx,%rdx,1), %r10
-    mov %rax, 8(%rdx)
-    mov %r10, (%rdx)
-    jne .Lloop32b
-    and $0x1f, %r8
-    jmpq .Ltailb
+    movdqu 16(%rcx,%rdx), %xmm0
+    movdqa %xmm0, 16(%rdx)
+    movdqu (%rcx,%rdx), %xmm0
+    movdqa %xmm0, (%rdx)
+    sub $32, %r8
+    ja .Lloop32b
+
+.LPost32b:
+    cmp $-16, %r8
+    jle .LFirstAndLast16b
+    movdqu -16(%rcx,%rdx), %xmm0
+    movdqa %xmm0, -16(%rdx)
+.LFirstAndLast16b:
+    sub %r8, %rdx
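+    { After the sub, %rdx = original dest + 32, so -32(%rdx) is the very start of dest. }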
+    movdqu %xmm4, -32(%rdx)
+    movdqu %xmm5, -16(%r9)
+    ret

+.Lntb:
+    cmp $0xfffffffffffff000,%rcx  { don't bypass cache if src and dest are close to each other; rcx = src - dest is negative here }
+    jnb .Lloop32b
+    sub $0xFE0, %r8
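+    { Same fixup as the forward path: remaining -= 0x1000, less the 32 already subtracted. }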

 .Lntloopb:
     mov $32, %eax
@@ -319,34 +265,25 @@ asm
     mov $0x40, %eax

     .balign 16
-.Lloop64b:
+.Lntloop64b:
     sub $64, %rdx
-    mov 56(%rcx,%rdx,1), %r9
-    mov 48(%rcx,%rdx,1), %r10
-    movnti %r9, 56(%rdx)
-    movnti %r10, 48(%rdx)
-
-    mov 40(%rcx,%rdx,1), %r9
-    mov 32(%rcx,%rdx,1), %r10
-    movnti %r9, 40(%rdx)
-    movnti %r10, 32(%rdx)
+    movdqu 48(%rcx,%rdx,1), %xmm0
+    movntdq %xmm0, 48(%rdx)
+    movdqu 32(%rcx,%rdx,1), %xmm0
+    movntdq %xmm0, 32(%rdx)
+    movdqu 16(%rcx,%rdx,1), %xmm0
+    movntdq %xmm0, 16(%rdx)
+    movdqu (%rcx,%rdx,1), %xmm0
+    movntdq %xmm0, (%rdx)
     dec %eax
-    mov 24(%rcx,%rdx,1), %r9
-    mov 16(%rcx,%rdx,1), %r10
-    movnti %r9, 24(%rdx)
-    movnti %r10, 16(%rdx)
-
-    mov 8(%rcx,%rdx,1), %r9
-    mov (%rcx,%rdx,1), %r10
-    movnti %r9, 8(%rdx)
-    movnti %r10, (%rdx)
-    jne .Lloop64b
+    jnz .Lntloop64b

     sub $0x1000, %r8
-    cmp $0x1000, %r8
     jae .Lntloopb
+
     mfence
-    jmpq .Ldestalignedb
+    add $0x1000, %r8
+    jmpq .LRestAfterNTb
 end;
 {$endif FPC_SYSTEM_HAS_MOVE}