Diffstat (limited to 'libc/string/x86_64/memset.S')
-rw-r--r--  libc/string/x86_64/memset.S  30
1 file changed, 18 insertions(+), 12 deletions(-)
diff --git a/libc/string/x86_64/memset.S b/libc/string/x86_64/memset.S
index d72d74468..46751006b 100644
--- a/libc/string/x86_64/memset.S
+++ b/libc/string/x86_64/memset.S
@@ -53,15 +53,17 @@ ENTRY (memset)
imul %rax,%r8
#endif
test $0x7,%edi /* Check for alignment. */
- je 2f
+ jz 2f
- .p2align 4
-1: /* Align ptr to 8 byte. */
+ /* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+ .p2align 4,,9
+1:
+ /* Align ptr to 8 byte. */
mov %sil,(%rcx)
dec %rdx
inc %rcx
- test $0x7,%ecx
- jne 1b
+ test $0x7,%cl
+ jnz 1b
2: /* Check for really large regions. */
mov %rdx,%rax
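
For readers following the assembly: `imul %rax,%r8` multiplies the zero-extended fill byte by 0x0101010101010101, broadcasting it into all eight bytes of %r8, and the loop at label 1 stores single bytes until %rcx reaches 8-byte alignment. The new `.p2align 4,,9` aligns label 1 to a 16-byte boundary only when that costs at most 9 padding bytes, matching the 9-byte mov/dec/inc group the comment mentions, so the loop stays in one decode window without unconditional nop padding. A minimal C sketch of this prologue, with hypothetical names:

    /* C sketch of the pattern broadcast and alignment prologue above.
       Illustrative only; align_prologue_sketch and its parameters are
       hypothetical names, not part of the library. */
    #include <stdint.h>
    #include <stddef.h>

    static void align_prologue_sketch(unsigned char **p, size_t *n, int c,
                                      uint64_t *pattern)
    {
        /* imul %rax,%r8: byte * 0x0101010101010101 fills all 8 lanes. */
        *pattern = (uint64_t)(unsigned char)c * 0x0101010101010101ULL;

        /* Loop at 1: mov %sil,(%rcx); dec %rdx; inc %rcx until aligned.
           The surrounding asm presumably guarantees enough remaining
           bytes here, which is why the loop tests only alignment, as
           test $0x7,%cl does. */
        while (((uintptr_t)*p & 0x7) != 0) {
            *(*p)++ = (unsigned char)c;
            (*n)--;
        }
    }
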
@@ -70,8 +72,10 @@ ENTRY (memset)
cmp LARGE, %rdx
jae 11f
- .p2align 4
-3: /* Copy 64 bytes. */
+ /* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+ .p2align 4,,11
+3:
+ /* Fill 64 bytes. */
mov %r8,(%rcx)
mov %r8,0x8(%rcx)
mov %r8,0x10(%rcx)
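
This hunk shows only the first three of the eight qword stores at label 3; assuming the elided lines continue through offset 0x38 and advance %rcx by 0x40 per iteration, the loop fills 64 bytes at a time while %rax counts blocks (the 11 bytes in `.p2align 4,,11` are the three stores shown). A C sketch under those assumptions:

    /* C sketch of the 64-byte fill loop at label 3 (hypothetical names).
       `dst` is already 8-byte aligned, as the prologue ensured. */
    #include <stdint.h>
    #include <stddef.h>

    static unsigned char *fill64_sketch(unsigned char *dst, uint64_t pattern,
                                        size_t blocks)
    {
        while (blocks--) {                   /* dec %rax / jne 3b */
            uint64_t *q = (uint64_t *)dst;
            q[0] = pattern; q[1] = pattern;  /* mov %r8,(%rcx) ... */
            q[2] = pattern; q[3] = pattern;
            q[4] = pattern; q[5] = pattern;
            q[6] = pattern; q[7] = pattern;  /* ... mov %r8,0x38(%rcx) */
            dst += 64;                       /* add $0x40,%rcx (assumed) */
        }
        return dst;
    }
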
@@ -84,7 +88,7 @@ ENTRY (memset)
dec %rax
jne 3b
-4: /* Copy final bytes. */
+4: /* Fill final bytes. */
and $0x3f,%edx
mov %rdx,%rax
shr $0x3,%rax
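
`and $0x3f,%edx` keeps the low six bits of the length (the remainder modulo 64) and `shr $0x3,%rax` converts that to a count of remaining 8-byte stores; the final 0-7 bytes are written at labels this diff does not show. A hedged C sketch of that tail logic:

    /* C sketch of the tail fill at label 4 (hypothetical names). The
       byte-granular finish happens at labels this diff does not show,
       so that part is an assumption. */
    #include <stdint.h>
    #include <stddef.h>

    static void fill_tail_sketch(unsigned char *dst, uint64_t pattern,
                                 size_t len)
    {
        size_t tail   = len & 0x3f;   /* and $0x3f,%edx */
        size_t qwords = tail >> 3;    /* shr $0x3,%rax  */

        while (qwords--) {
            *(uint64_t *)dst = pattern;   /* dst still 8-byte aligned */
            dst += 8;
        }
        for (tail &= 0x7; tail; tail--)       /* final 0-7 bytes */
            *dst++ = (unsigned char)pattern;  /* low byte == fill byte */
    }
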
@@ -107,16 +111,18 @@ ENTRY (memset)
jne 8b
9:
#if BZERO_P
- nop
+ /* nothing */
#else
/* Load result (only if used as memset). */
mov %rdi,%rax /* start address of destination is result */
#endif
retq
- .p2align 4
-11: /* Copy 64 bytes without polluting the cache. */
- /* We could use movntdq %xmm0,(%rcx) here to further
+ /* Next 3 insns are 14 bytes total, make sure we decode them in one go */
+ .p2align 4,,14
+11:
+ /* Fill 64 bytes without polluting the cache. */
+ /* We could use movntdq %xmm0,(%rcx) here to further
speed up for large cases but let's not use XMM registers. */
movnti %r8,(%rcx)
movnti %r8,0x8(%rcx)
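
For regions of at least LARGE bytes, label 11 switches to `movnti`, a non-temporal store that writes around the cache so a huge memset does not evict the caller's working set; as the patched comment says, `movntdq` through %xmm0 could be faster still, but the routine deliberately stays in integer registers. A C sketch using `_mm_stream_si64`, the SSE2 intrinsic that compiles to movnti; the closing sfence is the conventional ordering step after streaming stores and is an assumption here, since the rest of the routine is not shown:

    /* C sketch of the non-temporal path at label 11 (hypothetical names). */
    #include <stdint.h>
    #include <stddef.h>
    #include <immintrin.h>

    static void fill64_nt_sketch(unsigned char *dst, uint64_t pattern,
                                 size_t blocks)
    {
        while (blocks--) {
            long long *q = (long long *)dst;
            for (int i = 0; i < 8; i++)       /* movnti %r8,i*8(%rcx) */
                _mm_stream_si64(q + i, (long long)pattern);
            dst += 64;
        }
        _mm_sfence();  /* order the streaming stores (assumed, not shown) */
    }
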