Diffstat (limited to 'libc/string/x86_64/memset.S')
 libc/string/x86_64/memset.S | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)
diff --git a/libc/string/x86_64/memset.S b/libc/string/x86_64/memset.S
index d72d74468..46751006b 100644
--- a/libc/string/x86_64/memset.S
+++ b/libc/string/x86_64/memset.S
@@ -53,15 +53,17 @@ ENTRY (memset)
 	imul	%rax,%r8
 #endif
 	test	$0x7,%edi		/* Check for alignment.  */
-	je	2f
+	jz	2f
 
-	.p2align 4
-1:	/* Align ptr to 8 byte.  */
+	/* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+	.p2align 4,,9
+1:
+	/* Align ptr to 8 byte.  */
 	mov	%sil,(%rcx)
 	dec	%rdx
 	inc	%rcx
-	test	$0x7,%ecx
-	jne	1b
+	test	$0x7,%cl
+	jnz	1b
 
 2:	/* Check for really large regions.  */
 	mov	%rdx,%rax
@@ -70,8 +72,10 @@ ENTRY (memset)
 	cmp	LARGE, %rdx
 	jae	11f
 
-	.p2align 4
-3:	/* Copy 64 bytes.  */
+	/* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+	.p2align 4,,11
+3:
+	/* Fill 64 bytes.  */
 	mov	%r8,(%rcx)
 	mov	%r8,0x8(%rcx)
 	mov	%r8,0x10(%rcx)
@@ -84,7 +88,7 @@ ENTRY (memset)
 	dec	%rax
 	jne	3b
 
-4:	/* Copy final bytes.  */
+4:	/* Fill final bytes.  */
 	and	$0x3f,%edx
 	mov	%rdx,%rax
 	shr	$0x3,%rax
@@ -107,16 +111,18 @@ ENTRY (memset)
 	jne	8b
 9:
 #if BZERO_P
-	nop
+	/* nothing */
 #else
 	/* Load result (only if used as memset).  */
 	mov	%rdi,%rax	/* start address of destination is result */
 #endif
 	retq
 
-	.p2align 4
-11:	/* Copy 64 bytes without polluting the cache.  */
-	/* We could use	movntdq    %xmm0,(%rcx) here to further
+	/* Next 3 insns are 14 bytes total, make sure we decode them in one go */
+	.p2align 4,,14
+11:
+	/* Fill 64 bytes without polluting the cache.  */
+	/* We could use	movntdq    %xmm0,(%rcx) here to further
 	   speed up for large cases but let's not use XMM registers.  */
 	movnti	%r8,(%rcx)
 	movnti	%r8,0x8(%rcx)
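A note on the repeated ".p2align 4,,N" changes: GNU as accepts an optional third operand, MAX-SKIP, and emits the requested alignment only when it costs at most that many padding bytes. A minimal assemblable sketch of what the first hunk arranges, reusing its three-instruction loop body (the byte counts in the comments are the standard 64-bit encodings):

	.text
	/* .p2align POW2, FILL, MAX-SKIP: align to 2^POW2 bytes, but emit
	   no padding at all if more than MAX-SKIP bytes would be needed.
	   With MAX-SKIP = 9, the 9-byte body below gets 16-byte alignment
	   only when it is cheap, and an aligned 9-byte body fits in a
	   single 16-byte fetch/decode window. */
	.p2align 4,,9
1:	mov	%sil,(%rcx)	/* 40 88 31  -- 3 bytes */
	dec	%rdx		/* 48 ff ca  -- 3 bytes */
	inc	%rcx		/* 48 ff c1  -- 3 bytes, 9 in total */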
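The %ecx -> %cl change in the same hunk shrinks the loop's tail: TEST against a 32-bit register has no imm8 form, so its immediate always costs four bytes, while the 8-bit form takes one. With a 3-byte test and a 2-byte short jnz after the 9-byte body, the whole five-instruction loop is 14 bytes and, once aligned, fits in a single 16-byte fetch window. A small sketch of the size difference:

	.text
	test	$0x7,%ecx	/* f7 c1 07 00 00 00  -- 6 bytes */
	test	$0x7,%cl	/* f6 c1 07           -- 3 bytes */
	/* jz/jnz assemble to exactly the same bytes as je/jne;
	   that part of the rename is purely cosmetic. */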
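The same arithmetic holds for the other two alignment sites: the three plain stores after label 3 come to 3 + 4 + 4 = 11 bytes, and the three movnti stores after label 11 come to exactly the new MAX-SKIP of 14. A sketch of the large-region case (byte counts again from the standard encodings for these operands):

	.text
	/* movnti writes the 64-bit value with a non-temporal hint, i.e.
	   around the caches, so huge fills do not evict the working set. */
	movnti	%r8,(%rcx)	/* 4c 0f c3 01     -- 4 bytes */
	movnti	%r8,0x8(%rcx)	/* 4c 0f c3 41 08  -- 5 bytes */
	movnti	%r8,0x10(%rcx)	/* 4c 0f c3 41 10  -- 5 bytes, 14 in total */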