diff options
Diffstat (limited to 'libc/string/x86_64')
| -rw-r--r-- | libc/string/x86_64/_glibc_inc.h | 7 | ||||
| -rw-r--r-- | libc/string/x86_64/memcpy.S | 4 | ||||
| -rw-r--r-- | libc/string/x86_64/memset.S | 30 | ||||
| -rw-r--r-- | libc/string/x86_64/strcat.S | 20 | ||||
| -rw-r--r-- | libc/string/x86_64/strchr.S | 13 | ||||
| -rw-r--r-- | libc/string/x86_64/strcpy.S | 10 | ||||
| -rw-r--r-- | libc/string/x86_64/strcspn.S | 20 | ||||
| -rw-r--r-- | libc/string/x86_64/strlen.S | 15 | ||||
| -rw-r--r-- | libc/string/x86_64/strspn.S | 22 |
9 files changed, 93 insertions, 48 deletions
diff --git a/libc/string/x86_64/_glibc_inc.h b/libc/string/x86_64/_glibc_inc.h index 88cef2ea3..415ce90a7 100644 --- a/libc/string/x86_64/_glibc_inc.h +++ b/libc/string/x86_64/_glibc_inc.h @@ -6,15 +6,8 @@ #include <features.h> #include <bits/wordsize.h> -#if __WORDSIZE == 32 -# define ENTRY_ALIGN 4 -#else -# define ENTRY_ALIGN 2 -#endif - #define ENTRY(sym) \ .global sym; \ - .align ENTRY_ALIGN; \ .type sym,%function; \ sym: diff --git a/libc/string/x86_64/memcpy.S b/libc/string/x86_64/memcpy.S index b3bb0f96c..697b992d0 100644 --- a/libc/string/x86_64/memcpy.S +++ b/libc/string/x86_64/memcpy.S @@ -59,9 +59,9 @@ ENTRY (BP_SYM (memcpy)) subq $32, %rcx js 2f - .p2align 4 + /* Next 3 insns are 11 bytes total, make sure we decode them in one go */ + .p2align 4,,11 3: - /* Now correct the loop counter. Please note that in the following code the flags are not changed anymore. */ subq $32, %rcx diff --git a/libc/string/x86_64/memset.S b/libc/string/x86_64/memset.S index d72d74468..46751006b 100644 --- a/libc/string/x86_64/memset.S +++ b/libc/string/x86_64/memset.S @@ -53,15 +53,17 @@ ENTRY (memset) imul %rax,%r8 #endif test $0x7,%edi /* Check for alignment. */ - je 2f + jz 2f - .p2align 4 -1: /* Align ptr to 8 byte. */ + /* Next 3 insns are 9 bytes total, make sure we decode them in one go */ + .p2align 4,,9 +1: + /* Align ptr to 8 byte. */ mov %sil,(%rcx) dec %rdx inc %rcx - test $0x7,%ecx - jne 1b + test $0x7,%cl + jnz 1b 2: /* Check for really large regions. */ mov %rdx,%rax @@ -70,8 +72,10 @@ ENTRY (memset) cmp LARGE, %rdx jae 11f - .p2align 4 -3: /* Copy 64 bytes. */ + /* Next 3 insns are 11 bytes total, make sure we decode them in one go */ + .p2align 4,,11 +3: + /* Fill 64 bytes. */ mov %r8,(%rcx) mov %r8,0x8(%rcx) mov %r8,0x10(%rcx) @@ -84,7 +88,7 @@ ENTRY (memset) dec %rax jne 3b -4: /* Copy final bytes. */ +4: /* Fill final bytes. */ and $0x3f,%edx mov %rdx,%rax shr $0x3,%rax @@ -107,16 +111,18 @@ ENTRY (memset) jne 8b 9: #if BZERO_P - nop + /* nothing */ #else /* Load result (only if used as memset). */ mov %rdi,%rax /* start address of destination is result */ #endif retq - .p2align 4 -11: /* Copy 64 bytes without polluting the cache. */ - /* We could use movntdq %xmm0,(%rcx) here to further + /* Next 3 insns are 14 bytes total, make sure we decode them in one go */ + .p2align 4,,14 +11: + /* Fill 64 bytes without polluting the cache. */ + /* We could use movntdq %xmm0,(%rcx) here to further speed up for large cases but let's not use XMM registers. */ movnti %r8,(%rcx) movnti %r8,0x8(%rcx) diff --git a/libc/string/x86_64/strcat.S b/libc/string/x86_64/strcat.S index 9b0068981..23d068fea 100644 --- a/libc/string/x86_64/strcat.S +++ b/libc/string/x86_64/strcat.S @@ -21,6 +21,7 @@ #include "_glibc_inc.h" +/* Seems to be unrolled too much */ .text ENTRY (BP_SYM (strcat)) @@ -44,7 +45,9 @@ ENTRY (BP_SYM (strcat)) /* Now the source is aligned. Scan for NUL byte. */ - .p2align 4 + + /* Next 3 insns are 10 bytes total, make sure we decode them in one go */ + .p2align 4,,10 4: /* First unroll. */ movq (%rax), %rcx /* get double word (= 8 bytes) in question */ @@ -102,8 +105,11 @@ ENTRY (BP_SYM (strcat)) the addition will not result in 0. */ jz 4b /* no NUL found => continue loop */ - .p2align 4 /* Align, it's a jump target. */ -3: subq $8,%rax /* correct pointer increment. */ + /* Align, it is a jump target. */ + /* Next 3 insns are 8 bytes total, make sure we decode them in one go */ + .p2align 3,,8 +3: + subq $8,%rax /* correct pointer increment. */ testb %cl, %cl /* is first byte NUL? */ jz 2f /* yes => return */ @@ -159,7 +165,9 @@ ENTRY (BP_SYM (strcat)) /* Now the sources is aligned. Unfortunatly we cannot force to have both source and destination aligned, so ignore the alignment of the destination. */ - .p2align 4 + + /* Next 3 insns are 10 bytes total, make sure we decode them in one go */ + .p2align 4,,10 22: /* 1st unroll. */ movq (%rsi), %rax /* Read double word (8 bytes). */ @@ -236,7 +244,9 @@ ENTRY (BP_SYM (strcat)) /* Do the last few bytes. %rax contains the value to write. The loop is unrolled twice. */ - .p2align 4 + + /* Next 3 insns are 6 bytes total, make sure we decode them in one go */ + .p2align 3,,6 23: movb %al, (%rdx) /* 1st byte. */ testb %al, %al /* Is it NUL. */ diff --git a/libc/string/x86_64/strchr.S b/libc/string/x86_64/strchr.S index 8e59c4c19..9ef46b7f2 100644 --- a/libc/string/x86_64/strchr.S +++ b/libc/string/x86_64/strchr.S @@ -20,6 +20,7 @@ #include "_glibc_inc.h" +/* Seems to be unrolled too much */ .text ENTRY (BP_SYM (strchr)) @@ -91,7 +92,8 @@ ENTRY (BP_SYM (strchr)) each of whose bytes is C. This turns each byte that is C into a zero. */ - .p2align 4 + /* Next 3 insns are 10 bytes total, make sure we decode them in one go */ + .p2align 4,,10 4: /* Main Loop is unrolled 4 times. */ /* First unroll. */ @@ -229,8 +231,11 @@ ENTRY (BP_SYM (strchr)) reversed. */ - .p2align 4 /* Align, it's a jump target. */ -3: movq %r9,%rdx /* move to %rdx so that we can access bytes */ + /* Align, it's a jump target. */ + /* Next 3 insns are 9 bytes total, make sure we decode them in one go */ + .p2align 4,,9 +3: + movq %r9,%rdx /* move to %rdx so that we can access bytes */ subq $8,%rax /* correct pointer increment. */ testb %cl, %cl /* is first byte C? */ jz 6f /* yes => return pointer */ @@ -280,7 +285,7 @@ ENTRY (BP_SYM (strchr)) incq %rax 6: - nop + /* nop - huh?? */ retq END (BP_SYM (strchr)) diff --git a/libc/string/x86_64/strcpy.S b/libc/string/x86_64/strcpy.S index d9a51b0bb..612a30d1a 100644 --- a/libc/string/x86_64/strcpy.S +++ b/libc/string/x86_64/strcpy.S @@ -20,6 +20,8 @@ #include "_glibc_inc.h" +/* Seems to be unrolled too much */ + #ifndef USE_AS_STPCPY # define STRCPY strcpy #endif @@ -51,7 +53,9 @@ ENTRY (BP_SYM (STRCPY)) /* Now the sources is aligned. Unfortunatly we cannot force to have both source and destination aligned, so ignore the alignment of the destination. */ - .p2align 4 + + /* Next 3 insns are 10 bytes total, make sure we decode them in one go */ + .p2align 4,,10 1: /* 1st unroll. */ movq (%rsi), %rax /* Read double word (8 bytes). */ @@ -128,7 +132,9 @@ ENTRY (BP_SYM (STRCPY)) /* Do the last few bytes. %rax contains the value to write. The loop is unrolled twice. */ - .p2align 4 + + /* Next 3 insns are 6 bytes total, make sure we decode them in one go */ + .p2align 3,,6 3: /* Note that stpcpy needs to return with the value of the NUL byte. */ diff --git a/libc/string/x86_64/strcspn.S b/libc/string/x86_64/strcspn.S index fed12b5f6..fd9b09c48 100644 --- a/libc/string/x86_64/strcspn.S +++ b/libc/string/x86_64/strcspn.S @@ -25,6 +25,8 @@ #include "_glibc_inc.h" +/* Seems to be unrolled too much */ + /* BEWARE: `#ifdef strcspn' means that strcspn is redefined as `strpbrk' */ #define STRPBRK_P (defined strcspn) @@ -53,26 +55,28 @@ ENTRY (strcspn) Although all the following instruction only modify %cl we always have a correct zero-extended 64-bit value in %rcx. */ - .p2align 4 + /* Next 3 insns are 6 bytes total, make sure we decode them in one go */ + .p2align 3,,6 + L(2): movb (%rax), %cl /* get byte from skipset */ testb %cl, %cl /* is NUL char? */ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ movb 1(%rax), %cl /* get byte from skipset */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ movb 2(%rax), %cl /* get byte from skipset */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ movb 3(%rax), %cl /* get byte from skipset */ addq $4, %rax /* increment skipset pointer */ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jnz L(2) /* no => process next dword from skipset */ L(1): leaq -4(%rdx), %rax /* prepare loop */ @@ -86,7 +90,13 @@ L(1): leaq -4(%rdx), %rax /* prepare loop */ value in the table. But the value of NUL is NUL so the loop terminates for NUL in every case. */ - .p2align 4 + /* Next 3 insns are 9 bytes total. */ + /* .p2align 4,,9 would make sure we decode them in one go, */ + /* but it will also align entire function to 16 bytes, */ + /* potentially creating largish padding at link time. */ + /* We are aligning to 8 bytes instead: */ + .p2align 3,,8 + L(3): addq $4, %rax /* adjust pointer for full loop round */ movb (%rax), %cl /* get byte from string */ diff --git a/libc/string/x86_64/strlen.S b/libc/string/x86_64/strlen.S index 0441dc46c..4213f0ab6 100644 --- a/libc/string/x86_64/strlen.S +++ b/libc/string/x86_64/strlen.S @@ -20,6 +20,7 @@ #include "_glibc_inc.h" +/* Seems to be unrolled too much */ .text ENTRY (strlen) @@ -39,8 +40,11 @@ ENTRY (strlen) 1: movq $0xfefefefefefefeff,%r8 /* Save magic. */ - .p2align 4 /* Align loop. */ -4: /* Main Loop is unrolled 4 times. */ + /* Align loop. */ + /* Next 3 insns are 10 bytes total, make sure we decode them in one go */ + .p2align 4,,10 +4: + /* Main Loop is unrolled 4 times. */ /* First unroll. */ movq (%rax), %rcx /* get double word (= 8 bytes) in question */ addq $8,%rax /* adjust pointer for next word */ @@ -97,8 +101,11 @@ ENTRY (strlen) the addition will not result in 0. */ jz 4b /* no NUL found => continue loop */ - .p2align 4 /* Align, it's a jump target. */ -3: subq $8,%rax /* correct pointer increment. */ + /* Align, it is a jump target. */ + /* Next 3 insns are 8 bytes total, make sure we decode them in one go */ + .p2align 3,,8 +3: + subq $8,%rax /* correct pointer increment. */ testb %cl, %cl /* is first byte NUL? */ jz 2f /* yes => return */ diff --git a/libc/string/x86_64/strspn.S b/libc/string/x86_64/strspn.S index c126abd2e..41cff0490 100644 --- a/libc/string/x86_64/strspn.S +++ b/libc/string/x86_64/strspn.S @@ -50,26 +50,28 @@ ENTRY (strspn) Although all the following instruction only modify %cl we always have a correct zero-extended 64-bit value in %rcx. */ - .p2align 4 -L(2): movb (%rax), %cl /* get byte from stopset */ + /* Next 3 insns are 6 bytes total, make sure we decode them in one go */ + .p2align 3,,6 +L(2): + movb (%rax), %cl /* get byte from stopset */ testb %cl, %cl /* is NUL char? */ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ movb 1(%rax), %cl /* get byte from stopset */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ movb 2(%rax), %cl /* get byte from stopset */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ movb 3(%rax), %cl /* get byte from stopset */ addq $4, %rax /* increment stopset pointer */ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jnz L(2) /* no => process next dword from stopset */ L(1): leaq -4(%rdx), %rax /* prepare loop */ @@ -83,8 +85,14 @@ L(1): leaq -4(%rdx), %rax /* prepare loop */ value in the table. But the value of NUL is NUL so the loop terminates for NUL in every case. */ - .p2align 4 -L(3): addq $4, %rax /* adjust pointer for full loop round */ + /* Next 3 insns are 9 bytes total. */ + /* .p2align 4,,9 would make sure we decode them in one go, */ + /* but it will also align entire function to 16 bytes, */ + /* potentially creating largish padding at link time. */ + /* We are aligning to 8 bytes instead: */ + .p2align 3,,8 +L(3): + addq $4, %rax /* adjust pointer for full loop round */ movb (%rax), %cl /* get byte from string */ testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ |
