summary | refs | log | tree | commit | diff | stats
path: root/libc/string/x86_64
diff options
context:
space:
mode:
Diffstat (limited to 'libc/string/x86_64')
-rw-r--r--  libc/string/x86_64/_glibc_inc.h  |  7
-rw-r--r--  libc/string/x86_64/memcpy.S      |  4
-rw-r--r--  libc/string/x86_64/memset.S      | 30
-rw-r--r--  libc/string/x86_64/strcat.S      | 20
-rw-r--r--  libc/string/x86_64/strchr.S      | 13
-rw-r--r--  libc/string/x86_64/strcpy.S      | 10
-rw-r--r--  libc/string/x86_64/strcspn.S     | 20
-rw-r--r--  libc/string/x86_64/strlen.S      | 15
-rw-r--r--  libc/string/x86_64/strspn.S      | 22
9 files changed, 93 insertions, 48 deletions
diff --git a/libc/string/x86_64/_glibc_inc.h b/libc/string/x86_64/_glibc_inc.h
index 88cef2ea3..415ce90a7 100644
--- a/libc/string/x86_64/_glibc_inc.h
+++ b/libc/string/x86_64/_glibc_inc.h
@@ -6,15 +6,8 @@
#include <features.h>
#include <bits/wordsize.h>
-#if __WORDSIZE == 32
-# define ENTRY_ALIGN 4
-#else
-# define ENTRY_ALIGN 2
-#endif
-
#define ENTRY(sym) \
.global sym; \
- .align ENTRY_ALIGN; \
.type sym,%function; \
sym:
diff --git a/libc/string/x86_64/memcpy.S b/libc/string/x86_64/memcpy.S
index b3bb0f96c..697b992d0 100644
--- a/libc/string/x86_64/memcpy.S
+++ b/libc/string/x86_64/memcpy.S
@@ -59,9 +59,9 @@ ENTRY (BP_SYM (memcpy))
subq $32, %rcx
js 2f
- .p2align 4
+ /* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+ .p2align 4,,11
3:
-
/* Now correct the loop counter. Please note that in the following
code the flags are not changed anymore. */
subq $32, %rcx
diff --git a/libc/string/x86_64/memset.S b/libc/string/x86_64/memset.S
index d72d74468..46751006b 100644
--- a/libc/string/x86_64/memset.S
+++ b/libc/string/x86_64/memset.S
@@ -53,15 +53,17 @@ ENTRY (memset)
imul %rax,%r8
#endif
test $0x7,%edi /* Check for alignment. */
- je 2f
+ jz 2f
- .p2align 4
-1: /* Align ptr to 8 byte. */
+ /* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+ .p2align 4,,9
+1:
+ /* Align ptr to 8 byte. */
mov %sil,(%rcx)
dec %rdx
inc %rcx
- test $0x7,%ecx
- jne 1b
+ test $0x7,%cl
+ jnz 1b
2: /* Check for really large regions. */
mov %rdx,%rax
@@ -70,8 +72,10 @@ ENTRY (memset)
cmp LARGE, %rdx
jae 11f
- .p2align 4
-3: /* Copy 64 bytes. */
+ /* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+ .p2align 4,,11
+3:
+ /* Fill 64 bytes. */
mov %r8,(%rcx)
mov %r8,0x8(%rcx)
mov %r8,0x10(%rcx)
@@ -84,7 +88,7 @@ ENTRY (memset)
dec %rax
jne 3b
-4: /* Copy final bytes. */
+4: /* Fill final bytes. */
and $0x3f,%edx
mov %rdx,%rax
shr $0x3,%rax
@@ -107,16 +111,18 @@ ENTRY (memset)
jne 8b
9:
#if BZERO_P
- nop
+ /* nothing */
#else
/* Load result (only if used as memset). */
mov %rdi,%rax /* start address of destination is result */
#endif
retq
- .p2align 4
-11: /* Copy 64 bytes without polluting the cache. */
- /* We could use movntdq %xmm0,(%rcx) here to further
+ /* Next 3 insns are 14 bytes total, make sure we decode them in one go */
+ .p2align 4,,14
+11:
+ /* Fill 64 bytes without polluting the cache. */
+ /* We could use movntdq %xmm0,(%rcx) here to further
speed up for large cases but let's not use XMM registers. */
movnti %r8,(%rcx)
movnti %r8,0x8(%rcx)
diff --git a/libc/string/x86_64/strcat.S b/libc/string/x86_64/strcat.S
index 9b0068981..23d068fea 100644
--- a/libc/string/x86_64/strcat.S
+++ b/libc/string/x86_64/strcat.S
@@ -21,6 +21,7 @@
#include "_glibc_inc.h"
+/* Seems to be unrolled too much */
.text
ENTRY (BP_SYM (strcat))
@@ -44,7 +45,9 @@ ENTRY (BP_SYM (strcat))
/* Now the source is aligned. Scan for NUL byte. */
- .p2align 4
+
+ /* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+ .p2align 4,,10
4:
/* First unroll. */
movq (%rax), %rcx /* get double word (= 8 bytes) in question */
@@ -102,8 +105,11 @@ ENTRY (BP_SYM (strcat))
the addition will not result in 0. */
jz 4b /* no NUL found => continue loop */
- .p2align 4 /* Align, it's a jump target. */
-3: subq $8,%rax /* correct pointer increment. */
+ /* Align, it is a jump target. */
+ /* Next 3 insns are 8 bytes total, make sure we decode them in one go */
+ .p2align 3,,8
+3:
+ subq $8,%rax /* correct pointer increment. */
testb %cl, %cl /* is first byte NUL? */
jz 2f /* yes => return */
@@ -159,7 +165,9 @@ ENTRY (BP_SYM (strcat))
/* Now the sources is aligned. Unfortunatly we cannot force
to have both source and destination aligned, so ignore the
alignment of the destination. */
- .p2align 4
+
+ /* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+ .p2align 4,,10
22:
/* 1st unroll. */
movq (%rsi), %rax /* Read double word (8 bytes). */
@@ -236,7 +244,9 @@ ENTRY (BP_SYM (strcat))
/* Do the last few bytes. %rax contains the value to write.
The loop is unrolled twice. */
- .p2align 4
+
+ /* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+ .p2align 3,,6
23:
movb %al, (%rdx) /* 1st byte. */
testb %al, %al /* Is it NUL. */
diff --git a/libc/string/x86_64/strchr.S b/libc/string/x86_64/strchr.S
index 8e59c4c19..9ef46b7f2 100644
--- a/libc/string/x86_64/strchr.S
+++ b/libc/string/x86_64/strchr.S
@@ -20,6 +20,7 @@
#include "_glibc_inc.h"
+/* Seems to be unrolled too much */
.text
ENTRY (BP_SYM (strchr))
@@ -91,7 +92,8 @@ ENTRY (BP_SYM (strchr))
each of whose bytes is C. This turns each byte that is C
into a zero. */
- .p2align 4
+ /* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+ .p2align 4,,10
4:
/* Main Loop is unrolled 4 times. */
/* First unroll. */
@@ -229,8 +231,11 @@ ENTRY (BP_SYM (strchr))
reversed. */
- .p2align 4 /* Align, it's a jump target. */
-3: movq %r9,%rdx /* move to %rdx so that we can access bytes */
+ /* Align, it's a jump target. */
+ /* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+ .p2align 4,,9
+3:
+ movq %r9,%rdx /* move to %rdx so that we can access bytes */
subq $8,%rax /* correct pointer increment. */
testb %cl, %cl /* is first byte C? */
jz 6f /* yes => return pointer */
@@ -280,7 +285,7 @@ ENTRY (BP_SYM (strchr))
incq %rax
6:
- nop
+ /* nop - huh?? */
retq
END (BP_SYM (strchr))
diff --git a/libc/string/x86_64/strcpy.S b/libc/string/x86_64/strcpy.S
index d9a51b0bb..612a30d1a 100644
--- a/libc/string/x86_64/strcpy.S
+++ b/libc/string/x86_64/strcpy.S
@@ -20,6 +20,8 @@
#include "_glibc_inc.h"
+/* Seems to be unrolled too much */
+
#ifndef USE_AS_STPCPY
# define STRCPY strcpy
#endif
@@ -51,7 +53,9 @@ ENTRY (BP_SYM (STRCPY))
/* Now the sources is aligned. Unfortunatly we cannot force
to have both source and destination aligned, so ignore the
alignment of the destination. */
- .p2align 4
+
+ /* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+ .p2align 4,,10
1:
/* 1st unroll. */
movq (%rsi), %rax /* Read double word (8 bytes). */
@@ -128,7 +132,9 @@ ENTRY (BP_SYM (STRCPY))
/* Do the last few bytes. %rax contains the value to write.
The loop is unrolled twice. */
- .p2align 4
+
+ /* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+ .p2align 3,,6
3:
/* Note that stpcpy needs to return with the value of the NUL
byte. */
diff --git a/libc/string/x86_64/strcspn.S b/libc/string/x86_64/strcspn.S
index fed12b5f6..fd9b09c48 100644
--- a/libc/string/x86_64/strcspn.S
+++ b/libc/string/x86_64/strcspn.S
@@ -25,6 +25,8 @@
#include "_glibc_inc.h"
+/* Seems to be unrolled too much */
+
/* BEWARE: `#ifdef strcspn' means that strcspn is redefined as `strpbrk' */
#define STRPBRK_P (defined strcspn)
@@ -53,26 +55,28 @@ ENTRY (strcspn)
Although all the following instruction only modify %cl we always
have a correct zero-extended 64-bit value in %rcx. */
- .p2align 4
+ /* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+ .p2align 3,,6
+
L(2): movb (%rax), %cl /* get byte from skipset */
testb %cl, %cl /* is NUL char? */
jz L(1) /* yes => start compare loop */
movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
movb 1(%rax), %cl /* get byte from skipset */
- testb $0xff, %cl /* is NUL char? */
+ testb %cl, %cl /* is NUL char? */
jz L(1) /* yes => start compare loop */
movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
movb 2(%rax), %cl /* get byte from skipset */
- testb $0xff, %cl /* is NUL char? */
+ testb %cl, %cl /* is NUL char? */
jz L(1) /* yes => start compare loop */
movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
movb 3(%rax), %cl /* get byte from skipset */
addq $4, %rax /* increment skipset pointer */
movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
- testb $0xff, %cl /* is NUL char? */
+ testb %cl, %cl /* is NUL char? */
jnz L(2) /* no => process next dword from skipset */
L(1): leaq -4(%rdx), %rax /* prepare loop */
@@ -86,7 +90,13 @@ L(1): leaq -4(%rdx), %rax /* prepare loop */
value in the table. But the value of NUL is NUL so the loop
terminates for NUL in every case. */
- .p2align 4
+ /* Next 3 insns are 9 bytes total. */
+ /* .p2align 4,,9 would make sure we decode them in one go, */
+ /* but it will also align entire function to 16 bytes, */
+ /* potentially creating largish padding at link time. */
+ /* We are aligning to 8 bytes instead: */
+ .p2align 3,,8
+
L(3): addq $4, %rax /* adjust pointer for full loop round */
movb (%rax), %cl /* get byte from string */
diff --git a/libc/string/x86_64/strlen.S b/libc/string/x86_64/strlen.S
index 0441dc46c..4213f0ab6 100644
--- a/libc/string/x86_64/strlen.S
+++ b/libc/string/x86_64/strlen.S
@@ -20,6 +20,7 @@
#include "_glibc_inc.h"
+/* Seems to be unrolled too much */
.text
ENTRY (strlen)
@@ -39,8 +40,11 @@ ENTRY (strlen)
1: movq $0xfefefefefefefeff,%r8 /* Save magic. */
- .p2align 4 /* Align loop. */
-4: /* Main Loop is unrolled 4 times. */
+ /* Align loop. */
+ /* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+ .p2align 4,,10
+4:
+ /* Main Loop is unrolled 4 times. */
/* First unroll. */
movq (%rax), %rcx /* get double word (= 8 bytes) in question */
addq $8,%rax /* adjust pointer for next word */
@@ -97,8 +101,11 @@ ENTRY (strlen)
the addition will not result in 0. */
jz 4b /* no NUL found => continue loop */
- .p2align 4 /* Align, it's a jump target. */
-3: subq $8,%rax /* correct pointer increment. */
+ /* Align, it is a jump target. */
+ /* Next 3 insns are 8 bytes total, make sure we decode them in one go */
+ .p2align 3,,8
+3:
+ subq $8,%rax /* correct pointer increment. */
testb %cl, %cl /* is first byte NUL? */
jz 2f /* yes => return */
diff --git a/libc/string/x86_64/strspn.S b/libc/string/x86_64/strspn.S
index c126abd2e..41cff0490 100644
--- a/libc/string/x86_64/strspn.S
+++ b/libc/string/x86_64/strspn.S
@@ -50,26 +50,28 @@ ENTRY (strspn)
Although all the following instruction only modify %cl we always
have a correct zero-extended 64-bit value in %rcx. */
- .p2align 4
-L(2): movb (%rax), %cl /* get byte from stopset */
+ /* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+ .p2align 3,,6
+L(2):
+ movb (%rax), %cl /* get byte from stopset */
testb %cl, %cl /* is NUL char? */
jz L(1) /* yes => start compare loop */
movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
movb 1(%rax), %cl /* get byte from stopset */
- testb $0xff, %cl /* is NUL char? */
+ testb %cl, %cl /* is NUL char? */
jz L(1) /* yes => start compare loop */
movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
movb 2(%rax), %cl /* get byte from stopset */
- testb $0xff, %cl /* is NUL char? */
+ testb %cl, %cl /* is NUL char? */
jz L(1) /* yes => start compare loop */
movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
movb 3(%rax), %cl /* get byte from stopset */
addq $4, %rax /* increment stopset pointer */
movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
- testb $0xff, %cl /* is NUL char? */
+ testb %cl, %cl /* is NUL char? */
jnz L(2) /* no => process next dword from stopset */
L(1): leaq -4(%rdx), %rax /* prepare loop */
@@ -83,8 +85,14 @@ L(1): leaq -4(%rdx), %rax /* prepare loop */
value in the table. But the value of NUL is NUL so the loop
terminates for NUL in every case. */
- .p2align 4
-L(3): addq $4, %rax /* adjust pointer for full loop round */
+ /* Next 3 insns are 9 bytes total. */
+ /* .p2align 4,,9 would make sure we decode them in one go, */
+ /* but it will also align entire function to 16 bytes, */
+ /* potentially creating largish padding at link time. */
+ /* We are aligning to 8 bytes instead: */
+ .p2align 3,,8
+L(3):
+ addq $4, %rax /* adjust pointer for full loop round */
movb (%rax), %cl /* get byte from string */
testb %cl, (%rsp,%rcx) /* is it contained in skipset? */