-rw-r--r--  libc/string/arm/_memcpy.S | 183
-rw-r--r--  libc/string/arm/bcopy.S | 12
-rw-r--r--  libc/string/arm/bzero.S | 12
-rw-r--r--  libc/string/arm/memcmp.S | 28
-rw-r--r--  libc/string/arm/memcpy.S | 11
-rw-r--r--  libc/string/arm/memmove.S | 11
-rw-r--r--  libc/string/arm/memset.S | 62
-rw-r--r--  libc/string/arm/strcmp.S | 19
-rw-r--r--  libc/string/arm/strlen.S | 25
-rw-r--r--  libc/string/arm/strncmp.S | 33
-rw-r--r--  libc/string/avr32/Makefile | 4
-rw-r--r--  libc/string/avr32/bcopy.S | 20
-rw-r--r--  libc/string/avr32/bzero.S | 16
-rw-r--r--  libc/string/avr32/memcmp.S | 68
-rw-r--r--  libc/string/avr32/memcpy.S | 156
-rw-r--r--  libc/string/avr32/memmove.S | 172
-rw-r--r--  libc/string/avr32/memset.S | 76
-rw-r--r--  libc/string/avr32/strcmp.S | 124
-rw-r--r--  libc/string/avr32/strlen.S | 90
-rw-r--r--  libc/string/bfin/memchr.S | 25
-rw-r--r--  libc/string/bfin/memcmp.S | 95
-rw-r--r--  libc/string/bfin/memcpy.S | 65
-rw-r--r--  libc/string/bfin/memmove.S | 87
-rw-r--r--  libc/string/bfin/memset.S | 78
-rw-r--r--  libc/string/bfin/strcmp.S | 39
-rw-r--r--  libc/string/cris/memcpy.c | 4
-rw-r--r--  libc/string/cris/memmove.c | 2
-rw-r--r--  libc/string/cris/memset.c | 4
-rw-r--r--  libc/string/cris/strcpy.c | 2
-rw-r--r--  libc/string/cris/strncpy.c | 4
-rw-r--r--  libc/string/frv/memset.S | 2
-rw-r--r--  libc/string/i386/memchr.c | 2
-rw-r--r--  libc/string/i386/memcpy.c | 2
-rw-r--r--  libc/string/i386/memmove.c | 2
-rw-r--r--  libc/string/i386/memset.c | 2
-rw-r--r--  libc/string/i386/strcat.c | 2
-rw-r--r--  libc/string/i386/strchr.c | 2
-rw-r--r--  libc/string/i386/strcmp.c | 4
-rw-r--r--  libc/string/i386/strcpy.c | 2
-rw-r--r--  libc/string/i386/strlen.c | 2
-rw-r--r--  libc/string/i386/strncat.c | 2
-rw-r--r--  libc/string/i386/strncmp.c | 2
-rw-r--r--  libc/string/i386/strncpy.c | 2
-rw-r--r--  libc/string/i386/strnlen.c | 2
-rw-r--r--  libc/string/i386/strrchr.c | 2
-rw-r--r--  libc/string/ia64/memcpy.S | 4
-rw-r--r--  libc/string/ia64/memmove.S | 2
-rw-r--r--  libc/string/ia64/sysdep.h | 2
-rw-r--r--  libc/string/powerpc/memcpy.c | 2
-rw-r--r--  libc/string/powerpc/memmove.c | 4
-rw-r--r--  libc/string/powerpc/memset.c | 4
-rw-r--r--  libc/string/sparc/_glibc_inc.h | 2
-rw-r--r--  libc/string/sparc/sparc32/sparcv9b/memchr.S | 2
-rw-r--r--  libc/string/sparc/sparc32/sparcv9b/memcpy.S | 2
-rw-r--r--  libc/string/sparc/sparc32/sparcv9b/memset.S | 2
-rw-r--r--  libc/string/sparc/sparc32/sparcv9b/stpcpy.S | 2
-rw-r--r--  libc/string/sparc/sparc32/sparcv9b/strcat.S | 2
-rw-r--r--  libc/string/sparc/sparc32/sparcv9b/strchr.S | 2
-rw-r--r--  libc/string/sparc/sparc32/sparcv9b/strcmp.S | 2
-rw-r--r--  libc/string/sparc/sparc32/sparcv9b/strcpy.S | 2
-rw-r--r--  libc/string/sparc/sparc32/sparcv9b/strlen.S | 2
-rw-r--r--  libc/string/x86_64/_glibc_inc.h | 7
-rw-r--r--  libc/string/x86_64/memcpy.S | 4
-rw-r--r--  libc/string/x86_64/memset.S | 30
-rw-r--r--  libc/string/x86_64/strcat.S | 20
-rw-r--r--  libc/string/x86_64/strchr.S | 13
-rw-r--r--  libc/string/x86_64/strcpy.S | 10
-rw-r--r--  libc/string/x86_64/strcspn.S | 20
-rw-r--r--  libc/string/x86_64/strlen.S | 15
-rw-r--r--  libc/string/x86_64/strspn.S | 22
70 files changed, 1080 insertions, 658 deletions
diff --git a/libc/string/arm/_memcpy.S b/libc/string/arm/_memcpy.S
index 3704f96b5..103580a0c 100644
--- a/libc/string/arm/_memcpy.S
+++ b/libc/string/arm/_memcpy.S
@@ -39,7 +39,9 @@
#include <features.h>
#include <endian.h>
+#include <bits/arm_asm.h>
+#if !defined(THUMB1_ONLY)
/*
* This is one fun bit of code ...
* Some easy listening music is suggested while trying to understand this
@@ -77,12 +79,36 @@
.type _memcpy,%function
.align 4
+/* XXX: The Thumb-2 conditionals can be removed if/when we require an
+ assembler that supports unified syntax. */
+.macro copy regs
+#if defined(__thumb2__)
+ ittt ge
+ ldmiage r1!, \regs
+ stmiage r0!, \regs
+#else
+ ldmgeia r1!, \regs
+ stmgeia r0!, \regs
+#endif
+.endm
+
+.macro copydb regs
+#if defined(__thumb2__)
+ ittt ge
+ ldmdbge r1!, \regs
+ stmdbge r0!, \regs
+#else
+ ldmgedb r1!, \regs
+ stmgedb r0!, \regs
+#endif
+.endm
+
_memcpy:
/* Determine copy direction */
cmp r1, r0
bcc .Lmemcpy_backwards
- moveq r0, #0 /* Quick abort for len=0 */
+ IT(tt, eq) /* Quick abort for src=dst */
#if defined(__USE_BX__)
bxeq lr
#else
@@ -102,7 +128,7 @@ _memcpy:
blt .Lmemcpy_fl12 /* less than 12 bytes (4 from above) */
subs r2, r2, #0x14
blt .Lmemcpy_fl32 /* less than 32 bytes (12 from above) */
- stmdb sp!, {r4} /* borrow r4 */
+ str r4, [sp, #-4]! /* borrow r4 */
/* blat 32 bytes at a time */
/* XXX for really big copies perhaps we should use more registers */
@@ -115,19 +141,22 @@ _memcpy:
bge .Lmemcpy_floop32
cmn r2, #0x10
- ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
- stmgeia r0!, {r3, r4, r12, lr}
+ /* blat a remaining 16 bytes */
+ copy "{r3, r4, r12, lr}"
subge r2, r2, #0x10
- ldmia sp!, {r4} /* return r4 */
+ ldr r4, [sp], #4 /* restore r4 */
.Lmemcpy_fl32:
adds r2, r2, #0x14
/* blat 12 bytes at a time */
.Lmemcpy_floop12:
- ldmgeia r1!, {r3, r12, lr}
- stmgeia r0!, {r3, r12, lr}
+ copy "{r3, r12, lr}"
+#if defined(__thumb2__)
+ subsge r2, r2, #0x0c
+#else
subges r2, r2, #0x0c
+#endif
bge .Lmemcpy_floop12
.Lmemcpy_fl12:
@@ -135,26 +164,48 @@ _memcpy:
blt .Lmemcpy_fl4
subs r2, r2, #4
+ IT(tt, lt)
ldrlt r3, [r1], #4
strlt r3, [r0], #4
- ldmgeia r1!, {r3, r12}
- stmgeia r0!, {r3, r12}
+ copy "{r3, r12}"
subge r2, r2, #4
.Lmemcpy_fl4:
/* less than 4 bytes to go */
adds r2, r2, #4
+#if defined(__thumb2__)
+ it eq
+ popeq {r0, pc} /* done */
+#elif defined(__ARM_ARCH_4T__)
+ ldmeqia sp!, {r0, r3} /* done */
+ bxeq r3
+#else
ldmeqia sp!, {r0, pc} /* done */
+#endif
/* copy the crud byte at a time */
cmp r2, #2
ldrb r3, [r1], #1
strb r3, [r0], #1
+#if defined(__thumb2__)
+ itt ge
+ ldrbge r3, [r1], #1
+ strbge r3, [r0], #1
+ itt gt
+ ldrbgt r3, [r1], #1
+ strbgt r3, [r0], #1
+#else
ldrgeb r3, [r1], #1
strgeb r3, [r0], #1
ldrgtb r3, [r1], #1
strgtb r3, [r0], #1
+#endif
+#if defined(__ARM_ARCH_4T__)
+ ldmia sp!, {r0, r3}
+ bx r3
+#else
ldmia sp!, {r0, pc}
+#endif
/* erg - unaligned destination */
.Lmemcpy_fdestul:
@@ -164,10 +215,19 @@ _memcpy:
/* align destination with byte copies */
ldrb r3, [r1], #1
strb r3, [r0], #1
+#if defined(__thumb2__)
+ itt ge
+ ldrbge r3, [r1], #1
+ strbge r3, [r0], #1
+ itt gt
+ ldrbgt r3, [r1], #1
+ strbgt r3, [r0], #1
+#else
ldrgeb r3, [r1], #1
strgeb r3, [r0], #1
ldrgtb r3, [r1], #1
strgtb r3, [r0], #1
+#endif
subs r2, r2, r12
blt .Lmemcpy_fl4 /* less the 4 bytes */
@@ -370,12 +430,12 @@ _memcpy:
.Lmemcpy_bl32:
cmn r2, #0x10
- ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
- stmgedb r0!, {r3, r4, r12, lr}
+ /* blat a remaining 16 bytes */
+ copydb "{r3, r4, r12, lr}"
subge r2, r2, #0x10
adds r2, r2, #0x14
- ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
- stmgedb r0!, {r3, r12, lr}
+ /* blat a remaining 12 bytes */
+ copydb "{r3, r12, lr}"
subge r2, r2, #0x0c
ldmia sp!, {r4, lr}
@@ -383,15 +443,16 @@ _memcpy:
adds r2, r2, #8
blt .Lmemcpy_bl4
subs r2, r2, #4
+ IT(tt, lt)
ldrlt r3, [r1, #-4]!
strlt r3, [r0, #-4]!
- ldmgedb r1!, {r3, r12}
- stmgedb r0!, {r3, r12}
+ copydb "{r3, r12}"
subge r2, r2, #4
.Lmemcpy_bl4:
/* less than 4 bytes to go */
adds r2, r2, #4
+ IT(t, eq)
#if defined(__USE_BX__)
bxeq lr
#else
@@ -401,10 +462,19 @@ _memcpy:
cmp r2, #2
ldrb r3, [r1, #-1]!
strb r3, [r0, #-1]!
+#ifdef __thumb2__
+ itt ge
+ ldrbge r3, [r1, #-1]!
+ strbge r3, [r0, #-1]!
+ itt gt
+ ldrbgt r3, [r1, #-1]!
+ strbgt r3, [r0, #-1]!
+#else
ldrgeb r3, [r1, #-1]!
strgeb r3, [r0, #-1]!
ldrgtb r3, [r1, #-1]!
strgtb r3, [r0, #-1]!
+#endif
#if defined(__USE_BX__)
bx lr
#else
@@ -417,10 +487,19 @@ _memcpy:
/* align destination with byte copies */
ldrb r3, [r1, #-1]!
strb r3, [r0, #-1]!
+#ifdef __thumb2__
+ itt ge
+ ldrbge r3, [r1, #-1]!
+ strbge r3, [r0, #-1]!
+ itt gt
+ ldrbgt r3, [r1, #-1]!
+ strbgt r3, [r0, #-1]!
+#else
ldrgeb r3, [r1, #-1]!
strgeb r3, [r0, #-1]!
ldrgtb r3, [r1, #-1]!
strgtb r3, [r0, #-1]!
+#endif
subs r2, r2, r12
blt .Lmemcpy_bl4 /* less than 4 bytes to go */
ands r12, r1, #3
@@ -591,3 +670,77 @@ _memcpy:
.Lmemcpy_bsrcul1l4:
add r1, r1, #1
b .Lmemcpy_bl4
+
+#else /* THUMB1_ONLY */
+
+/* This is a fairly dumb implementation for when we can't use the 32-bit code
+ above. */
+.text
+.global _memcpy
+.hidden _memcpy
+.type _memcpy,%function
+.align 4
+.thumb
+_memcpy:
+ push {r0, r4}
+ cmp r2, #0
+ beq .Lmemcpy_exit
+ @ See if we have overlapping regions, and need to reverse the
+ @ direction of the copy
+ cmp r0, r1
+ bls .Lmemcpy_forwards
+ add r4, r1, r2
+ cmp r0, r4
+ bcc .Lmemcpy_backwards
+.Lmemcpy_forwards:
+ /* Forwards. */
+ mov r3, r0
+ eor r3, r1
+ mov r4, #3
+ tst r3, r4
+ bne .Lmemcpy_funaligned
+ cmp r2, #8
+ bcc .Lmemcpy_funaligned
+1: @ copy up to the first word boundary.
+ tst r0, r4
+ beq 1f
+ ldrb r3, [r1]
+ add r1, r1, #1
+ strb r3, [r0]
+ add r0, r0, #1
+ sub r2, r2, #1
+ b 1b
+1: @ Copy aligned words
+ ldr r3, [r1]
+ add r1, r1, #4
+ str r3, [r0]
+ add r0, r0, #4
+ sub r2, r2, #4
+ cmp r2, #4
+ bcs 1b
+ cmp r2, #0
+ beq .Lmemcpy_exit
+.Lmemcpy_funaligned:
+1:
+ ldrb r3, [r1]
+ add r1, r1, #1
+ strb r3, [r0]
+ add r0, r0, #1
+ sub r2, r2, #1
+ bne 1b
+.Lmemcpy_exit:
+ pop {r0, r4}
+ bx lr
+
+.Lmemcpy_backwards:
+ add r0, r0, r2
+ add r1, r1, r2
+1:
+ sub r0, r0, #1
+ sub r1, r1, #1
+ ldrb r3, [r1]
+ strb r3, [r0]
+ sub r2, r2, #1
+ bne 1b
+ b .Lmemcpy_exit
+#endif
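
/* Note: the THUMB1_ONLY fallback above chooses its copy direction the way a
   portable memmove would: forwards unless the destination starts inside the
   source buffer (r0 > r1 && r0 < r1 + r2). A minimal C sketch of that control
   flow follows; the helper name is hypothetical, not part of the patch, and
   the sketch keeps only the direction logic (the asm additionally switches to
   word copies once both pointers share alignment). */

#include <stddef.h>

static void *copy_like_thumb1(void *dstv, const void *srcv, size_t n)
{
    unsigned char *dst = dstv;
    const unsigned char *src = srcv;

    if (dst <= src || dst >= src + n) {
        while (n--)              /* forwards */
            *dst++ = *src++;
    } else {
        dst += n;                /* overlap: copy backwards from the ends */
        src += n;
        while (n--)
            *--dst = *--src;
    }
    return dstv;
}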
diff --git a/libc/string/arm/bcopy.S b/libc/string/arm/bcopy.S
index db3c9e6c1..2d6e90d13 100644
--- a/libc/string/arm/bcopy.S
+++ b/libc/string/arm/bcopy.S
@@ -40,6 +40,7 @@
/* bcopy = memcpy/memmove with arguments reversed. */
#include <features.h>
+#include <bits/arm_asm.h>
#ifdef __UCLIBC_SUSV3_LEGACY__
@@ -48,12 +49,23 @@
.type bcopy,%function
.align 4
+#if defined(__thumb__) && !defined(__thumb2__)
+.thumb_func
+bcopy:
+ push {r2, lr}
+ mov ip, r0
+ mov r0, r1
+ mov r1, ip
+ bl _memcpy
+ POP_RET
+#else
bcopy:
/* switch the source and destination registers */
eor r0, r1, r0
eor r1, r0, r1
eor r0, r1, r0
b _memcpy /* (PLT) */
+#endif
.size bcopy,.-bcopy
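
/* In the ARM (non-Thumb-1) path, bcopy swaps its first two arguments in place
   with three eor instructions so it can tail-call _memcpy without a scratch
   register. The same exclusive-or swap in C, as a sketch: it is safe only for
   two distinct objects, since swapping a value with itself zeroes it. */

static void xor_swap(unsigned long *a, unsigned long *b)
{
    *a ^= *b;
    *b ^= *a;   /* *b now holds the original *a */
    *a ^= *b;   /* *a now holds the original *b */
}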
diff --git a/libc/string/arm/bzero.S b/libc/string/arm/bzero.S
index ee49cf560..e576a12e9 100644
--- a/libc/string/arm/bzero.S
+++ b/libc/string/arm/bzero.S
@@ -38,6 +38,7 @@
*/
#include <features.h>
+#include <bits/arm_asm.h>
#ifdef __UCLIBC_SUSV3_LEGACY__
@@ -46,10 +47,21 @@
.type bzero,%function
.align 4
+#if defined(__thumb__) && !defined(__thumb2__)
+.thumb_func
+bzero:
+ push {r2, lr}
+ mov r2, r1
+ mov r1, #0
+ bl HIDDEN_JUMPTARGET(memset)
+ POP_RET
+#else
+
bzero:
mov r2, r1
mov r1, #0
b HIDDEN_JUMPTARGET(memset)
+#endif
.size bzero,.-bzero
diff --git a/libc/string/arm/memcmp.S b/libc/string/arm/memcmp.S
index 4f78b5128..65409f43a 100644
--- a/libc/string/arm/memcmp.S
+++ b/libc/string/arm/memcmp.S
@@ -30,15 +30,41 @@
*/
#include <features.h>
+#include <bits/arm_asm.h>
.text
.global memcmp
.type memcmp,%function
.align 4
+#if defined(THUMB1_ONLY)
+.thumb_func
+memcmp:
+ cmp r2, #0
+ bne 1f
+ mov r0, #0
+ bx lr
+1:
+ push {r4}
+ add r4, r0, r2
+2:
+ ldrb r2, [r0]
+ add r0, r0, #1
+ ldrb r3, [r1]
+ add r1, r1, #1
+ cmp r4, r0
+ beq 3f
+ cmp r2, r3
+ beq 2b
+3:
+ sub r0, r2, r3
+ pop {r4}
+ bx lr
+#else
memcmp:
/* if ((len - 1) < 0) return 0 */
subs r2, r2, #1
+ IT(tt, mi)
movmi r0, #0
#if defined(__USE_BX__)
bxmi lr
@@ -51,6 +77,7 @@ memcmp:
ldrb r2, [r0], #1
ldrb r3, [r1], #1
cmp ip, r0
+ IT(t, cs)
cmpcs r2, r3
beq 1b
sub r0, r2, r3
@@ -59,6 +86,7 @@ memcmp:
#else
mov pc, lr
#endif
+#endif
.size memcmp,.-memcmp
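
/* Both memcmp variants above implement the same contract; the Thumb-1 loop
   keeps an explicit end pointer in r4 where the ARM path folds the bounds
   check and the byte compare into the cmp/cmpcs pair under one IT block.
   A C rendering of the shared logic (sketch only): */

#include <stddef.h>

static int memcmp_sketch(const void *s1v, const void *s2v, size_t n)
{
    const unsigned char *s1 = s1v, *s2 = s2v;
    unsigned char c1 = 0, c2 = 0;

    while (n--) {
        c1 = *s1++;
        c2 = *s2++;
        if (c1 != c2)
            break;
    }
    return c1 - c2;   /* 0 when the loop ran to completion */
}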
diff --git a/libc/string/arm/memcpy.S b/libc/string/arm/memcpy.S
index 7a5b6ab76..d2013d211 100644
--- a/libc/string/arm/memcpy.S
+++ b/libc/string/arm/memcpy.S
@@ -38,16 +38,23 @@
*/
#include <features.h>
+#include <bits/arm_asm.h>
.text
.global memcpy
.type memcpy,%function
.align 4
+#if defined(__thumb__) && !defined(__thumb2__)
+.thumb_func
memcpy:
- stmfd sp!, {r0, lr}
+ push {r0, lr}
bl _memcpy
- ldmfd sp!, {r0, pc}
+ POP_RET
+#else
+memcpy:
+ b _memcpy
+#endif
.size memcpy,.-memcpy
diff --git a/libc/string/arm/memmove.S b/libc/string/arm/memmove.S
index 45cd9b4d4..c11b98dd4 100644
--- a/libc/string/arm/memmove.S
+++ b/libc/string/arm/memmove.S
@@ -38,16 +38,23 @@
*/
#include <features.h>
+#include <bits/arm_asm.h>
.text
.global memmove
.type memmove,%function
.align 4
+#if defined(__thumb__) && !defined(__thumb2__)
+.thumb_func
memmove:
- stmfd sp!, {r0, lr}
+ push {r2, lr}
bl _memcpy
- ldmfd sp!, {r0, pc}
+ POP_RET
+#else
+memmove:
+ b _memcpy
+#endif
.size memmove,.-memmove
diff --git a/libc/string/arm/memset.S b/libc/string/arm/memset.S
index 16bfe0dc5..66aa6039c 100644
--- a/libc/string/arm/memset.S
+++ b/libc/string/arm/memset.S
@@ -19,12 +19,52 @@
#include <features.h>
#include <sys/syscall.h>
+#include <bits/arm_asm.h>
.text
.global memset
.type memset,%function
.align 4
+#if defined(THUMB1_ONLY)
+.thumb_func
+memset:
+ mov ip, r0
+ cmp r2, #8 @ at least 8 bytes to do?
+ bcc 2f
+
+ lsl r3, r1, #8
+ orr r1, r3
+ lsl r3, r1, #16
+ orr r1, r3
+
+ mov r3, #3
+1: @ Fill up to the first word boundary
+ tst r0, r3
+ beq 1f
+ strb r1, [r0]
+ add r0, r0, #1
+ sub r2, r2, #1
+ b 1b
+1: @ Fill aligned words
+ str r1, [r0]
+ add r0, r0, #4
+ sub r2, r2, #4
+ cmp r2, #4
+ bcs 1b
+
+2: @ Fill the remaining bytes
+ cmp r2, #0
+ beq 2f
+1:
+ strb r1, [r0]
+ add r0, r0, #1
+ sub r2, r2, #1
+ bne 1b
+2:
+ mov r0, ip
+ bx lr
+#else
memset:
mov a4, a1
cmp a3, $8 @ at least 8 bytes to do?
@@ -33,8 +73,14 @@ memset:
orr a2, a2, a2, lsl $16
1:
tst a4, $3 @ aligned yet?
+#if defined(__thumb2__)
+ itt ne
+ strbne a2, [a4], $1
+ subne a3, a3, $1
+#else
strneb a2, [a4], $1
subne a3, a3, $1
+#endif
bne 1b
mov ip, a2
1:
@@ -51,16 +97,30 @@ memset:
stmia a4!, {a2, ip}
sub a3, a3, $8
cmp a3, $8 @ 8 bytes still to do?
+#if defined(__thumb2__)
+ itt ge
+ stmiage a4!, {a2, ip}
+ subge a3, a3, $8
+#else
stmgeia a4!, {a2, ip}
subge a3, a3, $8
+#endif
bge 1b
2:
movs a3, a3 @ anything left?
+ IT(t, eq)
#if defined(__USE_BX__)
bxeq lr
#else
moveq pc, lr @ nope
#endif
+#if defined (__thumb2__)
+1:
+ strb a2, [a4], #1
+ subs a3, a3, #1
+ bne 1b
+ bx lr
+#else
rsb a3, a3, $7
add pc, pc, a3, lsl $2
mov r0, r0
@@ -76,6 +136,8 @@ memset:
#else
mov pc, lr
#endif
+#endif
+#endif
.size memset,.-memset
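
/* Both new memset paths begin by replicating the fill byte across a 32-bit
   word (lsl/orr in the Thumb-1 code, orr with shifted operands in the ARM
   code) so the aligned inner loop can store four bytes per word, or eight
   with the {a2, ip} register pair. The replication step in C: */

#include <stdint.h>

static uint32_t replicate_byte(unsigned char c)
{
    uint32_t w = c;      /* 0x000000ab */
    w |= w << 8;         /* 0x0000abab */
    w |= w << 16;        /* 0xabababab */
    return w;
}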
diff --git a/libc/string/arm/strcmp.S b/libc/string/arm/strcmp.S
index 89aa38874..97363c1c2 100644
--- a/libc/string/arm/strcmp.S
+++ b/libc/string/arm/strcmp.S
@@ -30,17 +30,35 @@
*/
#include <features.h>
+#include <bits/arm_asm.h>
.text
.global strcmp
.type strcmp,%function
.align 4
+#if defined(__thumb__) && !defined(__thumb2__)
+.thumb_func
+strcmp:
+1:
+ ldrb r2, [r0]
+ add r0, r0, #1
+ ldrb r3, [r1]
+ add r1, r1, #1
+ cmp r2, #0
+ beq 2f
+ cmp r2, r3
+ beq 1b
+2:
+ sub r0, r2, r3
+ bx lr
+#else
strcmp:
1:
ldrb r2, [r0], #1
ldrb r3, [r1], #1
cmp r2, #1
+ IT(t, cs)
cmpcs r2, r3
beq 1b
sub r0, r2, r3
@@ -49,6 +67,7 @@ strcmp:
#else
mov pc, lr
#endif
+#endif
.size strcmp,.-strcmp
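
/* The ARM strcmp loop encodes its termination test in two flag-setting
   compares: cmp r2, #1 sets carry exactly when the byte from s1 is non-NUL
   (unsigned r2 >= 1), and cmpcs r2, r3 then checks the bytes for equality,
   so a single branch serves both exits. The equivalent C loop, as a sketch: */

static int strcmp_sketch(const char *s1, const char *s2)
{
    unsigned char c1, c2;

    do {
        c1 = (unsigned char)*s1++;
        c2 = (unsigned char)*s2++;
    } while (c1 != 0 && c1 == c2);   /* what cmp/cmpcs express together */
    return c1 - c2;
}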
diff --git a/libc/string/arm/strlen.S b/libc/string/arm/strlen.S
index 5b4b02e17..949e918f4 100644
--- a/libc/string/arm/strlen.S
+++ b/libc/string/arm/strlen.S
@@ -20,6 +20,7 @@
#include <features.h>
#include <endian.h>
#include <sys/syscall.h>
+#include <bits/arm_asm.h>
/* size_t strlen(const char *S)
* entry: r0 -> string
@@ -31,6 +32,19 @@
.type strlen,%function
.align 4
+#if defined(THUMB1_ONLY)
+/* A simple implementation for when the ARM implementation can't be used. */
+.thumb_func
+strlen:
+ mov r2, #0
+1:
+ ldrb r1, [r0, r2]
+ add r2, r2, #1
+ cmp r1, #0
+ bne 1b
+ sub r0, r2, #1
+ bx lr
+#else
strlen:
bic r1, r0, $3 @ addr of word containing first byte
ldr r2, [r1], $4 @ get the first word
@@ -41,38 +55,48 @@ strlen:
#if __BYTE_ORDER == __BIG_ENDIAN
orr r2, r2, $0xff000000 @ set this byte to non-zero
subs r3, r3, $1 @ any more to do?
+ IT(t, gt)
orrgt r2, r2, $0x00ff0000 @ if so, set this byte
subs r3, r3, $1 @ more?
+ IT(t, gt)
orrgt r2, r2, $0x0000ff00 @ then set.
#else
orr r2, r2, $0x000000ff @ set this byte to non-zero
subs r3, r3, $1 @ any more to do?
+ IT(t, gt)
orrgt r2, r2, $0x0000ff00 @ if so, set this byte
subs r3, r3, $1 @ more?
+ IT(t, gt)
orrgt r2, r2, $0x00ff0000 @ then set.
#endif
Laligned: @ here, we have a word in r2. Does it
tst r2, $0x000000ff @ contain any zeroes?
+ IT(tttt, ne)
tstne r2, $0x0000ff00 @
tstne r2, $0x00ff0000 @
tstne r2, $0xff000000 @
addne r0, r0, $4 @ if not, the string is 4 bytes longer
+ IT(t, ne)
ldrne r2, [r1], $4 @ and we continue to the next word
bne Laligned @
Llastword: @ drop through to here once we find a
#if __BYTE_ORDER == __BIG_ENDIAN
tst r2, $0xff000000 @ word that has a zero byte in it
+ IT(tttt, ne)
addne r0, r0, $1 @
tstne r2, $0x00ff0000 @ and add up to 3 bytes on to it
addne r0, r0, $1 @
tstne r2, $0x0000ff00 @ (if first three all non-zero, 4th
+ IT(t, ne)
addne r0, r0, $1 @ must be zero)
#else
tst r2, $0x000000ff @
+ IT(tttt, ne)
addne r0, r0, $1 @
tstne r2, $0x0000ff00 @ and add up to 3 bytes on to it
addne r0, r0, $1 @
tstne r2, $0x00ff0000 @ (if first three all non-zero, 4th
+ IT(t, ne)
addne r0, r0, $1 @ must be zero)
#endif
#if defined(__USE_BX__)
@@ -80,6 +104,7 @@ Llastword: @ drop through to here once we find a
#else
mov pc,lr
#endif
+#endif
.size strlen,.-strlen
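
/* The Laligned loop above tests the four byte lanes of each loaded word for
   a NUL with chained tst/tstne instructions; the new IT() wrappers make
   those chains legal in Thumb-2, where conditional execution must be
   declared by an IT block. A little-endian C sketch of the per-lane test
   (matching the #else branches; assumes the pointer is already word-aligned,
   as the asm preamble guarantees): */

#include <stdint.h>
#include <stddef.h>

static size_t strlen_word_sketch(const char *s)
{
    const uint32_t *p = (const uint32_t *)s;   /* alignment assumed */
    size_t len = 0;

    for (;;) {
        uint32_t w = *p++;
        if ((w & 0x000000ff) == 0) return len;
        if ((w & 0x0000ff00) == 0) return len + 1;
        if ((w & 0x00ff0000) == 0) return len + 2;
        if ((w & 0xff000000) == 0) return len + 3;
        len += 4;                  /* no NUL in this word; keep going */
    }
}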
diff --git a/libc/string/arm/strncmp.S b/libc/string/arm/strncmp.S
index eaf0620b4..8487639c8 100644
--- a/libc/string/arm/strncmp.S
+++ b/libc/string/arm/strncmp.S
@@ -30,15 +30,46 @@
*/
#include <features.h>
+#include <bits/arm_asm.h>
.text
.global strncmp
.type strncmp,%function
.align 4
+#if defined(THUMB1_ONLY)
+.thumb_func
strncmp:
/* if (len == 0) return 0 */
cmp r2, #0
+ bne 1f
+ mov r0, #0
+ bx lr
+1:
+ push {r4}
+
+ /* ip == last src address to compare */
+ add r4, r0, r2
+2:
+ cmp r4, r0
+ beq 3f
+ ldrb r2, [r0]
+ add r0, r0, #1
+ ldrb r3, [r1]
+ add r1, r1, #1
+ cmp r2, #0
+ beq 3f
+ cmp r2, r3
+ beq 2b
+3:
+ sub r0, r2, r3
+ pop {r4}
+ bx lr
+#else
+strncmp:
+ /* if (len == 0) return 0 */
+ cmp r2, #0
+ IT(tt, eq)
moveq r0, #0
#if defined(__USE_BX__)
bxeq lr
@@ -53,6 +84,7 @@ strncmp:
ldrb r2, [r0], #1
ldrb r3, [r1], #1
cmp ip, r0
+ IT(tt, cs)
cmpcs r2, #1
cmpcs r2, r3
beq 1b
@@ -62,6 +94,7 @@ strncmp:
#else
mov pc, lr
#endif
+#endif
.size strncmp,.-strncmp
diff --git a/libc/string/avr32/Makefile b/libc/string/avr32/Makefile
index 0002ffdce..e19e9d9ec 100644
--- a/libc/string/avr32/Makefile
+++ b/libc/string/avr32/Makefile
@@ -16,8 +16,8 @@
# along with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-top_srcdir := ../../../
-top_builddir := ../../../
+top_srcdir := ../../../
+top_builddir := ../../../
all: objs
diff --git a/libc/string/avr32/bcopy.S b/libc/string/avr32/bcopy.S
index e1d173165..bdd521814 100644
--- a/libc/string/avr32/bcopy.S
+++ b/libc/string/avr32/bcopy.S
@@ -10,17 +10,17 @@
#ifdef __UCLIBC_SUSV3_LEGACY__
- .text
- .global bcopy
- .type bcopy, @function
- .align 1
+ .text
+ .global bcopy
+ .type bcopy, @function
+ .align 1
bcopy:
- /* Swap the first two arguments */
- eor r11, r12
- eor r12, r11
- eor r11, r12
- rjmp HIDDEN_JUMPTARGET(memmove)
+ /* Swap the first two arguments */
+ eor r11, r12
+ eor r12, r11
+ eor r11, r12
+ rjmp HIDDEN_JUMPTARGET(memmove)
- .size bcopy, . - bcopy
+ .size bcopy, . - bcopy
#endif /* __UCLIBC_SUSV3_LEGACY__ */
diff --git a/libc/string/avr32/bzero.S b/libc/string/avr32/bzero.S
index 928148dcb..ca1bd2dd2 100644
--- a/libc/string/avr32/bzero.S
+++ b/libc/string/avr32/bzero.S
@@ -10,15 +10,15 @@
#ifdef __UCLIBC_SUSV3_LEGACY__
- .text
- .global bzero
- .type bzero, @function
- .align 1
+ .text
+ .global bzero
+ .type bzero, @function
+ .align 1
bzero:
- mov r10, r11
- mov r11, 0
- rjmp HIDDEN_JUMPTARGET(memset)
+ mov r10, r11
+ mov r11, 0
+ rjmp HIDDEN_JUMPTARGET(memset)
- .size bzero, . - bzero
+ .size bzero, . - bzero
#endif /* __UCLIBC_SUSV3_LEGACY__ */
diff --git a/libc/string/avr32/memcmp.S b/libc/string/avr32/memcmp.S
index 5d7eac3d9..ae6cc9189 100644
--- a/libc/string/avr32/memcmp.S
+++ b/libc/string/avr32/memcmp.S
@@ -12,48 +12,48 @@
#define s2 r11
#define len r10
- .text
- .global memcmp
- .type memcmp, @function
- .align 1
+ .text
+ .global memcmp
+ .type memcmp, @function
+ .align 1
memcmp:
- sub len, 4
- brlt .Lless_than_4
+ sub len, 4
+ brlt .Lless_than_4
-1: ld.w r8, s1++
- ld.w r9, s2++
- cp.w r8, r9
- brne .Lfound_word
- sub len, 4
- brge 1b
+1: ld.w r8, s1++
+ ld.w r9, s2++
+ cp.w r8, r9
+ brne .Lfound_word
+ sub len, 4
+ brge 1b
.Lless_than_4:
- sub len, -4
- reteq 0
+ sub len, -4
+ reteq 0
-1: ld.ub r8, s1++
- ld.ub r9, s2++
- sub r8, r9
- retne r8
- sub len, 1
- brgt 1b
+1: ld.ub r8, s1++
+ ld.ub r9, s2++
+ sub r8, r9
+ retne r8
+ sub len, 1
+ brgt 1b
- retal 0
+ retal 0
.Lfound_word:
- mov len, 4
-
-2: bfextu r11, r9, 24, 8
- bfextu r12, r8, 24, 8
- sub r12, r11
- retne r12
- lsl r8, 8
- lsl r9, 8
- sub len, 1
- brne 2b
- retal r12
-
- .size memcmp, . - memcmp
+ mov len, 4
+
+2: bfextu r11, r9, 24, 8
+ bfextu r12, r8, 24, 8
+ sub r12, r11
+ retne r12
+ lsl r8, 8
+ lsl r9, 8
+ sub len, 1
+ brne 2b
+ retal r12
+
+ .size memcmp, . - memcmp
libc_hidden_def(memcmp)
#ifdef __UCLIBC_SUSV3_LEGACY__
diff --git a/libc/string/avr32/memcpy.S b/libc/string/avr32/memcpy.S
index f95aabd13..bf091abf8 100644
--- a/libc/string/avr32/memcpy.S
+++ b/libc/string/avr32/memcpy.S
@@ -11,101 +11,101 @@
#define src r11
#define len r10
- .text
- .global memcpy
- .type memcpy, @function
+ .text
+ .global memcpy
+ .type memcpy, @function
memcpy:
- pref src[0]
- mov dst, r12
+ pref src[0]
+ mov dst, r12
- /* If we have less than 32 bytes, don't do anything fancy */
- cp.w len, 32
- brge .Lmore_than_31
+ /* If we have less than 32 bytes, don't do anything fancy */
+ cp.w len, 32
+ brge .Lmore_than_31
- sub len, 1
- retlt r12
-1: ld.ub r8, src++
- st.b dst++, r8
- sub len, 1
- brge 1b
- retal r12
+ sub len, 1
+ retlt r12
+1: ld.ub r8, src++
+ st.b dst++, r8
+ sub len, 1
+ brge 1b
+ retal r12
.Lmore_than_31:
- pushm r0-r7, lr
+ pushm r0-r7, lr
- /* Check alignment */
- mov r8, src
- andl r8, 31, COH
- brne .Lunaligned_src
- mov r8, dst
- andl r8, 3, COH
- brne .Lunaligned_dst
+ /* Check alignment */
+ mov r8, src
+ andl r8, 31, COH
+ brne .Lunaligned_src
+ mov r8, dst
+ andl r8, 3, COH
+ brne .Lunaligned_dst
.Laligned_copy:
- sub len, 32
- brlt .Lless_than_32
+ sub len, 32
+ brlt .Lless_than_32
-1: /* Copy 32 bytes at a time */
- ldm src, r0-r7
- sub src, -32
- stm dst, r0-r7
- sub dst, -32
- sub len, 32
- brge 1b
+1: /* Copy 32 bytes at a time */
+ ldm src, r0-r7
+ sub src, -32
+ stm dst, r0-r7
+ sub dst, -32
+ sub len, 32
+ brge 1b
.Lless_than_32:
- /* Copy 16 more bytes if possible */
- sub len, -16
- brlt .Lless_than_16
- ldm src, r0-r3
- sub src, -16
- sub len, 16
- stm dst, r0-r3
- sub dst, -16
+ /* Copy 16 more bytes if possible */
+ sub len, -16
+ brlt .Lless_than_16
+ ldm src, r0-r3
+ sub src, -16
+ sub len, 16
+ stm dst, r0-r3
+ sub dst, -16
.Lless_than_16:
- /* Do the remaining as byte copies */
- neg len
- add pc, pc, len << 2
- .rept 15
- ld.ub r0, src++
- st.b dst++, r0
- .endr
+ /* Do the remaining as byte copies */
+ neg len
+ add pc, pc, len << 2
+ .rept 15
+ ld.ub r0, src++
+ st.b dst++, r0
+ .endr
- popm r0-r7, pc
+ popm r0-r7, pc
.Lunaligned_src:
- /* Make src cacheline-aligned. r8 = (src & 31) */
- rsub r8, r8, 32
- sub len, r8
-1: ld.ub r0, src++
- st.b dst++, r0
- sub r8, 1
- brne 1b
-
- /* If dst is word-aligned, we're ready to go */
- pref src[0]
- mov r8, 3
- tst dst, r8
- breq .Laligned_copy
+ /* Make src cacheline-aligned. r8 = (src & 31) */
+ rsub r8, r8, 32
+ sub len, r8
+1: ld.ub r0, src++
+ st.b dst++, r0
+ sub r8, 1
+ brne 1b
+
+ /* If dst is word-aligned, we're ready to go */
+ pref src[0]
+ mov r8, 3
+ tst dst, r8
+ breq .Laligned_copy
.Lunaligned_dst:
- /* src is aligned, but dst is not. Expect bad performance */
- sub len, 4
- brlt 2f
-1: ld.w r0, src++
- st.w dst++, r0
- sub len, 4
- brge 1b
-
-2: neg len
- add pc, pc, len << 2
- .rept 3
- ld.ub r0, src++
- st.b dst++, r0
- .endr
-
- popm r0-r7, pc
- .size memcpy, . - memcpy
+ /* src is aligned, but dst is not. Expect bad performance */
+ sub len, 4
+ brlt 2f
+1: ld.w r0, src++
+ st.w dst++, r0
+ sub len, 4
+ brge 1b
+
+2: neg len
+ add pc, pc, len << 2
+ .rept 3
+ ld.ub r0, src++
+ st.b dst++, r0
+ .endr
+
+ popm r0-r7, pc
+ .size memcpy, . - memcpy
libc_hidden_def(memcpy)
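
/* The remainder handling above is a computed jump: len is negated and added
   to the PC (scaled to instruction size), so execution lands partway into
   the .rept run of byte copies and performs exactly the remaining count.
   Roughly the same dispatch a C switch with deliberate fallthrough gives;
   a sketch for the 0-3 byte tail used after the word-copy loop (helper name
   hypothetical): */

static void copy_tail3(unsigned char *dst, const unsigned char *src,
                       unsigned n /* 0..3, remainder after word copies */)
{
    switch (n) {               /* fall through on purpose, like the .rept run */
    case 3: *dst++ = *src++;   /* FALLTHROUGH */
    case 2: *dst++ = *src++;   /* FALLTHROUGH */
    case 1: *dst++ = *src++;   /* FALLTHROUGH */
    case 0: break;
    }
}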
diff --git a/libc/string/avr32/memmove.S b/libc/string/avr32/memmove.S
index 8ca4da54d..535f4a257 100644
--- a/libc/string/avr32/memmove.S
+++ b/libc/string/avr32/memmove.S
@@ -10,107 +10,107 @@
#define src r11
#define len r10
- .text
- .global memmove
- .type memmove, @function
+ .text
+ .global memmove
+ .type memmove, @function
memmove:
- cp.w src, dst
- brge HIDDEN_JUMPTARGET(memcpy)
-
- add dst, len
- add src, len
- pref src[-1]
-
- /*
- * The rest is basically the same as in memcpy.S except that
- * the direction is reversed.
- */
- cp.w len, 32
- brge .Lmore_than_31
-
- sub len, 1
- retlt r12
-1: ld.ub r8, --src
- st.b --dst, r8
- sub len, 1
- brge 1b
- retal r12
+ cp.w src, dst
+ brge HIDDEN_JUMPTARGET(memcpy)
+
+ add dst, len
+ add src, len
+ pref src[-1]
+
+ /*
+ * The rest is basically the same as in memcpy.S except that
+ * the direction is reversed.
+ */
+ cp.w len, 32
+ brge .Lmore_than_31
+
+ sub len, 1
+ retlt r12
+1: ld.ub r8, --src
+ st.b --dst, r8
+ sub len, 1
+ brge 1b
+ retal r12
.Lmore_than_31:
- pushm r0-r7, lr
+ pushm r0-r7, lr
- /* Check alignment */
- mov r8, src
- andl r8, 31, COH
- brne .Lunaligned_src
- mov r8, r12
- andl r8, 3, COH
- brne .Lunaligned_dst
+ /* Check alignment */
+ mov r8, src
+ andl r8, 31, COH
+ brne .Lunaligned_src
+ mov r8, r12
+ andl r8, 3, COH
+ brne .Lunaligned_dst
.Laligned_copy:
- sub len, 32
- brlt .Lless_than_32
+ sub len, 32
+ brlt .Lless_than_32
-1: /* Copy 32 bytes at a time */
- sub src, 32
- ldm src, r0-r7
- sub dst, 32
- sub len, 32
- stm dst, r0-r7
- brge 1b
+1: /* Copy 32 bytes at a time */
+ sub src, 32
+ ldm src, r0-r7
+ sub dst, 32
+ sub len, 32
+ stm dst, r0-r7
+ brge 1b
.Lless_than_32:
- /* Copy 16 more bytes if possible */
- sub len, -16
- brlt .Lless_than_16
- sub src, 16
- ldm src, r0-r3
- sub dst, 16
- sub len, 16
- stm dst, r0-r3
+ /* Copy 16 more bytes if possible */
+ sub len, -16
+ brlt .Lless_than_16
+ sub src, 16
+ ldm src, r0-r3
+ sub dst, 16
+ sub len, 16
+ stm dst, r0-r3
.Lless_than_16:
- /* Do the remaining as byte copies */
- sub len, -16
- breq 2f
-1: ld.ub r0, --src
- st.b --dst, r0
- sub len, 1
- brne 1b
+ /* Do the remaining as byte copies */
+ sub len, -16
+ breq 2f
+1: ld.ub r0, --src
+ st.b --dst, r0
+ sub len, 1
+ brne 1b
-2: popm r0-r7, pc
+2: popm r0-r7, pc
.Lunaligned_src:
- /* Make src cacheline-aligned. r8 = (src & 31) */
- sub len, r8
-1: ld.ub r0, --src
- st.b --dst, r0
- sub r8, 1
- brne 1b
-
- /* If dst is word-aligned, we're ready to go */
- pref src[-4]
- mov r8, 3
- tst dst, r8
- breq .Laligned_copy
+ /* Make src cacheline-aligned. r8 = (src & 31) */
+ sub len, r8
+1: ld.ub r0, --src
+ st.b --dst, r0
+ sub r8, 1
+ brne 1b
+
+ /* If dst is word-aligned, we're ready to go */
+ pref src[-4]
+ mov r8, 3
+ tst dst, r8
+ breq .Laligned_copy
.Lunaligned_dst:
- /* src is aligned, but dst is not. Expect bad performance */
- sub len, 4
- brlt 2f
-1: ld.w r0, --src
- st.w --dst, r0
- sub len, 4
- brge 1b
-
-2: neg len
- add pc, pc, len << 2
- .rept 3
- ld.ub r0, --src
- st.b --dst, r0
- .endr
-
- popm r0-r7, pc
- .size memmove, . - memmove
+ /* src is aligned, but dst is not. Expect bad performance */
+ sub len, 4
+ brlt 2f
+1: ld.w r0, --src
+ st.w --dst, r0
+ sub len, 4
+ brge 1b
+
+2: neg len
+ add pc, pc, len << 2
+ .rept 3
+ ld.ub r0, --src
+ st.b --dst, r0
+ .endr
+
+ popm r0-r7, pc
+ .size memmove, . - memmove
libc_hidden_def(memmove)
diff --git a/libc/string/avr32/memset.S b/libc/string/avr32/memset.S
index 964bf4834..472b2be35 100644
--- a/libc/string/avr32/memset.S
+++ b/libc/string/avr32/memset.S
@@ -12,54 +12,54 @@
#define c r11
#define n r10
- .text
- .global memset
- .type memset, @function
+ .text
+ .global memset
+ .type memset, @function
- .align 1
+ .align 1
memset:
- cp.w n, 32
- mov r9, s
- brge .Llarge_memset
+ cp.w n, 32
+ mov r9, s
+ brge .Llarge_memset
- sub n, 1
- retlt s
-1: st.b s++, c
- sub n, 1
- brge 1b
+ sub n, 1
+ retlt s
+1: st.b s++, c
+ sub n, 1
+ brge 1b
- retal r9
+ retal r9
.Llarge_memset:
- mov r8, r11
- mov r11, 3
- bfins r8, r8, 8, 8
- bfins r8, r8, 16, 16
- tst s, r11
- breq 2f
+ mov r8, r11
+ mov r11, 3
+ bfins r8, r8, 8, 8
+ bfins r8, r8, 16, 16
+ tst s, r11
+ breq 2f
-1: st.b s++, r8
- sub n, 1
- tst s, r11
- brne 1b
+1: st.b s++, r8
+ sub n, 1
+ tst s, r11
+ brne 1b
-2: mov r11, r9
- mov r9, r8
- sub n, 8
+2: mov r11, r9
+ mov r9, r8
+ sub n, 8
-3: st.d s++, r8
- sub n, 8
- brge 3b
+3: st.d s++, r8
+ sub n, 8
+ brge 3b
- /* If we are done, n == -8 and we'll skip all st.b insns below */
- neg n
- lsl n, 1
- add pc, n
- .rept 7
- st.b s++, r8
- .endr
- retal r11
+ /* If we are done, n == -8 and we'll skip all st.b insns below */
+ neg n
+ lsl n, 1
+ add pc, n
+ .rept 7
+ st.b s++, r8
+ .endr
+ retal r11
- .size memset, . - memset
+ .size memset, . - memset
libc_hidden_def(memset)
diff --git a/libc/string/avr32/strcmp.S b/libc/string/avr32/strcmp.S
index e9f087577..f73bd43e7 100644
--- a/libc/string/avr32/strcmp.S
+++ b/libc/string/avr32/strcmp.S
@@ -12,77 +12,77 @@
#define s2 r11
#define len r10
- .text
- .global strcmp
- .type strcmp, @function
- .align 1
+ .text
+ .global strcmp
+ .type strcmp, @function
+ .align 1
strcmp:
- mov r8, 3
- tst s1, r8
- brne .Lunaligned_s1
- tst s2, r8
- brne .Lunaligned_s2
+ mov r8, 3
+ tst s1, r8
+ brne .Lunaligned_s1
+ tst s2, r8
+ brne .Lunaligned_s2
-1: ld.w r8, s1++
- ld.w r9, s2++
- cp.w r8, r9
- brne 2f
- tnbz r8
- brne 1b
- retal 0
+1: ld.w r8, s1++
+ ld.w r9, s2++
+ cp.w r8, r9
+ brne 2f
+ tnbz r8
+ brne 1b
+ retal 0
-2: bfextu r12, r8, 24, 8
- bfextu r11, r9, 24, 8
- sub r12, r11
- retne r12
- cp.w r11, 0
- reteq 0
- bfextu r12, r8, 16, 8
- bfextu r11, r9, 16, 8
- sub r12, r11
- retne r12
- cp.w r11, 0
- reteq 0
- bfextu r12, r8, 8, 8
- bfextu r11, r9, 8, 8
- sub r12, r11
- retne r12
- cp.w r11, 0
- reteq 0
- bfextu r12, r8, 0, 8
- bfextu r11, r9, 0, 8
- sub r12, r11
- retal r12
+2: bfextu r12, r8, 24, 8
+ bfextu r11, r9, 24, 8
+ sub r12, r11
+ retne r12
+ cp.w r11, 0
+ reteq 0
+ bfextu r12, r8, 16, 8
+ bfextu r11, r9, 16, 8
+ sub r12, r11
+ retne r12
+ cp.w r11, 0
+ reteq 0
+ bfextu r12, r8, 8, 8
+ bfextu r11, r9, 8, 8
+ sub r12, r11
+ retne r12
+ cp.w r11, 0
+ reteq 0
+ bfextu r12, r8, 0, 8
+ bfextu r11, r9, 0, 8
+ sub r12, r11
+ retal r12
.Lunaligned_s1:
-3: tst s1, r8
- breq 4f
- ld.ub r10, s1++
- ld.ub r9, s2++
- sub r10, r9
- retne r10
- cp.w r9, 0
- brne 3b
- retal r10
+3: tst s1, r8
+ breq 4f
+ ld.ub r10, s1++
+ ld.ub r9, s2++
+ sub r10, r9
+ retne r10
+ cp.w r9, 0
+ brne 3b
+ retal r10
-4: tst s2, r8
- breq 1b
+4: tst s2, r8
+ breq 1b
.Lunaligned_s2:
- /*
- * s1 and s2 can't both be aligned, and unaligned word loads
- * can trigger spurious exceptions if we cross a page boundary.
- * Do it the slow way...
- */
-1: ld.ub r8, s1++
- ld.ub r9, s2++
- sub r8, r9
- retne r8
- cp.w r9, 0
- brne 1b
- retal 0
+ /*
+ * s1 and s2 can't both be aligned, and unaligned word loads
+ * can trigger spurious exceptions if we cross a page boundary.
+ * Do it the slow way...
+ */
+1: ld.ub r8, s1++
+ ld.ub r9, s2++
+ sub r8, r9
+ retne r8
+ cp.w r9, 0
+ brne 1b
+ retal 0
- .size strcmp, . - strcmp
+ .size strcmp, . - strcmp
libc_hidden_def(strcmp)
#ifndef __UCLIBC_HAS_LOCALE__
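
/* Two details of this routine are worth noting even though the diff is
   whitespace-only: tnbz tests a whole word for a NUL byte, so the aligned
   loop compares four characters per iteration, and the mismatch decode
   extracts bytes from bit 24 downward because AVR32 is big-endian, putting
   the earliest character in the most significant byte. The decode in C,
   as a sketch: */

#include <stdint.h>

static int decode_mismatch(uint32_t w1, uint32_t w2)
{
    int shift;

    for (shift = 24; shift >= 0; shift -= 8) {   /* top byte first (BE) */
        unsigned c1 = (w1 >> shift) & 0xff;
        unsigned c2 = (w2 >> shift) & 0xff;
        if (c1 != c2)
            return (int)c1 - (int)c2;
        if (c1 == 0)        /* both NUL: strings ended while still equal */
            return 0;
    }
    return 0;               /* not reached when w1 != w2 */
}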
diff --git a/libc/string/avr32/strlen.S b/libc/string/avr32/strlen.S
index d2808998d..5223e5365 100644
--- a/libc/string/avr32/strlen.S
+++ b/libc/string/avr32/strlen.S
@@ -10,53 +10,53 @@
#define str r12
- .text
- .global strlen
- .type strlen, @function
+ .text
+ .global strlen
+ .type strlen, @function
strlen:
- mov r11, r12
-
- mov r9, str
- andl r9, 3, COH
- brne .Lunaligned_str
-
-1: ld.w r8, str++
- tnbz r8
- brne 1b
-
- sub r12, r11
- bfextu r9, r8, 24, 8
- cp.w r9, 0
- subeq r12, 4
- reteq r12
- bfextu r9, r8, 16, 8
- cp.w r9, 0
- subeq r12, 3
- reteq r12
- bfextu r9, r8, 8, 8
- cp.w r9, 0
- subeq r12, 2
- reteq r12
- sub r12, 1
- retal r12
+ mov r11, r12
+
+ mov r9, str
+ andl r9, 3, COH
+ brne .Lunaligned_str
+
+1: ld.w r8, str++
+ tnbz r8
+ brne 1b
+
+ sub r12, r11
+ bfextu r9, r8, 24, 8
+ cp.w r9, 0
+ subeq r12, 4
+ reteq r12
+ bfextu r9, r8, 16, 8
+ cp.w r9, 0
+ subeq r12, 3
+ reteq r12
+ bfextu r9, r8, 8, 8
+ cp.w r9, 0
+ subeq r12, 2
+ reteq r12
+ sub r12, 1
+ retal r12
.Lunaligned_str:
- add pc, pc, r9 << 3
- sub r0, r0, 0 /* 4-byte nop */
- ld.ub r8, str++
- sub r8, r8, 0
- breq 1f
- ld.ub r8, str++
- sub r8, r8, 0
- breq 1f
- ld.ub r8, str++
- sub r8, r8, 0
- brne 1b
-
-1: sub r12, 1
- sub r12, r11
- retal r12
-
- .size strlen, . - strlen
+ add pc, pc, r9 << 3
+ sub r0, r0, 0 /* 4-byte nop */
+ ld.ub r8, str++
+ sub r8, r8, 0
+ breq 1f
+ ld.ub r8, str++
+ sub r8, r8, 0
+ breq 1f
+ ld.ub r8, str++
+ sub r8, r8, 0
+ brne 1b
+
+1: sub r12, 1
+ sub r12, r11
+ retal r12
+
+ .size strlen, . - strlen
libc_hidden_def(strlen)
diff --git a/libc/string/bfin/memchr.S b/libc/string/bfin/memchr.S
index 23626d6a4..88e46bef6 100644
--- a/libc/string/bfin/memchr.S
+++ b/libc/string/bfin/memchr.S
@@ -1,5 +1,5 @@
/* memchr.S
- * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved.
+ * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved.
*
* This file is subject to the terms and conditions of the GNU Library General
* Public License. See the file "COPYING.LIB" in the main directory of this
@@ -9,6 +9,8 @@
* http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html
*/
+#include <sysdep.h>
+
/* void *memchr(const void *s, int c, size_t n);
* R0 = address (s)
* R1 = sought byte (c)
@@ -21,30 +23,29 @@
.align 2
-.global _memchr
-.type _memchr, STT_FUNC
-_memchr:
+.weak _memchr
+ENTRY(_memchr)
P0 = R0; // P0 = address
P2 = R2; // P2 = count
R1 = R1.B(Z);
CC = R2 == 0;
- IF CC JUMP failed;
+ IF CC JUMP .Lfailed;
-bytes:
- LSETUP (byte_loop_s , byte_loop_e) LC0=P2;
+.Lbytes:
+ LSETUP (.Lbyte_loop_s, .Lbyte_loop_e) LC0=P2;
-byte_loop_s:
+.Lbyte_loop_s:
R3 = B[P0++](Z);
CC = R3 == R1;
- IF CC JUMP found;
-byte_loop_e:
+ IF CC JUMP .Lfound;
+.Lbyte_loop_e:
NOP;
-failed:
+.Lfailed:
R0=0;
RTS;
-found:
+.Lfound:
R0 = P0;
R0 += -1;
RTS;
diff --git a/libc/string/bfin/memcmp.S b/libc/string/bfin/memcmp.S
index f2679d5ae..7cc76ad96 100644
--- a/libc/string/bfin/memcmp.S
+++ b/libc/string/bfin/memcmp.S
@@ -1,5 +1,5 @@
/* memcmp.S
- * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved.
+ * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved.
*
* This file is subject to the terms and conditions of the GNU Library General
* Public License. See the file "COPYING.LIB" in the main directory of this
@@ -9,6 +9,8 @@
* http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html
*/
+#include <sysdep.h>
+
/* int memcmp(const void *s1, const void *s2, size_t n);
* R0 = First Address (s1)
* R1 = Second Address (s2)
@@ -21,28 +23,27 @@
.align 2
-.global _memcmp
-.type _memcmp, STT_FUNC
-_memcmp:
+.weak _memcmp
+ENTRY(_memcmp)
I1 = P3;
- P0 = R0; // P0 = s1 address
- P3 = R1; // P3 = s2 Address
- P2 = R2 ; // P2 = count
+ P0 = R0; /* P0 = s1 address */
+ P3 = R1; /* P3 = s2 Address */
+ P2 = R2 ; /* P2 = count */
CC = R2 <= 7(IU);
- IF CC JUMP too_small;
- I0 = R1; // s2
- R1 = R1 | R0; // OR addresses together
- R1 <<= 30; // check bottom two bits
- CC = AZ; // AZ set if zero.
- IF !CC JUMP bytes ; // Jump if addrs not aligned.
+ IF CC JUMP .Ltoo_small;
+ I0 = R1; /* s2 */
+ R1 = R1 | R0; /* OR addresses together */
+ R1 <<= 30; /* check bottom two bits */
+ CC = AZ; /* AZ set if zero. */
+ IF !CC JUMP .Lbytes ; /* Jump if addrs not aligned. */
- P1 = P2 >> 2; // count = n/4
+ P1 = P2 >> 2; /* count = n/4 */
R3 = 3;
- R2 = R2 & R3; // remainder
- P2 = R2; // set remainder
+ R2 = R2 & R3; /* remainder */
+ P2 = R2; /* set remainder */
- LSETUP (quad_loop_s , quad_loop_e) LC0=P1;
-quad_loop_s:
+ LSETUP (.Lquad_loop_s, .Lquad_loop_e) LC0=P1;
+.Lquad_loop_s:
#if !defined(__WORKAROUND_AVOID_DAG1)
MNOP || R0 = [P0++] || R1 = [I0++];
#else
@@ -50,52 +51,54 @@ quad_loop_s:
R1 = [I0++];
#endif
CC = R0 == R1;
- IF !CC JUMP quad_different;
-quad_loop_e:
+ IF !CC JUMP .Lquad_different;
+.Lquad_loop_e:
NOP;
- P3 = I0; // s2
-too_small:
- CC = P2 == 0; //Check zero count
- IF CC JUMP finished; // very unlikely
+ P3 = I0; /* s2 */
+.Ltoo_small:
+ CC = P2 == 0; /* Check zero count*/
+ IF CC JUMP .Lfinished; /* very unlikely*/
-bytes:
- LSETUP (byte_loop_s , byte_loop_e) LC0=P2;
-byte_loop_s:
- R1 = B[P3++](Z); // *s2
- R0 = B[P0++](Z); // *s1
+.Lbytes:
+ LSETUP (.Lbyte_loop_s, .Lbyte_loop_e) LC0=P2;
+.Lbyte_loop_s:
+ R1 = B[P3++](Z); /* *s2 */
+ R0 = B[P0++](Z); /* *s1 */
CC = R0 == R1;
- IF !CC JUMP different;
-byte_loop_e:
+ IF !CC JUMP .Ldifferent;
+.Lbyte_loop_e:
NOP;
-different:
+.Ldifferent:
R0 = R0 - R1;
P3 = I1;
RTS;
-quad_different:
- // We've read two quads which don't match.
- // Can't just compare them, because we're
- // a little-endian machine, so the MSBs of
- // the regs occur at later addresses in the
- // string.
- // Arrange to re-read those two quads again,
- // byte-by-byte.
- P0 += -4; // back up to the start of the
- P3 = I0; // quads, and increase the
- P2 += 4; // remainder count
+.Lquad_different:
+ /* We've read two quads which don't match.
+ * Can't just compare them, because we're
+ * a little-endian machine, so the MSBs of
+ * the regs occur at later addresses in the
+ * string.
+ * Arrange to re-read those two quads again,
+ * byte-by-byte.
+ */
+ P0 += -4; /* back up to the start of the */
+ P3 = I0; /* quads, and increase the*/
+ P2 += 4; /* remainder count*/
P3 += -4;
- JUMP bytes;
+ JUMP .Lbytes;
-finished:
+.Lfinished:
R0 = 0;
P3 = I1;
RTS;
+
.size _memcmp,.-_memcmp
libc_hidden_def (memcmp)
#ifdef __UCLIBC_SUSV3_LEGACY__
-strong_alias (memcmp,bcmp)
+weak_alias (memcmp,bcmp)
#endif
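
/* The relabelled .Lquad_different path keeps the original comment's point:
   on a little-endian machine a 32-bit compare can prove two quads unequal
   but not which string is smaller, because the first byte in memory carries
   the least weight in the word. A small C demonstration of the order
   disagreement (returns 1 on little-endian): */

#include <stdint.h>
#include <string.h>

int le_order_demo(void)
{
    const unsigned char a[4] = { 0x01, 0xff, 0, 0 };
    const unsigned char b[4] = { 0x02, 0x00, 0, 0 };
    uint32_t wa, wb;

    memcpy(&wa, a, 4);   /* LE: wa = 0x0000ff01 */
    memcpy(&wb, b, 4);   /* LE: wb = 0x00000002 */

    /* memcmp(a, b, 4) < 0 (0x01 < 0x02 at the first byte), yet wa > wb:
       the word compare would get the sign wrong, hence the byte re-read. */
    return (memcmp(a, b, 4) < 0) && (wa > wb);
}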
diff --git a/libc/string/bfin/memcpy.S b/libc/string/bfin/memcpy.S
index e7ba7048e..bdd760691 100644
--- a/libc/string/bfin/memcpy.S
+++ b/libc/string/bfin/memcpy.S
@@ -1,5 +1,5 @@
/* memcpy.S
- * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved.
+ * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved.
*
* This file is subject to the terms and conditions of the GNU Library General
* Public License. See the file "COPYING.LIB" in the main directory of this
@@ -9,6 +9,8 @@
* http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html
*/
+#include <sysdep.h>
+
/* void *memcpy(void *dest, const void *src, size_t n);
* R0 = To Address (dest) (leave unchanged to form result)
* R1 = From Address (src)
@@ -21,54 +23,55 @@
.align 2
-.global _memcpy
-.type _memcpy, STT_FUNC
-_memcpy:
+.weak _memcpy
+ENTRY(_memcpy)
[--SP] = P3;
- P0 = R0; // P0 = To address
- P3 = R1; // P3 = From Address
- P2 = R2 ; // P2 = count
+ P0 = R0; /* P0 = To address */
+ P3 = R1; /* P3 = From Address */
+ P2 = R2; /* P2 = count */
CC = R2 <= 7(IU);
- IF CC JUMP too_small;
+ IF CC JUMP .Ltoo_small;
I0 = R1;
- R3 = R1 | R0; // OR addresses together
- R3 <<= 30; // check bottom two bits
- CC = AZ; // AZ set if zero.
- IF !CC JUMP bytes ; // Jump if addrs not aligned.
- P1 = P2 >> 2; // count = n/4
+ R3 = R1 | R0; /* OR addresses together */
+ R3 <<= 30; /* check bottom two bits */
+ CC = AZ; /* AZ set if zero. */
+ IF !CC JUMP .Lbytes; /* Jump if addrs not aligned. */
+ P1 = P2 >> 2; /* count = n/4 */
P1 += -1;
R3 = 3;
- R2 = R2 & R3; // remainder
- P2 = R2; // set remainder
+ R2 = R2 & R3; /* remainder */
+ P2 = R2; /* set remainder */
R1 = [I0++];
#if !defined(__WORKAROUND_AVOID_DAG1)
- LSETUP (quad_loop , quad_loop) LC0=P1;
-quad_loop: MNOP || [P0++] = R1 || R1 = [I0++];
+ LSETUP (.Lquad_loop, .Lquad_loop) LC0=P1;
+.Lquad_loop: MNOP || [P0++] = R1 || R1 = [I0++];
#else
- LSETUP (quad_loop_s , quad_loop_e) LC0=P1;
-quad_loop_s: [P0++] = R1;
-quad_loop_e: R1 = [I0++];
+ LSETUP (.Lquad_loop_s, .Lquad_loop_e) LC0=P1;
+.Lquad_loop_s: [P0++] = R1;
+.Lquad_loop_e: R1 = [I0++];
#endif
[P0++] = R1;
- CC = P2 == 0; // any remaining bytes?
- P3 = I0; // Ammend P3 for remaining copy
- IF !CC JUMP bytes;
+ CC = P2 == 0; /* any remaining bytes? */
+ P3 = I0; /* Ammend P3 for remaining copy */
+ IF !CC JUMP .Lbytes;
P3 = [SP++];
RTS;
-too_small:
- CC = P2 == 0; //Check zero count
- IF CC JUMP finished; // very unlikely
+.Ltoo_small:
+ CC = P2 == 0; /* Check zero count */
+ IF CC JUMP .Lfinished; /* very unlikely */
-bytes:
- LSETUP (byte_loop_s , byte_loop_e) LC0=P2;
-byte_loop_s: R1 = B[P3++](Z);
-byte_loop_e: B[P0++] = R1;
+.Lbytes:
+ LSETUP (.Lbyte_loop_s, .Lbyte_loop_e) LC0=P2;
+.Lbyte_loop_s: R1 = B[P3++](Z);
+.Lbyte_loop_e: B[P0++] = R1;
-finished:
+.Lfinished:
P3 = [SP++];
+
RTS;
+
.size _memcpy,.-_memcpy
libc_hidden_def (memcpy)
diff --git a/libc/string/bfin/memmove.S b/libc/string/bfin/memmove.S
index 3d446f326..73e363820 100644
--- a/libc/string/bfin/memmove.S
+++ b/libc/string/bfin/memmove.S
@@ -1,5 +1,5 @@
/* memmove.S
- * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved.
+ * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved.
*
* This file is subject to the terms and conditions of the GNU Library General
* Public License. See the file "COPYING.LIB" in the main directory of this
@@ -9,6 +9,8 @@
* http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html
*/
+#include <sysdep.h>
+
/* void *memmove(void *dest, const void *src, size_t n);
* R0 = To Address (dest) (leave unchanged to form result)
* R1 = From Address (src)
@@ -21,75 +23,78 @@
.align 2
-.global _memmove
-.type _memmove, STT_FUNC
-_memmove:
+.weak _memmove
+ENTRY(_memmove)
I1 = P3;
- P0 = R0; // P0 = To address
- P3 = R1; // P3 = From Address
- P2 = R2 ; // P2 = count
- CC = P2 == 0; //Check zero count
- IF CC JUMP finished; // very unlikely
+ P0 = R0; /* P0 = To address */
+ P3 = R1; /* P3 = From Address */
+ P2 = R2; /* P2 = count */
+ CC = P2 == 0; /* Check zero count*/
+ IF CC JUMP .Lfinished; /* very unlikely */
- CC = R1 < R0 (IU); // From < To
- IF !CC JUMP no_overlap;
+ CC = R1 < R0 (IU); /* From < To */
+ IF !CC JUMP .Lno_overlap;
R3 = R1 + R2;
- CC = R0 <= R3 (IU); // (From+len) >= To
- IF CC JUMP overlap;
-no_overlap:
+ CC = R0 <= R3 (IU); /* (From+len) >= To */
+ IF CC JUMP .Loverlap;
+.Lno_overlap:
R3 = 11;
CC = R2 <= R3;
- IF CC JUMP bytes;
- R3 = R1 | R0; // OR addresses together
- R3 <<= 30; // check bottom two bits
- CC = AZ; // AZ set if zero.
- IF !CC JUMP bytes ; // Jump if addrs not aligned.
+ IF CC JUMP .Lbytes;
+ R3 = R1 | R0; /* OR addresses together */
+ R3 <<= 30; /* check bottom two bits */
+ CC = AZ; /* AZ set if zero.*/
+ IF !CC JUMP .Lbytes; /* Jump if addrs not aligned.*/
I0 = P3;
- P1 = P2 >> 2; // count = n/4
+ P1 = P2 >> 2; /* count = n/4 */
P1 += -1;
R3 = 3;
- R2 = R2 & R3; // remainder
- P2 = R2; // set remainder
+ R2 = R2 & R3; /* remainder */
+ P2 = R2; /* set remainder */
R1 = [I0++];
#if !defined(__WORKAROUND_AVOID_DAG1)
- LSETUP (quad_loop , quad_loop) LC0=P1;
-quad_loop: MNOP || [P0++] = R1 || R1 = [I0++];
+ LSETUP (.Lquad_loop, .Lquad_loop) LC0=P1;
+.Lquad_loop: MNOP || [P0++] = R1 || R1 = [I0++];
#else
- LSETUP (quad_loop_s, quad_loop_e) LC0=P1;
-quad_loop_s: [P0++] = R1;
-quad_loop_e: R1 = [I0++];
+ LSETUP (.Lquad_loop_s, .Lquad_loop_e) LC0=P1;
+.Lquad_loop_s: [P0++] = R1;
+.Lquad_loop_e: R1 = [I0++];
#endif
[P0++] = R1;
- CC = P2 == 0; // any remaining bytes?
- P3 = I0; // Ammend P3 to updated ptr.
- IF !CC JUMP bytes;
+ CC = P2 == 0; /* any remaining bytes? */
+ P3 = I0; /* Amend P3 to updated ptr. */
+ IF !CC JUMP .Lbytes;
P3 = I1;
RTS;
-bytes: LSETUP (byte2_s , byte2_e) LC0=P2;
-byte2_s: R1 = B[P3++](Z);
-byte2_e: B[P0++] = R1;
+.Lbytes: LSETUP (.Lbyte2_s, .Lbyte2_e) LC0=P2;
+.Lbyte2_s: R1 = B[P3++](Z);
+.Lbyte2_e: B[P0++] = R1;
-finished:
- P3 = I1;
+.Lfinished: P3 = I1;
RTS;
-overlap:
+.Loverlap:
P2 += -1;
P0 = P0 + P2;
P3 = P3 + P2;
R1 = B[P3--] (Z);
CC = P2 == 0;
- IF CC JUMP no_loop;
- LSETUP (ol_s, ol_e) LC0 = P2;
-ol_s: B[P0--] = R1;
-ol_e: R1 = B[P3--] (Z);
-no_loop: B[P0] = R1;
+ IF CC JUMP .Lno_loop;
+#if defined(__WORKAROUND_SPECULATIVE_LOADS)
+ NOP;
+ NOP;
+#endif
+ LSETUP (.Lol_s, .Lol_e) LC0 = P2;
+.Lol_s: B[P0--] = R1;
+.Lol_e: R1 = B[P3--] (Z);
+.Lno_loop: B[P0] = R1;
P3 = I1;
RTS;
+
.size _memmove,.-_memmove
libc_hidden_def (memmove)
diff --git a/libc/string/bfin/memset.S b/libc/string/bfin/memset.S
index bd8eb4b6a..64012f783 100644
--- a/libc/string/bfin/memset.S
+++ b/libc/string/bfin/memset.S
@@ -1,5 +1,5 @@
/* memset.S
- * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved.
+ * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved.
*
* This file is subject to the terms and conditions of the GNU Library General
* Public License. See the file "COPYING.LIB" in the main directory of this
@@ -9,6 +9,8 @@
* http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html
*/
+#include <sysdep.h>
+
/* void *memset(void *s, int c, size_t n);
* R0 = address (s) (leave unchanged to form result)
* R1 = filler byte (c)
@@ -21,66 +23,68 @@
.align 2
-.global _memset
-.type _memset, STT_FUNC
-_memset:
- P0 = R0 ; // P0 = address
- P2 = R2 ; // P2 = count
- R3 = R0 + R2; // end
+.weak _memset
+ENTRY(_memset)
+ P0 = R0 ; /* P0 = address */
+ P2 = R2 ; /* P2 = count */
+ R3 = R0 + R2; /* end */
CC = R2 <= 7(IU);
- IF CC JUMP too_small;
- R1 = R1.B (Z); // R1 = fill char
+ IF CC JUMP .Ltoo_small;
+ R1 = R1.B (Z); /* R1 = fill char */
R2 = 3;
- R2 = R0 & R2; // addr bottom two bits
- CC = R2 == 0; // AZ set if zero.
- IF !CC JUMP force_align ; // Jump if addr not aligned.
+ R2 = R0 & R2; /* addr bottom two bits */
+ CC = R2 == 0; /* AZ set if zero. */
+ IF !CC JUMP .Lforce_align ; /* Jump if addr not aligned. */
-aligned:
- P1 = P2 >> 2; // count = n/4
- R2 = R1 << 8; // create quad filler
+.Laligned:
+ P1 = P2 >> 2; /* count = n/4 */
+ R2 = R1 << 8; /* create quad filler */
R2.L = R2.L + R1.L(NS);
R2.H = R2.L + R1.H(NS);
P2 = R3;
- LSETUP (quad_loop , quad_loop) LC0=P1;
-quad_loop:
+ LSETUP (.Lquad_loop , .Lquad_loop) LC0=P1;
+.Lquad_loop:
[P0++] = R2;
CC = P0 == P2;
- IF !CC JUMP bytes_left;
+ IF !CC JUMP .Lbytes_left;
RTS;
-bytes_left:
- R2 = R3; // end point
- R3 = P0; // current position
- R2 = R2 - R3; // bytes left
+.Lbytes_left:
+ R2 = R3; /* end point */
+ R3 = P0; /* current position */
+ R2 = R2 - R3; /* bytes left */
P2 = R2;
-too_small:
- CC = P2 == 0; //Check zero count
- IF CC JUMP finished; // Unusual
+.Ltoo_small:
+ CC = P2 == 0; /* Check zero count */
+ IF CC JUMP .Lfinished; /* Unusual */
-bytes: LSETUP (byte_loop , byte_loop) LC0=P2;
-byte_loop: B[P0++] = R1;
+.Lbytes:
+ LSETUP (.Lbyte_loop , .Lbyte_loop) LC0=P2;
+.Lbyte_loop:
+ B[P0++] = R1;
-finished:
+.Lfinished:
RTS;
-force_align:
- CC = BITTST (R0, 0 ); // odd byte
+.Lforce_align:
+ CC = BITTST (R0, 0); /* odd byte */
R0 = 4;
R0 = R0 - R2;
P1 = R0;
- R0 = P0; // Recover return address
- IF !CC JUMP skip1;
+ R0 = P0; /* Recover return address */
+ IF !CC JUMP .Lskip1;
B[P0++] = R1;
-skip1:
- CC = R2 <= 2; // 2 bytes
- P2 -= P1; // reduce count
- IF !CC JUMP aligned;
+.Lskip1:
+ CC = R2 <= 2; /* 2 bytes */
+ P2 -= P1; /* reduce count */
+ IF !CC JUMP .Laligned;
B[P0++] = R1;
B[P0++] = R1;
- JUMP aligned;
+ JUMP .Laligned;
+
.size _memset,.-_memset
libc_hidden_def (memset)
diff --git a/libc/string/bfin/strcmp.S b/libc/string/bfin/strcmp.S
index 6365024ec..12e8c53c6 100644
--- a/libc/string/bfin/strcmp.S
+++ b/libc/string/bfin/strcmp.S
@@ -1,5 +1,5 @@
/* strcmp.S
- * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved.
+ * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved.
*
* This file is subject to the terms and conditions of the GNU Library General
* Public License. See the file "COPYING.LIB" in the main directory of this
@@ -9,6 +9,8 @@
* http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html
*/
+#include <sysdep.h>
+
/* Fast strcmp() for Blackfin.
* When both strings are aligned, this processes four characters at
* a time. Uses a hw loop with "very big" count to loop "forever",
@@ -21,9 +23,8 @@
.align 2
-.global _strcmp
-.type _strcmp, STT_FUNC
-_strcmp:
+.weak _strcmp
+ENTRY(_strcmp)
[--sp] = (R7:4);
p1 = r0;
p2 = r1;
@@ -34,13 +35,13 @@ _strcmp:
r0 = r0 | r1; // check both pointers at same time
r0 <<= 30; // dump all but last 2 bits
cc = az; // are they zero?
- if !cc jump unaligned; // no; use unaligned code.
+ if !cc jump .Lunaligned; // no; use unaligned code.
// fall-thru for aligned case..
// note that r0 is zero from the previous...
// p0 set to -1
- lsetup (beginloop, endloop) lc0=p0;
+ LSETUP (.Lbeginloop, .Lendloop) lc0=p0;
// pick up first words
r1 = [p1++];
r2 = [p2++];
@@ -49,8 +50,8 @@ _strcmp:
r7.h = 0xFF;
// loop : 9 cycles to check 4 characters
cc = r1 == r2;
-beginloop:
- if !cc jump notequal4; // compare failure, exit loop
+.Lbeginloop:
+ if !cc jump .Lnotequal4; // compare failure, exit loop
// starting with 44332211
// see if char 3 or char 1 is 0
@@ -63,18 +64,18 @@ beginloop:
// add to zero, and (r1 is free, reload)
r6 = r3 +|+ r0 || r1 = [p1++] || nop;
cc |= az; // true if either is zero
- if cc jump zero4; // leave if a zero somewhere
-endloop:
+ if cc jump .Lzero4; // leave if a zero somewhere
+.Lendloop:
cc = r1 == r2;
// loop exits
-notequal4: // compare failure on 4-char compare
+.Lnotequal4: // compare failure on 4-char compare
// address pointers are one word ahead;
// faster to use zero4 exit code
p1 += 4;
p2 += 4;
-zero4: // one of the bytes in word 1 is zero
+.Lzero4: // one of the bytes in word 1 is zero
// but we've already fetched the next word; so
// backup two to look at failing word again
p1 += -8;
@@ -85,27 +86,27 @@ zero4: // one of the bytes in word 1 is zero
// here when pointers are unaligned: checks one
// character at a time. Also use at the end of
// the word-check algorithm to figure out what happened
-unaligned:
+.Lunaligned:
// R0 is non-zero from before.
// p0 set to -1
r0 = 0 (Z);
r1 = B[p1++] (Z);
r2 = B[p2++] (Z);
- lsetup (beginloop1, endloop1) lc0=p0;
+ LSETUP (.Lbeginloop1, .Lendloop1) lc0=p0;
-beginloop1:
+.Lbeginloop1:
cc = r1; // first char must be non-zero
// chars must be the same
r3 = r2 - r1 (NS) || r1 = B[p1++] (Z) || nop;
cc &= az;
r3 = r0 - r2; // second char must be non-zero
cc &= an;
- if !cc jump exitloop1;
-endloop1:
+ if !cc jump .Lexitloop1;
+.Lendloop1:
r2 = B[p2++] (Z);
-exitloop1: // here means we found a zero or a difference.
+.Lexitloop1: // here means we found a zero or a difference.
// we have r2(N), p2(N), r1(N+1), p1(N+2)
r1=B[p1+ -2] (Z);
r0 = r1 - r2;
@@ -116,6 +117,6 @@ exitloop1: // here means we found a zero or a difference.
libc_hidden_def (strcmp)
#ifndef __UCLIBC_HAS_LOCALE__
-strong_alias (strcmp,strcoll)
+weak_alias (strcmp,strcoll)
libc_hidden_def (strcoll)
#endif
diff --git a/libc/string/cris/memcpy.c b/libc/string/cris/memcpy.c
index a85108109..0cce37a30 100644
--- a/libc/string/cris/memcpy.c
+++ b/libc/string/cris/memcpy.c
@@ -66,7 +66,7 @@
void *memcpy(void *, const void *, unsigned int);
-libc_hidden_proto(memcpy)
+/* Experimentally off - libc_hidden_proto(memcpy) */
void *memcpy(void *pdst,
const void *psrc,
unsigned int pn)
@@ -130,7 +130,7 @@ void *memcpy(void *pdst,
here (beware: they may be moved to temporary registers).
This way, we do not have to save/move the registers around into
temporaries; we can safely use them straight away. */
- __asm__ volatile ("\
+ __asm__ __volatile__ ("\
.syntax no_register_prefix \n\
\n\
;; Check that the register asm declaration got right. \n\
diff --git a/libc/string/cris/memmove.c b/libc/string/cris/memmove.c
index 437637078..b6620afe0 100644
--- a/libc/string/cris/memmove.c
+++ b/libc/string/cris/memmove.c
@@ -27,7 +27,7 @@
#include "memcopy.h"
#include "../generic/pagecopy.h"
-libc_hidden_proto(memmove)
+/* Experimentally off - libc_hidden_proto(memmove) */
void *memmove (void *dest, const void *src, size_t len)
{
unsigned long int dstp = (long int) dest;
diff --git a/libc/string/cris/memset.c b/libc/string/cris/memset.c
index 7e71bc50f..9cc959a33 100644
--- a/libc/string/cris/memset.c
+++ b/libc/string/cris/memset.c
@@ -59,7 +59,7 @@
void *memset(void *, int, unsigned long);
-libc_hidden_proto(memset)
+/* Experimentally off - libc_hidden_proto(memset) */
void *memset(void *pdst,
int c,
unsigned long plen)
@@ -124,7 +124,7 @@ void *memset(void *pdst,
here (beware: they may be moved to temporary registers).
This way, we do not have to save/move the registers around into
temporaries; we can safely use them straight away. */
- __asm__ volatile (" \n\
+ __asm__ __volatile__ (" \n\
.syntax no_register_prefix \n\
\n\
;; Check that the register asm declaration got right. \n\
diff --git a/libc/string/cris/strcpy.c b/libc/string/cris/strcpy.c
index 0af25253e..955a990b7 100644
--- a/libc/string/cris/strcpy.c
+++ b/libc/string/cris/strcpy.c
@@ -6,7 +6,7 @@
#include <string.h>
-libc_hidden_proto(strcpy)
+/* Experimentally off - libc_hidden_proto(strcpy) */
char *strcpy(char *dest, const char *src)
{
char *ret = dest;
diff --git a/libc/string/cris/strncpy.c b/libc/string/cris/strncpy.c
index 93a6608bc..3f2775bdd 100644
--- a/libc/string/cris/strncpy.c
+++ b/libc/string/cris/strncpy.c
@@ -6,9 +6,9 @@
#include <string.h>
-libc_hidden_proto(memset)
+/* Experimentally off - libc_hidden_proto(memset) */
-libc_hidden_proto(strncpy)
+/* Experimentally off - libc_hidden_proto(strncpy) */
char *strncpy(char *dest, const char *src, size_t count)
{
char *ret = dest;
diff --git a/libc/string/frv/memset.S b/libc/string/frv/memset.S
index 4e64550e4..477597dcd 100644
--- a/libc/string/frv/memset.S
+++ b/libc/string/frv/memset.S
@@ -155,4 +155,4 @@ memset:
bralr
.size memset, .-memset
-libc_hidden_proto(memset)
+/* Experimentally off - libc_hidden_proto(memset) */
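
/* Context for the recurring "Experimentally off" edits here and in the C
   files below: in uClibc, libc_hidden_proto(name) redirects in-library
   references to a hidden alias (__GI_name) so internal calls bypass the PLT
   and cannot be interposed; commenting it out makes these arch-specific
   files reference the public symbol again. Roughly, and much simplified --
   the real macros live in include/libc-symbols.h, this is only a sketch of
   the mechanism, not the actual header: */

#define hidden_proto_sketch(name) \
    extern __typeof(name) name __asm__("__GI_" #name) \
        __attribute__((visibility("hidden")));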
diff --git a/libc/string/i386/memchr.c b/libc/string/i386/memchr.c
index 229d42919..fe4537914 100644
--- a/libc/string/i386/memchr.c
+++ b/libc/string/i386/memchr.c
@@ -32,7 +32,7 @@
#include <string.h>
-libc_hidden_proto(memchr)
+/* Experimentally off - libc_hidden_proto(memchr) */
void *memchr(const void *cs, int c, size_t count)
{
int d0;
diff --git a/libc/string/i386/memcpy.c b/libc/string/i386/memcpy.c
index a2b8d3d8c..285583f3b 100644
--- a/libc/string/i386/memcpy.c
+++ b/libc/string/i386/memcpy.c
@@ -32,7 +32,7 @@
#include <string.h>
-libc_hidden_proto(memcpy)
+/* Experimentally off - libc_hidden_proto(memcpy) */
void *memcpy(void * to, const void * from, size_t n)
{
int d0, d1, d2;
diff --git a/libc/string/i386/memmove.c b/libc/string/i386/memmove.c
index a26fe2be1..a924efcbc 100644
--- a/libc/string/i386/memmove.c
+++ b/libc/string/i386/memmove.c
@@ -32,7 +32,7 @@
#include <string.h>
-libc_hidden_proto(memmove)
+/* Experimentally off - libc_hidden_proto(memmove) */
void *memmove(void *dest, const void *src, size_t n)
{
int d0, d1, d2;
diff --git a/libc/string/i386/memset.c b/libc/string/i386/memset.c
index eea48040a..bbaa45215 100644
--- a/libc/string/i386/memset.c
+++ b/libc/string/i386/memset.c
@@ -32,7 +32,7 @@
#include <string.h>
-libc_hidden_proto(memset)
+/* Experimentally off - libc_hidden_proto(memset) */
void *memset(void *s, int c, size_t count)
{
int d0, d1;
diff --git a/libc/string/i386/strcat.c b/libc/string/i386/strcat.c
index e0b1f3b51..2cf0237a6 100644
--- a/libc/string/i386/strcat.c
+++ b/libc/string/i386/strcat.c
@@ -32,7 +32,7 @@
#include <string.h>
-libc_hidden_proto(strcat)
+/* Experimentally off - libc_hidden_proto(strcat) */
char *strcat(char * dest, const char * src)
{
int d0, d1, d2, d3;
diff --git a/libc/string/i386/strchr.c b/libc/string/i386/strchr.c
index 7568d48db..46b1dfb6e 100644
--- a/libc/string/i386/strchr.c
+++ b/libc/string/i386/strchr.c
@@ -32,7 +32,7 @@
#include <string.h>
-libc_hidden_proto(strchr)
+/* Experimentally off - libc_hidden_proto(strchr) */
char *strchr(const char *s, int c)
{
int d0;
diff --git a/libc/string/i386/strcmp.c b/libc/string/i386/strcmp.c
index 47635d817..eff230c5c 100644
--- a/libc/string/i386/strcmp.c
+++ b/libc/string/i386/strcmp.c
@@ -32,7 +32,7 @@
#include <string.h>
-libc_hidden_proto(strcmp)
+/* Experimentally off - libc_hidden_proto(strcmp) */
int strcmp(const char *cs, const char *ct)
{
int d0, d1;
@@ -55,7 +55,7 @@ int strcmp(const char *cs, const char *ct)
libc_hidden_def(strcmp)
#ifndef __UCLIBC_HAS_LOCALE__
-libc_hidden_proto(strcoll)
+/* Experimentally off - libc_hidden_proto(strcoll) */
strong_alias(strcmp,strcoll)
libc_hidden_def(strcoll)
#endif
diff --git a/libc/string/i386/strcpy.c b/libc/string/i386/strcpy.c
index 9e2b81009..09065a9b7 100644
--- a/libc/string/i386/strcpy.c
+++ b/libc/string/i386/strcpy.c
@@ -32,7 +32,7 @@
#include <string.h>
-libc_hidden_proto(strcpy)
+/* Experimentally off - libc_hidden_proto(strcpy) */
char *strcpy(char * dest, const char * src)
{
int d0, d1, d2;
diff --git a/libc/string/i386/strlen.c b/libc/string/i386/strlen.c
index f0767b600..61a178393 100644
--- a/libc/string/i386/strlen.c
+++ b/libc/string/i386/strlen.c
@@ -32,7 +32,7 @@
#include <string.h>
-libc_hidden_proto(strlen)
+/* Experimentally off - libc_hidden_proto(strlen) */
size_t strlen(const char *s)
{
int d0;
diff --git a/libc/string/i386/strncat.c b/libc/string/i386/strncat.c
index c1061421e..5849db3b3 100644
--- a/libc/string/i386/strncat.c
+++ b/libc/string/i386/strncat.c
@@ -32,7 +32,7 @@
#include <string.h>
-libc_hidden_proto(strncat)
+/* Experimentally off - libc_hidden_proto(strncat) */
char *strncat(char * dest,
const char * src, size_t count)
{
diff --git a/libc/string/i386/strncmp.c b/libc/string/i386/strncmp.c
index d716789c3..a14bb503b 100644
--- a/libc/string/i386/strncmp.c
+++ b/libc/string/i386/strncmp.c
@@ -32,7 +32,7 @@
#include <string.h>
-libc_hidden_proto(strncmp)
+/* Experimentally off - libc_hidden_proto(strncmp) */
int strncmp(const char *cs, const char *ct, size_t count)
{
register int __res;
diff --git a/libc/string/i386/strncpy.c b/libc/string/i386/strncpy.c
index c061fe37e..76aa6ae1b 100644
--- a/libc/string/i386/strncpy.c
+++ b/libc/string/i386/strncpy.c
@@ -32,7 +32,7 @@
#include <string.h>
-libc_hidden_proto(strncpy)
+/* Experimentally off - libc_hidden_proto(strncpy) */
char *strncpy(char * dest, const char * src, size_t count)
{
int d0, d1, d2, d3;
diff --git a/libc/string/i386/strnlen.c b/libc/string/i386/strnlen.c
index 77b5c7568..02c72f530 100644
--- a/libc/string/i386/strnlen.c
+++ b/libc/string/i386/strnlen.c
@@ -33,7 +33,7 @@
#include <string.h>
#ifdef __USE_GNU
-libc_hidden_proto(strnlen)
+/* Experimentally off - libc_hidden_proto(strnlen) */
size_t strnlen(const char *s, size_t count)
{
int d0;
diff --git a/libc/string/i386/strrchr.c b/libc/string/i386/strrchr.c
index e3b2df6fb..ef378685b 100644
--- a/libc/string/i386/strrchr.c
+++ b/libc/string/i386/strrchr.c
@@ -32,7 +32,7 @@
#include <string.h>
-libc_hidden_proto(strrchr)
+/* Experimentally off - libc_hidden_proto(strrchr) */
char *strrchr(const char *s, int c)
{
int d0, d1;
diff --git a/libc/string/ia64/memcpy.S b/libc/string/ia64/memcpy.S
index db019f860..810eb0c0e 100644
--- a/libc/string/ia64/memcpy.S
+++ b/libc/string/ia64/memcpy.S
@@ -115,7 +115,7 @@
#if defined(USE_LFETCH)
#define LOOP(shift) \
ALIGN(32); \
-.loop##shift##: \
+.loop##shift : \
{ .mmb \
(p[0]) ld8.nt1 r[0] = [asrc], 8 ; \
(p[0]) lfetch.nt1 [ptr1], 16 ; \
@@ -139,7 +139,7 @@
#else
#define LOOP(shift) \
ALIGN(32); \
-.loop##shift##: \
+.loop##shift : \
{ .mmb \
(p[0]) ld8.nt1 r[0] = [asrc], 8 ; \
nop.b 0 ; \
diff --git a/libc/string/ia64/memmove.S b/libc/string/ia64/memmove.S
index 0328f84de..00342d8e0 100644
--- a/libc/string/ia64/memmove.S
+++ b/libc/string/ia64/memmove.S
@@ -64,7 +64,7 @@
#define LOOP(shift) \
ALIGN(32); \
-.loop##shift##: \
+.loop##shift : \
(p[0]) ld8 r[0] = [asrc], 8 ; /* w1 */ \
(p[MEMLAT+1]) st8 [dest] = value, 8 ; \
(p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; \
diff --git a/libc/string/ia64/sysdep.h b/libc/string/ia64/sysdep.h
index 03e74360d..d10020ac1 100644
--- a/libc/string/ia64/sysdep.h
+++ b/libc/string/ia64/sysdep.h
@@ -34,7 +34,7 @@
#define ASM_UNW_PRLG_GRSAVE(ninputs) (32+(ninputs))
#ifdef __STDC__
-#define C_LABEL(name) name##:
+#define C_LABEL(name) name :
#else
#define C_LABEL(name) name/**/:
#endif
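These ia64 hunks all fix the same preprocessor bug: `##` must paste its operands into a single valid preprocessing token, and an identifier glued to `:` is not one, so newer GCC/cpp versions reject `name##:`. Writing the two tokens side by side produces the same assembler text with no pasting at all. A compilable toy illustration (C labels happen to have the same shape; in the real tree the macro is expanded in .S files run through cpp):

#include <stdio.h>

#define C_LABEL_BAD(name) name##:   /* error if ever expanded: "finish" and ":" cannot paste */
#define C_LABEL(name)     name :    /* plain juxtaposition, same expansion, no paste */

int main(void)
{
    goto finish;                    /* give the label a use */
C_LABEL(finish)
    puts("label reached");
    return 0;
}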
diff --git a/libc/string/powerpc/memcpy.c b/libc/string/powerpc/memcpy.c
index ed8022313..bcbb806f8 100644
--- a/libc/string/powerpc/memcpy.c
+++ b/libc/string/powerpc/memcpy.c
@@ -21,7 +21,7 @@
#include <string.h>
-libc_hidden_proto(memcpy)
+/* Experimentally off - libc_hidden_proto(memcpy) */
void *memcpy(void *to, const void *from, size_t n)
/* PPC can do pre increment and load/store, but not post increment and load/store.
Therefore use *++ptr instead of *ptr++. */
diff --git a/libc/string/powerpc/memmove.c b/libc/string/powerpc/memmove.c
index 327161116..7a4a7e5ff 100644
--- a/libc/string/powerpc/memmove.c
+++ b/libc/string/powerpc/memmove.c
@@ -21,9 +21,9 @@
#include <string.h>
-libc_hidden_proto(memcpy)
+/* Experimentally off - libc_hidden_proto(memcpy) */
-libc_hidden_proto(memmove)
+/* Experimentally off - libc_hidden_proto(memmove) */
void *memmove(void *to, const void *from, size_t n)
{
unsigned long rem, chunks, tmp1, tmp2;
diff --git a/libc/string/powerpc/memset.c b/libc/string/powerpc/memset.c
index 891e0b8aa..d62ec0ee0 100644
--- a/libc/string/powerpc/memset.c
+++ b/libc/string/powerpc/memset.c
@@ -21,14 +21,14 @@
#include <string.h>
-libc_hidden_proto(memset)
+/* Experimentally off - libc_hidden_proto(memset) */
static inline int expand_byte_word(int c){
/* this does:
c = c << 8 | c;
c = c << 16 | c ;
*/
- asm("rlwimi %0,%0,8,16,23\n"
+ __asm__("rlwimi %0,%0,8,16,23\n"
"\trlwimi %0,%0,16,0,15\n"
: "=r" (c) : "0" (c));
return c;
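Two separate fixes in this hunk: `asm` becomes `__asm__`, the spelling that survives `-std=c99`/`-ansi` builds where the bare `asm` keyword is disabled, while the rlwimi pair is unchanged and still performs the byte broadcast the comment describes. The same expansion in portable C (a sketch; assumes c is already masked to 0..255):

#include <stdint.h>

/* Broadcast the low byte into all four bytes of a 32-bit word,
   matching the c = c<<8|c; c = c<<16|c; steps done by rlwimi. */
static inline uint32_t expand_byte_word_c(uint32_t c)
{
    c |= c << 8;    /* 0x000000AB -> 0x0000ABAB */
    c |= c << 16;   /* 0x0000ABAB -> 0xABABABAB */
    return c;
}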
diff --git a/libc/string/sparc/_glibc_inc.h b/libc/string/sparc/_glibc_inc.h
index 4eb4d755c..e0aef52c2 100644
--- a/libc/string/sparc/_glibc_inc.h
+++ b/libc/string/sparc/_glibc_inc.h
@@ -6,6 +6,8 @@
#include <features.h>
#include <bits/wordsize.h>
+/* Is alignment really needed? */
+
#if __WORDSIZE == 32
# define ENTRY_ALIGN 4
#else
diff --git a/libc/string/sparc/sparc32/sparcv9b/memchr.S b/libc/string/sparc/sparc32/sparcv9b/memchr.S
index 7e86a2972..43a16ff11 100644
--- a/libc/string/sparc/sparc32/sparcv9b/memchr.S
+++ b/libc/string/sparc/sparc32/sparcv9b/memchr.S
@@ -1,4 +1,4 @@
#define ASI_PNF 0x82
#define ASI_BLK_P 0xf0
#define XCC icc
-#include "sparc64/memchr.S"
+#include "../../sparc64/memchr.S"
diff --git a/libc/string/sparc/sparc32/sparcv9b/memcpy.S b/libc/string/sparc/sparc32/sparcv9b/memcpy.S
index 7f697542e..2024869dd 100644
--- a/libc/string/sparc/sparc32/sparcv9b/memcpy.S
+++ b/libc/string/sparc/sparc32/sparcv9b/memcpy.S
@@ -1,4 +1,4 @@
#define ASI_PNF 0x82
#define ASI_BLK_P 0xf0
#define XCC icc
-#include "sparc64/sparcv9b/memcpy.S"
+#include "../../sparc64/sparcv9b/memcpy.S"
diff --git a/libc/string/sparc/sparc32/sparcv9b/memset.S b/libc/string/sparc/sparc32/sparcv9b/memset.S
index ac67b7ab7..e49173172 100644
--- a/libc/string/sparc/sparc32/sparcv9b/memset.S
+++ b/libc/string/sparc/sparc32/sparcv9b/memset.S
@@ -1,4 +1,4 @@
#define ASI_PNF 0x82
#define ASI_BLK_P 0xf0
#define XCC icc
-#include <sparc64/memset.S>
+#include "../../sparc64/memset.S"
diff --git a/libc/string/sparc/sparc32/sparcv9b/stpcpy.S b/libc/string/sparc/sparc32/sparcv9b/stpcpy.S
index 440ad7e21..17ffa5e4d 100644
--- a/libc/string/sparc/sparc32/sparcv9b/stpcpy.S
+++ b/libc/string/sparc/sparc32/sparcv9b/stpcpy.S
@@ -1,4 +1,4 @@
#define ASI_PNF 0x82
#define ASI_BLK_P 0xf0
#define XCC icc
-#include <sparc64/stpcpy.S>
+#include "../../sparc64/stpcpy.S"
diff --git a/libc/string/sparc/sparc32/sparcv9b/strcat.S b/libc/string/sparc/sparc32/sparcv9b/strcat.S
index 7a2223570..9ed125a4b 100644
--- a/libc/string/sparc/sparc32/sparcv9b/strcat.S
+++ b/libc/string/sparc/sparc32/sparcv9b/strcat.S
@@ -1,4 +1,4 @@
#define ASI_PNF 0x82
#define ASI_BLK_P 0xf0
#define XCC icc
-#include <sparc64/strcat.S>
+#include "../../sparc64/strcat.S"
diff --git a/libc/string/sparc/sparc32/sparcv9b/strchr.S b/libc/string/sparc/sparc32/sparcv9b/strchr.S
index ddd32120d..6b2727a1f 100644
--- a/libc/string/sparc/sparc32/sparcv9b/strchr.S
+++ b/libc/string/sparc/sparc32/sparcv9b/strchr.S
@@ -1,4 +1,4 @@
#define ASI_PNF 0x82
#define ASI_BLK_P 0xf0
#define XCC icc
-#include <sparc64/strchr.S>
+#include "../../sparc64/strchr.S"
diff --git a/libc/string/sparc/sparc32/sparcv9b/strcmp.S b/libc/string/sparc/sparc32/sparcv9b/strcmp.S
index 5330f4359..854403ffd 100644
--- a/libc/string/sparc/sparc32/sparcv9b/strcmp.S
+++ b/libc/string/sparc/sparc32/sparcv9b/strcmp.S
@@ -1,4 +1,4 @@
#define ASI_PNF 0x82
#define ASI_BLK_P 0xf0
#define XCC icc
-#include <sparc64/strcmp.S>
+#include "../../sparc64/strcmp.S"
diff --git a/libc/string/sparc/sparc32/sparcv9b/strcpy.S b/libc/string/sparc/sparc32/sparcv9b/strcpy.S
index 0b35c9be0..e8102bde4 100644
--- a/libc/string/sparc/sparc32/sparcv9b/strcpy.S
+++ b/libc/string/sparc/sparc32/sparcv9b/strcpy.S
@@ -1,4 +1,4 @@
#define ASI_PNF 0x82
#define ASI_BLK_P 0xf0
#define XCC icc
-#include <sparc64/strcpy.S>
+#include "../../sparc64/strcpy.S"
diff --git a/libc/string/sparc/sparc32/sparcv9b/strlen.S b/libc/string/sparc/sparc32/sparcv9b/strlen.S
index b8f4dba4f..8673333a2 100644
--- a/libc/string/sparc/sparc32/sparcv9b/strlen.S
+++ b/libc/string/sparc/sparc32/sparcv9b/strlen.S
@@ -1,4 +1,4 @@
#define ASI_PNF 0x82
#define ASI_BLK_P 0xf0
#define XCC icc
-#include <sparc64/strlen.S>
+#include "../../sparc64/strlen.S"
diff --git a/libc/string/x86_64/_glibc_inc.h b/libc/string/x86_64/_glibc_inc.h
index 88cef2ea3..415ce90a7 100644
--- a/libc/string/x86_64/_glibc_inc.h
+++ b/libc/string/x86_64/_glibc_inc.h
@@ -6,15 +6,8 @@
#include <features.h>
#include <bits/wordsize.h>
-#if __WORDSIZE == 32
-# define ENTRY_ALIGN 4
-#else
-# define ENTRY_ALIGN 2
-#endif
-
#define ENTRY(sym) \
.global sym; \
- .align ENTRY_ALIGN; \
.type sym,%function; \
sym:
diff --git a/libc/string/x86_64/memcpy.S b/libc/string/x86_64/memcpy.S
index b3bb0f96c..697b992d0 100644
--- a/libc/string/x86_64/memcpy.S
+++ b/libc/string/x86_64/memcpy.S
@@ -59,9 +59,9 @@ ENTRY (BP_SYM (memcpy))
subq $32, %rcx
js 2f
- .p2align 4
+ /* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+ .p2align 4,,11
3:
-
/* Now correct the loop counter. Please note that in the following
code the flags are not changed anymore. */
subq $32, %rcx
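This and the later x86_64 hunks replace bare `.p2align 4` with the three-operand form: align to 2^4 = 16 bytes, but only if at most the given number of padding bytes is needed. Pairing the limit with the byte count of the loop body is deliberate: for an N-byte body, `.p2align 4,,N` means the body either starts at a 16-byte boundary or already fits inside the current 16-byte line, so it never straddles a fetch/decode boundary. A stand-alone sketch (the label and instructions are illustrative, not copied from the patch; compiles with gcc -c on x86-64):

__asm__(
    ".text\n"
    /* pad to a 16-byte boundary only when <= 11 bytes of padding suffice */
    ".p2align 4,,11\n"
    "1:\n"
    "subq $32, %rcx\n"
    "jns 1b\n"
);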
diff --git a/libc/string/x86_64/memset.S b/libc/string/x86_64/memset.S
index d72d74468..46751006b 100644
--- a/libc/string/x86_64/memset.S
+++ b/libc/string/x86_64/memset.S
@@ -53,15 +53,17 @@ ENTRY (memset)
imul %rax,%r8
#endif
test $0x7,%edi /* Check for alignment. */
- je 2f
+ jz 2f
- .p2align 4
-1: /* Align ptr to 8 byte. */
+ /* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+ .p2align 4,,9
+1:
+ /* Align ptr to 8 byte. */
mov %sil,(%rcx)
dec %rdx
inc %rcx
- test $0x7,%ecx
- jne 1b
+ test $0x7,%cl
+ jnz 1b
2: /* Check for really large regions. */
mov %rdx,%rax
@@ -70,8 +72,10 @@ ENTRY (memset)
cmp LARGE, %rdx
jae 11f
- .p2align 4
-3: /* Copy 64 bytes. */
+ /* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+ .p2align 4,,11
+3:
+ /* Fill 64 bytes. */
mov %r8,(%rcx)
mov %r8,0x8(%rcx)
mov %r8,0x10(%rcx)
@@ -84,7 +88,7 @@ ENTRY (memset)
dec %rax
jne 3b
-4: /* Copy final bytes. */
+4: /* Fill final bytes. */
and $0x3f,%edx
mov %rdx,%rax
shr $0x3,%rax
@@ -107,16 +111,18 @@ ENTRY (memset)
jne 8b
9:
#if BZERO_P
- nop
+ /* nothing */
#else
/* Load result (only if used as memset). */
mov %rdi,%rax /* start address of destination is result */
#endif
retq
- .p2align 4
-11: /* Copy 64 bytes without polluting the cache. */
- /* We could use movntdq %xmm0,(%rcx) here to further
+ /* Next 3 insns are 14 bytes total, make sure we decode them in one go */
+ .p2align 4,,14
+11:
+ /* Fill 64 bytes without polluting the cache. */
+ /* We could use movntdq %xmm0,(%rcx) here to further
speed up for large cases but let's not use XMM registers. */
movnti %r8,(%rcx)
movnti %r8,0x8(%rcx)
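The `11:` path taken for regions of at least LARGE bytes switches to movnti, a non-temporal store that writes around the cache so a huge fill does not evict the caller's working set; the comment notes movntdq could be faster still but was avoided to keep XMM registers out of it. The same idea via the MOVNTI intrinsic (a sketch: the function name is hypothetical, and alignment plus fence placement are the caller's problem in real code):

#include <emmintrin.h>
#include <stdint.h>
#include <stddef.h>

/* Fill nblocks 64-byte blocks with 8-byte non-temporal stores (x86-64).
   dst must be 8-byte aligned. */
static void fill_blocks_nt(char *dst, uint64_t pattern, size_t nblocks)
{
    for (size_t i = 0; i < nblocks; i++, dst += 64)
        for (int off = 0; off < 64; off += 8)
            _mm_stream_si64((long long *)(dst + off), (long long)pattern);
    _mm_sfence();   /* order the NT stores before later normal accesses */
}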
diff --git a/libc/string/x86_64/strcat.S b/libc/string/x86_64/strcat.S
index 9b0068981..23d068fea 100644
--- a/libc/string/x86_64/strcat.S
+++ b/libc/string/x86_64/strcat.S
@@ -21,6 +21,7 @@
#include "_glibc_inc.h"
+/* Seems to be unrolled too much */
.text
ENTRY (BP_SYM (strcat))
@@ -44,7 +45,9 @@ ENTRY (BP_SYM (strcat))
/* Now the source is aligned. Scan for NUL byte. */
- .p2align 4
+
+ /* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+ .p2align 4,,10
4:
/* First unroll. */
movq (%rax), %rcx /* get double word (= 8 bytes) in question */
@@ -102,8 +105,11 @@ ENTRY (BP_SYM (strcat))
the addition will not result in 0. */
jz 4b /* no NUL found => continue loop */
- .p2align 4 /* Align, it's a jump target. */
-3: subq $8,%rax /* correct pointer increment. */
+ /* Align, it is a jump target. */
+ /* Next 3 insns are 8 bytes total, make sure we decode them in one go */
+ .p2align 3,,8
+3:
+ subq $8,%rax /* correct pointer increment. */
testb %cl, %cl /* is first byte NUL? */
jz 2f /* yes => return */
@@ -159,7 +165,9 @@ ENTRY (BP_SYM (strcat))
/* Now the sources is aligned. Unfortunatly we cannot force
to have both source and destination aligned, so ignore the
alignment of the destination. */
- .p2align 4
+
+ /* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+ .p2align 4,,10
22:
/* 1st unroll. */
movq (%rsi), %rax /* Read double word (8 bytes). */
@@ -236,7 +244,9 @@ ENTRY (BP_SYM (strcat))
/* Do the last few bytes. %rax contains the value to write.
The loop is unrolled twice. */
- .p2align 4
+
+ /* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+ .p2align 3,,6
23:
movb %al, (%rdx) /* 1st byte. */
testb %al, %al /* Is it NUL. */
diff --git a/libc/string/x86_64/strchr.S b/libc/string/x86_64/strchr.S
index 8e59c4c19..9ef46b7f2 100644
--- a/libc/string/x86_64/strchr.S
+++ b/libc/string/x86_64/strchr.S
@@ -20,6 +20,7 @@
#include "_glibc_inc.h"
+/* Seems to be unrolled too much */
.text
ENTRY (BP_SYM (strchr))
@@ -91,7 +92,8 @@ ENTRY (BP_SYM (strchr))
each of whose bytes is C. This turns each byte that is C
into a zero. */
- .p2align 4
+ /* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+ .p2align 4,,10
4:
/* Main Loop is unrolled 4 times. */
/* First unroll. */
@@ -229,8 +231,11 @@ ENTRY (BP_SYM (strchr))
reversed. */
- .p2align 4 /* Align, it's a jump target. */
-3: movq %r9,%rdx /* move to %rdx so that we can access bytes */
+ /* Align, it's a jump target. */
+ /* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+ .p2align 4,,9
+3:
+ movq %r9,%rdx /* move to %rdx so that we can access bytes */
subq $8,%rax /* correct pointer increment. */
testb %cl, %cl /* is first byte C? */
jz 6f /* yes => return pointer */
@@ -280,7 +285,7 @@ ENTRY (BP_SYM (strchr))
incq %rax
6:
- nop
+ /* nop - huh?? */
retq
END (BP_SYM (strchr))
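The context above ("a value ... each of whose bytes is C. This turns each byte that is C into a zero") is the classic XOR match trick: replicate the wanted byte across a word, XOR with the data, and every matching byte becomes 0x00, which the standard zero-byte test then spots. In C (a sketch; the assembly arranges the same test differently around its magic constant):

#include <stdint.h>

/* Nonzero iff some byte of v equals c. */
static inline int word_has_byte(uint64_t v, unsigned char c)
{
    uint64_t x = v ^ (0x0101010101010101ULL * c);   /* matches become 0x00 */
    return ((x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL) != 0;
}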
diff --git a/libc/string/x86_64/strcpy.S b/libc/string/x86_64/strcpy.S
index d9a51b0bb..612a30d1a 100644
--- a/libc/string/x86_64/strcpy.S
+++ b/libc/string/x86_64/strcpy.S
@@ -20,6 +20,8 @@
#include "_glibc_inc.h"
+/* Seems to be unrolled too much */
+
#ifndef USE_AS_STPCPY
# define STRCPY strcpy
#endif
@@ -51,7 +53,9 @@ ENTRY (BP_SYM (STRCPY))
/* Now the sources is aligned. Unfortunatly we cannot force
to have both source and destination aligned, so ignore the
alignment of the destination. */
- .p2align 4
+
+ /* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+ .p2align 4,,10
1:
/* 1st unroll. */
movq (%rsi), %rax /* Read double word (8 bytes). */
@@ -128,7 +132,9 @@ ENTRY (BP_SYM (STRCPY))
/* Do the last few bytes. %rax contains the value to write.
The loop is unrolled twice. */
- .p2align 4
+
+ /* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+ .p2align 3,,6
3:
/* Note that stpcpy needs to return with the value of the NUL
byte. */
diff --git a/libc/string/x86_64/strcspn.S b/libc/string/x86_64/strcspn.S
index fed12b5f6..fd9b09c48 100644
--- a/libc/string/x86_64/strcspn.S
+++ b/libc/string/x86_64/strcspn.S
@@ -25,6 +25,8 @@
#include "_glibc_inc.h"
+/* Seems to be unrolled too much */
+
/* BEWARE: `#ifdef strcspn' means that strcspn is redefined as `strpbrk' */
#define STRPBRK_P (defined strcspn)
@@ -53,26 +55,28 @@ ENTRY (strcspn)
Although all the following instruction only modify %cl we always
have a correct zero-extended 64-bit value in %rcx. */
- .p2align 4
+ /* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+ .p2align 3,,6
+
L(2): movb (%rax), %cl /* get byte from skipset */
testb %cl, %cl /* is NUL char? */
jz L(1) /* yes => start compare loop */
movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
movb 1(%rax), %cl /* get byte from skipset */
- testb $0xff, %cl /* is NUL char? */
+ testb %cl, %cl /* is NUL char? */
jz L(1) /* yes => start compare loop */
movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
movb 2(%rax), %cl /* get byte from skipset */
- testb $0xff, %cl /* is NUL char? */
+ testb %cl, %cl /* is NUL char? */
jz L(1) /* yes => start compare loop */
movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
movb 3(%rax), %cl /* get byte from skipset */
addq $4, %rax /* increment skipset pointer */
movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
- testb $0xff, %cl /* is NUL char? */
+ testb %cl, %cl /* is NUL char? */
jnz L(2) /* no => process next dword from skipset */
L(1): leaq -4(%rdx), %rax /* prepare loop */
@@ -86,7 +90,13 @@ L(1): leaq -4(%rdx), %rax /* prepare loop */
value in the table. But the value of NUL is NUL so the loop
terminates for NUL in every case. */
- .p2align 4
+ /* Next 3 insns are 9 bytes total. */
+ /* .p2align 4,,9 would make sure we decode them in one go, */
+ /* but it will also align entire function to 16 bytes, */
+ /* potentially creating largish padding at link time. */
+ /* We are aligning to 8 bytes instead: */
+ .p2align 3,,8
+
L(3): addq $4, %rax /* adjust pointer for full loop round */
movb (%rax), %cl /* get byte from string */
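The comment block above describes the table scheme both strcspn and strspn use: a 256-entry table on the stack in which every skipset byte marks its own slot, so the scan loop costs one indexed test per character, and slot 0 stays clear so NUL always terminates. The same scheme in C (a sketch; like the assembly it stores the byte value itself as the mark, which tests the same as a flag):

#include <stddef.h>
#include <string.h>

/* Length of the initial segment of s with no byte from reject. */
static size_t strcspn_table(const char *s, const char *reject)
{
    unsigned char table[256];
    memset(table, 0, sizeof table);                  /* slot 0 stays 0: NUL stops us */
    for (const unsigned char *r = (const unsigned char *)reject; *r; r++)
        table[*r] = *r;                              /* mark byte with its own value */

    const unsigned char *p = (const unsigned char *)s;
    while (*p && !table[*p])
        p++;
    return (size_t)(p - (const unsigned char *)s);
}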
diff --git a/libc/string/x86_64/strlen.S b/libc/string/x86_64/strlen.S
index 0441dc46c..4213f0ab6 100644
--- a/libc/string/x86_64/strlen.S
+++ b/libc/string/x86_64/strlen.S
@@ -20,6 +20,7 @@
#include "_glibc_inc.h"
+/* Seems to be unrolled too much */
.text
ENTRY (strlen)
@@ -39,8 +40,11 @@ ENTRY (strlen)
1: movq $0xfefefefefefefeff,%r8 /* Save magic. */
- .p2align 4 /* Align loop. */
-4: /* Main Loop is unrolled 4 times. */
+ /* Align loop. */
+ /* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+ .p2align 4,,10
+4:
+ /* Main Loop is unrolled 4 times. */
/* First unroll. */
movq (%rax), %rcx /* get double word (= 8 bytes) in question */
addq $8,%rax /* adjust pointer for next word */
@@ -97,8 +101,11 @@ ENTRY (strlen)
the addition will not result in 0. */
jz 4b /* no NUL found => continue loop */
- .p2align 4 /* Align, it's a jump target. */
-3: subq $8,%rax /* correct pointer increment. */
+ /* Align, it is a jump target. */
+ /* Next 3 insns are 8 bytes total, make sure we decode them in one go */
+ .p2align 3,,8
+3:
+ subq $8,%rax /* correct pointer increment. */
testb %cl, %cl /* is first byte NUL? */
jz 2f /* yes => return */
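The magic constant loaded at `1:` is 0xfefefefefefefeff, i.e. -0x0101010101010101 modulo 2^64: it is the subtractive half of the zero-byte test already sketched after the strchr hunk, applied here eight input bytes per unrolled step. The whole scan in plain C (a sketch that assumes an 8-byte-aligned s and glosses over strict aliasing; the real loop aligns the pointer first):

#include <stdint.h>
#include <stddef.h>

static int has_zero_byte(uint64_t v)
{
    /* a byte is 0 iff subtracting 1 from every byte borrows into its
       high bit while that bit was clear in the original value */
    return ((v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL) != 0;
}

static size_t strlen_words(const char *s)
{
    const uint64_t *w = (const uint64_t *)s;    /* assumes 8-byte alignment */
    while (!has_zero_byte(*w))
        w++;
    const char *p = (const char *)w;            /* find the NUL inside the word */
    while (*p)
        p++;
    return (size_t)(p - s);
}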
diff --git a/libc/string/x86_64/strspn.S b/libc/string/x86_64/strspn.S
index c126abd2e..41cff0490 100644
--- a/libc/string/x86_64/strspn.S
+++ b/libc/string/x86_64/strspn.S
@@ -50,26 +50,28 @@ ENTRY (strspn)
Although all the following instruction only modify %cl we always
have a correct zero-extended 64-bit value in %rcx. */
- .p2align 4
-L(2): movb (%rax), %cl /* get byte from stopset */
+ /* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+ .p2align 3,,6
+L(2):
+ movb (%rax), %cl /* get byte from stopset */
testb %cl, %cl /* is NUL char? */
jz L(1) /* yes => start compare loop */
movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
movb 1(%rax), %cl /* get byte from stopset */
- testb $0xff, %cl /* is NUL char? */
+ testb %cl, %cl /* is NUL char? */
jz L(1) /* yes => start compare loop */
movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
movb 2(%rax), %cl /* get byte from stopset */
- testb $0xff, %cl /* is NUL char? */
+ testb %cl, %cl /* is NUL char? */
jz L(1) /* yes => start compare loop */
movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
movb 3(%rax), %cl /* get byte from stopset */
addq $4, %rax /* increment stopset pointer */
movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
- testb $0xff, %cl /* is NUL char? */
+ testb %cl, %cl /* is NUL char? */
jnz L(2) /* no => process next dword from stopset */
L(1): leaq -4(%rdx), %rax /* prepare loop */
@@ -83,8 +85,14 @@ L(1): leaq -4(%rdx), %rax /* prepare loop */
value in the table. But the value of NUL is NUL so the loop
terminates for NUL in every case. */
- .p2align 4
-L(3): addq $4, %rax /* adjust pointer for full loop round */
+ /* Next 3 insns are 9 bytes total. */
+ /* .p2align 4,,9 would make sure we decode them in one go, */
+ /* but it will also align entire function to 16 bytes, */
+ /* potentially creating largish padding at link time. */
+ /* We are aligning to 8 bytes instead: */
+ .p2align 3,,8
+L(3):
+ addq $4, %rax /* adjust pointer for full loop round */
movb (%rax), %cl /* get byte from string */
testb %cl, (%rsp,%rcx) /* is it contained in skipset? */