70 files changed, 1080 insertions, 658 deletions
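Most of the ARM changes in this commit share one pattern: under Thumb-2, a conditionally executed instruction must be preceded by an it/itt/ittt instruction, whereas classic ARM encodes the condition directly in each instruction. The IT() macro pulled in from the new <bits/arm_asm.h> abstracts that difference, and THUMB1_ONLY presumably marks cores that can use neither the ARM nor the Thumb-2 encodings, hence the plain Thumb-1 fallbacks added below. A rough sketch of what such an IT() helper looks like (an illustration of the idea, not the verbatim header):

    /* Hypothetical IT() helper: emit an IT instruction only when
     * assembling for Thumb-2; ARM mode predicates each instruction
     * itself, so the macro expands to nothing there.  With this,
     * IT(tt, eq) yields "itt eq" under Thumb-2 and nothing otherwise. */
    #if defined(__thumb2__)
    # define IT(t, cond) i##t cond
    #else
    # define IT(t, cond)
    #endif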
diff --git a/libc/string/arm/_memcpy.S b/libc/string/arm/_memcpy.S index 3704f96b5..103580a0c 100644 --- a/libc/string/arm/_memcpy.S +++ b/libc/string/arm/_memcpy.S @@ -39,7 +39,9 @@ #include <features.h> #include <endian.h> +#include <bits/arm_asm.h> +#if !defined(THUMB1_ONLY) /* * This is one fun bit of code ... * Some easy listening music is suggested while trying to understand this @@ -77,12 +79,36 @@ .type _memcpy,%function .align 4 +/* XXX: The Thumb-2 conditionals can be removed if/when we require an + assembler that supports unified syntax. */ +.macro copy regs +#if defined(__thumb2__) + ittt ge + ldmiage r1!, \regs + stmiage r0!, \regs +#else + ldmgeia r1!, \regs + stmgeia r0!, \regs +#endif +.endm + +.macro copydb regs +#if defined(__thumb2__) + ittt ge + ldmdbge r1!, \regs + stmdbge r0!, \regs +#else + ldmgedb r1!, \regs + stmgedb r0!, \regs +#endif +.endm + _memcpy: /* Determine copy direction */ cmp r1, r0 bcc .Lmemcpy_backwards - moveq r0, #0 /* Quick abort for len=0 */ + IT(tt, eq) /* Quick abort for src=dst */ #if defined(__USE_BX__) bxeq lr #else @@ -102,7 +128,7 @@ _memcpy: blt .Lmemcpy_fl12 /* less than 12 bytes (4 from above) */ subs r2, r2, #0x14 blt .Lmemcpy_fl32 /* less than 32 bytes (12 from above) */ - stmdb sp!, {r4} /* borrow r4 */ + str r4, [sp, #-4]! /* borrow r4 */ /* blat 32 bytes at a time */ /* XXX for really big copies perhaps we should use more registers */ @@ -115,19 +141,22 @@ _memcpy: bge .Lmemcpy_floop32 cmn r2, #0x10 - ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */ - stmgeia r0!, {r3, r4, r12, lr} + /* blat a remaining 16 bytes */ + copy "{r3, r4, r12, lr}" subge r2, r2, #0x10 - ldmia sp!, {r4} /* return r4 */ + ldr r4, [sp], #4 /* restore r4 */ .Lmemcpy_fl32: adds r2, r2, #0x14 /* blat 12 bytes at a time */ .Lmemcpy_floop12: - ldmgeia r1!, {r3, r12, lr} - stmgeia r0!, {r3, r12, lr} + copy "{r3, r12, lr}" +#if defined(__thumb2__) + subsge r2, r2, #0x0c +#else subges r2, r2, #0x0c +#endif bge .Lmemcpy_floop12 .Lmemcpy_fl12: @@ -135,26 +164,48 @@ _memcpy: blt .Lmemcpy_fl4 subs r2, r2, #4 + IT(tt, lt) ldrlt r3, [r1], #4 strlt r3, [r0], #4 - ldmgeia r1!, {r3, r12} - stmgeia r0!, {r3, r12} + copy "{r3, r12}" subge r2, r2, #4 .Lmemcpy_fl4: /* less than 4 bytes to go */ adds r2, r2, #4 +#if defined(__thumb2__) + it eq + popeq {r0, pc} /* done */ +#elif defined(__ARM_ARCH_4T__) + ldmeqia sp!, {r0, r3} /* done */ + bxeq r3 +#else ldmeqia sp!, {r0, pc} /* done */ +#endif /* copy the crud byte at a time */ cmp r2, #2 ldrb r3, [r1], #1 strb r3, [r0], #1 +#if defined(__thumb2__) + itt ge + ldrbge r3, [r1], #1 + strbge r3, [r0], #1 + itt gt + ldrbgt r3, [r1], #1 + strbgt r3, [r0], #1 +#else ldrgeb r3, [r1], #1 strgeb r3, [r0], #1 ldrgtb r3, [r1], #1 strgtb r3, [r0], #1 +#endif +#if defined(__ARM_ARCH_4T__) + ldmia sp!, {r0, r3} + bx r3 +#else ldmia sp!, {r0, pc} +#endif /* erg - unaligned destination */ .Lmemcpy_fdestul: @@ -164,10 +215,19 @@ _memcpy: /* align destination with byte copies */ ldrb r3, [r1], #1 strb r3, [r0], #1 +#if defined(__thumb2__) + itt ge + ldrbge r3, [r1], #1 + strbge r3, [r0], #1 + itt gt + ldrbgt r3, [r1], #1 + strbgt r3, [r0], #1 +#else ldrgeb r3, [r1], #1 strgeb r3, [r0], #1 ldrgtb r3, [r1], #1 strgtb r3, [r0], #1 +#endif subs r2, r2, r12 blt .Lmemcpy_fl4 /* less the 4 bytes */ @@ -370,12 +430,12 @@ _memcpy: .Lmemcpy_bl32: cmn r2, #0x10 - ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */ - stmgedb r0!, {r3, r4, r12, lr} + /* blat a remaining 16 bytes */ + copydb "{r3, r4, r12, lr}" subge r2, r2, #0x10 adds 
r2, r2, #0x14 - ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */ - stmgedb r0!, {r3, r12, lr} + /* blat a remaining 12 bytes */ + copydb "{r3, r12, lr}" subge r2, r2, #0x0c ldmia sp!, {r4, lr} @@ -383,15 +443,16 @@ _memcpy: adds r2, r2, #8 blt .Lmemcpy_bl4 subs r2, r2, #4 + IT(tt, lt) ldrlt r3, [r1, #-4]! strlt r3, [r0, #-4]! - ldmgedb r1!, {r3, r12} - stmgedb r0!, {r3, r12} + copydb "{r3, r12}" subge r2, r2, #4 .Lmemcpy_bl4: /* less than 4 bytes to go */ adds r2, r2, #4 + IT(t, eq) #if defined(__USE_BX__) bxeq lr #else @@ -401,10 +462,19 @@ _memcpy: cmp r2, #2 ldrb r3, [r1, #-1]! strb r3, [r0, #-1]! +#ifdef __thumb2__ + itt ge + ldrbge r3, [r1, #-1]! + strbge r3, [r0, #-1]! + itt gt + ldrbgt r3, [r1, #-1]! + strbgt r3, [r0, #-1]! +#else ldrgeb r3, [r1, #-1]! strgeb r3, [r0, #-1]! ldrgtb r3, [r1, #-1]! strgtb r3, [r0, #-1]! +#endif #if defined(__USE_BX__) bx lr #else @@ -417,10 +487,19 @@ _memcpy: /* align destination with byte copies */ ldrb r3, [r1, #-1]! strb r3, [r0, #-1]! +#ifdef __thumb2__ + itt ge + ldrbge r3, [r1, #-1]! + strbge r3, [r0, #-1]! + itt gt + ldrbgt r3, [r1, #-1]! + strbgt r3, [r0, #-1]! +#else ldrgeb r3, [r1, #-1]! strgeb r3, [r0, #-1]! ldrgtb r3, [r1, #-1]! strgtb r3, [r0, #-1]! +#endif subs r2, r2, r12 blt .Lmemcpy_bl4 /* less than 4 bytes to go */ ands r12, r1, #3 @@ -591,3 +670,77 @@ _memcpy: .Lmemcpy_bsrcul1l4: add r1, r1, #1 b .Lmemcpy_bl4 + +#else /* THUMB1_ONLY */ + +/* This is a fairly dumb implementation for when we can't use the 32-bit code + above. */ +.text +.global _memcpy +.hidden _memcpy +.type _memcpy,%function +.align 4 +.thumb +_memcpy: + push {r0, r4} + cmp r2, #0 + beq .Lmemcpy_exit + @ See if we have overlapping regions, and need to reverse the + @ direction of the copy + cmp r0, r1 + bls .Lmemcpy_forwards + add r4, r1, r2 + cmp r0, r4 + bcc .Lmemcpy_backwards +.Lmemcpy_forwards: + /* Forwards. */ + mov r3, r0 + eor r3, r1 + mov r4, #3 + tst r3, r4 + bne .Lmemcpy_funaligned + cmp r2, #8 + bcc .Lmemcpy_funaligned +1: @ copy up to the first word boundary. + tst r0, r4 + beq 1f + ldrb r3, [r1] + add r1, r1, #1 + strb r3, [r0] + add r0, r0, #1 + sub r2, r2, #1 + b 1b +1: @ Copy aligned words + ldr r3, [r1] + add r1, r1, #4 + str r3, [r0] + add r0, r0, #4 + sub r2, r2, #4 + cmp r2, #4 + bcs 1b + cmp r2, #0 + beq .Lmemcpy_exit +.Lmemcpy_funaligned: +1: + ldrb r3, [r1] + add r1, r1, #1 + strb r3, [r0] + add r0, r0, #1 + sub r2, r2, #1 + bne 1b +.Lmemcpy_exit: + pop {r0, r4} + bx lr + +.Lmemcpy_backwards: + add r0, r0, r2 + add r1, r1, r2 +1: + sub r0, r0, #1 + sub r1, r1, #1 + ldrb r3, [r1] + strb r3, [r0] + sub r2, r2, #1 + bne 1b + b .Lmemcpy_exit +#endif diff --git a/libc/string/arm/bcopy.S b/libc/string/arm/bcopy.S index db3c9e6c1..2d6e90d13 100644 --- a/libc/string/arm/bcopy.S +++ b/libc/string/arm/bcopy.S @@ -40,6 +40,7 @@ /* bcopy = memcpy/memmove with arguments reversed. 
*/ #include <features.h> +#include <bits/arm_asm.h> #ifdef __UCLIBC_SUSV3_LEGACY__ @@ -48,12 +49,23 @@ .type bcopy,%function .align 4 +#if defined(__thumb__) && !defined(__thumb2__) +.thumb_func +bcopy: + push {r2, lr} + mov ip, r0 + mov r0, r1 + mov r1, ip + bl _memcpy + POP_RET +#else bcopy: /* switch the source and destination registers */ eor r0, r1, r0 eor r1, r0, r1 eor r0, r1, r0 b _memcpy /* (PLT) */ +#endif .size bcopy,.-bcopy diff --git a/libc/string/arm/bzero.S b/libc/string/arm/bzero.S index ee49cf560..e576a12e9 100644 --- a/libc/string/arm/bzero.S +++ b/libc/string/arm/bzero.S @@ -38,6 +38,7 @@ */ #include <features.h> +#include <bits/arm_asm.h> #ifdef __UCLIBC_SUSV3_LEGACY__ @@ -46,10 +47,21 @@ .type bzero,%function .align 4 +#if defined(__thumb__) && !defined(__thumb2__) +.thumb_func +bzero: + push {r2, lr} + mov r2, r1 + mov r1, #0 + bl HIDDEN_JUMPTARGET(memset) + POP_RET +#else + bzero: mov r2, r1 mov r1, #0 b HIDDEN_JUMPTARGET(memset) +#endif .size bzero,.-bzero diff --git a/libc/string/arm/memcmp.S b/libc/string/arm/memcmp.S index 4f78b5128..65409f43a 100644 --- a/libc/string/arm/memcmp.S +++ b/libc/string/arm/memcmp.S @@ -30,15 +30,41 @@ */ #include <features.h> +#include <bits/arm_asm.h> .text .global memcmp .type memcmp,%function .align 4 +#if defined(THUMB1_ONLY) +.thumb_func +memcmp: + cmp r2, #0 + bne 1f + mov r0, #0 + bx lr +1: + push {r4} + add r4, r0, r2 +2: + ldrb r2, [r0] + add r0, r0, #1 + ldrb r3, [r1] + add r1, r1, #1 + cmp r4, r0 + beq 3f + cmp r2, r3 + beq 2b +3: + sub r0, r2, r3 + pop {r4} + bx lr +#else memcmp: /* if ((len - 1) < 0) return 0 */ subs r2, r2, #1 + IT(tt, mi) movmi r0, #0 #if defined(__USE_BX__) bxmi lr @@ -51,6 +77,7 @@ memcmp: ldrb r2, [r0], #1 ldrb r3, [r1], #1 cmp ip, r0 + IT(t, cs) cmpcs r2, r3 beq 1b sub r0, r2, r3 @@ -59,6 +86,7 @@ memcmp: #else mov pc, lr #endif +#endif .size memcmp,.-memcmp diff --git a/libc/string/arm/memcpy.S b/libc/string/arm/memcpy.S index 7a5b6ab76..d2013d211 100644 --- a/libc/string/arm/memcpy.S +++ b/libc/string/arm/memcpy.S @@ -38,16 +38,23 @@ */ #include <features.h> +#include <bits/arm_asm.h> .text .global memcpy .type memcpy,%function .align 4 +#if defined(__thumb__) && !defined(__thumb2__) +.thumb_func memcpy: - stmfd sp!, {r0, lr} + push {r0, lr} bl _memcpy - ldmfd sp!, {r0, pc} + POP_RET +#else +memcpy: + b _memcpy +#endif .size memcpy,.-memcpy diff --git a/libc/string/arm/memmove.S b/libc/string/arm/memmove.S index 45cd9b4d4..c11b98dd4 100644 --- a/libc/string/arm/memmove.S +++ b/libc/string/arm/memmove.S @@ -38,16 +38,23 @@ */ #include <features.h> +#include <bits/arm_asm.h> .text .global memmove .type memmove,%function .align 4 +#if defined(__thumb__) && !defined(__thumb2__) +.thumb_func memmove: - stmfd sp!, {r0, lr} + push {r2, lr} bl _memcpy - ldmfd sp!, {r0, pc} + POP_RET +#else +memmove: + b _memcpy +#endif .size memmove,.-memmove diff --git a/libc/string/arm/memset.S b/libc/string/arm/memset.S index 16bfe0dc5..66aa6039c 100644 --- a/libc/string/arm/memset.S +++ b/libc/string/arm/memset.S @@ -19,12 +19,52 @@ #include <features.h> #include <sys/syscall.h> +#include <bits/arm_asm.h> .text .global memset .type memset,%function .align 4 +#if defined(THUMB1_ONLY) +.thumb_func +memset: + mov ip, r0 + cmp r2, #8 @ at least 8 bytes to do? 
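The Thumb-1 memset opened here follows the classic libc pattern: for eight or more bytes, replicate the fill byte across a 32-bit word with the lsl/orr pairs, byte-fill up to a word boundary, store whole words, then finish the tail bytewise. The same strategy in C (a sketch of the algorithm, not the compiled source):

    #include <stddef.h>
    #include <stdint.h>

    /* C rendition of the Thumb-1 memset strategy (sketch). */
    void *memset_sketch(void *s, int c, size_t n)
    {
        unsigned char *p = s;
        if (n >= 8) {
            uint32_t fill = (unsigned char)c;
            fill |= fill << 8;              /* the lsl #8 / orr pair  */
            fill |= fill << 16;             /* the lsl #16 / orr pair */
            while ((uintptr_t)p & 3) {      /* fill up to a word boundary */
                *p++ = (unsigned char)c;
                n--;
            }
            for (; n >= 4; n -= 4, p += 4)  /* fill aligned words */
                *(uint32_t *)p = fill;
        }
        while (n--)                         /* fill the remaining bytes */
            *p++ = (unsigned char)c;
        return s;
    }

The assembly additionally stashes the original pointer in ip so it can be returned unchanged, which the C version gets for free by returning s.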
+ bcc 2f + + lsl r3, r1, #8 + orr r1, r3 + lsl r3, r1, #16 + orr r1, r3 + + mov r3, #3 +1: @ Fill up to the first word boundary + tst r0, r3 + beq 1f + strb r1, [r0] + add r0, r0, #1 + sub r2, r2, #1 + b 1b +1: @ Fill aligned words + str r1, [r0] + add r0, r0, #4 + sub r2, r2, #4 + cmp r2, #4 + bcs 1b + +2: @ Fill the remaining bytes + cmp r2, #0 + beq 2f +1: + strb r1, [r0] + add r0, r0, #1 + sub r2, r2, #1 + bne 1b +2: + mov r0, ip + bx lr +#else memset: mov a4, a1 cmp a3, $8 @ at least 8 bytes to do? @@ -33,8 +73,14 @@ memset: orr a2, a2, a2, lsl $16 1: tst a4, $3 @ aligned yet? +#if defined(__thumb2__) + itt ne + strbne a2, [a4], $1 + subne a3, a3, $1 +#else strneb a2, [a4], $1 subne a3, a3, $1 +#endif bne 1b mov ip, a2 1: @@ -51,16 +97,30 @@ memset: stmia a4!, {a2, ip} sub a3, a3, $8 cmp a3, $8 @ 8 bytes still to do? +#if defined(__thumb2__) + itt ge + stmiage a4!, {a2, ip} + subge a3, a3, $8 +#else stmgeia a4!, {a2, ip} subge a3, a3, $8 +#endif bge 1b 2: movs a3, a3 @ anything left? + IT(t, eq) #if defined(__USE_BX__) bxeq lr #else moveq pc, lr @ nope #endif +#if defined (__thumb2__) +1: + strb a2, [a4], #1 + subs a3, a3, #1 + bne 1b + bx lr +#else rsb a3, a3, $7 add pc, pc, a3, lsl $2 mov r0, r0 @@ -76,6 +136,8 @@ memset: #else mov pc, lr #endif +#endif +#endif .size memset,.-memset diff --git a/libc/string/arm/strcmp.S b/libc/string/arm/strcmp.S index 89aa38874..97363c1c2 100644 --- a/libc/string/arm/strcmp.S +++ b/libc/string/arm/strcmp.S @@ -30,17 +30,35 @@ */ #include <features.h> +#include <bits/arm_asm.h> .text .global strcmp .type strcmp,%function .align 4 +#if defined(__thumb__) && !defined(__thumb2__) +.thumb_func +strcmp: +1: + ldrb r2, [r0] + add r0, r0, #1 + ldrb r3, [r1] + add r1, r1, #1 + cmp r2, #0 + beq 2f + cmp r2, r3 + beq 1b +2: + sub r0, r2, r3 + bx lr +#else strcmp: 1: ldrb r2, [r0], #1 ldrb r3, [r1], #1 cmp r2, #1 + IT(t, cs) cmpcs r2, r3 beq 1b sub r0, r2, r3 @@ -49,6 +67,7 @@ strcmp: #else mov pc, lr #endif +#endif .size strcmp,.-strcmp diff --git a/libc/string/arm/strlen.S b/libc/string/arm/strlen.S index 5b4b02e17..949e918f4 100644 --- a/libc/string/arm/strlen.S +++ b/libc/string/arm/strlen.S @@ -20,6 +20,7 @@ #include <features.h> #include <endian.h> #include <sys/syscall.h> +#include <bits/arm_asm.h> /* size_t strlen(const char *S) * entry: r0 -> string @@ -31,6 +32,19 @@ .type strlen,%function .align 4 +#if defined(THUMB1_ONLY) +/* A simple implementation for when the ARM implementation can't be used. */ +.thumb_func +strlen: + mov r2, #0 +1: + ldrb r1, [r0, r2] + add r2, r2, #1 + cmp r1, #0 + bne 1b + sub r0, r2, #1 + bx lr +#else strlen: bic r1, r0, $3 @ addr of word containing first byte ldr r2, [r1], $4 @ get the first word @@ -41,38 +55,48 @@ strlen: #if __BYTE_ORDER == __BIG_ENDIAN orr r2, r2, $0xff000000 @ set this byte to non-zero subs r3, r3, $1 @ any more to do? + IT(t, gt) orrgt r2, r2, $0x00ff0000 @ if so, set this byte subs r3, r3, $1 @ more? + IT(t, gt) orrgt r2, r2, $0x0000ff00 @ then set. #else orr r2, r2, $0x000000ff @ set this byte to non-zero subs r3, r3, $1 @ any more to do? + IT(t, gt) orrgt r2, r2, $0x0000ff00 @ if so, set this byte subs r3, r3, $1 @ more? + IT(t, gt) orrgt r2, r2, $0x00ff0000 @ then set. #endif Laligned: @ here, we have a word in r2. Does it tst r2, $0x000000ff @ contain any zeroes? 
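The Laligned scan in strlen checks each fetched word for a NUL one byte lane at a time with a chain of tst/tstne instructions (continued just below); each tstne executes only while every earlier lane was non-zero, and the IT(tttt, ne) being added makes that four-deep predication expressible in Thumb-2. What the chain computes, in C (a sketch; little-endian lane order shown, the big-endian branch mirrors it):

    #include <stdint.h>

    /* Does word v contain a zero byte in any of its four lanes?
     * The short-circuit || matches the conditional tstne chain:
     * a later test runs only if all earlier lanes were non-zero. */
    static int has_zero_byte(uint32_t v)
    {
        return (v & 0x000000ffu) == 0
            || (v & 0x0000ff00u) == 0
            || (v & 0x00ff0000u) == 0
            || (v & 0xff000000u) == 0;
    }

While no lane is zero the assembly adds 4 to the running length and loads the next word; once a zero lane is found, the Llastword sequence counts the non-zero bytes that precede it.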
+ IT(tttt, ne) tstne r2, $0x0000ff00 @ tstne r2, $0x00ff0000 @ tstne r2, $0xff000000 @ addne r0, r0, $4 @ if not, the string is 4 bytes longer + IT(t, ne) ldrne r2, [r1], $4 @ and we continue to the next word bne Laligned @ Llastword: @ drop through to here once we find a #if __BYTE_ORDER == __BIG_ENDIAN tst r2, $0xff000000 @ word that has a zero byte in it + IT(tttt, ne) addne r0, r0, $1 @ tstne r2, $0x00ff0000 @ and add up to 3 bytes on to it addne r0, r0, $1 @ tstne r2, $0x0000ff00 @ (if first three all non-zero, 4th + IT(t, ne) addne r0, r0, $1 @ must be zero) #else tst r2, $0x000000ff @ + IT(tttt, ne) addne r0, r0, $1 @ tstne r2, $0x0000ff00 @ and add up to 3 bytes on to it addne r0, r0, $1 @ tstne r2, $0x00ff0000 @ (if first three all non-zero, 4th + IT(t, ne) addne r0, r0, $1 @ must be zero) #endif #if defined(__USE_BX__) @@ -80,6 +104,7 @@ Llastword: @ drop through to here once we find a #else mov pc,lr #endif +#endif .size strlen,.-strlen diff --git a/libc/string/arm/strncmp.S b/libc/string/arm/strncmp.S index eaf0620b4..8487639c8 100644 --- a/libc/string/arm/strncmp.S +++ b/libc/string/arm/strncmp.S @@ -30,15 +30,46 @@ */ #include <features.h> +#include <bits/arm_asm.h> .text .global strncmp .type strncmp,%function .align 4 +#if defined(THUMB1_ONLY) +.thumb_func strncmp: /* if (len == 0) return 0 */ cmp r2, #0 + bne 1f + mov r0, #0 + bx lr +1: + push {r4} + + /* ip == last src address to compare */ + add r4, r0, r2 +2: + cmp r4, r0 + beq 3f + ldrb r2, [r0] + add r0, r0, #1 + ldrb r3, [r1] + add r1, r1, #1 + cmp r2, #0 + beq 3f + cmp r2, r3 + beq 2b +3: + sub r0, r2, r3 + pop {r4} + bx lr +#else +strncmp: + /* if (len == 0) return 0 */ + cmp r2, #0 + IT(tt, eq) moveq r0, #0 #if defined(__USE_BX__) bxeq lr @@ -53,6 +84,7 @@ strncmp: ldrb r2, [r0], #1 ldrb r3, [r1], #1 cmp ip, r0 + IT(tt, cs) cmpcs r2, #1 cmpcs r2, r3 beq 1b @@ -62,6 +94,7 @@ strncmp: #else mov pc, lr #endif +#endif .size strncmp,.-strncmp diff --git a/libc/string/avr32/Makefile b/libc/string/avr32/Makefile index 0002ffdce..e19e9d9ec 100644 --- a/libc/string/avr32/Makefile +++ b/libc/string/avr32/Makefile @@ -16,8 +16,8 @@ # along with this program; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -top_srcdir := ../../../ -top_builddir := ../../../ +top_srcdir := ../../../ +top_builddir := ../../../ all: objs diff --git a/libc/string/avr32/bcopy.S b/libc/string/avr32/bcopy.S index e1d173165..bdd521814 100644 --- a/libc/string/avr32/bcopy.S +++ b/libc/string/avr32/bcopy.S @@ -10,17 +10,17 @@ #ifdef __UCLIBC_SUSV3_LEGACY__ - .text - .global bcopy - .type bcopy, @function - .align 1 + .text + .global bcopy + .type bcopy, @function + .align 1 bcopy: - /* Swap the first two arguments */ - eor r11, r12 - eor r12, r11 - eor r11, r12 - rjmp HIDDEN_JUMPTARGET(memmove) + /* Swap the first two arguments */ + eor r11, r12 + eor r12, r11 + eor r11, r12 + rjmp HIDDEN_JUMPTARGET(memmove) - .size bcopy, . - bcopy + .size bcopy, . - bcopy #endif /* __UCLIBC_SUSV3_LEGACY__ */ diff --git a/libc/string/avr32/bzero.S b/libc/string/avr32/bzero.S index 928148dcb..ca1bd2dd2 100644 --- a/libc/string/avr32/bzero.S +++ b/libc/string/avr32/bzero.S @@ -10,15 +10,15 @@ #ifdef __UCLIBC_SUSV3_LEGACY__ - .text - .global bzero - .type bzero, @function - .align 1 + .text + .global bzero + .type bzero, @function + .align 1 bzero: - mov r10, r11 - mov r11, 0 - rjmp HIDDEN_JUMPTARGET(memset) + mov r10, r11 + mov r11, 0 + rjmp HIDDEN_JUMPTARGET(memset) - .size bzero, . - bzero + .size bzero, . 
- bzero #endif /* __UCLIBC_SUSV3_LEGACY__ */ diff --git a/libc/string/avr32/memcmp.S b/libc/string/avr32/memcmp.S index 5d7eac3d9..ae6cc9189 100644 --- a/libc/string/avr32/memcmp.S +++ b/libc/string/avr32/memcmp.S @@ -12,48 +12,48 @@ #define s2 r11 #define len r10 - .text - .global memcmp - .type memcmp, @function - .align 1 + .text + .global memcmp + .type memcmp, @function + .align 1 memcmp: - sub len, 4 - brlt .Lless_than_4 + sub len, 4 + brlt .Lless_than_4 -1: ld.w r8, s1++ - ld.w r9, s2++ - cp.w r8, r9 - brne .Lfound_word - sub len, 4 - brge 1b +1: ld.w r8, s1++ + ld.w r9, s2++ + cp.w r8, r9 + brne .Lfound_word + sub len, 4 + brge 1b .Lless_than_4: - sub len, -4 - reteq 0 + sub len, -4 + reteq 0 -1: ld.ub r8, s1++ - ld.ub r9, s2++ - sub r8, r9 - retne r8 - sub len, 1 - brgt 1b +1: ld.ub r8, s1++ + ld.ub r9, s2++ + sub r8, r9 + retne r8 + sub len, 1 + brgt 1b - retal 0 + retal 0 .Lfound_word: - mov len, 4 - -2: bfextu r11, r9, 24, 8 - bfextu r12, r8, 24, 8 - sub r12, r11 - retne r12 - lsl r8, 8 - lsl r9, 8 - sub len, 1 - brne 2b - retal r12 - - .size memcmp, . - memcmp + mov len, 4 + +2: bfextu r11, r9, 24, 8 + bfextu r12, r8, 24, 8 + sub r12, r11 + retne r12 + lsl r8, 8 + lsl r9, 8 + sub len, 1 + brne 2b + retal r12 + + .size memcmp, . - memcmp libc_hidden_def(memcmp) #ifdef __UCLIBC_SUSV3_LEGACY__ diff --git a/libc/string/avr32/memcpy.S b/libc/string/avr32/memcpy.S index f95aabd13..bf091abf8 100644 --- a/libc/string/avr32/memcpy.S +++ b/libc/string/avr32/memcpy.S @@ -11,101 +11,101 @@ #define src r11 #define len r10 - .text - .global memcpy - .type memcpy, @function + .text + .global memcpy + .type memcpy, @function memcpy: - pref src[0] - mov dst, r12 + pref src[0] + mov dst, r12 - /* If we have less than 32 bytes, don't do anything fancy */ - cp.w len, 32 - brge .Lmore_than_31 + /* If we have less than 32 bytes, don't do anything fancy */ + cp.w len, 32 + brge .Lmore_than_31 - sub len, 1 - retlt r12 -1: ld.ub r8, src++ - st.b dst++, r8 - sub len, 1 - brge 1b - retal r12 + sub len, 1 + retlt r12 +1: ld.ub r8, src++ + st.b dst++, r8 + sub len, 1 + brge 1b + retal r12 .Lmore_than_31: - pushm r0-r7, lr + pushm r0-r7, lr - /* Check alignment */ - mov r8, src - andl r8, 31, COH - brne .Lunaligned_src - mov r8, dst - andl r8, 3, COH - brne .Lunaligned_dst + /* Check alignment */ + mov r8, src + andl r8, 31, COH + brne .Lunaligned_src + mov r8, dst + andl r8, 3, COH + brne .Lunaligned_dst .Laligned_copy: - sub len, 32 - brlt .Lless_than_32 + sub len, 32 + brlt .Lless_than_32 -1: /* Copy 32 bytes at a time */ - ldm src, r0-r7 - sub src, -32 - stm dst, r0-r7 - sub dst, -32 - sub len, 32 - brge 1b +1: /* Copy 32 bytes at a time */ + ldm src, r0-r7 + sub src, -32 + stm dst, r0-r7 + sub dst, -32 + sub len, 32 + brge 1b .Lless_than_32: - /* Copy 16 more bytes if possible */ - sub len, -16 - brlt .Lless_than_16 - ldm src, r0-r3 - sub src, -16 - sub len, 16 - stm dst, r0-r3 - sub dst, -16 + /* Copy 16 more bytes if possible */ + sub len, -16 + brlt .Lless_than_16 + ldm src, r0-r3 + sub src, -16 + sub len, 16 + stm dst, r0-r3 + sub dst, -16 .Lless_than_16: - /* Do the remaining as byte copies */ - neg len - add pc, pc, len << 2 - .rept 15 - ld.ub r0, src++ - st.b dst++, r0 - .endr + /* Do the remaining as byte copies */ + neg len + add pc, pc, len << 2 + .rept 15 + ld.ub r0, src++ + st.b dst++, r0 + .endr - popm r0-r7, pc + popm r0-r7, pc .Lunaligned_src: - /* Make src cacheline-aligned. 
r8 = (src & 31) */ - rsub r8, r8, 32 - sub len, r8 -1: ld.ub r0, src++ - st.b dst++, r0 - sub r8, 1 - brne 1b - - /* If dst is word-aligned, we're ready to go */ - pref src[0] - mov r8, 3 - tst dst, r8 - breq .Laligned_copy + /* Make src cacheline-aligned. r8 = (src & 31) */ + rsub r8, r8, 32 + sub len, r8 +1: ld.ub r0, src++ + st.b dst++, r0 + sub r8, 1 + brne 1b + + /* If dst is word-aligned, we're ready to go */ + pref src[0] + mov r8, 3 + tst dst, r8 + breq .Laligned_copy .Lunaligned_dst: - /* src is aligned, but dst is not. Expect bad performance */ - sub len, 4 - brlt 2f -1: ld.w r0, src++ - st.w dst++, r0 - sub len, 4 - brge 1b - -2: neg len - add pc, pc, len << 2 - .rept 3 - ld.ub r0, src++ - st.b dst++, r0 - .endr - - popm r0-r7, pc - .size memcpy, . - memcpy + /* src is aligned, but dst is not. Expect bad performance */ + sub len, 4 + brlt 2f +1: ld.w r0, src++ + st.w dst++, r0 + sub len, 4 + brge 1b + +2: neg len + add pc, pc, len << 2 + .rept 3 + ld.ub r0, src++ + st.b dst++, r0 + .endr + + popm r0-r7, pc + .size memcpy, . - memcpy libc_hidden_def(memcpy) diff --git a/libc/string/avr32/memmove.S b/libc/string/avr32/memmove.S index 8ca4da54d..535f4a257 100644 --- a/libc/string/avr32/memmove.S +++ b/libc/string/avr32/memmove.S @@ -10,107 +10,107 @@ #define src r11 #define len r10 - .text - .global memmove - .type memmove, @function + .text + .global memmove + .type memmove, @function memmove: - cp.w src, dst - brge HIDDEN_JUMPTARGET(memcpy) - - add dst, len - add src, len - pref src[-1] - - /* - * The rest is basically the same as in memcpy.S except that - * the direction is reversed. - */ - cp.w len, 32 - brge .Lmore_than_31 - - sub len, 1 - retlt r12 -1: ld.ub r8, --src - st.b --dst, r8 - sub len, 1 - brge 1b - retal r12 + cp.w src, dst + brge HIDDEN_JUMPTARGET(memcpy) + + add dst, len + add src, len + pref src[-1] + + /* + * The rest is basically the same as in memcpy.S except that + * the direction is reversed. + */ + cp.w len, 32 + brge .Lmore_than_31 + + sub len, 1 + retlt r12 +1: ld.ub r8, --src + st.b --dst, r8 + sub len, 1 + brge 1b + retal r12 .Lmore_than_31: - pushm r0-r7, lr + pushm r0-r7, lr - /* Check alignment */ - mov r8, src - andl r8, 31, COH - brne .Lunaligned_src - mov r8, r12 - andl r8, 3, COH - brne .Lunaligned_dst + /* Check alignment */ + mov r8, src + andl r8, 31, COH + brne .Lunaligned_src + mov r8, r12 + andl r8, 3, COH + brne .Lunaligned_dst .Laligned_copy: - sub len, 32 - brlt .Lless_than_32 + sub len, 32 + brlt .Lless_than_32 -1: /* Copy 32 bytes at a time */ - sub src, 32 - ldm src, r0-r7 - sub dst, 32 - sub len, 32 - stm dst, r0-r7 - brge 1b +1: /* Copy 32 bytes at a time */ + sub src, 32 + ldm src, r0-r7 + sub dst, 32 + sub len, 32 + stm dst, r0-r7 + brge 1b .Lless_than_32: - /* Copy 16 more bytes if possible */ - sub len, -16 - brlt .Lless_than_16 - sub src, 16 - ldm src, r0-r3 - sub dst, 16 - sub len, 16 - stm dst, r0-r3 + /* Copy 16 more bytes if possible */ + sub len, -16 + brlt .Lless_than_16 + sub src, 16 + ldm src, r0-r3 + sub dst, 16 + sub len, 16 + stm dst, r0-r3 .Lless_than_16: - /* Do the remaining as byte copies */ - sub len, -16 - breq 2f -1: ld.ub r0, --src - st.b --dst, r0 - sub len, 1 - brne 1b + /* Do the remaining as byte copies */ + sub len, -16 + breq 2f +1: ld.ub r0, --src + st.b --dst, r0 + sub len, 1 + brne 1b -2: popm r0-r7, pc +2: popm r0-r7, pc .Lunaligned_src: - /* Make src cacheline-aligned. 
r8 = (src & 31) */ - sub len, r8 -1: ld.ub r0, --src - st.b --dst, r0 - sub r8, 1 - brne 1b - - /* If dst is word-aligned, we're ready to go */ - pref src[-4] - mov r8, 3 - tst dst, r8 - breq .Laligned_copy + /* Make src cacheline-aligned. r8 = (src & 31) */ + sub len, r8 +1: ld.ub r0, --src + st.b --dst, r0 + sub r8, 1 + brne 1b + + /* If dst is word-aligned, we're ready to go */ + pref src[-4] + mov r8, 3 + tst dst, r8 + breq .Laligned_copy .Lunaligned_dst: - /* src is aligned, but dst is not. Expect bad performance */ - sub len, 4 - brlt 2f -1: ld.w r0, --src - st.w --dst, r0 - sub len, 4 - brge 1b - -2: neg len - add pc, pc, len << 2 - .rept 3 - ld.ub r0, --src - st.b --dst, r0 - .endr - - popm r0-r7, pc - .size memmove, . - memmove + /* src is aligned, but dst is not. Expect bad performance */ + sub len, 4 + brlt 2f +1: ld.w r0, --src + st.w --dst, r0 + sub len, 4 + brge 1b + +2: neg len + add pc, pc, len << 2 + .rept 3 + ld.ub r0, --src + st.b --dst, r0 + .endr + + popm r0-r7, pc + .size memmove, . - memmove libc_hidden_def(memmove) diff --git a/libc/string/avr32/memset.S b/libc/string/avr32/memset.S index 964bf4834..472b2be35 100644 --- a/libc/string/avr32/memset.S +++ b/libc/string/avr32/memset.S @@ -12,54 +12,54 @@ #define c r11 #define n r10 - .text - .global memset - .type memset, @function + .text + .global memset + .type memset, @function - .align 1 + .align 1 memset: - cp.w n, 32 - mov r9, s - brge .Llarge_memset + cp.w n, 32 + mov r9, s + brge .Llarge_memset - sub n, 1 - retlt s -1: st.b s++, c - sub n, 1 - brge 1b + sub n, 1 + retlt s +1: st.b s++, c + sub n, 1 + brge 1b - retal r9 + retal r9 .Llarge_memset: - mov r8, r11 - mov r11, 3 - bfins r8, r8, 8, 8 - bfins r8, r8, 16, 16 - tst s, r11 - breq 2f + mov r8, r11 + mov r11, 3 + bfins r8, r8, 8, 8 + bfins r8, r8, 16, 16 + tst s, r11 + breq 2f -1: st.b s++, r8 - sub n, 1 - tst s, r11 - brne 1b +1: st.b s++, r8 + sub n, 1 + tst s, r11 + brne 1b -2: mov r11, r9 - mov r9, r8 - sub n, 8 +2: mov r11, r9 + mov r9, r8 + sub n, 8 -3: st.d s++, r8 - sub n, 8 - brge 3b +3: st.d s++, r8 + sub n, 8 + brge 3b - /* If we are done, n == -8 and we'll skip all st.b insns below */ - neg n - lsl n, 1 - add pc, n - .rept 7 - st.b s++, r8 - .endr - retal r11 + /* If we are done, n == -8 and we'll skip all st.b insns below */ + neg n + lsl n, 1 + add pc, n + .rept 7 + st.b s++, r8 + .endr + retal r11 - .size memset, . - memset + .size memset, . 
- memset libc_hidden_def(memset) diff --git a/libc/string/avr32/strcmp.S b/libc/string/avr32/strcmp.S index e9f087577..f73bd43e7 100644 --- a/libc/string/avr32/strcmp.S +++ b/libc/string/avr32/strcmp.S @@ -12,77 +12,77 @@ #define s2 r11 #define len r10 - .text - .global strcmp - .type strcmp, @function - .align 1 + .text + .global strcmp + .type strcmp, @function + .align 1 strcmp: - mov r8, 3 - tst s1, r8 - brne .Lunaligned_s1 - tst s2, r8 - brne .Lunaligned_s2 + mov r8, 3 + tst s1, r8 + brne .Lunaligned_s1 + tst s2, r8 + brne .Lunaligned_s2 -1: ld.w r8, s1++ - ld.w r9, s2++ - cp.w r8, r9 - brne 2f - tnbz r8 - brne 1b - retal 0 +1: ld.w r8, s1++ + ld.w r9, s2++ + cp.w r8, r9 + brne 2f + tnbz r8 + brne 1b + retal 0 -2: bfextu r12, r8, 24, 8 - bfextu r11, r9, 24, 8 - sub r12, r11 - retne r12 - cp.w r11, 0 - reteq 0 - bfextu r12, r8, 16, 8 - bfextu r11, r9, 16, 8 - sub r12, r11 - retne r12 - cp.w r11, 0 - reteq 0 - bfextu r12, r8, 8, 8 - bfextu r11, r9, 8, 8 - sub r12, r11 - retne r12 - cp.w r11, 0 - reteq 0 - bfextu r12, r8, 0, 8 - bfextu r11, r9, 0, 8 - sub r12, r11 - retal r12 +2: bfextu r12, r8, 24, 8 + bfextu r11, r9, 24, 8 + sub r12, r11 + retne r12 + cp.w r11, 0 + reteq 0 + bfextu r12, r8, 16, 8 + bfextu r11, r9, 16, 8 + sub r12, r11 + retne r12 + cp.w r11, 0 + reteq 0 + bfextu r12, r8, 8, 8 + bfextu r11, r9, 8, 8 + sub r12, r11 + retne r12 + cp.w r11, 0 + reteq 0 + bfextu r12, r8, 0, 8 + bfextu r11, r9, 0, 8 + sub r12, r11 + retal r12 .Lunaligned_s1: -3: tst s1, r8 - breq 4f - ld.ub r10, s1++ - ld.ub r9, s2++ - sub r10, r9 - retne r10 - cp.w r9, 0 - brne 3b - retal r10 +3: tst s1, r8 + breq 4f + ld.ub r10, s1++ + ld.ub r9, s2++ + sub r10, r9 + retne r10 + cp.w r9, 0 + brne 3b + retal r10 -4: tst s2, r8 - breq 1b +4: tst s2, r8 + breq 1b .Lunaligned_s2: - /* - * s1 and s2 can't both be aligned, and unaligned word loads - * can trigger spurious exceptions if we cross a page boundary. - * Do it the slow way... - */ -1: ld.ub r8, s1++ - ld.ub r9, s2++ - sub r8, r9 - retne r8 - cp.w r9, 0 - brne 1b - retal 0 + /* + * s1 and s2 can't both be aligned, and unaligned word loads + * can trigger spurious exceptions if we cross a page boundary. + * Do it the slow way... + */ +1: ld.ub r8, s1++ + ld.ub r9, s2++ + sub r8, r9 + retne r8 + cp.w r9, 0 + brne 1b + retal 0 - .size strcmp, . - strcmp + .size strcmp, . 
- strcmp libc_hidden_def(strcmp) #ifndef __UCLIBC_HAS_LOCALE__ diff --git a/libc/string/avr32/strlen.S b/libc/string/avr32/strlen.S index d2808998d..5223e5365 100644 --- a/libc/string/avr32/strlen.S +++ b/libc/string/avr32/strlen.S @@ -10,53 +10,53 @@ #define str r12 - .text - .global strlen - .type strlen, @function + .text + .global strlen + .type strlen, @function strlen: - mov r11, r12 - - mov r9, str - andl r9, 3, COH - brne .Lunaligned_str - -1: ld.w r8, str++ - tnbz r8 - brne 1b - - sub r12, r11 - bfextu r9, r8, 24, 8 - cp.w r9, 0 - subeq r12, 4 - reteq r12 - bfextu r9, r8, 16, 8 - cp.w r9, 0 - subeq r12, 3 - reteq r12 - bfextu r9, r8, 8, 8 - cp.w r9, 0 - subeq r12, 2 - reteq r12 - sub r12, 1 - retal r12 + mov r11, r12 + + mov r9, str + andl r9, 3, COH + brne .Lunaligned_str + +1: ld.w r8, str++ + tnbz r8 + brne 1b + + sub r12, r11 + bfextu r9, r8, 24, 8 + cp.w r9, 0 + subeq r12, 4 + reteq r12 + bfextu r9, r8, 16, 8 + cp.w r9, 0 + subeq r12, 3 + reteq r12 + bfextu r9, r8, 8, 8 + cp.w r9, 0 + subeq r12, 2 + reteq r12 + sub r12, 1 + retal r12 .Lunaligned_str: - add pc, pc, r9 << 3 - sub r0, r0, 0 /* 4-byte nop */ - ld.ub r8, str++ - sub r8, r8, 0 - breq 1f - ld.ub r8, str++ - sub r8, r8, 0 - breq 1f - ld.ub r8, str++ - sub r8, r8, 0 - brne 1b - -1: sub r12, 1 - sub r12, r11 - retal r12 - - .size strlen, . - strlen + add pc, pc, r9 << 3 + sub r0, r0, 0 /* 4-byte nop */ + ld.ub r8, str++ + sub r8, r8, 0 + breq 1f + ld.ub r8, str++ + sub r8, r8, 0 + breq 1f + ld.ub r8, str++ + sub r8, r8, 0 + brne 1b + +1: sub r12, 1 + sub r12, r11 + retal r12 + + .size strlen, . - strlen libc_hidden_def(strlen) diff --git a/libc/string/bfin/memchr.S b/libc/string/bfin/memchr.S index 23626d6a4..88e46bef6 100644 --- a/libc/string/bfin/memchr.S +++ b/libc/string/bfin/memchr.S @@ -1,5 +1,5 @@ /* memchr.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. + * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved. * * This file is subject to the terms and conditions of the GNU Library General * Public License. See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@ * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html */ +#include <sysdep.h> + /* void *memchr(const void *s, int c, size_t n); * R0 = address (s) * R1 = sought byte (c) @@ -21,30 +23,29 @@ .align 2 -.global _memchr -.type _memchr, STT_FUNC -_memchr: +.weak _memchr +ENTRY(_memchr) P0 = R0; // P0 = address P2 = R2; // P2 = count R1 = R1.B(Z); CC = R2 == 0; - IF CC JUMP failed; + IF CC JUMP .Lfailed; -bytes: - LSETUP (byte_loop_s , byte_loop_e) LC0=P2; +.Lbytes: + LSETUP (.Lbyte_loop_s, .Lbyte_loop_e) LC0=P2; -byte_loop_s: +.Lbyte_loop_s: R3 = B[P0++](Z); CC = R3 == R1; - IF CC JUMP found; -byte_loop_e: + IF CC JUMP .Lfound; +.Lbyte_loop_e: NOP; -failed: +.Lfailed: R0=0; RTS; -found: +.Lfound: R0 = P0; R0 += -1; RTS; diff --git a/libc/string/bfin/memcmp.S b/libc/string/bfin/memcmp.S index f2679d5ae..7cc76ad96 100644 --- a/libc/string/bfin/memcmp.S +++ b/libc/string/bfin/memcmp.S @@ -1,5 +1,5 @@ /* memcmp.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. + * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved. * * This file is subject to the terms and conditions of the GNU Library General * Public License. 
See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@ * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html */ +#include <sysdep.h> + /* int memcmp(const void *s1, const void *s2, size_t n); * R0 = First Address (s1) * R1 = Second Address (s2) @@ -21,28 +23,27 @@ .align 2 -.global _memcmp -.type _memcmp, STT_FUNC -_memcmp: +.weak _memcmp +ENTRY(_memcmp) I1 = P3; - P0 = R0; // P0 = s1 address - P3 = R1; // P3 = s2 Address - P2 = R2 ; // P2 = count + P0 = R0; /* P0 = s1 address */ + P3 = R1; /* P3 = s2 Address */ + P2 = R2 ; /* P2 = count */ CC = R2 <= 7(IU); - IF CC JUMP too_small; - I0 = R1; // s2 - R1 = R1 | R0; // OR addresses together - R1 <<= 30; // check bottom two bits - CC = AZ; // AZ set if zero. - IF !CC JUMP bytes ; // Jump if addrs not aligned. + IF CC JUMP .Ltoo_small; + I0 = R1; /* s2 */ + R1 = R1 | R0; /* OR addresses together */ + R1 <<= 30; /* check bottom two bits */ + CC = AZ; /* AZ set if zero. */ + IF !CC JUMP .Lbytes ; /* Jump if addrs not aligned. */ - P1 = P2 >> 2; // count = n/4 + P1 = P2 >> 2; /* count = n/4 */ R3 = 3; - R2 = R2 & R3; // remainder - P2 = R2; // set remainder + R2 = R2 & R3; /* remainder */ + P2 = R2; /* set remainder */ - LSETUP (quad_loop_s , quad_loop_e) LC0=P1; -quad_loop_s: + LSETUP (.Lquad_loop_s, .Lquad_loop_e) LC0=P1; +.Lquad_loop_s: #if !defined(__WORKAROUND_AVOID_DAG1) MNOP || R0 = [P0++] || R1 = [I0++]; #else @@ -50,52 +51,54 @@ quad_loop_s: R1 = [I0++]; #endif CC = R0 == R1; - IF !CC JUMP quad_different; -quad_loop_e: + IF !CC JUMP .Lquad_different; +.Lquad_loop_e: NOP; - P3 = I0; // s2 -too_small: - CC = P2 == 0; //Check zero count - IF CC JUMP finished; // very unlikely + P3 = I0; /* s2 */ +.Ltoo_small: + CC = P2 == 0; /* Check zero count*/ + IF CC JUMP .Lfinished; /* very unlikely*/ -bytes: - LSETUP (byte_loop_s , byte_loop_e) LC0=P2; -byte_loop_s: - R1 = B[P3++](Z); // *s2 - R0 = B[P0++](Z); // *s1 +.Lbytes: + LSETUP (.Lbyte_loop_s, .Lbyte_loop_e) LC0=P2; +.Lbyte_loop_s: + R1 = B[P3++](Z); /* *s2 */ + R0 = B[P0++](Z); /* *s1 */ CC = R0 == R1; - IF !CC JUMP different; -byte_loop_e: + IF !CC JUMP .Ldifferent; +.Lbyte_loop_e: NOP; -different: +.Ldifferent: R0 = R0 - R1; P3 = I1; RTS; -quad_different: - // We've read two quads which don't match. - // Can't just compare them, because we're - // a little-endian machine, so the MSBs of - // the regs occur at later addresses in the - // string. - // Arrange to re-read those two quads again, - // byte-by-byte. - P0 += -4; // back up to the start of the - P3 = I0; // quads, and increase the - P2 += 4; // remainder count +.Lquad_different: + /* We've read two quads which don't match. + * Can't just compare them, because we're + * a little-endian machine, so the MSBs of + * the regs occur at later addresses in the + * string. + * Arrange to re-read those two quads again, + * byte-by-byte. + */ + P0 += -4; /* back up to the start of the */ + P3 = I0; /* quads, and increase the*/ + P2 += 4; /* remainder count*/ P3 += -4; - JUMP bytes; + JUMP .Lbytes; -finished: +.Lfinished: R0 = 0; P3 = I1; RTS; + .size _memcmp,.-_memcmp libc_hidden_def (memcmp) #ifdef __UCLIBC_SUSV3_LEGACY__ -strong_alias (memcmp,bcmp) +weak_alias (memcmp,bcmp) #endif diff --git a/libc/string/bfin/memcpy.S b/libc/string/bfin/memcpy.S index e7ba7048e..bdd760691 100644 --- a/libc/string/bfin/memcpy.S +++ b/libc/string/bfin/memcpy.S @@ -1,5 +1,5 @@ /* memcpy.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. 
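The .Lquad_different comment in the Blackfin memcmp above deserves a concrete illustration: on a little-endian machine the byte at the lowest address is the least significant byte of a loaded word, so a straight word comparison weights the wrong end of the string, and a mismatching quad has to be re-read bytewise. A small self-contained C demonstration of the trap (a sketch, not part of the patch; run on a little-endian host):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    int main(void)
    {
        /* s1 is smaller in its first byte but larger in its last. */
        const unsigned char s1[4] = { 1, 2, 3, 9 };
        const unsigned char s2[4] = { 2, 2, 3, 4 };
        uint32_t w1, w2;
        memcpy(&w1, s1, 4);
        memcpy(&w2, s2, 4);
        /* Little-endian: w1 = 0x09030201 > w2 = 0x04030202, yet
         * memcmp(s1, s2, 4) < 0.  The word compare can therefore
         * only answer "equal or not"; ordering needs the bytes. */
        printf("w1 > w2: %d, memcmp < 0: %d\n",
               w1 > w2, memcmp(s1, s2, 4) < 0);
        return 0;
    }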
+ * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved. * * This file is subject to the terms and conditions of the GNU Library General * Public License. See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@ * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html */ +#include <sysdep.h> + /* void *memcpy(void *dest, const void *src, size_t n); * R0 = To Address (dest) (leave unchanged to form result) * R1 = From Address (src) @@ -21,54 +23,55 @@ .align 2 -.global _memcpy -.type _memcpy, STT_FUNC -_memcpy: +.weak _memcpy +ENTRY(_memcpy) [--SP] = P3; - P0 = R0; // P0 = To address - P3 = R1; // P3 = From Address - P2 = R2 ; // P2 = count + P0 = R0; /* P0 = To address */ + P3 = R1; /* P3 = From Address */ + P2 = R2; /* P2 = count */ CC = R2 <= 7(IU); - IF CC JUMP too_small; + IF CC JUMP .Ltoo_small; I0 = R1; - R3 = R1 | R0; // OR addresses together - R3 <<= 30; // check bottom two bits - CC = AZ; // AZ set if zero. - IF !CC JUMP bytes ; // Jump if addrs not aligned. - P1 = P2 >> 2; // count = n/4 + R3 = R1 | R0; /* OR addresses together */ + R3 <<= 30; /* check bottom two bits */ + CC = AZ; /* AZ set if zero. */ + IF !CC JUMP .Lbytes; /* Jump if addrs not aligned. */ + P1 = P2 >> 2; /* count = n/4 */ P1 += -1; R3 = 3; - R2 = R2 & R3; // remainder - P2 = R2; // set remainder + R2 = R2 & R3; /* remainder */ + P2 = R2; /* set remainder */ R1 = [I0++]; #if !defined(__WORKAROUND_AVOID_DAG1) - LSETUP (quad_loop , quad_loop) LC0=P1; -quad_loop: MNOP || [P0++] = R1 || R1 = [I0++]; + LSETUP (.Lquad_loop, .Lquad_loop) LC0=P1; +.Lquad_loop: MNOP || [P0++] = R1 || R1 = [I0++]; #else - LSETUP (quad_loop_s , quad_loop_e) LC0=P1; -quad_loop_s: [P0++] = R1; -quad_loop_e: R1 = [I0++]; + LSETUP (.Lquad_loop_s, .Lquad_loop_e) LC0=P1; +.Lquad_loop_s: [P0++] = R1; +.Lquad_loop_e: R1 = [I0++]; #endif [P0++] = R1; - CC = P2 == 0; // any remaining bytes? - P3 = I0; // Ammend P3 for remaining copy - IF !CC JUMP bytes; + CC = P2 == 0; /* any remaining bytes? */ + P3 = I0; /* Ammend P3 for remaining copy */ + IF !CC JUMP .Lbytes; P3 = [SP++]; RTS; -too_small: - CC = P2 == 0; //Check zero count - IF CC JUMP finished; // very unlikely +.Ltoo_small: + CC = P2 == 0; /* Check zero count */ + IF CC JUMP .Lfinished; /* very unlikely */ -bytes: - LSETUP (byte_loop_s , byte_loop_e) LC0=P2; -byte_loop_s: R1 = B[P3++](Z); -byte_loop_e: B[P0++] = R1; +.Lbytes: + LSETUP (.Lbyte_loop_s, .Lbyte_loop_e) LC0=P2; +.Lbyte_loop_s: R1 = B[P3++](Z); +.Lbyte_loop_e: B[P0++] = R1; -finished: +.Lfinished: P3 = [SP++]; + RTS; + .size _memcpy,.-_memcpy libc_hidden_def (memcpy) diff --git a/libc/string/bfin/memmove.S b/libc/string/bfin/memmove.S index 3d446f326..73e363820 100644 --- a/libc/string/bfin/memmove.S +++ b/libc/string/bfin/memmove.S @@ -1,5 +1,5 @@ /* memmove.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. + * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved. * * This file is subject to the terms and conditions of the GNU Library General * Public License. 
See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@ * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html */ +#include <sysdep.h> + /* void *memmove(void *dest, const void *src, size_t n); * R0 = To Address (dest) (leave unchanged to form result) * R1 = From Address (src) @@ -21,75 +23,78 @@ .align 2 -.global _memmove -.type _memmove, STT_FUNC -_memmove: +.weak _memmove +ENTRY(_memmove) I1 = P3; - P0 = R0; // P0 = To address - P3 = R1; // P3 = From Address - P2 = R2 ; // P2 = count - CC = P2 == 0; //Check zero count - IF CC JUMP finished; // very unlikely + P0 = R0; /* P0 = To address */ + P3 = R1; /* P3 = From Address */ + P2 = R2; /* P2 = count */ + CC = P2 == 0; /* Check zero count*/ + IF CC JUMP .Lfinished; /* very unlikely */ - CC = R1 < R0 (IU); // From < To - IF !CC JUMP no_overlap; + CC = R1 < R0 (IU); /* From < To */ + IF !CC JUMP .Lno_overlap; R3 = R1 + R2; - CC = R0 <= R3 (IU); // (From+len) >= To - IF CC JUMP overlap; -no_overlap: + CC = R0 <= R3 (IU); /* (From+len) >= To */ + IF CC JUMP .Loverlap; +.Lno_overlap: R3 = 11; CC = R2 <= R3; - IF CC JUMP bytes; - R3 = R1 | R0; // OR addresses together - R3 <<= 30; // check bottom two bits - CC = AZ; // AZ set if zero. - IF !CC JUMP bytes ; // Jump if addrs not aligned. + IF CC JUMP .Lbytes; + R3 = R1 | R0; /* OR addresses together */ + R3 <<= 30; /* check bottom two bits */ + CC = AZ; /* AZ set if zero.*/ + IF !CC JUMP .Lbytes; /* Jump if addrs not aligned.*/ I0 = P3; - P1 = P2 >> 2; // count = n/4 + P1 = P2 >> 2; /* count = n/4 */ P1 += -1; R3 = 3; - R2 = R2 & R3; // remainder - P2 = R2; // set remainder + R2 = R2 & R3; /* remainder */ + P2 = R2; /* set remainder */ R1 = [I0++]; #if !defined(__WORKAROUND_AVOID_DAG1) - LSETUP (quad_loop , quad_loop) LC0=P1; -quad_loop: MNOP || [P0++] = R1 || R1 = [I0++]; + LSETUP (.Lquad_loop, .Lquad_loop) LC0=P1; +.Lquad_loop: MNOP || [P0++] = R1 || R1 = [I0++]; #else - LSETUP (quad_loop_s, quad_loop_e) LC0=P1; -quad_loop_s: [P0++] = R1; -quad_loop_e: R1 = [I0++]; + LSETUP (.Lquad_loop_s, .Lquad_loop_e) LC0=P1; +.Lquad_loop_s: [P0++] = R1; +.Lquad_loop_e: R1 = [I0++]; #endif [P0++] = R1; - CC = P2 == 0; // any remaining bytes? - P3 = I0; // Ammend P3 to updated ptr. - IF !CC JUMP bytes; + CC = P2 == 0; /* any remaining bytes? */ + P3 = I0; /* Ammend P3 to updated ptr. */ + IF !CC JUMP .Lbytes; P3 = I1; RTS; -bytes: LSETUP (byte2_s , byte2_e) LC0=P2; -byte2_s: R1 = B[P3++](Z); -byte2_e: B[P0++] = R1; +.Lbytes: LSETUP (.Lbyte2_s, .Lbyte2_e) LC0=P2; +.Lbyte2_s: R1 = B[P3++](Z); +.Lbyte2_e: B[P0++] = R1; -finished: - P3 = I1; +.Lfinished: P3 = I1; RTS; -overlap: +.Loverlap: P2 += -1; P0 = P0 + P2; P3 = P3 + P2; R1 = B[P3--] (Z); CC = P2 == 0; - IF CC JUMP no_loop; - LSETUP (ol_s, ol_e) LC0 = P2; -ol_s: B[P0--] = R1; -ol_e: R1 = B[P3--] (Z); -no_loop: B[P0] = R1; + IF CC JUMP .Lno_loop; +#if defined(__WORKAROUND_SPECULATIVE_LOADS) + NOP; + NOP; +#endif + LSETUP (.Lol_s, .Lol_e) LC0 = P2; +.Lol_s: B[P0--] = R1; +.Lol_e: R1 = B[P3--] (Z); +.Lno_loop: B[P0] = R1; P3 = I1; RTS; + .size _memmove,.-_memmove libc_hidden_def (memmove) diff --git a/libc/string/bfin/memset.S b/libc/string/bfin/memset.S index bd8eb4b6a..64012f783 100644 --- a/libc/string/bfin/memset.S +++ b/libc/string/bfin/memset.S @@ -1,5 +1,5 @@ /* memset.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. + * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved. * * This file is subject to the terms and conditions of the GNU Library General * Public License. 
See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@ * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html */ +#include <sysdep.h> + /* void *memset(void *s, int c, size_t n); * R0 = address (s) (leave unchanged to form result) * R1 = filler byte (c) @@ -21,66 +23,68 @@ .align 2 -.global _memset -.type _memset, STT_FUNC -_memset: - P0 = R0 ; // P0 = address - P2 = R2 ; // P2 = count - R3 = R0 + R2; // end +.weak _memset +ENTRY(_memset) + P0 = R0 ; /* P0 = address */ + P2 = R2 ; /* P2 = count */ + R3 = R0 + R2; /* end */ CC = R2 <= 7(IU); - IF CC JUMP too_small; - R1 = R1.B (Z); // R1 = fill char + IF CC JUMP .Ltoo_small; + R1 = R1.B (Z); /* R1 = fill char */ R2 = 3; - R2 = R0 & R2; // addr bottom two bits - CC = R2 == 0; // AZ set if zero. - IF !CC JUMP force_align ; // Jump if addr not aligned. + R2 = R0 & R2; /* addr bottom two bits */ + CC = R2 == 0; /* AZ set if zero. */ + IF !CC JUMP .Lforce_align ; /* Jump if addr not aligned. */ -aligned: - P1 = P2 >> 2; // count = n/4 - R2 = R1 << 8; // create quad filler +.Laligned: + P1 = P2 >> 2; /* count = n/4 */ + R2 = R1 << 8; /* create quad filler */ R2.L = R2.L + R1.L(NS); R2.H = R2.L + R1.H(NS); P2 = R3; - LSETUP (quad_loop , quad_loop) LC0=P1; -quad_loop: + LSETUP (.Lquad_loop , .Lquad_loop) LC0=P1; +.Lquad_loop: [P0++] = R2; CC = P0 == P2; - IF !CC JUMP bytes_left; + IF !CC JUMP .Lbytes_left; RTS; -bytes_left: - R2 = R3; // end point - R3 = P0; // current position - R2 = R2 - R3; // bytes left +.Lbytes_left: + R2 = R3; /* end point */ + R3 = P0; /* current position */ + R2 = R2 - R3; /* bytes left */ P2 = R2; -too_small: - CC = P2 == 0; //Check zero count - IF CC JUMP finished; // Unusual +.Ltoo_small: + CC = P2 == 0; /* Check zero count */ + IF CC JUMP .Lfinished; /* Unusual */ -bytes: LSETUP (byte_loop , byte_loop) LC0=P2; -byte_loop: B[P0++] = R1; +.Lbytes: + LSETUP (.Lbyte_loop , .Lbyte_loop) LC0=P2; +.Lbyte_loop: + B[P0++] = R1; -finished: +.Lfinished: RTS; -force_align: - CC = BITTST (R0, 0 ); // odd byte +.Lforce_align: + CC = BITTST (R0, 0); /* odd byte */ R0 = 4; R0 = R0 - R2; P1 = R0; - R0 = P0; // Recover return address - IF !CC JUMP skip1; + R0 = P0; /* Recover return address */ + IF !CC JUMP .Lskip1; B[P0++] = R1; -skip1: - CC = R2 <= 2; // 2 bytes - P2 -= P1; // reduce count - IF !CC JUMP aligned; +.Lskip1: + CC = R2 <= 2; /* 2 bytes */ + P2 -= P1; /* reduce count */ + IF !CC JUMP .Laligned; B[P0++] = R1; B[P0++] = R1; - JUMP aligned; + JUMP .Laligned; + .size _memset,.-_memset libc_hidden_def (memset) diff --git a/libc/string/bfin/strcmp.S b/libc/string/bfin/strcmp.S index 6365024ec..12e8c53c6 100644 --- a/libc/string/bfin/strcmp.S +++ b/libc/string/bfin/strcmp.S @@ -1,5 +1,5 @@ /* strcmp.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. + * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved. * * This file is subject to the terms and conditions of the GNU Library General * Public License. See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@ * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html */ +#include <sysdep.h> + /* Fast strcmp() for Blackfin. * When both strings are aligned, this processes four characters at * a time. 
Uses a hw loop with "very big" count to loop "forever", @@ -21,9 +23,8 @@ .align 2 -.global _strcmp -.type _strcmp, STT_FUNC -_strcmp: +.weak _strcmp +ENTRY(_strcmp) [--sp] = (R7:4); p1 = r0; p2 = r1; @@ -34,13 +35,13 @@ _strcmp: r0 = r0 | r1; // check both pointers at same time r0 <<= 30; // dump all but last 2 bits cc = az; // are they zero? - if !cc jump unaligned; // no; use unaligned code. + if !cc jump .Lunaligned; // no; use unaligned code. // fall-thru for aligned case.. // note that r0 is zero from the previous... // p0 set to -1 - lsetup (beginloop, endloop) lc0=p0; + LSETUP (.Lbeginloop, .Lendloop) lc0=p0; // pick up first words r1 = [p1++]; r2 = [p2++]; @@ -49,8 +50,8 @@ _strcmp: r7.h = 0xFF; // loop : 9 cycles to check 4 characters cc = r1 == r2; -beginloop: - if !cc jump notequal4; // compare failure, exit loop +.Lbeginloop: + if !cc jump .Lnotequal4; // compare failure, exit loop // starting with 44332211 // see if char 3 or char 1 is 0 @@ -63,18 +64,18 @@ beginloop: // add to zero, and (r1 is free, reload) r6 = r3 +|+ r0 || r1 = [p1++] || nop; cc |= az; // true if either is zero - if cc jump zero4; // leave if a zero somewhere -endloop: + if cc jump .Lzero4; // leave if a zero somewhere +.Lendloop: cc = r1 == r2; // loop exits -notequal4: // compare failure on 4-char compare +.Lnotequal4: // compare failure on 4-char compare // address pointers are one word ahead; // faster to use zero4 exit code p1 += 4; p2 += 4; -zero4: // one of the bytes in word 1 is zero +.Lzero4: // one of the bytes in word 1 is zero // but we've already fetched the next word; so // backup two to look at failing word again p1 += -8; @@ -85,27 +86,27 @@ zero4: // one of the bytes in word 1 is zero // here when pointers are unaligned: checks one // character at a time. Also use at the end of // the word-check algorithm to figure out what happened -unaligned: +.Lunaligned: // R0 is non-zero from before. // p0 set to -1 r0 = 0 (Z); r1 = B[p1++] (Z); r2 = B[p2++] (Z); - lsetup (beginloop1, endloop1) lc0=p0; + LSETUP (.Lbeginloop1, .Lendloop1) lc0=p0; -beginloop1: +.Lbeginloop1: cc = r1; // first char must be non-zero // chars must be the same r3 = r2 - r1 (NS) || r1 = B[p1++] (Z) || nop; cc &= az; r3 = r0 - r2; // second char must be non-zero cc &= an; - if !cc jump exitloop1; -endloop1: + if !cc jump .Lexitloop1; +.Lendloop1: r2 = B[p2++] (Z); -exitloop1: // here means we found a zero or a difference. +.Lexitloop1: // here means we found a zero or a difference. // we have r2(N), p2(N), r1(N+1), p1(N+2) r1=B[p1+ -2] (Z); r0 = r1 - r2; @@ -116,6 +117,6 @@ exitloop1: // here means we found a zero or a difference. libc_hidden_def (strcmp) #ifndef __UCLIBC_HAS_LOCALE__ -strong_alias (strcmp,strcoll) +weak_alias (strcmp,strcoll) libc_hidden_def (strcoll) #endif diff --git a/libc/string/cris/memcpy.c b/libc/string/cris/memcpy.c index a85108109..0cce37a30 100644 --- a/libc/string/cris/memcpy.c +++ b/libc/string/cris/memcpy.c @@ -66,7 +66,7 @@ void *memcpy(void *, const void *, unsigned int); -libc_hidden_proto(memcpy) +/* Experimentally off - libc_hidden_proto(memcpy) */ void *memcpy(void *pdst, const void *psrc, unsigned int pn) @@ -130,7 +130,7 @@ void *memcpy(void *pdst, here (beware: they may be moved to temporary registers). This way, we do not have to save/move the registers around into temporaries; we can safely use them straight away. */ - __asm__ volatile ("\ + __asm__ __volatile__ ("\ .syntax no_register_prefix \n\ \n\ ;; Check that the register asm declaration got right. 
\n\ diff --git a/libc/string/cris/memmove.c b/libc/string/cris/memmove.c index 437637078..b6620afe0 100644 --- a/libc/string/cris/memmove.c +++ b/libc/string/cris/memmove.c @@ -27,7 +27,7 @@ #include "memcopy.h" #include "../generic/pagecopy.h" -libc_hidden_proto(memmove) +/* Experimentally off - libc_hidden_proto(memmove) */ void *memmove (void *dest, const void *src, size_t len) { unsigned long int dstp = (long int) dest; diff --git a/libc/string/cris/memset.c b/libc/string/cris/memset.c index 7e71bc50f..9cc959a33 100644 --- a/libc/string/cris/memset.c +++ b/libc/string/cris/memset.c @@ -59,7 +59,7 @@ void *memset(void *, int, unsigned long); -libc_hidden_proto(memset) +/* Experimentally off - libc_hidden_proto(memset) */ void *memset(void *pdst, int c, unsigned long plen) @@ -124,7 +124,7 @@ void *memset(void *pdst, here (beware: they may be moved to temporary registers). This way, we do not have to save/move the registers around into temporaries; we can safely use them straight away. */ - __asm__ volatile (" \n\ + __asm__ __volatile__ (" \n\ .syntax no_register_prefix \n\ \n\ ;; Check that the register asm declaration got right. \n\ diff --git a/libc/string/cris/strcpy.c b/libc/string/cris/strcpy.c index 0af25253e..955a990b7 100644 --- a/libc/string/cris/strcpy.c +++ b/libc/string/cris/strcpy.c @@ -6,7 +6,7 @@ #include <string.h> -libc_hidden_proto(strcpy) +/* Experimentally off - libc_hidden_proto(strcpy) */ char *strcpy(char *dest, const char *src) { char *ret = dest; diff --git a/libc/string/cris/strncpy.c b/libc/string/cris/strncpy.c index 93a6608bc..3f2775bdd 100644 --- a/libc/string/cris/strncpy.c +++ b/libc/string/cris/strncpy.c @@ -6,9 +6,9 @@ #include <string.h> -libc_hidden_proto(memset) +/* Experimentally off - libc_hidden_proto(memset) */ -libc_hidden_proto(strncpy) +/* Experimentally off - libc_hidden_proto(strncpy) */ char *strncpy(char *dest, const char *src, size_t count) { char *ret = dest; diff --git a/libc/string/frv/memset.S b/libc/string/frv/memset.S index 4e64550e4..477597dcd 100644 --- a/libc/string/frv/memset.S +++ b/libc/string/frv/memset.S @@ -155,4 +155,4 @@ memset: bralr .size memset, .-memset -libc_hidden_proto(memset) +/* Experimentally off - libc_hidden_proto(memset) */ diff --git a/libc/string/i386/memchr.c b/libc/string/i386/memchr.c index 229d42919..fe4537914 100644 --- a/libc/string/i386/memchr.c +++ b/libc/string/i386/memchr.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(memchr) +/* Experimentally off - libc_hidden_proto(memchr) */ void *memchr(const void *cs, int c, size_t count) { int d0; diff --git a/libc/string/i386/memcpy.c b/libc/string/i386/memcpy.c index a2b8d3d8c..285583f3b 100644 --- a/libc/string/i386/memcpy.c +++ b/libc/string/i386/memcpy.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(memcpy) +/* Experimentally off - libc_hidden_proto(memcpy) */ void *memcpy(void * to, const void * from, size_t n) { int d0, d1, d2; diff --git a/libc/string/i386/memmove.c b/libc/string/i386/memmove.c index a26fe2be1..a924efcbc 100644 --- a/libc/string/i386/memmove.c +++ b/libc/string/i386/memmove.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(memmove) +/* Experimentally off - libc_hidden_proto(memmove) */ void *memmove(void *dest, const void *src, size_t n) { int d0, d1, d2; diff --git a/libc/string/i386/memset.c b/libc/string/i386/memset.c index eea48040a..bbaa45215 100644 --- a/libc/string/i386/memset.c +++ b/libc/string/i386/memset.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(memset) +/* Experimentally 
off - libc_hidden_proto(memset) */ void *memset(void *s, int c, size_t count) { int d0, d1; diff --git a/libc/string/i386/strcat.c b/libc/string/i386/strcat.c index e0b1f3b51..2cf0237a6 100644 --- a/libc/string/i386/strcat.c +++ b/libc/string/i386/strcat.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strcat) +/* Experimentally off - libc_hidden_proto(strcat) */ char *strcat(char * dest, const char * src) { int d0, d1, d2, d3; diff --git a/libc/string/i386/strchr.c b/libc/string/i386/strchr.c index 7568d48db..46b1dfb6e 100644 --- a/libc/string/i386/strchr.c +++ b/libc/string/i386/strchr.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strchr) +/* Experimentally off - libc_hidden_proto(strchr) */ char *strchr(const char *s, int c) { int d0; diff --git a/libc/string/i386/strcmp.c b/libc/string/i386/strcmp.c index 47635d817..eff230c5c 100644 --- a/libc/string/i386/strcmp.c +++ b/libc/string/i386/strcmp.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strcmp) +/* Experimentally off - libc_hidden_proto(strcmp) */ int strcmp(const char *cs, const char *ct) { int d0, d1; @@ -55,7 +55,7 @@ int strcmp(const char *cs, const char *ct) libc_hidden_def(strcmp) #ifndef __UCLIBC_HAS_LOCALE__ -libc_hidden_proto(strcoll) +/* Experimentally off - libc_hidden_proto(strcoll) */ strong_alias(strcmp,strcoll) libc_hidden_def(strcoll) #endif diff --git a/libc/string/i386/strcpy.c b/libc/string/i386/strcpy.c index 9e2b81009..09065a9b7 100644 --- a/libc/string/i386/strcpy.c +++ b/libc/string/i386/strcpy.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strcpy) +/* Experimentally off - libc_hidden_proto(strcpy) */ char *strcpy(char * dest, const char * src) { int d0, d1, d2; diff --git a/libc/string/i386/strlen.c b/libc/string/i386/strlen.c index f0767b600..61a178393 100644 --- a/libc/string/i386/strlen.c +++ b/libc/string/i386/strlen.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strlen) +/* Experimentally off - libc_hidden_proto(strlen) */ size_t strlen(const char *s) { int d0; diff --git a/libc/string/i386/strncat.c b/libc/string/i386/strncat.c index c1061421e..5849db3b3 100644 --- a/libc/string/i386/strncat.c +++ b/libc/string/i386/strncat.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strncat) +/* Experimentally off - libc_hidden_proto(strncat) */ char *strncat(char * dest, const char * src, size_t count) { diff --git a/libc/string/i386/strncmp.c b/libc/string/i386/strncmp.c index d716789c3..a14bb503b 100644 --- a/libc/string/i386/strncmp.c +++ b/libc/string/i386/strncmp.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strncmp) +/* Experimentally off - libc_hidden_proto(strncmp) */ int strncmp(const char *cs, const char *ct, size_t count) { register int __res; diff --git a/libc/string/i386/strncpy.c b/libc/string/i386/strncpy.c index c061fe37e..76aa6ae1b 100644 --- a/libc/string/i386/strncpy.c +++ b/libc/string/i386/strncpy.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strncpy) +/* Experimentally off - libc_hidden_proto(strncpy) */ char *strncpy(char * dest, const char * src, size_t count) { int d0, d1, d2, d3; diff --git a/libc/string/i386/strnlen.c b/libc/string/i386/strnlen.c index 77b5c7568..02c72f530 100644 --- a/libc/string/i386/strnlen.c +++ b/libc/string/i386/strnlen.c @@ -33,7 +33,7 @@ #include <string.h> #ifdef __USE_GNU -libc_hidden_proto(strnlen) +/* Experimentally off - libc_hidden_proto(strnlen) */ size_t strnlen(const char *s, size_t count) { int d0; diff --git a/libc/string/i386/strrchr.c 
b/libc/string/i386/strrchr.c index e3b2df6fb..ef378685b 100644 --- a/libc/string/i386/strrchr.c +++ b/libc/string/i386/strrchr.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strrchr) +/* Experimentally off - libc_hidden_proto(strrchr) */ char *strrchr(const char *s, int c) { int d0, d1; diff --git a/libc/string/ia64/memcpy.S b/libc/string/ia64/memcpy.S index db019f860..810eb0c0e 100644 --- a/libc/string/ia64/memcpy.S +++ b/libc/string/ia64/memcpy.S @@ -115,7 +115,7 @@ #if defined(USE_LFETCH) #define LOOP(shift) \ ALIGN(32); \ -.loop##shift##: \ +.loop##shift : \ { .mmb \ (p[0]) ld8.nt1 r[0] = [asrc], 8 ; \ (p[0]) lfetch.nt1 [ptr1], 16 ; \ @@ -139,7 +139,7 @@ #else #define LOOP(shift) \ ALIGN(32); \ -.loop##shift##: \ +.loop##shift : \ { .mmb \ (p[0]) ld8.nt1 r[0] = [asrc], 8 ; \ nop.b 0 ; \ diff --git a/libc/string/ia64/memmove.S b/libc/string/ia64/memmove.S index 0328f84de..00342d8e0 100644 --- a/libc/string/ia64/memmove.S +++ b/libc/string/ia64/memmove.S @@ -64,7 +64,7 @@ #define LOOP(shift) \ ALIGN(32); \ -.loop##shift##: \ +.loop##shift : \ (p[0]) ld8 r[0] = [asrc], 8 ; /* w1 */ \ (p[MEMLAT+1]) st8 [dest] = value, 8 ; \ (p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; \ diff --git a/libc/string/ia64/sysdep.h b/libc/string/ia64/sysdep.h index 03e74360d..d10020ac1 100644 --- a/libc/string/ia64/sysdep.h +++ b/libc/string/ia64/sysdep.h @@ -34,7 +34,7 @@ #define ASM_UNW_PRLG_GRSAVE(ninputs) (32+(ninputs)) #ifdef __STDC__ -#define C_LABEL(name) name##: +#define C_LABEL(name) name : #else #define C_LABEL(name) name/**/: #endif diff --git a/libc/string/powerpc/memcpy.c b/libc/string/powerpc/memcpy.c index ed8022313..bcbb806f8 100644 --- a/libc/string/powerpc/memcpy.c +++ b/libc/string/powerpc/memcpy.c @@ -21,7 +21,7 @@ #include <string.h> -libc_hidden_proto(memcpy) +/* Experimentally off - libc_hidden_proto(memcpy) */ void *memcpy(void *to, const void *from, size_t n) /* PPC can do pre increment and load/store, but not post increment and load/store. Therefore use *++ptr instead of *ptr++. */ diff --git a/libc/string/powerpc/memmove.c b/libc/string/powerpc/memmove.c index 327161116..7a4a7e5ff 100644 --- a/libc/string/powerpc/memmove.c +++ b/libc/string/powerpc/memmove.c @@ -21,9 +21,9 @@ #include <string.h> -libc_hidden_proto(memcpy) +/* Experimentally off - libc_hidden_proto(memcpy) */ -libc_hidden_proto(memmove) +/* Experimentally off - libc_hidden_proto(memmove) */ void *memmove(void *to, const void *from, size_t n) { unsigned long rem, chunks, tmp1, tmp2; diff --git a/libc/string/powerpc/memset.c b/libc/string/powerpc/memset.c index 891e0b8aa..d62ec0ee0 100644 --- a/libc/string/powerpc/memset.c +++ b/libc/string/powerpc/memset.c @@ -21,14 +21,14 @@ #include <string.h> -libc_hidden_proto(memset) +/* Experimentally off - libc_hidden_proto(memset) */ static inline int expand_byte_word(int c){ /* this does: c = c << 8 | c; c = c << 16 | c ; */ - asm("rlwimi %0,%0,8,16,23\n" + __asm__("rlwimi %0,%0,8,16,23\n" "\trlwimi %0,%0,16,0,15\n" : "=r" (c) : "0" (c)); return c; diff --git a/libc/string/sparc/_glibc_inc.h b/libc/string/sparc/_glibc_inc.h index 4eb4d755c..e0aef52c2 100644 --- a/libc/string/sparc/_glibc_inc.h +++ b/libc/string/sparc/_glibc_inc.h @@ -6,6 +6,8 @@ #include <features.h> #include <bits/wordsize.h> +/* Is alignment really needed? 
*/ + #if __WORDSIZE == 32 # define ENTRY_ALIGN 4 #else diff --git a/libc/string/sparc/sparc32/sparcv9b/memchr.S b/libc/string/sparc/sparc32/sparcv9b/memchr.S index 7e86a2972..43a16ff11 100644 --- a/libc/string/sparc/sparc32/sparcv9b/memchr.S +++ b/libc/string/sparc/sparc32/sparcv9b/memchr.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include "sparc64/memchr.S" +#include "../../sparc64/memchr.S" diff --git a/libc/string/sparc/sparc32/sparcv9b/memcpy.S b/libc/string/sparc/sparc32/sparcv9b/memcpy.S index 7f697542e..2024869dd 100644 --- a/libc/string/sparc/sparc32/sparcv9b/memcpy.S +++ b/libc/string/sparc/sparc32/sparcv9b/memcpy.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include "sparc64/sparcv9b/memcpy.S" +#include "../../sparc64/sparcv9b/memcpy.S" diff --git a/libc/string/sparc/sparc32/sparcv9b/memset.S b/libc/string/sparc/sparc32/sparcv9b/memset.S index ac67b7ab7..e49173172 100644 --- a/libc/string/sparc/sparc32/sparcv9b/memset.S +++ b/libc/string/sparc/sparc32/sparcv9b/memset.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include <sparc64/memset.S> +#include "../../sparc64/memset.S" diff --git a/libc/string/sparc/sparc32/sparcv9b/stpcpy.S b/libc/string/sparc/sparc32/sparcv9b/stpcpy.S index 440ad7e21..17ffa5e4d 100644 --- a/libc/string/sparc/sparc32/sparcv9b/stpcpy.S +++ b/libc/string/sparc/sparc32/sparcv9b/stpcpy.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include <sparc64/stpcpy.S> +#include "../../sparc64/stpcpy.S" diff --git a/libc/string/sparc/sparc32/sparcv9b/strcat.S b/libc/string/sparc/sparc32/sparcv9b/strcat.S index 7a2223570..9ed125a4b 100644 --- a/libc/string/sparc/sparc32/sparcv9b/strcat.S +++ b/libc/string/sparc/sparc32/sparcv9b/strcat.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include <sparc64/strcat.S> +#include "../../sparc64/strcat.S" diff --git a/libc/string/sparc/sparc32/sparcv9b/strchr.S b/libc/string/sparc/sparc32/sparcv9b/strchr.S index ddd32120d..6b2727a1f 100644 --- a/libc/string/sparc/sparc32/sparcv9b/strchr.S +++ b/libc/string/sparc/sparc32/sparcv9b/strchr.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include <sparc64/strchr.S> +#include "../../sparc64/strchr.S" diff --git a/libc/string/sparc/sparc32/sparcv9b/strcmp.S b/libc/string/sparc/sparc32/sparcv9b/strcmp.S index 5330f4359..854403ffd 100644 --- a/libc/string/sparc/sparc32/sparcv9b/strcmp.S +++ b/libc/string/sparc/sparc32/sparcv9b/strcmp.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include <sparc64/strcmp.S> +#include "../../sparc64/strcmp.S" diff --git a/libc/string/sparc/sparc32/sparcv9b/strcpy.S b/libc/string/sparc/sparc32/sparcv9b/strcpy.S index 0b35c9be0..e8102bde4 100644 --- a/libc/string/sparc/sparc32/sparcv9b/strcpy.S +++ b/libc/string/sparc/sparc32/sparcv9b/strcpy.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include <sparc64/strcpy.S> +#include "../../sparc64/strcpy.S" diff --git a/libc/string/sparc/sparc32/sparcv9b/strlen.S b/libc/string/sparc/sparc32/sparcv9b/strlen.S index b8f4dba4f..8673333a2 100644 --- a/libc/string/sparc/sparc32/sparcv9b/strlen.S +++ b/libc/string/sparc/sparc32/sparcv9b/strlen.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include <sparc64/strlen.S> +#include "../../sparc64/strlen.S" diff --git a/libc/string/x86_64/_glibc_inc.h b/libc/string/x86_64/_glibc_inc.h index 
88cef2ea3..415ce90a7 100644 --- a/libc/string/x86_64/_glibc_inc.h +++ b/libc/string/x86_64/_glibc_inc.h @@ -6,15 +6,8 @@ #include <features.h> #include <bits/wordsize.h> -#if __WORDSIZE == 32 -# define ENTRY_ALIGN 4 -#else -# define ENTRY_ALIGN 2 -#endif - #define ENTRY(sym) \ .global sym; \ - .align ENTRY_ALIGN; \ .type sym,%function; \ sym: diff --git a/libc/string/x86_64/memcpy.S b/libc/string/x86_64/memcpy.S index b3bb0f96c..697b992d0 100644 --- a/libc/string/x86_64/memcpy.S +++ b/libc/string/x86_64/memcpy.S @@ -59,9 +59,9 @@ ENTRY (BP_SYM (memcpy)) subq $32, %rcx js 2f - .p2align 4 + /* Next 3 insns are 11 bytes total, make sure we decode them in one go */ + .p2align 4,,11 3: - /* Now correct the loop counter. Please note that in the following code the flags are not changed anymore. */ subq $32, %rcx diff --git a/libc/string/x86_64/memset.S b/libc/string/x86_64/memset.S index d72d74468..46751006b 100644 --- a/libc/string/x86_64/memset.S +++ b/libc/string/x86_64/memset.S @@ -53,15 +53,17 @@ ENTRY (memset) imul %rax,%r8 #endif test $0x7,%edi /* Check for alignment. */ - je 2f + jz 2f - .p2align 4 -1: /* Align ptr to 8 byte. */ + /* Next 3 insns are 9 bytes total, make sure we decode them in one go */ + .p2align 4,,9 +1: + /* Align ptr to 8 byte. */ mov %sil,(%rcx) dec %rdx inc %rcx - test $0x7,%ecx - jne 1b + test $0x7,%cl + jnz 1b 2: /* Check for really large regions. */ mov %rdx,%rax @@ -70,8 +72,10 @@ ENTRY (memset) cmp LARGE, %rdx jae 11f - .p2align 4 -3: /* Copy 64 bytes. */ + /* Next 3 insns are 11 bytes total, make sure we decode them in one go */ + .p2align 4,,11 +3: + /* Fill 64 bytes. */ mov %r8,(%rcx) mov %r8,0x8(%rcx) mov %r8,0x10(%rcx) @@ -84,7 +88,7 @@ ENTRY (memset) dec %rax jne 3b -4: /* Copy final bytes. */ +4: /* Fill final bytes. */ and $0x3f,%edx mov %rdx,%rax shr $0x3,%rax @@ -107,16 +111,18 @@ ENTRY (memset) jne 8b 9: #if BZERO_P - nop + /* nothing */ #else /* Load result (only if used as memset). */ mov %rdi,%rax /* start address of destination is result */ #endif retq - .p2align 4 -11: /* Copy 64 bytes without polluting the cache. */ - /* We could use movntdq %xmm0,(%rcx) here to further + /* Next 3 insns are 14 bytes total, make sure we decode them in one go */ + .p2align 4,,14 +11: + /* Fill 64 bytes without polluting the cache. */ + /* We could use movntdq %xmm0,(%rcx) here to further speed up for large cases but let's not use XMM registers. */ movnti %r8,(%rcx) movnti %r8,0x8(%rcx) diff --git a/libc/string/x86_64/strcat.S b/libc/string/x86_64/strcat.S index 9b0068981..23d068fea 100644 --- a/libc/string/x86_64/strcat.S +++ b/libc/string/x86_64/strcat.S @@ -21,6 +21,7 @@ #include "_glibc_inc.h" +/* Seems to be unrolled too much */ .text ENTRY (BP_SYM (strcat)) @@ -44,7 +45,9 @@ ENTRY (BP_SYM (strcat)) /* Now the source is aligned. Scan for NUL byte. */ - .p2align 4 + + /* Next 3 insns are 10 bytes total, make sure we decode them in one go */ + .p2align 4,,10 4: /* First unroll. */ movq (%rax), %rcx /* get double word (= 8 bytes) in question */ @@ -102,8 +105,11 @@ ENTRY (BP_SYM (strcat)) the addition will not result in 0. */ jz 4b /* no NUL found => continue loop */ - .p2align 4 /* Align, it's a jump target. */ -3: subq $8,%rax /* correct pointer increment. */ + /* Align, it is a jump target. */ + /* Next 3 insns are 8 bytes total, make sure we decode them in one go */ + .p2align 3,,8 +3: + subq $8,%rax /* correct pointer increment. */ testb %cl, %cl /* is first byte NUL? 
*/ jz 2f /* yes => return */ @@ -159,7 +165,9 @@ ENTRY (BP_SYM (strcat)) /* Now the sources is aligned. Unfortunatly we cannot force to have both source and destination aligned, so ignore the alignment of the destination. */ - .p2align 4 + + /* Next 3 insns are 10 bytes total, make sure we decode them in one go */ + .p2align 4,,10 22: /* 1st unroll. */ movq (%rsi), %rax /* Read double word (8 bytes). */ @@ -236,7 +244,9 @@ ENTRY (BP_SYM (strcat)) /* Do the last few bytes. %rax contains the value to write. The loop is unrolled twice. */ - .p2align 4 + + /* Next 3 insns are 6 bytes total, make sure we decode them in one go */ + .p2align 3,,6 23: movb %al, (%rdx) /* 1st byte. */ testb %al, %al /* Is it NUL. */ diff --git a/libc/string/x86_64/strchr.S b/libc/string/x86_64/strchr.S index 8e59c4c19..9ef46b7f2 100644 --- a/libc/string/x86_64/strchr.S +++ b/libc/string/x86_64/strchr.S @@ -20,6 +20,7 @@ #include "_glibc_inc.h" +/* Seems to be unrolled too much */ .text ENTRY (BP_SYM (strchr)) @@ -91,7 +92,8 @@ ENTRY (BP_SYM (strchr)) each of whose bytes is C. This turns each byte that is C into a zero. */ - .p2align 4 + /* Next 3 insns are 10 bytes total, make sure we decode them in one go */ + .p2align 4,,10 4: /* Main Loop is unrolled 4 times. */ /* First unroll. */ @@ -229,8 +231,11 @@ ENTRY (BP_SYM (strchr)) reversed. */ - .p2align 4 /* Align, it's a jump target. */ -3: movq %r9,%rdx /* move to %rdx so that we can access bytes */ + /* Align, it's a jump target. */ + /* Next 3 insns are 9 bytes total, make sure we decode them in one go */ + .p2align 4,,9 +3: + movq %r9,%rdx /* move to %rdx so that we can access bytes */ subq $8,%rax /* correct pointer increment. */ testb %cl, %cl /* is first byte C? */ jz 6f /* yes => return pointer */ @@ -280,7 +285,7 @@ ENTRY (BP_SYM (strchr)) incq %rax 6: - nop + /* nop - huh?? */ retq END (BP_SYM (strchr)) diff --git a/libc/string/x86_64/strcpy.S b/libc/string/x86_64/strcpy.S index d9a51b0bb..612a30d1a 100644 --- a/libc/string/x86_64/strcpy.S +++ b/libc/string/x86_64/strcpy.S @@ -20,6 +20,8 @@ #include "_glibc_inc.h" +/* Seems to be unrolled too much */ + #ifndef USE_AS_STPCPY # define STRCPY strcpy #endif @@ -51,7 +53,9 @@ ENTRY (BP_SYM (STRCPY)) /* Now the sources is aligned. Unfortunatly we cannot force to have both source and destination aligned, so ignore the alignment of the destination. */ - .p2align 4 + + /* Next 3 insns are 10 bytes total, make sure we decode them in one go */ + .p2align 4,,10 1: /* 1st unroll. */ movq (%rsi), %rax /* Read double word (8 bytes). */ @@ -128,7 +132,9 @@ ENTRY (BP_SYM (STRCPY)) /* Do the last few bytes. %rax contains the value to write. The loop is unrolled twice. */ - .p2align 4 + + /* Next 3 insns are 6 bytes total, make sure we decode them in one go */ + .p2align 3,,6 3: /* Note that stpcpy needs to return with the value of the NUL byte. */ diff --git a/libc/string/x86_64/strcspn.S b/libc/string/x86_64/strcspn.S index fed12b5f6..fd9b09c48 100644 --- a/libc/string/x86_64/strcspn.S +++ b/libc/string/x86_64/strcspn.S @@ -25,6 +25,8 @@ #include "_glibc_inc.h" +/* Seems to be unrolled too much */ + /* BEWARE: `#ifdef strcspn' means that strcspn is redefined as `strpbrk' */ #define STRPBRK_P (defined strcspn) @@ -53,26 +55,28 @@ ENTRY (strcspn) Although all the following instruction only modify %cl we always have a correct zero-extended 64-bit value in %rcx. 
*/ - .p2align 4 + /* Next 3 insns are 6 bytes total, make sure we decode them in one go */ + .p2align 3,,6 + L(2): movb (%rax), %cl /* get byte from skipset */ testb %cl, %cl /* is NUL char? */ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ movb 1(%rax), %cl /* get byte from skipset */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ movb 2(%rax), %cl /* get byte from skipset */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ movb 3(%rax), %cl /* get byte from skipset */ addq $4, %rax /* increment skipset pointer */ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jnz L(2) /* no => process next dword from skipset */ L(1): leaq -4(%rdx), %rax /* prepare loop */ @@ -86,7 +90,13 @@ L(1): leaq -4(%rdx), %rax /* prepare loop */ value in the table. But the value of NUL is NUL so the loop terminates for NUL in every case. */ - .p2align 4 + /* Next 3 insns are 9 bytes total. */ + /* .p2align 4,,9 would make sure we decode them in one go, */ + /* but it will also align entire function to 16 bytes, */ + /* potentially creating largish padding at link time. */ + /* We are aligning to 8 bytes instead: */ + .p2align 3,,8 + L(3): addq $4, %rax /* adjust pointer for full loop round */ movb (%rax), %cl /* get byte from string */ diff --git a/libc/string/x86_64/strlen.S b/libc/string/x86_64/strlen.S index 0441dc46c..4213f0ab6 100644 --- a/libc/string/x86_64/strlen.S +++ b/libc/string/x86_64/strlen.S @@ -20,6 +20,7 @@ #include "_glibc_inc.h" +/* Seems to be unrolled too much */ .text ENTRY (strlen) @@ -39,8 +40,11 @@ ENTRY (strlen) 1: movq $0xfefefefefefefeff,%r8 /* Save magic. */ - .p2align 4 /* Align loop. */ -4: /* Main Loop is unrolled 4 times. */ + /* Align loop. */ + /* Next 3 insns are 10 bytes total, make sure we decode them in one go */ + .p2align 4,,10 +4: + /* Main Loop is unrolled 4 times. */ /* First unroll. */ movq (%rax), %rcx /* get double word (= 8 bytes) in question */ addq $8,%rax /* adjust pointer for next word */ @@ -97,8 +101,11 @@ ENTRY (strlen) the addition will not result in 0. */ jz 4b /* no NUL found => continue loop */ - .p2align 4 /* Align, it's a jump target. */ -3: subq $8,%rax /* correct pointer increment. */ + /* Align, it is a jump target. */ + /* Next 3 insns are 8 bytes total, make sure we decode them in one go */ + .p2align 3,,8 +3: + subq $8,%rax /* correct pointer increment. */ testb %cl, %cl /* is first byte NUL? */ jz 2f /* yes => return */ diff --git a/libc/string/x86_64/strspn.S b/libc/string/x86_64/strspn.S index c126abd2e..41cff0490 100644 --- a/libc/string/x86_64/strspn.S +++ b/libc/string/x86_64/strspn.S @@ -50,26 +50,28 @@ ENTRY (strspn) Although all the following instruction only modify %cl we always have a correct zero-extended 64-bit value in %rcx. */ - .p2align 4 -L(2): movb (%rax), %cl /* get byte from stopset */ + /* Next 3 insns are 6 bytes total, make sure we decode them in one go */ + .p2align 3,,6 +L(2): + movb (%rax), %cl /* get byte from stopset */ testb %cl, %cl /* is NUL char? 
*/ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ movb 1(%rax), %cl /* get byte from stopset */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ movb 2(%rax), %cl /* get byte from stopset */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ movb 3(%rax), %cl /* get byte from stopset */ addq $4, %rax /* increment stopset pointer */ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jnz L(2) /* no => process next dword from stopset */ L(1): leaq -4(%rdx), %rax /* prepare loop */ @@ -83,8 +85,14 @@ L(1): leaq -4(%rdx), %rax /* prepare loop */ value in the table. But the value of NUL is NUL so the loop terminates for NUL in every case. */ - .p2align 4 -L(3): addq $4, %rax /* adjust pointer for full loop round */ + /* Next 3 insns are 9 bytes total. */ + /* .p2align 4,,9 would make sure we decode them in one go, */ + /* but it will also align entire function to 16 bytes, */ + /* potentially creating largish padding at link time. */ + /* We are aligning to 8 bytes instead: */ + .p2align 3,,8 +L(3): + addq $4, %rax /* adjust pointer for full loop round */ movb (%rax), %cl /* get byte from string */ testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
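A note on the two idioms recurring in the x86_64 hunks above. `.p2align 4,,11` differs from a bare `.p2align 4` in its third operand: the first operand is the log2 of the alignment (16 bytes), the empty second operand means the default fill (no-ops in a text section), and the third caps the padding, so the alignment is skipped entirely when it would cost more bytes than stated. Likewise `testb %cl, %cl` replaces `testb $0xff, %cl`: TEST sets flags from the AND of its operands, and since `cl & cl == cl & 0xff == cl`, both set ZF exactly when %cl is zero, but the register-register form encodes one byte shorter. A minimal standalone sketch of the alignment idiom, assuming GNU as on x86_64 ELF (the `pad_demo` label and loop body are illustrative, not taken from the patch):

        .text
        .globl  pad_demo
        .type   pad_demo, %function
pad_demo:
        xorl    %eax, %eax
        /* Align the loop head to 16 bytes only if that costs at most
           11 bytes of padding; a bare .p2align 4 would pad
           unconditionally, possibly bloating the function. */
        .p2align 4,,11
1:
        incq    %rax            /* stand-in loop body */
        cmpq    $10, %rax
        jne     1b
        retq
        .size   pad_demo, .-pad_demo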