Diffstat (limited to 'libc')
70 files changed, 1080 insertions, 658 deletions
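Note before the per-file hunks: the ARM changes below all pull in a new <bits/arm_asm.h> header, whose own hunk is not shown in this excerpt, and use its IT() and THUMB1_ONLY macros. Judging only from how those macros are used, a plausible reconstruction of what the header must provide (hypothetical, for orientation; the real header may differ):

/* Hypothetical sketch of bits/arm_asm.h -- not shown in this diff.
 * Thumb-2 requires an it/itt/ittt/itttt instruction in front of
 * conditionally executed instructions, while ARM mode predicates each
 * instruction directly, so IT() expands to something only for Thumb-2. */
#if defined(__thumb2__)
# define IT(t, cond) i##t cond
#else
# define IT(t, cond) /* ARM mode: conditional suffixes need no IT block */
#endif

/* Cores limited to the 16-bit Thumb-1 instruction set cannot use the
 * predicated 32-bit sequences at all, hence the byte-loop fallbacks. */
#if defined(__thumb__) && !defined(__thumb2__)
# define THUMB1_ONLY 1
#endif

Read this way, IT(tt, eq) in the hunks below assembles as "itt eq" for Thumb-2 and disappears entirely for classic ARM.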
diff --git a/libc/string/arm/_memcpy.S b/libc/string/arm/_memcpy.S index 3704f96b5..103580a0c 100644 --- a/libc/string/arm/_memcpy.S +++ b/libc/string/arm/_memcpy.S @@ -39,7 +39,9 @@  #include <features.h>  #include <endian.h> +#include <bits/arm_asm.h> +#if !defined(THUMB1_ONLY)  /*   * This is one fun bit of code ...   * Some easy listening music is suggested while trying to understand this @@ -77,12 +79,36 @@  .type _memcpy,%function  .align 4 +/* XXX: The Thumb-2 conditionals can be removed if/when we require an +   assembler that supports unified syntax.  */ +.macro copy regs +#if defined(__thumb2__) +	ittt	ge +	ldmiage	r1!, \regs +	stmiage	r0!, \regs +#else +	ldmgeia	r1!, \regs +	stmgeia	r0!, \regs +#endif +.endm + +.macro copydb regs +#if defined(__thumb2__) +	ittt	ge +	ldmdbge	r1!, \regs +	stmdbge	r0!, \regs +#else +	ldmgedb	r1!, \regs +	stmgedb	r0!, \regs +#endif +.endm +  _memcpy:  	/* Determine copy direction */  	cmp	r1, r0  	bcc	.Lmemcpy_backwards -	moveq	r0, #0			/* Quick abort for len=0 */ +	IT(tt, eq)			/* Quick abort for src=dst */  #if defined(__USE_BX__)          bxeq    lr  #else @@ -102,7 +128,7 @@ _memcpy:  	blt	.Lmemcpy_fl12		/* less than 12 bytes (4 from above) */  	subs	r2, r2, #0x14           	blt	.Lmemcpy_fl32		/* less than 32 bytes (12 from above) */ -	stmdb	sp!, {r4}		/* borrow r4 */ +	str	r4, [sp, #-4]!		/* borrow r4 */  	/* blat 32 bytes at a time */  	/* XXX for really big copies perhaps we should use more registers */ @@ -115,19 +141,22 @@ _memcpy:  	bge	.Lmemcpy_floop32  	cmn	r2, #0x10 -	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */ -	stmgeia	r0!, {r3, r4, r12, lr} +	/* blat a remaining 16 bytes */ +	copy	"{r3, r4, r12, lr}"  	subge	r2, r2, #0x10          -	ldmia	sp!, {r4}		/* return r4 */ +	ldr	r4, [sp], #4		/* restore r4 */  .Lmemcpy_fl32:  	adds	r2, r2, #0x14           	/* blat 12 bytes at a time */  .Lmemcpy_floop12: -	ldmgeia	r1!, {r3, r12, lr} -	stmgeia	r0!, {r3, r12, lr} +	copy	"{r3, r12, lr}" +#if defined(__thumb2__) +	subsge	r2, r2, #0x0c          +#else  	subges	r2, r2, #0x0c          +#endif  	bge	.Lmemcpy_floop12  .Lmemcpy_fl12: @@ -135,26 +164,48 @@ _memcpy:  	blt	.Lmemcpy_fl4  	subs	r2, r2, #4 +	IT(tt, lt)  	ldrlt	r3, [r1], #4  	strlt	r3, [r0], #4 -	ldmgeia	r1!, {r3, r12} -	stmgeia	r0!, {r3, r12} +	copy	"{r3, r12}"  	subge	r2, r2, #4  .Lmemcpy_fl4:  	/* less than 4 bytes to go */  	adds	r2, r2, #4 +#if defined(__thumb2__) +	it	eq +	popeq	{r0, pc}		/* done */ +#elif defined(__ARM_ARCH_4T__) +	ldmeqia	sp!, {r0, r3}		/* done */ +	bxeq	r3 +#else  	ldmeqia	sp!, {r0, pc}		/* done */ +#endif  	/* copy the crud byte at a time */  	cmp	r2, #2  	ldrb	r3, [r1], #1  	strb	r3, [r0], #1 +#if defined(__thumb2__) +	itt	ge +	ldrbge	r3, [r1], #1 +	strbge	r3, [r0], #1 +	itt	gt +	ldrbgt	r3, [r1], #1 +	strbgt	r3, [r0], #1 +#else  	ldrgeb	r3, [r1], #1  	strgeb	r3, [r0], #1  	ldrgtb	r3, [r1], #1  	strgtb	r3, [r0], #1 +#endif +#if defined(__ARM_ARCH_4T__) +	ldmia	sp!, {r0, r3} +	bx	r3 +#else  	ldmia	sp!, {r0, pc} +#endif  	/* erg - unaligned destination */  .Lmemcpy_fdestul: @@ -164,10 +215,19 @@ _memcpy:  	/* align destination with byte copies */  	ldrb	r3, [r1], #1  	strb	r3, [r0], #1 +#if defined(__thumb2__) +	itt	ge +	ldrbge	r3, [r1], #1 +	strbge	r3, [r0], #1 +	itt	gt +	ldrbgt	r3, [r1], #1 +	strbgt	r3, [r0], #1 +#else  	ldrgeb	r3, [r1], #1  	strgeb	r3, [r0], #1  	ldrgtb	r3, [r1], #1  	strgtb	r3, [r0], #1 +#endif  	subs	r2, r2, r12  	blt	.Lmemcpy_fl4		/* less the 4 bytes */ @@ -370,12 +430,12 @@ _memcpy:  .Lmemcpy_bl32:  	cmn	r2, #0x10            
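For readers tracing the forward path above (.Lmemcpy_f*; the backwards path continuing below mirrors it in reverse), the shape is: byte-copy until the destination is word-aligned, "blat" 32-byte batches with load/store-multiple, then drain the 16-, 12- and 4-byte leftovers and finish byte-wise. A loose C sketch of that control flow, under the simplifying assumption that source and destination share alignment (the real code branches to separate shift-and-merge paths when they do not):

#include <stddef.h>
#include <stdint.h>

/* Sketch only: assumes src and dst become word-aligned together. */
static void *sketch_memcpy_forward(void *dst0, const void *src0, size_t n)
{
	unsigned char *d = dst0;
	const unsigned char *s = src0;

	while (n > 0 && ((uintptr_t)d & 3)) {	/* align the destination */
		*d++ = *s++;
		n--;
	}

	uint32_t *dw = (uint32_t *)d;
	const uint32_t *sw = (const uint32_t *)s;
	while (n >= 32) {			/* "blat 32 bytes at a time" */
		dw[0] = sw[0]; dw[1] = sw[1]; dw[2] = sw[2]; dw[3] = sw[3];
		dw[4] = sw[4]; dw[5] = sw[5]; dw[6] = sw[6]; dw[7] = sw[7];
		dw += 8; sw += 8; n -= 32;
	}
	while (n >= 4) {			/* 16-, 12- and 4-byte tails */
		*dw++ = *sw++;
		n -= 4;
	}

	d = (unsigned char *)dw;
	s = (const unsigned char *)sw;
	while (n--)				/* "copy the crud byte at a time" */
		*d++ = *s++;
	return dst0;
}

The conditional ldm/stm pairs, now wrapped in the copy/copydb macros, are what let the assembly do those batch moves without branches.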
 -	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */ -	stmgedb	r0!, {r3, r4, r12, lr} +	/* blat a remaining 16 bytes */ +	copydb	"{r3, r4, r12, lr}"  	subge	r2, r2, #0x10           	adds	r2, r2, #0x14          -	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */ -	stmgedb	r0!, {r3, r12, lr} +	/* blat a remaining 12 bytes */ +	copydb	"{r3, r12, lr}"  	subge	r2, r2, #0x0c           	ldmia	sp!, {r4, lr} @@ -383,15 +443,16 @@ _memcpy:  	adds	r2, r2, #8  	blt	.Lmemcpy_bl4  	subs	r2, r2, #4 +	IT(tt, lt)  	ldrlt	r3, [r1, #-4]!  	strlt	r3, [r0, #-4]! -	ldmgedb	r1!, {r3, r12} -	stmgedb	r0!, {r3, r12} +	copydb	"{r3, r12}"  	subge	r2, r2, #4  .Lmemcpy_bl4:  	/* less than 4 bytes to go */  	adds	r2, r2, #4 +	IT(t, eq)  #if defined(__USE_BX__)          bxeq    lr  #else @@ -401,10 +462,19 @@ _memcpy:  	cmp	r2, #2  	ldrb	r3, [r1, #-1]!  	strb	r3, [r0, #-1]! +#ifdef __thumb2__ +	itt	ge +	ldrbge	r3, [r1, #-1]! +	strbge	r3, [r0, #-1]! +	itt	gt +	ldrbgt	r3, [r1, #-1]! +	strbgt	r3, [r0, #-1]! +#else  	ldrgeb	r3, [r1, #-1]!  	strgeb	r3, [r0, #-1]!  	ldrgtb	r3, [r1, #-1]!  	strgtb	r3, [r0, #-1]! +#endif  #if defined(__USE_BX__)          bx      lr  #else @@ -417,10 +487,19 @@ _memcpy:  	/* align destination with byte copies */  	ldrb	r3, [r1, #-1]!  	strb	r3, [r0, #-1]! +#ifdef __thumb2__ +	itt	ge +	ldrbge	r3, [r1, #-1]! +	strbge	r3, [r0, #-1]! +	itt	gt +	ldrbgt	r3, [r1, #-1]! +	strbgt	r3, [r0, #-1]! +#else  	ldrgeb	r3, [r1, #-1]!  	strgeb	r3, [r0, #-1]!  	ldrgtb	r3, [r1, #-1]!  	strgtb	r3, [r0, #-1]! +#endif  	subs	r2, r2, r12  	blt	.Lmemcpy_bl4		/* less than 4 bytes to go */  	ands	r12, r1, #3 @@ -591,3 +670,77 @@ _memcpy:  .Lmemcpy_bsrcul1l4:  	add	r1, r1, #1  	b	.Lmemcpy_bl4 + +#else /* THUMB1_ONLY */ + +/* This is a fairly dumb implementation for when we can't use the 32-bit code +   above.  */ +.text +.global _memcpy +.hidden _memcpy +.type _memcpy,%function +.align 4 +.thumb +_memcpy: +	push	{r0, r4} +	cmp	r2, #0 +	beq	.Lmemcpy_exit +	@ See if we have overlapping regions, and need to reverse the +	@ direction of the copy +	cmp	r0, r1 +	bls	.Lmemcpy_forwards +	add	r4, r1, r2 +	cmp	r0, r4 +	bcc	.Lmemcpy_backwards +.Lmemcpy_forwards: +	/* Forwards.  */ +	mov	r3, r0 +	eor	r3, r1 +	mov	r4, #3 +	tst	r3, r4 +	bne	.Lmemcpy_funaligned +	cmp	r2, #8 +	bcc	.Lmemcpy_funaligned +1:	@ copy up to the first word boundary. +	tst	r0, r4 +	beq	1f +	ldrb	r3, [r1] +	add	r1, r1, #1 +	strb	r3, [r0] +	add	r0, r0, #1 +	sub	r2, r2, #1 +	b	1b +1:	@ Copy aligned words +	ldr	r3, [r1] +	add	r1, r1, #4 +	str	r3, [r0] +	add	r0, r0, #4 +	sub	r2, r2, #4 +	cmp	r2, #4 +	bcs	1b +	cmp	r2, #0 +	beq	.Lmemcpy_exit +.Lmemcpy_funaligned: +1: +	ldrb	r3, [r1] +	add	r1, r1, #1 +	strb	r3, [r0] +	add	r0, r0, #1 +	sub	r2, r2, #1 +	bne	1b +.Lmemcpy_exit: +	pop	{r0, r4} +	bx	lr + +.Lmemcpy_backwards: +	add	r0, r0, r2 +	add	r1, r1, r2 +1: +	sub	r0, r0, #1 +	sub	r1, r1, #1 +	ldrb	r3, [r1] +	strb	r3, [r0] +	sub	r2, r2, #1 +	bne	1b +	b	.Lmemcpy_exit +#endif diff --git a/libc/string/arm/bcopy.S b/libc/string/arm/bcopy.S index db3c9e6c1..2d6e90d13 100644 --- a/libc/string/arm/bcopy.S +++ b/libc/string/arm/bcopy.S @@ -40,6 +40,7 @@  /* bcopy = memcpy/memmove with arguments reversed. 
*/  #include <features.h> +#include <bits/arm_asm.h>  #ifdef __UCLIBC_SUSV3_LEGACY__ @@ -48,12 +49,23 @@  .type bcopy,%function  .align 4 +#if defined(__thumb__) && !defined(__thumb2__) +.thumb_func +bcopy: +	push	{r2, lr} +	mov	ip, r0 +	mov	r0, r1 +	mov	r1, ip +	bl	_memcpy +	POP_RET +#else  bcopy:  	/* switch the source and destination registers */  	eor     r0, r1, r0   	eor     r1, r0, r1   	eor     r0, r1, r0   	b	_memcpy /* (PLT) */ +#endif  .size bcopy,.-bcopy diff --git a/libc/string/arm/bzero.S b/libc/string/arm/bzero.S index ee49cf560..e576a12e9 100644 --- a/libc/string/arm/bzero.S +++ b/libc/string/arm/bzero.S @@ -38,6 +38,7 @@   */  #include <features.h> +#include <bits/arm_asm.h>  #ifdef __UCLIBC_SUSV3_LEGACY__ @@ -46,10 +47,21 @@  .type bzero,%function  .align 4 +#if defined(__thumb__) && !defined(__thumb2__) +.thumb_func +bzero: +	push	{r2, lr} +	mov	r2, r1 +	mov	r1, #0 +	bl	HIDDEN_JUMPTARGET(memset) +	POP_RET +#else +  bzero:  	mov	r2, r1  	mov	r1, #0  	b	HIDDEN_JUMPTARGET(memset) +#endif  .size bzero,.-bzero diff --git a/libc/string/arm/memcmp.S b/libc/string/arm/memcmp.S index 4f78b5128..65409f43a 100644 --- a/libc/string/arm/memcmp.S +++ b/libc/string/arm/memcmp.S @@ -30,15 +30,41 @@   */  #include <features.h> +#include <bits/arm_asm.h>  .text  .global memcmp  .type memcmp,%function  .align 4 +#if defined(THUMB1_ONLY) +.thumb_func +memcmp: +	cmp	r2, #0 +	bne	1f +	mov	r0, #0 +	bx	lr +1: +	push	{r4} +	add	r4, r0, r2 +2: +	ldrb	r2, [r0] +	add	r0, r0, #1 +	ldrb	r3, [r1] +	add	r1, r1, #1 +	cmp	r4, r0 +	beq	3f +	cmp	r2, r3 +	beq	2b +3: +	sub	r0, r2, r3 +        pop	{r4} +	bx	lr +#else  memcmp:  	/* if ((len - 1) < 0) return 0 */  	subs	r2, r2, #1 +	IT(tt, mi)  	movmi	r0, #0  #if defined(__USE_BX__)          bxmi    lr @@ -51,6 +77,7 @@ memcmp:  	ldrb	r2, [r0], #1  	ldrb	r3, [r1], #1  	cmp	ip, r0 +	IT(t, cs)  	cmpcs	r2, r3  	beq	1b  	sub	r0, r2, r3 @@ -59,6 +86,7 @@ memcmp:  #else   	mov	pc, lr  #endif +#endif  .size memcmp,.-memcmp diff --git a/libc/string/arm/memcpy.S b/libc/string/arm/memcpy.S index 7a5b6ab76..d2013d211 100644 --- a/libc/string/arm/memcpy.S +++ b/libc/string/arm/memcpy.S @@ -38,16 +38,23 @@   */  #include <features.h> +#include <bits/arm_asm.h>  .text  .global memcpy  .type memcpy,%function  .align 4 +#if defined(__thumb__) && !defined(__thumb2__) +.thumb_func  memcpy: -	stmfd	sp!, {r0, lr} +	push	{r0, lr}  	bl	_memcpy -	ldmfd	sp!, {r0, pc} +	POP_RET +#else +memcpy: +	b	_memcpy +#endif  .size memcpy,.-memcpy diff --git a/libc/string/arm/memmove.S b/libc/string/arm/memmove.S index 45cd9b4d4..c11b98dd4 100644 --- a/libc/string/arm/memmove.S +++ b/libc/string/arm/memmove.S @@ -38,16 +38,23 @@   */  #include <features.h> +#include <bits/arm_asm.h>  .text  .global memmove  .type memmove,%function  .align 4 +#if defined(__thumb__) && !defined(__thumb2__) +.thumb_func  memmove: -	stmfd	sp!, {r0, lr} +	push	{r2, lr}  	bl	_memcpy -	ldmfd	sp!, {r0, pc} +	POP_RET +#else +memmove: +	b	_memcpy +#endif  .size memmove,.-memmove diff --git a/libc/string/arm/memset.S b/libc/string/arm/memset.S index 16bfe0dc5..66aa6039c 100644 --- a/libc/string/arm/memset.S +++ b/libc/string/arm/memset.S @@ -19,12 +19,52 @@  #include <features.h>  #include <sys/syscall.h> +#include <bits/arm_asm.h>  .text  .global memset  .type memset,%function  .align 4 +#if defined(THUMB1_ONLY) +.thumb_func +memset: +	mov	ip, r0 +	cmp	r2, #8		@ at least 8 bytes to do? 
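The Thumb-1 memset beginning here first widens the fill byte into a full word, so the aligned middle of the loop can store four bytes per instruction; the two lsl/orr pairs just below correspond to this C sketch (hypothetical helper name):

#include <stdint.h>

/* Replicate the low byte of c into all four byte lanes of a word. */
static uint32_t splat_fill_byte(uint32_t c)
{
	c &= 0xff;
	c |= c << 8;	/* lsl r3, r1, #8  ; orr r1, r3 */
	c |= c << 16;	/* lsl r3, r1, #16 ; orr r1, r3 */
	return c;
}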
+	bcc	2f + +	lsl	r3, r1, #8 +	orr	r1, r3 +	lsl	r3, r1, #16 +	orr	r1, r3 + +	mov	r3, #3 +1:	@ Fill up to the first word boundary +	tst	r0, r3 +	beq	1f +	strb	r1, [r0] +	add	r0, r0, #1 +	sub	r2, r2, #1 +	b	1b +1:	@ Fill aligned words +	str	r1, [r0] +	add	r0, r0, #4 +	sub	r2, r2, #4 +	cmp	r2, #4 +	bcs	1b + +2:	@ Fill the remaining bytes +	cmp	r2, #0 +	beq	2f +1: +	strb	r1, [r0] +	add	r0, r0, #1 +	sub	r2, r2, #1 +	bne	1b +2: +	mov	r0, ip +	bx lr +#else  memset:  	mov	a4, a1  	cmp	a3, $8		@ at least 8 bytes to do? @@ -33,8 +73,14 @@ memset:  	orr	a2, a2, a2, lsl $16  1:  	tst	a4, $3		@ aligned yet? +#if defined(__thumb2__) +	itt	ne +	strbne	a2, [a4], $1 +	subne	a3, a3, $1 +#else  	strneb	a2, [a4], $1  	subne	a3, a3, $1 +#endif  	bne	1b  	mov	ip, a2  1: @@ -51,16 +97,30 @@ memset:  	stmia	a4!, {a2, ip}  	sub	a3, a3, $8  	cmp	a3, $8		@ 8 bytes still to do? +#if defined(__thumb2__) +	itt	ge +	stmiage	a4!, {a2, ip} +	subge	a3, a3, $8 +#else  	stmgeia	a4!, {a2, ip}  	subge	a3, a3, $8 +#endif  	bge	1b  2:  	movs	a3, a3		@ anything left? +	IT(t, eq)  #if defined(__USE_BX__)          bxeq    lr  #else          moveq	pc, lr		@ nope  #endif +#if defined (__thumb2__) +1: +	strb	a2, [a4], #1 +	subs	a3, a3, #1 +	bne	1b +	bx	lr +#else  	rsb	a3, a3, $7  	add	pc, pc, a3, lsl $2  	mov	r0, r0 @@ -76,6 +136,8 @@ memset:  #else   	mov	pc, lr  #endif +#endif +#endif  .size memset,.-memset diff --git a/libc/string/arm/strcmp.S b/libc/string/arm/strcmp.S index 89aa38874..97363c1c2 100644 --- a/libc/string/arm/strcmp.S +++ b/libc/string/arm/strcmp.S @@ -30,17 +30,35 @@   */  #include <features.h> +#include <bits/arm_asm.h>  .text  .global strcmp  .type strcmp,%function  .align 4 +#if defined(__thumb__) && !defined(__thumb2__) +.thumb_func +strcmp: +1: +	ldrb	r2, [r0] +	add	r0, r0, #1 +	ldrb	r3, [r1] +	add	r1, r1, #1 +	cmp	r2, #0 +	beq	2f +	cmp	r2, r3 +	beq	1b +2: +	sub	r0, r2, r3 +	bx      lr +#else  strcmp:  1:  	ldrb	r2, [r0], #1  	ldrb	r3, [r1], #1  	cmp	r2, #1 +	IT(t, cs)  	cmpcs	r2, r3  	beq	1b  	sub	r0, r2, r3 @@ -49,6 +67,7 @@ strcmp:  #else    	mov	pc, lr  #endif +#endif  .size strcmp,.-strcmp diff --git a/libc/string/arm/strlen.S b/libc/string/arm/strlen.S index 5b4b02e17..949e918f4 100644 --- a/libc/string/arm/strlen.S +++ b/libc/string/arm/strlen.S @@ -20,6 +20,7 @@  #include <features.h>  #include <endian.h>  #include <sys/syscall.h> +#include <bits/arm_asm.h>  /* size_t strlen(const char *S)   * entry: r0 -> string @@ -31,6 +32,19 @@  .type strlen,%function  .align 4 +#if defined(THUMB1_ONLY) +/* A simple implementation for when the ARM implementation can't be used.  */ +.thumb_func +strlen: +	mov r2, #0 +1: +	ldrb	r1, [r0, r2] +	add	r2, r2, #1 +	cmp	r1, #0 +	bne	1b +	sub	r0, r2, #1 +	bx lr +#else  strlen:  	bic     r1, r0, $3              @ addr of word containing first byte  	ldr     r2, [r1], $4            @ get the first word @@ -41,38 +55,48 @@ strlen:  #if __BYTE_ORDER == __BIG_ENDIAN  	orr     r2, r2, $0xff000000     @ set this byte to non-zero  	subs    r3, r3, $1              @ any more to do? +	IT(t, gt)  	orrgt   r2, r2, $0x00ff0000     @ if so, set this byte  	subs    r3, r3, $1              @ more? +	IT(t, gt)  	orrgt   r2, r2, $0x0000ff00     @ then set.  #else  	orr     r2, r2, $0x000000ff     @ set this byte to non-zero  	subs    r3, r3, $1              @ any more to do? +	IT(t, gt)  	orrgt   r2, r2, $0x0000ff00     @ if so, set this byte  	subs    r3, r3, $1              @ more? +	IT(t, gt)  	orrgt   r2, r2, $0x00ff0000     @ then set.  #endif  Laligned:				@ here, we have a word in r2.  
Does it  	tst     r2, $0x000000ff         @ contain any zeroes? +	IT(tttt, ne)  	tstne   r2, $0x0000ff00         @  	tstne   r2, $0x00ff0000         @  	tstne   r2, $0xff000000         @  	addne   r0, r0, $4              @ if not, the string is 4 bytes longer +	IT(t, ne)  	ldrne   r2, [r1], $4            @ and we continue to the next word  	bne     Laligned                @  Llastword:				@ drop through to here once we find a  #if __BYTE_ORDER == __BIG_ENDIAN  	tst     r2, $0xff000000         @ word that has a zero byte in it +	IT(tttt, ne)  	addne   r0, r0, $1              @  	tstne   r2, $0x00ff0000         @ and add up to 3 bytes on to it  	addne   r0, r0, $1              @  	tstne   r2, $0x0000ff00         @ (if first three all non-zero, 4th +	IT(t, ne)  	addne   r0, r0, $1              @  must be zero)  #else  	tst     r2, $0x000000ff         @ +	IT(tttt, ne)  	addne   r0, r0, $1              @  	tstne   r2, $0x0000ff00         @ and add up to 3 bytes on to it  	addne   r0, r0, $1              @  	tstne   r2, $0x00ff0000         @ (if first three all non-zero, 4th +	IT(t, ne)  	addne   r0, r0, $1              @  must be zero)  #endif  #if defined(__USE_BX__) @@ -80,6 +104,7 @@ Llastword:				@ drop through to here once we find a  #else    	mov	pc,lr  #endif +#endif  .size strlen,.-strlen diff --git a/libc/string/arm/strncmp.S b/libc/string/arm/strncmp.S index eaf0620b4..8487639c8 100644 --- a/libc/string/arm/strncmp.S +++ b/libc/string/arm/strncmp.S @@ -30,15 +30,46 @@   */  #include <features.h> +#include <bits/arm_asm.h>  .text  .global strncmp  .type strncmp,%function  .align 4 +#if defined(THUMB1_ONLY) +.thumb_func  strncmp:  	/* if (len == 0) return 0 */  	cmp	r2, #0 +	bne	1f +	mov	r0, #0 +	bx	lr +1: +	push	{r4} + +	/* ip == last src address to compare */ +	add	r4, r0, r2 +2: +	cmp	r4, r0 +	beq	3f +	ldrb	r2, [r0] +	add	r0, r0, #1 +	ldrb	r3, [r1] +	add	r1, r1, #1 +	cmp	r2, #0 +	beq	3f +	cmp	r2, r3 +	beq	2b +3: +	sub	r0, r2, r3 +	pop	{r4} +	bx	lr +#else +strncmp: +	/* if (len == 0) return 0 */ +	cmp	r2, #0 +	IT(tt, eq)  	moveq	r0, #0  #if defined(__USE_BX__)          bxeq    lr @@ -53,6 +84,7 @@ strncmp:  	ldrb	r2, [r0], #1  	ldrb	r3, [r1], #1  	cmp	ip, r0 +	IT(tt, cs)  	cmpcs	r2, #1  	cmpcs	r2, r3  	beq	1b @@ -62,6 +94,7 @@ strncmp:  #else    	mov	pc, lr  #endif +#endif  .size strncmp,.-strncmp diff --git a/libc/string/avr32/Makefile b/libc/string/avr32/Makefile index 0002ffdce..e19e9d9ec 100644 --- a/libc/string/avr32/Makefile +++ b/libc/string/avr32/Makefile @@ -16,8 +16,8 @@  # along with this program; if not, write to the Free Software Foundation, Inc.,  # 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -top_srcdir     := ../../../ -top_builddir   := ../../../ +top_srcdir	:= ../../../ +top_builddir	:= ../../../  all: objs diff --git a/libc/string/avr32/bcopy.S b/libc/string/avr32/bcopy.S index e1d173165..bdd521814 100644 --- a/libc/string/avr32/bcopy.S +++ b/libc/string/avr32/bcopy.S @@ -10,17 +10,17 @@  #ifdef __UCLIBC_SUSV3_LEGACY__ -       .text -       .global bcopy -       .type   bcopy, @function -       .align  1 +	.text +	.global bcopy +	.type	bcopy, @function +	.align	1  bcopy: -       /* Swap the first two arguments */ -       eor     r11, r12 -       eor     r12, r11 -       eor     r11, r12 -       rjmp    HIDDEN_JUMPTARGET(memmove) +	/* Swap the first two arguments */ +	eor	r11, r12 +	eor	r12, r11 +	eor	r11, r12 +	rjmp	HIDDEN_JUMPTARGET(memmove) -       .size   bcopy, . - bcopy +	.size	bcopy, . 
- bcopy  #endif /* __UCLIBC_SUSV3_LEGACY__ */ diff --git a/libc/string/avr32/bzero.S b/libc/string/avr32/bzero.S index 928148dcb..ca1bd2dd2 100644 --- a/libc/string/avr32/bzero.S +++ b/libc/string/avr32/bzero.S @@ -10,15 +10,15 @@  #ifdef __UCLIBC_SUSV3_LEGACY__ -       .text -       .global bzero -       .type   bzero, @function -       .align  1 +	.text +	.global bzero +	.type	bzero, @function +	.align	1  bzero: -       mov     r10, r11 -       mov     r11, 0 -       rjmp    HIDDEN_JUMPTARGET(memset) +	mov	r10, r11 +	mov	r11, 0 +	rjmp	HIDDEN_JUMPTARGET(memset) -       .size   bzero, . - bzero +	.size	bzero, . - bzero  #endif /* __UCLIBC_SUSV3_LEGACY__ */ diff --git a/libc/string/avr32/memcmp.S b/libc/string/avr32/memcmp.S index 5d7eac3d9..ae6cc9189 100644 --- a/libc/string/avr32/memcmp.S +++ b/libc/string/avr32/memcmp.S @@ -12,48 +12,48 @@  #define s2 r11  #define len r10 -       .text -       .global memcmp -       .type   memcmp, @function -       .align  1 +	.text +	.global memcmp +	.type	memcmp, @function +	.align	1  memcmp: -       sub     len, 4 -       brlt    .Lless_than_4 +	sub	len, 4 +	brlt	.Lless_than_4 -1:     ld.w    r8, s1++ -       ld.w    r9, s2++ -       cp.w    r8, r9 -       brne    .Lfound_word -       sub     len, 4 -       brge    1b +1:	ld.w	r8, s1++ +	ld.w	r9, s2++ +	cp.w	r8, r9 +	brne	.Lfound_word +	sub	len, 4 +	brge	1b  .Lless_than_4: -       sub     len, -4 -       reteq   0 +	sub	len, -4 +	reteq	0 -1:     ld.ub   r8, s1++ -       ld.ub   r9, s2++ -       sub     r8, r9 -       retne   r8 -       sub     len, 1 -       brgt    1b +1:	ld.ub	r8, s1++ +	ld.ub	r9, s2++ +	sub	r8, r9 +	retne	r8 +	sub	len, 1 +	brgt	1b -       retal   0 +	retal	0  .Lfound_word: -       mov     len, 4 - -2:     bfextu  r11, r9, 24, 8 -       bfextu  r12, r8, 24, 8 -       sub     r12, r11 -       retne   r12 -       lsl     r8, 8 -       lsl     r9, 8 -       sub     len, 1 -       brne    2b -       retal   r12 - -       .size   memcmp, . - memcmp +	mov	len, 4 + +2:	bfextu	r11, r9, 24, 8 +	bfextu	r12, r8, 24, 8 +	sub	r12, r11 +	retne	r12 +	lsl	r8, 8 +	lsl	r9, 8 +	sub	len, 1 +	brne	2b +	retal	r12 + +	.size	memcmp, . 
- memcmp  libc_hidden_def(memcmp)  #ifdef __UCLIBC_SUSV3_LEGACY__ diff --git a/libc/string/avr32/memcpy.S b/libc/string/avr32/memcpy.S index f95aabd13..bf091abf8 100644 --- a/libc/string/avr32/memcpy.S +++ b/libc/string/avr32/memcpy.S @@ -11,101 +11,101 @@  #define src r11  #define len r10 -       .text -       .global memcpy -       .type   memcpy, @function +	.text +	.global	memcpy +	.type	memcpy, @function  memcpy: -       pref    src[0] -       mov     dst, r12 +	pref	src[0] +	mov	dst, r12 -       /* If we have less than 32 bytes, don't do anything fancy */ -       cp.w    len, 32 -       brge    .Lmore_than_31 +	/* If we have less than 32 bytes, don't do anything fancy */ +	cp.w	len, 32 +	brge	.Lmore_than_31 -       sub     len, 1 -       retlt   r12 -1:     ld.ub   r8, src++ -       st.b    dst++, r8 -       sub     len, 1 -       brge    1b -       retal   r12 +	sub	len, 1 +	retlt	r12 +1:	ld.ub	r8, src++ +	st.b	dst++, r8 +	sub	len, 1 +	brge	1b +	retal	r12  .Lmore_than_31: -       pushm   r0-r7, lr +	pushm	r0-r7, lr -       /* Check alignment */ -       mov     r8, src -       andl    r8, 31, COH -       brne    .Lunaligned_src -       mov     r8, dst -       andl    r8, 3, COH -       brne    .Lunaligned_dst +	/* Check alignment */ +	mov	r8, src +	andl	r8, 31, COH +	brne	.Lunaligned_src +	mov	r8, dst +	andl	r8, 3, COH +	brne	.Lunaligned_dst  .Laligned_copy: -       sub     len, 32 -       brlt    .Lless_than_32 +	sub	len, 32 +	brlt	.Lless_than_32 -1:     /* Copy 32 bytes at a time */ -       ldm     src, r0-r7 -       sub     src, -32 -       stm     dst, r0-r7 -       sub     dst, -32 -       sub     len, 32 -       brge    1b +1:	/* Copy 32 bytes at a time */ +	ldm	src, r0-r7 +	sub	src, -32 +	stm	dst, r0-r7 +	sub	dst, -32 +	sub	len, 32 +	brge	1b  .Lless_than_32: -       /* Copy 16 more bytes if possible */ -       sub     len, -16 -       brlt    .Lless_than_16 -       ldm     src, r0-r3 -       sub     src, -16 -       sub     len, 16 -       stm     dst, r0-r3 -       sub     dst, -16 +	/* Copy 16 more bytes if possible */ +	sub	len, -16 +	brlt	.Lless_than_16 +	ldm	src, r0-r3 +	sub	src, -16 +	sub	len, 16 +	stm	dst, r0-r3 +	sub	dst, -16  .Lless_than_16: -       /* Do the remaining as byte copies */ -       neg     len -       add     pc, pc, len << 2 -       .rept   15 -       ld.ub   r0, src++ -       st.b    dst++, r0 -       .endr +	/* Do the remaining as byte copies */ +	neg	len +	add	pc, pc, len << 2 +	.rept	15 +	ld.ub	r0, src++ +	st.b	dst++, r0 +	.endr -       popm    r0-r7, pc +	popm	r0-r7, pc  .Lunaligned_src: -       /* Make src cacheline-aligned. r8 = (src & 31) */ -       rsub    r8, r8, 32 -       sub     len, r8 -1:     ld.ub   r0, src++ -       st.b    dst++, r0 -       sub     r8, 1 -       brne    1b - -       /* If dst is word-aligned, we're ready to go */ -       pref    src[0] -       mov     r8, 3 -       tst     dst, r8 -       breq    .Laligned_copy +	/* Make src cacheline-aligned. r8 = (src & 31) */ +	rsub	r8, r8, 32 +	sub	len, r8 +1:	ld.ub	r0, src++ +	st.b	dst++, r0 +	sub	r8, 1 +	brne	1b + +	/* If dst is word-aligned, we're ready to go */ +	pref	src[0] +	mov	r8, 3 +	tst	dst, r8 +	breq	.Laligned_copy  .Lunaligned_dst: -       /* src is aligned, but dst is not. 
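A note on the .rept tails in this avr32 memcpy ("neg len; add pc, pc, len << 2" followed by fifteen single-byte copies, a few lines up): that is a computed jump into an unrolled run, the same idea as Duff's device: land far enough into the sequence that exactly the leftover bytes are copied, with no loop overhead. A C analogue using switch fall-through (hypothetical helper):

/* Duff's-device-style tail: jump into an unrolled run of byte copies so
 * that exactly rem (0..15) bytes are moved. */
static void copy_tail(unsigned char *dst, const unsigned char *src, unsigned rem)
{
	switch (rem) {
	case 15: *dst++ = *src++; /* fall through */
	case 14: *dst++ = *src++; /* fall through */
	case 13: *dst++ = *src++; /* fall through */
	case 12: *dst++ = *src++; /* fall through */
	case 11: *dst++ = *src++; /* fall through */
	case 10: *dst++ = *src++; /* fall through */
	case  9: *dst++ = *src++; /* fall through */
	case  8: *dst++ = *src++; /* fall through */
	case  7: *dst++ = *src++; /* fall through */
	case  6: *dst++ = *src++; /* fall through */
	case  5: *dst++ = *src++; /* fall through */
	case  4: *dst++ = *src++; /* fall through */
	case  3: *dst++ = *src++; /* fall through */
	case  2: *dst++ = *src++; /* fall through */
	case  1: *dst++ = *src++; /* fall through */
	case  0: break;
	}
}

The avr32 memset further down uses the same trick with a .rept 7 run of byte stores.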
Expect bad performance */ -       sub     len, 4 -       brlt    2f -1:     ld.w    r0, src++ -       st.w    dst++, r0 -       sub     len, 4 -       brge    1b - -2:     neg     len -       add     pc, pc, len << 2 -       .rept   3 -       ld.ub   r0, src++ -       st.b    dst++, r0 -       .endr - -       popm    r0-r7, pc -       .size   memcpy, . - memcpy +	/* src is aligned, but dst is not. Expect bad performance */ +	sub	len, 4 +	brlt	2f +1:	ld.w	r0, src++ +	st.w	dst++, r0 +	sub	len, 4 +	brge	1b + +2:	neg	len +	add	pc, pc, len << 2 +	.rept	3 +	ld.ub	r0, src++ +	st.b	dst++, r0 +	.endr + +	popm	r0-r7, pc +	.size	memcpy, . - memcpy  libc_hidden_def(memcpy) diff --git a/libc/string/avr32/memmove.S b/libc/string/avr32/memmove.S index 8ca4da54d..535f4a257 100644 --- a/libc/string/avr32/memmove.S +++ b/libc/string/avr32/memmove.S @@ -10,107 +10,107 @@  #define src r11  #define len r10 -       .text -       .global memmove -       .type   memmove, @function +	.text +	.global memmove +	.type	memmove, @function  memmove: -       cp.w    src, dst -       brge    HIDDEN_JUMPTARGET(memcpy) - -       add     dst, len -       add     src, len -       pref    src[-1] - -       /* -        * The rest is basically the same as in memcpy.S except that -        * the direction is reversed. -        */ -       cp.w    len, 32 -       brge    .Lmore_than_31 - -       sub     len, 1 -       retlt   r12 -1:     ld.ub   r8, --src -       st.b    --dst, r8 -       sub     len, 1 -       brge    1b -       retal   r12 +	cp.w	src, dst +	brge	HIDDEN_JUMPTARGET(memcpy) + +	add	dst, len +	add	src, len +	pref	src[-1] + +	/* +	 * The rest is basically the same as in memcpy.S except that +	 * the direction is reversed. +	 */ +	cp.w	len, 32 +	brge	.Lmore_than_31 + +	sub	len, 1 +	retlt	r12 +1:	ld.ub	r8, --src +	st.b	--dst, r8 +	sub	len, 1 +	brge	1b +	retal	r12  .Lmore_than_31: -       pushm   r0-r7, lr +	pushm	r0-r7, lr -       /* Check alignment */ -       mov     r8, src -       andl    r8, 31, COH -       brne    .Lunaligned_src -       mov     r8, r12 -       andl    r8, 3, COH -       brne    .Lunaligned_dst +	/* Check alignment */ +	mov	r8, src +	andl	r8, 31, COH +	brne	.Lunaligned_src +	mov	r8, r12 +	andl	r8, 3, COH +	brne	.Lunaligned_dst  .Laligned_copy: -       sub     len, 32 -       brlt    .Lless_than_32 +	sub	len, 32 +	brlt	.Lless_than_32 -1:     /* Copy 32 bytes at a time */ -       sub     src, 32 -       ldm     src, r0-r7 -       sub     dst, 32 -       sub     len, 32 -       stm     dst, r0-r7 -       brge    1b +1:	/* Copy 32 bytes at a time */ +	sub	src, 32 +	ldm	src, r0-r7 +	sub	dst, 32 +	sub	len, 32 +	stm	dst, r0-r7 +	brge	1b  .Lless_than_32: -       /* Copy 16 more bytes if possible */ -       sub     len, -16 -       brlt    .Lless_than_16 -       sub     src, 16 -       ldm     src, r0-r3 -       sub     dst, 16 -       sub     len, 16 -       stm     dst, r0-r3 +	/* Copy 16 more bytes if possible */ +	sub	len, -16 +	brlt	.Lless_than_16 +	sub	src, 16 +	ldm	src, r0-r3 +	sub	dst, 16 +	sub	len, 16 +	stm	dst, r0-r3  .Lless_than_16: -       /* Do the remaining as byte copies */ -       sub     len, -16 -       breq    2f -1:     ld.ub   r0, --src -       st.b    --dst, r0 -       sub     len, 1 -       brne    1b +	/* Do the remaining as byte copies */ +	sub	len, -16 +	breq	2f +1:	ld.ub	r0, --src +	st.b	--dst, r0 +	sub	len, 1 +	brne	1b -2:     popm    r0-r7, pc +2:	popm	r0-r7, pc  .Lunaligned_src: -       /* Make src cacheline-aligned. 
r8 = (src & 31) */ -       sub     len, r8 -1:     ld.ub   r0, --src -       st.b    --dst, r0 -       sub     r8, 1 -       brne    1b - -       /* If dst is word-aligned, we're ready to go */ -       pref    src[-4] -       mov     r8, 3 -       tst     dst, r8 -       breq    .Laligned_copy +	/* Make src cacheline-aligned. r8 = (src & 31) */ +	sub	len, r8 +1:	ld.ub	r0, --src +	st.b	--dst, r0 +	sub	r8, 1 +	brne	1b + +	/* If dst is word-aligned, we're ready to go */ +	pref	src[-4] +	mov	r8, 3 +	tst	dst, r8 +	breq	.Laligned_copy  .Lunaligned_dst: -       /* src is aligned, but dst is not. Expect bad performance */ -       sub     len, 4 -       brlt    2f -1:     ld.w    r0, --src -       st.w    --dst, r0 -       sub     len, 4 -       brge    1b - -2:     neg     len -       add     pc, pc, len << 2 -       .rept   3 -       ld.ub   r0, --src -       st.b    --dst, r0 -       .endr - -       popm    r0-r7, pc -       .size   memmove, . - memmove +	/* src is aligned, but dst is not. Expect bad performance */ +	sub	len, 4 +	brlt	2f +1:	ld.w	r0, --src +	st.w	--dst, r0 +	sub	len, 4 +	brge	1b + +2:	neg	len +	add	pc, pc, len << 2 +	.rept	3 +	ld.ub	r0, --src +	st.b	--dst, r0 +	.endr + +	popm	r0-r7, pc +	.size	memmove, . - memmove  libc_hidden_def(memmove) diff --git a/libc/string/avr32/memset.S b/libc/string/avr32/memset.S index 964bf4834..472b2be35 100644 --- a/libc/string/avr32/memset.S +++ b/libc/string/avr32/memset.S @@ -12,54 +12,54 @@  #define c r11  #define n r10 -       .text -       .global memset -       .type   memset, @function +	.text +	.global memset +	.type	memset, @function -       .align  1 +	.align	1  memset: -       cp.w    n, 32 -       mov     r9, s -       brge    .Llarge_memset +	cp.w	n, 32 +	mov	r9, s +	brge	.Llarge_memset -       sub     n, 1 -       retlt   s -1:     st.b    s++, c -       sub     n, 1 -       brge    1b +	sub	n, 1 +	retlt	s +1:	st.b	s++, c +	sub	n, 1 +	brge	1b -       retal   r9 +	retal	r9  .Llarge_memset: -       mov     r8, r11 -       mov     r11, 3 -       bfins   r8, r8, 8, 8 -       bfins   r8, r8, 16, 16 -       tst     s, r11 -       breq    2f +	mov	r8, r11 +	mov	r11, 3 +	bfins	r8, r8, 8, 8 +	bfins	r8, r8, 16, 16 +	tst	s, r11 +	breq	2f -1:     st.b    s++, r8 -       sub     n, 1 -       tst     s, r11 -       brne    1b +1:	st.b	s++, r8 +	sub	n, 1 +	tst	s, r11 +	brne	1b -2:     mov     r11, r9 -       mov     r9, r8 -       sub     n, 8 +2:	mov	r11, r9 +	mov	r9, r8 +	sub	n, 8 -3:     st.d    s++, r8 -       sub     n, 8 -       brge    3b +3:	st.d	s++, r8 +	sub	n, 8 +	brge	3b -       /* If we are done, n == -8 and we'll skip all st.b insns below */ -       neg     n -       lsl     n, 1 -       add     pc, n -       .rept   7 -       st.b    s++, r8 -       .endr -       retal   r11 +	/* If we are done, n == -8 and we'll skip all st.b insns below */ +	neg	n +	lsl	n, 1 +	add	pc, n +	.rept	7 +	st.b	s++, r8 +	.endr +	retal	r11 -       .size   memset, . - memset +	.size	memset, . 
- memset  libc_hidden_def(memset) diff --git a/libc/string/avr32/strcmp.S b/libc/string/avr32/strcmp.S index e9f087577..f73bd43e7 100644 --- a/libc/string/avr32/strcmp.S +++ b/libc/string/avr32/strcmp.S @@ -12,77 +12,77 @@  #define s2 r11  #define len r10 -       .text -       .global strcmp -       .type   strcmp, @function -       .align  1 +	.text +	.global strcmp +	.type	strcmp, @function +	.align	1  strcmp: -       mov     r8, 3 -       tst     s1, r8 -       brne    .Lunaligned_s1 -       tst     s2, r8 -       brne    .Lunaligned_s2 +	mov	r8, 3 +	tst	s1, r8 +	brne	.Lunaligned_s1 +	tst	s2, r8 +	brne	.Lunaligned_s2 -1:     ld.w    r8, s1++ -       ld.w    r9, s2++ -       cp.w    r8, r9 -       brne    2f -       tnbz    r8 -       brne    1b -       retal   0 +1:	ld.w	r8, s1++ +	ld.w	r9, s2++ +	cp.w	r8, r9 +	brne	2f +	tnbz	r8 +	brne	1b +	retal	0 -2:     bfextu  r12, r8, 24, 8 -       bfextu  r11, r9, 24, 8 -       sub     r12, r11 -       retne   r12 -       cp.w    r11, 0 -       reteq   0 -       bfextu  r12, r8, 16, 8 -       bfextu  r11, r9, 16, 8 -       sub     r12, r11 -       retne   r12 -       cp.w    r11, 0 -       reteq   0 -       bfextu  r12, r8, 8, 8 -       bfextu  r11, r9, 8, 8 -       sub     r12, r11 -       retne   r12 -       cp.w    r11, 0 -       reteq   0 -       bfextu  r12, r8, 0, 8 -       bfextu  r11, r9, 0, 8 -       sub     r12, r11 -       retal   r12 +2:	bfextu	r12, r8, 24, 8 +	bfextu	r11, r9, 24, 8 +	sub	r12, r11 +	retne	r12 +	cp.w	r11, 0 +	reteq	0 +	bfextu	r12, r8, 16, 8 +	bfextu	r11, r9, 16, 8 +	sub	r12, r11 +	retne	r12 +	cp.w	r11, 0 +	reteq	0 +	bfextu	r12, r8, 8, 8 +	bfextu	r11, r9, 8, 8 +	sub	r12, r11 +	retne	r12 +	cp.w	r11, 0 +	reteq	0 +	bfextu	r12, r8, 0, 8 +	bfextu	r11, r9, 0, 8 +	sub	r12, r11 +	retal	r12  .Lunaligned_s1: -3:     tst     s1, r8 -       breq    4f -       ld.ub   r10, s1++ -       ld.ub   r9, s2++ -       sub     r10, r9 -       retne   r10 -       cp.w    r9, 0 -       brne    3b -       retal   r10 +3:	tst	s1, r8 +	breq	4f +	ld.ub	r10, s1++ +	ld.ub	r9, s2++ +	sub	r10, r9 +	retne	r10 +	cp.w	r9, 0 +	brne	3b +	retal	r10 -4:     tst     s2, r8 -       breq    1b +4:	tst	s2, r8 +	breq	1b  .Lunaligned_s2: -       /* -        * s1 and s2 can't both be aligned, and unaligned word loads -        * can trigger spurious exceptions if we cross a page boundary. -        * Do it the slow way... -        */ -1:     ld.ub   r8, s1++ -       ld.ub   r9, s2++ -       sub     r8, r9 -       retne   r8 -       cp.w    r9, 0 -       brne    1b -       retal   0 +	/* +	 * s1 and s2 can't both be aligned, and unaligned word loads +	 * can trigger spurious exceptions if we cross a page boundary. +	 * Do it the slow way... +	 */ +1:	ld.ub	r8, s1++ +	ld.ub	r9, s2++ +	sub	r8, r9 +	retne	r8 +	cp.w	r9, 0 +	brne	1b +	retal	0 -       .size   strcmp, . - strcmp +	.size	strcmp, . 
- strcmp  libc_hidden_def(strcmp)  #ifndef __UCLIBC_HAS_LOCALE__ diff --git a/libc/string/avr32/strlen.S b/libc/string/avr32/strlen.S index d2808998d..5223e5365 100644 --- a/libc/string/avr32/strlen.S +++ b/libc/string/avr32/strlen.S @@ -10,53 +10,53 @@  #define str r12 -       .text -       .global strlen -       .type   strlen, @function +	.text +	.global strlen +	.type	strlen, @function  strlen: -       mov     r11, r12 - -       mov     r9, str -       andl    r9, 3, COH -       brne    .Lunaligned_str - -1:     ld.w    r8, str++ -       tnbz    r8 -       brne    1b - -       sub     r12, r11 -       bfextu  r9, r8, 24, 8 -       cp.w    r9, 0 -       subeq   r12, 4 -       reteq   r12 -       bfextu  r9, r8, 16, 8 -       cp.w    r9, 0 -       subeq   r12, 3 -       reteq   r12 -       bfextu  r9, r8, 8, 8 -       cp.w    r9, 0 -       subeq   r12, 2 -       reteq   r12 -       sub     r12, 1 -       retal   r12 +	mov	r11, r12 + +	mov	r9, str +	andl	r9, 3, COH +	brne	.Lunaligned_str + +1:	ld.w	r8, str++ +	tnbz	r8 +	brne	1b + +	sub	r12, r11 +	bfextu	r9, r8, 24, 8 +	cp.w	r9, 0 +	subeq	r12, 4 +	reteq	r12 +	bfextu	r9, r8, 16, 8 +	cp.w	r9, 0 +	subeq	r12, 3 +	reteq	r12 +	bfextu	r9, r8, 8, 8 +	cp.w	r9, 0 +	subeq	r12, 2 +	reteq	r12 +	sub	r12, 1 +	retal	r12  .Lunaligned_str: -       add     pc, pc, r9 << 3 -       sub     r0, r0, 0       /* 4-byte nop */ -       ld.ub   r8, str++ -       sub     r8, r8, 0 -       breq    1f -       ld.ub   r8, str++ -       sub     r8, r8, 0 -       breq    1f -       ld.ub   r8, str++ -       sub     r8, r8, 0 -       brne    1b - -1:     sub     r12, 1 -       sub     r12, r11 -       retal   r12 - -       .size   strlen, . - strlen +	add	pc, pc, r9 << 3 +	sub	r0, r0, 0	/* 4-byte nop */ +	ld.ub	r8, str++ +	sub	r8, r8, 0 +	breq	1f +	ld.ub	r8, str++ +	sub	r8, r8, 0 +	breq	1f +	ld.ub	r8, str++ +	sub	r8, r8, 0 +	brne	1b + +1:	sub	r12, 1 +	sub	r12, r11 +	retal	r12 + +	.size	strlen, . - strlen  libc_hidden_def(strlen) diff --git a/libc/string/bfin/memchr.S b/libc/string/bfin/memchr.S index 23626d6a4..88e46bef6 100644 --- a/libc/string/bfin/memchr.S +++ b/libc/string/bfin/memchr.S @@ -1,5 +1,5 @@  /* memchr.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. + * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved.   *   * This file is subject to the terms and conditions of the GNU Library General   * Public License. 
See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@   * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html   */ +#include <sysdep.h> +  /* void *memchr(const void *s, int c, size_t n);   * R0 = address (s)   * R1 = sought byte (c) @@ -21,30 +23,29 @@  .align 2 -.global _memchr -.type _memchr, STT_FUNC -_memchr: +.weak _memchr +ENTRY(_memchr)  	P0 = R0;             // P0 = address  	P2 = R2;             // P2 = count  	R1 = R1.B(Z);  	CC = R2 == 0; -	IF CC JUMP failed; +	IF CC JUMP .Lfailed; -bytes: -	LSETUP (byte_loop_s , byte_loop_e) LC0=P2; +.Lbytes: +	LSETUP (.Lbyte_loop_s, .Lbyte_loop_e) LC0=P2; -byte_loop_s: +.Lbyte_loop_s:  	R3 = B[P0++](Z);  	CC = R3 == R1; -	IF CC JUMP found; -byte_loop_e: +	IF CC JUMP .Lfound; +.Lbyte_loop_e:  	NOP; -failed: +.Lfailed:  	R0=0;  	RTS; -found: +.Lfound:  	R0 = P0;  	R0 += -1;  	RTS; diff --git a/libc/string/bfin/memcmp.S b/libc/string/bfin/memcmp.S index f2679d5ae..7cc76ad96 100644 --- a/libc/string/bfin/memcmp.S +++ b/libc/string/bfin/memcmp.S @@ -1,5 +1,5 @@  /* memcmp.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. + * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved.   *   * This file is subject to the terms and conditions of the GNU Library General   * Public License. See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@   * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html   */ +#include <sysdep.h> +  /* int memcmp(const void *s1, const void *s2, size_t n);   * R0 = First Address (s1)   * R1 = Second Address (s2) @@ -21,28 +23,27 @@  .align 2 -.global _memcmp -.type _memcmp, STT_FUNC -_memcmp: +.weak _memcmp +ENTRY(_memcmp)  	I1 = P3; -	P0 = R0;              // P0 = s1 address -	P3 = R1;              // P3 = s2 Address -	P2 = R2 ;             // P2 = count +	P0 = R0;			/* P0 = s1 address */ +	P3 = R1;			/* P3 = s2 Address  */ +	P2 = R2 ;			/* P2 = count */  	CC = R2 <= 7(IU); -	IF CC JUMP  too_small; -	I0 = R1;		    // s2 -	R1 = R1 | R0;         // OR addresses together -	R1 <<= 30;            // check bottom two bits -	CC =  AZ;             // AZ set if zero. -	IF !CC JUMP  bytes ;  // Jump if addrs not aligned. +	IF CC JUMP .Ltoo_small; +	I0 = R1;			/* s2 */ +	R1 = R1 | R0;		/* OR addresses together */ +	R1 <<= 30;		/* check bottom two bits */ +	CC =  AZ;			/* AZ set if zero. */ +	IF !CC JUMP .Lbytes ;	/* Jump if addrs not aligned. 
*/ -	P1 = P2 >> 2;          // count = n/4 +	P1 = P2 >> 2;		/* count = n/4 */  	R3 =  3; -	R2 = R2 & R3;         // remainder -	P2 = R2;               // set remainder +	R2 = R2 & R3;		/* remainder */ +	P2 = R2;			/* set remainder */ -	LSETUP (quad_loop_s , quad_loop_e) LC0=P1; -quad_loop_s: +	LSETUP (.Lquad_loop_s, .Lquad_loop_e) LC0=P1; +.Lquad_loop_s:  #if !defined(__WORKAROUND_AVOID_DAG1)  	MNOP || R0 = [P0++] || R1 = [I0++];  #else @@ -50,52 +51,54 @@ quad_loop_s:  	R1 = [I0++];  #endif  	CC = R0 == R1; -	IF !CC JUMP quad_different; -quad_loop_e: +	IF !CC JUMP .Lquad_different; +.Lquad_loop_e:  	NOP; -	P3 = I0;                 // s2 -too_small: -	CC = P2 == 0;            //Check zero count -	IF CC JUMP finished;     // very unlikely +	P3 = I0;			/* s2 */ +.Ltoo_small: +	CC = P2 == 0;		/* Check zero count*/ +	IF CC JUMP .Lfinished;	/* very unlikely*/ -bytes: -	LSETUP (byte_loop_s , byte_loop_e) LC0=P2; -byte_loop_s: -	R1 = B[P3++](Z);	// *s2 -	R0 = B[P0++](Z);	// *s1 +.Lbytes: +	LSETUP (.Lbyte_loop_s, .Lbyte_loop_e) LC0=P2; +.Lbyte_loop_s: +	R1 = B[P3++](Z);	/* *s2 */ +	R0 = B[P0++](Z);	/* *s1 */  	CC = R0 == R1; -	IF !CC JUMP different; -byte_loop_e: +	IF !CC JUMP .Ldifferent; +.Lbyte_loop_e:  	NOP; -different: +.Ldifferent:  	R0 = R0 - R1;  	P3 = I1;  	RTS; -quad_different: -	// We've read two quads which don't match. -	// Can't just compare them, because we're -	// a little-endian machine, so the MSBs of -	// the regs occur at later addresses in the -	// string. -	// Arrange to re-read those two quads again, -	// byte-by-byte. -	P0 += -4;	// back up to the start of the -	P3 = I0;	// quads, and increase the -	P2 += 4;	// remainder count +.Lquad_different: +	/* We've read two quads which don't match. +	 * Can't just compare them, because we're +	 * a little-endian machine, so the MSBs of +	 * the regs occur at later addresses in the +	 * string. +	 * Arrange to re-read those two quads again, +	 * byte-by-byte. +	 */ +	P0 += -4;		/* back up to the start of the */ +	P3 = I0;		/* quads, and increase the*/ +	P2 += 4;		/* remainder count*/  	P3 += -4; -	JUMP bytes; +	JUMP .Lbytes; -finished: +.Lfinished:  	R0 = 0;  	P3 = I1;  	RTS; +  .size _memcmp,.-_memcmp  libc_hidden_def (memcmp)  #ifdef __UCLIBC_SUSV3_LEGACY__ -strong_alias (memcmp,bcmp) +weak_alias (memcmp,bcmp)  #endif diff --git a/libc/string/bfin/memcpy.S b/libc/string/bfin/memcpy.S index e7ba7048e..bdd760691 100644 --- a/libc/string/bfin/memcpy.S +++ b/libc/string/bfin/memcpy.S @@ -1,5 +1,5 @@  /* memcpy.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. + * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved.   *   * This file is subject to the terms and conditions of the GNU Library General   * Public License. 
See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@   * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html   */ +#include <sysdep.h> +  /* void *memcpy(void *dest, const void *src, size_t n);   * R0 = To Address (dest) (leave unchanged to form result)   * R1 = From Address (src) @@ -21,54 +23,55 @@  .align 2 -.global _memcpy -.type _memcpy, STT_FUNC -_memcpy: +.weak _memcpy +ENTRY(_memcpy)  	[--SP] = P3; -	P0 = R0;              // P0 = To address -	P3 = R1;              // P3 = From Address -	P2 = R2 ;             // P2 = count +	P0 = R0;              /* P0 = To address */ +	P3 = R1;              /* P3 = From Address */ +	P2 = R2;              /* P2 = count */  	CC = R2 <= 7(IU); -	IF CC JUMP  too_small; +	IF CC JUMP .Ltoo_small;  	I0 = R1; -	R3 = R1 | R0;         // OR addresses together -	R3 <<= 30;            // check bottom two bits -	CC =  AZ;             // AZ set if zero. -	IF !CC JUMP  bytes ;  // Jump if addrs not aligned. -	P1 = P2 >> 2;         // count = n/4 +	R3 = R1 | R0;         /* OR addresses together */ +	R3 <<= 30;            /* check bottom two bits */ +	CC =  AZ;             /* AZ set if zero. */ +	IF !CC JUMP .Lbytes;  /* Jump if addrs not aligned. */ +	P1 = P2 >> 2;         /* count = n/4 */  	P1 += -1;  	R3 =  3; -	R2 = R2 & R3;         // remainder -	P2 = R2;              // set remainder +	R2 = R2 & R3;         /* remainder */ +	P2 = R2;              /* set remainder */  	R1 = [I0++];  #if !defined(__WORKAROUND_AVOID_DAG1) -	LSETUP (quad_loop , quad_loop) LC0=P1; -quad_loop:		MNOP || [P0++] = R1 || R1 = [I0++]; +	LSETUP (.Lquad_loop, .Lquad_loop) LC0=P1; +.Lquad_loop:	MNOP || [P0++] = R1 || R1 = [I0++];  #else -	LSETUP (quad_loop_s , quad_loop_e) LC0=P1; -quad_loop_s:	[P0++] = R1; -quad_loop_e:	R1 = [I0++]; +	LSETUP (.Lquad_loop_s, .Lquad_loop_e) LC0=P1; +.Lquad_loop_s:	[P0++] = R1; +.Lquad_loop_e:	R1 = [I0++];  #endif  	[P0++] = R1; -	CC = P2 == 0;         // any remaining bytes? -	P3 = I0;	      // Ammend P3 for remaining copy -	IF !CC JUMP bytes; +	CC = P2 == 0;         /* any remaining bytes? */ +	P3 = I0;              /* Ammend P3 for remaining copy */ +	IF !CC JUMP .Lbytes;  	P3 = [SP++];  	RTS; -too_small: -	CC = P2 == 0;           //Check zero count -	IF CC JUMP finished;    // very unlikely +.Ltoo_small: +	CC = P2 == 0;          /* Check zero count */ +	IF CC JUMP .Lfinished; /* very unlikely */ -bytes: -	LSETUP (byte_loop_s , byte_loop_e) LC0=P2; -byte_loop_s:	R1 = B[P3++](Z); -byte_loop_e:	B[P0++] = R1; +.Lbytes: +	LSETUP (.Lbyte_loop_s, .Lbyte_loop_e) LC0=P2; +.Lbyte_loop_s:	R1 = B[P3++](Z); +.Lbyte_loop_e:	B[P0++] = R1; -finished: +.Lfinished:  	P3 = [SP++]; +  	RTS; +  .size _memcpy,.-_memcpy  libc_hidden_def (memcpy) diff --git a/libc/string/bfin/memmove.S b/libc/string/bfin/memmove.S index 3d446f326..73e363820 100644 --- a/libc/string/bfin/memmove.S +++ b/libc/string/bfin/memmove.S @@ -1,5 +1,5 @@  /* memmove.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. + * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved.   *   * This file is subject to the terms and conditions of the GNU Library General   * Public License. 
See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@   * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html   */ +#include <sysdep.h> +  /* void *memmove(void *dest, const void *src, size_t n);   * R0 = To Address (dest) (leave unchanged to form result)   * R1 = From Address (src) @@ -21,75 +23,78 @@  .align 2 -.global _memmove -.type _memmove, STT_FUNC -_memmove: +.weak _memmove +ENTRY(_memmove)  	I1 = P3; -	P0 = R0;              // P0 = To address -	P3 = R1;              // P3 = From Address -	P2 = R2 ;             // P2 = count -	CC = P2 == 0;           //Check zero count -	IF CC JUMP finished;    // very unlikely +	P0 = R0;                  /* P0 = To address */ +	P3 = R1;                  /* P3 = From Address */ +	P2 = R2;                  /* P2 = count */ +	CC = P2 == 0;             /* Check zero count*/ +	IF CC JUMP .Lfinished;    /* very unlikely */ -	CC = R1 < R0 (IU);	// From < To -	IF !CC JUMP no_overlap; +	CC = R1 < R0 (IU);        /* From < To */ +	IF !CC JUMP .Lno_overlap;  	R3 = R1 + R2; -	CC = R0 <= R3 (IU);	// (From+len) >= To -	IF CC JUMP overlap; -no_overlap: +	CC = R0 <= R3 (IU);       /* (From+len) >= To */ +	IF CC JUMP .Loverlap; +.Lno_overlap:  	R3 = 11;  	CC = R2 <= R3; -	IF CC JUMP  bytes; -	R3 = R1 | R0;         // OR addresses together -	R3 <<= 30;            // check bottom two bits -	CC =  AZ;             // AZ set if zero. -	IF !CC JUMP  bytes ;  // Jump if addrs not aligned. +	IF CC JUMP .Lbytes; +	R3 = R1 | R0;             /* OR addresses together */ +	R3 <<= 30;                /* check bottom two bits */ +	CC =  AZ;                 /* AZ set if zero.*/ +	IF !CC JUMP .Lbytes;      /* Jump if addrs not aligned.*/  	I0 = P3; -	P1 = P2 >> 2;         // count = n/4 +	P1 = P2 >> 2;             /* count = n/4 */  	P1 += -1;  	R3 =  3; -	R2 = R2 & R3;         // remainder -	P2 = R2;              // set remainder +	R2 = R2 & R3;             /* remainder */ +	P2 = R2;                  /* set remainder */  	R1 = [I0++];  #if !defined(__WORKAROUND_AVOID_DAG1) -	LSETUP (quad_loop , quad_loop) LC0=P1; -quad_loop:		MNOP || [P0++] = R1 || R1 = [I0++]; +	LSETUP (.Lquad_loop, .Lquad_loop) LC0=P1; +.Lquad_loop:	MNOP || [P0++] = R1 || R1 = [I0++];  #else -	LSETUP (quad_loop_s, quad_loop_e) LC0=P1; -quad_loop_s:	[P0++] = R1; -quad_loop_e:	R1 = [I0++]; +	LSETUP (.Lquad_loop_s, .Lquad_loop_e) LC0=P1; +.Lquad_loop_s:	[P0++] = R1; +.Lquad_loop_e:	R1 = [I0++];  #endif  	[P0++] = R1; -	CC = P2 == 0;         // any remaining bytes? -	P3 = I0;		  // Ammend P3 to updated ptr. -	IF !CC JUMP bytes; +	CC = P2 == 0;             /* any remaining bytes? */ +	P3 = I0;                  /* Ammend P3 to updated ptr. 
*/ +	IF !CC JUMP .Lbytes;  	P3 = I1;  	RTS; -bytes:		LSETUP (byte2_s , byte2_e) LC0=P2; -byte2_s:	R1 = B[P3++](Z); -byte2_e:	B[P0++] = R1; +.Lbytes:     LSETUP (.Lbyte2_s, .Lbyte2_e) LC0=P2; +.Lbyte2_s:   R1 = B[P3++](Z); +.Lbyte2_e:   B[P0++] = R1; -finished: -	P3 = I1; +.Lfinished:  P3 = I1;  	RTS; -overlap: +.Loverlap:  	P2 += -1;  	P0 = P0 + P2;  	P3 = P3 + P2;  	R1 = B[P3--] (Z);  	CC = P2 == 0; -	IF CC JUMP no_loop; -	LSETUP (ol_s, ol_e) LC0 = P2; -ol_s:		B[P0--] = R1; -ol_e:		R1 = B[P3--] (Z); -no_loop:	B[P0] = R1; +	IF CC JUMP .Lno_loop; +#if defined(__WORKAROUND_SPECULATIVE_LOADS) +	NOP; +	NOP; +#endif +	LSETUP (.Lol_s, .Lol_e) LC0 = P2; +.Lol_s:    B[P0--] = R1; +.Lol_e:    R1 = B[P3--] (Z); +.Lno_loop: B[P0] = R1;  	P3 = I1;  	RTS; +  .size _memmove,.-_memmove  libc_hidden_def (memmove) diff --git a/libc/string/bfin/memset.S b/libc/string/bfin/memset.S index bd8eb4b6a..64012f783 100644 --- a/libc/string/bfin/memset.S +++ b/libc/string/bfin/memset.S @@ -1,5 +1,5 @@  /* memset.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. + * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved.   *   * This file is subject to the terms and conditions of the GNU Library General   * Public License. See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@   * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html   */ +#include <sysdep.h> +  /* void *memset(void *s, int c, size_t n);   * R0 = address (s) (leave unchanged to form result)   * R1 = filler byte (c) @@ -21,66 +23,68 @@  .align 2 -.global _memset -.type _memset, STT_FUNC -_memset: -	P0 = R0 ;             // P0 = address -	P2 = R2 ;             // P2 = count -	R3 = R0 + R2;         // end +.weak _memset +ENTRY(_memset) +	P0 = R0 ;              /* P0 = address */ +	P2 = R2 ;              /* P2 = count   */ +	R3 = R0 + R2;          /* end          */  	CC = R2 <= 7(IU); -	IF CC JUMP  too_small; -	R1 = R1.B (Z);        // R1 = fill char +	IF CC JUMP  .Ltoo_small; +	R1 = R1.B (Z);         /* R1 = fill char */  	R2 =  3; -	R2 = R0 & R2;         // addr bottom two bits -	CC =  R2 == 0;             // AZ set if zero. -	IF !CC JUMP  force_align ;  // Jump if addr not aligned. +	R2 = R0 & R2;          /* addr bottom two bits */ +	CC =  R2 == 0;             /* AZ set if zero.	*/ +	IF !CC JUMP  .Lforce_align ;  /* Jump if addr not aligned. 
*/ -aligned: -	P1 = P2 >> 2;          // count = n/4 -	R2 = R1 <<  8;         // create quad filler +.Laligned: +	P1 = P2 >> 2;          /* count = n/4        */ +	R2 = R1 <<  8;         /* create quad filler */  	R2.L = R2.L + R1.L(NS);  	R2.H = R2.L + R1.H(NS);  	P2 = R3; -	LSETUP (quad_loop , quad_loop) LC0=P1; -quad_loop: +	LSETUP (.Lquad_loop , .Lquad_loop) LC0=P1; +.Lquad_loop:  	[P0++] = R2;  	CC = P0 == P2; -	IF !CC JUMP bytes_left; +	IF !CC JUMP .Lbytes_left;  	RTS; -bytes_left: -	R2 = R3;         // end point -	R3 = P0;         // current position -	R2 = R2 - R3;    // bytes left +.Lbytes_left: +	R2 = R3;                /* end point */ +	R3 = P0;                /* current position */ +	R2 = R2 - R3;           /* bytes left */  	P2 = R2; -too_small: -	CC = P2 == 0;           //Check zero count -	IF CC JUMP finished;    // Unusual +.Ltoo_small: +	CC = P2 == 0;           /* Check zero count */ +	IF CC JUMP .Lfinished;    /* Unusual */ -bytes:       LSETUP (byte_loop , byte_loop) LC0=P2; -byte_loop:   B[P0++] = R1; +.Lbytes: +	LSETUP (.Lbyte_loop , .Lbyte_loop) LC0=P2; +.Lbyte_loop: +	B[P0++] = R1; -finished: +.Lfinished:  	RTS; -force_align: -	CC = BITTST (R0, 0 );  // odd byte +.Lforce_align: +	CC = BITTST (R0, 0);  /* odd byte */  	R0 = 4;  	R0 = R0 - R2;  	P1 = R0; -	R0 = P0;			// Recover return address -	IF !CC JUMP skip1; +	R0 = P0;		    /* Recover return address */ +	IF !CC JUMP .Lskip1;  	B[P0++] = R1; -skip1: -	CC = R2 <= 2;          // 2 bytes -	P2 -= P1;              // reduce count -	IF !CC JUMP aligned; +.Lskip1: +	CC = R2 <= 2;          /* 2 bytes */ +	P2 -= P1;              /* reduce count */ +	IF !CC JUMP .Laligned;  	B[P0++] = R1;  	B[P0++] = R1; -	JUMP aligned; +	JUMP .Laligned; +  .size _memset,.-_memset  libc_hidden_def (memset) diff --git a/libc/string/bfin/strcmp.S b/libc/string/bfin/strcmp.S index 6365024ec..12e8c53c6 100644 --- a/libc/string/bfin/strcmp.S +++ b/libc/string/bfin/strcmp.S @@ -1,5 +1,5 @@  /* strcmp.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. + * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved.   *   * This file is subject to the terms and conditions of the GNU Library General   * Public License. See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@   * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html   */ +#include <sysdep.h> +  /* Fast strcmp() for Blackfin.   * When both strings are aligned, this processes four characters at   * a time. Uses a hw loop with "very big" count to loop "forever", @@ -21,9 +23,8 @@  .align 2 -.global _strcmp -.type _strcmp, STT_FUNC -_strcmp: +.weak _strcmp +ENTRY(_strcmp)  	[--sp] = (R7:4);  	p1 = r0;  	p2 = r1; @@ -34,13 +35,13 @@ _strcmp:  	r0 = r0 | r1;	// check both pointers at same time  	r0 <<= 30;	// dump all but last 2 bits  	cc = az;	// are they zero? -	if !cc jump unaligned;	// no; use unaligned code. +	if !cc jump .Lunaligned;	// no; use unaligned code.  			// fall-thru for aligned case..  	  // note that r0 is zero from the previous...  	  
//           p0 set to -1 -	lsetup (beginloop, endloop) lc0=p0; +	LSETUP (.Lbeginloop, .Lendloop) lc0=p0;  	  // pick up first words  	r1 = [p1++];  	r2 = [p2++]; @@ -49,8 +50,8 @@ _strcmp:  	r7.h = 0xFF;  		// loop : 9 cycles to check 4 characters  	cc = r1 == r2; -beginloop: -	if !cc jump notequal4;	// compare failure, exit loop +.Lbeginloop: +	if !cc jump .Lnotequal4;	// compare failure, exit loop  	  // starting with   44332211  	  // see if char 3 or char 1 is 0 @@ -63,18 +64,18 @@ beginloop:  	// add to zero,  and  (r1 is free, reload)  	r6 = r3 +|+ r0 || r1 = [p1++] || nop;  	cc |= az;	// true if either is zero -	if cc jump zero4;	// leave if a zero somewhere -endloop: +	if cc jump .Lzero4;	// leave if a zero somewhere +.Lendloop:  	cc = r1 == r2;   // loop exits -notequal4:		// compare failure on 4-char compare +.Lnotequal4:		// compare failure on 4-char compare  			// address pointers are one word ahead;  			// faster to use zero4 exit code  	p1 += 4;  	p2 += 4; -zero4:			// one of the bytes in word 1 is zero +.Lzero4:			// one of the bytes in word 1 is zero  			// but we've already fetched the next word; so  			// backup two to look at failing word again  	p1 += -8; @@ -85,27 +86,27 @@ zero4:			// one of the bytes in word 1 is zero  		// here when pointers are unaligned: checks one  		// character at a time.  Also use at the end of  		// the word-check algorithm to figure out what happened -unaligned: +.Lunaligned:  	  //	R0 is non-zero from before.  	  //           p0 set to -1  	r0 = 0 (Z);  	r1 = B[p1++] (Z);  	r2 = B[p2++] (Z); -	lsetup (beginloop1, endloop1) lc0=p0; +	LSETUP (.Lbeginloop1, .Lendloop1) lc0=p0; -beginloop1: +.Lbeginloop1:  	cc = r1;	// first char must be non-zero  	// chars must be the same  	r3 = r2 - r1 (NS) || r1 = B[p1++] (Z) || nop;  	cc &= az;  	r3 = r0 - r2;	// second char must be non-zero  	cc &= an; -	if !cc jump exitloop1; -endloop1: +	if !cc jump .Lexitloop1; +.Lendloop1:  	r2 = B[p2++] (Z); -exitloop1: // here means we found a zero or a difference. +.Lexitloop1: // here means we found a zero or a difference.  	   // we have r2(N), p2(N), r1(N+1), p1(N+2)  	r1=B[p1+ -2] (Z);  	r0 = r1 - r2; @@ -116,6 +117,6 @@ exitloop1: // here means we found a zero or a difference.  libc_hidden_def (strcmp)  #ifndef __UCLIBC_HAS_LOCALE__ -strong_alias (strcmp,strcoll) +weak_alias (strcmp,strcoll)  libc_hidden_def (strcoll)  #endif diff --git a/libc/string/cris/memcpy.c b/libc/string/cris/memcpy.c index a85108109..0cce37a30 100644 --- a/libc/string/cris/memcpy.c +++ b/libc/string/cris/memcpy.c @@ -66,7 +66,7 @@  void *memcpy(void *, const void *, unsigned int); -libc_hidden_proto(memcpy) +/* Experimentally off - libc_hidden_proto(memcpy) */  void *memcpy(void *pdst,               const void *psrc,               unsigned int pn) @@ -130,7 +130,7 @@ void *memcpy(void *pdst,       here (beware: they may be moved to temporary registers).        This way, we do not have to save/move the registers around into       temporaries; we can safely use them straight away.  */ -    __asm__ volatile ("\ +    __asm__ __volatile__ ("\  	.syntax no_register_prefix					\n\  									\n\          ;; Check that the register asm declaration got right.		
\n\ diff --git a/libc/string/cris/memmove.c b/libc/string/cris/memmove.c index 437637078..b6620afe0 100644 --- a/libc/string/cris/memmove.c +++ b/libc/string/cris/memmove.c @@ -27,7 +27,7 @@  #include "memcopy.h"  #include "../generic/pagecopy.h" -libc_hidden_proto(memmove) +/* Experimentally off - libc_hidden_proto(memmove) */  void *memmove (void *dest, const void *src, size_t len)  {    unsigned long int dstp = (long int) dest; diff --git a/libc/string/cris/memset.c b/libc/string/cris/memset.c index 7e71bc50f..9cc959a33 100644 --- a/libc/string/cris/memset.c +++ b/libc/string/cris/memset.c @@ -59,7 +59,7 @@  void *memset(void *, int, unsigned long); -libc_hidden_proto(memset) +/* Experimentally off - libc_hidden_proto(memset) */  void *memset(void *pdst,               int c,               unsigned long plen) @@ -124,7 +124,7 @@ void *memset(void *pdst,       here (beware: they may be moved to temporary registers).        This way, we do not have to save/move the registers around into       temporaries; we can safely use them straight away.  */ -    __asm__ volatile ("								\n\ +    __asm__ __volatile__ ("								\n\  	.syntax no_register_prefix						\n\  										\n\          ;; Check that the register asm declaration got right.			\n\ diff --git a/libc/string/cris/strcpy.c b/libc/string/cris/strcpy.c index 0af25253e..955a990b7 100644 --- a/libc/string/cris/strcpy.c +++ b/libc/string/cris/strcpy.c @@ -6,7 +6,7 @@  #include <string.h> -libc_hidden_proto(strcpy) +/* Experimentally off - libc_hidden_proto(strcpy) */  char *strcpy(char *dest, const char *src)  {    char *ret = dest; diff --git a/libc/string/cris/strncpy.c b/libc/string/cris/strncpy.c index 93a6608bc..3f2775bdd 100644 --- a/libc/string/cris/strncpy.c +++ b/libc/string/cris/strncpy.c @@ -6,9 +6,9 @@  #include <string.h> -libc_hidden_proto(memset) +/* Experimentally off - libc_hidden_proto(memset) */ -libc_hidden_proto(strncpy) +/* Experimentally off - libc_hidden_proto(strncpy) */  char *strncpy(char *dest, const char *src, size_t count)  {    char *ret = dest; diff --git a/libc/string/frv/memset.S b/libc/string/frv/memset.S index 4e64550e4..477597dcd 100644 --- a/libc/string/frv/memset.S +++ b/libc/string/frv/memset.S @@ -155,4 +155,4 @@ memset:  	bralr  	.size		memset, .-memset -libc_hidden_proto(memset) +/* Experimentally off - libc_hidden_proto(memset) */ diff --git a/libc/string/i386/memchr.c b/libc/string/i386/memchr.c index 229d42919..fe4537914 100644 --- a/libc/string/i386/memchr.c +++ b/libc/string/i386/memchr.c @@ -32,7 +32,7 @@  #include <string.h> -libc_hidden_proto(memchr) +/* Experimentally off - libc_hidden_proto(memchr) */  void *memchr(const void *cs, int c, size_t count)  {      int d0; diff --git a/libc/string/i386/memcpy.c b/libc/string/i386/memcpy.c index a2b8d3d8c..285583f3b 100644 --- a/libc/string/i386/memcpy.c +++ b/libc/string/i386/memcpy.c @@ -32,7 +32,7 @@  #include <string.h> -libc_hidden_proto(memcpy) +/* Experimentally off - libc_hidden_proto(memcpy) */  void *memcpy(void * to, const void * from, size_t n)  {      int d0, d1, d2; diff --git a/libc/string/i386/memmove.c b/libc/string/i386/memmove.c index a26fe2be1..a924efcbc 100644 --- a/libc/string/i386/memmove.c +++ b/libc/string/i386/memmove.c @@ -32,7 +32,7 @@  #include <string.h> -libc_hidden_proto(memmove) +/* Experimentally off - libc_hidden_proto(memmove) */  void *memmove(void *dest, const void *src, size_t n)  {      int d0, d1, d2; diff --git a/libc/string/i386/memset.c b/libc/string/i386/memset.c index eea48040a..bbaa45215 100644 
diff --git a/libc/string/i386/memset.c b/libc/string/i386/memset.c
index eea48040a..bbaa45215 100644
--- a/libc/string/i386/memset.c
+++ b/libc/string/i386/memset.c
@@ -32,7 +32,7 @@
 
 #include <string.h>
 
-libc_hidden_proto(memset)
+/* Experimentally off - libc_hidden_proto(memset) */
 void *memset(void *s, int c, size_t count)
 {
     int d0, d1;
diff --git a/libc/string/i386/strcat.c b/libc/string/i386/strcat.c
index e0b1f3b51..2cf0237a6 100644
--- a/libc/string/i386/strcat.c
+++ b/libc/string/i386/strcat.c
@@ -32,7 +32,7 @@
 
 #include <string.h>
 
-libc_hidden_proto(strcat)
+/* Experimentally off - libc_hidden_proto(strcat) */
 char *strcat(char * dest, const char * src)
 {
     int d0, d1, d2, d3;
diff --git a/libc/string/i386/strchr.c b/libc/string/i386/strchr.c
index 7568d48db..46b1dfb6e 100644
--- a/libc/string/i386/strchr.c
+++ b/libc/string/i386/strchr.c
@@ -32,7 +32,7 @@
 
 #include <string.h>
 
-libc_hidden_proto(strchr)
+/* Experimentally off - libc_hidden_proto(strchr) */
 char *strchr(const char *s, int c)
 {
     int d0;
diff --git a/libc/string/i386/strcmp.c b/libc/string/i386/strcmp.c
index 47635d817..eff230c5c 100644
--- a/libc/string/i386/strcmp.c
+++ b/libc/string/i386/strcmp.c
@@ -32,7 +32,7 @@
 
 #include <string.h>
 
-libc_hidden_proto(strcmp)
+/* Experimentally off - libc_hidden_proto(strcmp) */
 int strcmp(const char *cs, const char *ct)
 {
     int d0, d1;
@@ -55,7 +55,7 @@ int strcmp(const char *cs, const char *ct)
 libc_hidden_def(strcmp)
 
 #ifndef __UCLIBC_HAS_LOCALE__
-libc_hidden_proto(strcoll)
+/* Experimentally off - libc_hidden_proto(strcoll) */
 strong_alias(strcmp,strcoll)
 libc_hidden_def(strcoll)
 #endif
diff --git a/libc/string/i386/strcpy.c b/libc/string/i386/strcpy.c
index 9e2b81009..09065a9b7 100644
--- a/libc/string/i386/strcpy.c
+++ b/libc/string/i386/strcpy.c
@@ -32,7 +32,7 @@
 
 #include <string.h>
 
-libc_hidden_proto(strcpy)
+/* Experimentally off - libc_hidden_proto(strcpy) */
 char *strcpy(char * dest, const char * src)
 {
     int d0, d1, d2;
diff --git a/libc/string/i386/strlen.c b/libc/string/i386/strlen.c
index f0767b600..61a178393 100644
--- a/libc/string/i386/strlen.c
+++ b/libc/string/i386/strlen.c
@@ -32,7 +32,7 @@
 
 #include <string.h>
 
-libc_hidden_proto(strlen)
+/* Experimentally off - libc_hidden_proto(strlen) */
 size_t strlen(const char *s)
 {
     int d0;
diff --git a/libc/string/i386/strncat.c b/libc/string/i386/strncat.c
index c1061421e..5849db3b3 100644
--- a/libc/string/i386/strncat.c
+++ b/libc/string/i386/strncat.c
@@ -32,7 +32,7 @@
 
 #include <string.h>
 
-libc_hidden_proto(strncat)
+/* Experimentally off - libc_hidden_proto(strncat) */
 char *strncat(char * dest,
 	const char * src, size_t count)
 {
diff --git a/libc/string/i386/strncmp.c b/libc/string/i386/strncmp.c
index d716789c3..a14bb503b 100644
--- a/libc/string/i386/strncmp.c
+++ b/libc/string/i386/strncmp.c
@@ -32,7 +32,7 @@
 
 #include <string.h>
 
-libc_hidden_proto(strncmp)
+/* Experimentally off - libc_hidden_proto(strncmp) */
 int strncmp(const char *cs, const char *ct, size_t count)
 {
     register int __res;
diff --git a/libc/string/i386/strncpy.c b/libc/string/i386/strncpy.c
index c061fe37e..76aa6ae1b 100644
--- a/libc/string/i386/strncpy.c
+++ b/libc/string/i386/strncpy.c
@@ -32,7 +32,7 @@
 
 #include <string.h>
 
-libc_hidden_proto(strncpy)
+/* Experimentally off - libc_hidden_proto(strncpy) */
 char *strncpy(char * dest, const char * src, size_t count)
 {
     int d0, d1, d2, d3;
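When locale support is off, `strcoll` is simply an alias for `strcmp` (the i386 file above uses `strong_alias`; the Blackfin file earlier switched to `weak_alias`, which lets a real strcoll definition win at link time). A minimal sketch of the two alias flavors with hypothetical names, assuming GCC attribute syntax:

    int my_strcmp(const char *a, const char *b)
    {
        while (*a && *a == *b) { a++; b++; }
        return (unsigned char)*a - (unsigned char)*b;
    }

    /* strong alias: a second, ordinary global definition of the code */
    extern __typeof(my_strcmp) strong_coll __attribute__((alias("my_strcmp")));

    /* weak alias: same code, but another object file providing a
       strong weak_coll silently overrides it at link time */
    extern __typeof(my_strcmp) weak_coll __attribute__((weak, alias("my_strcmp")));
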
diff --git a/libc/string/i386/strnlen.c b/libc/string/i386/strnlen.c
index 77b5c7568..02c72f530 100644
--- a/libc/string/i386/strnlen.c
+++ b/libc/string/i386/strnlen.c
@@ -33,7 +33,7 @@
 #include <string.h>
 
 #ifdef __USE_GNU
-libc_hidden_proto(strnlen)
+/* Experimentally off - libc_hidden_proto(strnlen) */
 size_t strnlen(const char *s, size_t count)
 {
     int d0;
diff --git a/libc/string/i386/strrchr.c b/libc/string/i386/strrchr.c
index e3b2df6fb..ef378685b 100644
--- a/libc/string/i386/strrchr.c
+++ b/libc/string/i386/strrchr.c
@@ -32,7 +32,7 @@
 
 #include <string.h>
 
-libc_hidden_proto(strrchr)
+/* Experimentally off - libc_hidden_proto(strrchr) */
 char *strrchr(const char *s, int c)
 {
     int d0, d1;
diff --git a/libc/string/ia64/memcpy.S b/libc/string/ia64/memcpy.S
index db019f860..810eb0c0e 100644
--- a/libc/string/ia64/memcpy.S
+++ b/libc/string/ia64/memcpy.S
@@ -115,7 +115,7 @@
 #if defined(USE_LFETCH)
 #define LOOP(shift)						\
 		ALIGN(32);					\
-.loop##shift##:							\
+.loop##shift :							\
 { .mmb								\
 (p[0])	ld8.nt1	r[0] = [asrc], 8 ;				\
 (p[0])	lfetch.nt1 [ptr1], 16 ;					\
@@ -139,7 +139,7 @@
 #else
 #define LOOP(shift)						\
 		ALIGN(32);					\
-.loop##shift##:							\
+.loop##shift :							\
 { .mmb								\
 (p[0])	ld8.nt1	r[0] = [asrc], 8 ;				\
 	nop.b 0 ;						\
diff --git a/libc/string/ia64/memmove.S b/libc/string/ia64/memmove.S
index 0328f84de..00342d8e0 100644
--- a/libc/string/ia64/memmove.S
+++ b/libc/string/ia64/memmove.S
@@ -64,7 +64,7 @@
 
 #define LOOP(shift)							\
 		ALIGN(32);						\
-.loop##shift##:								\
+.loop##shift :								\
 (p[0])		ld8	r[0] = [asrc], 8 ;	/* w1 */		\
 (p[MEMLAT+1])	st8	[dest] = value, 8 ;				\
 (p[MEMLAT])	shrp	value = r[MEMLAT], r[MEMLAT+1], shift ;		\
diff --git a/libc/string/ia64/sysdep.h b/libc/string/ia64/sysdep.h
index 03e74360d..d10020ac1 100644
--- a/libc/string/ia64/sysdep.h
+++ b/libc/string/ia64/sysdep.h
@@ -34,7 +34,7 @@
 #define ASM_UNW_PRLG_GRSAVE(ninputs)	(32+(ninputs))
 
 #ifdef	__STDC__
-#define C_LABEL(name)		name##:
+#define C_LABEL(name)		name :
 #else
 #define C_LABEL(name)		name/**/:
 #endif
diff --git a/libc/string/powerpc/memcpy.c b/libc/string/powerpc/memcpy.c
index ed8022313..bcbb806f8 100644
--- a/libc/string/powerpc/memcpy.c
+++ b/libc/string/powerpc/memcpy.c
@@ -21,7 +21,7 @@
 
 #include <string.h>
 
-libc_hidden_proto(memcpy)
+/* Experimentally off - libc_hidden_proto(memcpy) */
 void *memcpy(void *to, const void *from, size_t n)
 /* PPC can do pre increment and load/store, but not post increment and load/store.
    Therefore use *++ptr instead of *ptr++.  */
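The ia64 `.loop##shift##:` to `.loop##shift :` edits (and the matching `C_LABEL` change) fix a preprocessor error rather than a style nit: each `##` must produce a single valid preprocessing token, and an identifier pasted with `:` is not one, so modern GCCs reject the old form. Since the label name and the colon are separate tokens anyway, no paste is needed. A minimal reproduction:

    /* error with modern cpp: pasting "retry" and ":" does not give
       a valid preprocessing token */
    /* #define BAD_LABEL(name)   name##: */

    /* fine: two adjacent tokens, no paste */
    #define GOOD_LABEL(name)  name :

    void f(int *done)
    {
    GOOD_LABEL(retry)
        if (!*done)
            goto retry;
    }
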
diff --git a/libc/string/powerpc/memmove.c b/libc/string/powerpc/memmove.c
index 327161116..7a4a7e5ff 100644
--- a/libc/string/powerpc/memmove.c
+++ b/libc/string/powerpc/memmove.c
@@ -21,9 +21,9 @@
 
 #include <string.h>
 
-libc_hidden_proto(memcpy)
+/* Experimentally off - libc_hidden_proto(memcpy) */
 
-libc_hidden_proto(memmove)
+/* Experimentally off - libc_hidden_proto(memmove) */
 void *memmove(void *to, const void *from, size_t n)
 {
 	unsigned long rem, chunks, tmp1, tmp2;
diff --git a/libc/string/powerpc/memset.c b/libc/string/powerpc/memset.c
index 891e0b8aa..d62ec0ee0 100644
--- a/libc/string/powerpc/memset.c
+++ b/libc/string/powerpc/memset.c
@@ -21,14 +21,14 @@
 
 #include <string.h>
 
-libc_hidden_proto(memset)
+/* Experimentally off - libc_hidden_proto(memset) */
 static inline int expand_byte_word(int c){
 	/* this does:
 	   c = c << 8 | c;
 	   c = c << 16 | c ;
 	*/
-	asm("rlwimi	%0,%0,8,16,23\n"
+	__asm__("rlwimi	%0,%0,8,16,23\n"
 	    "\trlwimi	%0,%0,16,0,15\n"
 	    : "=r" (c) : "0" (c));
 	return c;
diff --git a/libc/string/sparc/_glibc_inc.h b/libc/string/sparc/_glibc_inc.h
index 4eb4d755c..e0aef52c2 100644
--- a/libc/string/sparc/_glibc_inc.h
+++ b/libc/string/sparc/_glibc_inc.h
@@ -6,6 +6,8 @@
 #include <features.h>
 #include <bits/wordsize.h>
 
+/* Is alignment really needed? */
+
 #if __WORDSIZE == 32
 # define ENTRY_ALIGN 4
 #else
diff --git a/libc/string/sparc/sparc32/sparcv9b/memchr.S b/libc/string/sparc/sparc32/sparcv9b/memchr.S
index 7e86a2972..43a16ff11 100644
--- a/libc/string/sparc/sparc32/sparcv9b/memchr.S
+++ b/libc/string/sparc/sparc32/sparcv9b/memchr.S
@@ -1,4 +1,4 @@
 #define ASI_PNF     0x82
 #define ASI_BLK_P   0xf0
 #define XCC icc
-#include "sparc64/memchr.S"
+#include "../../sparc64/memchr.S"
diff --git a/libc/string/sparc/sparc32/sparcv9b/memcpy.S b/libc/string/sparc/sparc32/sparcv9b/memcpy.S
index 7f697542e..2024869dd 100644
--- a/libc/string/sparc/sparc32/sparcv9b/memcpy.S
+++ b/libc/string/sparc/sparc32/sparcv9b/memcpy.S
@@ -1,4 +1,4 @@
 #define ASI_PNF     0x82
 #define ASI_BLK_P   0xf0
 #define XCC icc
-#include "sparc64/sparcv9b/memcpy.S"
+#include "../../sparc64/sparcv9b/memcpy.S"
diff --git a/libc/string/sparc/sparc32/sparcv9b/memset.S b/libc/string/sparc/sparc32/sparcv9b/memset.S
index ac67b7ab7..e49173172 100644
--- a/libc/string/sparc/sparc32/sparcv9b/memset.S
+++ b/libc/string/sparc/sparc32/sparcv9b/memset.S
@@ -1,4 +1,4 @@
 #define ASI_PNF     0x82
 #define ASI_BLK_P   0xf0
 #define XCC icc
-#include <sparc64/memset.S>
+#include "../../sparc64/memset.S"
diff --git a/libc/string/sparc/sparc32/sparcv9b/stpcpy.S b/libc/string/sparc/sparc32/sparcv9b/stpcpy.S
index 440ad7e21..17ffa5e4d 100644
--- a/libc/string/sparc/sparc32/sparcv9b/stpcpy.S
+++ b/libc/string/sparc/sparc32/sparcv9b/stpcpy.S
@@ -1,4 +1,4 @@
 #define ASI_PNF     0x82
 #define ASI_BLK_P   0xf0
 #define XCC icc
-#include <sparc64/stpcpy.S>
+#include "../../sparc64/stpcpy.S"
diff --git a/libc/string/sparc/sparc32/sparcv9b/strcat.S b/libc/string/sparc/sparc32/sparcv9b/strcat.S
index 7a2223570..9ed125a4b 100644
--- a/libc/string/sparc/sparc32/sparcv9b/strcat.S
+++ b/libc/string/sparc/sparc32/sparcv9b/strcat.S
@@ -1,4 +1,4 @@
 #define ASI_PNF     0x82
 #define ASI_BLK_P   0xf0
 #define XCC icc
-#include <sparc64/strcat.S>
+#include "../../sparc64/strcat.S"
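The powerpc memset hunk above only renames `asm` to `__asm__`, but the `rlwimi` pair it wraps is the standard fill-byte broadcast: replicate the low byte of c into all four bytes of a word before doing word-sized stores. The portable C equivalent of `expand_byte_word` is:

    /* 0x000000AB -> 0xABABABAB, what the two rlwimi insns compute */
    static unsigned int expand_byte_word(unsigned int c)
    {
        c &= 0xffu;
        c |= c << 8;    /* 0x0000ABAB */
        c |= c << 16;   /* 0xABABABAB */
        return c;
    }
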
diff --git a/libc/string/sparc/sparc32/sparcv9b/strchr.S b/libc/string/sparc/sparc32/sparcv9b/strchr.S
index ddd32120d..6b2727a1f 100644
--- a/libc/string/sparc/sparc32/sparcv9b/strchr.S
+++ b/libc/string/sparc/sparc32/sparcv9b/strchr.S
@@ -1,4 +1,4 @@
 #define ASI_PNF     0x82
 #define ASI_BLK_P   0xf0
 #define XCC icc
-#include <sparc64/strchr.S>
+#include "../../sparc64/strchr.S"
diff --git a/libc/string/sparc/sparc32/sparcv9b/strcmp.S b/libc/string/sparc/sparc32/sparcv9b/strcmp.S
index 5330f4359..854403ffd 100644
--- a/libc/string/sparc/sparc32/sparcv9b/strcmp.S
+++ b/libc/string/sparc/sparc32/sparcv9b/strcmp.S
@@ -1,4 +1,4 @@
 #define ASI_PNF     0x82
 #define ASI_BLK_P   0xf0
 #define XCC icc
-#include <sparc64/strcmp.S>
+#include "../../sparc64/strcmp.S"
diff --git a/libc/string/sparc/sparc32/sparcv9b/strcpy.S b/libc/string/sparc/sparc32/sparcv9b/strcpy.S
index 0b35c9be0..e8102bde4 100644
--- a/libc/string/sparc/sparc32/sparcv9b/strcpy.S
+++ b/libc/string/sparc/sparc32/sparcv9b/strcpy.S
@@ -1,4 +1,4 @@
 #define ASI_PNF     0x82
 #define ASI_BLK_P   0xf0
 #define XCC icc
-#include <sparc64/strcpy.S>
+#include "../../sparc64/strcpy.S"
diff --git a/libc/string/sparc/sparc32/sparcv9b/strlen.S b/libc/string/sparc/sparc32/sparcv9b/strlen.S
index b8f4dba4f..8673333a2 100644
--- a/libc/string/sparc/sparc32/sparcv9b/strlen.S
+++ b/libc/string/sparc/sparc32/sparcv9b/strlen.S
@@ -1,4 +1,4 @@
 #define ASI_PNF     0x82
 #define ASI_BLK_P   0xf0
 #define XCC icc
-#include <sparc64/strlen.S>
+#include "../../sparc64/strlen.S"
diff --git a/libc/string/x86_64/_glibc_inc.h b/libc/string/x86_64/_glibc_inc.h
index 88cef2ea3..415ce90a7 100644
--- a/libc/string/x86_64/_glibc_inc.h
+++ b/libc/string/x86_64/_glibc_inc.h
@@ -6,15 +6,8 @@
 #include <features.h>
 #include <bits/wordsize.h>
 
-#if __WORDSIZE == 32
-# define ENTRY_ALIGN 4
-#else
-# define ENTRY_ALIGN 2
-#endif
-
 #define ENTRY(sym) \
 	.global sym; \
-	.align  ENTRY_ALIGN; \
 	.type   sym,%function; \
 	sym:
diff --git a/libc/string/x86_64/memcpy.S b/libc/string/x86_64/memcpy.S
index b3bb0f96c..697b992d0 100644
--- a/libc/string/x86_64/memcpy.S
+++ b/libc/string/x86_64/memcpy.S
@@ -59,9 +59,9 @@ ENTRY (BP_SYM (memcpy))
 	subq	$32, %rcx
 	js	2f
 
-	.p2align 4
+	/* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+	.p2align 4,,11
 3:
-
 	/* Now correct the loop counter.  Please note that in the following
 	   code the flags are not changed anymore.  */
 	subq	$32, %rcx
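The recurring `.p2align 4` to `.p2align 4,,11` edits use GNU as's third operand: align to 2^4 = 16 bytes, but only if that costs at most 11 padding bytes. The point, as the new comments say, is that the three instructions that follow (11 bytes here) then never straddle a 16-byte fetch/decode window, while unconditional alignment could insert up to 15 bytes of padding for no benefit. A small C model of the assembler's decision, assuming the documented `.p2align` semantics:

    #include <stdio.h>

    /* Padding emitted by ".p2align p,,max" at a given section offset:
       round up to 1<<p, unless that needs more than max bytes. */
    static unsigned padding(unsigned offset, unsigned p, unsigned max)
    {
        unsigned align = 1u << p;
        unsigned pad = (align - (offset & (align - 1))) & (align - 1);
        return pad <= max ? pad : 0;
    }

    int main(void)
    {
        printf("%u\n", padding(0x35, 4, 11)); /* 11 -> pads to 0x40 */
        printf("%u\n", padding(0x34, 4, 11)); /* would need 12 > 11 -> 0 */
        return 0;
    }
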
diff --git a/libc/string/x86_64/memset.S b/libc/string/x86_64/memset.S
index d72d74468..46751006b 100644
--- a/libc/string/x86_64/memset.S
+++ b/libc/string/x86_64/memset.S
@@ -53,15 +53,17 @@ ENTRY (memset)
 	imul	%rax,%r8
 #endif
 	test	$0x7,%edi	/* Check for alignment.  */
-	je	2f
+	jz	2f
 
-	.p2align 4
-1:	/* Align ptr to 8 byte.  */
+	/* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+	.p2align 4,,9
+1:
+	/* Align ptr to 8 byte.  */
 	mov	%sil,(%rcx)
 	dec	%rdx
 	inc	%rcx
-	test	$0x7,%ecx
-	jne	1b
+	test	$0x7,%cl
+	jnz	1b
 2:	/* Check for really large regions.  */
 	mov	%rdx,%rax
@@ -70,8 +72,10 @@ ENTRY (memset)
 	cmp	LARGE, %rdx
 	jae	11f
 
-	.p2align 4
-3:	/* Copy 64 bytes.  */
+	/* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+	.p2align 4,,11
+3:
+	/* Fill 64 bytes.  */
 	mov	%r8,(%rcx)
 	mov	%r8,0x8(%rcx)
 	mov	%r8,0x10(%rcx)
@@ -84,7 +88,7 @@ ENTRY (memset)
 	dec	%rax
 	jne	3b
 
-4:	/* Copy final bytes.  */
+4:	/* Fill final bytes.  */
 	and	$0x3f,%edx
 	mov	%rdx,%rax
 	shr	$0x3,%rax
@@ -107,16 +111,18 @@ ENTRY (memset)
 	jne	8b
 9:
 #if BZERO_P
-	nop
+	/* nothing */
 #else
 	/* Load result (only if used as memset).  */
 	mov	%rdi,%rax	/* start address of destination is result */
 #endif
 	retq
 
-	.p2align 4
-11:	/* Copy 64 bytes without polluting the cache.  */
-	/* We could use	movntdq    %xmm0,(%rcx) here to further
+	/* Next 3 insns are 14 bytes total, make sure we decode them in one go */
+	.p2align 4,,14
+11:
+	/* Fill 64 bytes without polluting the cache.  */
+	/* We could use	movntdq %xmm0,(%rcx) here to further
 	   speed up for large cases but let's not use XMM registers.  */
 	movnti	%r8,(%rcx)
 	movnti  %r8,0x8(%rcx)
diff --git a/libc/string/x86_64/strcat.S b/libc/string/x86_64/strcat.S
index 9b0068981..23d068fea 100644
--- a/libc/string/x86_64/strcat.S
+++ b/libc/string/x86_64/strcat.S
@@ -21,6 +21,7 @@
 
 #include "_glibc_inc.h"
 
+/* Seems to be unrolled too much */
 	.text
 ENTRY (BP_SYM (strcat))
@@ -44,7 +45,9 @@ ENTRY (BP_SYM (strcat))
 
 	/* Now the source is aligned.  Scan for NUL byte.  */
-	.p2align 4
+
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 4:
 	/* First unroll.  */
 	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
@@ -102,8 +105,11 @@ ENTRY (BP_SYM (strcat))
 				   the addition will not result in 0.  */
 	jz 4b			/* no NUL found => continue loop */
 
-	.p2align 4		/* Align, it's a jump target.  */
-3:	subq $8,%rax		/* correct pointer increment.  */
+	/* Align, it is a jump target.  */
+	/* Next 3 insns are 8 bytes total, make sure we decode them in one go */
+	.p2align 3,,8
+3:
+	subq $8,%rax		/* correct pointer increment.  */
 	testb %cl, %cl		/* is first byte NUL? */
 	jz 2f			/* yes => return */
@@ -159,7 +165,9 @@ ENTRY (BP_SYM (strcat))
 	/* Now the sources is aligned.  Unfortunatly we cannot force
 	   to have both source and destination aligned, so ignore the
 	   alignment of the destination.  */
-	.p2align 4
+
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 22:
 	/* 1st unroll.  */
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
@@ -236,7 +244,9 @@ ENTRY (BP_SYM (strcat))
 
 	/* Do the last few bytes. %rax contains the value to write.
 	   The loop is unrolled twice.  */
-	.p2align 4
+
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
 23:
 	movb	%al, (%rdx)	/* 1st byte.  */
 	testb	%al, %al	/* Is it NUL.  */
diff --git a/libc/string/x86_64/strchr.S b/libc/string/x86_64/strchr.S
index 8e59c4c19..9ef46b7f2 100644
--- a/libc/string/x86_64/strchr.S
+++ b/libc/string/x86_64/strchr.S
@@ -20,6 +20,7 @@
 
 #include "_glibc_inc.h"
 
+/* Seems to be unrolled too much */
 	.text
 ENTRY (BP_SYM (strchr))
@@ -91,7 +92,8 @@ ENTRY (BP_SYM (strchr))
 	 each of whose bytes is C.  This turns each byte that is C
 	 into a zero.  */
 
-	.p2align 4
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 4:
 	/* Main Loop is unrolled 4 times.  */
 	/* First unroll.  */
@@ -229,8 +231,11 @@ ENTRY (BP_SYM (strchr))
 	   reversed.  */
 
-	.p2align 4		/* Align, it's a jump target.  */
-3:	movq	%r9,%rdx	/* move to %rdx so that we can access bytes */
+	/* Align, it's a jump target.  */
+	/* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+	.p2align 4,,9
+3:
+	movq	%r9,%rdx	/* move to %rdx so that we can access bytes */
 	subq	$8,%rax		/* correct pointer increment.  */
 	testb %cl, %cl		/* is first byte C? */
 	jz 6f			/* yes => return pointer */
@@ -280,7 +285,7 @@ ENTRY (BP_SYM (strchr))
 	incq %rax
 6:
-	nop
+	/* nop - huh?? */
 	retq
 
 END (BP_SYM (strchr))
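`movnti` in the large-region path of memset stores straight to memory, bypassing the cache so a huge fill does not evict the caller's working set. The compiler-intrinsic spelling of the same idea, assuming x86-64 with SSE2 and the standard `_mm_stream_si64`/`_mm_sfence` intrinsics:

    #include <emmintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Fill len bytes (8-byte aligned, multiple of 8) with
       non-temporal stores, as the memset fast path does with movnti. */
    static void fill_nt(void *dst, uint64_t pattern, size_t len)
    {
        long long *p = dst;
        for (size_t i = 0; i < len / 8; i++)
            _mm_stream_si64(p + i, (long long)pattern);
        _mm_sfence();   /* make the streaming stores globally visible */
    }
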
diff --git a/libc/string/x86_64/strcpy.S b/libc/string/x86_64/strcpy.S
index d9a51b0bb..612a30d1a 100644
--- a/libc/string/x86_64/strcpy.S
+++ b/libc/string/x86_64/strcpy.S
@@ -20,6 +20,8 @@
 
 #include "_glibc_inc.h"
 
+/* Seems to be unrolled too much */
+
 #ifndef USE_AS_STPCPY
 # define STRCPY strcpy
 #endif
@@ -51,7 +53,9 @@ ENTRY (BP_SYM (STRCPY))
 	/* Now the sources is aligned.  Unfortunatly we cannot force
 	   to have both source and destination aligned, so ignore the
 	   alignment of the destination.  */
-	.p2align 4
+
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 1:
 	/* 1st unroll.  */
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
@@ -128,7 +132,9 @@ ENTRY (BP_SYM (STRCPY))
 
 	/* Do the last few bytes. %rax contains the value to write.
 	   The loop is unrolled twice.  */
-	.p2align 4
+
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
 3:
 	/* Note that stpcpy needs to return with the value of the NUL
 	   byte.  */
diff --git a/libc/string/x86_64/strcspn.S b/libc/string/x86_64/strcspn.S
index fed12b5f6..fd9b09c48 100644
--- a/libc/string/x86_64/strcspn.S
+++ b/libc/string/x86_64/strcspn.S
@@ -25,6 +25,8 @@
 
 #include "_glibc_inc.h"
 
+/* Seems to be unrolled too much */
+
 /* BEWARE: `#ifdef strcspn' means that strcspn is redefined as `strpbrk' */
 #define STRPBRK_P (defined strcspn)
@@ -53,26 +55,28 @@ ENTRY (strcspn)
    Although all the following instruction only modify %cl we always
    have a correct zero-extended 64-bit value in %rcx.  */
 
-	.p2align 4
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
+
 L(2):	movb (%rax), %cl	/* get byte from skipset */
 	testb %cl, %cl		/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
 
 	movb 1(%rax), %cl	/* get byte from skipset */
-	testb $0xff, %cl	/* is NUL char? */
+	testb %cl, %cl		/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
 
 	movb 2(%rax), %cl	/* get byte from skipset */
-	testb $0xff, %cl	/* is NUL char? */
+	testb %cl, %cl		/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
 
 	movb 3(%rax), %cl	/* get byte from skipset */
 	addq $4, %rax		/* increment skipset pointer */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
-	testb $0xff, %cl	/* is NUL char? */
+	testb %cl, %cl		/* is NUL char? */
 	jnz L(2)		/* no => process next dword from skipset */
 
 L(1):	leaq -4(%rdx), %rax	/* prepare loop */
@@ -86,7 +90,13 @@ L(1):	leaq -4(%rdx), %rax	/* prepare loop */
 	   value in the table.  But the value of NUL is NUL so the loop
 	   terminates for NUL in every case.  */
 
-	.p2align 4
+	/* Next 3 insns are 9 bytes total. */
+	/* .p2align 4,,9 would make sure we decode them in one go, */
+	/* but it will also align entire function to 16 bytes, */
+	/* potentially creating largish padding at link time. */
+	/* We are aligning to 8 bytes instead: */
+	.p2align 3,,8
+
 L(3):	addq $4, %rax		/* adjust pointer for full loop round */
 
 	movb (%rax), %cl	/* get byte from string */
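strcspn (and strspn below) work in two phases: first splat the skipset into a 256-byte table indexed by character value (the `movb %cl,(%rsp,%rcx)` stores), then scan the string with one table lookup per byte. The C shape of the algorithm, a sketch of the idea rather than the unrolled asm:

    #include <stddef.h>

    /* Table-driven strcspn: length of the initial segment of s
       containing no byte from reject. */
    static size_t my_strcspn(const char *s, const char *reject)
    {
        unsigned char table[256] = {0};
        size_t i;

        for (; *reject; reject++)
            table[(unsigned char)*reject] = 1;
        table[0] = 1;   /* NUL always terminates the scan */

        for (i = 0; table[(unsigned char)s[i]] == 0; i++)
            ;
        return i;
    }
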
diff --git a/libc/string/x86_64/strlen.S b/libc/string/x86_64/strlen.S
index 0441dc46c..4213f0ab6 100644
--- a/libc/string/x86_64/strlen.S
+++ b/libc/string/x86_64/strlen.S
@@ -20,6 +20,7 @@
 
 #include "_glibc_inc.h"
 
+/* Seems to be unrolled too much */
 	.text
 ENTRY (strlen)
@@ -39,8 +40,11 @@ ENTRY (strlen)
 
 1:	movq $0xfefefefefefefeff,%r8 /* Save magic.  */
 
-	.p2align 4		/* Align loop.  */
-4:	/* Main Loop is unrolled 4 times.  */
+	/* Align loop.  */
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
+4:
+	/* Main Loop is unrolled 4 times.  */
 	/* First unroll.  */
 	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
 	addq $8,%rax		/* adjust pointer for next word */
@@ -97,8 +101,11 @@ ENTRY (strlen)
 				   the addition will not result in 0.  */
 	jz 4b			/* no NUL found => continue loop */
 
-	.p2align 4		/* Align, it's a jump target.  */
-3:	subq $8,%rax		/* correct pointer increment.  */
+	/* Align, it is a jump target.  */
+	/* Next 3 insns are 8 bytes total, make sure we decode them in one go */
+	.p2align 3,,8
+3:
+	subq $8,%rax		/* correct pointer increment.  */
 	testb %cl, %cl		/* is first byte NUL? */
 	jz 2f			/* yes => return */
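The `0xfefefefefefefeff` magic in strlen is `-0x0101010101010101` in two's complement: adding it subtracts 1 from every byte (with borrows), so only a byte that was 0x00 can end up with its high bit newly set. Masking with `~v` filters the borrow cases, giving an exact zero-byte test for eight bytes at a time. The same test written out in C:

    #include <stdint.h>

    /* Nonzero iff some byte of v is 0x00; this is the word-at-a-time
       trick behind the strlen loop above. */
    static int has_zero_byte(uint64_t v)
    {
        return ((v + 0xfefefefefefefeffULL) & ~v
                & 0x8080808080808080ULL) != 0;
    }
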
diff --git a/libc/string/x86_64/strspn.S b/libc/string/x86_64/strspn.S
index c126abd2e..41cff0490 100644
--- a/libc/string/x86_64/strspn.S
+++ b/libc/string/x86_64/strspn.S
@@ -50,26 +50,28 @@ ENTRY (strspn)
    Although all the following instruction only modify %cl we always
    have a correct zero-extended 64-bit value in %rcx.  */
 
-	.p2align 4
-L(2):	movb (%rax), %cl	/* get byte from stopset */
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
+L(2):
+	movb (%rax), %cl	/* get byte from stopset */
 	testb %cl, %cl		/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
 
 	movb 1(%rax), %cl	/* get byte from stopset */
-	testb $0xff, %cl	/* is NUL char? */
+	testb %cl, %cl		/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
 
 	movb 2(%rax), %cl	/* get byte from stopset */
-	testb $0xff, %cl	/* is NUL char? */
+	testb %cl, %cl		/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
 
 	movb 3(%rax), %cl	/* get byte from stopset */
 	addq $4, %rax		/* increment stopset pointer */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
-	testb $0xff, %cl	/* is NUL char? */
+	testb %cl, %cl		/* is NUL char? */
 	jnz L(2)		/* no => process next dword from stopset */
 
 L(1):	leaq -4(%rdx), %rax	/* prepare loop */
@@ -83,8 +85,14 @@ L(1):	leaq -4(%rdx), %rax	/* prepare loop */
 	   value in the table.  But the value of NUL is NUL so the loop
 	   terminates for NUL in every case.  */
 
-	.p2align 4
-L(3):	addq $4, %rax		/* adjust pointer for full loop round */
+	/* Next 3 insns are 9 bytes total. */
+	/* .p2align 4,,9 would make sure we decode them in one go, */
+	/* but it will also align entire function to 16 bytes, */
+	/* potentially creating largish padding at link time. */
+	/* We are aligning to 8 bytes instead: */
+	.p2align 3,,8
+L(3):
+	addq $4, %rax		/* adjust pointer for full loop round */
 
 	movb (%rax), %cl	/* get byte from string */
 	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
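The `testb $0xff, %cl` to `testb %cl, %cl` substitutions throughout strcspn/strspn are pure size savings: TEST ands its operands and sets ZF when the result is zero, and anding a byte with itself or with 0xff gives the same result, but the register form has no immediate and is one byte shorter (84 c9 versus f6 c1 ff). A sketch using GCC 6+ flag-output constraints to observe the same ZF the branches consume:

    /* ZF is set by "testb %cl,%cl" exactly when the byte is zero. */
    static int byte_is_zero(unsigned char c)
    {
        int zf;
        __asm__ ("testb %1, %1" : "=@ccz" (zf) : "q" (c));
        return zf;
    }
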
