70 files changed, 1080 insertions, 658 deletions
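Most of the ARM changes in this commit share one pattern: under Thumb-2, a conditionally executed instruction must be preceded by an it/itt/ittt instruction, whereas classic ARM encodes the condition directly in each instruction. The IT() macro pulled in from the new <bits/arm_asm.h> abstracts that difference, and THUMB1_ONLY presumably marks cores that can use neither the ARM nor the Thumb-2 encodings, hence the plain Thumb-1 fallbacks added below. A rough sketch of what such an IT() helper looks like (an illustration of the idea, not the verbatim header):

    /* Hypothetical IT() helper: emit an IT instruction only when
     * assembling for Thumb-2; ARM mode predicates each instruction
     * itself, so the macro expands to nothing there.  With this,
     * IT(tt, eq) yields "itt eq" under Thumb-2 and nothing otherwise. */
    #if defined(__thumb2__)
    # define IT(t, cond) i##t cond
    #else
    # define IT(t, cond)
    #endif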
diff --git a/libc/string/arm/_memcpy.S b/libc/string/arm/_memcpy.S index 3704f96b5..103580a0c 100644 --- a/libc/string/arm/_memcpy.S +++ b/libc/string/arm/_memcpy.S @@ -39,7 +39,9 @@ #include <features.h> #include <endian.h> +#include <bits/arm_asm.h> +#if !defined(THUMB1_ONLY) /* * This is one fun bit of code ... * Some easy listening music is suggested while trying to understand this @@ -77,12 +79,36 @@ .type _memcpy,%function .align 4 +/* XXX: The Thumb-2 conditionals can be removed if/when we require an + assembler that supports unified syntax. */ +.macro copy regs +#if defined(__thumb2__) + ittt ge + ldmiage r1!, \regs + stmiage r0!, \regs +#else + ldmgeia r1!, \regs + stmgeia r0!, \regs +#endif +.endm + +.macro copydb regs +#if defined(__thumb2__) + ittt ge + ldmdbge r1!, \regs + stmdbge r0!, \regs +#else + ldmgedb r1!, \regs + stmgedb r0!, \regs +#endif +.endm + _memcpy: /* Determine copy direction */ cmp r1, r0 bcc .Lmemcpy_backwards - moveq r0, #0 /* Quick abort for len=0 */ + IT(tt, eq) /* Quick abort for src=dst */ #if defined(__USE_BX__) bxeq lr #else @@ -102,7 +128,7 @@ _memcpy: blt .Lmemcpy_fl12 /* less than 12 bytes (4 from above) */ subs r2, r2, #0x14 blt .Lmemcpy_fl32 /* less than 32 bytes (12 from above) */ - stmdb sp!, {r4} /* borrow r4 */ + str r4, [sp, #-4]! /* borrow r4 */ /* blat 32 bytes at a time */ /* XXX for really big copies perhaps we should use more registers */ @@ -115,19 +141,22 @@ _memcpy: bge .Lmemcpy_floop32 cmn r2, #0x10 - ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */ - stmgeia r0!, {r3, r4, r12, lr} + /* blat a remaining 16 bytes */ + copy "{r3, r4, r12, lr}" subge r2, r2, #0x10 - ldmia sp!, {r4} /* return r4 */ + ldr r4, [sp], #4 /* restore r4 */ .Lmemcpy_fl32: adds r2, r2, #0x14 /* blat 12 bytes at a time */ .Lmemcpy_floop12: - ldmgeia r1!, {r3, r12, lr} - stmgeia r0!, {r3, r12, lr} + copy "{r3, r12, lr}" +#if defined(__thumb2__) + subsge r2, r2, #0x0c +#else subges r2, r2, #0x0c +#endif bge .Lmemcpy_floop12 .Lmemcpy_fl12: @@ -135,26 +164,48 @@ _memcpy: blt .Lmemcpy_fl4 subs r2, r2, #4 + IT(tt, lt) ldrlt r3, [r1], #4 strlt r3, [r0], #4 - ldmgeia r1!, {r3, r12} - stmgeia r0!, {r3, r12} + copy "{r3, r12}" subge r2, r2, #4 .Lmemcpy_fl4: /* less than 4 bytes to go */ adds r2, r2, #4 +#if defined(__thumb2__) + it eq + popeq {r0, pc} /* done */ +#elif defined(__ARM_ARCH_4T__) + ldmeqia sp!, {r0, r3} /* done */ + bxeq r3 +#else ldmeqia sp!, {r0, pc} /* done */ +#endif /* copy the crud byte at a time */ cmp r2, #2 ldrb r3, [r1], #1 strb r3, [r0], #1 +#if defined(__thumb2__) + itt ge + ldrbge r3, [r1], #1 + strbge r3, [r0], #1 + itt gt + ldrbgt r3, [r1], #1 + strbgt r3, [r0], #1 +#else ldrgeb r3, [r1], #1 strgeb r3, [r0], #1 ldrgtb r3, [r1], #1 strgtb r3, [r0], #1 +#endif +#if defined(__ARM_ARCH_4T__) + ldmia sp!, {r0, r3} + bx r3 +#else ldmia sp!, {r0, pc} +#endif /* erg - unaligned destination */ .Lmemcpy_fdestul: @@ -164,10 +215,19 @@ _memcpy: /* align destination with byte copies */ ldrb r3, [r1], #1 strb r3, [r0], #1 +#if defined(__thumb2__) + itt ge + ldrbge r3, [r1], #1 + strbge r3, [r0], #1 + itt gt + ldrbgt r3, [r1], #1 + strbgt r3, [r0], #1 +#else ldrgeb r3, [r1], #1 strgeb r3, [r0], #1 ldrgtb r3, [r1], #1 strgtb r3, [r0], #1 +#endif subs r2, r2, r12 blt .Lmemcpy_fl4 /* less the 4 bytes */ @@ -370,12 +430,12 @@ _memcpy: .Lmemcpy_bl32: cmn r2, #0x10 - ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */ - stmgedb r0!, {r3, r4, r12, lr} + /* blat a remaining 16 bytes */ + copydb "{r3, r4, r12, lr}" subge r2, r2, #0x10 adds 
r2, r2, #0x14 - ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */ - stmgedb r0!, {r3, r12, lr} + /* blat a remaining 12 bytes */ + copydb "{r3, r12, lr}" subge r2, r2, #0x0c ldmia sp!, {r4, lr} @@ -383,15 +443,16 @@ _memcpy: adds r2, r2, #8 blt .Lmemcpy_bl4 subs r2, r2, #4 + IT(tt, lt) ldrlt r3, [r1, #-4]! strlt r3, [r0, #-4]! - ldmgedb r1!, {r3, r12} - stmgedb r0!, {r3, r12} + copydb "{r3, r12}" subge r2, r2, #4 .Lmemcpy_bl4: /* less than 4 bytes to go */ adds r2, r2, #4 + IT(t, eq) #if defined(__USE_BX__) bxeq lr #else @@ -401,10 +462,19 @@ _memcpy: cmp r2, #2 ldrb r3, [r1, #-1]! strb r3, [r0, #-1]! +#ifdef __thumb2__ + itt ge + ldrbge r3, [r1, #-1]! + strbge r3, [r0, #-1]! + itt gt + ldrbgt r3, [r1, #-1]! + strbgt r3, [r0, #-1]! +#else ldrgeb r3, [r1, #-1]! strgeb r3, [r0, #-1]! ldrgtb r3, [r1, #-1]! strgtb r3, [r0, #-1]! +#endif #if defined(__USE_BX__) bx lr #else @@ -417,10 +487,19 @@ _memcpy: /* align destination with byte copies */ ldrb r3, [r1, #-1]! strb r3, [r0, #-1]! +#ifdef __thumb2__ + itt ge + ldrbge r3, [r1, #-1]! + strbge r3, [r0, #-1]! + itt gt + ldrbgt r3, [r1, #-1]! + strbgt r3, [r0, #-1]! +#else ldrgeb r3, [r1, #-1]! strgeb r3, [r0, #-1]! ldrgtb r3, [r1, #-1]! strgtb r3, [r0, #-1]! +#endif subs r2, r2, r12 blt .Lmemcpy_bl4 /* less than 4 bytes to go */ ands r12, r1, #3 @@ -591,3 +670,77 @@ _memcpy: .Lmemcpy_bsrcul1l4: add r1, r1, #1 b .Lmemcpy_bl4 + +#else /* THUMB1_ONLY */ + +/* This is a fairly dumb implementation for when we can't use the 32-bit code + above. */ +.text +.global _memcpy +.hidden _memcpy +.type _memcpy,%function +.align 4 +.thumb +_memcpy: + push {r0, r4} + cmp r2, #0 + beq .Lmemcpy_exit + @ See if we have overlapping regions, and need to reverse the + @ direction of the copy + cmp r0, r1 + bls .Lmemcpy_forwards + add r4, r1, r2 + cmp r0, r4 + bcc .Lmemcpy_backwards +.Lmemcpy_forwards: + /* Forwards. */ + mov r3, r0 + eor r3, r1 + mov r4, #3 + tst r3, r4 + bne .Lmemcpy_funaligned + cmp r2, #8 + bcc .Lmemcpy_funaligned +1: @ copy up to the first word boundary. + tst r0, r4 + beq 1f + ldrb r3, [r1] + add r1, r1, #1 + strb r3, [r0] + add r0, r0, #1 + sub r2, r2, #1 + b 1b +1: @ Copy aligned words + ldr r3, [r1] + add r1, r1, #4 + str r3, [r0] + add r0, r0, #4 + sub r2, r2, #4 + cmp r2, #4 + bcs 1b + cmp r2, #0 + beq .Lmemcpy_exit +.Lmemcpy_funaligned: +1: + ldrb r3, [r1] + add r1, r1, #1 + strb r3, [r0] + add r0, r0, #1 + sub r2, r2, #1 + bne 1b +.Lmemcpy_exit: + pop {r0, r4} + bx lr + +.Lmemcpy_backwards: + add r0, r0, r2 + add r1, r1, r2 +1: + sub r0, r0, #1 + sub r1, r1, #1 + ldrb r3, [r1] + strb r3, [r0] + sub r2, r2, #1 + bne 1b + b .Lmemcpy_exit +#endif diff --git a/libc/string/arm/bcopy.S b/libc/string/arm/bcopy.S index db3c9e6c1..2d6e90d13 100644 --- a/libc/string/arm/bcopy.S +++ b/libc/string/arm/bcopy.S @@ -40,6 +40,7 @@ /* bcopy = memcpy/memmove with arguments reversed. 
*/ #include <features.h> +#include <bits/arm_asm.h> #ifdef __UCLIBC_SUSV3_LEGACY__ @@ -48,12 +49,23 @@ .type bcopy,%function .align 4 +#if defined(__thumb__) && !defined(__thumb2__) +.thumb_func +bcopy: + push {r2, lr} + mov ip, r0 + mov r0, r1 + mov r1, ip + bl _memcpy + POP_RET +#else bcopy: /* switch the source and destination registers */ eor r0, r1, r0 eor r1, r0, r1 eor r0, r1, r0 b _memcpy /* (PLT) */ +#endif .size bcopy,.-bcopy diff --git a/libc/string/arm/bzero.S b/libc/string/arm/bzero.S index ee49cf560..e576a12e9 100644 --- a/libc/string/arm/bzero.S +++ b/libc/string/arm/bzero.S @@ -38,6 +38,7 @@ */ #include <features.h> +#include <bits/arm_asm.h> #ifdef __UCLIBC_SUSV3_LEGACY__ @@ -46,10 +47,21 @@ .type bzero,%function .align 4 +#if defined(__thumb__) && !defined(__thumb2__) +.thumb_func +bzero: + push {r2, lr} + mov r2, r1 + mov r1, #0 + bl HIDDEN_JUMPTARGET(memset) + POP_RET +#else + bzero: mov r2, r1 mov r1, #0 b HIDDEN_JUMPTARGET(memset) +#endif .size bzero,.-bzero diff --git a/libc/string/arm/memcmp.S b/libc/string/arm/memcmp.S index 4f78b5128..65409f43a 100644 --- a/libc/string/arm/memcmp.S +++ b/libc/string/arm/memcmp.S @@ -30,15 +30,41 @@ */ #include <features.h> +#include <bits/arm_asm.h> .text .global memcmp .type memcmp,%function .align 4 +#if defined(THUMB1_ONLY) +.thumb_func +memcmp: + cmp r2, #0 + bne 1f + mov r0, #0 + bx lr +1: + push {r4} + add r4, r0, r2 +2: + ldrb r2, [r0] + add r0, r0, #1 + ldrb r3, [r1] + add r1, r1, #1 + cmp r4, r0 + beq 3f + cmp r2, r3 + beq 2b +3: + sub r0, r2, r3 + pop {r4} + bx lr +#else memcmp: /* if ((len - 1) < 0) return 0 */ subs r2, r2, #1 + IT(tt, mi) movmi r0, #0 #if defined(__USE_BX__) bxmi lr @@ -51,6 +77,7 @@ memcmp: ldrb r2, [r0], #1 ldrb r3, [r1], #1 cmp ip, r0 + IT(t, cs) cmpcs r2, r3 beq 1b sub r0, r2, r3 @@ -59,6 +86,7 @@ memcmp: #else mov pc, lr #endif +#endif .size memcmp,.-memcmp diff --git a/libc/string/arm/memcpy.S b/libc/string/arm/memcpy.S index 7a5b6ab76..d2013d211 100644 --- a/libc/string/arm/memcpy.S +++ b/libc/string/arm/memcpy.S @@ -38,16 +38,23 @@ */ #include <features.h> +#include <bits/arm_asm.h> .text .global memcpy .type memcpy,%function .align 4 +#if defined(__thumb__) && !defined(__thumb2__) +.thumb_func memcpy: - stmfd sp!, {r0, lr} + push {r0, lr} bl _memcpy - ldmfd sp!, {r0, pc} + POP_RET +#else +memcpy: + b _memcpy +#endif .size memcpy,.-memcpy diff --git a/libc/string/arm/memmove.S b/libc/string/arm/memmove.S index 45cd9b4d4..c11b98dd4 100644 --- a/libc/string/arm/memmove.S +++ b/libc/string/arm/memmove.S @@ -38,16 +38,23 @@ */ #include <features.h> +#include <bits/arm_asm.h> .text .global memmove .type memmove,%function .align 4 +#if defined(__thumb__) && !defined(__thumb2__) +.thumb_func memmove: - stmfd sp!, {r0, lr} + push {r2, lr} bl _memcpy - ldmfd sp!, {r0, pc} + POP_RET +#else +memmove: + b _memcpy +#endif .size memmove,.-memmove diff --git a/libc/string/arm/memset.S b/libc/string/arm/memset.S index 16bfe0dc5..66aa6039c 100644 --- a/libc/string/arm/memset.S +++ b/libc/string/arm/memset.S @@ -19,12 +19,52 @@ #include <features.h> #include <sys/syscall.h> +#include <bits/arm_asm.h> .text .global memset .type memset,%function .align 4 +#if defined(THUMB1_ONLY) +.thumb_func +memset: + mov ip, r0 + cmp r2, #8 @ at least 8 bytes to do? 
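The Thumb-1 memset opened here follows the classic libc pattern: for eight or more bytes, replicate the fill byte across a 32-bit word with the lsl/orr pairs, byte-fill up to a word boundary, store whole words, then finish the tail bytewise. The same strategy in C (a sketch of the algorithm, not the compiled source):

    #include <stddef.h>
    #include <stdint.h>

    /* C rendition of the Thumb-1 memset strategy (sketch). */
    void *memset_sketch(void *s, int c, size_t n)
    {
        unsigned char *p = s;
        if (n >= 8) {
            uint32_t fill = (unsigned char)c;
            fill |= fill << 8;              /* the lsl #8 / orr pair  */
            fill |= fill << 16;             /* the lsl #16 / orr pair */
            while ((uintptr_t)p & 3) {      /* fill up to a word boundary */
                *p++ = (unsigned char)c;
                n--;
            }
            for (; n >= 4; n -= 4, p += 4)  /* fill aligned words */
                *(uint32_t *)p = fill;
        }
        while (n--)                         /* fill the remaining bytes */
            *p++ = (unsigned char)c;
        return s;
    }

The assembly additionally stashes the original pointer in ip so it can be returned unchanged, which the C version gets for free by returning s.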
+ bcc 2f + + lsl r3, r1, #8 + orr r1, r3 + lsl r3, r1, #16 + orr r1, r3 + + mov r3, #3 +1: @ Fill up to the first word boundary + tst r0, r3 + beq 1f + strb r1, [r0] + add r0, r0, #1 + sub r2, r2, #1 + b 1b +1: @ Fill aligned words + str r1, [r0] + add r0, r0, #4 + sub r2, r2, #4 + cmp r2, #4 + bcs 1b + +2: @ Fill the remaining bytes + cmp r2, #0 + beq 2f +1: + strb r1, [r0] + add r0, r0, #1 + sub r2, r2, #1 + bne 1b +2: + mov r0, ip + bx lr +#else memset: mov a4, a1 cmp a3, $8 @ at least 8 bytes to do? @@ -33,8 +73,14 @@ memset: orr a2, a2, a2, lsl $16 1: tst a4, $3 @ aligned yet? +#if defined(__thumb2__) + itt ne + strbne a2, [a4], $1 + subne a3, a3, $1 +#else strneb a2, [a4], $1 subne a3, a3, $1 +#endif bne 1b mov ip, a2 1: @@ -51,16 +97,30 @@ memset: stmia a4!, {a2, ip} sub a3, a3, $8 cmp a3, $8 @ 8 bytes still to do? +#if defined(__thumb2__) + itt ge + stmiage a4!, {a2, ip} + subge a3, a3, $8 +#else stmgeia a4!, {a2, ip} subge a3, a3, $8 +#endif bge 1b 2: movs a3, a3 @ anything left? + IT(t, eq) #if defined(__USE_BX__) bxeq lr #else moveq pc, lr @ nope #endif +#if defined (__thumb2__) +1: + strb a2, [a4], #1 + subs a3, a3, #1 + bne 1b + bx lr +#else rsb a3, a3, $7 add pc, pc, a3, lsl $2 mov r0, r0 @@ -76,6 +136,8 @@ memset: #else mov pc, lr #endif +#endif +#endif .size memset,.-memset diff --git a/libc/string/arm/strcmp.S b/libc/string/arm/strcmp.S index 89aa38874..97363c1c2 100644 --- a/libc/string/arm/strcmp.S +++ b/libc/string/arm/strcmp.S @@ -30,17 +30,35 @@ */ #include <features.h> +#include <bits/arm_asm.h> .text .global strcmp .type strcmp,%function .align 4 +#if defined(__thumb__) && !defined(__thumb2__) +.thumb_func +strcmp: +1: + ldrb r2, [r0] + add r0, r0, #1 + ldrb r3, [r1] + add r1, r1, #1 + cmp r2, #0 + beq 2f + cmp r2, r3 + beq 1b +2: + sub r0, r2, r3 + bx lr +#else strcmp: 1: ldrb r2, [r0], #1 ldrb r3, [r1], #1 cmp r2, #1 + IT(t, cs) cmpcs r2, r3 beq 1b sub r0, r2, r3 @@ -49,6 +67,7 @@ strcmp: #else mov pc, lr #endif +#endif .size strcmp,.-strcmp diff --git a/libc/string/arm/strlen.S b/libc/string/arm/strlen.S index 5b4b02e17..949e918f4 100644 --- a/libc/string/arm/strlen.S +++ b/libc/string/arm/strlen.S @@ -20,6 +20,7 @@ #include <features.h> #include <endian.h> #include <sys/syscall.h> +#include <bits/arm_asm.h> /* size_t strlen(const char *S) * entry: r0 -> string @@ -31,6 +32,19 @@ .type strlen,%function .align 4 +#if defined(THUMB1_ONLY) +/* A simple implementation for when the ARM implementation can't be used. */ +.thumb_func +strlen: + mov r2, #0 +1: + ldrb r1, [r0, r2] + add r2, r2, #1 + cmp r1, #0 + bne 1b + sub r0, r2, #1 + bx lr +#else strlen: bic r1, r0, $3 @ addr of word containing first byte ldr r2, [r1], $4 @ get the first word @@ -41,38 +55,48 @@ strlen: #if __BYTE_ORDER == __BIG_ENDIAN orr r2, r2, $0xff000000 @ set this byte to non-zero subs r3, r3, $1 @ any more to do? + IT(t, gt) orrgt r2, r2, $0x00ff0000 @ if so, set this byte subs r3, r3, $1 @ more? + IT(t, gt) orrgt r2, r2, $0x0000ff00 @ then set. #else orr r2, r2, $0x000000ff @ set this byte to non-zero subs r3, r3, $1 @ any more to do? + IT(t, gt) orrgt r2, r2, $0x0000ff00 @ if so, set this byte subs r3, r3, $1 @ more? + IT(t, gt) orrgt r2, r2, $0x00ff0000 @ then set. #endif Laligned: @ here, we have a word in r2. Does it tst r2, $0x000000ff @ contain any zeroes? 
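The Laligned scan in strlen checks each fetched word for a NUL one byte lane at a time with a chain of tst/tstne instructions (continued just below); each tstne executes only while every earlier lane was non-zero, and the IT(tttt, ne) being added makes that four-deep predication expressible in Thumb-2. What the chain computes, in C (a sketch; little-endian lane order shown, the big-endian branch mirrors it):

    #include <stdint.h>

    /* Does word v contain a zero byte in any of its four lanes?
     * The short-circuit || matches the conditional tstne chain:
     * a later test runs only if all earlier lanes were non-zero. */
    static int has_zero_byte(uint32_t v)
    {
        return (v & 0x000000ffu) == 0
            || (v & 0x0000ff00u) == 0
            || (v & 0x00ff0000u) == 0
            || (v & 0xff000000u) == 0;
    }

While no lane is zero the assembly adds 4 to the running length and loads the next word; once a zero lane is found, the Llastword sequence counts the non-zero bytes that precede it.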
+ IT(tttt, ne) tstne r2, $0x0000ff00 @ tstne r2, $0x00ff0000 @ tstne r2, $0xff000000 @ addne r0, r0, $4 @ if not, the string is 4 bytes longer + IT(t, ne) ldrne r2, [r1], $4 @ and we continue to the next word bne Laligned @ Llastword: @ drop through to here once we find a #if __BYTE_ORDER == __BIG_ENDIAN tst r2, $0xff000000 @ word that has a zero byte in it + IT(tttt, ne) addne r0, r0, $1 @ tstne r2, $0x00ff0000 @ and add up to 3 bytes on to it addne r0, r0, $1 @ tstne r2, $0x0000ff00 @ (if first three all non-zero, 4th + IT(t, ne) addne r0, r0, $1 @ must be zero) #else tst r2, $0x000000ff @ + IT(tttt, ne) addne r0, r0, $1 @ tstne r2, $0x0000ff00 @ and add up to 3 bytes on to it addne r0, r0, $1 @ tstne r2, $0x00ff0000 @ (if first three all non-zero, 4th + IT(t, ne) addne r0, r0, $1 @ must be zero) #endif #if defined(__USE_BX__) @@ -80,6 +104,7 @@ Llastword: @ drop through to here once we find a #else mov pc,lr #endif +#endif .size strlen,.-strlen diff --git a/libc/string/arm/strncmp.S b/libc/string/arm/strncmp.S index eaf0620b4..8487639c8 100644 --- a/libc/string/arm/strncmp.S +++ b/libc/string/arm/strncmp.S @@ -30,15 +30,46 @@ */ #include <features.h> +#include <bits/arm_asm.h> .text .global strncmp .type strncmp,%function .align 4 +#if defined(THUMB1_ONLY) +.thumb_func strncmp: /* if (len == 0) return 0 */ cmp r2, #0 + bne 1f + mov r0, #0 + bx lr +1: + push {r4} + + /* ip == last src address to compare */ + add r4, r0, r2 +2: + cmp r4, r0 + beq 3f + ldrb r2, [r0] + add r0, r0, #1 + ldrb r3, [r1] + add r1, r1, #1 + cmp r2, #0 + beq 3f + cmp r2, r3 + beq 2b +3: + sub r0, r2, r3 + pop {r4} + bx lr +#else +strncmp: + /* if (len == 0) return 0 */ + cmp r2, #0 + IT(tt, eq) moveq r0, #0 #if defined(__USE_BX__) bxeq lr @@ -53,6 +84,7 @@ strncmp: ldrb r2, [r0], #1 ldrb r3, [r1], #1 cmp ip, r0 + IT(tt, cs) cmpcs r2, #1 cmpcs r2, r3 beq 1b @@ -62,6 +94,7 @@ strncmp: #else mov pc, lr #endif +#endif .size strncmp,.-strncmp diff --git a/libc/string/avr32/Makefile b/libc/string/avr32/Makefile index 0002ffdce..e19e9d9ec 100644 --- a/libc/string/avr32/Makefile +++ b/libc/string/avr32/Makefile @@ -16,8 +16,8 @@ # along with this program; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -top_srcdir := ../../../ -top_builddir := ../../../ +top_srcdir := ../../../ +top_builddir := ../../../ all: objs diff --git a/libc/string/avr32/bcopy.S b/libc/string/avr32/bcopy.S index e1d173165..bdd521814 100644 --- a/libc/string/avr32/bcopy.S +++ b/libc/string/avr32/bcopy.S @@ -10,17 +10,17 @@ #ifdef __UCLIBC_SUSV3_LEGACY__ - .text - .global bcopy - .type bcopy, @function - .align 1 + .text + .global bcopy + .type bcopy, @function + .align 1 bcopy: - /* Swap the first two arguments */ - eor r11, r12 - eor r12, r11 - eor r11, r12 - rjmp HIDDEN_JUMPTARGET(memmove) + /* Swap the first two arguments */ + eor r11, r12 + eor r12, r11 + eor r11, r12 + rjmp HIDDEN_JUMPTARGET(memmove) - .size bcopy, . - bcopy + .size bcopy, . - bcopy #endif /* __UCLIBC_SUSV3_LEGACY__ */ diff --git a/libc/string/avr32/bzero.S b/libc/string/avr32/bzero.S index 928148dcb..ca1bd2dd2 100644 --- a/libc/string/avr32/bzero.S +++ b/libc/string/avr32/bzero.S @@ -10,15 +10,15 @@ #ifdef __UCLIBC_SUSV3_LEGACY__ - .text - .global bzero - .type bzero, @function - .align 1 + .text + .global bzero + .type bzero, @function + .align 1 bzero: - mov r10, r11 - mov r11, 0 - rjmp HIDDEN_JUMPTARGET(memset) + mov r10, r11 + mov r11, 0 + rjmp HIDDEN_JUMPTARGET(memset) - .size bzero, . - bzero + .size bzero, . 
- bzero #endif /* __UCLIBC_SUSV3_LEGACY__ */ diff --git a/libc/string/avr32/memcmp.S b/libc/string/avr32/memcmp.S index 5d7eac3d9..ae6cc9189 100644 --- a/libc/string/avr32/memcmp.S +++ b/libc/string/avr32/memcmp.S @@ -12,48 +12,48 @@ #define s2 r11 #define len r10 - .text - .global memcmp - .type memcmp, @function - .align 1 + .text + .global memcmp + .type memcmp, @function + .align 1 memcmp: - sub len, 4 - brlt .Lless_than_4 + sub len, 4 + brlt .Lless_than_4 -1: ld.w r8, s1++ - ld.w r9, s2++ - cp.w r8, r9 - brne .Lfound_word - sub len, 4 - brge 1b +1: ld.w r8, s1++ + ld.w r9, s2++ + cp.w r8, r9 + brne .Lfound_word + sub len, 4 + brge 1b .Lless_than_4: - sub len, -4 - reteq 0 + sub len, -4 + reteq 0 -1: ld.ub r8, s1++ - ld.ub r9, s2++ - sub r8, r9 - retne r8 - sub len, 1 - brgt 1b +1: ld.ub r8, s1++ + ld.ub r9, s2++ + sub r8, r9 + retne r8 + sub len, 1 + brgt 1b - retal 0 + retal 0 .Lfound_word: - mov len, 4 - -2: bfextu r11, r9, 24, 8 - bfextu r12, r8, 24, 8 - sub r12, r11 - retne r12 - lsl r8, 8 - lsl r9, 8 - sub len, 1 - brne 2b - retal r12 - - .size memcmp, . - memcmp + mov len, 4 + +2: bfextu r11, r9, 24, 8 + bfextu r12, r8, 24, 8 + sub r12, r11 + retne r12 + lsl r8, 8 + lsl r9, 8 + sub len, 1 + brne 2b + retal r12 + + .size memcmp, . - memcmp libc_hidden_def(memcmp) #ifdef __UCLIBC_SUSV3_LEGACY__ diff --git a/libc/string/avr32/memcpy.S b/libc/string/avr32/memcpy.S index f95aabd13..bf091abf8 100644 --- a/libc/string/avr32/memcpy.S +++ b/libc/string/avr32/memcpy.S @@ -11,101 +11,101 @@ #define src r11 #define len r10 - .text - .global memcpy - .type memcpy, @function + .text + .global memcpy + .type memcpy, @function memcpy: - pref src[0] - mov dst, r12 + pref src[0] + mov dst, r12 - /* If we have less than 32 bytes, don't do anything fancy */ - cp.w len, 32 - brge .Lmore_than_31 + /* If we have less than 32 bytes, don't do anything fancy */ + cp.w len, 32 + brge .Lmore_than_31 - sub len, 1 - retlt r12 -1: ld.ub r8, src++ - st.b dst++, r8 - sub len, 1 - brge 1b - retal r12 + sub len, 1 + retlt r12 +1: ld.ub r8, src++ + st.b dst++, r8 + sub len, 1 + brge 1b + retal r12 .Lmore_than_31: - pushm r0-r7, lr + pushm r0-r7, lr - /* Check alignment */ - mov r8, src - andl r8, 31, COH - brne .Lunaligned_src - mov r8, dst - andl r8, 3, COH - brne .Lunaligned_dst + /* Check alignment */ + mov r8, src + andl r8, 31, COH + brne .Lunaligned_src + mov r8, dst + andl r8, 3, COH + brne .Lunaligned_dst .Laligned_copy: - sub len, 32 - brlt .Lless_than_32 + sub len, 32 + brlt .Lless_than_32 -1: /* Copy 32 bytes at a time */ - ldm src, r0-r7 - sub src, -32 - stm dst, r0-r7 - sub dst, -32 - sub len, 32 - brge 1b +1: /* Copy 32 bytes at a time */ + ldm src, r0-r7 + sub src, -32 + stm dst, r0-r7 + sub dst, -32 + sub len, 32 + brge 1b .Lless_than_32: - /* Copy 16 more bytes if possible */ - sub len, -16 - brlt .Lless_than_16 - ldm src, r0-r3 - sub src, -16 - sub len, 16 - stm dst, r0-r3 - sub dst, -16 + /* Copy 16 more bytes if possible */ + sub len, -16 + brlt .Lless_than_16 + ldm src, r0-r3 + sub src, -16 + sub len, 16 + stm dst, r0-r3 + sub dst, -16 .Lless_than_16: - /* Do the remaining as byte copies */ - neg len - add pc, pc, len << 2 - .rept 15 - ld.ub r0, src++ - st.b dst++, r0 - .endr + /* Do the remaining as byte copies */ + neg len + add pc, pc, len << 2 + .rept 15 + ld.ub r0, src++ + st.b dst++, r0 + .endr - popm r0-r7, pc + popm r0-r7, pc .Lunaligned_src: - /* Make src cacheline-aligned. 
r8 = (src & 31) */ - rsub r8, r8, 32 - sub len, r8 -1: ld.ub r0, src++ - st.b dst++, r0 - sub r8, 1 - brne 1b - - /* If dst is word-aligned, we're ready to go */ - pref src[0] - mov r8, 3 - tst dst, r8 - breq .Laligned_copy + /* Make src cacheline-aligned. r8 = (src & 31) */ + rsub r8, r8, 32 + sub len, r8 +1: ld.ub r0, src++ + st.b dst++, r0 + sub r8, 1 + brne 1b + + /* If dst is word-aligned, we're ready to go */ + pref src[0] + mov r8, 3 + tst dst, r8 + breq .Laligned_copy .Lunaligned_dst: - /* src is aligned, but dst is not. Expect bad performance */ - sub len, 4 - brlt 2f -1: ld.w r0, src++ - st.w dst++, r0 - sub len, 4 - brge 1b - -2: neg len - add pc, pc, len << 2 - .rept 3 - ld.ub r0, src++ - st.b dst++, r0 - .endr - - popm r0-r7, pc - .size memcpy, . - memcpy + /* src is aligned, but dst is not. Expect bad performance */ + sub len, 4 + brlt 2f +1: ld.w r0, src++ + st.w dst++, r0 + sub len, 4 + brge 1b + +2: neg len + add pc, pc, len << 2 + .rept 3 + ld.ub r0, src++ + st.b dst++, r0 + .endr + + popm r0-r7, pc + .size memcpy, . - memcpy libc_hidden_def(memcpy) diff --git a/libc/string/avr32/memmove.S b/libc/string/avr32/memmove.S index 8ca4da54d..535f4a257 100644 --- a/libc/string/avr32/memmove.S +++ b/libc/string/avr32/memmove.S @@ -10,107 +10,107 @@ #define src r11 #define len r10 - .text - .global memmove - .type memmove, @function + .text + .global memmove + .type memmove, @function memmove: - cp.w src, dst - brge HIDDEN_JUMPTARGET(memcpy) - - add dst, len - add src, len - pref src[-1] - - /* - * The rest is basically the same as in memcpy.S except that - * the direction is reversed. - */ - cp.w len, 32 - brge .Lmore_than_31 - - sub len, 1 - retlt r12 -1: ld.ub r8, --src - st.b --dst, r8 - sub len, 1 - brge 1b - retal r12 + cp.w src, dst + brge HIDDEN_JUMPTARGET(memcpy) + + add dst, len + add src, len + pref src[-1] + + /* + * The rest is basically the same as in memcpy.S except that + * the direction is reversed. + */ + cp.w len, 32 + brge .Lmore_than_31 + + sub len, 1 + retlt r12 +1: ld.ub r8, --src + st.b --dst, r8 + sub len, 1 + brge 1b + retal r12 .Lmore_than_31: - pushm r0-r7, lr + pushm r0-r7, lr - /* Check alignment */ - mov r8, src - andl r8, 31, COH - brne .Lunaligned_src - mov r8, r12 - andl r8, 3, COH - brne .Lunaligned_dst + /* Check alignment */ + mov r8, src + andl r8, 31, COH + brne .Lunaligned_src + mov r8, r12 + andl r8, 3, COH + brne .Lunaligned_dst .Laligned_copy: - sub len, 32 - brlt .Lless_than_32 + sub len, 32 + brlt .Lless_than_32 -1: /* Copy 32 bytes at a time */ - sub src, 32 - ldm src, r0-r7 - sub dst, 32 - sub len, 32 - stm dst, r0-r7 - brge 1b +1: /* Copy 32 bytes at a time */ + sub src, 32 + ldm src, r0-r7 + sub dst, 32 + sub len, 32 + stm dst, r0-r7 + brge 1b .Lless_than_32: - /* Copy 16 more bytes if possible */ - sub len, -16 - brlt .Lless_than_16 - sub src, 16 - ldm src, r0-r3 - sub dst, 16 - sub len, 16 - stm dst, r0-r3 + /* Copy 16 more bytes if possible */ + sub len, -16 + brlt .Lless_than_16 + sub src, 16 + ldm src, r0-r3 + sub dst, 16 + sub len, 16 + stm dst, r0-r3 .Lless_than_16: - /* Do the remaining as byte copies */ - sub len, -16 - breq 2f -1: ld.ub r0, --src - st.b --dst, r0 - sub len, 1 - brne 1b + /* Do the remaining as byte copies */ + sub len, -16 + breq 2f +1: ld.ub r0, --src + st.b --dst, r0 + sub len, 1 + brne 1b -2: popm r0-r7, pc +2: popm r0-r7, pc .Lunaligned_src: - /* Make src cacheline-aligned. 
r8 = (src & 31) */ - sub len, r8 -1: ld.ub r0, --src - st.b --dst, r0 - sub r8, 1 - brne 1b - - /* If dst is word-aligned, we're ready to go */ - pref src[-4] - mov r8, 3 - tst dst, r8 - breq .Laligned_copy + /* Make src cacheline-aligned. r8 = (src & 31) */ + sub len, r8 +1: ld.ub r0, --src + st.b --dst, r0 + sub r8, 1 + brne 1b + + /* If dst is word-aligned, we're ready to go */ + pref src[-4] + mov r8, 3 + tst dst, r8 + breq .Laligned_copy .Lunaligned_dst: - /* src is aligned, but dst is not. Expect bad performance */ - sub len, 4 - brlt 2f -1: ld.w r0, --src - st.w --dst, r0 - sub len, 4 - brge 1b - -2: neg len - add pc, pc, len << 2 - .rept 3 - ld.ub r0, --src - st.b --dst, r0 - .endr - - popm r0-r7, pc - .size memmove, . - memmove + /* src is aligned, but dst is not. Expect bad performance */ + sub len, 4 + brlt 2f +1: ld.w r0, --src + st.w --dst, r0 + sub len, 4 + brge 1b + +2: neg len + add pc, pc, len << 2 + .rept 3 + ld.ub r0, --src + st.b --dst, r0 + .endr + + popm r0-r7, pc + .size memmove, . - memmove libc_hidden_def(memmove) diff --git a/libc/string/avr32/memset.S b/libc/string/avr32/memset.S index 964bf4834..472b2be35 100644 --- a/libc/string/avr32/memset.S +++ b/libc/string/avr32/memset.S @@ -12,54 +12,54 @@ #define c r11 #define n r10 - .text - .global memset - .type memset, @function + .text + .global memset + .type memset, @function - .align 1 + .align 1 memset: - cp.w n, 32 - mov r9, s - brge .Llarge_memset + cp.w n, 32 + mov r9, s + brge .Llarge_memset - sub n, 1 - retlt s -1: st.b s++, c - sub n, 1 - brge 1b + sub n, 1 + retlt s +1: st.b s++, c + sub n, 1 + brge 1b - retal r9 + retal r9 .Llarge_memset: - mov r8, r11 - mov r11, 3 - bfins r8, r8, 8, 8 - bfins r8, r8, 16, 16 - tst s, r11 - breq 2f + mov r8, r11 + mov r11, 3 + bfins r8, r8, 8, 8 + bfins r8, r8, 16, 16 + tst s, r11 + breq 2f -1: st.b s++, r8 - sub n, 1 - tst s, r11 - brne 1b +1: st.b s++, r8 + sub n, 1 + tst s, r11 + brne 1b -2: mov r11, r9 - mov r9, r8 - sub n, 8 +2: mov r11, r9 + mov r9, r8 + sub n, 8 -3: st.d s++, r8 - sub n, 8 - brge 3b +3: st.d s++, r8 + sub n, 8 + brge 3b - /* If we are done, n == -8 and we'll skip all st.b insns below */ - neg n - lsl n, 1 - add pc, n - .rept 7 - st.b s++, r8 - .endr - retal r11 + /* If we are done, n == -8 and we'll skip all st.b insns below */ + neg n + lsl n, 1 + add pc, n + .rept 7 + st.b s++, r8 + .endr + retal r11 - .size memset, . - memset + .size memset, . 
- memset libc_hidden_def(memset) diff --git a/libc/string/avr32/strcmp.S b/libc/string/avr32/strcmp.S index e9f087577..f73bd43e7 100644 --- a/libc/string/avr32/strcmp.S +++ b/libc/string/avr32/strcmp.S @@ -12,77 +12,77 @@ #define s2 r11 #define len r10 - .text - .global strcmp - .type strcmp, @function - .align 1 + .text + .global strcmp + .type strcmp, @function + .align 1 strcmp: - mov r8, 3 - tst s1, r8 - brne .Lunaligned_s1 - tst s2, r8 - brne .Lunaligned_s2 + mov r8, 3 + tst s1, r8 + brne .Lunaligned_s1 + tst s2, r8 + brne .Lunaligned_s2 -1: ld.w r8, s1++ - ld.w r9, s2++ - cp.w r8, r9 - brne 2f - tnbz r8 - brne 1b - retal 0 +1: ld.w r8, s1++ + ld.w r9, s2++ + cp.w r8, r9 + brne 2f + tnbz r8 + brne 1b + retal 0 -2: bfextu r12, r8, 24, 8 - bfextu r11, r9, 24, 8 - sub r12, r11 - retne r12 - cp.w r11, 0 - reteq 0 - bfextu r12, r8, 16, 8 - bfextu r11, r9, 16, 8 - sub r12, r11 - retne r12 - cp.w r11, 0 - reteq 0 - bfextu r12, r8, 8, 8 - bfextu r11, r9, 8, 8 - sub r12, r11 - retne r12 - cp.w r11, 0 - reteq 0 - bfextu r12, r8, 0, 8 - bfextu r11, r9, 0, 8 - sub r12, r11 - retal r12 +2: bfextu r12, r8, 24, 8 + bfextu r11, r9, 24, 8 + sub r12, r11 + retne r12 + cp.w r11, 0 + reteq 0 + bfextu r12, r8, 16, 8 + bfextu r11, r9, 16, 8 + sub r12, r11 + retne r12 + cp.w r11, 0 + reteq 0 + bfextu r12, r8, 8, 8 + bfextu r11, r9, 8, 8 + sub r12, r11 + retne r12 + cp.w r11, 0 + reteq 0 + bfextu r12, r8, 0, 8 + bfextu r11, r9, 0, 8 + sub r12, r11 + retal r12 .Lunaligned_s1: -3: tst s1, r8 - breq 4f - ld.ub r10, s1++ - ld.ub r9, s2++ - sub r10, r9 - retne r10 - cp.w r9, 0 - brne 3b - retal r10 +3: tst s1, r8 + breq 4f + ld.ub r10, s1++ + ld.ub r9, s2++ + sub r10, r9 + retne r10 + cp.w r9, 0 + brne 3b + retal r10 -4: tst s2, r8 - breq 1b +4: tst s2, r8 + breq 1b .Lunaligned_s2: - /* - * s1 and s2 can't both be aligned, and unaligned word loads - * can trigger spurious exceptions if we cross a page boundary. - * Do it the slow way... - */ -1: ld.ub r8, s1++ - ld.ub r9, s2++ - sub r8, r9 - retne r8 - cp.w r9, 0 - brne 1b - retal 0 + /* + * s1 and s2 can't both be aligned, and unaligned word loads + * can trigger spurious exceptions if we cross a page boundary. + * Do it the slow way... + */ +1: ld.ub r8, s1++ + ld.ub r9, s2++ + sub r8, r9 + retne r8 + cp.w r9, 0 + brne 1b + retal 0 - .size strcmp, . - strcmp + .size strcmp, . 
- strcmp libc_hidden_def(strcmp) #ifndef __UCLIBC_HAS_LOCALE__ diff --git a/libc/string/avr32/strlen.S b/libc/string/avr32/strlen.S index d2808998d..5223e5365 100644 --- a/libc/string/avr32/strlen.S +++ b/libc/string/avr32/strlen.S @@ -10,53 +10,53 @@ #define str r12 - .text - .global strlen - .type strlen, @function + .text + .global strlen + .type strlen, @function strlen: - mov r11, r12 - - mov r9, str - andl r9, 3, COH - brne .Lunaligned_str - -1: ld.w r8, str++ - tnbz r8 - brne 1b - - sub r12, r11 - bfextu r9, r8, 24, 8 - cp.w r9, 0 - subeq r12, 4 - reteq r12 - bfextu r9, r8, 16, 8 - cp.w r9, 0 - subeq r12, 3 - reteq r12 - bfextu r9, r8, 8, 8 - cp.w r9, 0 - subeq r12, 2 - reteq r12 - sub r12, 1 - retal r12 + mov r11, r12 + + mov r9, str + andl r9, 3, COH + brne .Lunaligned_str + +1: ld.w r8, str++ + tnbz r8 + brne 1b + + sub r12, r11 + bfextu r9, r8, 24, 8 + cp.w r9, 0 + subeq r12, 4 + reteq r12 + bfextu r9, r8, 16, 8 + cp.w r9, 0 + subeq r12, 3 + reteq r12 + bfextu r9, r8, 8, 8 + cp.w r9, 0 + subeq r12, 2 + reteq r12 + sub r12, 1 + retal r12 .Lunaligned_str: - add pc, pc, r9 << 3 - sub r0, r0, 0 /* 4-byte nop */ - ld.ub r8, str++ - sub r8, r8, 0 - breq 1f - ld.ub r8, str++ - sub r8, r8, 0 - breq 1f - ld.ub r8, str++ - sub r8, r8, 0 - brne 1b - -1: sub r12, 1 - sub r12, r11 - retal r12 - - .size strlen, . - strlen + add pc, pc, r9 << 3 + sub r0, r0, 0 /* 4-byte nop */ + ld.ub r8, str++ + sub r8, r8, 0 + breq 1f + ld.ub r8, str++ + sub r8, r8, 0 + breq 1f + ld.ub r8, str++ + sub r8, r8, 0 + brne 1b + +1: sub r12, 1 + sub r12, r11 + retal r12 + + .size strlen, . - strlen libc_hidden_def(strlen) diff --git a/libc/string/bfin/memchr.S b/libc/string/bfin/memchr.S index 23626d6a4..88e46bef6 100644 --- a/libc/string/bfin/memchr.S +++ b/libc/string/bfin/memchr.S @@ -1,5 +1,5 @@ /* memchr.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. + * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved. * * This file is subject to the terms and conditions of the GNU Library General * Public License. See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@ * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html */ +#include <sysdep.h> + /* void *memchr(const void *s, int c, size_t n); * R0 = address (s) * R1 = sought byte (c) @@ -21,30 +23,29 @@ .align 2 -.global _memchr -.type _memchr, STT_FUNC -_memchr: +.weak _memchr +ENTRY(_memchr) P0 = R0; // P0 = address P2 = R2; // P2 = count R1 = R1.B(Z); CC = R2 == 0; - IF CC JUMP failed; + IF CC JUMP .Lfailed; -bytes: - LSETUP (byte_loop_s , byte_loop_e) LC0=P2; +.Lbytes: + LSETUP (.Lbyte_loop_s, .Lbyte_loop_e) LC0=P2; -byte_loop_s: +.Lbyte_loop_s: R3 = B[P0++](Z); CC = R3 == R1; - IF CC JUMP found; -byte_loop_e: + IF CC JUMP .Lfound; +.Lbyte_loop_e: NOP; -failed: +.Lfailed: R0=0; RTS; -found: +.Lfound: R0 = P0; R0 += -1; RTS; diff --git a/libc/string/bfin/memcmp.S b/libc/string/bfin/memcmp.S index f2679d5ae..7cc76ad96 100644 --- a/libc/string/bfin/memcmp.S +++ b/libc/string/bfin/memcmp.S @@ -1,5 +1,5 @@ /* memcmp.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. + * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved. * * This file is subject to the terms and conditions of the GNU Library General * Public License. 
See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@ * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html */ +#include <sysdep.h> + /* int memcmp(const void *s1, const void *s2, size_t n); * R0 = First Address (s1) * R1 = Second Address (s2) @@ -21,28 +23,27 @@ .align 2 -.global _memcmp -.type _memcmp, STT_FUNC -_memcmp: +.weak _memcmp +ENTRY(_memcmp) I1 = P3; - P0 = R0; // P0 = s1 address - P3 = R1; // P3 = s2 Address - P2 = R2 ; // P2 = count + P0 = R0; /* P0 = s1 address */ + P3 = R1; /* P3 = s2 Address */ + P2 = R2 ; /* P2 = count */ CC = R2 <= 7(IU); - IF CC JUMP too_small; - I0 = R1; // s2 - R1 = R1 | R0; // OR addresses together - R1 <<= 30; // check bottom two bits - CC = AZ; // AZ set if zero. - IF !CC JUMP bytes ; // Jump if addrs not aligned. + IF CC JUMP .Ltoo_small; + I0 = R1; /* s2 */ + R1 = R1 | R0; /* OR addresses together */ + R1 <<= 30; /* check bottom two bits */ + CC = AZ; /* AZ set if zero. */ + IF !CC JUMP .Lbytes ; /* Jump if addrs not aligned. */ - P1 = P2 >> 2; // count = n/4 + P1 = P2 >> 2; /* count = n/4 */ R3 = 3; - R2 = R2 & R3; // remainder - P2 = R2; // set remainder + R2 = R2 & R3; /* remainder */ + P2 = R2; /* set remainder */ - LSETUP (quad_loop_s , quad_loop_e) LC0=P1; -quad_loop_s: + LSETUP (.Lquad_loop_s, .Lquad_loop_e) LC0=P1; +.Lquad_loop_s: #if !defined(__WORKAROUND_AVOID_DAG1) MNOP || R0 = [P0++] || R1 = [I0++]; #else @@ -50,52 +51,54 @@ quad_loop_s: R1 = [I0++]; #endif CC = R0 == R1; - IF !CC JUMP quad_different; -quad_loop_e: + IF !CC JUMP .Lquad_different; +.Lquad_loop_e: NOP; - P3 = I0; // s2 -too_small: - CC = P2 == 0; //Check zero count - IF CC JUMP finished; // very unlikely + P3 = I0; /* s2 */ +.Ltoo_small: + CC = P2 == 0; /* Check zero count*/ + IF CC JUMP .Lfinished; /* very unlikely*/ -bytes: - LSETUP (byte_loop_s , byte_loop_e) LC0=P2; -byte_loop_s: - R1 = B[P3++](Z); // *s2 - R0 = B[P0++](Z); // *s1 +.Lbytes: + LSETUP (.Lbyte_loop_s, .Lbyte_loop_e) LC0=P2; +.Lbyte_loop_s: + R1 = B[P3++](Z); /* *s2 */ + R0 = B[P0++](Z); /* *s1 */ CC = R0 == R1; - IF !CC JUMP different; -byte_loop_e: + IF !CC JUMP .Ldifferent; +.Lbyte_loop_e: NOP; -different: +.Ldifferent: R0 = R0 - R1; P3 = I1; RTS; -quad_different: - // We've read two quads which don't match. - // Can't just compare them, because we're - // a little-endian machine, so the MSBs of - // the regs occur at later addresses in the - // string. - // Arrange to re-read those two quads again, - // byte-by-byte. - P0 += -4; // back up to the start of the - P3 = I0; // quads, and increase the - P2 += 4; // remainder count +.Lquad_different: + /* We've read two quads which don't match. + * Can't just compare them, because we're + * a little-endian machine, so the MSBs of + * the regs occur at later addresses in the + * string. + * Arrange to re-read those two quads again, + * byte-by-byte. + */ + P0 += -4; /* back up to the start of the */ + P3 = I0; /* quads, and increase the*/ + P2 += 4; /* remainder count*/ P3 += -4; - JUMP bytes; + JUMP .Lbytes; -finished: +.Lfinished: R0 = 0; P3 = I1; RTS; + .size _memcmp,.-_memcmp libc_hidden_def (memcmp) #ifdef __UCLIBC_SUSV3_LEGACY__ -strong_alias (memcmp,bcmp) +weak_alias (memcmp,bcmp) #endif diff --git a/libc/string/bfin/memcpy.S b/libc/string/bfin/memcpy.S index e7ba7048e..bdd760691 100644 --- a/libc/string/bfin/memcpy.S +++ b/libc/string/bfin/memcpy.S @@ -1,5 +1,5 @@ /* memcpy.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. 
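The .Lquad_different comment in the Blackfin memcmp above deserves a concrete illustration: on a little-endian machine the byte at the lowest address is the least significant byte of a loaded word, so a straight word comparison weights the wrong end of the string, and a mismatching quad has to be re-read bytewise. A small self-contained C demonstration of the trap (a sketch, not part of the patch; run on a little-endian host):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    int main(void)
    {
        /* s1 is smaller in its first byte but larger in its last. */
        const unsigned char s1[4] = { 1, 2, 3, 9 };
        const unsigned char s2[4] = { 2, 2, 3, 4 };
        uint32_t w1, w2;
        memcpy(&w1, s1, 4);
        memcpy(&w2, s2, 4);
        /* Little-endian: w1 = 0x09030201 > w2 = 0x04030202, yet
         * memcmp(s1, s2, 4) < 0.  The word compare can therefore
         * only answer "equal or not"; ordering needs the bytes. */
        printf("w1 > w2: %d, memcmp < 0: %d\n",
               w1 > w2, memcmp(s1, s2, 4) < 0);
        return 0;
    }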
+ * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved. * * This file is subject to the terms and conditions of the GNU Library General * Public License. See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@ * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html */ +#include <sysdep.h> + /* void *memcpy(void *dest, const void *src, size_t n); * R0 = To Address (dest) (leave unchanged to form result) * R1 = From Address (src) @@ -21,54 +23,55 @@ .align 2 -.global _memcpy -.type _memcpy, STT_FUNC -_memcpy: +.weak _memcpy +ENTRY(_memcpy) [--SP] = P3; - P0 = R0; // P0 = To address - P3 = R1; // P3 = From Address - P2 = R2 ; // P2 = count + P0 = R0; /* P0 = To address */ + P3 = R1; /* P3 = From Address */ + P2 = R2; /* P2 = count */ CC = R2 <= 7(IU); - IF CC JUMP too_small; + IF CC JUMP .Ltoo_small; I0 = R1; - R3 = R1 | R0; // OR addresses together - R3 <<= 30; // check bottom two bits - CC = AZ; // AZ set if zero. - IF !CC JUMP bytes ; // Jump if addrs not aligned. - P1 = P2 >> 2; // count = n/4 + R3 = R1 | R0; /* OR addresses together */ + R3 <<= 30; /* check bottom two bits */ + CC = AZ; /* AZ set if zero. */ + IF !CC JUMP .Lbytes; /* Jump if addrs not aligned. */ + P1 = P2 >> 2; /* count = n/4 */ P1 += -1; R3 = 3; - R2 = R2 & R3; // remainder - P2 = R2; // set remainder + R2 = R2 & R3; /* remainder */ + P2 = R2; /* set remainder */ R1 = [I0++]; #if !defined(__WORKAROUND_AVOID_DAG1) - LSETUP (quad_loop , quad_loop) LC0=P1; -quad_loop: MNOP || [P0++] = R1 || R1 = [I0++]; + LSETUP (.Lquad_loop, .Lquad_loop) LC0=P1; +.Lquad_loop: MNOP || [P0++] = R1 || R1 = [I0++]; #else - LSETUP (quad_loop_s , quad_loop_e) LC0=P1; -quad_loop_s: [P0++] = R1; -quad_loop_e: R1 = [I0++]; + LSETUP (.Lquad_loop_s, .Lquad_loop_e) LC0=P1; +.Lquad_loop_s: [P0++] = R1; +.Lquad_loop_e: R1 = [I0++]; #endif [P0++] = R1; - CC = P2 == 0; // any remaining bytes? - P3 = I0; // Ammend P3 for remaining copy - IF !CC JUMP bytes; + CC = P2 == 0; /* any remaining bytes? */ + P3 = I0; /* Ammend P3 for remaining copy */ + IF !CC JUMP .Lbytes; P3 = [SP++]; RTS; -too_small: - CC = P2 == 0; //Check zero count - IF CC JUMP finished; // very unlikely +.Ltoo_small: + CC = P2 == 0; /* Check zero count */ + IF CC JUMP .Lfinished; /* very unlikely */ -bytes: - LSETUP (byte_loop_s , byte_loop_e) LC0=P2; -byte_loop_s: R1 = B[P3++](Z); -byte_loop_e: B[P0++] = R1; +.Lbytes: + LSETUP (.Lbyte_loop_s, .Lbyte_loop_e) LC0=P2; +.Lbyte_loop_s: R1 = B[P3++](Z); +.Lbyte_loop_e: B[P0++] = R1; -finished: +.Lfinished: P3 = [SP++]; + RTS; + .size _memcpy,.-_memcpy libc_hidden_def (memcpy) diff --git a/libc/string/bfin/memmove.S b/libc/string/bfin/memmove.S index 3d446f326..73e363820 100644 --- a/libc/string/bfin/memmove.S +++ b/libc/string/bfin/memmove.S @@ -1,5 +1,5 @@ /* memmove.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. + * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved. * * This file is subject to the terms and conditions of the GNU Library General * Public License. 
See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@ * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html */ +#include <sysdep.h> + /* void *memmove(void *dest, const void *src, size_t n); * R0 = To Address (dest) (leave unchanged to form result) * R1 = From Address (src) @@ -21,75 +23,78 @@ .align 2 -.global _memmove -.type _memmove, STT_FUNC -_memmove: +.weak _memmove +ENTRY(_memmove) I1 = P3; - P0 = R0; // P0 = To address - P3 = R1; // P3 = From Address - P2 = R2 ; // P2 = count - CC = P2 == 0; //Check zero count - IF CC JUMP finished; // very unlikely + P0 = R0; /* P0 = To address */ + P3 = R1; /* P3 = From Address */ + P2 = R2; /* P2 = count */ + CC = P2 == 0; /* Check zero count*/ + IF CC JUMP .Lfinished; /* very unlikely */ - CC = R1 < R0 (IU); // From < To - IF !CC JUMP no_overlap; + CC = R1 < R0 (IU); /* From < To */ + IF !CC JUMP .Lno_overlap; R3 = R1 + R2; - CC = R0 <= R3 (IU); // (From+len) >= To - IF CC JUMP overlap; -no_overlap: + CC = R0 <= R3 (IU); /* (From+len) >= To */ + IF CC JUMP .Loverlap; +.Lno_overlap: R3 = 11; CC = R2 <= R3; - IF CC JUMP bytes; - R3 = R1 | R0; // OR addresses together - R3 <<= 30; // check bottom two bits - CC = AZ; // AZ set if zero. - IF !CC JUMP bytes ; // Jump if addrs not aligned. + IF CC JUMP .Lbytes; + R3 = R1 | R0; /* OR addresses together */ + R3 <<= 30; /* check bottom two bits */ + CC = AZ; /* AZ set if zero.*/ + IF !CC JUMP .Lbytes; /* Jump if addrs not aligned.*/ I0 = P3; - P1 = P2 >> 2; // count = n/4 + P1 = P2 >> 2; /* count = n/4 */ P1 += -1; R3 = 3; - R2 = R2 & R3; // remainder - P2 = R2; // set remainder + R2 = R2 & R3; /* remainder */ + P2 = R2; /* set remainder */ R1 = [I0++]; #if !defined(__WORKAROUND_AVOID_DAG1) - LSETUP (quad_loop , quad_loop) LC0=P1; -quad_loop: MNOP || [P0++] = R1 || R1 = [I0++]; + LSETUP (.Lquad_loop, .Lquad_loop) LC0=P1; +.Lquad_loop: MNOP || [P0++] = R1 || R1 = [I0++]; #else - LSETUP (quad_loop_s, quad_loop_e) LC0=P1; -quad_loop_s: [P0++] = R1; -quad_loop_e: R1 = [I0++]; + LSETUP (.Lquad_loop_s, .Lquad_loop_e) LC0=P1; +.Lquad_loop_s: [P0++] = R1; +.Lquad_loop_e: R1 = [I0++]; #endif [P0++] = R1; - CC = P2 == 0; // any remaining bytes? - P3 = I0; // Ammend P3 to updated ptr. - IF !CC JUMP bytes; + CC = P2 == 0; /* any remaining bytes? */ + P3 = I0; /* Ammend P3 to updated ptr. */ + IF !CC JUMP .Lbytes; P3 = I1; RTS; -bytes: LSETUP (byte2_s , byte2_e) LC0=P2; -byte2_s: R1 = B[P3++](Z); -byte2_e: B[P0++] = R1; +.Lbytes: LSETUP (.Lbyte2_s, .Lbyte2_e) LC0=P2; +.Lbyte2_s: R1 = B[P3++](Z); +.Lbyte2_e: B[P0++] = R1; -finished: - P3 = I1; +.Lfinished: P3 = I1; RTS; -overlap: +.Loverlap: P2 += -1; P0 = P0 + P2; P3 = P3 + P2; R1 = B[P3--] (Z); CC = P2 == 0; - IF CC JUMP no_loop; - LSETUP (ol_s, ol_e) LC0 = P2; -ol_s: B[P0--] = R1; -ol_e: R1 = B[P3--] (Z); -no_loop: B[P0] = R1; + IF CC JUMP .Lno_loop; +#if defined(__WORKAROUND_SPECULATIVE_LOADS) + NOP; + NOP; +#endif + LSETUP (.Lol_s, .Lol_e) LC0 = P2; +.Lol_s: B[P0--] = R1; +.Lol_e: R1 = B[P3--] (Z); +.Lno_loop: B[P0] = R1; P3 = I1; RTS; + .size _memmove,.-_memmove libc_hidden_def (memmove) diff --git a/libc/string/bfin/memset.S b/libc/string/bfin/memset.S index bd8eb4b6a..64012f783 100644 --- a/libc/string/bfin/memset.S +++ b/libc/string/bfin/memset.S @@ -1,5 +1,5 @@ /* memset.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. + * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved. * * This file is subject to the terms and conditions of the GNU Library General * Public License. 
See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@ * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html */ +#include <sysdep.h> + /* void *memset(void *s, int c, size_t n); * R0 = address (s) (leave unchanged to form result) * R1 = filler byte (c) @@ -21,66 +23,68 @@ .align 2 -.global _memset -.type _memset, STT_FUNC -_memset: - P0 = R0 ; // P0 = address - P2 = R2 ; // P2 = count - R3 = R0 + R2; // end +.weak _memset +ENTRY(_memset) + P0 = R0 ; /* P0 = address */ + P2 = R2 ; /* P2 = count */ + R3 = R0 + R2; /* end */ CC = R2 <= 7(IU); - IF CC JUMP too_small; - R1 = R1.B (Z); // R1 = fill char + IF CC JUMP .Ltoo_small; + R1 = R1.B (Z); /* R1 = fill char */ R2 = 3; - R2 = R0 & R2; // addr bottom two bits - CC = R2 == 0; // AZ set if zero. - IF !CC JUMP force_align ; // Jump if addr not aligned. + R2 = R0 & R2; /* addr bottom two bits */ + CC = R2 == 0; /* AZ set if zero. */ + IF !CC JUMP .Lforce_align ; /* Jump if addr not aligned. */ -aligned: - P1 = P2 >> 2; // count = n/4 - R2 = R1 << 8; // create quad filler +.Laligned: + P1 = P2 >> 2; /* count = n/4 */ + R2 = R1 << 8; /* create quad filler */ R2.L = R2.L + R1.L(NS); R2.H = R2.L + R1.H(NS); P2 = R3; - LSETUP (quad_loop , quad_loop) LC0=P1; -quad_loop: + LSETUP (.Lquad_loop , .Lquad_loop) LC0=P1; +.Lquad_loop: [P0++] = R2; CC = P0 == P2; - IF !CC JUMP bytes_left; + IF !CC JUMP .Lbytes_left; RTS; -bytes_left: - R2 = R3; // end point - R3 = P0; // current position - R2 = R2 - R3; // bytes left +.Lbytes_left: + R2 = R3; /* end point */ + R3 = P0; /* current position */ + R2 = R2 - R3; /* bytes left */ P2 = R2; -too_small: - CC = P2 == 0; //Check zero count - IF CC JUMP finished; // Unusual +.Ltoo_small: + CC = P2 == 0; /* Check zero count */ + IF CC JUMP .Lfinished; /* Unusual */ -bytes: LSETUP (byte_loop , byte_loop) LC0=P2; -byte_loop: B[P0++] = R1; +.Lbytes: + LSETUP (.Lbyte_loop , .Lbyte_loop) LC0=P2; +.Lbyte_loop: + B[P0++] = R1; -finished: +.Lfinished: RTS; -force_align: - CC = BITTST (R0, 0 ); // odd byte +.Lforce_align: + CC = BITTST (R0, 0); /* odd byte */ R0 = 4; R0 = R0 - R2; P1 = R0; - R0 = P0; // Recover return address - IF !CC JUMP skip1; + R0 = P0; /* Recover return address */ + IF !CC JUMP .Lskip1; B[P0++] = R1; -skip1: - CC = R2 <= 2; // 2 bytes - P2 -= P1; // reduce count - IF !CC JUMP aligned; +.Lskip1: + CC = R2 <= 2; /* 2 bytes */ + P2 -= P1; /* reduce count */ + IF !CC JUMP .Laligned; B[P0++] = R1; B[P0++] = R1; - JUMP aligned; + JUMP .Laligned; + .size _memset,.-_memset libc_hidden_def (memset) diff --git a/libc/string/bfin/strcmp.S b/libc/string/bfin/strcmp.S index 6365024ec..12e8c53c6 100644 --- a/libc/string/bfin/strcmp.S +++ b/libc/string/bfin/strcmp.S @@ -1,5 +1,5 @@ /* strcmp.S - * Copyright (C) 2003, 2005, 2006 Analog Devices Inc., All Rights Reserved. + * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved. * * This file is subject to the terms and conditions of the GNU Library General * Public License. See the file "COPYING.LIB" in the main directory of this @@ -9,6 +9,8 @@ * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html */ +#include <sysdep.h> + /* Fast strcmp() for Blackfin. * When both strings are aligned, this processes four characters at * a time. 
Uses a hw loop with "very big" count to loop "forever", @@ -21,9 +23,8 @@ .align 2 -.global _strcmp -.type _strcmp, STT_FUNC -_strcmp: +.weak _strcmp +ENTRY(_strcmp) [--sp] = (R7:4); p1 = r0; p2 = r1; @@ -34,13 +35,13 @@ _strcmp: r0 = r0 | r1; // check both pointers at same time r0 <<= 30; // dump all but last 2 bits cc = az; // are they zero? - if !cc jump unaligned; // no; use unaligned code. + if !cc jump .Lunaligned; // no; use unaligned code. // fall-thru for aligned case.. // note that r0 is zero from the previous... // p0 set to -1 - lsetup (beginloop, endloop) lc0=p0; + LSETUP (.Lbeginloop, .Lendloop) lc0=p0; // pick up first words r1 = [p1++]; r2 = [p2++]; @@ -49,8 +50,8 @@ _strcmp: r7.h = 0xFF; // loop : 9 cycles to check 4 characters cc = r1 == r2; -beginloop: - if !cc jump notequal4; // compare failure, exit loop +.Lbeginloop: + if !cc jump .Lnotequal4; // compare failure, exit loop // starting with 44332211 // see if char 3 or char 1 is 0 @@ -63,18 +64,18 @@ beginloop: // add to zero, and (r1 is free, reload) r6 = r3 +|+ r0 || r1 = [p1++] || nop; cc |= az; // true if either is zero - if cc jump zero4; // leave if a zero somewhere -endloop: + if cc jump .Lzero4; // leave if a zero somewhere +.Lendloop: cc = r1 == r2; // loop exits -notequal4: // compare failure on 4-char compare +.Lnotequal4: // compare failure on 4-char compare // address pointers are one word ahead; // faster to use zero4 exit code p1 += 4; p2 += 4; -zero4: // one of the bytes in word 1 is zero +.Lzero4: // one of the bytes in word 1 is zero // but we've already fetched the next word; so // backup two to look at failing word again p1 += -8; @@ -85,27 +86,27 @@ zero4: // one of the bytes in word 1 is zero // here when pointers are unaligned: checks one // character at a time. Also use at the end of // the word-check algorithm to figure out what happened -unaligned: +.Lunaligned: // R0 is non-zero from before. // p0 set to -1 r0 = 0 (Z); r1 = B[p1++] (Z); r2 = B[p2++] (Z); - lsetup (beginloop1, endloop1) lc0=p0; + LSETUP (.Lbeginloop1, .Lendloop1) lc0=p0; -beginloop1: +.Lbeginloop1: cc = r1; // first char must be non-zero // chars must be the same r3 = r2 - r1 (NS) || r1 = B[p1++] (Z) || nop; cc &= az; r3 = r0 - r2; // second char must be non-zero cc &= an; - if !cc jump exitloop1; -endloop1: + if !cc jump .Lexitloop1; +.Lendloop1: r2 = B[p2++] (Z); -exitloop1: // here means we found a zero or a difference. +.Lexitloop1: // here means we found a zero or a difference. // we have r2(N), p2(N), r1(N+1), p1(N+2) r1=B[p1+ -2] (Z); r0 = r1 - r2; @@ -116,6 +117,6 @@ exitloop1: // here means we found a zero or a difference. libc_hidden_def (strcmp) #ifndef __UCLIBC_HAS_LOCALE__ -strong_alias (strcmp,strcoll) +weak_alias (strcmp,strcoll) libc_hidden_def (strcoll) #endif diff --git a/libc/string/cris/memcpy.c b/libc/string/cris/memcpy.c index a85108109..0cce37a30 100644 --- a/libc/string/cris/memcpy.c +++ b/libc/string/cris/memcpy.c @@ -66,7 +66,7 @@ void *memcpy(void *, const void *, unsigned int); -libc_hidden_proto(memcpy) +/* Experimentally off - libc_hidden_proto(memcpy) */ void *memcpy(void *pdst, const void *psrc, unsigned int pn) @@ -130,7 +130,7 @@ void *memcpy(void *pdst, here (beware: they may be moved to temporary registers). This way, we do not have to save/move the registers around into temporaries; we can safely use them straight away. */ - __asm__ volatile ("\ + __asm__ __volatile__ ("\ .syntax no_register_prefix \n\ \n\ ;; Check that the register asm declaration got right. 
\n\ diff --git a/libc/string/cris/memmove.c b/libc/string/cris/memmove.c index 437637078..b6620afe0 100644 --- a/libc/string/cris/memmove.c +++ b/libc/string/cris/memmove.c @@ -27,7 +27,7 @@ #include "memcopy.h" #include "../generic/pagecopy.h" -libc_hidden_proto(memmove) +/* Experimentally off - libc_hidden_proto(memmove) */ void *memmove (void *dest, const void *src, size_t len) { unsigned long int dstp = (long int) dest; diff --git a/libc/string/cris/memset.c b/libc/string/cris/memset.c index 7e71bc50f..9cc959a33 100644 --- a/libc/string/cris/memset.c +++ b/libc/string/cris/memset.c @@ -59,7 +59,7 @@ void *memset(void *, int, unsigned long); -libc_hidden_proto(memset) +/* Experimentally off - libc_hidden_proto(memset) */ void *memset(void *pdst, int c, unsigned long plen) @@ -124,7 +124,7 @@ void *memset(void *pdst, here (beware: they may be moved to temporary registers). This way, we do not have to save/move the registers around into temporaries; we can safely use them straight away. */ - __asm__ volatile (" \n\ + __asm__ __volatile__ (" \n\ .syntax no_register_prefix \n\ \n\ ;; Check that the register asm declaration got right. \n\ diff --git a/libc/string/cris/strcpy.c b/libc/string/cris/strcpy.c index 0af25253e..955a990b7 100644 --- a/libc/string/cris/strcpy.c +++ b/libc/string/cris/strcpy.c @@ -6,7 +6,7 @@ #include <string.h> -libc_hidden_proto(strcpy) +/* Experimentally off - libc_hidden_proto(strcpy) */ char *strcpy(char *dest, const char *src) { char *ret = dest; diff --git a/libc/string/cris/strncpy.c b/libc/string/cris/strncpy.c index 93a6608bc..3f2775bdd 100644 --- a/libc/string/cris/strncpy.c +++ b/libc/string/cris/strncpy.c @@ -6,9 +6,9 @@ #include <string.h> -libc_hidden_proto(memset) +/* Experimentally off - libc_hidden_proto(memset) */ -libc_hidden_proto(strncpy) +/* Experimentally off - libc_hidden_proto(strncpy) */ char *strncpy(char *dest, const char *src, size_t count) { char *ret = dest; diff --git a/libc/string/frv/memset.S b/libc/string/frv/memset.S index 4e64550e4..477597dcd 100644 --- a/libc/string/frv/memset.S +++ b/libc/string/frv/memset.S @@ -155,4 +155,4 @@ memset: bralr .size memset, .-memset -libc_hidden_proto(memset) +/* Experimentally off - libc_hidden_proto(memset) */ diff --git a/libc/string/i386/memchr.c b/libc/string/i386/memchr.c index 229d42919..fe4537914 100644 --- a/libc/string/i386/memchr.c +++ b/libc/string/i386/memchr.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(memchr) +/* Experimentally off - libc_hidden_proto(memchr) */ void *memchr(const void *cs, int c, size_t count) { int d0; diff --git a/libc/string/i386/memcpy.c b/libc/string/i386/memcpy.c index a2b8d3d8c..285583f3b 100644 --- a/libc/string/i386/memcpy.c +++ b/libc/string/i386/memcpy.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(memcpy) +/* Experimentally off - libc_hidden_proto(memcpy) */ void *memcpy(void * to, const void * from, size_t n) { int d0, d1, d2; diff --git a/libc/string/i386/memmove.c b/libc/string/i386/memmove.c index a26fe2be1..a924efcbc 100644 --- a/libc/string/i386/memmove.c +++ b/libc/string/i386/memmove.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(memmove) +/* Experimentally off - libc_hidden_proto(memmove) */ void *memmove(void *dest, const void *src, size_t n) { int d0, d1, d2; diff --git a/libc/string/i386/memset.c b/libc/string/i386/memset.c index eea48040a..bbaa45215 100644 --- a/libc/string/i386/memset.c +++ b/libc/string/i386/memset.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(memset) +/* Experimentally 
off - libc_hidden_proto(memset) */ void *memset(void *s, int c, size_t count) { int d0, d1; diff --git a/libc/string/i386/strcat.c b/libc/string/i386/strcat.c index e0b1f3b51..2cf0237a6 100644 --- a/libc/string/i386/strcat.c +++ b/libc/string/i386/strcat.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strcat) +/* Experimentally off - libc_hidden_proto(strcat) */ char *strcat(char * dest, const char * src) { int d0, d1, d2, d3; diff --git a/libc/string/i386/strchr.c b/libc/string/i386/strchr.c index 7568d48db..46b1dfb6e 100644 --- a/libc/string/i386/strchr.c +++ b/libc/string/i386/strchr.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strchr) +/* Experimentally off - libc_hidden_proto(strchr) */ char *strchr(const char *s, int c) { int d0; diff --git a/libc/string/i386/strcmp.c b/libc/string/i386/strcmp.c index 47635d817..eff230c5c 100644 --- a/libc/string/i386/strcmp.c +++ b/libc/string/i386/strcmp.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strcmp) +/* Experimentally off - libc_hidden_proto(strcmp) */ int strcmp(const char *cs, const char *ct) { int d0, d1; @@ -55,7 +55,7 @@ int strcmp(const char *cs, const char *ct) libc_hidden_def(strcmp) #ifndef __UCLIBC_HAS_LOCALE__ -libc_hidden_proto(strcoll) +/* Experimentally off - libc_hidden_proto(strcoll) */ strong_alias(strcmp,strcoll) libc_hidden_def(strcoll) #endif diff --git a/libc/string/i386/strcpy.c b/libc/string/i386/strcpy.c index 9e2b81009..09065a9b7 100644 --- a/libc/string/i386/strcpy.c +++ b/libc/string/i386/strcpy.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strcpy) +/* Experimentally off - libc_hidden_proto(strcpy) */ char *strcpy(char * dest, const char * src) { int d0, d1, d2; diff --git a/libc/string/i386/strlen.c b/libc/string/i386/strlen.c index f0767b600..61a178393 100644 --- a/libc/string/i386/strlen.c +++ b/libc/string/i386/strlen.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strlen) +/* Experimentally off - libc_hidden_proto(strlen) */ size_t strlen(const char *s) { int d0; diff --git a/libc/string/i386/strncat.c b/libc/string/i386/strncat.c index c1061421e..5849db3b3 100644 --- a/libc/string/i386/strncat.c +++ b/libc/string/i386/strncat.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strncat) +/* Experimentally off - libc_hidden_proto(strncat) */ char *strncat(char * dest, const char * src, size_t count) { diff --git a/libc/string/i386/strncmp.c b/libc/string/i386/strncmp.c index d716789c3..a14bb503b 100644 --- a/libc/string/i386/strncmp.c +++ b/libc/string/i386/strncmp.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strncmp) +/* Experimentally off - libc_hidden_proto(strncmp) */ int strncmp(const char *cs, const char *ct, size_t count) { register int __res; diff --git a/libc/string/i386/strncpy.c b/libc/string/i386/strncpy.c index c061fe37e..76aa6ae1b 100644 --- a/libc/string/i386/strncpy.c +++ b/libc/string/i386/strncpy.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strncpy) +/* Experimentally off - libc_hidden_proto(strncpy) */ char *strncpy(char * dest, const char * src, size_t count) { int d0, d1, d2, d3; diff --git a/libc/string/i386/strnlen.c b/libc/string/i386/strnlen.c index 77b5c7568..02c72f530 100644 --- a/libc/string/i386/strnlen.c +++ b/libc/string/i386/strnlen.c @@ -33,7 +33,7 @@ #include <string.h> #ifdef __USE_GNU -libc_hidden_proto(strnlen) +/* Experimentally off - libc_hidden_proto(strnlen) */ size_t strnlen(const char *s, size_t count) { int d0; diff --git a/libc/string/i386/strrchr.c 
b/libc/string/i386/strrchr.c index e3b2df6fb..ef378685b 100644 --- a/libc/string/i386/strrchr.c +++ b/libc/string/i386/strrchr.c @@ -32,7 +32,7 @@ #include <string.h> -libc_hidden_proto(strrchr) +/* Experimentally off - libc_hidden_proto(strrchr) */ char *strrchr(const char *s, int c) { int d0, d1; diff --git a/libc/string/ia64/memcpy.S b/libc/string/ia64/memcpy.S index db019f860..810eb0c0e 100644 --- a/libc/string/ia64/memcpy.S +++ b/libc/string/ia64/memcpy.S @@ -115,7 +115,7 @@ #if defined(USE_LFETCH) #define LOOP(shift) \ ALIGN(32); \ -.loop##shift##: \ +.loop##shift : \ { .mmb \ (p[0]) ld8.nt1 r[0] = [asrc], 8 ; \ (p[0]) lfetch.nt1 [ptr1], 16 ; \ @@ -139,7 +139,7 @@ #else #define LOOP(shift) \ ALIGN(32); \ -.loop##shift##: \ +.loop##shift : \ { .mmb \ (p[0]) ld8.nt1 r[0] = [asrc], 8 ; \ nop.b 0 ; \ diff --git a/libc/string/ia64/memmove.S b/libc/string/ia64/memmove.S index 0328f84de..00342d8e0 100644 --- a/libc/string/ia64/memmove.S +++ b/libc/string/ia64/memmove.S @@ -64,7 +64,7 @@ #define LOOP(shift) \ ALIGN(32); \ -.loop##shift##: \ +.loop##shift : \ (p[0]) ld8 r[0] = [asrc], 8 ; /* w1 */ \ (p[MEMLAT+1]) st8 [dest] = value, 8 ; \ (p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; \ diff --git a/libc/string/ia64/sysdep.h b/libc/string/ia64/sysdep.h index 03e74360d..d10020ac1 100644 --- a/libc/string/ia64/sysdep.h +++ b/libc/string/ia64/sysdep.h @@ -34,7 +34,7 @@ #define ASM_UNW_PRLG_GRSAVE(ninputs) (32+(ninputs)) #ifdef __STDC__ -#define C_LABEL(name) name##: +#define C_LABEL(name) name : #else #define C_LABEL(name) name/**/: #endif diff --git a/libc/string/powerpc/memcpy.c b/libc/string/powerpc/memcpy.c index ed8022313..bcbb806f8 100644 --- a/libc/string/powerpc/memcpy.c +++ b/libc/string/powerpc/memcpy.c @@ -21,7 +21,7 @@ #include <string.h> -libc_hidden_proto(memcpy) +/* Experimentally off - libc_hidden_proto(memcpy) */ void *memcpy(void *to, const void *from, size_t n) /* PPC can do pre increment and load/store, but not post increment and load/store. Therefore use *++ptr instead of *ptr++. */ diff --git a/libc/string/powerpc/memmove.c b/libc/string/powerpc/memmove.c index 327161116..7a4a7e5ff 100644 --- a/libc/string/powerpc/memmove.c +++ b/libc/string/powerpc/memmove.c @@ -21,9 +21,9 @@ #include <string.h> -libc_hidden_proto(memcpy) +/* Experimentally off - libc_hidden_proto(memcpy) */ -libc_hidden_proto(memmove) +/* Experimentally off - libc_hidden_proto(memmove) */ void *memmove(void *to, const void *from, size_t n) { unsigned long rem, chunks, tmp1, tmp2; diff --git a/libc/string/powerpc/memset.c b/libc/string/powerpc/memset.c index 891e0b8aa..d62ec0ee0 100644 --- a/libc/string/powerpc/memset.c +++ b/libc/string/powerpc/memset.c @@ -21,14 +21,14 @@ #include <string.h> -libc_hidden_proto(memset) +/* Experimentally off - libc_hidden_proto(memset) */ static inline int expand_byte_word(int c){ /* this does: c = c << 8 | c; c = c << 16 | c ; */ - asm("rlwimi %0,%0,8,16,23\n" + __asm__("rlwimi %0,%0,8,16,23\n" "\trlwimi %0,%0,16,0,15\n" : "=r" (c) : "0" (c)); return c; diff --git a/libc/string/sparc/_glibc_inc.h b/libc/string/sparc/_glibc_inc.h index 4eb4d755c..e0aef52c2 100644 --- a/libc/string/sparc/_glibc_inc.h +++ b/libc/string/sparc/_glibc_inc.h @@ -6,6 +6,8 @@ #include <features.h> #include <bits/wordsize.h> +/* Is alignment really needed? 
*/ + #if __WORDSIZE == 32 # define ENTRY_ALIGN 4 #else diff --git a/libc/string/sparc/sparc32/sparcv9b/memchr.S b/libc/string/sparc/sparc32/sparcv9b/memchr.S index 7e86a2972..43a16ff11 100644 --- a/libc/string/sparc/sparc32/sparcv9b/memchr.S +++ b/libc/string/sparc/sparc32/sparcv9b/memchr.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include "sparc64/memchr.S" +#include "../../sparc64/memchr.S" diff --git a/libc/string/sparc/sparc32/sparcv9b/memcpy.S b/libc/string/sparc/sparc32/sparcv9b/memcpy.S index 7f697542e..2024869dd 100644 --- a/libc/string/sparc/sparc32/sparcv9b/memcpy.S +++ b/libc/string/sparc/sparc32/sparcv9b/memcpy.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include "sparc64/sparcv9b/memcpy.S" +#include "../../sparc64/sparcv9b/memcpy.S" diff --git a/libc/string/sparc/sparc32/sparcv9b/memset.S b/libc/string/sparc/sparc32/sparcv9b/memset.S index ac67b7ab7..e49173172 100644 --- a/libc/string/sparc/sparc32/sparcv9b/memset.S +++ b/libc/string/sparc/sparc32/sparcv9b/memset.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include <sparc64/memset.S> +#include "../../sparc64/memset.S" diff --git a/libc/string/sparc/sparc32/sparcv9b/stpcpy.S b/libc/string/sparc/sparc32/sparcv9b/stpcpy.S index 440ad7e21..17ffa5e4d 100644 --- a/libc/string/sparc/sparc32/sparcv9b/stpcpy.S +++ b/libc/string/sparc/sparc32/sparcv9b/stpcpy.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include <sparc64/stpcpy.S> +#include "../../sparc64/stpcpy.S" diff --git a/libc/string/sparc/sparc32/sparcv9b/strcat.S b/libc/string/sparc/sparc32/sparcv9b/strcat.S index 7a2223570..9ed125a4b 100644 --- a/libc/string/sparc/sparc32/sparcv9b/strcat.S +++ b/libc/string/sparc/sparc32/sparcv9b/strcat.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include <sparc64/strcat.S> +#include "../../sparc64/strcat.S" diff --git a/libc/string/sparc/sparc32/sparcv9b/strchr.S b/libc/string/sparc/sparc32/sparcv9b/strchr.S index ddd32120d..6b2727a1f 100644 --- a/libc/string/sparc/sparc32/sparcv9b/strchr.S +++ b/libc/string/sparc/sparc32/sparcv9b/strchr.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include <sparc64/strchr.S> +#include "../../sparc64/strchr.S" diff --git a/libc/string/sparc/sparc32/sparcv9b/strcmp.S b/libc/string/sparc/sparc32/sparcv9b/strcmp.S index 5330f4359..854403ffd 100644 --- a/libc/string/sparc/sparc32/sparcv9b/strcmp.S +++ b/libc/string/sparc/sparc32/sparcv9b/strcmp.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include <sparc64/strcmp.S> +#include "../../sparc64/strcmp.S" diff --git a/libc/string/sparc/sparc32/sparcv9b/strcpy.S b/libc/string/sparc/sparc32/sparcv9b/strcpy.S index 0b35c9be0..e8102bde4 100644 --- a/libc/string/sparc/sparc32/sparcv9b/strcpy.S +++ b/libc/string/sparc/sparc32/sparcv9b/strcpy.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include <sparc64/strcpy.S> +#include "../../sparc64/strcpy.S" diff --git a/libc/string/sparc/sparc32/sparcv9b/strlen.S b/libc/string/sparc/sparc32/sparcv9b/strlen.S index b8f4dba4f..8673333a2 100644 --- a/libc/string/sparc/sparc32/sparcv9b/strlen.S +++ b/libc/string/sparc/sparc32/sparcv9b/strlen.S @@ -1,4 +1,4 @@ #define ASI_PNF 0x82 #define ASI_BLK_P 0xf0 #define XCC icc -#include <sparc64/strlen.S> +#include "../../sparc64/strlen.S" diff --git a/libc/string/x86_64/_glibc_inc.h b/libc/string/x86_64/_glibc_inc.h index 
88cef2ea3..415ce90a7 100644 --- a/libc/string/x86_64/_glibc_inc.h +++ b/libc/string/x86_64/_glibc_inc.h @@ -6,15 +6,8 @@ #include <features.h> #include <bits/wordsize.h> -#if __WORDSIZE == 32 -# define ENTRY_ALIGN 4 -#else -# define ENTRY_ALIGN 2 -#endif - #define ENTRY(sym) \ .global sym; \ - .align ENTRY_ALIGN; \ .type sym,%function; \ sym: diff --git a/libc/string/x86_64/memcpy.S b/libc/string/x86_64/memcpy.S index b3bb0f96c..697b992d0 100644 --- a/libc/string/x86_64/memcpy.S +++ b/libc/string/x86_64/memcpy.S @@ -59,9 +59,9 @@ ENTRY (BP_SYM (memcpy)) subq $32, %rcx js 2f - .p2align 4 + /* Next 3 insns are 11 bytes total, make sure we decode them in one go */ + .p2align 4,,11 3: - /* Now correct the loop counter. Please note that in the following code the flags are not changed anymore. */ subq $32, %rcx diff --git a/libc/string/x86_64/memset.S b/libc/string/x86_64/memset.S index d72d74468..46751006b 100644 --- a/libc/string/x86_64/memset.S +++ b/libc/string/x86_64/memset.S @@ -53,15 +53,17 @@ ENTRY (memset) imul %rax,%r8 #endif test $0x7,%edi /* Check for alignment. */ - je 2f + jz 2f - .p2align 4 -1: /* Align ptr to 8 byte. */ + /* Next 3 insns are 9 bytes total, make sure we decode them in one go */ + .p2align 4,,9 +1: + /* Align ptr to 8 byte. */ mov %sil,(%rcx) dec %rdx inc %rcx - test $0x7,%ecx - jne 1b + test $0x7,%cl + jnz 1b 2: /* Check for really large regions. */ mov %rdx,%rax @@ -70,8 +72,10 @@ ENTRY (memset) cmp LARGE, %rdx jae 11f - .p2align 4 -3: /* Copy 64 bytes. */ + /* Next 3 insns are 11 bytes total, make sure we decode them in one go */ + .p2align 4,,11 +3: + /* Fill 64 bytes. */ mov %r8,(%rcx) mov %r8,0x8(%rcx) mov %r8,0x10(%rcx) @@ -84,7 +88,7 @@ ENTRY (memset) dec %rax jne 3b -4: /* Copy final bytes. */ +4: /* Fill final bytes. */ and $0x3f,%edx mov %rdx,%rax shr $0x3,%rax @@ -107,16 +111,18 @@ ENTRY (memset) jne 8b 9: #if BZERO_P - nop + /* nothing */ #else /* Load result (only if used as memset). */ mov %rdi,%rax /* start address of destination is result */ #endif retq - .p2align 4 -11: /* Copy 64 bytes without polluting the cache. */ - /* We could use movntdq %xmm0,(%rcx) here to further + /* Next 3 insns are 14 bytes total, make sure we decode them in one go */ + .p2align 4,,14 +11: + /* Fill 64 bytes without polluting the cache. */ + /* We could use movntdq %xmm0,(%rcx) here to further speed up for large cases but let's not use XMM registers. */ movnti %r8,(%rcx) movnti %r8,0x8(%rcx) diff --git a/libc/string/x86_64/strcat.S b/libc/string/x86_64/strcat.S index 9b0068981..23d068fea 100644 --- a/libc/string/x86_64/strcat.S +++ b/libc/string/x86_64/strcat.S @@ -21,6 +21,7 @@ #include "_glibc_inc.h" +/* Seems to be unrolled too much */ .text ENTRY (BP_SYM (strcat)) @@ -44,7 +45,9 @@ ENTRY (BP_SYM (strcat)) /* Now the source is aligned. Scan for NUL byte. */ - .p2align 4 + + /* Next 3 insns are 10 bytes total, make sure we decode them in one go */ + .p2align 4,,10 4: /* First unroll. */ movq (%rax), %rcx /* get double word (= 8 bytes) in question */ @@ -102,8 +105,11 @@ ENTRY (BP_SYM (strcat)) the addition will not result in 0. */ jz 4b /* no NUL found => continue loop */ - .p2align 4 /* Align, it's a jump target. */ -3: subq $8,%rax /* correct pointer increment. */ + /* Align, it is a jump target. */ + /* Next 3 insns are 8 bytes total, make sure we decode them in one go */ + .p2align 3,,8 +3: + subq $8,%rax /* correct pointer increment. */ testb %cl, %cl /* is first byte NUL? 
*/ jz 2f /* yes => return */ @@ -159,7 +165,9 @@ ENTRY (BP_SYM (strcat)) /* Now the sources is aligned. Unfortunatly we cannot force to have both source and destination aligned, so ignore the alignment of the destination. */ - .p2align 4 + + /* Next 3 insns are 10 bytes total, make sure we decode them in one go */ + .p2align 4,,10 22: /* 1st unroll. */ movq (%rsi), %rax /* Read double word (8 bytes). */ @@ -236,7 +244,9 @@ ENTRY (BP_SYM (strcat)) /* Do the last few bytes. %rax contains the value to write. The loop is unrolled twice. */ - .p2align 4 + + /* Next 3 insns are 6 bytes total, make sure we decode them in one go */ + .p2align 3,,6 23: movb %al, (%rdx) /* 1st byte. */ testb %al, %al /* Is it NUL. */ diff --git a/libc/string/x86_64/strchr.S b/libc/string/x86_64/strchr.S index 8e59c4c19..9ef46b7f2 100644 --- a/libc/string/x86_64/strchr.S +++ b/libc/string/x86_64/strchr.S @@ -20,6 +20,7 @@ #include "_glibc_inc.h" +/* Seems to be unrolled too much */ .text ENTRY (BP_SYM (strchr)) @@ -91,7 +92,8 @@ ENTRY (BP_SYM (strchr)) each of whose bytes is C. This turns each byte that is C into a zero. */ - .p2align 4 + /* Next 3 insns are 10 bytes total, make sure we decode them in one go */ + .p2align 4,,10 4: /* Main Loop is unrolled 4 times. */ /* First unroll. */ @@ -229,8 +231,11 @@ ENTRY (BP_SYM (strchr)) reversed. */ - .p2align 4 /* Align, it's a jump target. */ -3: movq %r9,%rdx /* move to %rdx so that we can access bytes */ + /* Align, it's a jump target. */ + /* Next 3 insns are 9 bytes total, make sure we decode them in one go */ + .p2align 4,,9 +3: + movq %r9,%rdx /* move to %rdx so that we can access bytes */ subq $8,%rax /* correct pointer increment. */ testb %cl, %cl /* is first byte C? */ jz 6f /* yes => return pointer */ @@ -280,7 +285,7 @@ ENTRY (BP_SYM (strchr)) incq %rax 6: - nop + /* nop - huh?? */ retq END (BP_SYM (strchr)) diff --git a/libc/string/x86_64/strcpy.S b/libc/string/x86_64/strcpy.S index d9a51b0bb..612a30d1a 100644 --- a/libc/string/x86_64/strcpy.S +++ b/libc/string/x86_64/strcpy.S @@ -20,6 +20,8 @@ #include "_glibc_inc.h" +/* Seems to be unrolled too much */ + #ifndef USE_AS_STPCPY # define STRCPY strcpy #endif @@ -51,7 +53,9 @@ ENTRY (BP_SYM (STRCPY)) /* Now the sources is aligned. Unfortunatly we cannot force to have both source and destination aligned, so ignore the alignment of the destination. */ - .p2align 4 + + /* Next 3 insns are 10 bytes total, make sure we decode them in one go */ + .p2align 4,,10 1: /* 1st unroll. */ movq (%rsi), %rax /* Read double word (8 bytes). */ @@ -128,7 +132,9 @@ ENTRY (BP_SYM (STRCPY)) /* Do the last few bytes. %rax contains the value to write. The loop is unrolled twice. */ - .p2align 4 + + /* Next 3 insns are 6 bytes total, make sure we decode them in one go */ + .p2align 3,,6 3: /* Note that stpcpy needs to return with the value of the NUL byte. */ diff --git a/libc/string/x86_64/strcspn.S b/libc/string/x86_64/strcspn.S index fed12b5f6..fd9b09c48 100644 --- a/libc/string/x86_64/strcspn.S +++ b/libc/string/x86_64/strcspn.S @@ -25,6 +25,8 @@ #include "_glibc_inc.h" +/* Seems to be unrolled too much */ + /* BEWARE: `#ifdef strcspn' means that strcspn is redefined as `strpbrk' */ #define STRPBRK_P (defined strcspn) @@ -53,26 +55,28 @@ ENTRY (strcspn) Although all the following instruction only modify %cl we always have a correct zero-extended 64-bit value in %rcx. 
*/ - .p2align 4 + /* Next 3 insns are 6 bytes total, make sure we decode them in one go */ + .p2align 3,,6 + L(2): movb (%rax), %cl /* get byte from skipset */ testb %cl, %cl /* is NUL char? */ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ movb 1(%rax), %cl /* get byte from skipset */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ movb 2(%rax), %cl /* get byte from skipset */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ movb 3(%rax), %cl /* get byte from skipset */ addq $4, %rax /* increment skipset pointer */ movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jnz L(2) /* no => process next dword from skipset */ L(1): leaq -4(%rdx), %rax /* prepare loop */ @@ -86,7 +90,13 @@ L(1): leaq -4(%rdx), %rax /* prepare loop */ value in the table. But the value of NUL is NUL so the loop terminates for NUL in every case. */ - .p2align 4 + /* Next 3 insns are 9 bytes total. */ + /* .p2align 4,,9 would make sure we decode them in one go, */ + /* but it will also align entire function to 16 bytes, */ + /* potentially creating largish padding at link time. */ + /* We are aligning to 8 bytes instead: */ + .p2align 3,,8 + L(3): addq $4, %rax /* adjust pointer for full loop round */ movb (%rax), %cl /* get byte from string */ diff --git a/libc/string/x86_64/strlen.S b/libc/string/x86_64/strlen.S index 0441dc46c..4213f0ab6 100644 --- a/libc/string/x86_64/strlen.S +++ b/libc/string/x86_64/strlen.S @@ -20,6 +20,7 @@ #include "_glibc_inc.h" +/* Seems to be unrolled too much */ .text ENTRY (strlen) @@ -39,8 +40,11 @@ ENTRY (strlen) 1: movq $0xfefefefefefefeff,%r8 /* Save magic. */ - .p2align 4 /* Align loop. */ -4: /* Main Loop is unrolled 4 times. */ + /* Align loop. */ + /* Next 3 insns are 10 bytes total, make sure we decode them in one go */ + .p2align 4,,10 +4: + /* Main Loop is unrolled 4 times. */ /* First unroll. */ movq (%rax), %rcx /* get double word (= 8 bytes) in question */ addq $8,%rax /* adjust pointer for next word */ @@ -97,8 +101,11 @@ ENTRY (strlen) the addition will not result in 0. */ jz 4b /* no NUL found => continue loop */ - .p2align 4 /* Align, it's a jump target. */ -3: subq $8,%rax /* correct pointer increment. */ + /* Align, it is a jump target. */ + /* Next 3 insns are 8 bytes total, make sure we decode them in one go */ + .p2align 3,,8 +3: + subq $8,%rax /* correct pointer increment. */ testb %cl, %cl /* is first byte NUL? */ jz 2f /* yes => return */ diff --git a/libc/string/x86_64/strspn.S b/libc/string/x86_64/strspn.S index c126abd2e..41cff0490 100644 --- a/libc/string/x86_64/strspn.S +++ b/libc/string/x86_64/strspn.S @@ -50,26 +50,28 @@ ENTRY (strspn) Although all the following instruction only modify %cl we always have a correct zero-extended 64-bit value in %rcx. */ - .p2align 4 -L(2): movb (%rax), %cl /* get byte from stopset */ + /* Next 3 insns are 6 bytes total, make sure we decode them in one go */ + .p2align 3,,6 +L(2): + movb (%rax), %cl /* get byte from stopset */ testb %cl, %cl /* is NUL char? 
*/ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ movb 1(%rax), %cl /* get byte from stopset */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ movb 2(%rax), %cl /* get byte from stopset */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jz L(1) /* yes => start compare loop */ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ movb 3(%rax), %cl /* get byte from stopset */ addq $4, %rax /* increment stopset pointer */ movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ - testb $0xff, %cl /* is NUL char? */ + testb %cl, %cl /* is NUL char? */ jnz L(2) /* no => process next dword from stopset */ L(1): leaq -4(%rdx), %rax /* prepare loop */ @@ -83,8 +85,14 @@ L(1): leaq -4(%rdx), %rax /* prepare loop */ value in the table. But the value of NUL is NUL so the loop terminates for NUL in every case. */ - .p2align 4 -L(3): addq $4, %rax /* adjust pointer for full loop round */ + /* Next 3 insns are 9 bytes total. */ + /* .p2align 4,,9 would make sure we decode them in one go, */ + /* but it will also align entire function to 16 bytes, */ + /* potentially creating largish padding at link time. */ + /* We are aligning to 8 bytes instead: */ + .p2align 3,,8 +L(3): + addq $4, %rax /* adjust pointer for full loop round */ movb (%rax), %cl /* get byte from string */ testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
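A note on the two idioms recurring in the x86_64 hunks above. `.p2align 4,,11` differs from a bare `.p2align 4` in its third operand: the first operand is the log2 of the alignment (16 bytes), the empty second operand means the default fill (no-ops in a text section), and the third caps the padding, so the alignment is skipped entirely when it would cost more bytes than stated. Likewise `testb %cl, %cl` replaces `testb $0xff, %cl`: TEST sets flags from the AND of its operands, and since `cl & cl == cl & 0xff == cl`, both set ZF exactly when %cl is zero, but the register-register form encodes one byte shorter. A minimal standalone sketch of the alignment idiom, assuming GNU as on x86_64 ELF (the `pad_demo` label and loop body are illustrative, not taken from the patch):

        .text
        .globl  pad_demo
        .type   pad_demo, %function
pad_demo:
        xorl    %eax, %eax
        /* Align the loop head to 16 bytes only if that costs at most
           11 bytes of padding; a bare .p2align 4 would pad
           unconditionally, possibly bloating the function. */
        .p2align 4,,11
1:
        incq    %rax            /* stand-in loop body */
        cmpq    $10, %rax
        jne     1b
        retq
        .size   pad_demo, .-pad_demo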