diff options
| author | Giuseppe Cavallaro <peppe.cavallaro@st.com> | 2009-06-18 10:37:09 +0200 |
|---|---|---|
| committer | Carmelo Amoroso <carmelo.amoroso@st.com> | 2009-09-27 09:58:37 +0200 |
| commit | af2e5dd9301f2659e6edbad8264d1260537b9cee (patch) | |
| tree | 35c6ce484d76a4b7cc3a47bed52c8de4143b9088 /libc | |
| parent | 3a62a3fc97e0a0da82f34d4f93575fc36599b403 (diff) | |
| download | uClibc-alpine-af2e5dd9301f2659e6edbad8264d1260537b9cee.tar.bz2 uClibc-alpine-af2e5dd9301f2659e6edbad8264d1260537b9cee.tar.xz | |
sh: add a new memmove optimised for SH4
This patch adds the memmove function for SH4.
Until now, the generic implementation was used by default.
This new code uses the memcpy for BWD copies and implements FWD copy
when required (see comment within the code itself).
The idea is to take advantage of the optimised memcpy for SH4
and use the FPU for FWD copies (for big sizes) as well.
LMBench bw_mem test showed a significant improvement on uClibc because bcopy
invokes memmove directly.
Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Signed-off-by: Carmelo Amoroso <carmelo.amoroso@st.com>
Diffstat (limited to 'libc')
| -rw-r--r-- | libc/string/sh/sh4/memmove.c | 119 |
1 files changed, 119 insertions, 0 deletions
diff --git a/libc/string/sh/sh4/memmove.c b/libc/string/sh/sh4/memmove.c new file mode 100644 index 000000000..3102039a6 --- /dev/null +++ b/libc/string/sh/sh4/memmove.c @@ -0,0 +1,119 @@ +/* memmove implementation for SH4 + * + * Copyright (C) 2009 STMicroelectronics Ltd. + * + * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com> + * + * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. + */ + +#include <string.h> + +static void fpu_optimised_copy_fwd(void *dest, const void *src, size_t len); + +void *memmove(void *dest, const void *src, size_t len) +{ + unsigned long int d = (long int)dest; + unsigned long int s = (long int)src; + unsigned long int res; /* absolute distance between dest and src */ + + if (d >= s) + res = d - s; + else + res = s - d; + /* + * 1) dest and src do not overlap ==> memcpy (BWD/FWD) + * 2) dest and src overlap 100% ==> memcpy (BWD/FWD) + * 3) left-to-right overlap (dest below src) ==> Copy from the beginning to the end + * 4) right-to-left overlap (dest above src) ==> Copy from the end to the beginning + */ + + if (res == 0) /* 100% overlap */ + memcpy(dest, src, len); + else if (res >= len) /* No overlap */ + memcpy(dest, src, len); + else { + if (d > s) /* right-to-left overlap */ + memcpy(dest, src, len); /* memcpy is BWD, i.e. it copies from the end */ + else /* left-to-right overlap: cannot use SH4 memcpy for this case */ + fpu_optimised_copy_fwd(dest, src, len); + } + return (dest); +} + +#define FPSCR_SR (1 << 20) /* bit 20 of FPSCR; NOTE(review): presumably the SZ (transfer size) bit despite the SR suffix - confirm against the SH-4 manual */ +#define STORE_FPSCR(x) __asm__ volatile("sts fpscr, %0" : "=r"(x)) /* read FPSCR into x */ +#define LOAD_FPSCR(x) __asm__ volatile("lds %0, fpscr" : : "r"(x)) /* write x into FPSCR */ + +static void fpu_optimised_copy_fwd(void *dest, const void *src, size_t len) +{ /* Forward (low-to-high address) copy; for len >= 64 the bulk is moved with 64-bit FPU transfers when src and dest can both be 8-byte aligned */ + char *d = (char *)dest; + char *s = (char *)src; + + if (len >= 64) { + unsigned long fpscr; + int *s1; + int *d1; + + /* Align the dest to an 8 byte boundary (mask 0x7), one byte at a time. 
*/ + while ((unsigned)d & 0x7) { + *d++ = *s++; + len--; + } + + s1 = (int *)s; + d1 = (int *)d; + + /* check if s is 8-byte aligned as well, so that the FPU path can be used */ + if (!((unsigned)s1 & 0x7)) { + + /* Align the dest to cache-line boundary (32 bytes, given the 8-byte alignment above), 4 bytes per step */ + while ((unsigned)d1 & 0x1c) { + *d1++ = *s1++; + len -= 4; + } + + /* Use paired single precision load or store mode for + * 64-bit transferring. The current FPSCR is saved first and restored after the loop. */ + STORE_FPSCR(fpscr); + LOAD_FPSCR(FPSCR_SR); + + while (len >= 32) { /* 32 bytes per iteration: four loads from s1 into dr0..dr6, then four stores to d1 */ + __asm__ volatile ("fmov @%0+,dr0":"+r" (s1)); + __asm__ volatile ("fmov @%0+,dr2":"+r" (s1)); + __asm__ volatile ("fmov @%0+,dr4":"+r" (s1)); + __asm__ volatile ("fmov @%0+,dr6":"+r" (s1)); + __asm__ + volatile ("fmov dr0,@%0"::"r" + (d1):"memory"); + d1 += 2; + __asm__ + volatile ("fmov dr2,@%0"::"r" + (d1):"memory"); + d1 += 2; + __asm__ + volatile ("fmov dr4,@%0"::"r" + (d1):"memory"); + d1 += 2; + __asm__ + volatile ("fmov dr6,@%0"::"r" + (d1):"memory"); + d1 += 2; + len -= 32; + } + + LOAD_FPSCR(fpscr); /* restore the FPSCR value saved above */ + } + s = (char *)s1; + d = (char *)d1; + /*TODO: other subcases could be covered here?!?*/ + } + /* Copy whatever is left (or everything, when the FPU path was not taken) byte by byte */ + while (len > 0) { + *d++ = *s++; + len--; + } + return; +} + +libc_hidden_def(memmove) |
