summaryrefslogtreecommitdiffstats
path: root/libc
diff options
context:
space:
mode:
authorGiuseppe Cavallaro <peppe.cavallaro@st.com>2009-06-18 10:37:09 +0200
committerCarmelo Amoroso <carmelo.amoroso@st.com>2009-09-27 09:58:37 +0200
commitaf2e5dd9301f2659e6edbad8264d1260537b9cee (patch)
tree35c6ce484d76a4b7cc3a47bed52c8de4143b9088 /libc
parent3a62a3fc97e0a0da82f34d4f93575fc36599b403 (diff)
downloaduClibc-alpine-af2e5dd9301f2659e6edbad8264d1260537b9cee.tar.bz2
uClibc-alpine-af2e5dd9301f2659e6edbad8264d1260537b9cee.tar.xz
sh: add a new memmove optimised for SH4
This patch adds the memmove function for SH4; until now, the generic implementation was used by default. The new code uses memcpy for BWD copies and implements the FWD copy when required (see the comment within the code itself). The idea is to take advantage of the optimised memcpy for SH4 and to use the FPU for FWD copies (for big sizes) as well. The LMBench bw_mem test showed a significant improvement on uClibc because bcopy invokes memmove directly. Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com> Signed-off-by: Carmelo Amoroso <carmelo.amoroso@st.com>
Diffstat (limited to 'libc')
-rw-r--r--libc/string/sh/sh4/memmove.c119
1 files changed, 119 insertions, 0 deletions
diff --git a/libc/string/sh/sh4/memmove.c b/libc/string/sh/sh4/memmove.c
new file mode 100644
index 000000000..3102039a6
--- /dev/null
+++ b/libc/string/sh/sh4/memmove.c
@@ -0,0 +1,119 @@
+/* memmove implementation for SH4
+ *
+ * Copyright (C) 2009 STMicroelectronics Ltd.
+ *
+ * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
+ *
+ * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
+ */
+
+#include <string.h>
+
+static void fpu_optimised_copy_fwd(void *dest, const void *src, size_t len);
+
+/*
+ * memmove - copy len bytes from src to dest, tolerating overlap.
+ *
+ * Returns dest.
+ *
+ * Only one layout needs special handling: dest below src with the two
+ * regions partially overlapping.  That case has to be copied from the
+ * first byte towards the last and is delegated to
+ * fpu_optimised_copy_fwd().  Every other layout (disjoint regions,
+ * identical pointers, or dest above src) goes through memcpy(); for the
+ * overlapping dest-above-src case this relies on the SH4 memcpy copying
+ * backwards, from the end towards the beginning.
+ */
+void *memmove(void *dest, const void *src, size_t len)
+{
+	unsigned long int to = (long int)dest;
+	unsigned long int from = (long int)src;
+
+	/* dest below src and less than len bytes apart: forward copy. */
+	if (to < from && from - to < len) {
+		fpu_optimised_copy_fwd(dest, src, len);
+		return dest;
+	}
+
+	/* Disjoint regions, same address, or dest above src. */
+	memcpy(dest, src, len);
+	return dest;
+}
+/*
+ * FPSCR access helpers used by fpu_optimised_copy_fwd().
+ *
+ * FPSCR_SR    - mask with only bit 20 set; it is written to FPSCR to
+ *               select the paired single precision (64-bit) transfer
+ *               mode.  NOTE(review): presumably the SH-4 FPSCR.SZ bit -
+ *               confirm against the CPU manual.
+ * STORE_FPSCR - copy the current FPSCR value into the C lvalue x.
+ * LOAD_FPSCR  - write the value x into FPSCR.
+ */
+#define FPSCR_SR (1 << 20)
+#define STORE_FPSCR(x) __asm__ volatile("sts fpscr, %0" : "=r"(x))
+#define LOAD_FPSCR(x) __asm__ volatile("lds %0, fpscr" : : "r"(x))
+/*
+ * fpu_optimised_copy_fwd - copy len bytes from src to dest in ascending
+ * address order.
+ *
+ * In this file it is called from memmove() only when dest is below src
+ * and the two regions overlap, so source bytes must be read before the
+ * stores that overwrite them.
+ *
+ * For len >= 64 a fast path is attempted: bytes until dest is 8-byte
+ * aligned, then (only if src is 8-byte aligned at that point too) words
+ * until dest is 32-byte aligned, then 32-byte chunks moved through the
+ * FPU registers dr0/dr2/dr4/dr6.  Whatever is left, or the whole buffer
+ * when the fast path does not apply, is copied one byte at a time.
+ *
+ * The two alignment loops consume at most 7 + 24 = 31 bytes, so the
+ * len >= 64 guard keeps len from wrapping below zero.
+ */
+static void fpu_optimised_copy_fwd(void *dest, const void *src, size_t len)
+{
+ char *d = (char *)dest;
+ char *s = (char *)src;
+
+ if (len >= 64) {
+ unsigned long fpscr;
+ int *s1;
+ int *d1;
+
+ /* Byte-copy until dest is 8-byte aligned: the mask 0x7 tests the
+ * low three address bits (the original comment said 4 bytes). */
+ while ((unsigned)d & 0x7) {
+ *d++ = *s++;
+ len--;
+ }
+
+ s1 = (int *)s;
+ d1 = (int *)d;
+
+ /* The FPU path is taken only if src is 8-byte aligned as well;
+ * otherwise fall through to the byte loop at the end. */
+ if (!((unsigned)s1 & 0x7)) {
+
+ /* Word-copy until dest reaches a 32-byte (cache-line) boundary.
+ * dest is already 8-byte aligned here, so the loop runs an even
+ * number of times and src stays 8-byte aligned. */
+ while ((unsigned)d1 & 0x1c) {
+ *d1++ = *s1++;
+ len -= 4;
+ }
+
+ /* Save FPSCR, then load it with FPSCR_SR alone to use paired
+ * single precision load or store mode for 64-bit transferring.
+ * Each pass below issues four 8-byte loads (s1 is advanced by
+ * the post-increment addressing) before the four 8-byte stores,
+ * so 32 source bytes are read before any of them is written.
+ * NOTE(review): dr0/dr2/dr4/dr6 are not declared as clobbers
+ * and the loads carry no "memory" clobber - confirm the
+ * compiler cannot keep live values in those registers. */
+ STORE_FPSCR(fpscr);
+ LOAD_FPSCR(FPSCR_SR);
+
+ while (len >= 32) {
+ __asm__ volatile ("fmov @%0+,dr0":"+r" (s1));
+ __asm__ volatile ("fmov @%0+,dr2":"+r" (s1));
+ __asm__ volatile ("fmov @%0+,dr4":"+r" (s1));
+ __asm__ volatile ("fmov @%0+,dr6":"+r" (s1));
+ __asm__
+ volatile ("fmov dr0,@%0"::"r"
+ (d1):"memory");
+ d1 += 2; /* int * arithmetic: 2 ints = 8 bytes (assumes 4-byte int) */
+ __asm__
+ volatile ("fmov dr2,@%0"::"r"
+ (d1):"memory");
+ d1 += 2;
+ __asm__
+ volatile ("fmov dr4,@%0"::"r"
+ (d1):"memory");
+ d1 += 2;
+ __asm__
+ volatile ("fmov dr6,@%0"::"r"
+ (d1):"memory");
+ d1 += 2;
+ len -= 32;
+ }
+
+ LOAD_FPSCR(fpscr); /* restore the FPSCR value saved above */
+ }
+ s = (char *)s1; /* resume from wherever the aligned copies stopped */
+ d = (char *)d1;
+ /*TODO: other subcases could be covered here?!?*/
+ }
+ /* Per-byte copy of the remainder (or of everything when len < 64) */
+ while (len > 0) {
+ *d++ = *s++;
+ len--;
+ }
+ return;
+}
+
+libc_hidden_def(memmove)