From df7958a9606a342e3c3ac5a40fc41f3a79669d62 Mon Sep 17 00:00:00 2001
From: Denis Vlasenko
Date: Tue, 15 Apr 2008 08:27:24 +0000
Subject: amd64 string ops: use alignment more carefully, and comment it.

By capping max padding so that it is not bigger than the next three insns,
we avoid having ridiculously big NOPs like this one:

  53:   66 66 66 66 2e 0f 1f    nopw   %cs:0x0(%rax,%rax,1)
  5a:   84 00 00 00 00 00

which was bigger than the next three insns combined!

Size changes:

   text    data     bss     dec     hex filename
    102       0       0     102      66 x86_64/memcpy.o
    102       0       0     102      66 x86_64.old/memcpy.o
     90       0       0      90      5a x86_64/mempcpy.o
    102       0       0     102      66 x86_64.old/mempcpy.o
    210       0       0     210      d2 x86_64/memset.o
    242       0       0     242      f2 x86_64.old/memset.o
    213       0       0     213      d5 x86_64/stpcpy.o
    220       0       0     220      dc x86_64.old/stpcpy.o
    428       0       0     428     1ac x86_64/strcat.o
    444       0       0     444     1bc x86_64.old/strcat.o
    417       0       0     417     1a1 x86_64/strchr.o
    418       0       0     418     1a2 x86_64.old/strchr.o
     33       0       0      33      21 x86_64/strcmp.o
     33       0       0      33      21 x86_64.old/strcmp.o
    213       0       0     213      d5 x86_64/strcpy.o
    220       0       0     220      dc x86_64.old/strcpy.o
    135       0       0     135      87 x86_64/strcspn.o
    151       0       0     151      97 x86_64.old/strcspn.o
    225       0       0     225      e1 x86_64/strlen.o
    233       0       0     233      e9 x86_64.old/strlen.o
    140       0       0     140      8c x86_64/strpbrk.o
    156       0       0     156      9c x86_64.old/strpbrk.o
    135       0       0     135      87 x86_64/strspn.o
    151       0       0     151      97 x86_64.old/strspn.o

Also, a few files got their .text alignment relaxed from 16 to 8 bytes,
which reduces padding at link time.
---
 libc/string/x86_64/memset.S | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'libc/string/x86_64/memset.S')

diff --git a/libc/string/x86_64/memset.S b/libc/string/x86_64/memset.S
index 3092e81eb..46751006b 100644
--- a/libc/string/x86_64/memset.S
+++ b/libc/string/x86_64/memset.S
@@ -55,8 +55,10 @@ ENTRY (memset)
 	test	$0x7,%edi	/* Check for alignment. */
 	jz	2f
 
-	.p2align 4
-1:	/* Align ptr to 8 byte. */
+	/* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+	.p2align 4,,9
+1:
+	/* Align ptr to 8 byte. */
 	mov	%sil,(%rcx)
 	dec	%rdx
 	inc	%rcx
@@ -70,8 +72,10 @@ ENTRY (memset)
 	cmp	LARGE, %rdx
 	jae	11f
 
-	.p2align 4
-3:	/* Fill 64 bytes. */
+	/* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+	.p2align 4,,11
+3:
+	/* Fill 64 bytes. */
 	mov	%r8,(%rcx)
 	mov	%r8,0x8(%rcx)
 	mov	%r8,0x10(%rcx)
@@ -114,9 +118,11 @@ ENTRY (memset)
 #endif
 	retq
 
-	.p2align 4
-11:	/* Fill 64 bytes without polluting the cache. */
-	/* We could use movntdq %xmm0,(%rcx) here to further
+	/* Next 3 insns are 14 bytes total, make sure we decode them in one go */
+	.p2align 4,,14
+11:
+	/* Fill 64 bytes without polluting the cache. */
+	/* We could use movntdq %xmm0,(%rcx) here to further
 	   speed up for large cases but let's not use XMM registers. */
 	movnti	%r8,(%rcx)
 	movnti	%r8,0x8(%rcx)
-- 
cgit v1.2.3
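
For readers unfamiliar with the directive, a minimal standalone sketch (an
illustration, not code from the patch) of how the GAS
".p2align pow2, fill, max-skip" arguments used above behave: align to a
2^pow2-byte boundary, pad with "fill" (NOP instructions by default in a code
section), but emit no padding at all if more than "max-skip" bytes would be
needed.

	/* Align the loop head to 1<<4 = 16 bytes, but only if that
	   costs 9 or fewer padding bytes; otherwise leave it as is. */
	.p2align 4,,9
1:	mov	%sil,(%rcx)	/* 3 bytes */
	dec	%rdx		/* 3 bytes */
	inc	%rcx		/* 3 bytes -- loop body is 9 bytes total */

With the cap set to the combined size of the instructions being aligned, the
padding NOP can never be bigger than the code it is meant to speed up, which
is exactly the property the commit message describes.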