From ed217c98b003f226500d87e65600f984a692b6ce Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Tue, 29 May 2018 09:53:24 +0200
Subject: [PATCH] x86/xpti: avoid copying L4 page table contents when possible

For mitigation of Meltdown the current L4 page table is copied to the
cpu local root page table each time a 64 bit pv guest is entered.

Copying can be avoided in cases where the guest L4 page table hasn't
been modified while running the hypervisor, e.g. when handling
interrupts or any hypercall not modifying the L4 page table or %cr3.

So add a per-cpu flag indicating whether the copying should be
performed and set that flag only when loading a new %cr3 or modifying
the L4 page table. This includes synchronization of the cpu local
root page table with other cpus, so add a special synchronization flag
for that case.

A simple performance check (compiling the hypervisor via "make -j 4")
in dom0 with 4 vcpus shows a significant improvement:

- real time drops from 112 seconds to 103 seconds
- system time drops from 142 seconds to 131 seconds

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
xen/arch/x86/flushtlb.c | 4 ++++
xen/arch/x86/mm.c | 36 +++++++++++++++++++++++-------------
xen/arch/x86/mm/shadow/multi.c | 3 +++
xen/arch/x86/smp.c | 2 +-
xen/arch/x86/x86_64/asm-offsets.c | 1 +
xen/arch/x86/x86_64/entry.S | 9 +++++++--
xen/arch/x86/x86_64/traps.c | 2 ++
xen/include/asm-x86/current.h | 8 ++++++++
xen/include/asm-x86/flushtlb.h | 8 ++++++++
9 files changed, 57 insertions(+), 16 deletions(-)
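
As an illustration of the mechanism described in the commit message, the
following is a minimal standalone C model of the root_pgt_changed protocol
(illustrative only, not Xen code: guest_l4, root_pgt, copies and main() are
invented scaffolding, while the function names mirror the hypervisor paths
touched by the diff below). It compiles with a plain "cc model.c".

/*
 * Standalone model of the per-cpu root_pgt_changed flag: every operation
 * that can change the active L4 page table raises the flag; the guest
 * re-entry path copies the L4 into the per-cpu root page table only when
 * the flag is set, then clears it.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define L4_ENTRIES 512

static unsigned long guest_l4[L4_ENTRIES]; /* guest's L4 page table */
static unsigned long root_pgt[L4_ENTRIES]; /* per-cpu root page table */
static bool root_pgt_changed;              /* the flag added by the patch */
static unsigned long copies;               /* counts copies for the demo */

/* Modifying an L4 entry in use on this CPU; cf. the mod_l4_entry() path. */
static void write_l4_entry(unsigned int slot, unsigned long val)
{
    guest_l4[slot] = val;
    root_pgt_changed = true;
}

/* Loading a new %cr3; cf. write_ptbase() and toggle_guest_pt(). */
static void load_cr3(void)
{
    root_pgt_changed = true;
}

/* Returning to the guest; cf. the restore_all_guest changes in entry.S. */
static void restore_all_guest(void)
{
    if ( root_pgt_changed )
    {
        root_pgt_changed = false;
        memcpy(root_pgt, guest_l4, sizeof(root_pgt));
        copies++;
    }
}

int main(void)
{
    load_cr3();
    restore_all_guest();   /* copies: the flag was set by load_cr3() */
    restore_all_guest();   /* skipped: nothing changed in between */
    write_l4_entry(0, 0x1000);
    restore_all_guest();   /* copies again */
    printf("copies performed: %lu (expected 2)\n", copies);
    return 0;
}
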
diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
index 8a7a76b8ff..2729ba42e7 100644
--- a/xen/arch/x86/flushtlb.c
+++ b/xen/arch/x86/flushtlb.c
@@ -8,6 +8,7 @@
*/
#include <xen/sched.h>
+#include <xen/smp.h>
#include <xen/softirq.h>
#include <asm/flushtlb.h>
#include <asm/page.h>
@@ -160,5 +161,8 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
local_irq_restore(irqfl);

+ if ( flags & FLUSH_ROOT_PGTBL )
+ get_cpu_info()->root_pgt_changed = true;
+
return flags;
}
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 63a933fd5f..171137310b 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -512,6 +512,7 @@ void make_cr3(struct vcpu *v, unsigned long mfn)
void write_ptbase(struct vcpu *v)
{
+ get_cpu_info()->root_pgt_changed = true;
write_cr3(v->arch.cr3);
}
@@ -4052,18 +4053,27 @@ long do_mmu_update(
case PGT_l4_page_table:
rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
- /*
- * No need to sync if all uses of the page can be accounted
- * to the page lock we hold, its pinned status, and uses on
- * this (v)CPU.
- */
- if ( !rc && !cpu_has_no_xpti &&
- ((page->u.inuse.type_info & PGT_count_mask) >
- (1 + !!(page->u.inuse.type_info & PGT_pinned) +
- (pagetable_get_pfn(curr->arch.guest_table) == mfn) +
- (pagetable_get_pfn(curr->arch.guest_table_user) ==
- mfn))) )
- sync_guest = true;
+ if ( !rc && !cpu_has_no_xpti )
+ {
+ bool local_in_use = false;
+
+ if ( pagetable_get_pfn(curr->arch.guest_table) == mfn )
+ {
+ local_in_use = true;
+ get_cpu_info()->root_pgt_changed = true;
+ }
+
+ /*
+ * No need to sync if all uses of the page can be
+ * accounted to the page lock we hold, its pinned
+ * status, and uses on this (v)CPU.
+ */
+ if ( (page->u.inuse.type_info & PGT_count_mask) >
+ (1 + !!(page->u.inuse.type_info & PGT_pinned) +
+ (pagetable_get_pfn(curr->arch.guest_table_user) ==
+ mfn) + local_in_use) )
+ sync_guest = true;
+ }
break;
case PGT_writable_page:
perfc_incr(writable_mmu_updates);
@@ -4177,7 +4187,7 @@ long do_mmu_update(
cpumask_andnot(mask, pt_owner->domain_dirty_cpumask, cpumask_of(cpu));
if ( !cpumask_empty(mask) )
- flush_mask(mask, FLUSH_TLB_GLOBAL);
+ flush_mask(mask, FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL);
}
perfc_add(num_page_updates, i);
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 93771d9e43..a53d3db56c 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -951,6 +951,8 @@ static int shadow_set_l4e(struct domain *d,
/* Write the new entry */
shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
+ flush_root_pgtbl_domain(d);
+
flags |= SHADOW_SET_CHANGED;
if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
@@ -965,6 +967,7 @@ static int shadow_set_l4e(struct domain *d,
}
sh_put_ref(d, osl3mfn, paddr);
}
+
return flags;
}
diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c
index 70de53d4fe..e8b949da61 100644
--- a/xen/arch/x86/smp.c
+++ b/xen/arch/x86/smp.c
@@ -208,7 +208,7 @@ void invalidate_interrupt(struct cpu_user_regs *regs)
ack_APIC_irq();
perfc_incr(ipis);
if ( __sync_local_execstate() )
- flags &= ~(FLUSH_TLB | FLUSH_TLB_GLOBAL);
+ flags &= ~(FLUSH_TLB | FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL);
flush_area_local(flush_va, flags);
cpumask_clear_cpu(smp_processor_id(), &flush_cpumask);
}
diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
index cc97d753df..a347083d23 100644
--- a/xen/arch/x86/x86_64/asm-offsets.c
+++ b/xen/arch/x86/x86_64/asm-offsets.c
@@ -144,6 +144,7 @@ void __dummy__(void)
OFFSET(CPUINFO_shadow_spec_ctrl, struct cpu_info, shadow_spec_ctrl);
OFFSET(CPUINFO_xen_spec_ctrl, struct cpu_info, xen_spec_ctrl);
OFFSET(CPUINFO_spec_ctrl_flags, struct cpu_info, spec_ctrl_flags);
+ OFFSET(CPUINFO_root_pgt_changed, struct cpu_info, root_pgt_changed);
DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info));
BLANK();
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index cdf5090ec7..67e9b49a5c 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -161,11 +161,15 @@ restore_all_guest:
mov VCPU_cr3(%rbx), %r9
GET_STACK_END(dx)
mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi
+ test %rdi, %rdi
+ jz .Lrag_keep_cr3
+ mov %rdi, %rax
+ cmpb $0, STACK_CPUINFO_FIELD(root_pgt_changed)(%rdx)
+ je .Lrag_copy_done
+ movb $0, STACK_CPUINFO_FIELD(root_pgt_changed)(%rdx)
movabs $PADDR_MASK & PAGE_MASK, %rsi
movabs $DIRECTMAP_VIRT_START, %rcx
- mov %rdi, %rax
and %rsi, %rdi
- jz .Lrag_keep_cr3
and %r9, %rsi
add %rcx, %rdi
add %rcx, %rsi
@@ -180,6 +184,7 @@ restore_all_guest:
sub $(ROOT_PAGETABLE_FIRST_XEN_SLOT - \
ROOT_PAGETABLE_LAST_XEN_SLOT - 1) * 8, %rdi
rep movsq
+.Lrag_copy_done:
mov STACK_CPUINFO_FIELD(cr4)(%rdx), %rdi
mov %r9, STACK_CPUINFO_FIELD(xen_cr3)(%rdx)
mov %rdi, %rsi
diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
index 4f92a2e1ca..8bb2f1afe5 100644
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -284,6 +284,8 @@ void toggle_guest_pt(struct vcpu *v)
v->arch.flags ^= TF_kernel_mode;
update_cr3(v);
+ get_cpu_info()->root_pgt_changed = true;
+
/* Don't flush user global mappings from the TLB. Don't tick TLB clock. */
asm volatile ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" );
diff --git a/xen/include/asm-x86/current.h b/xen/include/asm-x86/current.h
index 7afff0e245..f0061bd497 100644
--- a/xen/include/asm-x86/current.h
+++ b/xen/include/asm-x86/current.h
@@ -59,6 +59,14 @@ struct cpu_info {
uint8_t xen_spec_ctrl;
uint8_t spec_ctrl_flags;
+ /*
+ * The following field controls copying of the L4 page table of 64-bit
+ * PV guests to the per-cpu root page table on entering the guest context.
+ * If set the L4 page table is being copied to the root page table and
+ * the field will be reset.
+ */
+ bool root_pgt_changed;
+
unsigned long __pad;
/* get_stack_bottom() must be 16-byte aligned */
};
diff --git a/xen/include/asm-x86/flushtlb.h b/xen/include/asm-x86/flushtlb.h
index 5f78bbbe3d..ca2cd16721 100644
--- a/xen/include/asm-x86/flushtlb.h
+++ b/xen/include/asm-x86/flushtlb.h
@@ -101,6 +101,8 @@ void write_cr3(unsigned long cr3);
#define FLUSH_CACHE 0x400
/* VA for the flush has a valid mapping */
#define FLUSH_VA_VALID 0x800
+ /* Flush the per-cpu root page table */
+#define FLUSH_ROOT_PGTBL 0x2000
/* Flush local TLBs/caches. */
unsigned int flush_area_local(const void *va, unsigned int flags);
@@ -132,6 +134,12 @@ void flush_area_mask(const cpumask_t *, const void *va, unsigned int flags);
#define flush_tlb_one_all(v) \
flush_tlb_one_mask(&cpu_online_map, v)
+#define flush_root_pgtbl_domain(d) \
+{ \
+ if ( !cpu_has_no_xpti && is_pv_domain(d) && !is_pv_32bit_domain(d) ) \
+ flush_mask((d)->domain_dirty_cpumask, FLUSH_ROOT_PGTBL); \
+}
+
static inline void flush_page_to_ram(unsigned long mfn) {}
static inline int invalidate_dcache_va_range(const void *p,
unsigned long size)
--
2.15.2
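
The other half of the change is cross-CPU synchronization: when one CPU
modifies an L4 table that may be in use elsewhere, it sends a flush request
carrying FLUSH_ROOT_PGTBL, and each target CPU's flush handler marks its
own root page table as stale. Below is a small standalone C sketch of that
propagation (again not Xen code: NR_CPUS, the cpu_info array and the
simplified flush_mask()/flush_area_local() are stand-ins for illustration,
with FLUSH_ROOT_PGTBL's value taken from the flushtlb.h hunk above).

#include <stdbool.h>
#include <stdio.h>

#define FLUSH_TLB_GLOBAL 0x200   /* as in xen/include/asm-x86/flushtlb.h */
#define FLUSH_ROOT_PGTBL 0x2000  /* value introduced by this patch */

#define NR_CPUS 4

struct cpu_info { bool root_pgt_changed; };
static struct cpu_info cpu_info[NR_CPUS];

/* Per-CPU tail of flush_area_local() as modified by the patch. */
static unsigned int flush_area_local(unsigned int cpu, unsigned int flags)
{
    if ( flags & FLUSH_ROOT_PGTBL )
        cpu_info[cpu].root_pgt_changed = true;
    return flags;
}

/* Stand-in for flush_mask(): run the flush handler on each CPU in the mask. */
static void flush_mask(unsigned long mask, unsigned int flags)
{
    for ( unsigned int cpu = 0; cpu < NR_CPUS; cpu++ )
        if ( mask & (1UL << cpu) )
            flush_area_local(cpu, flags);
}

int main(void)
{
    /* CPU 0 changed an L4 table also in use on CPUs 1 and 2 (mask 0x6). */
    flush_mask(0x6, FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL);

    for ( unsigned int cpu = 0; cpu < NR_CPUS; cpu++ )
        printf("cpu%u root_pgt_changed=%d\n", cpu,
               (int)cpu_info[cpu].root_pgt_changed);
    return 0;
}

On their next guest entry, CPUs 1 and 2 will therefore refresh their root
page tables, while CPU 3, whose flag was never raised, skips the copy.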