author     Henrik Riomar <henrik.riomar@gmail.com>  2018-01-17 21:03:25 +0100
committer  Natanael Copa <ncopa@alpinelinux.org>    2018-01-19 02:04:51 +0000
commit     f2f3a06de22b3f503815c79aeae8878b8320f5da (patch)
tree       70bafad378e588265ee6c69fa0f6ed49abb6022e
parent     59b28fe95c97f40dcb090e39a5243af9f35a6845 (diff)
download   aports-f2f3a06de22b3f503815c79aeae8878b8320f5da.tar.bz2
           aports-f2f3a06de22b3f503815c79aeae8878b8320f5da.tar.xz
main/xen: XPTI xsa254

Add Xen page-table isolation (XPTI) for Xen 4.9.1.

More info: http://xenbits.xen.org/xsa/xsa254/README.pti
-rw-r--r--  main/xen/0001-x86-entry-Remove-support-for-partial-cpu_user_regs-f.patch | 409
-rw-r--r--  main/xen/0002-x86-mm-Always-set-_PAGE_ACCESSED-on-L4e-updates.patch       |  46
-rw-r--r--  main/xen/0003-x86-Meltdown-band-aid-against-malicious-64-bit-PV-gu.patch  | 761
-rw-r--r--  main/xen/0004-x86-allow-Meltdown-band-aid-to-be-disabled.patch            | 164
-rw-r--r--  main/xen/APKBUILD                                                         |  13
5 files changed, 1392 insertions(+), 1 deletion(-)
diff --git a/main/xen/0001-x86-entry-Remove-support-for-partial-cpu_user_regs-f.patch b/main/xen/0001-x86-entry-Remove-support-for-partial-cpu_user_regs-f.patch
new file mode 100644
index 0000000000..749306b6d5
--- /dev/null
+++ b/main/xen/0001-x86-entry-Remove-support-for-partial-cpu_user_regs-f.patch
@@ -0,0 +1,409 @@
+From c60e88d20d08253904d582478b50d2eebbef1fb6 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Wed, 17 Jan 2018 17:03:51 +0100
+Subject: [PATCH 1/4] x86/entry: Remove support for partial cpu_user_regs
+ frames
+
+Save all GPRs on entry to Xen.
+
+The entry_int82() path is via a DPL1 gate, only usable by 32bit PV guests, so
+can get away with only saving the 32bit registers. All other entrypoints can
+be reached from 32 or 64bit contexts.
+
+This is part of XSA-254.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Wei Liu <wei.liu2@citrix.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+master commit: f9eb74789af77e985ae653193f3622263499f674
+master date: 2018-01-05 19:57:07 +0000
+(cherry picked from commit 2213ffe1a2d82c3c9c4a154ea6ee252395aa8693)
+---
+ tools/tests/x86_emulator/x86_emulate.c | 1 -
+ xen/arch/x86/domain.c | 1 -
+ xen/arch/x86/traps.c | 2 -
+ xen/arch/x86/x86_64/compat/entry.S | 7 ++-
+ xen/arch/x86/x86_64/entry.S | 12 ++--
+ xen/arch/x86/x86_64/traps.c | 13 ++--
+ xen/arch/x86/x86_emulate.c | 1 -
+ xen/arch/x86/x86_emulate/x86_emulate.c | 8 +--
+ xen/common/wait.c | 1 -
+ xen/include/asm-x86/asm_defns.h | 105 +++------------------------------
+ 10 files changed, 26 insertions(+), 125 deletions(-)
+
+diff --git a/tools/tests/x86_emulator/x86_emulate.c b/tools/tests/x86_emulator/x86_emulate.c
+index 79661d5c2b..b10ca2cfc9 100644
+--- a/tools/tests/x86_emulator/x86_emulate.c
++++ b/tools/tests/x86_emulator/x86_emulate.c
+@@ -3,7 +3,6 @@
+ #include <sys/mman.h>
+
+ #define cpu_has_amd_erratum(nr) 0
+-#define mark_regs_dirty(r) ((void)(r))
+ #define cpu_has_mpx false
+ #define read_bndcfgu() 0
+ #define xstate_set_init(what)
+diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
+index 765bc0085d..07b50315b9 100644
+--- a/xen/arch/x86/domain.c
++++ b/xen/arch/x86/domain.c
+@@ -148,7 +148,6 @@ static void noreturn continue_idle_domain(struct vcpu *v)
+ static void noreturn continue_nonidle_domain(struct vcpu *v)
+ {
+ check_wakeup_from_wait();
+- mark_regs_dirty(guest_cpu_user_regs());
+ reset_stack_and_jump(ret_from_intr);
+ }
+
+diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
+index e861f7af66..3aef0f2769 100644
+--- a/xen/arch/x86/traps.c
++++ b/xen/arch/x86/traps.c
+@@ -2204,7 +2204,6 @@ static int priv_op_read_io(unsigned int port, unsigned int bytes,
+ io_emul_stub_t *io_emul =
+ io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+- mark_regs_dirty(ctxt->regs);
+ io_emul(ctxt->regs);
+ return X86EMUL_DONE;
+ }
+@@ -2234,7 +2233,6 @@ static int priv_op_write_io(unsigned int port, unsigned int bytes,
+ io_emul_stub_t *io_emul =
+ io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+- mark_regs_dirty(ctxt->regs);
+ io_emul(ctxt->regs);
+ if ( (bytes == 1) && pv_post_outb_hook )
+ pv_post_outb_hook(port, val);
+diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S
+index 90bda09614..37864a67f3 100644
+--- a/xen/arch/x86/x86_64/compat/entry.S
++++ b/xen/arch/x86/x86_64/compat/entry.S
+@@ -14,7 +14,8 @@
+ ENTRY(entry_int82)
+ ASM_CLAC
+ pushq $0
+- SAVE_VOLATILE type=HYPERCALL_VECTOR compat=1
++ movl $HYPERCALL_VECTOR, 4(%rsp)
++ SAVE_ALL compat=1 /* DPL1 gate, restricted to 32bit PV guests only. */
+ CR4_PV32_RESTORE
+
+ GET_CURRENT(bx)
+@@ -58,7 +59,6 @@ compat_test_guest_events:
+ /* %rbx: struct vcpu */
+ compat_process_softirqs:
+ sti
+- andl $~TRAP_regs_partial,UREGS_entry_vector(%rsp)
+ call do_softirq
+ jmp compat_test_all_events
+
+@@ -195,7 +195,8 @@ ENTRY(cstar_enter)
+ pushq $FLAT_USER_CS32
+ pushq %rcx
+ pushq $0
+- SAVE_VOLATILE TRAP_syscall
++ movl $TRAP_syscall, 4(%rsp)
++ SAVE_ALL
+ GET_CURRENT(bx)
+ movq VCPU_domain(%rbx),%rcx
+ cmpb $0,DOMAIN_is_32bit_pv(%rcx)
+diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
+index 65c771f979..668bf8ac28 100644
+--- a/xen/arch/x86/x86_64/entry.S
++++ b/xen/arch/x86/x86_64/entry.S
+@@ -96,7 +96,8 @@ ENTRY(lstar_enter)
+ pushq $FLAT_KERNEL_CS64
+ pushq %rcx
+ pushq $0
+- SAVE_VOLATILE TRAP_syscall
++ movl $TRAP_syscall, 4(%rsp)
++ SAVE_ALL
+ GET_CURRENT(bx)
+ testb $TF_kernel_mode,VCPU_thread_flags(%rbx)
+ jz switch_to_kernel
+@@ -138,7 +139,6 @@ test_guest_events:
+ /* %rbx: struct vcpu */
+ process_softirqs:
+ sti
+- SAVE_PRESERVED
+ call do_softirq
+ jmp test_all_events
+
+@@ -188,7 +188,8 @@ GLOBAL(sysenter_eflags_saved)
+ pushq $3 /* ring 3 null cs */
+ pushq $0 /* null rip */
+ pushq $0
+- SAVE_VOLATILE TRAP_syscall
++ movl $TRAP_syscall, 4(%rsp)
++ SAVE_ALL
+ GET_CURRENT(bx)
+ cmpb $0,VCPU_sysenter_disables_events(%rbx)
+ movq VCPU_sysenter_addr(%rbx),%rax
+@@ -205,7 +206,6 @@ UNLIKELY_END(sysenter_nt_set)
+ leal (,%rcx,TBF_INTERRUPT),%ecx
+ UNLIKELY_START(z, sysenter_gpf)
+ movq VCPU_trap_ctxt(%rbx),%rsi
+- SAVE_PRESERVED
+ movl $TRAP_gp_fault,UREGS_entry_vector(%rsp)
+ movl %eax,TRAPBOUNCE_error_code(%rdx)
+ movq TRAP_gp_fault * TRAPINFO_sizeof + TRAPINFO_eip(%rsi),%rax
+@@ -223,7 +223,8 @@ UNLIKELY_END(sysenter_gpf)
+ ENTRY(int80_direct_trap)
+ ASM_CLAC
+ pushq $0
+- SAVE_VOLATILE 0x80
++ movl $0x80, 4(%rsp)
++ SAVE_ALL
+
+ cmpb $0,untrusted_msi(%rip)
+ UNLIKELY_START(ne, msi_check)
+@@ -251,7 +252,6 @@ int80_slow_path:
+ * IDT entry with DPL==0.
+ */
+ movl $((0x80 << 3) | X86_XEC_IDT),UREGS_error_code(%rsp)
+- SAVE_PRESERVED
+ movl $TRAP_gp_fault,UREGS_entry_vector(%rsp)
+ /* A GPF wouldn't have incremented the instruction pointer. */
+ subq $2,UREGS_rip(%rsp)
+diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
+index fb42158070..4f92a2e1ca 100644
+--- a/xen/arch/x86/x86_64/traps.c
++++ b/xen/arch/x86/x86_64/traps.c
+@@ -81,15 +81,10 @@ static void _show_registers(
+ regs->rbp, regs->rsp, regs->r8);
+ printk("r9: %016lx r10: %016lx r11: %016lx\n",
+ regs->r9, regs->r10, regs->r11);
+- if ( !(regs->entry_vector & TRAP_regs_partial) )
+- {
+- printk("r12: %016lx r13: %016lx r14: %016lx\n",
+- regs->r12, regs->r13, regs->r14);
+- printk("r15: %016lx cr0: %016lx cr4: %016lx\n",
+- regs->r15, crs[0], crs[4]);
+- }
+- else
+- printk("cr0: %016lx cr4: %016lx\n", crs[0], crs[4]);
++ printk("r12: %016lx r13: %016lx r14: %016lx\n",
++ regs->r12, regs->r13, regs->r14);
++ printk("r15: %016lx cr0: %016lx cr4: %016lx\n",
++ regs->r15, crs[0], crs[4]);
+ printk("cr3: %016lx cr2: %016lx\n", crs[3], crs[2]);
+ printk("fsb: %016lx gsb: %016lx gss: %016lx\n",
+ crs[5], crs[6], crs[7]);
+diff --git a/xen/arch/x86/x86_emulate.c b/xen/arch/x86/x86_emulate.c
+index cc334ca8f9..c7ba221d11 100644
+--- a/xen/arch/x86/x86_emulate.c
++++ b/xen/arch/x86/x86_emulate.c
+@@ -11,7 +11,6 @@
+
+ #include <xen/domain_page.h>
+ #include <asm/x86_emulate.h>
+-#include <asm/asm_defns.h> /* mark_regs_dirty() */
+ #include <asm/processor.h> /* current_cpu_info */
+ #include <asm/xstate.h>
+ #include <asm/amd.h> /* cpu_has_amd_erratum() */
+diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c
+index 5ef14a9c72..f20481a611 100644
+--- a/xen/arch/x86/x86_emulate/x86_emulate.c
++++ b/xen/arch/x86/x86_emulate/x86_emulate.c
+@@ -1937,10 +1937,10 @@ decode_register(
+ case 9: p = &regs->r9; break;
+ case 10: p = &regs->r10; break;
+ case 11: p = &regs->r11; break;
+- case 12: mark_regs_dirty(regs); p = &regs->r12; break;
+- case 13: mark_regs_dirty(regs); p = &regs->r13; break;
+- case 14: mark_regs_dirty(regs); p = &regs->r14; break;
+- case 15: mark_regs_dirty(regs); p = &regs->r15; break;
++ case 12: p = &regs->r12; break;
++ case 13: p = &regs->r13; break;
++ case 14: p = &regs->r14; break;
++ case 15: p = &regs->r15; break;
+ #endif
+ default: BUG(); p = NULL; break;
+ }
+diff --git a/xen/common/wait.c b/xen/common/wait.c
+index 9490a17dc2..c5fc094e2c 100644
+--- a/xen/common/wait.c
++++ b/xen/common/wait.c
+@@ -127,7 +127,6 @@ static void __prepare_to_wait(struct waitqueue_vcpu *wqv)
+ unsigned long dummy;
+ u32 entry_vector = cpu_info->guest_cpu_user_regs.entry_vector;
+
+- cpu_info->guest_cpu_user_regs.entry_vector &= ~TRAP_regs_partial;
+ ASSERT(wqv->esp == 0);
+
+ /* Save current VCPU affinity; force wakeup on *this* CPU only. */
+diff --git a/xen/include/asm-x86/asm_defns.h b/xen/include/asm-x86/asm_defns.h
+index 388fc93b9d..98192eb4e6 100644
+--- a/xen/include/asm-x86/asm_defns.h
++++ b/xen/include/asm-x86/asm_defns.h
+@@ -17,15 +17,6 @@
+ void ret_from_intr(void);
+ #endif
+
+-#ifdef CONFIG_FRAME_POINTER
+-/* Indicate special exception stack frame by inverting the frame pointer. */
+-#define SETUP_EXCEPTION_FRAME_POINTER(offs) \
+- leaq offs(%rsp),%rbp; \
+- notq %rbp
+-#else
+-#define SETUP_EXCEPTION_FRAME_POINTER(offs)
+-#endif
+-
+ #ifndef NDEBUG
+ #define ASSERT_INTERRUPT_STATUS(x, msg) \
+ pushf; \
+@@ -42,31 +33,6 @@ void ret_from_intr(void);
+ #define ASSERT_INTERRUPTS_DISABLED \
+ ASSERT_INTERRUPT_STATUS(z, "INTERRUPTS DISABLED")
+
+-/*
+- * This flag is set in an exception frame when registers R12-R15 did not get
+- * saved.
+- */
+-#define _TRAP_regs_partial 16
+-#define TRAP_regs_partial (1 << _TRAP_regs_partial)
+-/*
+- * This flag gets set in an exception frame when registers R12-R15 possibly
+- * get modified from their originally saved values and hence need to be
+- * restored even if the normal call flow would restore register values.
+- *
+- * The flag being set implies _TRAP_regs_partial to be unset. Restoring
+- * R12-R15 thus is
+- * - required when this flag is set,
+- * - safe when _TRAP_regs_partial is unset.
+- */
+-#define _TRAP_regs_dirty 17
+-#define TRAP_regs_dirty (1 << _TRAP_regs_dirty)
+-
+-#define mark_regs_dirty(r) ({ \
+- struct cpu_user_regs *r__ = (r); \
+- ASSERT(!((r__)->entry_vector & TRAP_regs_partial)); \
+- r__->entry_vector |= TRAP_regs_dirty; \
+-})
+-
+ #ifdef __ASSEMBLY__
+ # define _ASM_EX(p) p-.
+ #else
+@@ -236,7 +202,7 @@ static always_inline void stac(void)
+ #endif
+
+ #ifdef __ASSEMBLY__
+-.macro SAVE_ALL op
++.macro SAVE_ALL op, compat=0
+ .ifeqs "\op", "CLAC"
+ ASM_CLAC
+ .else
+@@ -255,40 +221,6 @@ static always_inline void stac(void)
+ movq %rdx,UREGS_rdx(%rsp)
+ movq %rcx,UREGS_rcx(%rsp)
+ movq %rax,UREGS_rax(%rsp)
+- movq %r8,UREGS_r8(%rsp)
+- movq %r9,UREGS_r9(%rsp)
+- movq %r10,UREGS_r10(%rsp)
+- movq %r11,UREGS_r11(%rsp)
+- movq %rbx,UREGS_rbx(%rsp)
+- movq %rbp,UREGS_rbp(%rsp)
+- SETUP_EXCEPTION_FRAME_POINTER(UREGS_rbp)
+- movq %r12,UREGS_r12(%rsp)
+- movq %r13,UREGS_r13(%rsp)
+- movq %r14,UREGS_r14(%rsp)
+- movq %r15,UREGS_r15(%rsp)
+-.endm
+-
+-/*
+- * Save all registers not preserved by C code or used in entry/exit code. Mark
+- * the frame as partial.
+- *
+- * @type: exception type
+- * @compat: R8-R15 don't need saving, and the frame nevertheless is complete
+- */
+-.macro SAVE_VOLATILE type compat=0
+-.if \compat
+- movl $\type,UREGS_entry_vector-UREGS_error_code(%rsp)
+-.else
+- movl $\type|TRAP_regs_partial,\
+- UREGS_entry_vector-UREGS_error_code(%rsp)
+-.endif
+- addq $-(UREGS_error_code-UREGS_r15),%rsp
+- cld
+- movq %rdi,UREGS_rdi(%rsp)
+- movq %rsi,UREGS_rsi(%rsp)
+- movq %rdx,UREGS_rdx(%rsp)
+- movq %rcx,UREGS_rcx(%rsp)
+- movq %rax,UREGS_rax(%rsp)
+ .if !\compat
+ movq %r8,UREGS_r8(%rsp)
+ movq %r9,UREGS_r9(%rsp)
+@@ -297,20 +229,17 @@ static always_inline void stac(void)
+ .endif
+ movq %rbx,UREGS_rbx(%rsp)
+ movq %rbp,UREGS_rbp(%rsp)
+- SETUP_EXCEPTION_FRAME_POINTER(UREGS_rbp)
+-.endm
+-
+-/*
+- * Complete a frame potentially only partially saved.
+- */
+-.macro SAVE_PRESERVED
+- btrl $_TRAP_regs_partial,UREGS_entry_vector(%rsp)
+- jnc 987f
++#ifdef CONFIG_FRAME_POINTER
++/* Indicate special exception stack frame by inverting the frame pointer. */
++ leaq UREGS_rbp(%rsp), %rbp
++ notq %rbp
++#endif
++.if !\compat
+ movq %r12,UREGS_r12(%rsp)
+ movq %r13,UREGS_r13(%rsp)
+ movq %r14,UREGS_r14(%rsp)
+ movq %r15,UREGS_r15(%rsp)
+-987:
++.endif
+ .endm
+
+ #define LOAD_ONE_REG(reg, compat) \
+@@ -330,7 +259,6 @@ static always_inline void stac(void)
+ */
+ .macro RESTORE_ALL adj=0 compat=0
+ .if !\compat
+- testl $TRAP_regs_dirty,UREGS_entry_vector(%rsp)
+ movq UREGS_r11(%rsp),%r11
+ movq UREGS_r10(%rsp),%r10
+ movq UREGS_r9(%rsp),%r9
+@@ -347,33 +275,16 @@ static always_inline void stac(void)
+ LOAD_ONE_REG(si, \compat)
+ LOAD_ONE_REG(di, \compat)
+ .if !\compat
+- jz 987f
+ movq UREGS_r15(%rsp),%r15
+ movq UREGS_r14(%rsp),%r14
+ movq UREGS_r13(%rsp),%r13
+ movq UREGS_r12(%rsp),%r12
+-#ifndef NDEBUG
+- .subsection 1
+-987: testl $TRAP_regs_partial,UREGS_entry_vector(%rsp)
+- jnz 987f
+- cmpq UREGS_r15(%rsp),%r15
+- jne 789f
+- cmpq UREGS_r14(%rsp),%r14
+- jne 789f
+- cmpq UREGS_r13(%rsp),%r13
+- jne 789f
+- cmpq UREGS_r12(%rsp),%r12
+- je 987f
+-789: BUG /* Corruption of partial register state. */
+- .subsection 0
+-#endif
+ .else
+ xor %r15, %r15
+ xor %r14, %r14
+ xor %r13, %r13
+ xor %r12, %r12
+ .endif
+-987:
+ LOAD_ONE_REG(bp, \compat)
+ LOAD_ONE_REG(bx, \compat)
+ subq $-(UREGS_error_code-UREGS_r15+\adj), %rsp
+--
+2.15.0
+
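Taken together, the hunks above mean a trap or interrupt frame now always contains all GPRs, so consumers no longer need mark_regs_dirty() before touching R12-R15. A minimal sketch of what that invariant permits (the helper below is hypothetical and not part of the patch; printk and the cpu_user_regs fields are used as in the traps.c hunk above):

    /* Hypothetical helper: with SAVE_ALL now saving R12-R15 on every
     * entry path, callee-saved registers can be read unconditionally. */
    static void dump_callee_saved(const struct cpu_user_regs *regs)
    {
        printk("r12: %016lx r13: %016lx r14: %016lx\n",
               regs->r12, regs->r13, regs->r14);
        printk("r15: %016lx\n", regs->r15);
    }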
diff --git a/main/xen/0002-x86-mm-Always-set-_PAGE_ACCESSED-on-L4e-updates.patch b/main/xen/0002-x86-mm-Always-set-_PAGE_ACCESSED-on-L4e-updates.patch
new file mode 100644
index 0000000000..232260bb36
--- /dev/null
+++ b/main/xen/0002-x86-mm-Always-set-_PAGE_ACCESSED-on-L4e-updates.patch
@@ -0,0 +1,46 @@
+From 344da8f57f442be289bb3c09defb28758b227542 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Wed, 17 Jan 2018 17:04:59 +0100
+Subject: [PATCH 2/4] x86/mm: Always set _PAGE_ACCESSED on L4e updates
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: bd61fe94bee0556bc2f64999a4a8315b93f90f21
+master date: 2018-01-15 13:53:16 +0000
+(cherry picked from commit 87ea7816247090e8e5bc5653b16c412943a058b5)
+---
+ xen/arch/x86/mm.c | 14 +++++++++++++-
+ 1 file changed, 13 insertions(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index e77574f92b..981458907f 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -1322,11 +1322,23 @@ get_page_from_l4e(
+ _PAGE_USER|_PAGE_RW); \
+ } while ( 0 )
+
++/*
++ * When shadowing an L4 behind the guests back (e.g. for per-pcpu
++ * purposes), we cannot efficiently sync access bit updates from hardware
++ * (on the shadow tables) back into the guest view.
++ *
++ * We therefore unconditionally set _PAGE_ACCESSED even in the guests
++ * view. This will appear to the guest as a CPU which proactively pulls
++ * all valid L4e's into its TLB, which is compatible with the x86 ABI.
++ *
++ * At the time of writing, all PV guests set the access bit anyway, so
++ * this is no actual change in their behaviour.
++ */
+ #define adjust_guest_l4e(pl4e, d) \
+ do { \
+ if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \
+ likely(!is_pv_32bit_domain(d)) ) \
+- l4e_add_flags((pl4e), _PAGE_USER); \
++ l4e_add_flags((pl4e), _PAGE_USER | _PAGE_ACCESSED); \
+ } while ( 0 )
+
+ #define unadjust_guest_l3e(pl3e, d) \
+--
+2.15.0
+
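The comment added above is the whole story: after adjust_guest_l4e(), every present L4e visible to a 64-bit PV guest carries _PAGE_ACCESSED (alongside _PAGE_USER) up front, so the shadow code never has to propagate hardware access-bit updates back into the guest view. A minimal sketch of the resulting invariant, written as a hypothetical assertion using the accessors from the hunk (not part of the patch):

    /* After adjust_guest_l4e(pl4e, d) for a 64-bit PV domain: */
    if ( l4e_get_flags(*pl4e) & _PAGE_PRESENT )
        ASSERT((l4e_get_flags(*pl4e) & (_PAGE_USER | _PAGE_ACCESSED)) ==
               (_PAGE_USER | _PAGE_ACCESSED));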
diff --git a/main/xen/0003-x86-Meltdown-band-aid-against-malicious-64-bit-PV-gu.patch b/main/xen/0003-x86-Meltdown-band-aid-against-malicious-64-bit-PV-gu.patch
new file mode 100644
index 0000000000..296bbe8484
--- /dev/null
+++ b/main/xen/0003-x86-Meltdown-band-aid-against-malicious-64-bit-PV-gu.patch
@@ -0,0 +1,761 @@
+From 92884bbf6c424c402ae76e6da06e62cd33714cb3 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Wed, 17 Jan 2018 17:07:33 +0100
+Subject: [PATCH 3/4] x86: Meltdown band-aid against malicious 64-bit PV guests
+
+This is a very simplistic change limiting the amount of memory a running
+64-bit PV guest has mapped (and hence available for attacking): Only the
+mappings of stack, IDT, and TSS are being cloned from the direct map
+into per-CPU page tables. Guest controlled parts of the page tables are
+being copied into those per-CPU page tables upon entry into the guest.
+Cross-vCPU synchronization of top level page table entry changes is
+being effected by forcing other active vCPU-s of the guest into the
+hypervisor.
+
+The change to context_switch() isn't strictly necessary, but there's no
+reason to keep switching page tables once a PV guest is being scheduled
+out.
+
+This isn't providing full isolation yet, but it should be covering all
+pieces of information exposure of which would otherwise require an XSA.
+
+There is certainly much room for improvement, especially of performance,
+here - first and foremost suppressing all the negative effects on AMD
+systems. But in the interest of backportability (including to really old
+hypervisors, which may not even have alternative patching) any such is
+being left out here.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 5784de3e2067ed73efc2fe42e62831e8ae7f46c4
+master date: 2018-01-16 17:49:03 +0100
+(cherry picked from commit 1e0974638d65d9b8acf9ac7511d747188f38bcc3)
+---
+ xen/arch/x86/domain.c | 5 +
+ xen/arch/x86/mm.c | 21 ++++
+ xen/arch/x86/smpboot.c | 198 +++++++++++++++++++++++++++++++++++++
+ xen/arch/x86/x86_64/asm-offsets.c | 2 +
+ xen/arch/x86/x86_64/compat/entry.S | 11 +++
+ xen/arch/x86/x86_64/entry.S | 149 +++++++++++++++++++++++++++-
+ xen/include/asm-x86/asm_defns.h | 30 ++++++
+ xen/include/asm-x86/current.h | 12 +++
+ xen/include/asm-x86/processor.h | 1 +
+ xen/include/asm-x86/x86_64/page.h | 5 +-
+ 10 files changed, 428 insertions(+), 6 deletions(-)
+
+diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
+index 07b50315b9..c0f0fc7a32 100644
+--- a/xen/arch/x86/domain.c
++++ b/xen/arch/x86/domain.c
+@@ -1933,6 +1933,9 @@ static void paravirt_ctxt_switch_to(struct vcpu *v)
+
+ switch_kernel_stack(v);
+
++ this_cpu(root_pgt)[root_table_offset(PERDOMAIN_VIRT_START)] =
++ l4e_from_page(v->domain->arch.perdomain_l3_pg, __PAGE_HYPERVISOR_RW);
++
+ cr4 = pv_guest_cr4_to_real_cr4(v);
+ if ( unlikely(cr4 != read_cr4()) )
+ write_cr4(cr4);
+@@ -2102,6 +2105,8 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
+
+ ASSERT(local_irq_is_enabled());
+
++ get_cpu_info()->xen_cr3 = 0;
++
+ cpumask_copy(&dirty_mask, next->vcpu_dirty_cpumask);
+ /* Allow at most one CPU at a time to be dirty. */
+ ASSERT(cpumask_weight(&dirty_mask) <= 1);
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 981458907f..78f4cb37f5 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -3906,6 +3906,7 @@ long do_mmu_update(
+ struct vcpu *curr = current, *v = curr;
+ struct domain *d = v->domain, *pt_owner = d, *pg_owner;
+ struct domain_mmap_cache mapcache;
++ bool sync_guest = false;
+ uint32_t xsm_needed = 0;
+ uint32_t xsm_checked = 0;
+ int rc = put_old_guest_table(curr);
+@@ -4054,6 +4055,8 @@ long do_mmu_update(
+ case PGT_l4_page_table:
+ rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
+ cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
++ if ( !rc )
++ sync_guest = true;
+ break;
+ case PGT_writable_page:
+ perfc_incr(writable_mmu_updates);
+@@ -4156,6 +4159,24 @@ long do_mmu_update(
+
+ domain_mmap_cache_destroy(&mapcache);
+
++ if ( sync_guest )
++ {
++ /*
++ * Force other vCPU-s of the affected guest to pick up L4 entry
++ * changes (if any). Issue a flush IPI with empty operation mask to
++ * facilitate this (including ourselves waiting for the IPI to
++ * actually have arrived). Utilize the fact that FLUSH_VA_VALID is
++ * meaningless without FLUSH_CACHE, but will allow to pass the no-op
++ * check in flush_area_mask().
++ */
++ unsigned int cpu = smp_processor_id();
++ cpumask_t *mask = per_cpu(scratch_cpumask, cpu);
++
++ cpumask_andnot(mask, pt_owner->domain_dirty_cpumask, cpumask_of(cpu));
++ if ( !cpumask_empty(mask) )
++ flush_area_mask(mask, ZERO_BLOCK_PTR, FLUSH_VA_VALID);
++ }
++
+ perfc_add(num_page_updates, i);
+
+ out:
+diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
+index 26b5301dcc..965a49f923 100644
+--- a/xen/arch/x86/smpboot.c
++++ b/xen/arch/x86/smpboot.c
+@@ -321,6 +321,9 @@ void start_secondary(void *unused)
+ */
+ spin_debug_disable();
+
++ get_cpu_info()->xen_cr3 = 0;
++ get_cpu_info()->pv_cr3 = __pa(this_cpu(root_pgt));
++
+ load_system_tables();
+
+ /* Full exception support from here on in. */
+@@ -635,6 +638,187 @@ void cpu_exit_clear(unsigned int cpu)
+ set_cpu_state(CPU_STATE_DEAD);
+ }
+
++static int clone_mapping(const void *ptr, root_pgentry_t *rpt)
++{
++ unsigned long linear = (unsigned long)ptr, pfn;
++ unsigned int flags;
++ l3_pgentry_t *pl3e = l4e_to_l3e(idle_pg_table[root_table_offset(linear)]) +
++ l3_table_offset(linear);
++ l2_pgentry_t *pl2e;
++ l1_pgentry_t *pl1e;
++
++ if ( linear < DIRECTMAP_VIRT_START )
++ return 0;
++
++ flags = l3e_get_flags(*pl3e);
++ ASSERT(flags & _PAGE_PRESENT);
++ if ( flags & _PAGE_PSE )
++ {
++ pfn = (l3e_get_pfn(*pl3e) & ~((1UL << (2 * PAGETABLE_ORDER)) - 1)) |
++ (PFN_DOWN(linear) & ((1UL << (2 * PAGETABLE_ORDER)) - 1));
++ flags &= ~_PAGE_PSE;
++ }
++ else
++ {
++ pl2e = l3e_to_l2e(*pl3e) + l2_table_offset(linear);
++ flags = l2e_get_flags(*pl2e);
++ ASSERT(flags & _PAGE_PRESENT);
++ if ( flags & _PAGE_PSE )
++ {
++ pfn = (l2e_get_pfn(*pl2e) & ~((1UL << PAGETABLE_ORDER) - 1)) |
++ (PFN_DOWN(linear) & ((1UL << PAGETABLE_ORDER) - 1));
++ flags &= ~_PAGE_PSE;
++ }
++ else
++ {
++ pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(linear);
++ flags = l1e_get_flags(*pl1e);
++ if ( !(flags & _PAGE_PRESENT) )
++ return 0;
++ pfn = l1e_get_pfn(*pl1e);
++ }
++ }
++
++ if ( !(root_get_flags(rpt[root_table_offset(linear)]) & _PAGE_PRESENT) )
++ {
++ pl3e = alloc_xen_pagetable();
++ if ( !pl3e )
++ return -ENOMEM;
++ clear_page(pl3e);
++ l4e_write(&rpt[root_table_offset(linear)],
++ l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR));
++ }
++ else
++ pl3e = l4e_to_l3e(rpt[root_table_offset(linear)]);
++
++ pl3e += l3_table_offset(linear);
++
++ if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
++ {
++ pl2e = alloc_xen_pagetable();
++ if ( !pl2e )
++ return -ENOMEM;
++ clear_page(pl2e);
++ l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR));
++ }
++ else
++ {
++ ASSERT(!(l3e_get_flags(*pl3e) & _PAGE_PSE));
++ pl2e = l3e_to_l2e(*pl3e);
++ }
++
++ pl2e += l2_table_offset(linear);
++
++ if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
++ {
++ pl1e = alloc_xen_pagetable();
++ if ( !pl1e )
++ return -ENOMEM;
++ clear_page(pl1e);
++ l2e_write(pl2e, l2e_from_paddr(__pa(pl1e), __PAGE_HYPERVISOR));
++ }
++ else
++ {
++ ASSERT(!(l2e_get_flags(*pl2e) & _PAGE_PSE));
++ pl1e = l2e_to_l1e(*pl2e);
++ }
++
++ pl1e += l1_table_offset(linear);
++
++ if ( l1e_get_flags(*pl1e) & _PAGE_PRESENT )
++ {
++ ASSERT(l1e_get_pfn(*pl1e) == pfn);
++ ASSERT(l1e_get_flags(*pl1e) == flags);
++ }
++ else
++ l1e_write(pl1e, l1e_from_pfn(pfn, flags));
++
++ return 0;
++}
++
++DEFINE_PER_CPU(root_pgentry_t *, root_pgt);
++
++static int setup_cpu_root_pgt(unsigned int cpu)
++{
++ root_pgentry_t *rpt = alloc_xen_pagetable();
++ unsigned int off;
++ int rc;
++
++ if ( !rpt )
++ return -ENOMEM;
++
++ clear_page(rpt);
++ per_cpu(root_pgt, cpu) = rpt;
++
++ rpt[root_table_offset(RO_MPT_VIRT_START)] =
++ idle_pg_table[root_table_offset(RO_MPT_VIRT_START)];
++ /* SH_LINEAR_PT inserted together with guest mappings. */
++ /* PERDOMAIN inserted during context switch. */
++ rpt[root_table_offset(XEN_VIRT_START)] =
++ idle_pg_table[root_table_offset(XEN_VIRT_START)];
++
++ /* Install direct map page table entries for stack, IDT, and TSS. */
++ for ( off = rc = 0; !rc && off < STACK_SIZE; off += PAGE_SIZE )
++ rc = clone_mapping(__va(__pa(stack_base[cpu])) + off, rpt);
++
++ if ( !rc )
++ rc = clone_mapping(idt_tables[cpu], rpt);
++ if ( !rc )
++ rc = clone_mapping(&per_cpu(init_tss, cpu), rpt);
++
++ return rc;
++}
++
++static void cleanup_cpu_root_pgt(unsigned int cpu)
++{
++ root_pgentry_t *rpt = per_cpu(root_pgt, cpu);
++ unsigned int r;
++
++ if ( !rpt )
++ return;
++
++ per_cpu(root_pgt, cpu) = NULL;
++
++ for ( r = root_table_offset(DIRECTMAP_VIRT_START);
++ r < root_table_offset(HYPERVISOR_VIRT_END); ++r )
++ {
++ l3_pgentry_t *l3t;
++ unsigned int i3;
++
++ if ( !(root_get_flags(rpt[r]) & _PAGE_PRESENT) )
++ continue;
++
++ l3t = l4e_to_l3e(rpt[r]);
++
++ for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; ++i3 )
++ {
++ l2_pgentry_t *l2t;
++ unsigned int i2;
++
++ if ( !(l3e_get_flags(l3t[i3]) & _PAGE_PRESENT) )
++ continue;
++
++ ASSERT(!(l3e_get_flags(l3t[i3]) & _PAGE_PSE));
++ l2t = l3e_to_l2e(l3t[i3]);
++
++ for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; ++i2 )
++ {
++ if ( !(l2e_get_flags(l2t[i2]) & _PAGE_PRESENT) )
++ continue;
++
++ ASSERT(!(l2e_get_flags(l2t[i2]) & _PAGE_PSE));
++ free_xen_pagetable(l2e_to_l1e(l2t[i2]));
++ }
++
++ free_xen_pagetable(l2t);
++ }
++
++ free_xen_pagetable(l3t);
++ }
++
++ free_xen_pagetable(rpt);
++}
++
+ static void cpu_smpboot_free(unsigned int cpu)
+ {
+ unsigned int order, socket = cpu_to_socket(cpu);
+@@ -673,6 +857,8 @@ static void cpu_smpboot_free(unsigned int cpu)
+ free_domheap_page(mfn_to_page(mfn));
+ }
+
++ cleanup_cpu_root_pgt(cpu);
++
+ order = get_order_from_pages(NR_RESERVED_GDT_PAGES);
+ free_xenheap_pages(per_cpu(gdt_table, cpu), order);
+
+@@ -728,6 +914,9 @@ static int cpu_smpboot_alloc(unsigned int cpu)
+ set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE);
+ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
+
++ if ( setup_cpu_root_pgt(cpu) )
++ goto oom;
++
+ for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
+ i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
+ if ( cpu_online(i) && cpu_to_node(i) == node )
+@@ -783,6 +972,8 @@ static struct notifier_block cpu_smpboot_nfb = {
+
+ void __init smp_prepare_cpus(unsigned int max_cpus)
+ {
++ int rc;
++
+ register_cpu_notifier(&cpu_smpboot_nfb);
+
+ mtrr_aps_sync_begin();
+@@ -796,6 +987,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
+
+ stack_base[0] = stack_start;
+
++ rc = setup_cpu_root_pgt(0);
++ if ( rc )
++ panic("Error %d setting up PV root page table\n", rc);
++ get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0));
++
+ set_nr_sockets();
+
+ socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets);
+@@ -865,6 +1061,8 @@ void __init smp_prepare_boot_cpu(void)
+ #if NR_CPUS > 2 * BITS_PER_LONG
+ per_cpu(scratch_cpumask, cpu) = &scratch_cpu0mask;
+ #endif
++
++ get_cpu_info()->xen_cr3 = 0;
+ }
+
+ static void
+diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
+index e136af6b99..b1a4310974 100644
+--- a/xen/arch/x86/x86_64/asm-offsets.c
++++ b/xen/arch/x86/x86_64/asm-offsets.c
+@@ -137,6 +137,8 @@ void __dummy__(void)
+ OFFSET(CPUINFO_processor_id, struct cpu_info, processor_id);
+ OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu);
+ OFFSET(CPUINFO_cr4, struct cpu_info, cr4);
++ OFFSET(CPUINFO_xen_cr3, struct cpu_info, xen_cr3);
++ OFFSET(CPUINFO_pv_cr3, struct cpu_info, pv_cr3);
+ DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info));
+ BLANK();
+
+diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S
+index 37864a67f3..86ab78063a 100644
+--- a/xen/arch/x86/x86_64/compat/entry.S
++++ b/xen/arch/x86/x86_64/compat/entry.S
+@@ -197,6 +197,17 @@ ENTRY(cstar_enter)
+ pushq $0
+ movl $TRAP_syscall, 4(%rsp)
+ SAVE_ALL
++
++ GET_STACK_END(bx)
++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx
++ neg %rcx
++ jz .Lcstar_cr3_okay
++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
++ neg %rcx
++ write_cr3 rcx, rdi, rsi
++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
++.Lcstar_cr3_okay:
++
+ GET_CURRENT(bx)
+ movq VCPU_domain(%rbx),%rcx
+ cmpb $0,DOMAIN_is_32bit_pv(%rcx)
+diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
+index 668bf8ac28..16cf095ee1 100644
+--- a/xen/arch/x86/x86_64/entry.S
++++ b/xen/arch/x86/x86_64/entry.S
+@@ -35,6 +35,32 @@ ENTRY(switch_to_kernel)
+ /* %rbx: struct vcpu, interrupts disabled */
+ restore_all_guest:
+ ASSERT_INTERRUPTS_DISABLED
++
++ /* Copy guest mappings and switch to per-CPU root page table. */
++ mov %cr3, %r9
++ GET_STACK_END(dx)
++ mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi
++ movabs $PADDR_MASK & PAGE_MASK, %rsi
++ movabs $DIRECTMAP_VIRT_START, %rcx
++ mov %rdi, %rax
++ and %rsi, %rdi
++ and %r9, %rsi
++ add %rcx, %rdi
++ add %rcx, %rsi
++ mov $ROOT_PAGETABLE_FIRST_XEN_SLOT, %ecx
++ mov root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rsi), %r8
++ mov %r8, root_table_offset(SH_LINEAR_PT_VIRT_START)*8(%rdi)
++ rep movsq
++ mov $ROOT_PAGETABLE_ENTRIES - \
++ ROOT_PAGETABLE_LAST_XEN_SLOT - 1, %ecx
++ sub $(ROOT_PAGETABLE_FIRST_XEN_SLOT - \
++ ROOT_PAGETABLE_LAST_XEN_SLOT - 1) * 8, %rsi
++ sub $(ROOT_PAGETABLE_FIRST_XEN_SLOT - \
++ ROOT_PAGETABLE_LAST_XEN_SLOT - 1) * 8, %rdi
++ rep movsq
++ mov %r9, STACK_CPUINFO_FIELD(xen_cr3)(%rdx)
++ write_cr3 rax, rdi, rsi
++
+ RESTORE_ALL
+ testw $TRAP_syscall,4(%rsp)
+ jz iret_exit_to_guest
+@@ -69,6 +95,22 @@ iret_exit_to_guest:
+ ALIGN
+ /* No special register assumptions. */
+ restore_all_xen:
++ /*
++ * Check whether we need to switch to the per-CPU page tables, in
++ * case we return to late PV exit code (from an NMI or #MC).
++ */
++ GET_STACK_END(ax)
++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rax), %rdx
++ mov STACK_CPUINFO_FIELD(pv_cr3)(%rax), %rax
++ test %rdx, %rdx
++ /*
++ * Ideally the condition would be "nsz", but such doesn't exist,
++ * so "g" will have to do.
++ */
++UNLIKELY_START(g, exit_cr3)
++ write_cr3 rax, rdi, rsi
++UNLIKELY_END(exit_cr3)
++
+ RESTORE_ALL adj=8
+ iretq
+
+@@ -98,7 +140,18 @@ ENTRY(lstar_enter)
+ pushq $0
+ movl $TRAP_syscall, 4(%rsp)
+ SAVE_ALL
+- GET_CURRENT(bx)
++
++ GET_STACK_END(bx)
++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx
++ neg %rcx
++ jz .Llstar_cr3_okay
++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
++ neg %rcx
++ write_cr3 rcx, rdi, rsi
++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
++.Llstar_cr3_okay:
++
++ __GET_CURRENT(bx)
+ testb $TF_kernel_mode,VCPU_thread_flags(%rbx)
+ jz switch_to_kernel
+
+@@ -190,7 +243,18 @@ GLOBAL(sysenter_eflags_saved)
+ pushq $0
+ movl $TRAP_syscall, 4(%rsp)
+ SAVE_ALL
+- GET_CURRENT(bx)
++
++ GET_STACK_END(bx)
++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx
++ neg %rcx
++ jz .Lsyse_cr3_okay
++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
++ neg %rcx
++ write_cr3 rcx, rdi, rsi
++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
++.Lsyse_cr3_okay:
++
++ __GET_CURRENT(bx)
+ cmpb $0,VCPU_sysenter_disables_events(%rbx)
+ movq VCPU_sysenter_addr(%rbx),%rax
+ setne %cl
+@@ -226,13 +290,23 @@ ENTRY(int80_direct_trap)
+ movl $0x80, 4(%rsp)
+ SAVE_ALL
+
++ GET_STACK_END(bx)
++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rcx
++ neg %rcx
++ jz .Lint80_cr3_okay
++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
++ neg %rcx
++ write_cr3 rcx, rdi, rsi
++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%rbx)
++.Lint80_cr3_okay:
++
+ cmpb $0,untrusted_msi(%rip)
+ UNLIKELY_START(ne, msi_check)
+ movl $0x80,%edi
+ call check_for_unexpected_msi
+ UNLIKELY_END(msi_check)
+
+- GET_CURRENT(bx)
++ __GET_CURRENT(bx)
+
+ /* Check that the callback is non-null. */
+ leaq VCPU_int80_bounce(%rbx),%rdx
+@@ -389,9 +463,27 @@ ENTRY(dom_crash_sync_extable)
+
+ ENTRY(common_interrupt)
+ SAVE_ALL CLAC
++
++ GET_STACK_END(14)
++ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx
++ mov %rcx, %r15
++ neg %rcx
++ jz .Lintr_cr3_okay
++ jns .Lintr_cr3_load
++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
++ neg %rcx
++.Lintr_cr3_load:
++ write_cr3 rcx, rdi, rsi
++ xor %ecx, %ecx
++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
++ testb $3, UREGS_cs(%rsp)
++ cmovnz %rcx, %r15
++.Lintr_cr3_okay:
++
+ CR4_PV32_RESTORE
+ movq %rsp,%rdi
+ callq do_IRQ
++ mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+ jmp ret_from_intr
+
+ /* No special register assumptions. */
+@@ -409,6 +501,23 @@ ENTRY(page_fault)
+ /* No special register assumptions. */
+ GLOBAL(handle_exception)
+ SAVE_ALL CLAC
++
++ GET_STACK_END(14)
++ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx
++ mov %rcx, %r15
++ neg %rcx
++ jz .Lxcpt_cr3_okay
++ jns .Lxcpt_cr3_load
++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
++ neg %rcx
++.Lxcpt_cr3_load:
++ write_cr3 rcx, rdi, rsi
++ xor %ecx, %ecx
++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
++ testb $3, UREGS_cs(%rsp)
++ cmovnz %rcx, %r15
++.Lxcpt_cr3_okay:
++
+ handle_exception_saved:
+ GET_CURRENT(bx)
+ testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp)
+@@ -473,6 +582,7 @@ handle_exception_saved:
+ leaq exception_table(%rip),%rdx
+ PERFC_INCR(exceptions, %rax, %rbx)
+ callq *(%rdx,%rax,8)
++ mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+ testb $3,UREGS_cs(%rsp)
+ jz restore_all_xen
+ leaq VCPU_trap_bounce(%rbx),%rdx
+@@ -505,6 +615,7 @@ exception_with_ints_disabled:
+ rep; movsq # make room for ec/ev
+ 1: movq UREGS_error_code(%rsp),%rax # ec/ev
+ movq %rax,UREGS_kernel_sizeof(%rsp)
++ mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+ jmp restore_all_xen # return to fixup code
+
+ /* No special register assumptions. */
+@@ -583,6 +694,17 @@ ENTRY(double_fault)
+ movl $TRAP_double_fault,4(%rsp)
+ /* Set AC to reduce chance of further SMAP faults */
+ SAVE_ALL STAC
++
++ GET_STACK_END(bx)
++ mov STACK_CPUINFO_FIELD(xen_cr3)(%rbx), %rbx
++ test %rbx, %rbx
++ jz .Ldblf_cr3_okay
++ jns .Ldblf_cr3_load
++ neg %rbx
++.Ldblf_cr3_load:
++ write_cr3 rbx, rdi, rsi
++.Ldblf_cr3_okay:
++
+ movq %rsp,%rdi
+ call do_double_fault
+ BUG /* do_double_fault() shouldn't return. */
+@@ -601,10 +723,28 @@ ENTRY(nmi)
+ movl $TRAP_nmi,4(%rsp)
+ handle_ist_exception:
+ SAVE_ALL CLAC
++
++ GET_STACK_END(14)
++ mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx
++ mov %rcx, %r15
++ neg %rcx
++ jz .List_cr3_okay
++ jns .List_cr3_load
++ mov %rcx, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
++ neg %rcx
++.List_cr3_load:
++ write_cr3 rcx, rdi, rsi
++ movq $0, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
++.List_cr3_okay:
++
+ CR4_PV32_RESTORE
+ testb $3,UREGS_cs(%rsp)
+ jz 1f
+- /* Interrupted guest context. Copy the context to stack bottom. */
++ /*
++ * Interrupted guest context. Clear the restore value for xen_cr3
++ * and copy the context to stack bottom.
++ */
++ xor %r15, %r15
+ GET_CPUINFO_FIELD(guest_cpu_user_regs,di)
+ movq %rsp,%rsi
+ movl $UREGS_kernel_sizeof/8,%ecx
+@@ -614,6 +754,7 @@ handle_ist_exception:
+ movzbl UREGS_entry_vector(%rsp),%eax
+ leaq exception_table(%rip),%rdx
+ callq *(%rdx,%rax,8)
++ mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+ cmpb $TRAP_nmi,UREGS_entry_vector(%rsp)
+ jne ret_from_intr
+
+diff --git a/xen/include/asm-x86/asm_defns.h b/xen/include/asm-x86/asm_defns.h
+index 98192eb4e6..fb0fee9286 100644
+--- a/xen/include/asm-x86/asm_defns.h
++++ b/xen/include/asm-x86/asm_defns.h
+@@ -93,9 +93,30 @@ void ret_from_intr(void);
+ UNLIKELY_DONE(mp, tag); \
+ __UNLIKELY_END(tag)
+
++ .equ .Lrax, 0
++ .equ .Lrcx, 1
++ .equ .Lrdx, 2
++ .equ .Lrbx, 3
++ .equ .Lrsp, 4
++ .equ .Lrbp, 5
++ .equ .Lrsi, 6
++ .equ .Lrdi, 7
++ .equ .Lr8, 8
++ .equ .Lr9, 9
++ .equ .Lr10, 10
++ .equ .Lr11, 11
++ .equ .Lr12, 12
++ .equ .Lr13, 13
++ .equ .Lr14, 14
++ .equ .Lr15, 15
++
+ #define STACK_CPUINFO_FIELD(field) (1 - CPUINFO_sizeof + CPUINFO_##field)
+ #define GET_STACK_END(reg) \
++ .if .Lr##reg > 8; \
++ movq $STACK_SIZE-1, %r##reg; \
++ .else; \
+ movl $STACK_SIZE-1, %e##reg; \
++ .endif; \
+ orq %rsp, %r##reg
+
+ #define GET_CPUINFO_FIELD(field, reg) \
+@@ -177,6 +198,15 @@ void ret_from_intr(void);
+ #define ASM_STAC ASM_AC(STAC)
+ #define ASM_CLAC ASM_AC(CLAC)
+
++.macro write_cr3 val:req, tmp1:req, tmp2:req
++ mov %cr4, %\tmp1
++ mov %\tmp1, %\tmp2
++ and $~X86_CR4_PGE, %\tmp1
++ mov %\tmp1, %cr4
++ mov %\val, %cr3
++ mov %\tmp2, %cr4
++.endm
++
+ #define CR4_PV32_RESTORE \
+ 667: ASM_NOP5; \
+ .pushsection .altinstr_replacement, "ax"; \
+diff --git a/xen/include/asm-x86/current.h b/xen/include/asm-x86/current.h
+index 89849929eb..b929c48c85 100644
+--- a/xen/include/asm-x86/current.h
++++ b/xen/include/asm-x86/current.h
+@@ -41,6 +41,18 @@ struct cpu_info {
+ struct vcpu *current_vcpu;
+ unsigned long per_cpu_offset;
+ unsigned long cr4;
++ /*
++ * Of the two following fields the latter is being set to the CR3 value
++ * to be used on the given pCPU for loading whenever 64-bit PV guest
++ * context is being entered. The value never changes once set.
++ * The former is the value to restore when re-entering Xen, if any. IOW
++ * its value being zero means there's nothing to restore. However, its
++ * value can also be negative, indicating to the exit-to-Xen code that
++ * restoring is not necessary, but allowing any nested entry code paths
++ * to still know the value to put back into CR3.
++ */
++ unsigned long xen_cr3;
++ unsigned long pv_cr3;
+ /* get_stack_bottom() must be 16-byte aligned */
+ };
+
+diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
+index 00cc23ce40..0291e82de3 100644
+--- a/xen/include/asm-x86/processor.h
++++ b/xen/include/asm-x86/processor.h
+@@ -466,6 +466,7 @@ extern idt_entry_t idt_table[];
+ extern idt_entry_t *idt_tables[];
+
+ DECLARE_PER_CPU(struct tss_struct, init_tss);
++DECLARE_PER_CPU(root_pgentry_t *, root_pgt);
+
+ extern void init_int80_direct_trap(struct vcpu *v);
+
+diff --git a/xen/include/asm-x86/x86_64/page.h b/xen/include/asm-x86/x86_64/page.h
+index 1a6cae6283..749554fbbb 100644
+--- a/xen/include/asm-x86/x86_64/page.h
++++ b/xen/include/asm-x86/x86_64/page.h
+@@ -25,8 +25,8 @@
+ /* These are architectural limits. Current CPUs support only 40-bit phys. */
+ #define PADDR_BITS 52
+ #define VADDR_BITS 48
+-#define PADDR_MASK ((1UL << PADDR_BITS)-1)
+-#define VADDR_MASK ((1UL << VADDR_BITS)-1)
++#define PADDR_MASK ((_AC(1,UL) << PADDR_BITS) - 1)
++#define VADDR_MASK ((_AC(1,UL) << VADDR_BITS) - 1)
+
+ #define is_canonical_address(x) (((long)(x) >> 47) == ((long)(x) >> 63))
+
+@@ -116,6 +116,7 @@ typedef l4_pgentry_t root_pgentry_t;
+ : (((_s) < ROOT_PAGETABLE_FIRST_XEN_SLOT) || \
+ ((_s) > ROOT_PAGETABLE_LAST_XEN_SLOT)))
+
++#define root_table_offset l4_table_offset
+ #define root_get_pfn l4e_get_pfn
+ #define root_get_flags l4e_get_flags
+ #define root_get_intpte l4e_get_intpte
+--
+2.15.0
+
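One detail worth calling out from the hunks above: the write_cr3 assembler macro clears CR4.PGE around the CR3 load, so the page-table switch flushes global TLB entries in addition to the non-global ones a plain CR3 write would flush. A rough C equivalent, assuming the read_cr4()/write_cr4() helpers already used in this patch (sketch only, not part of the series):

    /* Sketch: load CR3 while also flushing global TLB entries.
     * Toggling CR4.PGE invalidates the entire TLB, global entries included. */
    static inline void write_cr3_full_flush(unsigned long val)
    {
        unsigned long cr4 = read_cr4();

        write_cr4(cr4 & ~X86_CR4_PGE);  /* drop PGE: full TLB flush */
        asm volatile ( "mov %0, %%cr3" :: "r" (val) : "memory" );
        write_cr4(cr4);                 /* restore PGE */
    }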
diff --git a/main/xen/0004-x86-allow-Meltdown-band-aid-to-be-disabled.patch b/main/xen/0004-x86-allow-Meltdown-band-aid-to-be-disabled.patch
new file mode 100644
index 0000000000..d8bcd3a0b0
--- /dev/null
+++ b/main/xen/0004-x86-allow-Meltdown-band-aid-to-be-disabled.patch
@@ -0,0 +1,164 @@
+From 5ab23f2997a50f2f66acef6437ca50df35a32bb6 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Wed, 17 Jan 2018 17:08:25 +0100
+Subject: [PATCH 4/4] x86: allow Meltdown band-aid to be disabled
+
+First of all we don't need it on AMD systems. Additionally allow its use
+to be controlled by command line option. For best backportability, this
+intentionally doesn't use alternative instruction patching to achieve
+the intended effect - while we likely want it, this will be later
+follow-up.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: e871e80c38547d9faefc6604532ba3e985e65873
+master date: 2018-01-16 17:50:59 +0100
+(cherry picked from commit dc7d46580d9c633a59be1c3776f79c01dd0cb98b)
+---
+ docs/misc/xen-command-line.markdown | 12 ++++++++++++
+ xen/arch/x86/domain.c | 7 +++++--
+ xen/arch/x86/mm.c | 2 +-
+ xen/arch/x86/smpboot.c | 17 ++++++++++++++---
+ xen/arch/x86/x86_64/entry.S | 2 ++
+ 5 files changed, 34 insertions(+), 6 deletions(-)
+
+diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
+index 0202b1643d..587cdb9196 100644
+--- a/docs/misc/xen-command-line.markdown
++++ b/docs/misc/xen-command-line.markdown
+@@ -1791,6 +1791,18 @@ In the case that x2apic is in use, this option switches between physical and
+ clustered mode. The default, given no hint from the **FADT**, is cluster
+ mode.
+
++### xpti
++> `= <boolean>`
++
++> Default: `false` on AMD hardware
++> Default: `true` everywhere else
++
++Override default selection of whether to isolate 64-bit PV guest page
++tables.
++
++** WARNING: Not yet a complete isolation implementation, but better than
++nothing. **
++
+ ### xsave
+ > `= <boolean>`
+
+diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
+index c0f0fc7a32..069c314ee5 100644
+--- a/xen/arch/x86/domain.c
++++ b/xen/arch/x86/domain.c
+@@ -1929,12 +1929,15 @@ static void paravirt_ctxt_switch_from(struct vcpu *v)
+
+ static void paravirt_ctxt_switch_to(struct vcpu *v)
+ {
++ root_pgentry_t *root_pgt = this_cpu(root_pgt);
+ unsigned long cr4;
+
+ switch_kernel_stack(v);
+
+- this_cpu(root_pgt)[root_table_offset(PERDOMAIN_VIRT_START)] =
+- l4e_from_page(v->domain->arch.perdomain_l3_pg, __PAGE_HYPERVISOR_RW);
++ if ( root_pgt )
++ root_pgt[root_table_offset(PERDOMAIN_VIRT_START)] =
++ l4e_from_page(v->domain->arch.perdomain_l3_pg,
++ __PAGE_HYPERVISOR_RW);
+
+ cr4 = pv_guest_cr4_to_real_cr4(v);
+ if ( unlikely(cr4 != read_cr4()) )
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 78f4cb37f5..2f38a6c195 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -4056,7 +4056,7 @@ long do_mmu_update(
+ rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
+ cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
+ if ( !rc )
+- sync_guest = true;
++ sync_guest = this_cpu(root_pgt);
+ break;
+ case PGT_writable_page:
+ perfc_incr(writable_mmu_updates);
+diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
+index 965a49f923..bb033684ef 100644
+--- a/xen/arch/x86/smpboot.c
++++ b/xen/arch/x86/smpboot.c
+@@ -322,7 +322,7 @@ void start_secondary(void *unused)
+ spin_debug_disable();
+
+ get_cpu_info()->xen_cr3 = 0;
+- get_cpu_info()->pv_cr3 = __pa(this_cpu(root_pgt));
++ get_cpu_info()->pv_cr3 = this_cpu(root_pgt) ? __pa(this_cpu(root_pgt)) : 0;
+
+ load_system_tables();
+
+@@ -736,14 +736,20 @@ static int clone_mapping(const void *ptr, root_pgentry_t *rpt)
+ return 0;
+ }
+
++static __read_mostly int8_t opt_xpti = -1;
++boolean_param("xpti", opt_xpti);
+ DEFINE_PER_CPU(root_pgentry_t *, root_pgt);
+
+ static int setup_cpu_root_pgt(unsigned int cpu)
+ {
+- root_pgentry_t *rpt = alloc_xen_pagetable();
++ root_pgentry_t *rpt;
+ unsigned int off;
+ int rc;
+
++ if ( !opt_xpti )
++ return 0;
++
++ rpt = alloc_xen_pagetable();
+ if ( !rpt )
+ return -ENOMEM;
+
+@@ -987,10 +993,14 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
+
+ stack_base[0] = stack_start;
+
++ if ( opt_xpti < 0 )
++ opt_xpti = boot_cpu_data.x86_vendor != X86_VENDOR_AMD;
++
+ rc = setup_cpu_root_pgt(0);
+ if ( rc )
+ panic("Error %d setting up PV root page table\n", rc);
+- get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0));
++ if ( per_cpu(root_pgt, 0) )
++ get_cpu_info()->pv_cr3 = __pa(per_cpu(root_pgt, 0));
+
+ set_nr_sockets();
+
+@@ -1063,6 +1073,7 @@ void __init smp_prepare_boot_cpu(void)
+ #endif
+
+ get_cpu_info()->xen_cr3 = 0;
++ get_cpu_info()->pv_cr3 = 0;
+ }
+
+ static void
+diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
+index 16cf095ee1..5f9ce2d6b7 100644
+--- a/xen/arch/x86/x86_64/entry.S
++++ b/xen/arch/x86/x86_64/entry.S
+@@ -44,6 +44,7 @@ restore_all_guest:
+ movabs $DIRECTMAP_VIRT_START, %rcx
+ mov %rdi, %rax
+ and %rsi, %rdi
++ jz .Lrag_keep_cr3
+ and %r9, %rsi
+ add %rcx, %rdi
+ add %rcx, %rsi
+@@ -60,6 +61,7 @@ restore_all_guest:
+ rep movsq
+ mov %r9, STACK_CPUINFO_FIELD(xen_cr3)(%rdx)
+ write_cr3 rax, rdi, rsi
++.Lrag_keep_cr3:
+
+ RESTORE_ALL
+ testw $TRAP_syscall,4(%rsp)
+--
+2.15.0
+
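With this patch the whole band-aid is gated on the new xpti= boolean (default off on AMD hardware, on everywhere else); administrators can force it either way, e.g. by appending xpti=false to the hypervisor command line. When it is off, no per-CPU root page table is allocated and pv_cr3 stays zero, which the exit path treats as "keep the current CR3". A C paraphrase of that gating (sketch only; switch_to_percpu_root_pgt() is a hypothetical stand-in for the assembly in restore_all_guest):

    /* Sketch of the exit-to-guest decision after this patch. */
    if ( get_cpu_info()->pv_cr3 )        /* zero when XPTI is disabled */
        switch_to_percpu_root_pgt();     /* hypothetical: the asm above */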
diff --git a/main/xen/APKBUILD b/main/xen/APKBUILD
index bb02b2bee9..64289fb261 100644
--- a/main/xen/APKBUILD
+++ b/main/xen/APKBUILD
@@ -3,7 +3,7 @@
# Maintainer: William Pitcock <nenolod@dereferenced.org>
pkgname=xen
pkgver=4.9.1
-pkgrel=1
+pkgrel=2
pkgdesc="Xen hypervisor"
url="http://www.xen.org/"
arch="x86_64 armhf aarch64"
@@ -101,6 +101,8 @@ options="!strip"
# 4.9.1-r1:
# - XSA-246
# - XSA-247
+# 4.9.1-r2:
+# - XSA-254 XPTI
case "$CARCH" in
x86*)
@@ -152,6 +154,11 @@ source="https://downloads.xenproject.org/release/$pkgname/$pkgver/$pkgname-$pkgv
xsa247-4.9-1.patch
xsa247-4.9-2.patch
+ 0001-x86-entry-Remove-support-for-partial-cpu_user_regs-f.patch
+ 0002-x86-mm-Always-set-_PAGE_ACCESSED-on-L4e-updates.patch
+ 0003-x86-Meltdown-band-aid-against-malicious-64-bit-PV-gu.patch
+ 0004-x86-allow-Meltdown-band-aid-to-be-disabled.patch
+
qemu-coroutine-gthread.patch
qemu-xen_paths.patch
@@ -413,6 +420,10 @@ c2bc9ffc8583aeae71cee9ddcc4418969768d4e3764d47307da54f93981c0109fb07d84b061b3a36
b00f42d2069f273e204698177d2c36950cee759a92dfe7833c812ddff4dedde2c4a842980927ec4fc46d1f54b49879bf3a3681c6faf30b72fb3ad6a7eba060b2 xsa246-4.9.patch
c5e064543048751fda86ce64587493518da87d219ff077abb83ac13d8381ceb29f1b6479fc0b761b8f7a04c8c70203791ac4a8cc79bbc6f4dcfa6661c4790c5e xsa247-4.9-1.patch
71aefbe27cbd1d1d363b7d5826c69a238e4aad2958a1c6da330ae5daee791f54ce1d01fb79db84ed4248ab8b1593c9c28c3de5108f4d0953b04f7819af23a1d1 xsa247-4.9-2.patch
+cda45e5a564e429a1299f07ea496b0e0614f6b2d71a5dcd24f5efdb571cc54d74d04c8e0766279fe2acb7d9bb9cf8505281d6c7ba2d6334009e14a10f83096ee 0001-x86-entry-Remove-support-for-partial-cpu_user_regs-f.patch
+bce07e4094ae3036dafdf9fe3aeb1f566281484e1398184d774af9ad371066c0e8af232b8d1ab5d450923fb482e6dea6dfb921976b87b20ab56a3f2b4486d0d4 0002-x86-mm-Always-set-_PAGE_ACCESSED-on-L4e-updates.patch
+ba09c54451fae35f3fc70e4f2a76791bc652ad373e87402ebc30c53f8e7db2368d52a9018cc28a5efcbcd77e85c9ae45d9580550f215a3f9bbf63bbd21ef938d 0003-x86-Meltdown-band-aid-against-malicious-64-bit-PV-gu.patch
+ff100f19972d55acae495752a05f5ad834819a4fb1a03547d8167a635ed5076b5ae923f57087ce3568f285718dd78149e2987cf811105906c9b45965878aba72 0004-x86-allow-Meltdown-band-aid-to-be-disabled.patch
c3c46f232f0bd9f767b232af7e8ce910a6166b126bd5427bb8dc325aeb2c634b956de3fc225cab5af72649070c8205cc8e1cab7689fc266c204f525086f1a562 qemu-coroutine-gthread.patch
1936ab39a1867957fa640eb81c4070214ca4856a2743ba7e49c0cd017917071a9680d015f002c57fa7b9600dbadd29dcea5887f50e6c133305df2669a7a933f3 qemu-xen_paths.patch
f095ea373f36381491ad36f0662fb4f53665031973721256b23166e596318581da7cbb0146d0beb2446729adfdb321e01468e377793f6563a67d68b8b0f7ffe3 hotplug-vif-vtrill.patch
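(The four new sha512sums entries above correspond to the added patch files; in an aports checkout they are typically regenerated by running abuild checksum from the main/xen directory rather than computed by hand.)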