main/xen: add leftover patches

author: Leonardo Arena <rnalrd@alpinelinux.org> 2018-06-11 10:03:06 +0000
committer: Leonardo Arena <rnalrd@alpinelinux.org> 2018-06-11 10:03:06 +0000
commit: dacd68bca253cc110bbcb2e6efcb4b15bf44b3f1 (patch)
tree: bc9f1754b07f7f6373077614228f9a7752c392c0
parent: 5f72054ca4ac3f0f8f05c17a83a9c203f580bddc (diff)
download: aports-dacd68bca253cc110bbcb2e6efcb4b15bf44b3f1.tar.bz2
aports-dacd68bca253cc110bbcb2e6efcb4b15bf44b3f1.tar.xz
7 files changed, 748 insertions, 1 deletions
diff --git a/main/xen/APKBUILD b/main/xen/APKBUILD
index 8521f69070..3958ff5032 100644
--- a/main/xen/APKBUILD
+++ b/main/xen/APKBUILD
@@ -3,7 +3,7 @@
 # Maintainer: William Pitcock <nenolod@dereferenced.org>
 pkgname=xen
 pkgver=4.10.1
-pkgrel=1
+pkgrel=2
 pkgdesc="Xen hypervisor"
 url="http://www.xen.org/"
 arch="x86_64 armhf aarch64"
diff --git a/main/xen/xsa260-1.patch b/main/xen/xsa260-1.patch
new file mode 100644
index 0000000000..21da59cddd
--- /dev/null
+++ b/main/xen/xsa260-1.patch
@@ -0,0 +1,72 @@
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/traps: Fix %dr6 handling in #DB handler
+
+Most bits in %dr6 accumulate, rather than being set directly based on the
+current source of #DB.  Have the handler follow the manuals' guidance, which
+avoids leaking hypervisor debugging activities into guest context.
+
+This is part of XSA-260 / CVE-2018-8897.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+--- a/xen/arch/x86/traps.c	2018-04-13 15:29:36.006747135 +0200
++++ b/xen/arch/x86/traps.c	2018-04-13 15:44:57.015516185 +0200
+@@ -1761,11 +1761,36 @@ static void ler_enable(void)
+ 
+ void do_debug(struct cpu_user_regs *regs)
+ {
++    unsigned long dr6;
+     struct vcpu *v = current;
+ 
++    /* Stash dr6 as early as possible. */
++    dr6 = read_debugreg(6);
++
+     if ( debugger_trap_entry(TRAP_debug, regs) )
+         return;
+ 
++    /*
++     * At the time of writing (March 2018), on the subject of %dr6:
++     *
++     * The Intel manual says:
++     *   Certain debug exceptions may clear bits 0-3. The remaining contents
++     *   of the DR6 register are never cleared by the processor. To avoid
++     *   confusion in identifying debug exceptions, debug handlers should
++     *   clear the register (except bit 16, which they should set) before
++     *   returning to the interrupted task.
++     *
++     * The AMD manual says:
++     *   Bits 15:13 of the DR6 register are not cleared by the processor and
++     *   must be cleared by software after the contents have been read.
++     *
++     * Some bits are reserved set, some are reserved clear, and some bits
++     * which were previously reserved set are reused and cleared by hardware.
++     * For future compatibility, reset to the default value, which will allow
++     * us to spot any bit being changed by hardware to its non-default value.
++     */
++    write_debugreg(6, X86_DR6_DEFAULT);
++
+     if ( !guest_mode(regs) )
+     {
+         if ( regs->eflags & X86_EFLAGS_TF )
+@@ -1798,7 +1823,8 @@ void do_debug(struct cpu_user_regs *regs
+     }
+ 
+     /* Save debug status register where guest OS can peek at it */
+-    v->arch.debugreg[6] = read_debugreg(6);
++    v->arch.debugreg[6] |= (dr6 & ~X86_DR6_DEFAULT);
++    v->arch.debugreg[6] &= (dr6 | ~X86_DR6_DEFAULT);
+ 
+     ler_enable();
+     pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
+--- a/xen/include/asm-x86/debugreg.h	2015-02-11 09:36:29.000000000 +0100
++++ b/xen/include/asm-x86/debugreg.h	2018-04-13 15:44:57.015516185 +0200
+@@ -24,6 +24,8 @@
+ #define DR_STATUS_RESERVED_ZERO (~0xffffeffful) /* Reserved, read as zero */
+ #define DR_STATUS_RESERVED_ONE  0xffff0ff0ul /* Reserved, read as one */
+ 
++#define X86_DR6_DEFAULT 0xffff0ff0ul    /* Default %dr6 value. */
++
+ /* Now define a bunch of things for manipulating the control register.
+    The top two bytes of the control register consist of 4 fields of 4
+    bits - each field corresponds to one of the four debug registers,
diff --git a/main/xen/xsa260-2.patch b/main/xen/xsa260-2.patch
new file mode 100644
index 0000000000..be71b2438f
--- /dev/null
+++ b/main/xen/xsa260-2.patch
@@ -0,0 +1,110 @@
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/pv: Move exception injection into {,compat_}test_all_events()
+
+This allows paths to jump straight to {,compat_}test_all_events() and have
+injection of pending exceptions happen automatically, rather than requiring
+all calling paths to handle exceptions themselves.
+
+The normal exception path is simplified as a result, and
+compat_post_handle_exception() is removed entirely.
+
+This is part of XSA-260 / CVE-2018-8897.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+--- a/xen/arch/x86/x86_64/compat/entry.S
++++ b/xen/arch/x86/x86_64/compat/entry.S
+@@ -39,6 +39,12 @@ ENTRY(compat_test_all_events)
+         leaq  irq_stat+IRQSTAT_softirq_pending(%rip),%rcx
+         cmpl  $0,(%rcx,%rax,1)
+         jne   compat_process_softirqs
++
++        /* Inject exception if pending. */
++        lea   VCPU_trap_bounce(%rbx), %rdx
++        testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx)
++        jnz   .Lcompat_process_trapbounce
++
+         testb $1,VCPU_mce_pending(%rbx)
+         jnz   compat_process_mce
+ .Lcompat_test_guest_nmi:
+@@ -68,6 +74,15 @@ compat_process_softirqs:
+         call  do_softirq
+         jmp   compat_test_all_events
+ 
++        ALIGN
++/* %rbx: struct vcpu, %rdx: struct trap_bounce */
++.Lcompat_process_trapbounce:
++        sti
++.Lcompat_bounce_exception:
++        call  compat_create_bounce_frame
++        movb  $0, TRAPBOUNCE_flags(%rdx)
++        jmp   compat_test_all_events
++
+ 	ALIGN
+ /* %rbx: struct vcpu */
+ compat_process_mce:
+@@ -189,15 +204,6 @@ ENTRY(cr4_pv32_restore)
+         xor   %eax, %eax
+         ret
+ 
+-/* %rdx: trap_bounce, %rbx: struct vcpu */
+-ENTRY(compat_post_handle_exception)
+-        testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx)
+-        jz    compat_test_all_events
+-.Lcompat_bounce_exception:
+-        call  compat_create_bounce_frame
+-        movb  $0,TRAPBOUNCE_flags(%rdx)
+-        jmp   compat_test_all_events
+-
+         .section .text.entry, "ax", @progbits
+ 
+ /* See lstar_enter for entry register state. */
+--- a/xen/arch/x86/x86_64/entry.S
++++ b/xen/arch/x86/x86_64/entry.S
+@@ -42,6 +42,12 @@ test_all_events:
+         leaq  irq_stat+IRQSTAT_softirq_pending(%rip), %rcx
+         cmpl  $0, (%rcx, %rax, 1)
+         jne   process_softirqs
++
++        /* Inject exception if pending. */
++        lea   VCPU_trap_bounce(%rbx), %rdx
++        testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx)
++        jnz   .Lprocess_trapbounce
++
+         cmpb  $0, VCPU_mce_pending(%rbx)
+         jne   process_mce
+ .Ltest_guest_nmi:
+@@ -70,6 +76,15 @@ process_softirqs:
+         jmp  test_all_events
+ 
+         ALIGN
++/* %rbx: struct vcpu, %rdx: struct trap_bounce */
++.Lprocess_trapbounce:
++        sti
++.Lbounce_exception:
++        call  create_bounce_frame
++        movb  $0, TRAPBOUNCE_flags(%rdx)
++        jmp   test_all_events
++
++        ALIGN
+ /* %rbx: struct vcpu */
+ process_mce:
+         testb $1 << VCPU_TRAP_MCE, VCPU_async_exception_mask(%rbx)
+@@ -667,15 +682,9 @@ handle_exception_saved:
+         mov   %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14)
+         testb $3,UREGS_cs(%rsp)
+         jz    restore_all_xen
+-        leaq  VCPU_trap_bounce(%rbx),%rdx
+         movq  VCPU_domain(%rbx),%rax
+         testb $1,DOMAIN_is_32bit_pv(%rax)
+-        jnz   compat_post_handle_exception
+-        testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx)
+-        jz    test_all_events
+-.Lbounce_exception:
+-        call  create_bounce_frame
+-        movb  $0,TRAPBOUNCE_flags(%rdx)
++        jnz   compat_test_all_events
+         jmp   test_all_events
+ 
+ /* No special register assumptions. */
diff --git a/main/xen/xsa260-3.patch b/main/xen/xsa260-3.patch
new file mode 100644
index 0000000000..f0a0a5687d
--- /dev/null
+++ b/main/xen/xsa260-3.patch
@@ -0,0 +1,138 @@
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/traps: Use an Interrupt Stack Table for #DB
+
+PV guests can use architectural corner cases to cause #DB to be raised after
+transitioning into supervisor mode.
+
+Use an interrupt stack table for #DB to prevent the exception being taken with
+a guest controlled stack pointer.
+
+This is part of XSA-260 / CVE-2018-8897.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+--- a/xen/arch/x86/cpu/common.c
++++ b/xen/arch/x86/cpu/common.c
+@@ -679,6 +679,7 @@ void load_system_tables(void)
+ 			[IST_MCE - 1] = stack_top + IST_MCE * PAGE_SIZE,
+ 			[IST_DF  - 1] = stack_top + IST_DF  * PAGE_SIZE,
+ 			[IST_NMI - 1] = stack_top + IST_NMI * PAGE_SIZE,
++			[IST_DB  - 1] = stack_top + IST_DB  * PAGE_SIZE,
+ 
+ 			[IST_MAX ... ARRAY_SIZE(tss->ist) - 1] =
+ 				0x8600111111111111ul,
+@@ -706,6 +707,7 @@ void load_system_tables(void)
+ 	set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_DF);
+ 	set_ist(&idt_tables[cpu][TRAP_nmi],	      IST_NMI);
+ 	set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE);
++	set_ist(&idt_tables[cpu][TRAP_debug],         IST_DB);
+ 
+ 	/*
+ 	 * Bottom-of-stack must be 16-byte aligned!
+--- a/xen/arch/x86/hvm/svm/svm.c
++++ b/xen/arch/x86/hvm/svm/svm.c
+@@ -1046,6 +1046,7 @@ static void svm_ctxt_switch_from(struct
+     set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_DF);
+     set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NMI);
+     set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE);
++    set_ist(&idt_tables[cpu][TRAP_debug],         IST_DB);
+ }
+ 
+ static void svm_ctxt_switch_to(struct vcpu *v)
+@@ -1067,6 +1068,7 @@ static void svm_ctxt_switch_to(struct vc
+     set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_NONE);
+     set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NONE);
+     set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
++    set_ist(&idt_tables[cpu][TRAP_debug],         IST_NONE);
+ 
+     svm_restore_dr(v);
+ 
+--- a/xen/arch/x86/smpboot.c
++++ b/xen/arch/x86/smpboot.c
+@@ -964,6 +964,7 @@ static int cpu_smpboot_alloc(unsigned in
+     set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_NONE);
+     set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NONE);
+     set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
++    set_ist(&idt_tables[cpu][TRAP_debug],         IST_NONE);
+ 
+     for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1);
+           i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i )
+--- a/xen/arch/x86/traps.c
++++ b/xen/arch/x86/traps.c
+@@ -325,13 +325,13 @@ static void show_guest_stack(struct vcpu
+ /*
+  * Notes for get_stack_trace_bottom() and get_stack_dump_bottom()
+  *
+- * Stack pages 0, 1 and 2:
++ * Stack pages 0 - 3:
+  *   These are all 1-page IST stacks.  Each of these stacks have an exception
+  *   frame and saved register state at the top.  The interesting bound for a
+  *   trace is the word adjacent to this, while the bound for a dump is the
+  *   very top, including the exception frame.
+  *
+- * Stack pages 3, 4 and 5:
++ * Stack pages 4 and 5:
+  *   None of these are particularly interesting.  With MEMORY_GUARD, page 5 is
+  *   explicitly not present, so attempting to dump or trace it is
+  *   counterproductive.  Without MEMORY_GUARD, it is possible for a call chain
+@@ -352,12 +352,12 @@ unsigned long get_stack_trace_bottom(uns
+ {
+     switch ( get_stack_page(sp) )
+     {
+-    case 0 ... 2:
++    case 0 ... 3:
+         return ROUNDUP(sp, PAGE_SIZE) -
+             offsetof(struct cpu_user_regs, es) - sizeof(unsigned long);
+ 
+ #ifndef MEMORY_GUARD
+-    case 3 ... 5:
++    case 4 ... 5:
+ #endif
+     case 6 ... 7:
+         return ROUNDUP(sp, STACK_SIZE) -
+@@ -372,11 +372,11 @@ unsigned long get_stack_dump_bottom(unsi
+ {
+     switch ( get_stack_page(sp) )
+     {
+-    case 0 ... 2:
++    case 0 ... 3:
+         return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long);
+ 
+ #ifndef MEMORY_GUARD
+-    case 3 ... 5:
++    case 4 ... 5:
+ #endif
+     case 6 ... 7:
+         return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long);
+@@ -1943,6 +1943,7 @@ void __init init_idt_traps(void)
+     set_ist(&idt_table[TRAP_double_fault],  IST_DF);
+     set_ist(&idt_table[TRAP_nmi],           IST_NMI);
+     set_ist(&idt_table[TRAP_machine_check], IST_MCE);
++    set_ist(&idt_table[TRAP_debug],         IST_DB);
+ 
+     /* CPU0 uses the master IDT. */
+     idt_tables[0] = idt_table;
+--- a/xen/arch/x86/x86_64/entry.S
++++ b/xen/arch/x86/x86_64/entry.S
+@@ -739,7 +739,7 @@ ENTRY(device_not_available)
+ ENTRY(debug)
+         pushq $0
+         movl  $TRAP_debug,4(%rsp)
+-        jmp   handle_exception
++        jmp   handle_ist_exception
+ 
+ ENTRY(int3)
+         pushq $0
+--- a/xen/include/asm-x86/processor.h
++++ b/xen/include/asm-x86/processor.h
+@@ -443,7 +443,8 @@ struct __packed __cacheline_aligned tss_
+ #define IST_DF   1UL
+ #define IST_NMI  2UL
+ #define IST_MCE  3UL
+-#define IST_MAX  3UL
++#define IST_DB   4UL
++#define IST_MAX  4UL
+ 
+ /* Set the interrupt stack table used by a particular interrupt
+  * descriptor table entry. */
diff --git a/main/xen/xsa260-4.patch b/main/xen/xsa260-4.patch
new file mode 100644
index 0000000000..c2fa02d6e1
--- /dev/null
+++ b/main/xen/xsa260-4.patch
@@ -0,0 +1,72 @@
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Subject: x86/traps: Fix handling of #DB exceptions in hypervisor context
+
+The WARN_ON() can be triggered by guest activities, and emits a full stack
+trace without rate limiting.  Swap it out for a ratelimited printk with just
+enough information to work out what is going on.
+
+Not all #DB exceptions are traps, so blindly continuing is not a safe action
+to take.  We don't let PV guests select these settings in the real %dr7 to
+begin with, but for added safety against unexpected situations, detect the
+fault cases and crash in an obvious manner.
+
+This is part of XSA-260 / CVE-2018-8897.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+
+--- a/xen/arch/x86/traps.c
++++ b/xen/arch/x86/traps.c
+@@ -1809,16 +1809,44 @@ void do_debug(struct cpu_user_regs *regs
+                 regs->eflags &= ~X86_EFLAGS_TF;
+             }
+         }
+-        else
++
++        /*
++         * Check for fault conditions.  General Detect, and instruction
++         * breakpoints are faults rather than traps, at which point attempting
++         * to ignore and continue will result in a livelock.
++         */
++        if ( dr6 & DR_GENERAL_DETECT )
++        {
++            printk(XENLOG_ERR "Hit General Detect in Xen context\n");
++            fatal_trap(regs, 0);
++        }
++
++        if ( dr6 & (DR_TRAP3 | DR_TRAP2 | DR_TRAP1 | DR_TRAP0) )
+         {
+-            /*
+-             * We ignore watchpoints when they trigger within Xen. This may
+-             * happen when a buffer is passed to us which previously had a
+-             * watchpoint set on it. No need to bump EIP; the only faulting
+-             * trap is an instruction breakpoint, which can't happen to us.
+-             */
+-            WARN_ON(!search_exception_table(regs));
++            unsigned int bp, dr7 = read_debugreg(7) >> DR_CONTROL_SHIFT;
++
++            for ( bp = 0; bp < 4; ++bp )
++            {
++                if ( (dr6 & (1u << bp)) && /* Breakpoint triggered? */
++                     ((dr7 & (3u << (bp * DR_CONTROL_SIZE))) == 0) /* Insn? */ )
++                {
++                    printk(XENLOG_ERR
++                           "Hit instruction breakpoint in Xen context\n");
++                    fatal_trap(regs, 0);
++                }
++            }
+         }
++
++        /*
++         * Whatever caused this #DB should be a trap.  Note it and continue.
++         * Guests can trigger this in certain corner cases, so ensure the
++         * message is ratelimited.
++         */
++        gprintk(XENLOG_WARNING,
++                "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, dr6 %lx\n",
++                regs->cs, _p(regs->rip), _p(regs->rip),
++                regs->ss, _p(regs->rsp), dr6);
++
+         goto out;
+     }
+ 
diff --git a/main/xen/xsa261.patch b/main/xen/xsa261.patch
new file mode 100644
index 0000000000..a51744b8d0
--- /dev/null
+++ b/main/xen/xsa261.patch
@@ -0,0 +1,279 @@
+From: Xen Project Security Team <security@xenproject.org>
+Subject: x86/vpt: add support for IO-APIC routed interrupts
+
+And modify the HPET code to make use of it. Currently HPET interrupts
+are always treated as ISA and thus injected through the vPIC. This is
+wrong because HPET interrupts when not in legacy mode should be
+injected from the IO-APIC.
+
+To make things worse, the supported interrupt routing values are set
+to [20..23], which clearly falls outside of the ISA range, thus
+leading to an ASSERT in debug builds or memory corruption in non-debug
+builds because the interrupt injection code will write out of the
+bounds of the arch.hvm_domain.vpic array.
+
+Since the HPET interrupt source can change between ISA and IO-APIC
+always destroy the timer before changing the mode, or else Xen risks
+changing it while the timer is active.
+
+Note that vpt interrupt injection is racy in the sense that the
+vIO-APIC RTE entry can be written by the guest in between the call to
+pt_irq_masked and hvm_ioapic_assert, or the call to pt_update_irq and
+pt_intr_post. Those are not deemed to be security issues, but rather
+quirks of the current implementation. In the worst case the guest
+might lose interrupts or get multiple interrupt vectors injected for
+the same timer source.
+
+This is part of XSA-261.
+
+Address actual and potential compiler warnings. Fix formatting.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+---
+Changes since v2:
+ - Move fallthrough comment to be just above the case label.
+ - Fix now stale comment in pt_update_irq.
+ - Use NR_ISAIRQS instead of 16.
+ - Expand commit message to mention the quirkiness of vpt interrupt
+   injection.
+
+Changes since v1:
+ - Simplify usage of gsi in pt_irq_masked.
+ - Introduce hvm_ioapic_assert.
+ - Fix pt->source == PTSRC_isa in create_periodic_time.
+
+--- a/xen/arch/x86/hvm/hpet.c
++++ b/xen/arch/x86/hvm/hpet.c
+@@ -264,13 +264,20 @@ static void hpet_set_timer(HPETState *h,
+         diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN))
+             ? (uint32_t)diff : 0;
+ 
++    destroy_periodic_time(&h->pt[tn]);
+     if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) )
++    {
+         /* if LegacyReplacementRoute bit is set, HPET specification requires
+            timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC,
+            timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */
+         irq = (tn == 0) ? 0 : 8;
++        h->pt[tn].source = PTSRC_isa;
++    }
+     else
++    {
+         irq = timer_int_route(h, tn);
++        h->pt[tn].source = PTSRC_ioapic;
++    }
+ 
+     /*
+      * diff is the time from now when the timer should fire, for a periodic
+--- a/xen/arch/x86/hvm/irq.c
++++ b/xen/arch/x86/hvm/irq.c
+@@ -41,6 +41,26 @@ static void assert_gsi(struct domain *d,
+     vioapic_irq_positive_edge(d, ioapic_gsi);
+ }
+ 
++int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level)
++{
++    struct hvm_irq *hvm_irq = hvm_domain_irq(d);
++    int vector;
++
++    if ( gsi >= hvm_irq->nr_gsis )
++    {
++        ASSERT_UNREACHABLE();
++        return -1;
++    }
++
++    spin_lock(&d->arch.hvm_domain.irq_lock);
++    if ( !level || hvm_irq->gsi_assert_count[gsi]++ == 0 )
++        assert_gsi(d, gsi);
++    vector = vioapic_get_vector(d, gsi);
++    spin_unlock(&d->arch.hvm_domain.irq_lock);
++
++    return vector;
++}
++
+ static void assert_irq(struct domain *d, unsigned ioapic_gsi, unsigned pic_irq)
+ {
+     assert_gsi(d, ioapic_gsi);
+--- a/xen/arch/x86/hvm/vpt.c
++++ b/xen/arch/x86/hvm/vpt.c
+@@ -107,31 +107,49 @@ static int pt_irq_vector(struct periodic
+ static int pt_irq_masked(struct periodic_time *pt)
+ {
+     struct vcpu *v = pt->vcpu;
+-    unsigned int gsi, isa_irq;
+-    int mask;
+-    uint8_t pic_imr;
++    unsigned int gsi = pt->irq;
+ 
+-    if ( pt->source == PTSRC_lapic )
++    switch ( pt->source )
++    {
++    case PTSRC_lapic:
+     {
+         struct vlapic *vlapic = vcpu_vlapic(v);
++
+         return (!vlapic_enabled(vlapic) ||
+                 (vlapic_get_reg(vlapic, APIC_LVTT) & APIC_LVT_MASKED));
+     }
+ 
+-    isa_irq = pt->irq;
+-    gsi = hvm_isa_irq_to_gsi(isa_irq);
+-    pic_imr = v->domain->arch.hvm_domain.vpic[isa_irq >> 3].imr;
+-    mask = vioapic_get_mask(v->domain, gsi);
+-    if ( mask < 0 )
+-    {
+-        dprintk(XENLOG_WARNING, "d%u: invalid GSI (%u) for platform timer\n",
+-                v->domain->domain_id, gsi);
+-        domain_crash(v->domain);
+-        return -1;
++    case PTSRC_isa:
++    {
++        uint8_t pic_imr = v->domain->arch.hvm_domain.vpic[pt->irq >> 3].imr;
++
++        /* Check if the interrupt is unmasked in the PIC. */
++        if ( !(pic_imr & (1 << (pt->irq & 7))) && vlapic_accept_pic_intr(v) )
++            return 0;
++
++        gsi = hvm_isa_irq_to_gsi(pt->irq);
++    }
++
++    /* Fallthrough to check if the interrupt is masked on the IO APIC. */
++    case PTSRC_ioapic:
++    {
++        int mask = vioapic_get_mask(v->domain, gsi);
++
++        if ( mask < 0 )
++        {
++            dprintk(XENLOG_WARNING,
++                    "d%d: invalid GSI (%u) for platform timer\n",
++                    v->domain->domain_id, gsi);
++            domain_crash(v->domain);
++            return -1;
++        }
++
++        return mask;
++    }
+     }
+ 
+-    return (((pic_imr & (1 << (isa_irq & 7))) || !vlapic_accept_pic_intr(v)) &&
+-            mask);
++    ASSERT_UNREACHABLE();
++    return 1;
+ }
+ 
+ static void pt_lock(struct periodic_time *pt)
+@@ -252,7 +270,7 @@ int pt_update_irq(struct vcpu *v)
+     struct list_head *head = &v->arch.hvm_vcpu.tm_list;
+     struct periodic_time *pt, *temp, *earliest_pt;
+     uint64_t max_lag;
+-    int irq, is_lapic, pt_vector;
++    int irq, pt_vector = -1;
+ 
+     spin_lock(&v->arch.hvm_vcpu.tm_lock);
+ 
+@@ -288,29 +306,26 @@ int pt_update_irq(struct vcpu *v)
+ 
+     earliest_pt->irq_issued = 1;
+     irq = earliest_pt->irq;
+-    is_lapic = (earliest_pt->source == PTSRC_lapic);
+ 
+     spin_unlock(&v->arch.hvm_vcpu.tm_lock);
+ 
+-    /*
+-     * If periodic timer interrut is handled by lapic, its vector in
+-     * IRR is returned and used to set eoi_exit_bitmap for virtual
+-     * interrupt delivery case. Otherwise return -1 to do nothing.
+-     */
+-    if ( is_lapic )
++    switch ( earliest_pt->source )
+     {
++    case PTSRC_lapic:
++        /*
++         * If periodic timer interrupt is handled by lapic, its vector in
++         * IRR is returned and used to set eoi_exit_bitmap for virtual
++         * interrupt delivery case. Otherwise return -1 to do nothing.
++         */
+         vlapic_set_irq(vcpu_vlapic(v), irq, 0);
+         pt_vector = irq;
+-    }
+-    else
+-    {
++        break;
++
++    case PTSRC_isa:
+         hvm_isa_irq_deassert(v->domain, irq);
+         if ( platform_legacy_irq(irq) && vlapic_accept_pic_intr(v) &&
+              v->domain->arch.hvm_domain.vpic[irq >> 3].int_output )
+-        {
+             hvm_isa_irq_assert(v->domain, irq, NULL);
+-            pt_vector = -1;
+-        }
+         else
+         {
+             pt_vector = hvm_isa_irq_assert(v->domain, irq, vioapic_get_vector);
+@@ -321,6 +336,17 @@ int pt_update_irq(struct vcpu *v)
+             if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) )
+                 pt_vector = -1;
+         }
++        break;
++
++    case PTSRC_ioapic:
++        /*
++         * NB: At the moment IO-APIC routed interrupts generated by vpt devices
++         * (HPET) are edge-triggered.
++         */
++        pt_vector = hvm_ioapic_assert(v->domain, irq, false);
++        if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) )
++            pt_vector = -1;
++        break;
+     }
+ 
+     return pt_vector;
+@@ -418,7 +444,14 @@ void create_periodic_time(
+     struct vcpu *v, struct periodic_time *pt, uint64_t delta,
+     uint64_t period, uint8_t irq, time_cb *cb, void *data)
+ {
+-    ASSERT(pt->source != 0);
++    if ( !pt->source ||
++         (pt->irq >= NR_ISAIRQS && pt->source == PTSRC_isa) ||
++         (pt->irq >= hvm_domain_irq(v->domain)->nr_gsis &&
++          pt->source == PTSRC_ioapic) )
++    {
++        ASSERT_UNREACHABLE();
++        return;
++    }
+ 
+     destroy_periodic_time(pt);
+ 
+@@ -498,7 +531,7 @@ static void pt_adjust_vcpu(struct period
+ {
+     int on_list;
+ 
+-    ASSERT(pt->source == PTSRC_isa);
++    ASSERT(pt->source == PTSRC_isa || pt->source == PTSRC_ioapic);
+ 
+     if ( pt->vcpu == NULL )
+         return;
+--- a/xen/include/asm-x86/hvm/irq.h
++++ b/xen/include/asm-x86/hvm/irq.h
+@@ -207,6 +207,9 @@ int hvm_set_pci_link_route(struct domain
+ 
+ int hvm_inject_msi(struct domain *d, uint64_t addr, uint32_t data);
+ 
++/* Assert an IO APIC pin. */
++int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level);
++
+ void hvm_maybe_deassert_evtchn_irq(void);
+ void hvm_assert_evtchn_irq(struct vcpu *v);
+ void hvm_set_callback_via(struct domain *d, uint64_t via);
+--- a/xen/include/asm-x86/hvm/vpt.h
++++ b/xen/include/asm-x86/hvm/vpt.h
+@@ -44,6 +44,7 @@ struct periodic_time {
+     bool_t warned_timeout_too_short;
+ #define PTSRC_isa    1 /* ISA time source */
+ #define PTSRC_lapic  2 /* LAPIC time source */
++#define PTSRC_ioapic 3 /* IOAPIC time source */
+     u8 source;                  /* PTSRC_ */
+     u8 irq;
+     struct vcpu *vcpu;          /* vcpu timer interrupt delivers to */
diff --git a/main/xen/xsa262-4.10.patch b/main/xen/xsa262-4.10.patch
new file mode 100644
index 0000000000..ba9a8ffa22
--- /dev/null
+++ b/main/xen/xsa262-4.10.patch
@@ -0,0 +1,76 @@
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/HVM: guard against emulator driving ioreq state in weird ways
+
+In the case where hvm_wait_for_io() calls wait_on_xen_event_channel(),
+p->state ends up being read twice in succession: once to determine that
+state != p->state, and then again at the top of the loop.  This gives a
+compromised emulator a chance to change the state back between the two
+reads, potentially keeping Xen in a loop indefinitely.
+
+Instead:
+* Read p->state once in each of the wait_on_xen_event_channel() tests,
+* re-use that value the next time around,
+* and insist that the states continue to transition "forward" (with the
+  exception of the transition to STATE_IOREQ_NONE).
+
+This is XSA-262.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@citrix.com>
+
+--- a/xen/arch/x86/hvm/ioreq.c
++++ b/xen/arch/x86/hvm/ioreq.c
+@@ -87,14 +87,17 @@ static void hvm_io_assist(struct hvm_ior
+ 
+ static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
+ {
++    unsigned int prev_state = STATE_IOREQ_NONE;
++
+     while ( sv->pending )
+     {
+         unsigned int state = p->state;
+ 
+-        rmb();
+-        switch ( state )
++        smp_rmb();
++
++    recheck:
++        if ( unlikely(state == STATE_IOREQ_NONE) )
+         {
+-        case STATE_IOREQ_NONE:
+             /*
+              * The only reason we should see this case is when an
+              * emulator is dying and it races with an I/O being
+@@ -102,14 +105,30 @@ static bool hvm_wait_for_io(struct hvm_i
+              */
+             hvm_io_assist(sv, ~0ul);
+             break;
++        }
++
++        if ( unlikely(state < prev_state) )
++        {
++            gdprintk(XENLOG_ERR, "Weird HVM ioreq state transition %u -> %u\n",
++                     prev_state, state);
++            sv->pending = false;
++            domain_crash(sv->vcpu->domain);
++            return false; /* bail */
++        }
++
++        switch ( prev_state = state )
++        {
+         case STATE_IORESP_READY: /* IORESP_READY -> NONE */
+             p->state = STATE_IOREQ_NONE;
+             hvm_io_assist(sv, p->data);
+             break;
+         case STATE_IOREQ_READY:  /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
+         case STATE_IOREQ_INPROCESS:
+-            wait_on_xen_event_channel(sv->ioreq_evtchn, p->state != state);
+-            break;
++            wait_on_xen_event_channel(sv->ioreq_evtchn,
++                                      ({ state = p->state;
++                                         smp_rmb();
++                                         state != prev_state; }));
++            goto recheck;
+         default:
+             gdprintk(XENLOG_ERR, "Weird HVM iorequest state %u\n", state);
+             sv->pending = false;
author	Leonardo Arena <rnalrd@alpinelinux.org>	2018-06-11 10:03:06 +0000
committer	Leonardo Arena <rnalrd@alpinelinux.org>	2018-06-11 10:03:06 +0000
commit	dacd68bca253cc110bbcb2e6efcb4b15bf44b3f1 (patch)
tree	bc9f1754b07f7f6373077614228f9a7752c392c0
parent	5f72054ca4ac3f0f8f05c17a83a9c203f580bddc (diff)
download	aports-dacd68bca253cc110bbcb2e6efcb4b15bf44b3f1.tar.bz2 aports-dacd68bca253cc110bbcb2e6efcb4b15bf44b3f1.tar.xz