about | summary | refs | log | tree | commit | diff | stats
path: root/main/xen/xsa45-4.2.patch
diff options
context:
space:
mode:
Diffstat (limited to 'main/xen/xsa45-4.2.patch')
-rw-r--r-- main/xen/xsa45-4.2.patch | 1133
1 files changed, 1133 insertions, 0 deletions
diff --git a/main/xen/xsa45-4.2.patch b/main/xen/xsa45-4.2.patch
new file mode 100644
index 0000000000..dfdfdea64b
--- /dev/null
+++ b/main/xen/xsa45-4.2.patch
@@ -0,0 +1,1133 @@
+diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
+index 26a7f12..b97ac6d 100644
+--- a/xen/arch/x86/domain.c
++++ b/xen/arch/x86/domain.c
+@@ -73,8 +73,6 @@ void (*dead_idle) (void) __read_mostly = default_dead_idle;
+ static void paravirt_ctxt_switch_from(struct vcpu *v);
+ static void paravirt_ctxt_switch_to(struct vcpu *v);
+
+-static void vcpu_destroy_pagetables(struct vcpu *v);
+-
+ static void default_idle(void)
+ {
+ local_irq_disable();
+@@ -860,6 +858,9 @@ int arch_set_info_guest(
+
+ if ( !v->is_initialised )
+ {
++ if ( !compat && !(flags & VGCF_in_kernel) && !c.nat->ctrlreg[1] )
++ return -EINVAL;
++
+ v->arch.pv_vcpu.ldt_base = c(ldt_base);
+ v->arch.pv_vcpu.ldt_ents = c(ldt_ents);
+ }
+@@ -957,24 +958,44 @@ int arch_set_info_guest(
+ if ( rc != 0 )
+ return rc;
+
++ set_bit(_VPF_in_reset, &v->pause_flags);
++
+ if ( !compat )
+- {
+ cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[3]);
+- cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC);
+-
+- if ( !cr3_page )
+- {
+- destroy_gdt(v);
+- return -EINVAL;
+- }
+- if ( !paging_mode_refcounts(d)
+- && !get_page_type(cr3_page, PGT_base_page_table) )
+- {
+- put_page(cr3_page);
+- destroy_gdt(v);
+- return -EINVAL;
+- }
++#ifdef CONFIG_COMPAT
++ else
++ cr3_gfn = compat_cr3_to_pfn(c.cmp->ctrlreg[3]);
++#endif
++ cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC);
+
++ if ( !cr3_page )
++ rc = -EINVAL;
++ else if ( paging_mode_refcounts(d) )
++ /* nothing */;
++ else if ( cr3_page == v->arch.old_guest_table )
++ {
++ v->arch.old_guest_table = NULL;
++ put_page(cr3_page);
++ }
++ else
++ {
++ /*
++ * Since v->arch.guest_table{,_user} are both NULL, this effectively
++ * is just a call to put_old_guest_table().
++ */
++ if ( !compat )
++ rc = vcpu_destroy_pagetables(v);
++ if ( !rc )
++ rc = get_page_type_preemptible(cr3_page,
++ !compat ? PGT_root_page_table
++ : PGT_l3_page_table);
++ if ( rc == -EINTR )
++ rc = -EAGAIN;
++ }
++ if ( rc )
++ /* handled below */;
++ else if ( !compat )
++ {
+ v->arch.guest_table = pagetable_from_page(cr3_page);
+ #ifdef __x86_64__
+ if ( c.nat->ctrlreg[1] )
+@@ -982,56 +1003,44 @@ int arch_set_info_guest(
+ cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[1]);
+ cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC);
+
+- if ( !cr3_page ||
+- (!paging_mode_refcounts(d)
+- && !get_page_type(cr3_page, PGT_base_page_table)) )
++ if ( !cr3_page )
++ rc = -EINVAL;
++ else if ( !paging_mode_refcounts(d) )
+ {
+- if (cr3_page)
+- put_page(cr3_page);
+- cr3_page = pagetable_get_page(v->arch.guest_table);
+- v->arch.guest_table = pagetable_null();
+- if ( paging_mode_refcounts(d) )
+- put_page(cr3_page);
+- else
+- put_page_and_type(cr3_page);
+- destroy_gdt(v);
+- return -EINVAL;
++ rc = get_page_type_preemptible(cr3_page, PGT_root_page_table);
++ switch ( rc )
++ {
++ case -EINTR:
++ rc = -EAGAIN;
++ case -EAGAIN:
++ v->arch.old_guest_table =
++ pagetable_get_page(v->arch.guest_table);
++ v->arch.guest_table = pagetable_null();
++ break;
++ }
+ }
+-
+- v->arch.guest_table_user = pagetable_from_page(cr3_page);
+- }
+- else if ( !(flags & VGCF_in_kernel) )
+- {
+- destroy_gdt(v);
+- return -EINVAL;
++ if ( !rc )
++ v->arch.guest_table_user = pagetable_from_page(cr3_page);
+ }
+ }
+ else
+ {
+ l4_pgentry_t *l4tab;
+
+- cr3_gfn = compat_cr3_to_pfn(c.cmp->ctrlreg[3]);
+- cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC);
+-
+- if ( !cr3_page)
+- {
+- destroy_gdt(v);
+- return -EINVAL;
+- }
+-
+- if (!paging_mode_refcounts(d)
+- && !get_page_type(cr3_page, PGT_l3_page_table) )
+- {
+- put_page(cr3_page);
+- destroy_gdt(v);
+- return -EINVAL;
+- }
+-
+ l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
+ *l4tab = l4e_from_pfn(page_to_mfn(cr3_page),
+ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
+ #endif
+ }
++ if ( rc )
++ {
++ if ( cr3_page )
++ put_page(cr3_page);
++ destroy_gdt(v);
++ return rc;
++ }
++
++ clear_bit(_VPF_in_reset, &v->pause_flags);
+
+ if ( v->vcpu_id == 0 )
+ update_domain_wallclock_time(d);
+@@ -1053,17 +1062,16 @@ int arch_set_info_guest(
+ #undef c
+ }
+
+-void arch_vcpu_reset(struct vcpu *v)
++int arch_vcpu_reset(struct vcpu *v)
+ {
+ if ( !is_hvm_vcpu(v) )
+ {
+ destroy_gdt(v);
+- vcpu_destroy_pagetables(v);
+- }
+- else
+- {
+- vcpu_end_shutdown_deferral(v);
++ return vcpu_destroy_pagetables(v);
+ }
++
++ vcpu_end_shutdown_deferral(v);
++ return 0;
+ }
+
+ /*
+@@ -2069,63 +2077,6 @@ static int relinquish_memory(
+ return ret;
+ }
+
+-static void vcpu_destroy_pagetables(struct vcpu *v)
+-{
+- struct domain *d = v->domain;
+- unsigned long pfn;
+-
+-#ifdef __x86_64__
+- if ( is_pv_32on64_vcpu(v) )
+- {
+- pfn = l4e_get_pfn(*(l4_pgentry_t *)
+- __va(pagetable_get_paddr(v->arch.guest_table)));
+-
+- if ( pfn != 0 )
+- {
+- if ( paging_mode_refcounts(d) )
+- put_page(mfn_to_page(pfn));
+- else
+- put_page_and_type(mfn_to_page(pfn));
+- }
+-
+- l4e_write(
+- (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
+- l4e_empty());
+-
+- v->arch.cr3 = 0;
+- return;
+- }
+-#endif
+-
+- pfn = pagetable_get_pfn(v->arch.guest_table);
+- if ( pfn != 0 )
+- {
+- if ( paging_mode_refcounts(d) )
+- put_page(mfn_to_page(pfn));
+- else
+- put_page_and_type(mfn_to_page(pfn));
+- v->arch.guest_table = pagetable_null();
+- }
+-
+-#ifdef __x86_64__
+- /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
+- pfn = pagetable_get_pfn(v->arch.guest_table_user);
+- if ( pfn != 0 )
+- {
+- if ( !is_pv_32bit_vcpu(v) )
+- {
+- if ( paging_mode_refcounts(d) )
+- put_page(mfn_to_page(pfn));
+- else
+- put_page_and_type(mfn_to_page(pfn));
+- }
+- v->arch.guest_table_user = pagetable_null();
+- }
+-#endif
+-
+- v->arch.cr3 = 0;
+-}
+-
+ int domain_relinquish_resources(struct domain *d)
+ {
+ int ret;
+@@ -2143,7 +2094,11 @@ int domain_relinquish_resources(struct domain *d)
+
+ /* Drop the in-use references to page-table bases. */
+ for_each_vcpu ( d, v )
+- vcpu_destroy_pagetables(v);
++ {
++ ret = vcpu_destroy_pagetables(v);
++ if ( ret )
++ return ret;
++ }
+
+ if ( !is_hvm_domain(d) )
+ {
+diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
+index 3d471a5..efacc98 100644
+--- a/xen/arch/x86/hvm/hvm.c
++++ b/xen/arch/x86/hvm/hvm.c
+@@ -3509,8 +3509,11 @@ static void hvm_s3_suspend(struct domain *d)
+
+ for_each_vcpu ( d, v )
+ {
++ int rc;
++
+ vlapic_reset(vcpu_vlapic(v));
+- vcpu_reset(v);
++ rc = vcpu_reset(v);
++ ASSERT(!rc);
+ }
+
+ vpic_reset(d);
+diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c
+index 52d111b..7778342 100644
+--- a/xen/arch/x86/hvm/vlapic.c
++++ b/xen/arch/x86/hvm/vlapic.c
+@@ -252,10 +252,13 @@ static void vlapic_init_sipi_action(unsigned long _vcpu)
+ {
+ case APIC_DM_INIT: {
+ bool_t fpu_initialised;
++ int rc;
++
+ domain_lock(target->domain);
+ /* Reset necessary VCPU state. This does not include FPU state. */
+ fpu_initialised = target->fpu_initialised;
+- vcpu_reset(target);
++ rc = vcpu_reset(target);
++ ASSERT(!rc);
+ target->fpu_initialised = fpu_initialised;
+ vlapic_reset(vcpu_vlapic(target));
+ domain_unlock(target->domain);
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 8444610..055f307 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -1241,7 +1241,16 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+ #endif
+
+ if ( unlikely(partial > 0) )
++ {
++ ASSERT(preemptible >= 0);
+ return __put_page_type(l3e_get_page(l3e), preemptible);
++ }
++
++ if ( preemptible < 0 )
++ {
++ current->arch.old_guest_table = l3e_get_page(l3e);
++ return 0;
++ }
+
+ return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
+ }
+@@ -1254,7 +1263,17 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
+ (l4e_get_pfn(l4e) != pfn) )
+ {
+ if ( unlikely(partial > 0) )
++ {
++ ASSERT(preemptible >= 0);
+ return __put_page_type(l4e_get_page(l4e), preemptible);
++ }
++
++ if ( preemptible < 0 )
++ {
++ current->arch.old_guest_table = l4e_get_page(l4e);
++ return 0;
++ }
++
+ return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
+ }
+ return 1;
+@@ -1549,12 +1568,17 @@ static int alloc_l3_table(struct page_info *page, int preemptible)
+ if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
+ {
+ MEM_LOG("Failure in alloc_l3_table: entry %d", i);
++ if ( i )
++ {
++ page->nr_validated_ptes = i;
++ page->partial_pte = 0;
++ current->arch.old_guest_table = page;
++ }
+ while ( i-- > 0 )
+ {
+ if ( !is_guest_l3_slot(i) )
+ continue;
+ unadjust_guest_l3e(pl3e[i], d);
+- put_page_from_l3e(pl3e[i], pfn, 0, 0);
+ }
+ }
+
+@@ -1584,22 +1608,24 @@ static int alloc_l4_table(struct page_info *page, int preemptible)
+ page->nr_validated_ptes = i;
+ page->partial_pte = partial ?: 1;
+ }
+- else if ( rc == -EINTR )
++ else if ( rc < 0 )
+ {
++ if ( rc != -EINTR )
++ MEM_LOG("Failure in alloc_l4_table: entry %d", i);
+ if ( i )
+ {
+ page->nr_validated_ptes = i;
+ page->partial_pte = 0;
+- rc = -EAGAIN;
++ if ( rc == -EINTR )
++ rc = -EAGAIN;
++ else
++ {
++ if ( current->arch.old_guest_table )
++ page->nr_validated_ptes++;
++ current->arch.old_guest_table = page;
++ }
+ }
+ }
+- else if ( rc < 0 )
+- {
+- MEM_LOG("Failure in alloc_l4_table: entry %d", i);
+- while ( i-- > 0 )
+- if ( is_guest_l4_slot(d, i) )
+- put_page_from_l4e(pl4e[i], pfn, 0, 0);
+- }
+ if ( rc < 0 )
+ return rc;
+
+@@ -2047,7 +2073,7 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
+ pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
+ }
+
+- put_page_from_l3e(ol3e, pfn, 0, 0);
++ put_page_from_l3e(ol3e, pfn, 0, -preemptible);
+ return rc;
+ }
+
+@@ -2110,7 +2136,7 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
+ return -EFAULT;
+ }
+
+- put_page_from_l4e(ol4e, pfn, 0, 0);
++ put_page_from_l4e(ol4e, pfn, 0, -preemptible);
+ return rc;
+ }
+
+@@ -2268,7 +2294,15 @@ static int alloc_page_type(struct page_info *page, unsigned long type,
+ PRtype_info ": caf=%08lx taf=%" PRtype_info,
+ page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
+ type, page->count_info, page->u.inuse.type_info);
+- page->u.inuse.type_info = 0;
++ if ( page != current->arch.old_guest_table )
++ page->u.inuse.type_info = 0;
++ else
++ {
++ ASSERT((page->u.inuse.type_info &
++ (PGT_count_mask | PGT_validated)) == 1);
++ get_page_light(page);
++ page->u.inuse.type_info |= PGT_partial;
++ }
+ }
+ else
+ {
+@@ -2808,49 +2842,150 @@ static void put_superpage(unsigned long mfn)
+
+ #endif
+
++static int put_old_guest_table(struct vcpu *v)
++{
++ int rc;
++ /* Finish the deferred put of v->arch.old_guest_table; 0 if none pending. */
++ if ( !v->arch.old_guest_table )
++ return 0;
++ /* The literal 1 is passed in the callee's "preemptible" argument slot. */
++ switch ( rc = put_page_and_type_preemptible(v->arch.old_guest_table, 1) )
++ {
++ case -EINTR:
++ case -EAGAIN:
++ return -EAGAIN; /* pointer is kept so a later call can resume */
++ }
++ /* Any other result, success or error, stops tracking the page. */
++ v->arch.old_guest_table = NULL;
++
++ return rc;
++}
++
++int vcpu_destroy_pagetables(struct vcpu *v)
++{
++    unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
++    struct page_info *page;
++    int rc = put_old_guest_table(v);
++    /* Drops v's page-table base refs, after any pending old_guest_table. */
++    if ( rc )
++        return rc;
++    /* For a 32-on-64 vCPU the page to drop is named by the first L4 entry. */
++#ifdef __x86_64__
++    if ( is_pv_32on64_vcpu(v) )
++        mfn = l4e_get_pfn(*(l4_pgentry_t *)mfn_to_virt(mfn));
++#endif
++    /* Drop the reference found above; the typed put is preemptible. */
++    if ( mfn )
++    {
++        page = mfn_to_page(mfn);
++        if ( paging_mode_refcounts(v->domain) )
++            put_page(page);
++        else
++            rc = put_page_and_type_preemptible(page, 1);
++    }
++    /* Only forget a table once its put has fully completed (rc == 0). */
++#ifdef __x86_64__
++    if ( is_pv_32on64_vcpu(v) )
++    {
++        if ( !rc )
++            l4e_write(
++                (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)),
++                l4e_empty());
++    }
++    else
++#endif
++    if ( !rc )
++    {
++        v->arch.guest_table = pagetable_null();
++
++#ifdef __x86_64__
++        /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
++        mfn = pagetable_get_pfn(v->arch.guest_table_user);
++        if ( mfn )
++        {
++            page = mfn_to_page(mfn);
++            if ( paging_mode_refcounts(v->domain) )
++                put_page(page);
++            else
++                rc = put_page_and_type_preemptible(page, 1);
++        }
++        if ( !rc )
++            v->arch.guest_table_user = pagetable_null();
++#endif
++    }
++
++    v->arch.cr3 = 0;
++    /* Callers only restart on -EAGAIN, so never hand them a raw -EINTR. */
++    return rc != -EINTR ? rc : -EAGAIN;
++}
+
+ int new_guest_cr3(unsigned long mfn)
+ {
+ struct vcpu *curr = current;
+ struct domain *d = curr->domain;
+- int okay;
++ int rc;
+ unsigned long old_base_mfn;
+
+ #ifdef __x86_64__
+ if ( is_pv_32on64_domain(d) )
+ {
+- okay = paging_mode_refcounts(d)
+- ? 0 /* Old code was broken, but what should it be? */
+- : mod_l4_entry(
++ rc = paging_mode_refcounts(d)
++ ? -EINVAL /* Old code was broken, but what should it be? */
++ : mod_l4_entry(
+ __va(pagetable_get_paddr(curr->arch.guest_table)),
+ l4e_from_pfn(
+ mfn,
+ (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
+- pagetable_get_pfn(curr->arch.guest_table), 0, 0, curr) == 0;
+- if ( unlikely(!okay) )
++ pagetable_get_pfn(curr->arch.guest_table), 0, 1, curr);
++ switch ( rc )
+ {
++ case 0:
++ break;
++ case -EINTR:
++ case -EAGAIN:
++ return -EAGAIN;
++ default:
+ MEM_LOG("Error while installing new compat baseptr %lx", mfn);
+- return 0;
++ return rc;
+ }
+
+ invalidate_shadow_ldt(curr, 0);
+ write_ptbase(curr);
+
+- return 1;
++ return 0;
+ }
+ #endif
+- okay = paging_mode_refcounts(d)
+- ? get_page_from_pagenr(mfn, d)
+- : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
+- if ( unlikely(!okay) )
++ rc = put_old_guest_table(curr);
++ if ( unlikely(rc) )
++ return rc;
++
++ old_base_mfn = pagetable_get_pfn(curr->arch.guest_table);
++ /*
++ * This is particularly important when getting restarted after the
++ * previous attempt got preempted in the put-old-MFN phase.
++ */
++ if ( old_base_mfn == mfn )
+ {
+- MEM_LOG("Error while installing new baseptr %lx", mfn);
++ write_ptbase(curr);
+ return 0;
+ }
+
+- invalidate_shadow_ldt(curr, 0);
++ rc = paging_mode_refcounts(d)
++ ? (get_page_from_pagenr(mfn, d) ? 0 : -EINVAL)
++ : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 1);
++ switch ( rc )
++ {
++ case 0:
++ break;
++ case -EINTR:
++ case -EAGAIN:
++ return -EAGAIN;
++ default:
++ MEM_LOG("Error while installing new baseptr %lx", mfn);
++ return rc;
++ }
+
+- old_base_mfn = pagetable_get_pfn(curr->arch.guest_table);
++ invalidate_shadow_ldt(curr, 0);
+
+ curr->arch.guest_table = pagetable_from_pfn(mfn);
+ update_cr3(curr);
+@@ -2859,13 +2994,25 @@ int new_guest_cr3(unsigned long mfn)
+
+ if ( likely(old_base_mfn != 0) )
+ {
++ struct page_info *page = mfn_to_page(old_base_mfn);
++
+ if ( paging_mode_refcounts(d) )
+- put_page(mfn_to_page(old_base_mfn));
++ put_page(page);
+ else
+- put_page_and_type(mfn_to_page(old_base_mfn));
++ switch ( rc = put_page_and_type_preemptible(page, 1) )
++ {
++ case -EINTR:
++ rc = -EAGAIN;
++ case -EAGAIN:
++ curr->arch.old_guest_table = page;
++ break;
++ default:
++ BUG_ON(rc);
++ break;
++ }
+ }
+
+- return 1;
++ return rc;
+ }
+
+ static struct domain *get_pg_owner(domid_t domid)
+@@ -2994,12 +3141,29 @@ long do_mmuext_op(
+ unsigned int foreigndom)
+ {
+ struct mmuext_op op;
+- int rc = 0, i = 0, okay;
+ unsigned long type;
+- unsigned int done = 0;
++ unsigned int i = 0, done = 0;
+ struct vcpu *curr = current;
+ struct domain *d = curr->domain;
+ struct domain *pg_owner;
++ int okay, rc = put_old_guest_table(curr);
++
++ if ( unlikely(rc) )
++ {
++ if ( likely(rc == -EAGAIN) )
++ rc = hypercall_create_continuation(
++ __HYPERVISOR_mmuext_op, "hihi", uops, count, pdone,
++ foreigndom);
++ return rc;
++ }
++
++ if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
++ likely(guest_handle_is_null(uops)) )
++ {
++ /* See the curr->arch.old_guest_table related
++ * hypercall_create_continuation() below. */
++ return (int)foreigndom;
++ }
+
+ if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
+ {
+@@ -3024,7 +3188,7 @@ long do_mmuext_op(
+
+ for ( i = 0; i < count; i++ )
+ {
+- if ( hypercall_preempt_check() )
++ if ( curr->arch.old_guest_table || hypercall_preempt_check() )
+ {
+ rc = -EAGAIN;
+ break;
+@@ -3088,21 +3252,17 @@ long do_mmuext_op(
+ }
+
+ if ( (rc = xsm_memory_pin_page(d, pg_owner, page)) != 0 )
+- {
+- put_page_and_type(page);
+ okay = 0;
+- break;
+- }
+-
+- if ( unlikely(test_and_set_bit(_PGT_pinned,
+- &page->u.inuse.type_info)) )
++ else if ( unlikely(test_and_set_bit(_PGT_pinned,
++ &page->u.inuse.type_info)) )
+ {
+ MEM_LOG("Mfn %lx already pinned", page_to_mfn(page));
+- put_page_and_type(page);
+ okay = 0;
+- break;
+ }
+
++ if ( unlikely(!okay) )
++ goto pin_drop;
++
+ /* A page is dirtied when its pin status is set. */
+ paging_mark_dirty(pg_owner, page_to_mfn(page));
+
+@@ -3116,7 +3276,13 @@ long do_mmuext_op(
+ &page->u.inuse.type_info));
+ spin_unlock(&pg_owner->page_alloc_lock);
+ if ( drop_ref )
+- put_page_and_type(page);
++ {
++ pin_drop:
++ if ( type == PGT_l1_page_table )
++ put_page_and_type(page);
++ else
++ curr->arch.old_guest_table = page;
++ }
+ }
+
+ break;
+@@ -3144,7 +3310,17 @@ long do_mmuext_op(
+ break;
+ }
+
+- put_page_and_type(page);
++ switch ( rc = put_page_and_type_preemptible(page, 1) )
++ {
++ case -EINTR:
++ case -EAGAIN:
++ curr->arch.old_guest_table = page;
++ rc = 0;
++ break;
++ default:
++ BUG_ON(rc);
++ break;
++ }
+ put_page(page);
+
+ /* A page is dirtied when its pin status is cleared. */
+@@ -3154,8 +3330,13 @@ long do_mmuext_op(
+ }
+
+ case MMUEXT_NEW_BASEPTR:
+- okay = (!paging_mode_translate(d)
+- && new_guest_cr3(op.arg1.mfn));
++ if ( paging_mode_translate(d) )
++ okay = 0;
++ else
++ {
++ rc = new_guest_cr3(op.arg1.mfn);
++ okay = !rc;
++ }
+ break;
+
+
+@@ -3169,29 +3350,56 @@ long do_mmuext_op(
+ break;
+ }
+
++ old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
++ /*
++ * This is particularly important when getting restarted after the
++ * previous attempt got preempted in the put-old-MFN phase.
++ */
++ if ( old_mfn == op.arg1.mfn )
++ break;
++
+ if ( op.arg1.mfn != 0 )
+ {
+ if ( paging_mode_refcounts(d) )
+ okay = get_page_from_pagenr(op.arg1.mfn, d);
+ else
+- okay = !get_page_and_type_from_pagenr(
+- op.arg1.mfn, PGT_root_page_table, d, 0, 0);
++ {
++ rc = get_page_and_type_from_pagenr(
++ op.arg1.mfn, PGT_root_page_table, d, 0, 1);
++ okay = !rc;
++ }
+ if ( unlikely(!okay) )
+ {
+- MEM_LOG("Error while installing new mfn %lx", op.arg1.mfn);
++ if ( rc == -EINTR )
++ rc = -EAGAIN;
++ else if ( rc != -EAGAIN )
++ MEM_LOG("Error while installing new mfn %lx",
++ op.arg1.mfn);
+ break;
+ }
+ }
+
+- old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
+ curr->arch.guest_table_user = pagetable_from_pfn(op.arg1.mfn);
+
+ if ( old_mfn != 0 )
+ {
++ struct page_info *page = mfn_to_page(old_mfn);
++
+ if ( paging_mode_refcounts(d) )
+- put_page(mfn_to_page(old_mfn));
++ put_page(page);
+ else
+- put_page_and_type(mfn_to_page(old_mfn));
++ switch ( rc = put_page_and_type_preemptible(page, 1) )
++ {
++ case -EINTR:
++ rc = -EAGAIN;
++ case -EAGAIN:
++ curr->arch.old_guest_table = page;
++ okay = 0;
++ break;
++ default:
++ BUG_ON(rc);
++ break;
++ }
+ }
+
+ break;
+@@ -3433,9 +3641,27 @@ long do_mmuext_op(
+ }
+
+ if ( rc == -EAGAIN )
++ {
++ ASSERT(i < count);
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmuext_op, "hihi",
+ uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
++ }
++ else if ( curr->arch.old_guest_table )
++ {
++ XEN_GUEST_HANDLE(void) null;
++
++ ASSERT(rc || i == count);
++ set_xen_guest_handle(null, NULL);
++ /*
++ * In order to have a way to communicate the final return value to
++ * our continuation, we pass this in place of "foreigndom", building
++ * on the fact that this argument isn't needed anymore.
++ */
++ rc = hypercall_create_continuation(
++ __HYPERVISOR_mmuext_op, "hihi", null,
++ MMU_UPDATE_PREEMPTED, null, rc);
++ }
+
+ put_pg_owner(pg_owner);
+
+@@ -3462,11 +3688,28 @@ long do_mmu_update(
+ void *va;
+ unsigned long gpfn, gmfn, mfn;
+ struct page_info *page;
+- int rc = 0, i = 0;
+- unsigned int cmd, done = 0, pt_dom;
+- struct vcpu *v = current;
++ unsigned int cmd, i = 0, done = 0, pt_dom;
++ struct vcpu *curr = current, *v = curr;
+ struct domain *d = v->domain, *pt_owner = d, *pg_owner;
+ struct domain_mmap_cache mapcache;
++ int rc = put_old_guest_table(curr);
++
++ if ( unlikely(rc) )
++ {
++ if ( likely(rc == -EAGAIN) )
++ rc = hypercall_create_continuation(
++ __HYPERVISOR_mmu_update, "hihi", ureqs, count, pdone,
++ foreigndom);
++ return rc;
++ }
++
++ if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
++ likely(guest_handle_is_null(ureqs)) )
++ {
++ /* See the curr->arch.old_guest_table related
++ * hypercall_create_continuation() below. */
++ return (int)foreigndom;
++ }
+
+ if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
+ {
+@@ -3515,7 +3758,7 @@ long do_mmu_update(
+
+ for ( i = 0; i < count; i++ )
+ {
+- if ( hypercall_preempt_check() )
++ if ( curr->arch.old_guest_table || hypercall_preempt_check() )
+ {
+ rc = -EAGAIN;
+ break;
+@@ -3696,9 +3939,27 @@ long do_mmu_update(
+ }
+
+ if ( rc == -EAGAIN )
++ {
++ ASSERT(i < count);
+ rc = hypercall_create_continuation(
+ __HYPERVISOR_mmu_update, "hihi",
+ ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
++ }
++ else if ( curr->arch.old_guest_table )
++ {
++ XEN_GUEST_HANDLE(void) null;
++
++ ASSERT(rc || i == count);
++ set_xen_guest_handle(null, NULL);
++ /*
++ * In order to have a way to communicate the final return value to
++ * our continuation, we pass this in place of "foreigndom", building
++ * on the fact that this argument isn't needed anymore.
++ */
++ rc = hypercall_create_continuation(
++ __HYPERVISOR_mmu_update, "hihi", null,
++ MMU_UPDATE_PREEMPTED, null, rc);
++ }
+
+ put_pg_owner(pg_owner);
+
+diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
+index 692281a..eada470 100644
+--- a/xen/arch/x86/traps.c
++++ b/xen/arch/x86/traps.c
+@@ -2407,12 +2407,23 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
+ #endif
+ }
+ page = get_page_from_gfn(v->domain, gfn, NULL, P2M_ALLOC);
+- rc = page ? new_guest_cr3(page_to_mfn(page)) : 0;
+ if ( page )
++ {
++ rc = new_guest_cr3(page_to_mfn(page));
+ put_page(page);
++ }
++ else
++ rc = -EINVAL;
+ domain_unlock(v->domain);
+- if ( rc == 0 ) /* not okay */
++ switch ( rc )
++ {
++ case 0:
++ break;
++ case -EAGAIN: /* retry after preemption */
++ goto skip;
++ default: /* not okay */
+ goto fail;
++ }
+ break;
+ }
+
+diff --git a/xen/arch/x86/x86_64/compat/mm.c b/xen/arch/x86/x86_64/compat/mm.c
+index fb7baca..ef7822b 100644
+--- a/xen/arch/x86/x86_64/compat/mm.c
++++ b/xen/arch/x86/x86_64/compat/mm.c
+@@ -268,6 +268,13 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mmuext_op_compat_t) cmp_uops,
+ int rc = 0;
+ XEN_GUEST_HANDLE(mmuext_op_t) nat_ops;
+
++ if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
++ likely(guest_handle_is_null(cmp_uops)) )
++ {
++ set_xen_guest_handle(nat_ops, NULL);
++ return do_mmuext_op(nat_ops, count, pdone, foreigndom);
++ }
++
+ preempt_mask = count & MMU_UPDATE_PREEMPTED;
+ count ^= preempt_mask;
+
+@@ -365,17 +372,23 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mmuext_op_compat_t) cmp_uops,
+ : mcs->call.args[1];
+ unsigned int left = arg1 & ~MMU_UPDATE_PREEMPTED;
+
+- BUG_ON(left == arg1);
++ BUG_ON(left == arg1 && left != i);
+ BUG_ON(left > count);
+ guest_handle_add_offset(nat_ops, i - left);
+ guest_handle_subtract_offset(cmp_uops, left);
+ left = 1;
+- BUG_ON(!hypercall_xlat_continuation(&left, 0x01, nat_ops, cmp_uops));
+- BUG_ON(left != arg1);
+- if (!test_bit(_MCSF_in_multicall, &mcs->flags))
+- regs->_ecx += count - i;
++ if ( arg1 != MMU_UPDATE_PREEMPTED )
++ {
++ BUG_ON(!hypercall_xlat_continuation(&left, 0x01, nat_ops,
++ cmp_uops));
++ if ( !test_bit(_MCSF_in_multicall, &mcs->flags) )
++ regs->_ecx += count - i;
++ else
++ mcs->compat_call.args[1] += count - i;
++ }
+ else
+- mcs->compat_call.args[1] += count - i;
++ BUG_ON(hypercall_xlat_continuation(&left, 0));
++ BUG_ON(left != arg1);
+ }
+ else
+ BUG_ON(err > 0);
+diff --git a/xen/common/compat/domain.c b/xen/common/compat/domain.c
+index 40a0287..9ddaa38 100644
+--- a/xen/common/compat/domain.c
++++ b/xen/common/compat/domain.c
+@@ -50,6 +50,10 @@ int compat_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
+ rc = v->is_initialised ? -EEXIST : arch_set_info_guest(v, cmp_ctxt);
+ domain_unlock(d);
+
++ if ( rc == -EAGAIN )
++ rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih",
++ cmd, vcpuid, arg);
++
+ xfree(cmp_ctxt);
+ break;
+ }
+diff --git a/xen/common/domain.c b/xen/common/domain.c
+index c09fb73..89ab922 100644
+--- a/xen/common/domain.c
++++ b/xen/common/domain.c
+@@ -779,14 +779,18 @@ void domain_unpause_by_systemcontroller(struct domain *d)
+ domain_unpause(d);
+ }
+
+-void vcpu_reset(struct vcpu *v)
++int vcpu_reset(struct vcpu *v)
+ {
+ struct domain *d = v->domain;
++ int rc;
+
+ vcpu_pause(v);
+ domain_lock(d);
+
+- arch_vcpu_reset(v);
++ set_bit(_VPF_in_reset, &v->pause_flags);
++ rc = arch_vcpu_reset(v);
++ if ( rc )
++ goto out_unlock;
+
+ set_bit(_VPF_down, &v->pause_flags);
+
+@@ -802,9 +806,13 @@ void vcpu_reset(struct vcpu *v)
+ #endif
+ cpumask_clear(v->cpu_affinity_tmp);
+ clear_bit(_VPF_blocked, &v->pause_flags);
++ clear_bit(_VPF_in_reset, &v->pause_flags);
+
++ out_unlock:
+ domain_unlock(v->domain);
+ vcpu_unpause(v);
++
++ return rc;
+ }
+
+
+@@ -841,6 +849,11 @@ long do_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
+ domain_unlock(d);
+
+ free_vcpu_guest_context(ctxt);
++
++ if ( rc == -EAGAIN )
++ rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih",
++ cmd, vcpuid, arg);
++
+ break;
+
+ case VCPUOP_up: {
+diff --git a/xen/common/domctl.c b/xen/common/domctl.c
+index cbc8146..b3bfb38 100644
+--- a/xen/common/domctl.c
++++ b/xen/common/domctl.c
+@@ -307,8 +307,10 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
+
+ if ( guest_handle_is_null(op->u.vcpucontext.ctxt) )
+ {
+- vcpu_reset(v);
+- ret = 0;
++ ret = vcpu_reset(v);
++ if ( ret == -EAGAIN )
++ ret = hypercall_create_continuation(
++ __HYPERVISOR_domctl, "h", u_domctl);
+ goto svc_out;
+ }
+
+@@ -337,6 +339,10 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
+ domain_pause(d);
+ ret = arch_set_info_guest(v, c);
+ domain_unpause(d);
++
++ if ( ret == -EAGAIN )
++ ret = hypercall_create_continuation(
++ __HYPERVISOR_domctl, "h", u_domctl);
+ }
+
+ svc_out:
+diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
+index aecee68..898f63a 100644
+--- a/xen/include/asm-x86/domain.h
++++ b/xen/include/asm-x86/domain.h
+@@ -464,6 +464,7 @@ struct arch_vcpu
+ pagetable_t guest_table_user; /* (MFN) x86/64 user-space pagetable */
+ #endif
+ pagetable_t guest_table; /* (MFN) guest notion of cr3 */
++ struct page_info *old_guest_table; /* partially destructed pagetable */
+ /* guest_table holds a ref to the page, and also a type-count unless
+ * shadow refcounts are in use */
+ pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */
+diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
+index ba92568..82cdde6 100644
+--- a/xen/include/asm-x86/mm.h
++++ b/xen/include/asm-x86/mm.h
+@@ -605,6 +605,7 @@ void audit_domains(void);
+ int new_guest_cr3(unsigned long pfn);
+ void make_cr3(struct vcpu *v, unsigned long mfn);
+ void update_cr3(struct vcpu *v);
++int vcpu_destroy_pagetables(struct vcpu *);
+ void propagate_page_fault(unsigned long addr, u16 error_code);
+ void *do_page_walk(struct vcpu *v, unsigned long addr);
+
+diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
+index d4ac50f..504a70f 100644
+--- a/xen/include/xen/domain.h
++++ b/xen/include/xen/domain.h
+@@ -13,7 +13,7 @@ typedef union {
+ struct vcpu *alloc_vcpu(
+ struct domain *d, unsigned int vcpu_id, unsigned int cpu_id);
+ struct vcpu *alloc_dom0_vcpu0(void);
+-void vcpu_reset(struct vcpu *v);
++int vcpu_reset(struct vcpu *);
+
+ struct xen_domctl_getdomaininfo;
+ void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info);
+@@ -67,7 +67,7 @@ void arch_dump_vcpu_info(struct vcpu *v);
+
+ void arch_dump_domain_info(struct domain *d);
+
+-void arch_vcpu_reset(struct vcpu *v);
++int arch_vcpu_reset(struct vcpu *);
+
+ extern spinlock_t vcpu_alloc_lock;
+ bool_t domctl_lock_acquire(void);
+diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
+index b619269..b0715cb 100644
+--- a/xen/include/xen/sched.h
++++ b/xen/include/xen/sched.h
+@@ -644,6 +644,9 @@ static inline struct domain *next_domain_in_cpupool(
+ /* VCPU is blocked due to missing mem_sharing ring. */
+ #define _VPF_mem_sharing 6
+ #define VPF_mem_sharing (1UL<<_VPF_mem_sharing)
++ /* VCPU is being reset. */
++#define _VPF_in_reset 7
++#define VPF_in_reset (1UL<<_VPF_in_reset)
+
+ static inline int vcpu_runnable(struct vcpu *v)
+ {