diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 4009a60..9a34488 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -70,8 +70,6 @@ void (*dead_idle) (void) __read_mostly = default_dead_idle; static void paravirt_ctxt_switch_from(struct vcpu *v); static void paravirt_ctxt_switch_to(struct vcpu *v); -static void vcpu_destroy_pagetables(struct vcpu *v); - static void continue_idle_domain(struct vcpu *v) { reset_stack_and_jump(idle_loop); @@ -678,6 +676,7 @@ int arch_set_info_guest( { struct domain *d = v->domain; unsigned long cr3_pfn = INVALID_MFN; + struct page_info *cr3_page; unsigned long flags, cr4; int i, rc = 0, compat; @@ -817,72 +816,103 @@ int arch_set_info_guest( if ( rc != 0 ) return rc; + set_bit(_VPF_in_reset, &v->pause_flags); + if ( !compat ) - { cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[3])); +#ifdef __x86_64__ + else + cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3])); +#endif + cr3_page = mfn_to_page(cr3_pfn); - if ( !mfn_valid(cr3_pfn) || - (paging_mode_refcounts(d) - ? !get_page(mfn_to_page(cr3_pfn), d) - : !get_page_and_type(mfn_to_page(cr3_pfn), d, - PGT_base_page_table)) ) - { - destroy_gdt(v); - return -EINVAL; - } + if ( !mfn_valid(cr3_pfn) || !get_page(cr3_page, d) ) + { + cr3_page = NULL; + rc = -EINVAL; + } + else if ( paging_mode_refcounts(d) ) + /* nothing */; + else if ( cr3_page == v->arch.old_guest_table ) + { + v->arch.old_guest_table = NULL; + put_page(cr3_page); + } + else + { + /* + * Since v->arch.guest_table{,_user} are both NULL, this effectively + * is just a call to put_old_guest_table(). + */ + if ( !compat ) + rc = vcpu_destroy_pagetables(v); + if ( !rc ) + rc = get_page_type_preemptible(cr3_page, + !compat ? PGT_root_page_table + : PGT_l3_page_table); + if ( rc == -EINTR ) + rc = -EAGAIN; + } + if ( rc ) + /* handled below */; + else if ( !compat ) + { v->arch.guest_table = pagetable_from_pfn(cr3_pfn); #ifdef __x86_64__ if ( c.nat->ctrlreg[1] ) { cr3_pfn = gmfn_to_mfn(d, xen_cr3_to_pfn(c.nat->ctrlreg[1])); + cr3_page = mfn_to_page(cr3_pfn); - if ( !mfn_valid(cr3_pfn) || - (paging_mode_refcounts(d) - ? !get_page(mfn_to_page(cr3_pfn), d) - : !get_page_and_type(mfn_to_page(cr3_pfn), d, - PGT_base_page_table)) ) + if ( !mfn_valid(cr3_pfn) || !get_page(cr3_page, d) ) { - cr3_pfn = pagetable_get_pfn(v->arch.guest_table); - v->arch.guest_table = pagetable_null(); - if ( paging_mode_refcounts(d) ) - put_page(mfn_to_page(cr3_pfn)); - else - put_page_and_type(mfn_to_page(cr3_pfn)); - destroy_gdt(v); - return -EINVAL; + cr3_page = NULL; + rc = -EINVAL; + } + else if ( !paging_mode_refcounts(d) ) + { + rc = get_page_type_preemptible(cr3_page, PGT_root_page_table); + switch ( rc ) + { + case -EINTR: + rc = -EAGAIN; + case -EAGAIN: + v->arch.old_guest_table = + pagetable_get_page(v->arch.guest_table); + v->arch.guest_table = pagetable_null(); + break; + } } - v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn); + if ( !rc ) + v->arch.guest_table_user = pagetable_from_pfn(cr3_pfn); } else if ( !(flags & VGCF_in_kernel) ) { - destroy_gdt(v); - return -EINVAL; + cr3_page = NULL; + rc = -EINVAL; } } else { l4_pgentry_t *l4tab; - cr3_pfn = gmfn_to_mfn(d, compat_cr3_to_pfn(c.cmp->ctrlreg[3])); - - if ( !mfn_valid(cr3_pfn) || - (paging_mode_refcounts(d) - ? !get_page(mfn_to_page(cr3_pfn), d) - : !get_page_and_type(mfn_to_page(cr3_pfn), d, - PGT_l3_page_table)) ) - { - destroy_gdt(v); - return -EINVAL; - } - l4tab = __va(pagetable_get_paddr(v->arch.guest_table)); *l4tab = l4e_from_pfn( cr3_pfn, _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED); #endif } + if ( rc ) + { + if ( cr3_page ) + put_page(cr3_page); + destroy_gdt(v); + return rc; + } + + clear_bit(_VPF_in_reset, &v->pause_flags); if ( v->vcpu_id == 0 ) update_domain_wallclock_time(d); @@ -904,17 +934,16 @@ int arch_set_info_guest( #undef c } -void arch_vcpu_reset(struct vcpu *v) +int arch_vcpu_reset(struct vcpu *v) { if ( !is_hvm_vcpu(v) ) { destroy_gdt(v); - vcpu_destroy_pagetables(v); - } - else - { - vcpu_end_shutdown_deferral(v); + return vcpu_destroy_pagetables(v); } + + vcpu_end_shutdown_deferral(v); + return 0; } /* @@ -1917,63 +1946,6 @@ static int relinquish_memory( return ret; } -static void vcpu_destroy_pagetables(struct vcpu *v) -{ - struct domain *d = v->domain; - unsigned long pfn; - -#ifdef __x86_64__ - if ( is_pv_32on64_vcpu(v) ) - { - pfn = l4e_get_pfn(*(l4_pgentry_t *) - __va(pagetable_get_paddr(v->arch.guest_table))); - - if ( pfn != 0 ) - { - if ( paging_mode_refcounts(d) ) - put_page(mfn_to_page(pfn)); - else - put_page_and_type(mfn_to_page(pfn)); - } - - l4e_write( - (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)), - l4e_empty()); - - v->arch.cr3 = 0; - return; - } -#endif - - pfn = pagetable_get_pfn(v->arch.guest_table); - if ( pfn != 0 ) - { - if ( paging_mode_refcounts(d) ) - put_page(mfn_to_page(pfn)); - else - put_page_and_type(mfn_to_page(pfn)); - v->arch.guest_table = pagetable_null(); - } - -#ifdef __x86_64__ - /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */ - pfn = pagetable_get_pfn(v->arch.guest_table_user); - if ( pfn != 0 ) - { - if ( !is_pv_32bit_vcpu(v) ) - { - if ( paging_mode_refcounts(d) ) - put_page(mfn_to_page(pfn)); - else - put_page_and_type(mfn_to_page(pfn)); - } - v->arch.guest_table_user = pagetable_null(); - } -#endif - - v->arch.cr3 = 0; -} - int domain_relinquish_resources(struct domain *d) { int ret; @@ -1992,7 +1964,9 @@ int domain_relinquish_resources(struct domain *d) for_each_vcpu ( d, v ) { /* Drop the in-use references to page-table bases. */ - vcpu_destroy_pagetables(v); + ret = vcpu_destroy_pagetables(v); + if ( ret ) + return ret; /* * Relinquish GDT mappings. No need for explicit unmapping of the diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index 9f53728..140e70c 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -3083,8 +3083,11 @@ static void hvm_s3_suspend(struct domain *d) for_each_vcpu ( d, v ) { + int rc; + vlapic_reset(vcpu_vlapic(v)); - vcpu_reset(v); + rc = vcpu_reset(v); + ASSERT(!rc); } vpic_reset(d); diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c index 3af41cc..8d47bd0 100644 --- a/xen/arch/x86/hvm/vlapic.c +++ b/xen/arch/x86/hvm/vlapic.c @@ -252,10 +252,13 @@ static void vlapic_init_sipi_action(unsigned long _vcpu) { case APIC_DM_INIT: { bool_t fpu_initialised; + int rc; + domain_lock(target->domain); /* Reset necessary VCPU state. This does not include FPU state. */ fpu_initialised = target->fpu_initialised; - vcpu_reset(target); + rc = vcpu_reset(target); + ASSERT(!rc); target->fpu_initialised = fpu_initialised; vlapic_reset(vcpu_vlapic(target)); domain_unlock(target->domain); diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 30d281d..ceeb998 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -1182,7 +1182,16 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, #endif if ( unlikely(partial > 0) ) + { + ASSERT(preemptible >= 0); return __put_page_type(l3e_get_page(l3e), preemptible); + } + + if ( preemptible < 0 ) + { + current->arch.old_guest_table = l3e_get_page(l3e); + return 0; + } return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible); } @@ -1195,7 +1204,17 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, (l4e_get_pfn(l4e) != pfn) ) { if ( unlikely(partial > 0) ) + { + ASSERT(preemptible >= 0); return __put_page_type(l4e_get_page(l4e), preemptible); + } + + if ( preemptible < 0 ) + { + current->arch.old_guest_table = l4e_get_page(l4e); + return 0; + } + return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible); } return 1; @@ -1485,12 +1504,17 @@ static int alloc_l3_table(struct page_info *page, int preemptible) if ( rc < 0 && rc != -EAGAIN && rc != -EINTR ) { MEM_LOG("Failure in alloc_l3_table: entry %d", i); + if ( i ) + { + page->nr_validated_ptes = i; + page->partial_pte = 0; + current->arch.old_guest_table = page; + } while ( i-- > 0 ) { if ( !is_guest_l3_slot(i) ) continue; unadjust_guest_l3e(pl3e[i], d); - put_page_from_l3e(pl3e[i], pfn, 0, 0); } } @@ -1520,22 +1544,24 @@ static int alloc_l4_table(struct page_info *page, int preemptible) page->nr_validated_ptes = i; page->partial_pte = partial ?: 1; } - else if ( rc == -EINTR ) + else if ( rc < 0 ) { + if ( rc != -EINTR ) + MEM_LOG("Failure in alloc_l4_table: entry %d", i); if ( i ) { page->nr_validated_ptes = i; page->partial_pte = 0; - rc = -EAGAIN; + if ( rc == -EINTR ) + rc = -EAGAIN; + else + { + if ( current->arch.old_guest_table ) + page->nr_validated_ptes++; + current->arch.old_guest_table = page; + } } } - else if ( rc < 0 ) - { - MEM_LOG("Failure in alloc_l4_table: entry %d", i); - while ( i-- > 0 ) - if ( is_guest_l4_slot(d, i) ) - put_page_from_l4e(pl4e[i], pfn, 0, 0); - } if ( rc < 0 ) return rc; @@ -1965,7 +1991,7 @@ static int mod_l3_entry(l3_pgentry_t *pl3e, pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e); } - put_page_from_l3e(ol3e, pfn, 0, 0); + put_page_from_l3e(ol3e, pfn, 0, -preemptible); return rc; } @@ -2028,7 +2054,7 @@ static int mod_l4_entry(l4_pgentry_t *pl4e, return -EFAULT; } - put_page_from_l4e(ol4e, pfn, 0, 0); + put_page_from_l4e(ol4e, pfn, 0, -preemptible); return rc; } @@ -2186,7 +2212,15 @@ static int alloc_page_type(struct page_info *page, unsigned long type, PRtype_info ": caf=%08lx taf=%" PRtype_info, page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)), type, page->count_info, page->u.inuse.type_info); - page->u.inuse.type_info = 0; + if ( page != current->arch.old_guest_table ) + page->u.inuse.type_info = 0; + else + { + ASSERT((page->u.inuse.type_info & + (PGT_count_mask | PGT_validated)) == 1); + get_page_light(page); + page->u.inuse.type_info |= PGT_partial; + } } else { @@ -2724,49 +2758,150 @@ static void put_superpage(unsigned long mfn) #endif +static int put_old_guest_table(struct vcpu *v) +{ + int rc; + + if ( !v->arch.old_guest_table ) + return 0; + + switch ( rc = put_page_and_type_preemptible(v->arch.old_guest_table, 1) ) + { + case -EINTR: + case -EAGAIN: + return -EAGAIN; + } + + v->arch.old_guest_table = NULL; + + return rc; +} + +int vcpu_destroy_pagetables(struct vcpu *v) +{ + unsigned long mfn = pagetable_get_pfn(v->arch.guest_table); + struct page_info *page; + int rc = put_old_guest_table(v); + + if ( rc ) + return rc; + +#ifdef __x86_64__ + if ( is_pv_32on64_vcpu(v) ) + mfn = l4e_get_pfn(*(l4_pgentry_t *)mfn_to_virt(mfn)); +#endif + + if ( mfn ) + { + page = mfn_to_page(mfn); + if ( paging_mode_refcounts(v->domain) ) + put_page(page); + else + rc = put_page_and_type_preemptible(page, 1); + } + +#ifdef __x86_64__ + if ( is_pv_32on64_vcpu(v) ) + { + if ( !rc ) + l4e_write( + (l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table)), + l4e_empty()); + } + else +#endif + if ( !rc ) + { + v->arch.guest_table = pagetable_null(); + +#ifdef __x86_64__ + /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */ + mfn = pagetable_get_pfn(v->arch.guest_table_user); + if ( mfn ) + { + page = mfn_to_page(mfn); + if ( paging_mode_refcounts(v->domain) ) + put_page(page); + else + rc = put_page_and_type_preemptible(page, 1); + } + if ( !rc ) + v->arch.guest_table_user = pagetable_null(); +#endif + } + + v->arch.cr3 = 0; + + return rc; +} int new_guest_cr3(unsigned long mfn) { struct vcpu *curr = current; struct domain *d = curr->domain; - int okay; + int rc; unsigned long old_base_mfn; #ifdef __x86_64__ if ( is_pv_32on64_domain(d) ) { - okay = paging_mode_refcounts(d) - ? 0 /* Old code was broken, but what should it be? */ - : mod_l4_entry( + rc = paging_mode_refcounts(d) + ? -EINVAL /* Old code was broken, but what should it be? */ + : mod_l4_entry( __va(pagetable_get_paddr(curr->arch.guest_table)), l4e_from_pfn( mfn, (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)), - pagetable_get_pfn(curr->arch.guest_table), 0, 0, curr) == 0; - if ( unlikely(!okay) ) + pagetable_get_pfn(curr->arch.guest_table), 0, 1, curr); + switch ( rc ) { + case 0: + break; + case -EINTR: + case -EAGAIN: + return -EAGAIN; + default: MEM_LOG("Error while installing new compat baseptr %lx", mfn); - return 0; + return rc; } invalidate_shadow_ldt(curr, 0); write_ptbase(curr); - return 1; + return 0; } #endif - okay = paging_mode_refcounts(d) - ? get_page_from_pagenr(mfn, d) - : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0); - if ( unlikely(!okay) ) + rc = put_old_guest_table(curr); + if ( unlikely(rc) ) + return rc; + + old_base_mfn = pagetable_get_pfn(curr->arch.guest_table); + /* + * This is particularly important when getting restarted after the + * previous attempt got preempted in the put-old-MFN phase. + */ + if ( old_base_mfn == mfn ) { - MEM_LOG("Error while installing new baseptr %lx", mfn); + write_ptbase(curr); return 0; } - invalidate_shadow_ldt(curr, 0); + rc = paging_mode_refcounts(d) + ? (get_page_from_pagenr(mfn, d) ? 0 : -EINVAL) + : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 1); + switch ( rc ) + { + case 0: + break; + case -EINTR: + case -EAGAIN: + return -EAGAIN; + default: + MEM_LOG("Error while installing new baseptr %lx", mfn); + return rc; + } - old_base_mfn = pagetable_get_pfn(curr->arch.guest_table); + invalidate_shadow_ldt(curr, 0); curr->arch.guest_table = pagetable_from_pfn(mfn); update_cr3(curr); @@ -2775,13 +2910,25 @@ int new_guest_cr3(unsigned long mfn) if ( likely(old_base_mfn != 0) ) { + struct page_info *page = mfn_to_page(old_base_mfn); + if ( paging_mode_refcounts(d) ) - put_page(mfn_to_page(old_base_mfn)); + put_page(page); else - put_page_and_type(mfn_to_page(old_base_mfn)); + switch ( rc = put_page_and_type_preemptible(page, 1) ) + { + case -EINTR: + rc = -EAGAIN; + case -EAGAIN: + curr->arch.old_guest_table = page; + break; + default: + BUG_ON(rc); + break; + } } - return 1; + return rc; } static struct domain *get_pg_owner(domid_t domid) @@ -2910,12 +3057,29 @@ long do_mmuext_op( unsigned int foreigndom) { struct mmuext_op op; - int rc = 0, i = 0, okay; unsigned long type; - unsigned int done = 0; + unsigned int i = 0, done = 0; struct vcpu *curr = current; struct domain *d = curr->domain; struct domain *pg_owner; + int okay, rc = put_old_guest_table(curr); + + if ( unlikely(rc) ) + { + if ( likely(rc == -EAGAIN) ) + rc = hypercall_create_continuation( + __HYPERVISOR_mmuext_op, "hihi", uops, count, pdone, + foreigndom); + return rc; + } + + if ( unlikely(count == MMU_UPDATE_PREEMPTED) && + likely(guest_handle_is_null(uops)) ) + { + /* See the curr->arch.old_guest_table related + * hypercall_create_continuation() below. */ + return (int)foreigndom; + } if ( unlikely(count & MMU_UPDATE_PREEMPTED) ) { @@ -2940,7 +3104,7 @@ long do_mmuext_op( for ( i = 0; i < count; i++ ) { - if ( hypercall_preempt_check() ) + if ( curr->arch.old_guest_table || hypercall_preempt_check() ) { rc = -EAGAIN; break; @@ -3000,21 +3164,17 @@ long do_mmuext_op( page = mfn_to_page(mfn); if ( (rc = xsm_memory_pin_page(d, page)) != 0 ) - { - put_page_and_type(page); okay = 0; - break; - } - - if ( unlikely(test_and_set_bit(_PGT_pinned, - &page->u.inuse.type_info)) ) + else if ( unlikely(test_and_set_bit(_PGT_pinned, + &page->u.inuse.type_info)) ) { MEM_LOG("Mfn %lx already pinned", mfn); - put_page_and_type(page); okay = 0; - break; } + if ( unlikely(!okay) ) + goto pin_drop; + /* A page is dirtied when its pin status is set. */ paging_mark_dirty(pg_owner, mfn); @@ -3028,7 +3188,13 @@ long do_mmuext_op( &page->u.inuse.type_info)); spin_unlock(&pg_owner->page_alloc_lock); if ( drop_ref ) - put_page_and_type(page); + { + pin_drop: + if ( type == PGT_l1_page_table ) + put_page_and_type(page); + else + curr->arch.old_guest_table = page; + } } break; @@ -3058,7 +3224,17 @@ long do_mmuext_op( break; } - put_page_and_type(page); + switch ( rc = put_page_and_type_preemptible(page, 1) ) + { + case -EINTR: + case -EAGAIN: + curr->arch.old_guest_table = page; + rc = 0; + break; + default: + BUG_ON(rc); + break; + } put_page(page); /* A page is dirtied when its pin status is cleared. */ @@ -3068,7 +3244,8 @@ long do_mmuext_op( } case MMUEXT_NEW_BASEPTR: - okay = new_guest_cr3(gmfn_to_mfn(d, op.arg1.mfn)); + rc = new_guest_cr3(gmfn_to_mfn(d, op.arg1.mfn)); + okay = !rc; break; #ifdef __x86_64__ @@ -3076,29 +3253,55 @@ long do_mmuext_op( unsigned long old_mfn, mfn; mfn = gmfn_to_mfn(d, op.arg1.mfn); + old_mfn = pagetable_get_pfn(curr->arch.guest_table_user); + /* + * This is particularly important when getting restarted after the + * previous attempt got preempted in the put-old-MFN phase. + */ + if ( old_mfn == mfn ) + break; + if ( mfn != 0 ) { if ( paging_mode_refcounts(d) ) okay = get_page_from_pagenr(mfn, d); else - okay = !get_page_and_type_from_pagenr( - mfn, PGT_root_page_table, d, 0, 0); + { + rc = get_page_and_type_from_pagenr( + mfn, PGT_root_page_table, d, 0, 1); + okay = !rc; + } if ( unlikely(!okay) ) { - MEM_LOG("Error while installing new mfn %lx", mfn); + if ( rc == -EINTR ) + rc = -EAGAIN; + else if ( rc != -EAGAIN ) + MEM_LOG("Error while installing new mfn %lx", mfn); break; } } - old_mfn = pagetable_get_pfn(curr->arch.guest_table_user); curr->arch.guest_table_user = pagetable_from_pfn(mfn); if ( old_mfn != 0 ) { + struct page_info *page = mfn_to_page(old_mfn); + if ( paging_mode_refcounts(d) ) - put_page(mfn_to_page(old_mfn)); + put_page(page); else - put_page_and_type(mfn_to_page(old_mfn)); + switch ( rc = put_page_and_type_preemptible(page, 1) ) + { + case -EINTR: + rc = -EAGAIN; + case -EAGAIN: + curr->arch.old_guest_table = page; + okay = 0; + break; + default: + BUG_ON(rc); + break; + } } break; @@ -3337,9 +3540,27 @@ long do_mmuext_op( } if ( rc == -EAGAIN ) + { + ASSERT(i < count); rc = hypercall_create_continuation( __HYPERVISOR_mmuext_op, "hihi", uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); + } + else if ( curr->arch.old_guest_table ) + { + XEN_GUEST_HANDLE(void) null; + + ASSERT(rc || i == count); + set_xen_guest_handle(null, NULL); + /* + * In order to have a way to communicate the final return value to + * our continuation, we pass this in place of "foreigndom", building + * on the fact that this argument isn't needed anymore. + */ + rc = hypercall_create_continuation( + __HYPERVISOR_mmuext_op, "hihi", null, + MMU_UPDATE_PREEMPTED, null, rc); + } put_pg_owner(pg_owner); @@ -3366,11 +3587,28 @@ long do_mmu_update( void *va; unsigned long gpfn, gmfn, mfn; struct page_info *page; - int rc = 0, okay = 1, i = 0; - unsigned int cmd, done = 0, pt_dom; - struct vcpu *v = current; + unsigned int cmd, i = 0, done = 0, pt_dom; + struct vcpu *curr = current, *v = curr; struct domain *d = v->domain, *pt_owner = d, *pg_owner; struct domain_mmap_cache mapcache; + int rc = put_old_guest_table(curr), okay = 1; + + if ( unlikely(rc) ) + { + if ( likely(rc == -EAGAIN) ) + rc = hypercall_create_continuation( + __HYPERVISOR_mmu_update, "hihi", ureqs, count, pdone, + foreigndom); + return rc; + } + + if ( unlikely(count == MMU_UPDATE_PREEMPTED) && + likely(guest_handle_is_null(ureqs)) ) + { + /* See the curr->arch.old_guest_table related + * hypercall_create_continuation() below. */ + return (int)foreigndom; + } if ( unlikely(count & MMU_UPDATE_PREEMPTED) ) { @@ -3419,7 +3657,7 @@ long do_mmu_update( for ( i = 0; i < count; i++ ) { - if ( hypercall_preempt_check() ) + if ( curr->arch.old_guest_table || hypercall_preempt_check() ) { rc = -EAGAIN; break; @@ -3684,9 +3922,27 @@ long do_mmu_update( } if ( rc == -EAGAIN ) + { + ASSERT(i < count); rc = hypercall_create_continuation( __HYPERVISOR_mmu_update, "hihi", ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); + } + else if ( curr->arch.old_guest_table ) + { + XEN_GUEST_HANDLE(void) null; + + ASSERT(rc || i == count); + set_xen_guest_handle(null, NULL); + /* + * In order to have a way to communicate the final return value to + * our continuation, we pass this in place of "foreigndom", building + * on the fact that this argument isn't needed anymore. + */ + rc = hypercall_create_continuation( + __HYPERVISOR_mmu_update, "hihi", null, + MMU_UPDATE_PREEMPTED, null, rc); + } put_pg_owner(pg_owner); diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index 234d9ac..e336439 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -2317,8 +2317,15 @@ static int emulate_privileged_op(struct cpu_user_regs *regs) rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg))); #endif domain_unlock(v->domain); - if ( rc == 0 ) /* not okay */ + switch ( rc ) + { + case 0: + break; + case -EAGAIN: /* retry after preemption */ + goto skip; + default: /* not okay */ goto fail; + } break; case 4: /* Write CR4 */ diff --git a/xen/arch/x86/x86_64/compat/mm.c b/xen/arch/x86/x86_64/compat/mm.c index 3ef08a5..6ad41d4 100644 --- a/xen/arch/x86/x86_64/compat/mm.c +++ b/xen/arch/x86/x86_64/compat/mm.c @@ -222,6 +222,13 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mmuext_op_compat_t) cmp_uops, int rc = 0; XEN_GUEST_HANDLE(mmuext_op_t) nat_ops; + if ( unlikely(count == MMU_UPDATE_PREEMPTED) && + likely(guest_handle_is_null(cmp_uops)) ) + { + set_xen_guest_handle(nat_ops, NULL); + return do_mmuext_op(nat_ops, count, pdone, foreigndom); + } + preempt_mask = count & MMU_UPDATE_PREEMPTED; count ^= preempt_mask; @@ -319,17 +326,23 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mmuext_op_compat_t) cmp_uops, : mcs->call.args[1]; unsigned int left = arg1 & ~MMU_UPDATE_PREEMPTED; - BUG_ON(left == arg1); + BUG_ON(left == arg1 && left != i); BUG_ON(left > count); guest_handle_add_offset(nat_ops, i - left); guest_handle_subtract_offset(cmp_uops, left); left = 1; - BUG_ON(!hypercall_xlat_continuation(&left, 0x01, nat_ops, cmp_uops)); - BUG_ON(left != arg1); - if (!test_bit(_MCSF_in_multicall, &mcs->flags)) - regs->_ecx += count - i; + if ( arg1 != MMU_UPDATE_PREEMPTED ) + { + BUG_ON(!hypercall_xlat_continuation(&left, 0x01, nat_ops, + cmp_uops)); + if ( !test_bit(_MCSF_in_multicall, &mcs->flags) ) + regs->_ecx += count - i; + else + mcs->compat_call.args[1] += count - i; + } else - mcs->compat_call.args[1] += count - i; + BUG_ON(hypercall_xlat_continuation(&left, 0)); + BUG_ON(left != arg1); } else BUG_ON(err > 0); diff --git a/xen/common/compat/domain.c b/xen/common/compat/domain.c index 67e0e5e..5fe393f 100644 --- a/xen/common/compat/domain.c +++ b/xen/common/compat/domain.c @@ -52,6 +52,10 @@ int compat_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg) rc = boot_vcpu(d, vcpuid, cmp_ctxt); domain_unlock(d); + if ( rc == -EAGAIN ) + rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih", + cmd, vcpuid, arg); + xfree(cmp_ctxt); break; } diff --git a/xen/common/domain.c b/xen/common/domain.c index 054f7c4..5fa045b 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -770,14 +770,18 @@ int boot_vcpu(struct domain *d, int vcpuid, vcpu_guest_context_u ctxt) return arch_set_info_guest(v, ctxt); } -void vcpu_reset(struct vcpu *v) +int vcpu_reset(struct vcpu *v) { struct domain *d = v->domain; + int rc; vcpu_pause(v); domain_lock(d); - arch_vcpu_reset(v); + set_bit(_VPF_in_reset, &v->pause_flags); + rc = arch_vcpu_reset(v); + if ( rc ) + goto out_unlock; set_bit(_VPF_down, &v->pause_flags); @@ -793,9 +797,13 @@ void vcpu_reset(struct vcpu *v) #endif cpus_clear(v->cpu_affinity_tmp); clear_bit(_VPF_blocked, &v->pause_flags); + clear_bit(_VPF_in_reset, &v->pause_flags); + out_unlock: domain_unlock(v->domain); vcpu_unpause(v); + + return rc; } @@ -834,6 +842,11 @@ long do_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg) domain_unlock(d); xfree(ctxt); + + if ( rc == -EAGAIN ) + rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih", + cmd, vcpuid, arg); + break; case VCPUOP_up: diff --git a/xen/common/domctl.c b/xen/common/domctl.c index 981cb1a..faac366 100644 --- a/xen/common/domctl.c +++ b/xen/common/domctl.c @@ -286,8 +286,10 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domctl_t) u_domctl) if ( guest_handle_is_null(op->u.vcpucontext.ctxt) ) { - vcpu_reset(v); - ret = 0; + ret = vcpu_reset(v); + if ( ret == -EAGAIN ) + ret = hypercall_create_continuation( + __HYPERVISOR_domctl, "h", u_domctl); goto svc_out; } @@ -316,6 +318,10 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domctl_t) u_domctl) domain_pause(d); ret = arch_set_info_guest(v, c); domain_unpause(d); + + if ( ret == -EAGAIN ) + ret = hypercall_create_continuation( + __HYPERVISOR_domctl, "h", u_domctl); } svc_out: diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index fe1459d..a387862 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -405,6 +405,7 @@ struct arch_vcpu pagetable_t guest_table_user; /* (MFN) x86/64 user-space pagetable */ #endif pagetable_t guest_table; /* (MFN) guest notion of cr3 */ + struct page_info *old_guest_table; /* partially destructed pagetable */ /* guest_table holds a ref to the page, and also a type-count unless * shadow refcounts are in use */ pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */ diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index c93a022..2498007 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -554,6 +554,7 @@ void audit_domains(void); int new_guest_cr3(unsigned long pfn); void make_cr3(struct vcpu *v, unsigned long mfn); void update_cr3(struct vcpu *v); +int vcpu_destroy_pagetables(struct vcpu *); void propagate_page_fault(unsigned long addr, u16 error_code); void *do_page_walk(struct vcpu *v, unsigned long addr); diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h index edffd1f..5175ef7 100644 --- a/xen/include/xen/domain.h +++ b/xen/include/xen/domain.h @@ -15,7 +15,7 @@ struct vcpu *alloc_vcpu( int boot_vcpu( struct domain *d, int vcpuid, vcpu_guest_context_u ctxt); struct vcpu *alloc_dom0_vcpu0(void); -void vcpu_reset(struct vcpu *v); +int vcpu_reset(struct vcpu *); struct xen_domctl_getdomaininfo; void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info); @@ -57,7 +57,7 @@ void arch_dump_vcpu_info(struct vcpu *v); void arch_dump_domain_info(struct domain *d); -void arch_vcpu_reset(struct vcpu *v); +int arch_vcpu_reset(struct vcpu *); bool_t domctl_lock_acquire(void); void domctl_lock_release(void); diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h index 35c3a7f..c04b25d 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -592,6 +592,9 @@ extern struct domain *domain_list; /* VCPU is blocked on memory-event ring. */ #define _VPF_mem_event 4 #define VPF_mem_event (1UL<<_VPF_mem_event) + /* VCPU is being reset. */ +#define _VPF_in_reset 7 +#define VPF_in_reset (1UL<<_VPF_in_reset) static inline int vcpu_runnable(struct vcpu *v) {