From 867988237d3e472fe2c99e81ae733e103422566c Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Thu, 28 Sep 2017 15:17:25 +0100
Subject: [PATCH 1/2] x86: limit linear page table use to a single level

That's the only way that they're meant to be used. Without such a
restriction arbitrarily long chains of same-level page tables can be
built, tearing down of which may then cause arbitrarily deep recursion,
causing a stack overflow.

To facilitate this restriction, a counter is being introduced to track
both the number of same-level entries in a page table as well as the
number of uses of a page table in another same-level one (counting into
positive and negative direction respectively, utilizing the fact that
both counts can't be non-zero at the same time).

Note that the added accounting introduces a restriction on the number of
times a page can be used in other same-level page tables - more than
32k of such uses are no longer possible.

Note also that some put_page_and_type[_preemptible]() calls are replaced
with open-coded equivalents. This seemed preferable to adding
"parent_table" to the matrix of functions.

Note further that cross-domain same-level page table references are no
longer permitted (they probably never should have been).

This is XSA-240.
Reported-by: Jann Horn Signed-off-by: Jan Beulich Signed-off-by: George Dunlap --- xen/arch/x86/domain.c | 1 + xen/arch/x86/mm.c | 171 ++++++++++++++++++++++++++++++++++++++----- xen/include/asm-x86/domain.h | 2 + xen/include/asm-x86/mm.h | 25 +++++-- 4 files changed, 175 insertions(+), 24 deletions(-) diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index d7e699228c..d7ed72c246 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -1226,6 +1226,7 @@ int arch_set_info_guest( rc = -ERESTART; /* Fallthrough */ case -ERESTART: + v->arch.old_guest_ptpg = NULL; v->arch.old_guest_table = pagetable_get_page(v->arch.guest_table); v->arch.guest_table = pagetable_null(); diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 86f5eda52d..1e469bd354 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -747,6 +747,61 @@ static void put_data_page( put_page(page); } +static bool inc_linear_entries(struct page_info *pg) +{ + typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc; + + do { + /* + * The check below checks for the "linear use" count being non-zero + * as well as overflow. Signed integer overflow is undefined behavior + * according to the C spec. However, as long as linear_pt_count is + * smaller in size than 'int', the arithmetic operation of the + * increment below won't overflow; rather the result will be truncated + * when stored. Ensure that this is always true. + */ + BUILD_BUG_ON(sizeof(nc) >= sizeof(int)); + oc = nc++; + if ( nc <= 0 ) + return false; + nc = cmpxchg(&pg->linear_pt_count, oc, nc); + } while ( oc != nc ); + + return true; +} + +static void dec_linear_entries(struct page_info *pg) +{ + typeof(pg->linear_pt_count) oc; + + oc = arch_fetch_and_add(&pg->linear_pt_count, -1); + ASSERT(oc > 0); +} + +static bool inc_linear_uses(struct page_info *pg) +{ + typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc; + + do { + /* See the respective comment in inc_linear_entries(). 
*/ + BUILD_BUG_ON(sizeof(nc) >= sizeof(int)); + oc = nc--; + if ( nc >= 0 ) + return false; + nc = cmpxchg(&pg->linear_pt_count, oc, nc); + } while ( oc != nc ); + + return true; +} + +static void dec_linear_uses(struct page_info *pg) +{ + typeof(pg->linear_pt_count) oc; + + oc = arch_fetch_and_add(&pg->linear_pt_count, 1); + ASSERT(oc < 0); +} + /* * We allow root tables to map each other (a.k.a. linear page tables). It * needs some special care with reference counts and access permissions: @@ -777,15 +832,35 @@ get_##level##_linear_pagetable( \ \ if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \ { \ + struct page_info *ptpg = mfn_to_page(pde_pfn); \ + \ + /* Make sure the page table belongs to the correct domain. */ \ + if ( unlikely(page_get_owner(ptpg) != d) ) \ + return 0; \ + \ /* Make sure the mapped frame belongs to the correct domain. */ \ if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \ return 0; \ \ /* \ - * Ensure that the mapped frame is an already-validated page table. \ + * Ensure that the mapped frame is an already-validated page table \ + * and is not itself having linear entries, as well as that the \ + * containing page table is not itself in use as a linear page table \ + * elsewhere. \ * If so, atomically increment the count (checking for overflow). 
\ */ \ page = mfn_to_page(pfn); \ + if ( !inc_linear_entries(ptpg) ) \ + { \ + put_page(page); \ + return 0; \ + } \ + if ( !inc_linear_uses(page) ) \ + { \ + dec_linear_entries(ptpg); \ + put_page(page); \ + return 0; \ + } \ y = page->u.inuse.type_info; \ do { \ x = y; \ @@ -793,6 +868,8 @@ get_##level##_linear_pagetable( \ unlikely((x & (PGT_type_mask|PGT_validated)) != \ (PGT_##level##_page_table|PGT_validated)) ) \ { \ + dec_linear_uses(page); \ + dec_linear_entries(ptpg); \ put_page(page); \ return 0; \ } \ @@ -1226,6 +1303,9 @@ get_page_from_l4e( l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \ } while ( 0 ) +static int _put_page_type(struct page_info *page, bool preemptible, + struct page_info *ptpg); + void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner) { unsigned long pfn = l1e_get_pfn(l1e); @@ -1296,17 +1376,22 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) if ( l2e_get_flags(l2e) & _PAGE_PSE ) put_superpage(l2e_get_pfn(l2e)); else - put_page_and_type(l2e_get_page(l2e)); + { + struct page_info *pg = l2e_get_page(l2e); + int rc = _put_page_type(pg, false, mfn_to_page(pfn)); + + ASSERT(!rc); + put_page(pg); + } return 0; } -static int __put_page_type(struct page_info *, int preemptible); - static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, int partial, bool_t defer) { struct page_info *pg; + int rc; if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) ) return 1; @@ -1329,21 +1414,28 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, if ( unlikely(partial > 0) ) { ASSERT(!defer); - return __put_page_type(pg, 1); + return _put_page_type(pg, true, mfn_to_page(pfn)); } if ( defer ) { + current->arch.old_guest_ptpg = mfn_to_page(pfn); current->arch.old_guest_table = pg; return 0; } - return put_page_and_type_preemptible(pg); + rc = _put_page_type(pg, true, mfn_to_page(pfn)); + if ( likely(!rc) ) + put_page(pg); + + return rc; } static int 
put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, int partial, bool_t defer) { + int rc = 1; + if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && (l4e_get_pfn(l4e) != pfn) ) { @@ -1352,18 +1444,22 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, if ( unlikely(partial > 0) ) { ASSERT(!defer); - return __put_page_type(pg, 1); + return _put_page_type(pg, true, mfn_to_page(pfn)); } if ( defer ) { + current->arch.old_guest_ptpg = mfn_to_page(pfn); current->arch.old_guest_table = pg; return 0; } - return put_page_and_type_preemptible(pg); + rc = _put_page_type(pg, true, mfn_to_page(pfn)); + if ( likely(!rc) ) + put_page(pg); } - return 1; + + return rc; } static int alloc_l1_table(struct page_info *page) @@ -1561,6 +1657,7 @@ static int alloc_l3_table(struct page_info *page) { page->nr_validated_ptes = i; page->partial_pte = 0; + current->arch.old_guest_ptpg = NULL; current->arch.old_guest_table = page; } while ( i-- > 0 ) @@ -1654,6 +1751,7 @@ static int alloc_l4_table(struct page_info *page) { if ( current->arch.old_guest_table ) page->nr_validated_ptes++; + current->arch.old_guest_ptpg = NULL; current->arch.old_guest_table = page; } } @@ -2403,14 +2501,20 @@ int free_page_type(struct page_info *pag } -static int __put_final_page_type( - struct page_info *page, unsigned long type, int preemptible) +static int _put_final_page_type(struct page_info *page, unsigned long type, + bool preemptible, struct page_info *ptpg) { int rc = free_page_type(page, type, preemptible); /* No need for atomic update of type_info here: noone else updates it. */ if ( rc == 0 ) { + if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) ) + { + dec_linear_uses(page); + dec_linear_entries(ptpg); + } + ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying); /* * Record TLB information for flush later. 
We do not stamp page tables * when running in shadow mode: @@ -2446,8 +2550,8 @@ static int __put_final_page_type( } -static int __put_page_type(struct page_info *page, - int preemptible) +static int _put_page_type(struct page_info *page, bool preemptible, + struct page_info *ptpg) { unsigned long nx, x, y = page->u.inuse.type_info; int rc = 0; @@ -2474,12 +2578,28 @@ static int __put_page_type(struct page_info *page, x, nx)) != x) ) continue; /* We cleared the 'valid bit' so we do the clean up. */ - rc = __put_final_page_type(page, x, preemptible); + rc = _put_final_page_type(page, x, preemptible, ptpg); + ptpg = NULL; if ( x & PGT_partial ) put_page(page); break; } + if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) ) + { + /* + * page_set_tlbflush_timestamp() accesses the same union + * linear_pt_count lives in. Unvalidated page table pages, + * however, should occur during domain destruction only + * anyway. Updating of linear_pt_count luckily is not + * necessary anymore for a dying domain. + */ + ASSERT(page_get_owner(page)->is_dying); + ASSERT(page->linear_pt_count < 0); + ASSERT(ptpg->linear_pt_count > 0); + ptpg = NULL; + } + /* * Record TLB information for flush later. 
We do not stamp page * tables when running in shadow mode: @@ -2499,6 +2619,13 @@ static int __put_page_type(struct page_info *page, return -EINTR; } + if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) ) + { + ASSERT(!rc); + dec_linear_uses(page); + dec_linear_entries(ptpg); + } + return rc; } @@ -2638,6 +2765,7 @@ static int __get_page_type(struct page_info *page, unsigned long type, page->nr_validated_ptes = 0; page->partial_pte = 0; } + page->linear_pt_count = 0; rc = alloc_page_type(page, type, preemptible); } @@ -2652,7 +2780,7 @@ static int __get_page_type(struct page_info *page, unsigned long type, void put_page_type(struct page_info *page) { - int rc = __put_page_type(page, 0); + int rc = _put_page_type(page, false, NULL); ASSERT(rc == 0); (void)rc; } @@ -2668,7 +2796,7 @@ int get_page_type(struct page_info *page, unsigned long type) int put_page_type_preemptible(struct page_info *page) { - return __put_page_type(page, 1); + return _put_page_type(page, true, NULL); } int get_page_type_preemptible(struct page_info *page, unsigned long type) @@ -2878,11 +3006,14 @@ int put_old_guest_table(struct vcpu *v) if ( !v->arch.old_guest_table ) return 0; - switch ( rc = put_page_and_type_preemptible(v->arch.old_guest_table) ) + switch ( rc = _put_page_type(v->arch.old_guest_table, true, + v->arch.old_guest_ptpg) ) { case -EINTR: case -ERESTART: return -ERESTART; + case 0: + put_page(v->arch.old_guest_table); } v->arch.old_guest_table = NULL; @@ -3042,6 +3173,7 @@ int new_guest_cr3(unsigned long mfn) rc = -ERESTART; /* fallthrough */ case -ERESTART: + curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; break; default: @@ -3310,7 +3442,10 @@ long do_mmuext_op( if ( type == PGT_l1_page_table ) put_page_and_type(page); else + { + curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; + } } } @@ -3346,6 +3481,7 @@ long do_mmuext_op( { case -EINTR: case -ERESTART: + curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; rc 
= 0; break; @@ -3425,6 +3561,7 @@ long do_mmuext_op( rc = -ERESTART; /* fallthrough */ case -ERESTART: + curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; break; default: diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index 924caac834..5a512918cc 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -527,6 +527,8 @@ struct arch_vcpu pagetable_t guest_table_user; /* (MFN) x86/64 user-space pagetable */ pagetable_t guest_table; /* (MFN) guest notion of cr3 */ struct page_info *old_guest_table; /* partially destructed pagetable */ + struct page_info *old_guest_ptpg; /* containing page table of the */ + /* former, if any */ /* guest_table holds a ref to the page, and also a type-count unless * shadow refcounts are in use */ pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */ diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 119d7dec6b..445da50d47 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -124,11 +124,11 @@ struct page_info u32 tlbflush_timestamp; /* - * When PGT_partial is true then this field is valid and indicates - * that PTEs in the range [0, @nr_validated_ptes) have been validated. - * An extra page reference must be acquired (or not dropped) whenever - * PGT_partial gets set, and it must be dropped when the flag gets - * cleared. This is so that a get() leaving a page in partially + * When PGT_partial is true then the first two fields are valid and + * indicate that PTEs in the range [0, @nr_validated_ptes) have been + * validated. An extra page reference must be acquired (or not dropped) + * whenever PGT_partial gets set, and it must be dropped when the flag + * gets cleared. 
This is so that a get() leaving a page in partially * validated state (where the caller would drop the reference acquired * due to the getting of the type [apparently] failing [-ERESTART]) * would not accidentally result in a page left with zero general @@ -152,10 +152,18 @@ struct page_info * put_page_from_lNe() (due to the apparent failure), and hence it * must be dropped when the put operation is resumed (and completes), * but it must not be acquired if picking up the page for validation. + * + * The 3rd field, @linear_pt_count, indicates + * - by a positive value, how many same-level page table entries a page + * table has, + * - by a negative value, in how many same-level page tables a page is + * in use. */ struct { - u16 nr_validated_ptes; - s8 partial_pte; + u16 nr_validated_ptes:PAGETABLE_ORDER + 1; + u16 :16 - PAGETABLE_ORDER - 1 - 2; + s16 partial_pte:2; + s16 linear_pt_count; }; /* @@ -206,6 +214,9 @@ struct page_info #define PGT_count_width PG_shift(9) #define PGT_count_mask ((1UL<