From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Jan Beulich" Subject: [PATCH] x86: fix preemptable page type handling (v2) Date: Thu, 30 Oct 2008 12:27:46 +0000 Message-ID: <4909B652.76E4.0078.0@novell.com> Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: quoted-printable Return-path: Content-Disposition: inline List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xensource.com Errors-To: xen-devel-bounces@lists.xensource.com To: xen-devel@lists.xensource.com List-Id: xen-devel@lists.xenproject.org - retain a page reference when PGT_partial is set on a page (and drop it when clearing that flag) - don't drop a page reference never acquired when freeing the page type of a page where the allocation of the type got preempted (and never completed) - don't acquire a page reference when allocating the page type of a page where freeing the type got preempted (and never completed, and hence didn't drop the respective reference) Signed-off-by: Jan Beulich Index: 2008-10-27/xen/arch/x86/domain.c =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- 2008-10-27.orig/xen/arch/x86/domain.c 2008-10-29 17:10:41.0000000= 00 +0100 +++ 2008-10-27/xen/arch/x86/domain.c 2008-10-29 17:11:51.000000000 = +0100 @@ -1683,18 +1683,24 @@ static int relinquish_memory( break; case -EINTR: page->u.inuse.type_info |=3D PGT_validated; + if ( x & PGT_partial ) + put_page(page); put_page(page); ret =3D -EAGAIN; goto out; case -EAGAIN: page->u.inuse.type_info |=3D PGT_partial; - put_page(page); + if ( x & PGT_partial ) + put_page(page); goto out; default: BUG(); } if ( x & PGT_partial ) + { page->u.inuse.type_info--; + put_page(page); + } break; } } Index: 2008-10-27/xen/arch/x86/mm.c =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- 2008-10-27.orig/xen/arch/x86/mm.c 2008-10-28 14:53:16.000000000 = +0100 +++ 2008-10-27/xen/arch/x86/mm.c 2008-10-29 16:46:52.000000000 = +0100 @@ -566,19 +566,21 @@ static int get_page_from_pagenr(unsigned static int get_page_and_type_from_pagenr(unsigned long page_nr,=20 unsigned long type, struct domain *d, + int partial, int preemptible) { struct page_info *page =3D mfn_to_page(page_nr); int rc; =20 - if ( unlikely(!get_page_from_pagenr(page_nr, d)) ) + if ( likely(partial >=3D 0) && + unlikely(!get_page_from_pagenr(page_nr, d)) ) return -EINVAL; =20 rc =3D (preemptible ? get_page_type_preemptible(page, type) : (get_page_type(page, type) ? 0 : -EINVAL)); =20 - if ( rc ) + if ( unlikely(rc) && partial >=3D 0 ) put_page(page); =20 return rc; @@ -761,7 +763,7 @@ get_page_from_l2e( } =20 rc =3D get_page_and_type_from_pagenr( - l2e_get_pfn(l2e), PGT_l1_page_table, d, 0); + l2e_get_pfn(l2e), PGT_l1_page_table, d, 0, 0); if ( unlikely(rc =3D=3D -EINVAL) && get_l2_linear_pagetable(l2e, pfn, = d) ) rc =3D 0; =20 @@ -772,7 +774,7 @@ get_page_from_l2e( define_get_linear_pagetable(l3); static int get_page_from_l3e( - l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible= ) + l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, = int preemptible) { int rc; =20 @@ -786,7 +788,7 @@ get_page_from_l3e( } =20 rc =3D get_page_and_type_from_pagenr( - l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible); + l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible); if ( unlikely(rc =3D=3D -EINVAL) && get_l3_linear_pagetable(l3e, pfn, = d) ) rc =3D 0; =20 @@ -797,7 +799,7 @@ get_page_from_l3e( define_get_linear_pagetable(l4); static int get_page_from_l4e( - l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible= ) + l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, = int preemptible) { int rc; =20 @@ -811,7 +813,7 @@ get_page_from_l4e( } =20 rc =3D get_page_and_type_from_pagenr( - l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible); + l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible); if ( unlikely(rc =3D=3D -EINVAL) && get_l4_linear_pagetable(l4e, pfn, = d) ) rc =3D 0; =20 @@ -961,23 +963,32 @@ static int put_page_from_l2e(l2_pgentry_ return 1; } =20 +static int __put_page_type(struct page_info *, int preemptible); =20 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, - int preemptible) + int partial, int preemptible) { if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&=20 (l3e_get_pfn(l3e) !=3D pfn) ) + { + if ( unlikely(partial > 0) ) + return __put_page_type(l3e_get_page(l3e), preemptible); return put_page_and_type_preemptible(l3e_get_page(l3e), preemptibl= e); + } return 1; } =20 #if CONFIG_PAGING_LEVELS >=3D 4 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, - int preemptible) + int partial, int preemptible) { if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&=20 (l4e_get_pfn(l4e) !=3D pfn) ) + { + if ( unlikely(partial > 0) ) + return __put_page_type(l4e_get_page(l4e), preemptible); return put_page_and_type_preemptible(l4e_get_page(l4e), preemptibl= e); + } return 1; } #endif @@ -1184,7 +1195,7 @@ static int alloc_l3_table(struct page_in unsigned long pfn =3D page_to_mfn(page); l3_pgentry_t *pl3e; unsigned int i; - int rc =3D 0; + int rc =3D 0, partial =3D page->partial_pte; =20 #if CONFIG_PAGING_LEVELS =3D=3D 3 /* @@ -1213,7 +1224,8 @@ static int alloc_l3_table(struct page_in if ( is_pv_32on64_domain(d) ) memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e)); =20 - for ( i =3D page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ ) + for ( i =3D page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; + i++, partial =3D 0 ) { if ( is_pv_32bit_domain(d) && (i =3D=3D 3) ) { @@ -1224,16 +1236,17 @@ static int alloc_l3_table(struct page_in rc =3D get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),= PGT_l2_page_table | PGT_pae_xen_l2, - d, preemptible); + d, partial, preemptible= ); } else if ( !is_guest_l3_slot(i) || - (rc =3D get_page_from_l3e(pl3e[i], pfn, d, preemptible))= > 0 ) + (rc =3D get_page_from_l3e(pl3e[i], pfn, d, + partial, preemptible)) > 0 ) continue; =20 if ( rc =3D=3D -EAGAIN ) { page->nr_validated_ptes =3D i; - page->partial_pte =3D 1; + page->partial_pte =3D partial ?: 1; } else if ( rc =3D=3D -EINTR && i ) { @@ -1257,7 +1270,7 @@ static int alloc_l3_table(struct page_in if ( !is_guest_l3_slot(i) ) continue; unadjust_guest_l3e(pl3e[i], d); - put_page_from_l3e(pl3e[i], pfn, 0); + put_page_from_l3e(pl3e[i], pfn, 0, 0); } } =20 @@ -1272,18 +1285,20 @@ static int alloc_l4_table(struct page_in unsigned long pfn =3D page_to_mfn(page); l4_pgentry_t *pl4e =3D page_to_virt(page); unsigned int i; - int rc =3D 0; + int rc =3D 0, partial =3D page->partial_pte; =20 - for ( i =3D page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ ) + for ( i =3D page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; + i++, partial =3D 0 ) { if ( !is_guest_l4_slot(d, i) || - (rc =3D get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 = ) + (rc =3D get_page_from_l4e(pl4e[i], pfn, d, + partial, preemptible)) > 0 ) continue; =20 if ( rc =3D=3D -EAGAIN ) { page->nr_validated_ptes =3D i; - page->partial_pte =3D 1; + page->partial_pte =3D partial ?: 1; } else if ( rc =3D=3D -EINTR ) { @@ -1299,7 +1314,7 @@ static int alloc_l4_table(struct page_in MEM_LOG("Failure in alloc_l4_table: entry %d", i); while ( i-- > 0 ) if ( is_guest_l4_slot(d, i) ) - put_page_from_l4e(pl4e[i], pfn, 0); + put_page_from_l4e(pl4e[i], pfn, 0, 0); } if ( rc < 0 ) return rc; @@ -1377,19 +1392,20 @@ static int free_l3_table(struct page_inf struct domain *d =3D page_get_owner(page); unsigned long pfn =3D page_to_mfn(page); l3_pgentry_t *pl3e; - unsigned int i =3D page->nr_validated_ptes - !page->partial_pte; - int rc =3D 0; + int rc =3D 0, partial =3D page->partial_pte; + unsigned int i =3D page->nr_validated_ptes - !partial; =20 pl3e =3D map_domain_page(pfn); =20 do { if ( is_guest_l3_slot(i) ) { - rc =3D put_page_from_l3e(pl3e[i], pfn, preemptible); + rc =3D put_page_from_l3e(pl3e[i], pfn, partial, preemptible); + if ( rc < 0 ) + break; + partial =3D 0; if ( rc > 0 ) continue; - if ( rc ) - break; unadjust_guest_l3e(pl3e[i], d); } } while ( i-- ); @@ -1399,7 +1415,7 @@ static int free_l3_table(struct page_inf if ( rc =3D=3D -EAGAIN ) { page->nr_validated_ptes =3D i; - page->partial_pte =3D 1; + page->partial_pte =3D partial ?: -1; } else if ( rc =3D=3D -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) { @@ -1416,18 +1432,21 @@ static int free_l4_table(struct page_inf struct domain *d =3D page_get_owner(page); unsigned long pfn =3D page_to_mfn(page); l4_pgentry_t *pl4e =3D page_to_virt(page); - unsigned int i =3D page->nr_validated_ptes - !page->partial_pte; - int rc =3D 0; + int rc =3D 0, partial =3D page->partial_pte; + unsigned int i =3D page->nr_validated_ptes - !partial; =20 do { if ( is_guest_l4_slot(d, i) ) - rc =3D put_page_from_l4e(pl4e[i], pfn, preemptible); - } while ( rc >=3D 0 && i-- ); + rc =3D put_page_from_l4e(pl4e[i], pfn, partial, preemptible); + if ( rc < 0 ) + break; + partial =3D 0; + } while ( i-- ); =20 if ( rc =3D=3D -EAGAIN ) { page->nr_validated_ptes =3D i; - page->partial_pte =3D 1; + page->partial_pte =3D partial ?: -1; } else if ( rc =3D=3D -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) { @@ -1703,7 +1722,7 @@ static int mod_l3_entry(l3_pgentry_t *pl return rc ? 0 : -EFAULT; } =20 - rc =3D get_page_from_l3e(nl3e, pfn, d, preemptible); + rc =3D get_page_from_l3e(nl3e, pfn, d, 0, preemptible); if ( unlikely(rc < 0) ) return page_unlock(l3pg), rc; rc =3D 0; @@ -1732,7 +1751,7 @@ static int mod_l3_entry(l3_pgentry_t *pl } =20 page_unlock(l3pg); - put_page_from_l3e(ol3e, pfn, 0); + put_page_from_l3e(ol3e, pfn, 0, 0); return rc; } =20 @@ -1781,7 +1800,7 @@ static int mod_l4_entry(l4_pgentry_t *pl return rc ? 0 : -EFAULT; } =20 - rc =3D get_page_from_l4e(nl4e, pfn, d, preemptible); + rc =3D get_page_from_l4e(nl4e, pfn, d, 0, preemptible); if ( unlikely(rc < 0) ) return page_unlock(l4pg), rc; rc =3D 0; @@ -1802,7 +1821,7 @@ static int mod_l4_entry(l4_pgentry_t *pl } =20 page_unlock(l4pg); - put_page_from_l4e(ol4e, pfn, 0); + put_page_from_l4e(ol4e, pfn, 0, 0); return rc; } =20 @@ -1866,6 +1885,10 @@ static int alloc_page_type(struct page_i struct domain *owner =3D page_get_owner(page); int rc; =20 + /* Obtain an extra reference to retain if we set PGT_partial. */ + if ( preemptible && !get_page(page, owner) ) + return -EINVAL; + /* A page table is dirtied when its type count becomes non-zero. */ if ( likely(owner !=3D NULL) ) paging_mark_dirty(owner, page_to_mfn(page)); @@ -1900,8 +1923,13 @@ static int alloc_page_type(struct page_i if ( rc =3D=3D -EAGAIN ) { page->u.inuse.type_info |=3D PGT_partial; + return -EAGAIN; } - else if ( rc =3D=3D -EINTR ) + + if ( preemptible ) + put_page(page); + + if ( rc =3D=3D -EINTR ) { ASSERT((page->u.inuse.type_info & (PGT_count_mask|PGT_validated|PGT_partial)) =3D=3D 1); @@ -2020,11 +2048,16 @@ static int __put_page_type_final(struct=20 case -EAGAIN: wmb(); page->u.inuse.type_info |=3D PGT_partial; + /* Must skip put_page() below. */ + preemptible =3D 0; break; default: BUG(); } =20 + if ( preemptible ) + put_page(page); + return rc; } =20 @@ -2034,6 +2067,10 @@ static int __put_page_type(struct page_i { unsigned long nx, x, y =3D page->u.inuse.type_info; =20 + /* Obtain an extra reference to retain if we set PGT_partial. */ + if ( preemptible && !get_page(page, page_get_owner(page)) ) + return -EINVAL; + for ( ; ; ) { x =3D y; @@ -2055,6 +2092,8 @@ static int __put_page_type(struct page_i if ( unlikely((y =3D cmpxchg(&page->u.inuse.type_info, x, nx)) !=3D x) ) continue; + if ( x & PGT_partial ) + put_page(page); /* We cleared the 'valid bit' so we do the clean up. */ return __put_page_type_final(page, x, preemptible); } @@ -2075,9 +2114,16 @@ static int __put_page_type(struct page_i break; =20 if ( preemptible && hypercall_preempt_check() ) + { + if ( preemptible ) + put_page(page); return -EINTR; + } } =20 + if ( preemptible ) + put_page(page); + return 0; } =20 @@ -2181,7 +2227,11 @@ static int __get_page_type(struct page_i } =20 if ( likely((y =3D cmpxchg(&page->u.inuse.type_info, x, nx)) = =3D=3D x) ) + { + if ( (x & PGT_partial) && !(nx & PGT_partial) ) + put_page(page); break; + } =20 if ( preemptible && hypercall_preempt_check() ) return -EINTR; @@ -2290,7 +2340,7 @@ int new_guest_cr3(unsigned long mfn) #endif okay =3D paging_mode_refcounts(d) ? get_page_from_pagenr(mfn, d) - : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0); + : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, = 0); if ( unlikely(!okay) ) { MEM_LOG("Error while installing new baseptr %lx", mfn); @@ -2534,7 +2584,7 @@ int do_mmuext_op( if ( paging_mode_refcounts(FOREIGNDOM) ) break; =20 - rc =3D get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, = 1); + rc =3D get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, = 0, 1); okay =3D !rc; if ( unlikely(!okay) ) { @@ -2615,7 +2665,7 @@ int do_mmuext_op( okay =3D get_page_from_pagenr(mfn, d); else okay =3D !get_page_and_type_from_pagenr( - mfn, PGT_root_page_table, d, 0); + mfn, PGT_root_page_table, d, 0, 0); if ( unlikely(!okay) ) { MEM_LOG("Error while installing new mfn %lx", mfn); @@ -2722,7 +2772,7 @@ int do_mmuext_op( unsigned char *ptr; =20 okay =3D !get_page_and_type_from_pagenr(mfn, PGT_writable_page= , - FOREIGNDOM, 0); + FOREIGNDOM, 0, 0); if ( unlikely(!okay) ) { MEM_LOG("Error while clearing mfn %lx", mfn); @@ -2755,7 +2805,7 @@ int do_mmuext_op( } =20 okay =3D !get_page_and_type_from_pagenr(mfn, PGT_writable_page= , - FOREIGNDOM, 0); + FOREIGNDOM, 0, 0); if ( unlikely(!okay) ) { put_page(mfn_to_page(src_mfn)); Index: 2008-10-27/xen/include/asm-x86/mm.h =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- 2008-10-27.orig/xen/include/asm-x86/mm.h 2008-10-29 17:09:06.0000000= 00 +0100 +++ 2008-10-27/xen/include/asm-x86/mm.h 2008-10-30 13:25:44.000000000 = +0100 @@ -61,12 +61,36 @@ struct page_info /* * When PGT_partial is true then this field is valid and = indicates * that PTEs in the range [0, @nr_validated_ptes) have been = validated. - * If @partial_pte is true then PTE at @nr_validated_ptes+1 has = been - * partially validated. + * An extra page reference must be acquired (or not dropped) = whenever + * PGT_partial gets set, and it must be dropped when the flag = gets + * cleared. This is so that a get() leaving a page in partially + * validated state (where the caller would drop the reference = acquired + * due to the getting of the type [apparently] failing [-EAGAIN]) + * would not accidentally result in a page left with zero general + * reference count, but non-zero type reference count (possible = when + * the partial get() is followed immediately by domain destruction= ). + * Likewise, the ownership of the single type reference for = partially + * (in-)validated pages is tied to this flag, i.e. the instance + * setting the flag must not drop that reference, whereas the = instance + * clearing it will have to. + * + * If @partial_pte is positive then PTE at @nr_validated_ptes+1 = has + * been partially validated. This implies that the general = reference + * to the page (acquired from get_page_from_lNe()) would be = dropped + * (again due to the apparent failure) and hence must be = re-acquired + * when resuming the validation, but must not be dropped when = picking + * up the page for invalidation. + * + * If @partial_pte is negative then PTE at @nr_validated_ptes+1 = has + * been partially invalidated. This is basically the opposite = case of + * above, i.e. the general reference to the page was not dropped = in + * put_page_from_lNe() (due to the apparent failure), and hence = it + * must be dropped when the put operation is resumed (and = completes), + * but it must not be acquired if picking up the page for = validation. */ struct { u16 nr_validated_ptes; - bool_t partial_pte; + s8 partial_pte; }; =20 /*