* [PATCH] x86: fix preemptable page type handling (v2)
@ 2008-10-30 12:27 Jan Beulich
2008-10-30 14:46 ` Keir Fraser
0 siblings, 1 reply; 8+ messages in thread
From: Jan Beulich @ 2008-10-30 12:27 UTC (permalink / raw)
To: xen-devel
- retain a page reference when PGT_partial is set on a page (and drop
it when clearing that flag)
- don't drop a page reference never acquired when freeing the page type
of a page where the allocation of the type got preempted (and never
completed)
- don't acquire a page reference when allocating the page type of a
page where freeing the type got preempted (and never completed, and
hence didn't drop the respective reference)
Signed-off-by: Jan Beulich <jbeulich@novell.com>
Index: 2008-10-27/xen/arch/x86/domain.c
===================================================================
--- 2008-10-27.orig/xen/arch/x86/domain.c 2008-10-29 17:10:41.000000000 +0100
+++ 2008-10-27/xen/arch/x86/domain.c 2008-10-29 17:11:51.000000000 +0100
@@ -1683,18 +1683,24 @@ static int relinquish_memory(
break;
case -EINTR:
page->u.inuse.type_info |= PGT_validated;
+ if ( x & PGT_partial )
+ put_page(page);
put_page(page);
ret = -EAGAIN;
goto out;
case -EAGAIN:
page->u.inuse.type_info |= PGT_partial;
- put_page(page);
+ if ( x & PGT_partial )
+ put_page(page);
goto out;
default:
BUG();
}
if ( x & PGT_partial )
+ {
page->u.inuse.type_info--;
+ put_page(page);
+ }
break;
}
}
Index: 2008-10-27/xen/arch/x86/mm.c
===================================================================
--- 2008-10-27.orig/xen/arch/x86/mm.c 2008-10-28 14:53:16.000000000 +0100
+++ 2008-10-27/xen/arch/x86/mm.c 2008-10-29 16:46:52.000000000 +0100
@@ -566,19 +566,21 @@ static int get_page_from_pagenr(unsigned
static int get_page_and_type_from_pagenr(unsigned long page_nr,
unsigned long type,
struct domain *d,
+ int partial,
int preemptible)
{
struct page_info *page = mfn_to_page(page_nr);
int rc;
- if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
+ if ( likely(partial >= 0) &&
+ unlikely(!get_page_from_pagenr(page_nr, d)) )
return -EINVAL;
rc = (preemptible ?
get_page_type_preemptible(page, type) :
(get_page_type(page, type) ? 0 : -EINVAL));
- if ( rc )
+ if ( unlikely(rc) && partial >= 0 )
put_page(page);
return rc;
@@ -761,7 +763,7 @@ get_page_from_l2e(
}
rc = get_page_and_type_from_pagenr(
- l2e_get_pfn(l2e), PGT_l1_page_table, d, 0);
+ l2e_get_pfn(l2e), PGT_l1_page_table, d, 0, 0);
if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
rc = 0;
@@ -772,7 +774,7 @@ get_page_from_l2e(
define_get_linear_pagetable(l3);
static int
get_page_from_l3e(
- l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible)
+ l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
{
int rc;
@@ -786,7 +788,7 @@ get_page_from_l3e(
}
rc = get_page_and_type_from_pagenr(
- l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible);
+ l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
rc = 0;
@@ -797,7 +799,7 @@ get_page_from_l3e(
define_get_linear_pagetable(l4);
static int
get_page_from_l4e(
- l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible)
+ l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
{
int rc;
@@ -811,7 +813,7 @@ get_page_from_l4e(
}
rc = get_page_and_type_from_pagenr(
- l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible);
+ l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
rc = 0;
@@ -961,23 +963,32 @@ static int put_page_from_l2e(l2_pgentry_
return 1;
}
+static int __put_page_type(struct page_info *, int preemptible);
static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
- int preemptible)
+ int partial, int preemptible)
{
if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
(l3e_get_pfn(l3e) != pfn) )
+ {
+ if ( unlikely(partial > 0) )
+ return __put_page_type(l3e_get_page(l3e), preemptible);
return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
+ }
return 1;
}
#if CONFIG_PAGING_LEVELS >= 4
static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
- int preemptible)
+ int partial, int preemptible)
{
if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
(l4e_get_pfn(l4e) != pfn) )
+ {
+ if ( unlikely(partial > 0) )
+ return __put_page_type(l4e_get_page(l4e), preemptible);
return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
+ }
return 1;
}
#endif
@@ -1184,7 +1195,7 @@ static int alloc_l3_table(struct page_in
unsigned long pfn = page_to_mfn(page);
l3_pgentry_t *pl3e;
unsigned int i;
- int rc = 0;
+ int rc = 0, partial = page->partial_pte;
#if CONFIG_PAGING_LEVELS == 3
/*
@@ -1213,7 +1224,8 @@ static int alloc_l3_table(struct page_in
if ( is_pv_32on64_domain(d) )
memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
- for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ )
+ for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
+ i++, partial = 0 )
{
if ( is_pv_32bit_domain(d) && (i == 3) )
{
@@ -1224,16 +1236,17 @@ static int alloc_l3_table(struct page_in
rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
PGT_l2_page_table |
PGT_pae_xen_l2,
- d, preemptible);
+ d, partial, preemptible);
}
else if ( !is_guest_l3_slot(i) ||
- (rc = get_page_from_l3e(pl3e[i], pfn, d, preemptible)) > 0 )
+ (rc = get_page_from_l3e(pl3e[i], pfn, d,
+ partial, preemptible)) > 0 )
continue;
if ( rc == -EAGAIN )
{
page->nr_validated_ptes = i;
- page->partial_pte = 1;
+ page->partial_pte = partial ?: 1;
}
else if ( rc == -EINTR && i )
{
@@ -1257,7 +1270,7 @@ static int alloc_l3_table(struct page_in
if ( !is_guest_l3_slot(i) )
continue;
unadjust_guest_l3e(pl3e[i], d);
- put_page_from_l3e(pl3e[i], pfn, 0);
+ put_page_from_l3e(pl3e[i], pfn, 0, 0);
}
}
@@ -1272,18 +1285,20 @@ static int alloc_l4_table(struct page_in
unsigned long pfn = page_to_mfn(page);
l4_pgentry_t *pl4e = page_to_virt(page);
unsigned int i;
- int rc = 0;
+ int rc = 0, partial = page->partial_pte;
- for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ )
+ for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
+ i++, partial = 0 )
{
if ( !is_guest_l4_slot(d, i) ||
- (rc = get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 )
+ (rc = get_page_from_l4e(pl4e[i], pfn, d,
+ partial, preemptible)) > 0 )
continue;
if ( rc == -EAGAIN )
{
page->nr_validated_ptes = i;
- page->partial_pte = 1;
+ page->partial_pte = partial ?: 1;
}
else if ( rc == -EINTR )
{
@@ -1299,7 +1314,7 @@ static int alloc_l4_table(struct page_in
MEM_LOG("Failure in alloc_l4_table: entry %d", i);
while ( i-- > 0 )
if ( is_guest_l4_slot(d, i) )
- put_page_from_l4e(pl4e[i], pfn, 0);
+ put_page_from_l4e(pl4e[i], pfn, 0, 0);
}
if ( rc < 0 )
return rc;
@@ -1377,19 +1392,20 @@ static int free_l3_table(struct page_inf
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
l3_pgentry_t *pl3e;
- unsigned int i = page->nr_validated_ptes - !page->partial_pte;
- int rc = 0;
+ int rc = 0, partial = page->partial_pte;
+ unsigned int i = page->nr_validated_ptes - !partial;
pl3e = map_domain_page(pfn);
do {
if ( is_guest_l3_slot(i) )
{
- rc = put_page_from_l3e(pl3e[i], pfn, preemptible);
+ rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
+ if ( rc < 0 )
+ break;
+ partial = 0;
if ( rc > 0 )
continue;
- if ( rc )
- break;
unadjust_guest_l3e(pl3e[i], d);
}
} while ( i-- );
@@ -1399,7 +1415,7 @@ static int free_l3_table(struct page_inf
if ( rc == -EAGAIN )
{
page->nr_validated_ptes = i;
- page->partial_pte = 1;
+ page->partial_pte = partial ?: -1;
}
else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
{
@@ -1416,18 +1432,21 @@ static int free_l4_table(struct page_inf
struct domain *d = page_get_owner(page);
unsigned long pfn = page_to_mfn(page);
l4_pgentry_t *pl4e = page_to_virt(page);
- unsigned int i = page->nr_validated_ptes - !page->partial_pte;
- int rc = 0;
+ int rc = 0, partial = page->partial_pte;
+ unsigned int i = page->nr_validated_ptes - !partial;
do {
if ( is_guest_l4_slot(d, i) )
- rc = put_page_from_l4e(pl4e[i], pfn, preemptible);
- } while ( rc >= 0 && i-- );
+ rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
+ if ( rc < 0 )
+ break;
+ partial = 0;
+ } while ( i-- );
if ( rc == -EAGAIN )
{
page->nr_validated_ptes = i;
- page->partial_pte = 1;
+ page->partial_pte = partial ?: -1;
}
else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
{
@@ -1703,7 +1722,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
return rc ? 0 : -EFAULT;
}
- rc = get_page_from_l3e(nl3e, pfn, d, preemptible);
+ rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
if ( unlikely(rc < 0) )
return page_unlock(l3pg), rc;
rc = 0;
@@ -1732,7 +1751,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
}
page_unlock(l3pg);
- put_page_from_l3e(ol3e, pfn, 0);
+ put_page_from_l3e(ol3e, pfn, 0, 0);
return rc;
}
@@ -1781,7 +1800,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
return rc ? 0 : -EFAULT;
}
- rc = get_page_from_l4e(nl4e, pfn, d, preemptible);
+ rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
if ( unlikely(rc < 0) )
return page_unlock(l4pg), rc;
rc = 0;
@@ -1802,7 +1821,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
}
page_unlock(l4pg);
- put_page_from_l4e(ol4e, pfn, 0);
+ put_page_from_l4e(ol4e, pfn, 0, 0);
return rc;
}
@@ -1866,6 +1885,10 @@ static int alloc_page_type(struct page_i
struct domain *owner = page_get_owner(page);
int rc;
+ /* Obtain an extra reference to retain if we set PGT_partial. */
+ if ( preemptible && !get_page(page, owner) )
+ return -EINVAL;
+
/* A page table is dirtied when its type count becomes non-zero. */
if ( likely(owner != NULL) )
paging_mark_dirty(owner, page_to_mfn(page));
@@ -1900,8 +1923,13 @@ static int alloc_page_type(struct page_i
if ( rc == -EAGAIN )
{
page->u.inuse.type_info |= PGT_partial;
+ return -EAGAIN;
}
- else if ( rc == -EINTR )
+
+ if ( preemptible )
+ put_page(page);
+
+ if ( rc == -EINTR )
{
ASSERT((page->u.inuse.type_info &
(PGT_count_mask|PGT_validated|PGT_partial)) == 1);
@@ -2020,11 +2048,16 @@ static int __put_page_type_final(struct
case -EAGAIN:
wmb();
page->u.inuse.type_info |= PGT_partial;
+ /* Must skip put_page() below. */
+ preemptible = 0;
break;
default:
BUG();
}
+ if ( preemptible )
+ put_page(page);
+
return rc;
}
@@ -2034,6 +2067,10 @@ static int __put_page_type(struct page_i
{
unsigned long nx, x, y = page->u.inuse.type_info;
+ /* Obtain an extra reference to retain if we set PGT_partial. */
+ if ( preemptible && !get_page(page, page_get_owner(page)) )
+ return -EINVAL;
+
for ( ; ; )
{
x = y;
@@ -2055,6 +2092,8 @@ static int __put_page_type(struct page_i
if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
x, nx)) != x) )
continue;
+ if ( x & PGT_partial )
+ put_page(page);
/* We cleared the 'valid bit' so we do the clean up. */
return __put_page_type_final(page, x, preemptible);
}
@@ -2075,9 +2114,16 @@ static int __put_page_type(struct page_i
break;
if ( preemptible && hypercall_preempt_check() )
+ {
+ if ( preemptible )
+ put_page(page);
return -EINTR;
+ }
}
+ if ( preemptible )
+ put_page(page);
+
return 0;
}
@@ -2181,7 +2227,11 @@ static int __get_page_type(struct page_i
}
if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
+ {
+ if ( (x & PGT_partial) && !(nx & PGT_partial) )
+ put_page(page);
break;
+ }
if ( preemptible && hypercall_preempt_check() )
return -EINTR;
@@ -2290,7 +2340,7 @@ int new_guest_cr3(unsigned long mfn)
#endif
okay = paging_mode_refcounts(d)
? get_page_from_pagenr(mfn, d)
- : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0);
+ : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
if ( unlikely(!okay) )
{
MEM_LOG("Error while installing new baseptr %lx", mfn);
@@ -2534,7 +2584,7 @@ int do_mmuext_op(
if ( paging_mode_refcounts(FOREIGNDOM) )
break;
- rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1);
+ rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 0, 1);
okay = !rc;
if ( unlikely(!okay) )
{
@@ -2615,7 +2665,7 @@ int do_mmuext_op(
okay = get_page_from_pagenr(mfn, d);
else
okay = !get_page_and_type_from_pagenr(
- mfn, PGT_root_page_table, d, 0);
+ mfn, PGT_root_page_table, d, 0, 0);
if ( unlikely(!okay) )
{
MEM_LOG("Error while installing new mfn %lx", mfn);
@@ -2722,7 +2772,7 @@ int do_mmuext_op(
unsigned char *ptr;
okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
- FOREIGNDOM, 0);
+ FOREIGNDOM, 0, 0);
if ( unlikely(!okay) )
{
MEM_LOG("Error while clearing mfn %lx", mfn);
@@ -2755,7 +2805,7 @@ int do_mmuext_op(
}
okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
- FOREIGNDOM, 0);
+ FOREIGNDOM, 0, 0);
if ( unlikely(!okay) )
{
put_page(mfn_to_page(src_mfn));
Index: 2008-10-27/xen/include/asm-x86/mm.h
===================================================================
--- 2008-10-27.orig/xen/include/asm-x86/mm.h 2008-10-29 17:09:06.000000000 +0100
+++ 2008-10-27/xen/include/asm-x86/mm.h 2008-10-30 13:25:44.000000000 +0100
@@ -61,12 +61,36 @@ struct page_info
/*
* When PGT_partial is true then this field is valid and indicates
* that PTEs in the range [0, @nr_validated_ptes) have been validated.
- * If @partial_pte is true then PTE at @nr_validated_ptes+1 has been
- * partially validated.
+ * An extra page reference must be acquired (or not dropped) whenever
+ * PGT_partial gets set, and it must be dropped when the flag gets
+ * cleared. This is so that a get() leaving a page in partially
+ * validated state (where the caller would drop the reference acquired
+ * due to the getting of the type [apparently] failing [-EAGAIN])
+ * would not accidentally result in a page left with zero general
+ * reference count, but non-zero type reference count (possible when
+ * the partial get() is followed immediately by domain destruction).
+ * Likewise, the ownership of the single type reference for partially
+ * (in-)validated pages is tied to this flag, i.e. the instance
+ * setting the flag must not drop that reference, whereas the instance
+ * clearing it will have to.
+ *
+ * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has
+ * been partially validated. This implies that the general reference
+ * to the page (acquired from get_page_from_lNe()) would be dropped
+ * (again due to the apparent failure) and hence must be re-acquired
+ * when resuming the validation, but must not be dropped when picking
+ * up the page for invalidation.
+ *
+ * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has
+ * been partially invalidated. This is basically the opposite case of
+ * above, i.e. the general reference to the page was not dropped in
+ * put_page_from_lNe() (due to the apparent failure), and hence it
+ * must be dropped when the put operation is resumed (and completes),
+ * but it must not be acquired if picking up the page for validation.
*/
struct {
u16 nr_validated_ptes;
- bool_t partial_pte;
+ s8 partial_pte;
};
/*
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] x86: fix preemptable page type handling (v2)
2008-10-30 12:27 [PATCH] x86: fix preemptable page type handling (v2) Jan Beulich
@ 2008-10-30 14:46 ` Keir Fraser
2008-10-30 14:56 ` [PATCH] x86: fix preemptable page type handling(v2) Jan Beulich
0 siblings, 1 reply; 8+ messages in thread
From: Keir Fraser @ 2008-10-30 14:46 UTC (permalink / raw)
To: Jan Beulich, xen-devel
Why does this patch not do what the new comment says and directly get_page()
when PGT_partial is set and put_page() when PGT_partial is cleared? Doing
get_page() across all __put_page() operations and then skipping the
put_page() in some cases, for example, seems inefficient and also makes the
code less clear. Why would you do it that way?
-- Keir
On 30/10/08 12:27, "Jan Beulich" <jbeulich@novell.com> wrote:
> - retain a page reference when PGT_partial is set on a page (and drop
> it when clearing that flag)
> - don't drop a page reference never acquired when freeing the page type
> of a page where the allocation of the type got preempted (and never
> completed)
> - don't acquire a page reference when allocating the page type of a
> page where freeing the type got preempted (and never completed, and
> hence didn't drop the respective reference)
>
> Signed-off-by: Jan Beulich <jbeulich@novell.com>
>
> Index: 2008-10-27/xen/arch/x86/domain.c
> ===================================================================
> --- 2008-10-27.orig/xen/arch/x86/domain.c 2008-10-29 17:10:41.000000000 +0100
> +++ 2008-10-27/xen/arch/x86/domain.c 2008-10-29 17:11:51.000000000 +0100
> @@ -1683,18 +1683,24 @@ static int relinquish_memory(
> break;
> case -EINTR:
> page->u.inuse.type_info |= PGT_validated;
> + if ( x & PGT_partial )
> + put_page(page);
> put_page(page);
> ret = -EAGAIN;
> goto out;
> case -EAGAIN:
> page->u.inuse.type_info |= PGT_partial;
> - put_page(page);
> + if ( x & PGT_partial )
> + put_page(page);
> goto out;
> default:
> BUG();
> }
> if ( x & PGT_partial )
> + {
> page->u.inuse.type_info--;
> + put_page(page);
> + }
> break;
> }
> }
> Index: 2008-10-27/xen/arch/x86/mm.c
> ===================================================================
> --- 2008-10-27.orig/xen/arch/x86/mm.c 2008-10-28 14:53:16.000000000 +0100
> +++ 2008-10-27/xen/arch/x86/mm.c 2008-10-29 16:46:52.000000000 +0100
> @@ -566,19 +566,21 @@ static int get_page_from_pagenr(unsigned
> static int get_page_and_type_from_pagenr(unsigned long page_nr,
> unsigned long type,
> struct domain *d,
> + int partial,
> int preemptible)
> {
> struct page_info *page = mfn_to_page(page_nr);
> int rc;
>
> - if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
> + if ( likely(partial >= 0) &&
> + unlikely(!get_page_from_pagenr(page_nr, d)) )
> return -EINVAL;
>
> rc = (preemptible ?
> get_page_type_preemptible(page, type) :
> (get_page_type(page, type) ? 0 : -EINVAL));
>
> - if ( rc )
> + if ( unlikely(rc) && partial >= 0 )
> put_page(page);
>
> return rc;
> @@ -761,7 +763,7 @@ get_page_from_l2e(
> }
>
> rc = get_page_and_type_from_pagenr(
> - l2e_get_pfn(l2e), PGT_l1_page_table, d, 0);
> + l2e_get_pfn(l2e), PGT_l1_page_table, d, 0, 0);
> if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
> rc = 0;
>
> @@ -772,7 +774,7 @@ get_page_from_l2e(
> define_get_linear_pagetable(l3);
> static int
> get_page_from_l3e(
> - l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible)
> + l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int
> preemptible)
> {
> int rc;
>
> @@ -786,7 +788,7 @@ get_page_from_l3e(
> }
>
> rc = get_page_and_type_from_pagenr(
> - l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible);
> + l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
> if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
> rc = 0;
>
> @@ -797,7 +799,7 @@ get_page_from_l3e(
> define_get_linear_pagetable(l4);
> static int
> get_page_from_l4e(
> - l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible)
> + l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int
> preemptible)
> {
> int rc;
>
> @@ -811,7 +813,7 @@ get_page_from_l4e(
> }
>
> rc = get_page_and_type_from_pagenr(
> - l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible);
> + l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
> if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
> rc = 0;
>
> @@ -961,23 +963,32 @@ static int put_page_from_l2e(l2_pgentry_
> return 1;
> }
>
> +static int __put_page_type(struct page_info *, int preemptible);
>
> static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
> - int preemptible)
> + int partial, int preemptible)
> {
> if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) &&
> (l3e_get_pfn(l3e) != pfn) )
> + {
> + if ( unlikely(partial > 0) )
> + return __put_page_type(l3e_get_page(l3e), preemptible);
> return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
> + }
> return 1;
> }
>
> #if CONFIG_PAGING_LEVELS >= 4
> static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
> - int preemptible)
> + int partial, int preemptible)
> {
> if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
> (l4e_get_pfn(l4e) != pfn) )
> + {
> + if ( unlikely(partial > 0) )
> + return __put_page_type(l4e_get_page(l4e), preemptible);
> return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
> + }
> return 1;
> }
> #endif
> @@ -1184,7 +1195,7 @@ static int alloc_l3_table(struct page_in
> unsigned long pfn = page_to_mfn(page);
> l3_pgentry_t *pl3e;
> unsigned int i;
> - int rc = 0;
> + int rc = 0, partial = page->partial_pte;
>
> #if CONFIG_PAGING_LEVELS == 3
> /*
> @@ -1213,7 +1224,8 @@ static int alloc_l3_table(struct page_in
> if ( is_pv_32on64_domain(d) )
> memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
>
> - for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ )
> + for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
> + i++, partial = 0 )
> {
> if ( is_pv_32bit_domain(d) && (i == 3) )
> {
> @@ -1224,16 +1236,17 @@ static int alloc_l3_table(struct page_in
> rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
> PGT_l2_page_table |
> PGT_pae_xen_l2,
> - d, preemptible);
> + d, partial, preemptible);
> }
> else if ( !is_guest_l3_slot(i) ||
> - (rc = get_page_from_l3e(pl3e[i], pfn, d, preemptible)) > 0
> )
> + (rc = get_page_from_l3e(pl3e[i], pfn, d,
> + partial, preemptible)) > 0 )
> continue;
>
> if ( rc == -EAGAIN )
> {
> page->nr_validated_ptes = i;
> - page->partial_pte = 1;
> + page->partial_pte = partial ?: 1;
> }
> else if ( rc == -EINTR && i )
> {
> @@ -1257,7 +1270,7 @@ static int alloc_l3_table(struct page_in
> if ( !is_guest_l3_slot(i) )
> continue;
> unadjust_guest_l3e(pl3e[i], d);
> - put_page_from_l3e(pl3e[i], pfn, 0);
> + put_page_from_l3e(pl3e[i], pfn, 0, 0);
> }
> }
>
> @@ -1272,18 +1285,20 @@ static int alloc_l4_table(struct page_in
> unsigned long pfn = page_to_mfn(page);
> l4_pgentry_t *pl4e = page_to_virt(page);
> unsigned int i;
> - int rc = 0;
> + int rc = 0, partial = page->partial_pte;
>
> - for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ )
> + for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
> + i++, partial = 0 )
> {
> if ( !is_guest_l4_slot(d, i) ||
> - (rc = get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 )
> + (rc = get_page_from_l4e(pl4e[i], pfn, d,
> + partial, preemptible)) > 0 )
> continue;
>
> if ( rc == -EAGAIN )
> {
> page->nr_validated_ptes = i;
> - page->partial_pte = 1;
> + page->partial_pte = partial ?: 1;
> }
> else if ( rc == -EINTR )
> {
> @@ -1299,7 +1314,7 @@ static int alloc_l4_table(struct page_in
> MEM_LOG("Failure in alloc_l4_table: entry %d", i);
> while ( i-- > 0 )
> if ( is_guest_l4_slot(d, i) )
> - put_page_from_l4e(pl4e[i], pfn, 0);
> + put_page_from_l4e(pl4e[i], pfn, 0, 0);
> }
> if ( rc < 0 )
> return rc;
> @@ -1377,19 +1392,20 @@ static int free_l3_table(struct page_inf
> struct domain *d = page_get_owner(page);
> unsigned long pfn = page_to_mfn(page);
> l3_pgentry_t *pl3e;
> - unsigned int i = page->nr_validated_ptes - !page->partial_pte;
> - int rc = 0;
> + int rc = 0, partial = page->partial_pte;
> + unsigned int i = page->nr_validated_ptes - !partial;
>
> pl3e = map_domain_page(pfn);
>
> do {
> if ( is_guest_l3_slot(i) )
> {
> - rc = put_page_from_l3e(pl3e[i], pfn, preemptible);
> + rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
> + if ( rc < 0 )
> + break;
> + partial = 0;
> if ( rc > 0 )
> continue;
> - if ( rc )
> - break;
> unadjust_guest_l3e(pl3e[i], d);
> }
> } while ( i-- );
> @@ -1399,7 +1415,7 @@ static int free_l3_table(struct page_inf
> if ( rc == -EAGAIN )
> {
> page->nr_validated_ptes = i;
> - page->partial_pte = 1;
> + page->partial_pte = partial ?: -1;
> }
> else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
> {
> @@ -1416,18 +1432,21 @@ static int free_l4_table(struct page_inf
> struct domain *d = page_get_owner(page);
> unsigned long pfn = page_to_mfn(page);
> l4_pgentry_t *pl4e = page_to_virt(page);
> - unsigned int i = page->nr_validated_ptes - !page->partial_pte;
> - int rc = 0;
> + int rc = 0, partial = page->partial_pte;
> + unsigned int i = page->nr_validated_ptes - !partial;
>
> do {
> if ( is_guest_l4_slot(d, i) )
> - rc = put_page_from_l4e(pl4e[i], pfn, preemptible);
> - } while ( rc >= 0 && i-- );
> + rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
> + if ( rc < 0 )
> + break;
> + partial = 0;
> + } while ( i-- );
>
> if ( rc == -EAGAIN )
> {
> page->nr_validated_ptes = i;
> - page->partial_pte = 1;
> + page->partial_pte = partial ?: -1;
> }
> else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
> {
> @@ -1703,7 +1722,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
> return rc ? 0 : -EFAULT;
> }
>
> - rc = get_page_from_l3e(nl3e, pfn, d, preemptible);
> + rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
> if ( unlikely(rc < 0) )
> return page_unlock(l3pg), rc;
> rc = 0;
> @@ -1732,7 +1751,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
> }
>
> page_unlock(l3pg);
> - put_page_from_l3e(ol3e, pfn, 0);
> + put_page_from_l3e(ol3e, pfn, 0, 0);
> return rc;
> }
>
> @@ -1781,7 +1800,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
> return rc ? 0 : -EFAULT;
> }
>
> - rc = get_page_from_l4e(nl4e, pfn, d, preemptible);
> + rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
> if ( unlikely(rc < 0) )
> return page_unlock(l4pg), rc;
> rc = 0;
> @@ -1802,7 +1821,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
> }
>
> page_unlock(l4pg);
> - put_page_from_l4e(ol4e, pfn, 0);
> + put_page_from_l4e(ol4e, pfn, 0, 0);
> return rc;
> }
>
> @@ -1866,6 +1885,10 @@ static int alloc_page_type(struct page_i
> struct domain *owner = page_get_owner(page);
> int rc;
>
> + /* Obtain an extra reference to retain if we set PGT_partial. */
> + if ( preemptible && !get_page(page, owner) )
> + return -EINVAL;
> +
> /* A page table is dirtied when its type count becomes non-zero. */
> if ( likely(owner != NULL) )
> paging_mark_dirty(owner, page_to_mfn(page));
> @@ -1900,8 +1923,13 @@ static int alloc_page_type(struct page_i
> if ( rc == -EAGAIN )
> {
> page->u.inuse.type_info |= PGT_partial;
> + return -EAGAIN;
> }
> - else if ( rc == -EINTR )
> +
> + if ( preemptible )
> + put_page(page);
> +
> + if ( rc == -EINTR )
> {
> ASSERT((page->u.inuse.type_info &
> (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
> @@ -2020,11 +2048,16 @@ static int __put_page_type_final(struct
> case -EAGAIN:
> wmb();
> page->u.inuse.type_info |= PGT_partial;
> + /* Must skip put_page() below. */
> + preemptible = 0;
> break;
> default:
> BUG();
> }
>
> + if ( preemptible )
> + put_page(page);
> +
> return rc;
> }
>
> @@ -2034,6 +2067,10 @@ static int __put_page_type(struct page_i
> {
> unsigned long nx, x, y = page->u.inuse.type_info;
>
> + /* Obtain an extra reference to retain if we set PGT_partial. */
> + if ( preemptible && !get_page(page, page_get_owner(page)) )
> + return -EINVAL;
> +
> for ( ; ; )
> {
> x = y;
> @@ -2055,6 +2092,8 @@ static int __put_page_type(struct page_i
> if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
> x, nx)) != x) )
> continue;
> + if ( x & PGT_partial )
> + put_page(page);
> /* We cleared the 'valid bit' so we do the clean up. */
> return __put_page_type_final(page, x, preemptible);
> }
> @@ -2075,9 +2114,16 @@ static int __put_page_type(struct page_i
> break;
>
> if ( preemptible && hypercall_preempt_check() )
> + {
> + if ( preemptible )
> + put_page(page);
> return -EINTR;
> + }
> }
>
> + if ( preemptible )
> + put_page(page);
> +
> return 0;
> }
>
> @@ -2181,7 +2227,11 @@ static int __get_page_type(struct page_i
> }
>
> if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
> + {
> + if ( (x & PGT_partial) && !(nx & PGT_partial) )
> + put_page(page);
> break;
> + }
>
> if ( preemptible && hypercall_preempt_check() )
> return -EINTR;
> @@ -2290,7 +2340,7 @@ int new_guest_cr3(unsigned long mfn)
> #endif
> okay = paging_mode_refcounts(d)
> ? get_page_from_pagenr(mfn, d)
> - : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0);
> + : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
> if ( unlikely(!okay) )
> {
> MEM_LOG("Error while installing new baseptr %lx", mfn);
> @@ -2534,7 +2584,7 @@ int do_mmuext_op(
> if ( paging_mode_refcounts(FOREIGNDOM) )
> break;
>
> - rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1);
> + rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 0, 1);
> okay = !rc;
> if ( unlikely(!okay) )
> {
> @@ -2615,7 +2665,7 @@ int do_mmuext_op(
> okay = get_page_from_pagenr(mfn, d);
> else
> okay = !get_page_and_type_from_pagenr(
> - mfn, PGT_root_page_table, d, 0);
> + mfn, PGT_root_page_table, d, 0, 0);
> if ( unlikely(!okay) )
> {
> MEM_LOG("Error while installing new mfn %lx", mfn);
> @@ -2722,7 +2772,7 @@ int do_mmuext_op(
> unsigned char *ptr;
>
> okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
> - FOREIGNDOM, 0);
> + FOREIGNDOM, 0, 0);
> if ( unlikely(!okay) )
> {
> MEM_LOG("Error while clearing mfn %lx", mfn);
> @@ -2755,7 +2805,7 @@ int do_mmuext_op(
> }
>
> okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
> - FOREIGNDOM, 0);
> + FOREIGNDOM, 0, 0);
> if ( unlikely(!okay) )
> {
> put_page(mfn_to_page(src_mfn));
> Index: 2008-10-27/xen/include/asm-x86/mm.h
> ===================================================================
> --- 2008-10-27.orig/xen/include/asm-x86/mm.h 2008-10-29 17:09:06.000000000
> +0100
> +++ 2008-10-27/xen/include/asm-x86/mm.h 2008-10-30 13:25:44.000000000 +0100
> @@ -61,12 +61,36 @@ struct page_info
> /*
> * When PGT_partial is true then this field is valid and indicates
> * that PTEs in the range [0, @nr_validated_ptes) have been
> validated.
> - * If @partial_pte is true then PTE at @nr_validated_ptes+1 has been
> - * partially validated.
> + * An extra page reference must be acquired (or not dropped) whenever
> + * PGT_partial gets set, and it must be dropped when the flag gets
> + * cleared. This is so that a get() leaving a page in partially
> + * validated state (where the caller would drop the reference
> acquired
> + * due to the getting of the type [apparently] failing [-EAGAIN])
> + * would not accidentally result in a page left with zero general
> + * reference count, but non-zero type reference count (possible when
> + * the partial get() is followed immediately by domain destruction).
> + * Likewise, the ownership of the single type reference for partially
> + * (in-)validated pages is tied to this flag, i.e. the instance
> + * setting the flag must not drop that reference, whereas the
> instance
> + * clearing it will have to.
> + *
> + * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has
> + * been partially validated. This implies that the general reference
> + * to the page (acquired from get_page_from_lNe()) would be dropped
> + * (again due to the apparent failure) and hence must be re-acquired
> + * when resuming the validation, but must not be dropped when picking
> + * up the page for invalidation.
> + *
> + * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has
> + * been partially invalidated. This is basically the opposite case of
> + * above, i.e. the general reference to the page was not dropped in
> + * put_page_from_lNe() (due to the apparent failure), and hence it
> + * must be dropped when the put operation is resumed (and completes),
> + * but it must not be acquired if picking up the page for validation.
> */
> struct {
> u16 nr_validated_ptes;
> - bool_t partial_pte;
> + s8 partial_pte;
> };
>
> /*
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] x86: fix preemptable page type handling(v2)
2008-10-30 14:46 ` Keir Fraser
@ 2008-10-30 14:56 ` Jan Beulich
2008-10-30 15:00 ` Keir Fraser
0 siblings, 1 reply; 8+ messages in thread
From: Jan Beulich @ 2008-10-30 14:56 UTC (permalink / raw)
To: Keir Fraser; +Cc: xen-devel
>>> Keir Fraser <keir.fraser@eu.citrix.com> 30.10.08 15:46 >>>
>Why does this patch not do what the new comment says and directly get_page()
>when PGT_partial is set and put_page() when PGT_partial is cleared? Doing
Hmm, the description was meant to tell the net effect, not the way it's
implemented.
>get_page() across all __put_page() operations and then skipping the
>put_page() in some cases, for example, seems inefficient and also makes the
>code less clear. Why would you do it that way?
Because I can't handle a failure of get_page() (due to count overflow)
once I'm past the put()s.
Jan
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] x86: fix preemptable page type handling(v2)
2008-10-30 14:56 ` [PATCH] x86: fix preemptable page type handling(v2) Jan Beulich
@ 2008-10-30 15:00 ` Keir Fraser
2008-10-30 17:03 ` [PATCH] x86: fix preemptable page typehandling(v2) Jan Beulich
0 siblings, 1 reply; 8+ messages in thread
From: Keir Fraser @ 2008-10-30 15:00 UTC (permalink / raw)
To: Jan Beulich; +Cc: xen-devel
On 30/10/08 14:56, "Jan Beulich" <jbeulich@novell.com> wrote:
>> get_page() across all __put_page() operations and then skipping the
>> put_page() in some cases, for example, seems inefficient and also makes the
>> code less clear. Why would you do it that way?
>
> Because I can't handle a failure of get_page() (due to count overflow)
> once I'm past the put()s.
We could make get_page() always leave a few available references, and then
define a new get_page() variant which:
A. Does not check the owner
B. Knows that it is incrementing a non-zero count
C. Is able to use the few references never claimed by get_page().
This would be basically a small cmpxchg() loop. Since it's only executed
before setting PGT_partial we know that only one such special reference is
needed per page and we could just make get_page() leave headroom of one
reference.
Simpler code and faster code?
-- Keir
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] x86: fix preemptable page typehandling(v2)
2008-10-30 15:00 ` Keir Fraser
@ 2008-10-30 17:03 ` Jan Beulich
2008-10-30 17:27 ` Keir Fraser
0 siblings, 1 reply; 8+ messages in thread
From: Jan Beulich @ 2008-10-30 17:03 UTC (permalink / raw)
To: Keir Fraser; +Cc: xen-devel
>>> Keir Fraser <keir.fraser@eu.citrix.com> 30.10.08 16:00 >>>
>On 30/10/08 14:56, "Jan Beulich" <jbeulich@novell.com> wrote:
>
>>> get_page() across all __put_page() operations and then skipping the
>>> put_page() in some cases, for example, seems inefficient and also makes the
>>> code less clear. Why would you do it that way?
>>
>> Because I can't handle a failure of get_page() (due to count overflow)
>> once I'm past the put()s.
>
>We could make get_page() always leave a few available references, and then
>define a new get_page() variant which:
> A. Does not check the owner
> B. Knows that it is incrementing a non-zero count
> C. Is able to use the few references never claimed by get_page().
>
>This would be basically a small cmpxchg() loop. Since it's only executed
>before setting PGT_partial we know that only one such special reference is
>needed per page and we could just make get_page() leave headroom of one
>reference.
>
>Simpler code and faster code?
Yes - admittedly I had expected you to dislike a special casing approach like
this...
Jan
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] x86: fix preemptable page typehandling(v2)
2008-10-30 17:03 ` [PATCH] x86: fix preemptable page typehandling(v2) Jan Beulich
@ 2008-10-30 17:27 ` Keir Fraser
2008-10-30 21:19 ` Use of the "has_shutdown_code" code in v3.2 Roger Cruz
0 siblings, 1 reply; 8+ messages in thread
From: Keir Fraser @ 2008-10-30 17:27 UTC (permalink / raw)
To: Jan Beulich; +Cc: xen-devel
On 30/10/08 17:03, "Jan Beulich" <jbeulich@novell.com> wrote:
>> This would be basically a small cmpxchg() loop. Since it's only executed
>> before setting PGT_partial we know that only one such special reference is
>> needed per page and we could just make get_page() leave headroom of one
>> reference.
>>
>> Simpler code and faster code?
>
> Yes - admittedly I had expected you to dislike a special casing approach like
> this...
I think it will overall make the code clearer.
-- Keir
^ permalink raw reply [flat|nested] 8+ messages in thread
* Use of the "has_shutdown_code" code in v3.2
2008-10-30 17:27 ` Keir Fraser
@ 2008-10-30 21:19 ` Roger Cruz
2008-10-30 22:02 ` Keir Fraser
0 siblings, 1 reply; 8+ messages in thread
From: Roger Cruz @ 2008-10-30 21:19 UTC (permalink / raw)
To: xen-devel
We're using the V3.2 of the hypervisor code and while tracking down a
suspend/resume bug, I came across the following code that caused me to
ask myself whether the "has_shutdown_code" should also be cleared when
the domain is resumed.
This hypercall will not set a new shutdown reason if the domain already
has the flag set to 1. So, if a domain is suspended and then resumed,
the flag would remain set.
case SCHEDOP_shutdown_code:
{
struct sched_shutdown sched_shutdown;
ret = -EFAULT;
if ( copy_from_guest(&sched_shutdown, arg, 1) )
break;
ret = 0;
TRACE_3D(TRC_SCHED_SHUTDOWN_CODE,
current->domain->domain_id, current->vcpu_id,
sched_shutdown.reason);
spin_lock(¤t->domain->shutdown_lock);
if ( !current->domain->has_shutdown_code )
{
current->domain->shutdown_code = (u8)sched_shutdown.reason;
current->domain->has_shutdown_code = 1;
}
So I asked myself, should the domain_resume routine also clear that
flag?
void domain_resume(struct domain *d)
{
struct vcpu *v;
/*
* Some code paths assume that shutdown status does not get reset
under
* their feet (e.g., some assertions make this assumption).
*/
domain_pause(d);
spin_lock(&d->shutdown_lock);
d->is_shutting_down = d->is_shut_down = 0; <---- should it also
include: "d->has_shutdown_code = "
for_each_vcpu ( d, v )
{
if ( v->paused_for_shutdown )
vcpu_unpause(v);
v->paused_for_shutdown = 0;
}
spin_unlock(&d->shutdown_lock);
domain_unpause(d);
}
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: Use of the "has_shutdown_code" code in v3.2
2008-10-30 21:19 ` Use of the "has_shutdown_code" code in v3.2 Roger Cruz
@ 2008-10-30 22:02 ` Keir Fraser
0 siblings, 0 replies; 8+ messages in thread
From: Keir Fraser @ 2008-10-30 22:02 UTC (permalink / raw)
To: Roger Cruz, xen-devel
On 30/10/08 21:19, "Roger Cruz" <rcruz@marathontechnologies.com> wrote:
> We're using the V3.2 of the hypervisor code and while tracking down a
> suspend/resume bug, I came across the following code that caused me to
> ask myself whether the "has_shutdown_code" should also be cleared when
> the domain is resumed.
You're musing on a flag which has never been part of mainline Xen. Looks
like you're right though -- a tree which has the has_shutdown_code flag
should clear it on domain resume.
-- Keir
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2008-10-30 22:02 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-10-30 12:27 [PATCH] x86: fix preemptable page type handling (v2) Jan Beulich
2008-10-30 14:46 ` Keir Fraser
2008-10-30 14:56 ` [PATCH] x86: fix preemptable page type handling(v2) Jan Beulich
2008-10-30 15:00 ` Keir Fraser
2008-10-30 17:03 ` [PATCH] x86: fix preemptable page typehandling(v2) Jan Beulich
2008-10-30 17:27 ` Keir Fraser
2008-10-30 21:19 ` Use of the "has_shutdown_code" code in v3.2 Roger Cruz
2008-10-30 22:02 ` Keir Fraser
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.