* [PATCH 1/1] Xen PV support for hugepages
@ 2008-11-04 15:41 dcm
2008-11-05 10:59 ` Keir Fraser
0 siblings, 1 reply; 8+ messages in thread
From: dcm @ 2008-11-04 15:41 UTC (permalink / raw)
To: xen-devel
This is the latest version of a patch that adds hugepage support to the Xen
hypervisor in a PV environment. It is against the latest xen-unstable tree
on xenbits.xensource.com. I believe this version addresses the comments
made about the previous version of the patch.
Hugepage support must be enabled via the hypervisor command line option
"allowhugepage".
It assumes the guest guarantees that the hugepage is physically aligned and
contiguous.
There is currently no support in the tools for saving/restoring/migrating
guests who use hugepages.
Note to the maintainers: What else needs to be done for this patch to be
considered for inclusion into the tree?
Signed-off-by: Dave McCracken <dave.mccracken@oracle.com>
----
--- xen-unstable//./xen/include/asm-x86/mm.h 2008-10-02 14:23:17.000000000 -0500
+++ xen-hpage/./xen/include/asm-x86/mm.h 2008-11-04 08:24:35.000000000 -0600
@@ -240,6 +240,8 @@ pae_copy_root(struct vcpu *v, l3_pgentry
int check_descriptor(const struct domain *, struct desc_struct *d);
+extern int opt_allow_hugepage;
+
/******************************************************************************
* With shadow pagetables, the different kinds of address start
* to get get confusing.
--- xen-unstable//./xen/include/asm-x86/x86_32/page.h 2008-07-17 09:49:27.000000000 -0500
+++ xen-hpage/./xen/include/asm-x86/x86_32/page.h 2008-11-04 08:24:35.000000000 -0600
@@ -115,7 +115,7 @@ extern unsigned int PAGE_HYPERVISOR_NOCA
#define BASE_DISALLOW_MASK (0xFFFFF198U & ~_PAGE_NX)
#define L1_DISALLOW_MASK (BASE_DISALLOW_MASK | _PAGE_GNTTAB)
-#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK)
+#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK & ~_PAGE_PSE)
#define L3_DISALLOW_MASK 0xFFFFF1FEU /* must-be-zero */
#endif /* __X86_32_PAGE_H__ */
--- xen-unstable//./xen/include/asm-x86/x86_64/page.h 2008-10-02 14:23:17.000000000 -0500
+++ xen-hpage/./xen/include/asm-x86/x86_64/page.h 2008-11-04 08:24:35.000000000 -0600
@@ -115,7 +115,7 @@ typedef l4_pgentry_t root_pgentry_t;
#define BASE_DISALLOW_MASK (0xFF800198U & ~_PAGE_NX)
#define L1_DISALLOW_MASK (BASE_DISALLOW_MASK | _PAGE_GNTTAB)
-#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK)
+#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK & ~_PAGE_PSE)
#define L3_DISALLOW_MASK (BASE_DISALLOW_MASK)
#define L4_DISALLOW_MASK (BASE_DISALLOW_MASK)
--- xen-unstable//./xen/arch/x86/mm.c 2008-11-04 08:22:40.000000000 -0600
+++ xen-hpage/./xen/arch/x86/mm.c 2008-11-04 08:24:35.000000000 -0600
@@ -160,6 +160,9 @@ unsigned long total_pages;
#define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
+int opt_allow_hugepage = 0;
+boolean_param("allowhugepage", opt_allow_hugepage);
+
#define l1_disallow_mask(d) \
((d != dom_io) && \
(rangeset_is_empty((d)->iomem_caps) && \
@@ -584,6 +587,26 @@ static int get_page_and_type_from_pagenr
return rc;
}
+static int get_data_page(struct page_info *page, struct domain *d, int writeable)
+{
+ int rc;
+
+ if ( writeable )
+ rc = get_page_and_type(page, d, PGT_writable_page);
+ else
+ rc = get_page(page, d);
+
+ return rc;
+}
+
+static void put_data_page(struct page_info *page, int writeable)
+{
+ if ( writeable )
+ put_page_and_type(page);
+ else
+ put_page(page);
+}
+
/*
* We allow root tables to map each other (a.k.a. linear page tables). It
* needs some special care with reference counts and access permissions:
@@ -656,6 +679,7 @@ get_page_from_l1e(
struct vcpu *curr = current;
struct domain *owner;
int okay;
+ int writeable;
if ( !(l1f & _PAGE_PRESENT) )
return 1;
@@ -698,10 +722,9 @@ get_page_from_l1e(
* contribute to writeable mapping refcounts. (This allows the
* qemu-dm helper process in dom0 to map the domain's memory without
* messing up the count of "real" writable mappings.) */
- okay = (((l1f & _PAGE_RW) &&
- !(unlikely(paging_mode_external(d) && (d != curr->domain))))
- ? get_page_and_type(page, d, PGT_writable_page)
- : get_page(page, d));
+ writeable = (l1f & _PAGE_RW) &&
+ !( unlikely(paging_mode_external(d) && (d != curr->domain)) );
+ okay = get_data_page(page, d, writeable);
if ( !okay )
{
MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
@@ -759,11 +782,43 @@ get_page_from_l2e(
MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
return -EINVAL;
}
+ if ( l2e_get_flags(l2e) & _PAGE_PSE )
+ {
+ unsigned long mfn = l2e_get_pfn(l2e);
+ unsigned long m, me;
+ struct page_info *page = mfn_to_page(mfn);
+ int writeable;
- rc = get_page_and_type_from_pagenr(
- l2e_get_pfn(l2e), PGT_l1_page_table, d, 0);
- if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
- rc = 0;
+ if ( !opt_allow_hugepage )
+ return -EINVAL;
+
+ writeable = l2e_get_flags(l2e) & _PAGE_RW;
+
+ rc = get_data_page(page, d, writeable);
+ if ( unlikely(!rc) )
+ return rc;
+
+ for ( m = mfn+1, me = mfn + (L1_PAGETABLE_ENTRIES-1); m <= me; m++ )
+ {
+ rc = get_data_page(mfn_to_page(m), d, writeable);
+ if ( unlikely(!rc) )
+ {
+ for ( --m; m > mfn; --m )
+ put_data_page(mfn_to_page(m), writeable);
+ put_data_page(page, writeable);
+ return 0;
+ }
+ }
+#ifdef __x86_64__
+ map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, L1_PAGETABLE_ENTRIES,
+ PAGE_HYPERVISOR | l2e_get_flags(l2e));
+#endif
+ } else {
+ rc = get_page_and_type_from_pagenr(
+ l2e_get_pfn(l2e), PGT_l1_page_table, d, 0);
+ if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
+ rc = 0;
+ }
return rc;
}
@@ -955,7 +1010,18 @@ static int put_page_from_l2e(l2_pgentry_
if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
(l2e_get_pfn(l2e) != pfn) )
{
- put_page_and_type(l2e_get_page(l2e));
+ if ( l2e_get_flags(l2e) & _PAGE_PSE )
+ {
+ unsigned long mfn = l2e_get_pfn(l2e);
+ unsigned long m, me;
+ struct page_info *page = mfn_to_page(mfn);
+ int writeable = l2e_get_flags(l2e) & _PAGE_RW;
+
+ for ( m = mfn+1, me = mfn + (L1_PAGETABLE_ENTRIES-1); m <= me; m++ )
+ put_data_page(mfn_to_page(m), writeable);
+ put_data_page(page, writeable);
+ } else
+ put_page_and_type(l2e_get_page(l2e));
return 0;
}
return 1;
--- xen-unstable//./xen/arch/x86/traps.c 2008-11-04 08:22:40.000000000 -0600
+++ xen-hpage/./xen/arch/x86/traps.c 2008-11-04 08:24:35.000000000 -0600
@@ -723,7 +723,8 @@ static void pv_cpuid(struct cpu_user_reg
{
/* Modify Feature Information. */
__clear_bit(X86_FEATURE_VME, &d);
- __clear_bit(X86_FEATURE_PSE, &d);
+ if (!opt_allow_hugepage)
+ __clear_bit(X86_FEATURE_PSE, &d);
__clear_bit(X86_FEATURE_PGE, &d);
__clear_bit(X86_FEATURE_MCE, &d);
__clear_bit(X86_FEATURE_MCA, &d);
@@ -2002,9 +2003,12 @@ static int emulate_privileged_op(struct
case 4: /* Read CR4 */
/*
* Guests can read CR4 to see what features Xen has enabled. We
- * therefore lie about PGE & PSE as they are unavailable to guests.
+ * therefore lie about PGE as it is unavailable to guests.
+ * Also disallow PSE if hugepages are not enabled.
*/
- *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
+ *reg = read_cr4() & ~(X86_CR4_PGE);
+ if (!opt_allow_hugepage)
+ *reg &= ~(X86_CR4_PSE);
break;
default:
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 1/1] Xen PV support for hugepages
2008-11-04 15:41 [PATCH 1/1] Xen PV support for hugepages dcm
@ 2008-11-05 10:59 ` Keir Fraser
2008-11-05 15:13 ` Jan Beulich
0 siblings, 1 reply; 8+ messages in thread
From: Keir Fraser @ 2008-11-05 10:59 UTC (permalink / raw)
To: dcm, xen-devel
On 4/11/08 15:41, "dcm@mccr.org" <dcm@mccr.org> wrote:
> This is the latest version of a patch that adds hugepage support to the Xen
> hypervisor in a PV environment. It is against the latest xen-unstable tree
> on xenbits.xensource.com. I believe this version addresses the comments
> made about the previous version of the patch.
>
> Hugepage support must be enabled via the hypervisor command line option
> "allowhugepage".
>
> It assumes the guest guarantees that the hugepage is physically aligned and
> contiguous.
>
> There is currently no support in the tools for saving/restoring/migrating
> guests who use hugepages.
>
> Note to the maintainers: What else needs to be done for this patch to be
> considered for inclusion into the tree?
I cleaned it up, fixed some return codes, and applied as c/s 18756. It may
be worth Jan Beulich taking a quick look as the return-code conventions for
the get/put functions in mm.c is now rather convoluted. I think I got it
right but I'm not 100% sure. Indeed, Jan, it'd sure be nice to have some
comments explaining what the <0,=0,>0 codes mean from the various get/put
functions, and how they differ between the l1e variants and the others (I'm
pretty sure they do differ).
-- Keir
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 1/1] Xen PV support for hugepages
2008-11-05 10:59 ` Keir Fraser
@ 2008-11-05 15:13 ` Jan Beulich
2008-11-06 13:44 ` Dave McCracken
0 siblings, 1 reply; 8+ messages in thread
From: Jan Beulich @ 2008-11-05 15:13 UTC (permalink / raw)
To: Keir Fraser, dcm; +Cc: xen-devel
>>> Keir Fraser <keir.fraser@eu.citrix.com> 05.11.08 11:59 >>>
>On 4/11/08 15:41, "dcm@mccr.org" <dcm@mccr.org> wrote:
>
>> This is the latest version of a patch that adds hugepage support to the Xen
>> hypervisor in a PV environment. It is against the latest xen-unstable tree
>> on xenbits.xensource.com. I believe this version addresses the comments
>> made about the previous version of the patch.
>>
>> Hugepage support must be enabled via the hypervisor command line option
>> "allowhugepage".
>>
>> It assumes the guest guarantees that the hugepage is physically aligned and
>> contiguous.
>>
>> There is currently no support in the tools for saving/restoring/migrating
>> guests who use hugepages.
>>
>> Note to the maintainers: What else needs to be done for this patch to be
>> considered for inclusion into the tree?
>
>I cleaned it up, fixed some return codes, and applied as c/s 18756. It may
>be worth Jan Beulich taking a quick look as the return-code conventions for
>the get/put functions in mm.c is now rather convoluted. I think I got it
>right but I'm not 100% sure. Indeed, Jan, it'd sure be nice to have some
>comments explaining what the <0,=0,>0 codes mean from the various get/put
>functions, and how they differ between the l1e variants and the others (I'm
>pretty sure they do differ).
The return value handling seems correct, but the call to map_pages_to_xen()
at the end of get_page_from_l2e() doesn't: Simply or-ing the guest-provided
flags into PAGE_HYPERVISOR definitely isn't correct. The cache attributes
must be extracted, converted to L1 notion, and then be or-ed in.
Also, the special code to deal with LDT pages in put_page_from_l1e() should
now also be required in put_page_from_l2e(). Perhaps that logic could be
moved into put_data_page()?
Further, with L2_DISALLOW_MASK now permitting _PAGE_PSE, the use
site in mod_l2_entry() must certainly also be modified, i.e. the subsequent
l2e_has_changed() would need to include _PAGE_PSE, and the case of
_PAGE_RW toggling when _PAGE_PSE was and will be set needs to be
taken care of.
Next, adjust_guest_l2e() would apparently also need to be modified.
Finally, no longer hiding X86_FEATURE_PSE from the guest seems rather
risky: I'm pretty sure the pv-ops kernel relies on this flag being clear - when
set, it would blindly (i.e. without ensuring the underlying memory is
contiguous and suitably aligned) try to use 2Mb mappings for e.g. the 1:1
mapping. I would think this capability ought to be propagated by another
means. Likewise I'm uncertain about letting X86_CR4_PSE shine through.
And a general question: How is a trivial DomU going to be able to make
use of this, without being permitted to allocate non-order-zero chunks of
memory?
Jan
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 1/1] Xen PV support for hugepages
2008-11-05 15:13 ` Jan Beulich
@ 2008-11-06 13:44 ` Dave McCracken
2008-11-06 13:54 ` Keir Fraser
2008-11-06 14:04 ` Jan Beulich
0 siblings, 2 replies; 8+ messages in thread
From: Dave McCracken @ 2008-11-06 13:44 UTC (permalink / raw)
To: Jan Beulich; +Cc: xen-devel, Keir Fraser
On Wednesday 05 November 2008, Jan Beulich wrote:
> The return value handling seems correct, but the call to map_pages_to_xen()
> at the end of get_page_from_l2e() doesn't: Simply or-ing the guest-provided
> flags into PAGE_HYPERVISOR definitely isn't correct. The cache attributes
> must be extracted, converted to L1 notion, and then be or-ed in.
Oops. I thought I'd fixed that. Apparently I looked at it and got
sidetracked. I'll fix.
> Also, the special code to deal with LDT pages in put_page_from_l1e() should
> now also be required in put_page_from_l2e(). Perhaps that logic could be
> moved into put_data_page()?
It looks to me like this is only done for PGT_seg_desc_page. I don't think
any hugepage can be of this type. I can add it to put_data_page() if you
think we need to be sure.
> Further, with L2_DISALLOW_MASK now permitting _PAGE_PSE, the use
> site in mod_l2_entry() must certainly also be modified, i.e. the subsequent
> l2e_has_changed() would need to include _PAGE_PSE, and the case of
> _PAGE_RW toggling when _PAGE_PSE was and will be set needs to be
> taken care of.
>
> Next, adjust_guest_l2e() would apparently also need to be modified.
I'll look more closely into those.
> Finally, no longer hiding X86_FEATURE_PSE from the guest seems rather
> risky: I'm pretty sure the pv-ops kernel relies on this flag being clear -
> when set, it would blindly (i.e. without ensuring the underlying memory is
> contiguous and suitably aligned) try to use 2Mb mappings for e.g. the 1:1
> mapping. I would think this capability ought to be propagated by another
> means. Likewise I'm uncertain about letting X86_CR4_PSE shine through.
>
> And a general question: How is a trivial DomU going to be able to make
> use of this, without being permitted to allocate non-order-zero chunks of
> memory?
This is the point of having hugepages a command line option. It should only
be turned on if you intend to run guests who enforce the alignment rules.
I'm working on ways to get guests to run in aligned memory.
Dave McCracken
Oracle Corp.
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 1/1] Xen PV support for hugepages
2008-11-06 13:44 ` Dave McCracken
@ 2008-11-06 13:54 ` Keir Fraser
2008-11-06 14:04 ` Jan Beulich
1 sibling, 0 replies; 8+ messages in thread
From: Keir Fraser @ 2008-11-06 13:54 UTC (permalink / raw)
To: Dave McCracken, Jan Beulich; +Cc: xen-devel
On 6/11/08 13:44, "Dave McCracken" <dcm@mccr.org> wrote:
>> Also, the special code to deal with LDT pages in put_page_from_l1e() should
>> now also be required in put_page_from_l2e(). Perhaps that logic could be
>> moved into put_data_page()?
>
> It looks to me like this is only done for PGT_seg_desc_page. I don't think
> any hugepage can be of this type. I can add it to put_data_page() if you
> think we need to be sure.
The data pages mapped by a superpage mapping aren't special. One of them
could certainly be PGT_seg_desc_page. Hence moving the logic into
put_data_page() makes sense.
-- Keir
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 1/1] Xen PV support for hugepages
2008-11-06 13:44 ` Dave McCracken
2008-11-06 13:54 ` Keir Fraser
@ 2008-11-06 14:04 ` Jan Beulich
2008-11-07 14:39 ` Dave McCracken
1 sibling, 1 reply; 8+ messages in thread
From: Jan Beulich @ 2008-11-06 14:04 UTC (permalink / raw)
To: Dave McCracken; +Cc: xen-devel, Keir Fraser
>>> Dave McCracken <dcm@mccr.org> 06.11.08 14:44 >>>
>> Finally, no longer hiding X86_FEATURE_PSE from the guest seems rather
>> risky: I'm pretty sure the pv-ops kernel relies on this flag being clear -
>> when set, it would blindly (i.e. without ensuring the underlying memory is
>> contiguous and suitably aligned) try to use 2Mb mappings for e.g. the 1:1
>> mapping. I would think this capability ought to be propagated by another
>> means. Likewise I'm uncertain about letting X86_CR4_PSE shine through.
>>
>> And a general question: How is a trivial DomU going to be able to make
>> use of this, without being permitted to allocate non-order-zero chunks of
>> memory?
>
>This is the point of having hugepages a command line option. It should only
>be turned on if you intend to run guests who enforce the alignment rules.
You mean 'if you intend to run *only* guests ...', including dom0. Any guest
unaware of the connection of X86_FEATURE_PSE and the need to create
contiguous 2M chunks would fail, and any guest not having I/O memory
assigned would never manage to create such chunks.
Jan
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 1/1] Xen PV support for hugepages
2008-11-06 14:04 ` Jan Beulich
@ 2008-11-07 14:39 ` Dave McCracken
2008-11-07 18:07 ` Jeremy Fitzhardinge
0 siblings, 1 reply; 8+ messages in thread
From: Dave McCracken @ 2008-11-07 14:39 UTC (permalink / raw)
To: Jan Beulich; +Cc: xen-devel, Keir Fraser
On Thursday 06 November 2008, Jan Beulich wrote:
> >This is the point of having hugepages a command line option. It should
> > only be turned on if you intend to run guests who enforce the alignment
> > rules.
>
> You mean 'if you intend to run *only* guests ...', including dom0. Any
> guest unaware of the connection of X86_FEATURE_PSE and the need to create
> contiguous 2M chunks would fail, and any guest not having I/O memory
> assigned would never manage to create such chunks.
Hmm... Sounds like a per-domain flag would be a good solution here. I'll look
into it.
Dave McCracken
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 1/1] Xen PV support for hugepages
2008-11-07 14:39 ` Dave McCracken
@ 2008-11-07 18:07 ` Jeremy Fitzhardinge
0 siblings, 0 replies; 8+ messages in thread
From: Jeremy Fitzhardinge @ 2008-11-07 18:07 UTC (permalink / raw)
To: Dave McCracken; +Cc: xen-devel, Keir Fraser
Dave McCracken wrote:
> On Thursday 06 November 2008, Jan Beulich wrote:
>
>>> This is the point of having hugepages a command line option. It should
>>> only be turned on if you intend to run guests who enforce the alignment
>>> rules.
>>>
>> You mean 'if you intend to run *only* guests ...', including dom0. Any
>> guest unaware of the connection of X86_FEATURE_PSE and the need to create
>> contiguous 2M chunks would fail, and any guest not having I/O memory
>> assigned would never manage to create such chunks.
>>
>
> Hmm... Sounds like a per-domain flag would be a good solution here. I'll look
> into it.
>
I think a guest should explicitly enable large page support via
something like vmassist. In other words:
* keep PSE clear - you are not implementing something similar to PSE
* export the ability to map large pages via a feature flag or something
* require guests to explicitly enable large page support
I'm sceptical about the value of this patch as it stands. We already
have a vast number of operating modes and combinations, and I think we
need to have a pretty high bar before adding any more. If this work
arranged things so that we could just set PSE - and thereby make
supporting it on the guest side much simpler - then I think the decision
to include it would be easier.
In practical terms, I think this means that we need to modify Xen and
the domain builder to *always* allocate memory to guests in 2M contig
chunks, and then sort out all the tricky details ;) In practice,
however, we would lose a lot of the benefit of doing this we'd get from
the native kernel, because all the RO pagetable mappings would chop up
the linear and kernel maps so much that we'd not get much chance to use
the 2M mappings. It would, however, make hugetlbfs work without much drama.
J
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2008-11-07 18:07 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-11-04 15:41 [PATCH 1/1] Xen PV support for hugepages dcm
2008-11-05 10:59 ` Keir Fraser
2008-11-05 15:13 ` Jan Beulich
2008-11-06 13:44 ` Dave McCracken
2008-11-06 13:54 ` Keir Fraser
2008-11-06 14:04 ` Jan Beulich
2008-11-07 14:39 ` Dave McCracken
2008-11-07 18:07 ` Jeremy Fitzhardinge
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.