From: "Kirill A. Shutemov" <kirill@shutemov.name>
To: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: linux-arch@vger.kernel.org, Juergen Gross <jgross@suse.com>,
Andi Kleen <ak@linux.intel.com>, Arnd Bergmann <arnd@arndb.de>,
linux-mm@kvack.org, x86@kernel.org, linux-kernel@vger.kernel.org,
Andy Lutomirski <luto@amacapital.net>,
Dave Hansen <dave.hansen@intel.com>,
Ingo Molnar <mingo@redhat.com>, "H. Peter Anvin" <hpa@zytor.com>,
Xiong Zhang <xiong.y.zhang@intel.com>,
Andrew Morton <akpm@linux-foundation.org>,
xen-devel <xen-devel@lists.xen.org>,
Linus Torvalds <torvalds@linux-foundation.org>,
Thomas Gleixner <tglx@linutronix.de>,
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Subject: Re: [PATCHv4 18/33] x86/xen: convert __xen_pgd_walk() and xen_cleanmfnmap() to support p4d
Date: Tue, 7 Mar 2017 16:00:09 +0300
Message-ID: <20170307130009.GA2154@node>
In-Reply-To: <ab2868ea-1dd1-d51b-4c5a-921ef5c9a427@oracle.com>
On Mon, Mar 06, 2017 at 03:48:24PM -0500, Boris Ostrovsky wrote:
>
> > +static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
> > + int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
> > + bool last, unsigned long limit)
> > +{
> > + int i, nr, flush = 0;
> > +
> > + nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
> > + for (i = 0; i < nr; i++) {
> > + pud_t *pud;
> > +
> > + if (p4d_none(p4d[i]))
> > + continue;
> > +
> > + pud = pud_offset(&p4d[i], 0);
> > + if (PTRS_PER_PUD > 1)
> > + flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
> > + xen_pud_walk(mm, pud, func, last && i == nr - 1, limit);
> > + }
> > + return flush;
> > +}
>
> ..
>
> > + p4d = p4d_offset(&pgd[i], 0);
> > + if (PTRS_PER_P4D > 1)
> > + flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
> > + xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
>
>
> We are losing flush status at all levels so we need something like
>
> flush |= xen_XXX_walk(...)
+ Xiong (added to Cc).
Thanks for noticing this. The fixup is below.
Please test it; I don't have a setup for this.
>
>
>
> > }
> >
> > -out:
> > /* Do the top level last, so that the callbacks can use it as
> > a cue to do final things like tlb flushes. */
> > flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
> > @@ -1150,57 +1161,97 @@ static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
> > xen_free_ro_pages(pa, PAGE_SIZE);
> > }
> >
> > +static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
> > +{
> > + unsigned long pa;
> > + pte_t *pte_tbl;
> > + int i;
> > +
> > + if (pmd_large(*pmd)) {
> > + pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
> > + xen_free_ro_pages(pa, PMD_SIZE);
> > + return;
> > + }
> > +
> > + pte_tbl = pte_offset_kernel(pmd, 0);
> > + for (i = 0; i < PTRS_PER_PTE; i++) {
> > + if (pte_none(pte_tbl[i]))
> > + continue;
> > + pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT;
> > + xen_free_ro_pages(pa, PAGE_SIZE);
> > + }
> > + set_pmd(pmd, __pmd(0));
> > + xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin);
> > +}
> > +
> > +static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
> > +{
> > + unsigned long pa;
> > + pmd_t *pmd_tbl;
> > + int i;
> > +
> > + if (pud_large(*pud)) {
> > + pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
> > + xen_free_ro_pages(pa, PUD_SIZE);
> > + return;
> > + }
> > +
> > + pmd_tbl = pmd_offset(pud, 0);
> > + for (i = 0; i < PTRS_PER_PMD; i++) {
> > + if (pmd_none(pmd_tbl[i]))
> > + continue;
> > + xen_cleanmfnmap_pmd(pmd_tbl + i, unpin);
> > + }
> > + set_pud(pud, __pud(0));
> > + xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin);
> > +}
> > +
> > +static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
> > +{
> > + unsigned long pa;
> > + pud_t *pud_tbl;
> > + int i;
> > +
> > + if (p4d_large(*p4d)) {
> > + pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
> > + xen_free_ro_pages(pa, P4D_SIZE);
> > + return;
> > + }
> > +
> > + pud_tbl = pud_offset(p4d, 0);
> > + for (i = 0; i < PTRS_PER_PUD; i++) {
> > + if (pud_none(pud_tbl[i]))
> > + continue;
> > + xen_cleanmfnmap_pud(pud_tbl + i, unpin);
> > + }
> > + set_p4d(p4d, __p4d(0));
> > + xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin);
> > +}
> > +
> > /*
> > * Since it is well isolated we can (and since it is perhaps large we should)
> > * also free the page tables mapping the initial P->M table.
> > */
> > static void __init xen_cleanmfnmap(unsigned long vaddr)
> > {
> > - unsigned long va = vaddr & PMD_MASK;
> > - unsigned long pa;
> > - pgd_t *pgd = pgd_offset_k(va);
> > - pud_t *pud_page = pud_offset(pgd, 0);
> > - pud_t *pud;
> > - pmd_t *pmd;
> > - pte_t *pte;
> > + pgd_t *pgd;
> > + p4d_t *p4d;
> > unsigned int i;
> > bool unpin;
> >
> > unpin = (vaddr == 2 * PGDIR_SIZE);
> > - set_pgd(pgd, __pgd(0));
> > - do {
> > - pud = pud_page + pud_index(va);
> > - if (pud_none(*pud)) {
> > - va += PUD_SIZE;
> > - } else if (pud_large(*pud)) {
> > - pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
> > - xen_free_ro_pages(pa, PUD_SIZE);
> > - va += PUD_SIZE;
> > - } else {
> > - pmd = pmd_offset(pud, va);
> > - if (pmd_large(*pmd)) {
> > - pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
> > - xen_free_ro_pages(pa, PMD_SIZE);
> > - } else if (!pmd_none(*pmd)) {
> > - pte = pte_offset_kernel(pmd, va);
> > - set_pmd(pmd, __pmd(0));
> > - for (i = 0; i < PTRS_PER_PTE; ++i) {
> > - if (pte_none(pte[i]))
> > - break;
> > - pa = pte_pfn(pte[i]) << PAGE_SHIFT;
> > - xen_free_ro_pages(pa, PAGE_SIZE);
> > - }
> > - xen_cleanmfnmap_free_pgtbl(pte, unpin);
> > - }
> > - va += PMD_SIZE;
> > - if (pmd_index(va))
> > - continue;
> > - set_pud(pud, __pud(0));
> > - xen_cleanmfnmap_free_pgtbl(pmd, unpin);
> > - }
> > -
> > - } while (pud_index(va) || pmd_index(va));
> > - xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
> > + vaddr &= PMD_MASK;
> > + pgd = pgd_offset_k(vaddr);
> > + p4d = p4d_offset(pgd, 0);
> > + for (i = 0; i < PTRS_PER_P4D; i++) {
> > + if (p4d_none(p4d[i]))
> > + continue;
> > + xen_cleanmfnmap_p4d(p4d + i, unpin);
> > + }
>
> Don't we need to pass vaddr down to all routines so that they select
> appropriate tables? You seem to always be choosing the first one.
IIUC, we clear the whole page table subtree covered by one pgd entry.
So, no, there's no need to pass vaddr down. A pointer to the page table
entry is enough.
But I know virtually nothing about Xen. Please re-check my reasoning.
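To make that concrete: each xen_cleanmfnmap_*() helper above takes a pointer
to a single entry, derives the next-level table from it, and then sweeps all
slots of that table, so nothing below xen_cleanmfnmap() needs to know vaddr.
A stripped-down sketch of the pattern (hypothetical helper name; the
large-page case, the set_*() clearing and the table freeing that the real
helpers do are left out):

	/* One p4d entry in, whole subtree swept. */
	static void __init sweep_p4d_entry(p4d_t *p4d, bool unpin)
	{
		pud_t *pud_tbl = pud_offset(p4d, 0);
		int i;

		/* all PTRS_PER_PUD slots, not just the one for some vaddr */
		for (i = 0; i < PTRS_PER_PUD; i++) {
			if (pud_none(pud_tbl[i]))
				continue;
			xen_cleanmfnmap_pud(pud_tbl + i, unpin);
		}
	}

The only address-dependent step is in xen_cleanmfnmap() itself, which uses
pgd_offset_k(vaddr) to pick the pgd entry whose subtree gets torn down.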
I would also appreciate help with getting the x86 Xen code to work with
5-level paging enabled. For now I've made CONFIG_XEN depend on
!CONFIG_X86_5LEVEL.
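(For reference, that dependency is just a Kconfig guard, roughly of this
shape in arch/x86/xen/Kconfig; the exact hunk is elsewhere in the series,
so take the surrounding lines as an assumption:

	config XEN
		bool "Xen guest support"
		depends on PARAVIRT
		# temporary: the Xen MMU code does not handle 5-level paging yet
		depends on !X86_5LEVEL

Once the Xen walkers cope with a non-folded p4d level, the !X86_5LEVEL
restriction can go away.)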
Fixup:
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index a4079cfab007..d66b7e79781a 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -629,7 +629,8 @@ static int xen_pud_walk(struct mm_struct *mm, pud_t *pud,
pmd = pmd_offset(&pud[i], 0);
if (PTRS_PER_PMD > 1)
flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
- xen_pmd_walk(mm, pmd, func, last && i == nr - 1, limit);
+ flush |= xen_pmd_walk(mm, pmd, func,
+ last && i == nr - 1, limit);
}
return flush;
}
@@ -650,7 +651,8 @@ static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
pud = pud_offset(&p4d[i], 0);
if (PTRS_PER_PUD > 1)
flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
- xen_pud_walk(mm, pud, func, last && i == nr - 1, limit);
+ flush |= xen_pud_walk(mm, pud, func,
+ last && i == nr - 1, limit);
}
return flush;
}
@@ -706,7 +708,7 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
p4d = p4d_offset(&pgd[i], 0);
if (PTRS_PER_P4D > 1)
flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
- xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
+ flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
}
/* Do the top level last, so that the callbacks can use it as
--
Kirill A. Shutemov