public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Jeremy Fitzhardinge <jeremy@goop.org>
To: Ingo Molnar <mingo@elte.hu>
Cc: LKML <linux-kernel@vger.kernel.org>,
	x86@kernel.org, xen-devel <xen-devel@lists.xensource.com>,
	Stephen Tweedie <sct@redhat.com>,
	Eduardo Habkost <ehabkost@redhat.com>,
	Mark McLoughlin <markmc@redhat.com>,
	x86@kernel.org
Subject: [PATCH 17 of 36] x86: preallocate and prepopulate separately
Date: Wed, 25 Jun 2008 00:19:13 -0400	[thread overview]
Message-ID: <70d47cd6a5c05cd4cd6f.1214367553@localhost> (raw)
In-Reply-To: <patchbomb.1214367536@localhost>

Jan Beulich points out that vmalloc_sync_all() assumes that the
kernel's pmd is always expected to be present in the pgd.  The current
pgd construction code will add the pgd to the pgd_list before its pmds
have been pre-populated, thereby making it visible to
vmalloc_sync_all().

However, because pgd_prepopulate_pmd also does the allocation, it may
block and cannot be done under spinlock.

The solution is to preallocate the pmds out of the spinlock, then
populate them while holding the pgd_list lock.

This patch also pulls the pmd preallocation and mop-up functions out
to be common, assuming that the compiler will generate no code for
them when PREALLOCTED_PMDS is 0.  Also, there's no need for pgd_ctor
to clear the pgd again, since it's allocated as a zeroed page.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Jan Beulich <jbeulich@novell.com>
---
 arch/x86/mm/pgtable.c |  177 +++++++++++++++++++++++++++++--------------------
 1 file changed, 105 insertions(+), 72 deletions(-)

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -66,12 +66,6 @@
 static void pgd_ctor(void *p)
 {
 	pgd_t *pgd = p;
-	unsigned long flags;
-
-	/* Clear usermode parts of PGD */
-	memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
-
-	spin_lock_irqsave(&pgd_lock, flags);
 
 	/* If the pgd points to a shared pagetable level (either the
 	   ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -91,8 +85,6 @@
 	/* list required to sync kernel mapping updates */
 	if (!SHARED_KERNEL_PMD)
 		pgd_list_add(pgd);
-
-	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
 static void pgd_dtor(void *pgd)
@@ -120,30 +112,6 @@
 
 #ifdef CONFIG_X86_PAE
 /*
- * Mop up any pmd pages which may still be attached to the pgd.
- * Normally they will be freed by munmap/exit_mmap, but any pmd we
- * preallocate which never got a corresponding vma will need to be
- * freed manually.
- */
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
-{
-	int i;
-
-	for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
-		pgd_t pgd = pgdp[i];
-
-		if (pgd_val(pgd) != 0) {
-			pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
-
-			pgdp[i] = native_make_pgd(0);
-
-			paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
-			pmd_free(mm, pmd);
-		}
-	}
-}
-
-/*
  * In PAE mode, we need to do a cr3 reload (=tlb flush) when
  * updating the top-level pagetable entries to guarantee the
  * processor notices the update.  Since this is expensive, and
@@ -154,31 +122,7 @@
  * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
  * and initialize the kernel pmds here.
  */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
-	pud_t *pud;
-	unsigned long addr;
-	int i;
-
-	pud = pud_offset(pgd, 0);
- 	for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
-	     i++, pud++, addr += PUD_SIZE) {
-		pmd_t *pmd = pmd_alloc_one(mm, addr);
-
-		if (!pmd) {
-			pgd_mop_up_pmds(mm, pgd);
-			return 0;
-		}
-
-		if (i >= KERNEL_PGD_BOUNDARY)
-			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
-			       sizeof(pmd_t) * PTRS_PER_PMD);
-
-		pud_populate(mm, pud, pmd);
-	}
-
-	return 1;
-}
+#define PREALLOCATED_PMDS	UNSHARED_PTRS_PER_PGD
 
 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
 {
@@ -198,35 +142,124 @@
 		write_cr3(read_cr3());
 }
 #else  /* !CONFIG_X86_PAE */
+
 /* No need to prepopulate any pagetable entries in non-PAE modes. */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+#define PREALLOCATED_PMDS	0
+
+#endif	/* CONFIG_X86_PAE */
+
+static void free_pmds(pmd_t *pmds[])
 {
-	return 1;
+	int i;
+
+	for(i = 0; i < PREALLOCATED_PMDS; i++)
+		if (pmds[i])
+			free_page((unsigned long)pmds[i]);
 }
 
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
+static int preallocate_pmds(pmd_t *pmds[])
 {
+	int i;
+	bool failed = false;
+
+	for(i = 0; i < PREALLOCATED_PMDS; i++) {
+		pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+		if (pmd == NULL)
+			failed = true;
+		pmds[i] = pmd;
+	}
+
+	if (failed) {
+		free_pmds(pmds);
+		return -ENOMEM;
+	}
+
+	return 0;
 }
-#endif	/* CONFIG_X86_PAE */
+
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
+{
+	int i;
+
+	for(i = 0; i < PREALLOCATED_PMDS; i++) {
+		pgd_t pgd = pgdp[i];
+
+		if (pgd_val(pgd) != 0) {
+			pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+			pgdp[i] = native_make_pgd(0);
+
+			paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
+			pmd_free(mm, pmd);
+		}
+	}
+}
+
+static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
+{
+	pud_t *pud;
+	unsigned long addr;
+	int i;
+
+	pud = pud_offset(pgd, 0);
+
+ 	for (addr = i = 0; i < PREALLOCATED_PMDS;
+	     i++, pud++, addr += PUD_SIZE) {
+		pmd_t *pmd = pmds[i];
+
+		if (i >= KERNEL_PGD_BOUNDARY)
+			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
+			       sizeof(pmd_t) * PTRS_PER_PMD);
+
+		pud_populate(mm, pud, pmd);
+	}
+}
 
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+	pgd_t *pgd;
+	pmd_t *pmds[PREALLOCATED_PMDS];
+	unsigned long flags;
 
-	/* so that alloc_pmd can use it */
+	pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+
+	if (pgd == NULL)
+		goto out;
+
 	mm->pgd = pgd;
-	if (pgd) {
-		pgd_ctor(pgd);
 
-		if (paravirt_pgd_alloc(mm) != 0 ||
-		    !pgd_prepopulate_pmd(mm, pgd)) {
-			pgd_dtor(pgd);
-			free_page((unsigned long)pgd);
-			pgd = NULL;
-		}
-	}
+	if (preallocate_pmds(pmds) != 0)
+		goto out_free_pgd;
+
+	if (paravirt_pgd_alloc(mm) != 0)
+		goto out_free_pmds;
+
+	/*
+	 * Make sure that pre-populating the pmds is atomic with
+	 * respect to anything walking the pgd_list, so that they
+	 * never see a partially populated pgd.
+	 */
+	spin_lock_irqsave(&pgd_lock, flags);
+
+	pgd_ctor(pgd);
+	pgd_prepopulate_pmd(mm, pgd, pmds);
+
+	spin_unlock_irqrestore(&pgd_lock, flags);
 
 	return pgd;
+
+out_free_pmds:
+	free_pmds(pmds);
+out_free_pgd:
+	free_page((unsigned long)pgd);
+out:
+	return NULL;
 }
 
 void pgd_free(struct mm_struct *mm, pgd_t *pgd)



  parent reply	other threads:[~2008-06-25  4:38 UTC|newest]

Thread overview: 96+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-06-25  4:18 [PATCH 00 of 36] x86/paravirt: groundwork for 64-bit Xen support Jeremy Fitzhardinge
2008-06-25  4:18 ` [PATCH 01 of 36] x86: asm-x86/pgtable.h: fix compiler warning Jeremy Fitzhardinge
2008-06-25  4:18 ` [PATCH 02 of 36] x86: add memory clobber to save/loadsegment Jeremy Fitzhardinge
2008-06-25  4:18 ` [PATCH 03 of 36] x86: add memory barriers to wrmsr Jeremy Fitzhardinge
2008-06-25  4:44   ` Arjan van de Ven
2008-06-25 21:08     ` Jeremy Fitzhardinge
2008-06-25 22:31       ` Arjan van de Ven
2008-06-25 23:05         ` Jeremy Fitzhardinge
2008-06-25 23:18         ` H. Peter Anvin
2008-06-25 23:37           ` Jeremy Fitzhardinge
2008-06-25 23:42             ` H. Peter Anvin
2008-06-25  4:19 ` [PATCH 04 of 36] x86: remove open-coded save/load segment operations Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 05 of 36] x86_64: use write_gdt_entry in vsyscall_set_cpu Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 06 of 36] x86_64: use p??_populate() to attach pages to pagetable Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 07 of 36] x86_64: unify early_ioremap Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 08 of 36] x86_64: Add gate_offset() and gate_segment() macros Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 09 of 36] x86_64: Use __pgd() on mk_kernel_pgd() Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 10 of 36] x86: unify pgd_index Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 11 of 36] x86: unify mmu_context.h Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 12 of 36] x86_64: replace end_pfn with num_physpages Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 13 of 36] x86_64: add prototype for x86_64_start_kernel() Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 14 of 36] x86_64: add sync_cmpxchg Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 15 of 36] x86: simplify vmalloc_sync_all Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 16 of 36] x86/paravirt: add a pgd_alloc/free hooks Jeremy Fitzhardinge
2008-06-25  4:19 ` Jeremy Fitzhardinge [this message]
2008-06-25  4:19 ` [PATCH 18 of 36] x86/paravirt: add debugging for missing operations Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 19 of 36] paravirt_ops: define PARA_INDIRECT for indirect asm calls Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 20 of 36] paravirt/x86_64: move __PAGE_OFFSET to leave a space for hypervisor Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 21 of 36] x86-64: add FIX_PARAVIRT_BOOTMAP fixmap slot Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 22 of 36] x86_64: split x86_64_start_kernel Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 23 of 36] x86_64: adjust mapping of physical pagetables to work with Xen Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 24 of 36] x86_64: create small vmemmap mappings if PSE not available Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 25 of 36] x86_64: PSE no longer a hard requirement Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 26 of 36] x86_64: Split set_pte_vaddr() Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 27 of 36] x86_64: __switch_to(): Move arch_leave_lazy_cpu_mode() to the right place Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 28 of 36] Save %fs and %gs before load_TLS() and arch_leave_lazy_cpu_mode() Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 29 of 36] Use __KERNEL_DS as SS when returning to a kernel thread (VERIFY) Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 30 of 36] x86/paravirt_ops: split sysret and sysexit Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 31 of 36] x86_64 pvops: don't restore user rsp within sysret Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 32 of 36] Add sysret/sysexit pvops for returning to 32-bit compatibility userspace Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 33 of 36] x86_64: ia32entry: replace privileged instructions with pvops Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 34 of 36] x86_64: swapgs pvop with a user-stack can never be called Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 35 of 36] x86_64/paravirt: add adjust_exception_frame Jeremy Fitzhardinge
2008-06-25  4:19 ` [PATCH 36 of 36] x86_64/paravirt: Make load_gs_index() a paravirt operation Jeremy Fitzhardinge
2008-06-25  8:47   ` Ingo Molnar
2008-06-25 11:48     ` Jeremy Fitzhardinge
2008-06-25  8:42 ` [PATCH 00 of 36] x86/paravirt: groundwork for 64-bit Xen support Ingo Molnar
2008-06-25 11:46   ` Jeremy Fitzhardinge
2008-06-25 15:22   ` Ingo Molnar
2008-06-25 20:12     ` Jeremy Fitzhardinge
2008-06-26 10:57       ` Ingo Molnar
2008-06-26 10:58         ` Ingo Molnar
2008-06-26 14:34           ` [Xen-devel] " Jeremy Fitzhardinge
2008-06-27 15:56             ` Ingo Molnar
2008-06-27 16:02               ` Jeremy Fitzhardinge
2008-06-27 16:06                 ` Ingo Molnar
2008-06-27 16:25                   ` Jeremy Fitzhardinge
2008-06-27 16:03             ` Ingo Molnar
2008-06-27 19:04               ` Jeremy Fitzhardinge
2008-06-29  8:43                 ` Ingo Molnar
2008-06-30  3:02                   ` Jeremy Fitzhardinge
2008-06-30  4:35                     ` Yinghai Lu
2008-06-30  5:32                       ` Jeremy Fitzhardinge
2008-06-30  8:21                     ` Ingo Molnar
2008-06-30  9:22                       ` Ingo Molnar
2008-06-30 17:17                         ` Jeremy Fitzhardinge
2008-06-30 18:12                           ` Ingo Molnar
2008-06-30 18:36                             ` Jeremy Fitzhardinge
2008-06-30 18:44                               ` Ingo Molnar
2008-06-30 17:57                         ` Jeremy Fitzhardinge
2008-06-30 18:03                           ` Ingo Molnar
2008-06-30 23:04                         ` Jeremy Fitzhardinge
2008-07-01  8:52                           ` Ingo Molnar
2008-07-01  9:21                             ` Ingo Molnar
2008-07-01 16:10                               ` Jeremy Fitzhardinge
2008-07-01 16:14                               ` Jeremy Fitzhardinge
2008-07-01 20:31                                 ` Ingo Molnar
2008-07-03  9:10                                   ` Ingo Molnar
2008-07-03 15:47                                     ` Jeremy Fitzhardinge
2008-07-03 18:20                                     ` Yinghai Lu
2008-07-03 18:25                                       ` Jeremy Fitzhardinge
2008-07-03 18:30                                         ` Yinghai Lu
2008-07-03 18:41                                           ` Jeremy Fitzhardinge
2008-07-03 18:51                                             ` Yinghai Lu
2008-07-03 19:19                                               ` Yinghai Lu
2008-07-03 19:29                                                 ` Yinghai Lu
2008-07-09  7:42                                                   ` Ingo Molnar
2008-06-26 14:28         ` Jeremy Fitzhardinge
2008-06-26 18:25         ` Jeremy Fitzhardinge
2008-06-26 19:02         ` Jeremy Fitzhardinge
2008-06-25 12:40 ` Andi Kleen
2008-06-25 18:45   ` [Xen-devel] " Keir Fraser
2008-06-25 19:13     ` Andi Kleen
2008-06-25 19:22       ` Keir Fraser
2008-06-25 20:14         ` Andi Kleen
2008-06-25 20:03   ` Jeremy Fitzhardinge

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=70d47cd6a5c05cd4cd6f.1214367553@localhost \
    --to=jeremy@goop.org \
    --cc=ehabkost@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=markmc@redhat.com \
    --cc=mingo@elte.hu \
    --cc=sct@redhat.com \
    --cc=x86@kernel.org \
    --cc=xen-devel@lists.xensource.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox