All of lore.kernel.org
 help / color / mirror / Atom feed
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
To: Paul Mackerras <paulus@samba.org>
Cc: linuxppc-dev@ozlabs.org, Kumar Gala <kumar.gala@freescale.com>
Subject: [PATCH 3/10] powerpc/mm: Rework context management for CPUs with no hash table v2
Date: Fri, 19 Dec 2008 16:13:29 +1100	[thread overview]
Message-ID: <20081219051433.AF3C9DDFBD@ozlabs.org> (raw)
In-Reply-To: <1229663599.904385.502157196243.qpush@grosgo>

This reworks the context management code used by 4xx, 8xx and
Freescale BookE. It adds support for SMP by implementing the
concept of a stale context map, which is used to lazily flush the
TLB on processors where a context may have been invalidated. This
also contains the groundwork for generalizing such lazy TLB
flushing by just picking up a new PID and marking the old one
stale; that generalization will be implemented later.

This is a first implementation that uses a global spinlock.

Ideally, we should try to get at least the fast path (context ID
already assigned) lockless or limited to a per-context lock,
but for now this will do.

I tried to keep the UP case reasonably simple to avoid adding
too much overhead to 8xx, which does a lot of context stealing
since it effectively has only 16 PIDs available.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
v2: Fix some bugs with active tracking on SMP

 arch/powerpc/include/asm/mmu-40x.h       |    5 
 arch/powerpc/include/asm/mmu-44x.h       |    5 
 arch/powerpc/include/asm/mmu-8xx.h       |    3 
 arch/powerpc/include/asm/mmu-fsl-booke.h |    5 
 arch/powerpc/include/asm/tlbflush.h      |    2 
 arch/powerpc/mm/mmu_context_nohash.c     |  270 +++++++++++++++++++++++++------
 6 files changed, 235 insertions(+), 55 deletions(-)

--- linux-work.orig/arch/powerpc/include/asm/mmu-40x.h	2008-09-29 14:21:37.000000000 +1000
+++ linux-work/arch/powerpc/include/asm/mmu-40x.h	2008-12-19 15:41:54.000000000 +1100
@@ -54,8 +54,9 @@
 #ifndef __ASSEMBLY__
 
 typedef struct {
-	unsigned long id;
-	unsigned long vdso_base;
+	unsigned int	id;
+	unsigned int	active;
+	unsigned long	vdso_base;
 } mm_context_t;
 
 #endif /* !__ASSEMBLY__ */
Index: linux-work/arch/powerpc/include/asm/mmu-44x.h
===================================================================
--- linux-work.orig/arch/powerpc/include/asm/mmu-44x.h	2008-09-29 14:21:37.000000000 +1000
+++ linux-work/arch/powerpc/include/asm/mmu-44x.h	2008-12-19 15:42:26.000000000 +1100
@@ -56,8 +56,9 @@
 extern unsigned int tlb_44x_hwater;
 
 typedef struct {
-	unsigned long id;
-	unsigned long vdso_base;
+	unsigned int	id;
+	unsigned int	active;
+	unsigned long	vdso_base;
 } mm_context_t;
 
 #endif /* !__ASSEMBLY__ */
Index: linux-work/arch/powerpc/include/asm/mmu-fsl-booke.h
===================================================================
--- linux-work.orig/arch/powerpc/include/asm/mmu-fsl-booke.h	2008-12-08 15:40:33.000000000 +1100
+++ linux-work/arch/powerpc/include/asm/mmu-fsl-booke.h	2008-12-19 15:41:54.000000000 +1100
@@ -76,8 +76,9 @@
 #ifndef __ASSEMBLY__
 
 typedef struct {
-	unsigned long id;
-	unsigned long vdso_base;
+	unsigned int	id;
+	unsigned int	active;
+	unsigned long	vdso_base;
 } mm_context_t;
 #endif /* !__ASSEMBLY__ */
 
Index: linux-work/arch/powerpc/mm/mmu_context_nohash.c
===================================================================
--- linux-work.orig/arch/powerpc/mm/mmu_context_nohash.c	2008-12-19 15:41:54.000000000 +1100
+++ linux-work/arch/powerpc/mm/mmu_context_nohash.c	2008-12-19 16:06:46.000000000 +1100
@@ -14,13 +14,28 @@
  *  as published by the Free Software Foundation; either version
  *  2 of the License, or (at your option) any later version.
  *
+ * TODO:
+ *
+ *   - The global context lock will not scale very well
+ *   - The maps should be dynamically allocated to allow for processors
+ *     that support more PID bits at runtime
+ *   - Implement flush_tlb_mm() by making the context stale and picking
+ *     a new one
+ *   - More aggressively clear stale map bits and maybe find some way to
+ *     also clear mm->cpu_vm_mask bits when processes are migrated
  */
 
+#undef DEBUG
+#define DEBUG_STEAL_ONLY
+#undef DEBUG_MAP_CONSISTENCY
+
+#include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/init.h>
 
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
+#include <linux/spinlock.h>
 
 /*
  *   The MPC8xx has only 16 contexts.  We rotate through them on each
@@ -40,17 +55,14 @@
  */
 
 #ifdef CONFIG_8xx
-#define NO_CONTEXT      	16
 #define LAST_CONTEXT    	15
 #define FIRST_CONTEXT    	0
 
 #elif defined(CONFIG_4xx)
-#define NO_CONTEXT      	256
 #define LAST_CONTEXT    	255
 #define FIRST_CONTEXT    	1
 
 #elif defined(CONFIG_E200) || defined(CONFIG_E500)
-#define NO_CONTEXT      	256
 #define LAST_CONTEXT    	255
 #define FIRST_CONTEXT    	1
 
@@ -58,66 +70,208 @@
 #error Unsupported processor type
 #endif
 
-static unsigned long next_mmu_context;
+static unsigned int next_context, nr_free_contexts;
 static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1];
-static atomic_t nr_free_contexts;
+static unsigned long stale_map[NR_CPUS][LAST_CONTEXT / BITS_PER_LONG + 1];
 static struct mm_struct *context_mm[LAST_CONTEXT+1];
-static void steal_context(void);
+static spinlock_t context_lock = SPIN_LOCK_UNLOCKED;
 
 /* Steal a context from a task that has one at the moment.
- * This is only used on 8xx and 4xx and we presently assume that
- * they don't do SMP.  If they do then this will have to check
- * whether the MM we steal is in use.
- * We also assume that this is only used on systems that don't
- * use an MMU hash table - this is true for 8xx and 4xx.
+ *
+ * This is used when we are running out of available PID numbers
+ * on the processors.
+ *
  * This isn't an LRU system, it just frees up each context in
  * turn (sort-of pseudo-random replacement :).  This would be the
  * place to implement an LRU scheme if anyone was motivated to do it.
  *  -- paulus
+ *
+ * For context stealing, we use a slightly different approach for
+ * SMP and UP. Basically, the UP one is simpler and doesn't use
+ * the stale map as we can just flush the local CPU
+ *  -- benh
  */
-static void steal_context(void)
+#ifdef CONFIG_SMP
+static unsigned int steal_context_smp(unsigned int id)
 {
 	struct mm_struct *mm;
+	unsigned int cpu, max;
 
-	/* free up context `next_mmu_context' */
-	/* if we shouldn't free context 0, don't... */
-	if (next_mmu_context < FIRST_CONTEXT)
-		next_mmu_context = FIRST_CONTEXT;
-	mm = context_mm[next_mmu_context];
-	flush_tlb_mm(mm);
-	destroy_context(mm);
-}
+ again:
+	max = LAST_CONTEXT - FIRST_CONTEXT;
 
+	/* Attempt to free next_context first and then loop until we manage */
+	while (max--) {
+		/* Pick up the victim mm */
+		mm = context_mm[id];
+
+		/* We have a candidate victim, check if it's active, on SMP
+		 * we cannot steal active contexts
+		 */
+		if (mm->context.active) {
+			id ++;
+			if (id > LAST_CONTEXT)
+				id = FIRST_CONTEXT;
+			continue;
+		}
+		pr_debug("[%d] steal context %d from mm @%p\n",
+			 smp_processor_id(), id, mm);
+
+		/* Mark this mm as having no context anymore */
+		mm->context.id = MMU_NO_CONTEXT;
+
+		/* Mark it stale on all CPUs that used this mm */
+		for_each_cpu_mask_nr(cpu, mm->cpu_vm_mask)
+			__set_bit(id, stale_map[cpu]);
+		return id;
+	}
 
-/*
- * Get a new mmu context for the address space described by `mm'.
+	/* This will happen if you have more CPUs than available contexts,
+	 * all we can do here is wait a bit and try again
+	 */
+	spin_unlock(&context_lock);
+	cpu_relax();
+	spin_lock(&context_lock);
+	goto again;
+}
+#endif  /* CONFIG_SMP */
+
+/* Note that this will also be called on SMP if all other CPUs are
+ * offlined, which means that it may be called for cpu != 0. For
+ * this to work, we somewhat assume that CPUs that are onlined
+ * come up with a fully clean TLB (or are cleaned when offlined)
  */
-static inline void get_mmu_context(struct mm_struct *mm)
+static unsigned int steal_context_up(unsigned int id)
 {
-	unsigned long ctx;
+	struct mm_struct *mm;
+	int cpu = smp_processor_id();
 
-	if (mm->context.id != NO_CONTEXT)
-		return;
+	/* Pick up the victim mm */
+	mm = context_mm[id];
+
+	pr_debug("[%d] steal context %d from mm @%p\n", cpu, id, mm);
+
+	/* Mark this mm as having no context anymore */
+	mm->context.id = MMU_NO_CONTEXT;
+
+	/* Flush the TLB for that context */
+	local_flush_tlb_mm(mm);
+
+	/* XXX This clear should ultimately be part of local_flush_tlb_mm */
+	__clear_bit(id, stale_map[cpu]);
+
+	return id;
+}
 
-	while (atomic_dec_if_positive(&nr_free_contexts) < 0)
-		steal_context();
+#ifdef DEBUG_MAP_CONSISTENCY
+static void context_check_map(void)
+{
+	unsigned int id, nrf, nact;
 
-	ctx = next_mmu_context;
-	while (test_and_set_bit(ctx, context_map)) {
-		ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx);
-		if (ctx > LAST_CONTEXT)
-			ctx = 0;
-	}
-	next_mmu_context = (ctx + 1) & LAST_CONTEXT;
-	mm->context.id = ctx;
-	context_mm[ctx] = mm;
+	nrf = nact = 0;
+	for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) {
+		int used = test_bit(id, context_map);
+		if (!used)
+			nrf++;
+		if (used != (context_mm[id] != NULL))
+			pr_err("MMU: Context %d is %s and MM is %p !\n",
+			       id, used ? "used" : "free", context_mm[id]);
+		if (context_mm[id] != NULL)
+			nact += context_mm[id]->context.active;
+	}
+	if (nrf != nr_free_contexts) {
+		pr_err("MMU: Free context count out of sync ! (%d vs %d)\n",
+		       nr_free_contexts, nrf);
+		nr_free_contexts = nrf;
+	}
+	if (nact > num_online_cpus())
+		pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n",
+		       nact, num_online_cpus());
 }
+#else
+static void context_check_map(void) { }
+#endif
 
 void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 {
-	get_mmu_context(next);
+	unsigned int id, cpu = smp_processor_id();
+	unsigned long *map;
 
-	set_context(next->context.id, next->pgd);
+	/* No lockless fast path .. yet */
+	spin_lock(&context_lock);
+
+#ifndef DEBUG_STEAL_ONLY
+	pr_debug("[%d] activating context for mm @%p, active=%d, id=%d\n",
+		 cpu, next, next->context.active, next->context.id);
+#endif
+
+#ifdef CONFIG_SMP
+	/* Mark us active and the previous one not anymore */
+	next->context.active++;
+	if (prev) {
+		WARN_ON(prev->context.active < 1);
+		prev->context.active--;
+	}
+#endif /* CONFIG_SMP */
+
+	/* If we already have a valid assigned context, skip all that */
+	id = next->context.id;
+	if (likely(id != MMU_NO_CONTEXT))
+		goto ctxt_ok;
+
+	/* We really don't have a context, let's try to acquire one */
+	id = next_context;
+	if (id > LAST_CONTEXT)
+		id = FIRST_CONTEXT;
+	map = context_map;
+
+	/* No more free contexts, let's try to steal one */
+	if (nr_free_contexts == 0) {
+#ifdef CONFIG_SMP
+		if (num_online_cpus() > 1) {
+			id = steal_context_smp(id);
+			goto stolen;
+		}
+#endif /* CONFIG_SMP */
+		id = steal_context_up(id);
+		goto stolen;
+	}
+	nr_free_contexts--;
+
+	/* We know there's at least one free context, try to find it */
+	while (__test_and_set_bit(id, map)) {
+		id = find_next_zero_bit(map, LAST_CONTEXT+1, id);
+		if (id > LAST_CONTEXT)
+			id = FIRST_CONTEXT;
+	}
+ stolen:
+	next_context = id + 1;
+	context_mm[id] = next;
+	next->context.id = id;
+
+#ifndef DEBUG_STEAL_ONLY
+	pr_debug("[%d] picked up new id %d, nrf is now %d\n",
+		 cpu, id, nr_free_contexts);
+#endif
+
+	context_check_map();
+ ctxt_ok:
+
+	/* If that context got marked stale on this CPU, then flush the
+	 * local TLB for it and unmark it before we use it
+	 */
+	if (test_bit(id, stale_map[cpu])) {
+		pr_debug("[%d] flushing stale context %d for mm @%p !\n",
+			 cpu, id, next);
+		local_flush_tlb_mm(next);
+
+		/* XXX This clear should ultimately be part of local_flush_tlb_mm */
+		__clear_bit(id, stale_map[cpu]);
+	}
+
+	/* Flick the MMU and release lock */
+	set_context(id, next->pgd);
+	spin_unlock(&context_lock);
 }
 
 /*
@@ -125,7 +279,9 @@ void switch_mmu_context(struct mm_struct
  */
 int init_new_context(struct task_struct *t, struct mm_struct *mm)
 {
-	mm->context.id = NO_CONTEXT;
+	mm->context.id = MMU_NO_CONTEXT;
+	mm->context.active = 0;
+
 	return 0;
 }
 
@@ -134,13 +290,25 @@ int init_new_context(struct task_struct 
  */
 void destroy_context(struct mm_struct *mm)
 {
-	preempt_disable();
-	if (mm->context.id != NO_CONTEXT) {
-		clear_bit(mm->context.id, context_map);
-		mm->context.id = NO_CONTEXT;
-		atomic_inc(&nr_free_contexts);
+	unsigned int id;
+
+	if (mm->context.id == MMU_NO_CONTEXT)
+		return;
+
+	WARN_ON(mm->context.active != 0);
+
+	spin_lock(&context_lock);
+	id = mm->context.id;
+	if (id != MMU_NO_CONTEXT) {
+		__clear_bit(id, context_map);
+		mm->context.id = MMU_NO_CONTEXT;
+#ifdef DEBUG_MAP_CONSISTENCY
+		mm->context.active = 0;
+		context_mm[id] = NULL;
+#endif
+		nr_free_contexts++;
 	}
-	preempt_enable();
+	spin_unlock(&context_lock);
 }
 
 
@@ -149,6 +317,12 @@ void destroy_context(struct mm_struct *m
  */
 void __init mmu_context_init(void)
 {
+	/* Mark init_mm as being active on all possible CPUs since
+	 * we'll get called with prev == init_mm the first time
+	 * we schedule on a given CPU
+	 */
+	init_mm.context.active = NR_CPUS;
+
 	/*
 	 * Some processors have too few contexts to reserve one for
 	 * init_mm, and require using context 0 for a normal task.
@@ -156,7 +330,7 @@ void __init mmu_context_init(void)
 	 * This code assumes FIRST_CONTEXT < 32.
 	 */
 	context_map[0] = (1 << FIRST_CONTEXT) - 1;
-	next_mmu_context = FIRST_CONTEXT;
-	atomic_set(&nr_free_contexts, LAST_CONTEXT - FIRST_CONTEXT + 1);
+	next_context = FIRST_CONTEXT;
+	nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT + 1;
 }
 
Index: linux-work/arch/powerpc/include/asm/tlbflush.h
===================================================================
--- linux-work.orig/arch/powerpc/include/asm/tlbflush.h	2008-12-17 12:28:23.000000000 +1100
+++ linux-work/arch/powerpc/include/asm/tlbflush.h	2008-12-19 15:42:28.000000000 +1100
@@ -29,6 +29,8 @@
 
 #include <linux/mm.h>
 
+#define MMU_NO_CONTEXT      	((unsigned int)-1)
+
 extern void _tlbie(unsigned long address, unsigned int pid);
 extern void _tlbil_all(void);
 extern void _tlbil_pid(unsigned int pid);
Index: linux-work/arch/powerpc/include/asm/mmu-8xx.h
===================================================================
--- linux-work.orig/arch/powerpc/include/asm/mmu-8xx.h	2008-09-29 14:21:37.000000000 +1000
+++ linux-work/arch/powerpc/include/asm/mmu-8xx.h	2008-12-19 15:41:54.000000000 +1100
@@ -137,7 +137,8 @@
 
 #ifndef __ASSEMBLY__
 typedef struct {
-	unsigned long id;
+	unsigned int id;
+	unsigned int active;
 	unsigned long vdso_base;
 } mm_context_t;
 #endif /* !__ASSEMBLY__ */

  parent reply	other threads:[~2008-12-19  5:13 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-12-19  5:13 [PATCH 0/10] powerpc: Preliminary work to enable SMP BookE (v3) Benjamin Herrenschmidt
2008-12-19  5:13 ` [PATCH 1/10] powerpc/4xx: Extended DCR support v2 Benjamin Herrenschmidt
2008-12-19 12:36   ` Josh Boyer
2008-12-19  5:13 ` [PATCH 2/10] powerpc/mm: Split mmu_context handling v3 Benjamin Herrenschmidt
2008-12-19  5:13 ` Benjamin Herrenschmidt [this message]
2008-12-19  5:13 ` [PATCH 4/10] powerpc/mm: Introduce MMU features v3 Benjamin Herrenschmidt
2008-12-19  5:13 ` [PATCH 5/10] powerpc/mm: Add SMP support to no-hash TLB handling v5 Benjamin Herrenschmidt
2008-12-19  5:13 ` [PATCH 6/10] powerpc/mm: Split low level tlb invalidate for nohash processors Benjamin Herrenschmidt
2008-12-19  5:13 ` [PATCH 7/10] powerpc/44x: No need to mask MSR:CE, ME or DE in _tlbil_va on 440 Benjamin Herrenschmidt
2008-12-19 12:37   ` [PATCH 7/10] powerpc/44x: No need to mask MSR:CE,ME " Josh Boyer
2008-12-19  5:13 ` [PATCH 8/10] powerpc/mm: Runtime allocation of mmu context maps for nohash CPUs v2 Benjamin Herrenschmidt
2008-12-19  5:13 ` [PATCH 9/10] powerpc/mm: Rework usage of _PAGE_COHERENT/NO_CACHE/GUARDED v2 Benjamin Herrenschmidt
2009-01-18 19:43   ` Gerhard Pircher
2009-01-18 19:55     ` Benjamin Herrenschmidt
2009-01-18 20:29       ` Gerhard Pircher
2009-01-23 16:51         ` [PATCH] powerpc/mm: Fix handling of _PAGE_COHERENT in BAT setup code Gerhard Pircher
2008-12-19  5:13 ` [PATCH 10/10] powerpc/44x: 44x TLB doesn't need "Guarded" set for all pages Benjamin Herrenschmidt
2008-12-19 12:38   ` Josh Boyer
2008-12-19 16:05 ` [PATCH 0/10] powerpc: Preliminary work to enable SMP BookE (v3) Kumar Gala

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20081219051433.AF3C9DDFBD@ozlabs.org \
    --to=benh@kernel.crashing.org \
    --cc=kumar.gala@freescale.com \
    --cc=linuxppc-dev@ozlabs.org \
    --cc=paulus@samba.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.