public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH, 2.6.9] improved load_balance() tolerance for pinned tasks
@ 2004-10-20 19:36 John Hawkes
  2004-10-20 19:59 ` Ingo Molnar
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: John Hawkes @ 2004-10-20 19:36 UTC (permalink / raw)
  To: nickpiggin, akpm, linux-kernel, jbarnes; +Cc: hawkes

A large number of processes that are pinned to a single CPU results in
every other CPU's load_balance() seeing this overloaded CPU as "busiest",
yet move_tasks() never finds a task to pull-migrate.  This condition
occurs during module unload, but can also occur as a denial-of-service
using sys_sched_setaffinity().  Several hundred CPUs performing this
fruitless load_balance() will livelock on the busiest CPU's runqueue
lock.  A smaller number of CPUs will livelock if the pinned task count
gets high.  This simple patch remedies the more common first problem:
after a move_tasks() failure to migrate anything, the balance_interval
increments.  Using a simple increment, vs.  the more dramatic doubling of
the balance_interval, is conservative and yet also effective.

John Hawkes


Signed-off-by: John Hawkes <hawkes@sgi.com>




Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c	2004-10-19 15:04:11.000000000 -0700
+++ linux/kernel/sched.c	2004-10-19 15:09:50.000000000 -0700
@@ -2123,11 +2123,19 @@
 			 */
 			sd->nr_balance_failed = sd->cache_nice_tries;
 		}
-	} else
-		sd->nr_balance_failed = 0;
 
-	/* We were unbalanced, so reset the balancing interval */
-	sd->balance_interval = sd->min_interval;
+		/*
+		 * We were unbalanced, but unsuccessful in move_tasks(),
+		 * so bump the balance_interval to lessen the lock contention.
+		 */
+		if (sd->balance_interval < sd->max_interval)
+			sd->balance_interval++;
+	} else {
+		sd->nr_balance_failed = 0;
+
+		/* We were unbalanced, so reset the balancing interval */
+		sd->balance_interval = sd->min_interval;
+	}
 
 	return nr_moved;
 

^ permalink raw reply	[flat|nested] 10+ messages in thread
* Re: [PATCH, 2.6.9] improved load_balance() tolerance for pinned tasks
@ 2004-10-22 19:20 John Hawkes
  2004-10-23  4:22 ` Nick Piggin
  0 siblings, 1 reply; 10+ messages in thread
From: John Hawkes @ 2004-10-22 19:20 UTC (permalink / raw)
  To: nickpiggin; +Cc: akpm, jbarnes, linux-kernel

Nick, your patch doesn't work on my 128p to solve the problem.
This variation, however, does work.  It's a patch against 2.6.9.
The difference is in move_tasks().






 linux-2.6-npiggin/kernel/sched.c |   34 +++++++++++++++++++++++-----------
 1 files changed, 23 insertions(+), 11 deletions(-)

Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c	2004-10-22 09:11:12.000000000 -0700
+++ linux/kernel/sched.c	2004-10-22 11:45:10.000000000 -0700
@@ -1770,7 +1770,7 @@
  */
 static inline
 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
-		     struct sched_domain *sd, enum idle_type idle)
+		     struct sched_domain *sd, enum idle_type idle, int *pinned)
 {
 	/*
 	 * We do not migrate tasks that are:
@@ -1780,8 +1780,10 @@
 	 */
 	if (task_running(rq, p))
 		return 0;
-	if (!cpu_isset(this_cpu, p->cpus_allowed))
+	if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+		(*pinned)++;
 		return 0;
+	}
 
 	/* Aggressive migration if we've failed balancing */
 	if (idle == NEWLY_IDLE ||
@@ -1802,11 +1804,11 @@
  */
 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
 		      unsigned long max_nr_move, struct sched_domain *sd,
-		      enum idle_type idle)
+		      enum idle_type idle, int *all_pinned)
 {
 	prio_array_t *array, *dst_array;
 	struct list_head *head, *curr;
-	int idx, pulled = 0;
+	int idx, examined = 0, pulled = 0, pinned = 0;
 	task_t *tmp;
 
 	if (max_nr_move <= 0 || busiest->nr_running <= 1)
@@ -1850,7 +1852,8 @@
 
 	curr = curr->prev;
 
-	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+	examined++;
+	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -1876,6 +1879,8 @@
 		goto skip_bitmap;
 	}
 out:
+	if (unlikely(examined && examined == pinned))
+		*all_pinned = 1;
 	return pulled;
 }
 
@@ -2056,7 +2061,7 @@
 	struct sched_group *group;
 	runqueue_t *busiest;
 	unsigned long imbalance;
-	int nr_moved;
+	int nr_moved, all_pinned = 0;
 
 	spin_lock(&this_rq->lock);
 	schedstat_inc(sd, lb_cnt[idle]);
@@ -2095,11 +2100,16 @@
 		 */
 		double_lock_balance(this_rq, busiest);
 		nr_moved = move_tasks(this_rq, this_cpu, busiest,
-						imbalance, sd, idle);
+						imbalance, sd, idle,
+						&all_pinned);
 		spin_unlock(&busiest->lock);
 	}
-	spin_unlock(&this_rq->lock);
+	/* All tasks on this runqueue were pinned by CPU affinity */
+	if (unlikely(all_pinned))
+		goto out_balanced;
 
+	spin_unlock(&this_rq->lock);
+
 	if (!nr_moved) {
 		schedstat_inc(sd, lb_failed[idle]);
 		sd->nr_balance_failed++;
@@ -2154,7 +2164,7 @@
 	struct sched_group *group;
 	runqueue_t *busiest = NULL;
 	unsigned long imbalance;
-	int nr_moved = 0;
+	int nr_moved = 0, all_pinned = 0;
 
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
 	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
@@ -2174,7 +2184,7 @@
 
 	schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
 	nr_moved = move_tasks(this_rq, this_cpu, busiest,
-					imbalance, sd, NEWLY_IDLE);
+			imbalance, sd, NEWLY_IDLE, &all_pinned);
 	if (!nr_moved)
 		schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
 
@@ -2236,6 +2246,7 @@
 		cpumask_t tmp;
 		runqueue_t *rq;
 		int push_cpu = 0;
+		int all_pinned = 0;
 
 		if (group == busy_group)
 			goto next_group;
@@ -2261,7 +2272,8 @@
 		if (unlikely(busiest == rq))
 			goto next_group;
 		double_lock_balance(busiest, rq);
-		if (move_tasks(rq, push_cpu, busiest, 1, sd, IDLE)) {
+		if (move_tasks(rq, push_cpu, busiest, 1,
+				sd, IDLE, &all_pinned)) {
 			schedstat_inc(busiest, alb_lost);
 			schedstat_inc(rq, alb_gained);
 		} else {

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2004-10-30  0:28 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-10-20 19:36 [PATCH, 2.6.9] improved load_balance() tolerance for pinned tasks John Hawkes
2004-10-20 19:59 ` Ingo Molnar
2004-10-22 13:08 ` Nick Piggin
2004-10-22 19:38 ` John Hawkes
     [not found]   ` <00ee01c4b870$030b80f0$6700a8c0@comcast.net>
2004-10-23  4:27     ` Nick Piggin
2004-10-25 16:02       ` John Hawkes
2004-10-25 23:59         ` Nick Piggin
2004-10-30  0:21         ` Matthew Dobson
  -- strict thread matches above, loose matches on Subject: below --
2004-10-22 19:20 John Hawkes
2004-10-23  4:22 ` Nick Piggin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox