[PATCH 1/2] core scheduler changes

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Srivatsa Vaddagiri <vatsa@in.ibm.com>
To: riel@redhat.com, Ingo Molnar <mingo@elte.hu>,
	Nick Piggin <nickpiggin@yahoo.com.au>,
	Andrew Morton <akpm@osdl.org>
Cc: linux-kernel@vger.kernel.org
Subject: [PATCH 1/2] core scheduler changes
Date: Fri, 26 Jan 2007 11:33:17 +0530	[thread overview]
Message-ID: <20070126060317.GB2487@in.ibm.com> (raw)
In-Reply-To: <20070126060142.GA2487@in.ibm.com>


This patch does several things:

	- Introduces the notion of control window (current set at 1
	  sec - ideally the window size should be adjusted based on
	  number of users to avoid rapid context switches). Bandwidth of each 
	  user is controlled within this window.  rq->last_update tracks where 
	  are in the current window.

	- Modifies scheduler_tick() to account cpu bandwidth consumption
	  by a task group. Basically bandwidth consumed by a task is
	  charged to itself (p->time_slice) -and- to its group as well.

	- A task is forced off the CPU once its group has expired the
	  bandwidth in the current control window. Such a task is also
	  marked as "starving".

	- schedule() avoids picking tasks whose group has expired its
	  bandwidth in current control window. Any task (with non-zero
	  p->timeslice) which is not picked to run in schedule() because of 
	  this reason is marked "starving".
	
	- If a group has bandwidth left and it has starving tasks, then 
	  schedule() prefers picking such tasks over non-starving tasks.
	  This will avoid starvation of lower-priority tasks in a group.


Signed-off-by : Srivatsa Vaddagiri <vatsa@in.ibm.com>

---


diff -puN include/linux/sched.h~cpu-controller-based-on-rtlimit_rt_cpu-patch include/linux/sched.h
--- linux-2.6.20-rc5/include/linux/sched.h~cpu-controller-based-on-rtlimit_rt_cpu-patch	2007-01-19 15:17:27.000000000 +0530
+++ linux-2.6.20-rc5-vatsa/include/linux/sched.h	2007-01-26 09:04:07.000000000 +0530
@@ -531,6 +531,10 @@ struct signal_struct {
 #define is_rt_policy(p)		((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
 #define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
 
+#ifdef CONFIG_FAIRSCHED
+struct cpu_usage;
+#endif
+
 /*
  * Some day this will be a full-fledged user tracking system..
  */
@@ -555,6 +559,10 @@ struct user_struct {
 	/* Hash table maintenance information */
 	struct list_head uidhash_list;
 	uid_t uid;
+#ifdef CONFIG_FAIRSCHED
+	int cpu_limit;
+	struct cpu_usage *cpu_usage;
+#endif
 };
 
 extern struct user_struct *find_user(uid_t);
@@ -1137,6 +1145,9 @@ static inline void put_task_struct(struc
 					/* Not implemented yet, only for 486*/
 #define PF_STARTING	0x00000002	/* being created */
 #define PF_EXITING	0x00000004	/* getting shut down */
+#ifdef CONFIG_FAIRSCHED
+#define PF_STARVING	0x00000010      /* Task starving for CPU */
+#endif
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
 #define PF_DUMPCORE	0x00000200	/* dumped core */
diff -puN kernel/sched.c~cpu-controller-based-on-rtlimit_rt_cpu-patch kernel/sched.c
--- linux-2.6.20-rc5/kernel/sched.c~cpu-controller-based-on-rtlimit_rt_cpu-patch	2007-01-19 15:17:27.000000000 +0530
+++ linux-2.6.20-rc5-vatsa/kernel/sched.c	2007-01-26 09:04:07.000000000 +0530
@@ -266,6 +266,9 @@ struct rq {
 	unsigned long ttwu_local;
 #endif
 	struct lock_class_key rq_lock_key;
+#ifdef CONFIG_FAIRSCHED
+	unsigned long last_update;
+#endif
 };
 
 static DEFINE_PER_CPU(struct rq, runqueues);
@@ -710,6 +713,126 @@ enqueue_task_head(struct task_struct *p,
 	p->array = array;
 }
 
+#ifdef CONFIG_FAIRSCHED
+
+struct cpu_usage {
+       long tokens;
+       unsigned long last_update;
+       int starve_count;
+};
+
+#define task_starving(p)	(p->flags & PF_STARVING)
+
+/* Mark a task starving - either we shortcircuited its timeslice or we didnt
+ * pick it to run (because user ran out of bandwidth limit in current epoch).
+ */
+static inline void set_tsk_starving(struct task_struct *p)
+{
+	struct user_struct *user = p->user;
+	struct cpu_usage *cu;
+
+	if (task_starving(p) || !user->cpu_limit)
+		return;
+
+	cu = per_cpu_ptr(user->cpu_usage, task_cpu(p));
+	cu->starve_count++;
+	p->flags |= PF_STARVING;
+}
+
+/* Clear a task's starving flag */
+static inline void clear_tsk_starving(struct task_struct *p)
+{
+	struct user_struct *user = p->user;
+	struct cpu_usage *cu;
+
+	if (!task_starving(p) || !user->cpu_limit)
+		return;
+
+	cu = per_cpu_ptr(user->cpu_usage, task_cpu(p));
+	cu->starve_count--;
+	p->flags &= ~PF_STARVING;
+}
+
+/* Does the task's group have starving tasks? */
+static inline int is_user_starving(struct task_struct *p)
+{
+	struct user_struct *user = p->user;
+	struct cpu_usage *cu;
+
+	if (!user->cpu_limit)
+		return 0;
+
+	cu = per_cpu_ptr(user->cpu_usage, task_cpu(p));
+	if (cu->starve_count)
+		return 1;
+
+	return 0;
+}
+
+/* Are we past the 1-sec control window? If so, all groups get to renew their
+ * expired tokens.
+ */
+static inline void adjust_control_window(void)
+{
+	struct rq *rq = this_rq();
+	unsigned long delta;
+
+	delta = jiffies - rq->last_update;
+	if (delta >= HZ)
+		rq->last_update += (delta/HZ) * HZ;
+}
+
+/* Account group's cpu usage */
+static inline void inc_cpu_usage(struct task_struct *p)
+{
+	struct user_struct *user = p->user;
+	struct cpu_usage *cu;
+
+	adjust_control_window();
+
+	if (!user->cpu_limit)
+		return;
+
+	cu = per_cpu_ptr(user->cpu_usage, task_cpu(p));
+	cu->tokens--;
+}
+
+static inline int task_over_cpu_limit(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+	struct user_struct *user = p->user;
+	struct cpu_usage *cu;
+
+	adjust_control_window();
+
+	if (!user->cpu_limit)
+	 	return 0;
+
+	cu = per_cpu_ptr(user->cpu_usage, task_cpu(p));
+	if (cu->last_update != rq->last_update) {
+		/* Replenish tokens */
+		cu->tokens += user->cpu_limit * HZ / 100;
+		cu->last_update = rq->last_update;
+	}
+
+	if (cu->tokens <= 0)
+		return 1;
+
+	return 0;
+}
+
+#else
+
+#define task_starving(p)	0
+
+static void inc_cpu_usage(struct task_struct *p) { }
+static int task_over_cpu_limit(struct task_struct *p) { return 0; }
+static void set_tsk_starving(struct task_struct *p) { }
+static void clear_tsk_starving(struct task_struct *p) { }
+static int is_user_starving(struct task_struct *p) { return 0;}
+
+#endif		/* CONFIG_FAIRSCHED */
+
 /*
  * __normal_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
@@ -1607,6 +1730,9 @@ void fastcall sched_fork(struct task_str
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
+#ifdef CONFIG_FAIRSCHED
+	p->flags &= ~PF_STARVING;
+#endif
 	/*
 	 * Share the timeslice between parent and child, thus the
 	 * total amount of pending timeslices in the system doesn't change,
@@ -2074,6 +2200,7 @@ static void pull_task(struct rq *src_rq,
 {
 	dequeue_task(p, src_array);
 	dec_nr_running(p, src_rq);
+	clear_tsk_starving(p);
 	set_task_cpu(p, this_cpu);
 	inc_nr_running(p, this_rq);
 	enqueue_task(p, this_array);
@@ -3137,6 +3264,9 @@ static void task_running_tick(struct rq 
 		return;
 	}
 	spin_lock(&rq->lock);
+
+	inc_cpu_usage(p);
+
 	/*
 	 * The task was running during this tick - update the
 	 * time slice counter. Note: we do not update a thread's
@@ -3163,17 +3293,18 @@ static void task_running_tick(struct rq 
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
 		p->prio = effective_prio(p);
-		p->time_slice = task_timeslice(p);
 		p->first_time_slice = 0;
 
 		if (!rq->expired_timestamp)
 			rq->expired_timestamp = jiffies;
-		if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
+		if (!TASK_INTERACTIVE(p) || expired_starving(rq)
+						|| task_over_cpu_limit(p)) {
 			enqueue_task(p, rq->expired);
 			if (p->static_prio < rq->best_expired_prio)
 				rq->best_expired_prio = p->static_prio;
 		} else
 			enqueue_task(p, rq->active);
+		goto out_unlock;
 	} else {
 		/*
 		 * Prevent a too long timeslice allowing a task to monopolize
@@ -3200,6 +3331,14 @@ static void task_running_tick(struct rq 
 			set_tsk_need_resched(p);
 		}
 	}
+
+	if (task_over_cpu_limit(p)) {
+		dequeue_task(p, rq->active);
+		set_tsk_need_resched(p);
+		enqueue_task(p, rq->expired);
+		set_tsk_starving(p);
+	}
+
 out_unlock:
 	spin_unlock(&rq->lock);
 }
@@ -3416,7 +3555,7 @@ asmlinkage void __sched schedule(void)
 	struct list_head *queue;
 	unsigned long long now;
 	unsigned long run_time;
-	int cpu, idx, new_prio;
+	int cpu, idx, new_prio, array_switch;
 	long *switch_count;
 	struct rq *rq;
 
@@ -3478,6 +3617,7 @@ need_resched_nonpreemptible:
 		else {
 			if (prev->state == TASK_UNINTERRUPTIBLE)
 				rq->nr_uninterruptible++;
+			clear_tsk_starving(prev);
 			deactivate_task(prev, rq);
 		}
 	}
@@ -3493,11 +3633,15 @@ need_resched_nonpreemptible:
 		}
 	}
 
+	array_switch = 0;
+
+pick_next_task:
 	array = rq->active;
 	if (unlikely(!array->nr_active)) {
 		/*
 		 * Switch the active and expired arrays.
 		 */
+		array_switch++;
 		schedstat_inc(rq, sched_switch);
 		rq->active = rq->expired;
 		rq->expired = array;
@@ -3510,6 +3654,25 @@ need_resched_nonpreemptible:
 	queue = array->queue + idx;
 	next = list_entry(queue->next, struct task_struct, run_list);
 
+	/* If we have done an array switch twice, it means we cant find any
+	 * task which isn't above its limit and hence we just run the
+	 * first task on the active array.
+	 */
+	if (array_switch < 2 && (task_over_cpu_limit(next) ||
+			(!task_starving(next) && is_user_starving(next)))) {
+		dequeue_task(next, rq->active);
+		enqueue_task(next, rq->expired);
+		if (next->time_slice)
+			set_tsk_starving(next);
+		goto pick_next_task;
+	}
+
+	if (task_over_cpu_limit(next))
+		rq->last_update = jiffies;
+	if (!next->time_slice)
+		next->time_slice = task_timeslice(next);
+	clear_tsk_starving(next);
+
 	if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
 		unsigned long long delta = now - next->timestamp;
 		if (unlikely((long long)(now - next->timestamp) < 0))
@@ -5061,6 +5224,7 @@ static int __migrate_task(struct task_st
 	if (!cpu_isset(dest_cpu, p->cpus_allowed))
 		goto out;
 
+	clear_tsk_starving(p);
 	set_task_cpu(p, dest_cpu);
 	if (p->array) {
 		/*
_

-- 
Regards,
vatsa

next prev parent reply	other threads:[~2007-01-26  6:03 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-01-26  6:01 [RFC] Fair-user scheduler Srivatsa Vaddagiri
2007-01-26  6:03 ` Srivatsa Vaddagiri [this message]
2007-01-31 15:01   ` [PATCH 1/2] core scheduler changes Srivatsa Vaddagiri
2007-01-26  6:05 ` [PATCH 2/2] Track number of users in the system Srivatsa Vaddagiri
2007-01-26 14:09 ` [RFC] Fair-user scheduler Kirill Korotaev
2007-01-26 18:52   ` Eric Piel
2007-01-31 15:10     ` Srivatsa Vaddagiri
2007-01-26 18:41 ` Chris Friesen
2007-01-31 15:16   ` Srivatsa Vaddagiri

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070126060317.GB2487@in.ibm.com \
    --to=vatsa@in.ibm.com \
    --cc=akpm@osdl.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=nickpiggin@yahoo.com.au \
    --cc=riel@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.