Re: [patch 03/18] sched: introduce primitives to account for CFS bandwidth tracking

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

From: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
To: Paul Turner <pjt@google.com>
Cc: linux-kernel@vger.kernel.org,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Bharata B Rao <bharata@linux.vnet.ibm.com>,
	Dhaval Giani <dhaval.giani@gmail.com>,
	Balbir Singh <bsingharora@gmail.com>,
	Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>,
	Srivatsa Vaddagiri <vatsa@in.ibm.com>,
	Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>,
	Ingo Molnar <mingo@elte.hu>, Pavel Emelyanov <xemul@openvz.org>,
	Jason Baron <jbaron@redhat.com>, Nikhil Rao <ncrao@google.com>
Subject: Re: [patch 03/18] sched: introduce primitives to account for CFS bandwidth tracking
Date: Fri, 22 Jul 2011 16:44:50 +0530	[thread overview]
Message-ID: <20110722111450.GB20315@linux.vnet.ibm.com> (raw)
In-Reply-To: <20110721184756.972636699@google.com>

* Paul Turner <pjt@google.com> [2011-07-21 09:43:28]:

> In this patch we introduce the notion of CFS bandwidth, partitioned into
> globally unassigned bandwidth, and locally claimed bandwidth.
> 
> - The global bandwidth is per task_group, it represents a pool of unclaimed
>   bandwidth that cfs_rqs can allocate from.  
> - The local bandwidth is tracked per-cfs_rq, this represents allotments from
>   the global pool bandwidth assigned to a specific cpu.
> 
> Bandwidth is managed via cgroupfs, adding two new interfaces to the cpu subsystem:
> - cpu.cfs_period_us : the bandwidth period in usecs
> - cpu.cfs_quota_us : the cpu bandwidth (in usecs) that this tg will be allowed
>   to consume over the period above.
> 
> Signed-off-by: Paul Turner <pjt@google.com>
> Signed-off-by: Nikhil Rao <ncrao@google.com>
> Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
> 
> ---
>  init/Kconfig        |   12 +++
>  kernel/sched.c      |  196 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>  kernel/sched_fair.c |   16 ++++
>  3 files changed, 220 insertions(+), 4 deletions(-)
> 
> Index: tip/init/Kconfig
> ===================================================================
> --- tip.orig/init/Kconfig
> +++ tip/init/Kconfig
> @@ -715,6 +715,18 @@ config FAIR_GROUP_SCHED
>  	depends on CGROUP_SCHED
>  	default CGROUP_SCHED
> 
> +config CFS_BANDWIDTH
> +	bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
> +	depends on EXPERIMENTAL
> +	depends on FAIR_GROUP_SCHED
> +	default n
> +	help
> +	  This option allows users to define CPU bandwidth rates (limits) for
> +	  tasks running within the fair group scheduler.  Groups with no limit
> +	  set are considered to be unconstrained and will run with no
> +	  restriction.
> +	  See tip/Documentation/scheduler/sched-bwc.txt for more information.
> +
>  config RT_GROUP_SCHED
>  	bool "Group scheduling for SCHED_RR/FIFO"
>  	depends on EXPERIMENTAL
> Index: tip/kernel/sched.c
> ===================================================================
> --- tip.orig/kernel/sched.c
> +++ tip/kernel/sched.c
> @@ -244,6 +244,14 @@ struct cfs_rq;
> 
>  static LIST_HEAD(task_groups);
> 
> +struct cfs_bandwidth {
> +#ifdef CONFIG_CFS_BANDWIDTH
> +	raw_spinlock_t lock;
> +	ktime_t period;
> +	u64 quota;
> +#endif
> +};
> +
>  /* task group related information */
>  struct task_group {
>  	struct cgroup_subsys_state css;
> @@ -275,6 +283,8 @@ struct task_group {
>  #ifdef CONFIG_SCHED_AUTOGROUP
>  	struct autogroup *autogroup;
>  #endif
> +
> +	struct cfs_bandwidth cfs_bandwidth;
>  };
> 
>  /* task_group_lock serializes the addition/removal of task groups */
> @@ -374,9 +384,48 @@ struct cfs_rq {
> 
>  	unsigned long load_contribution;
>  #endif
> +#ifdef CONFIG_CFS_BANDWIDTH
> +	int runtime_enabled;
> +	s64 runtime_remaining;
> +#endif
>  #endif
>  };
> 
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +#ifdef CONFIG_CFS_BANDWIDTH
> +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
> +{
> +	return &tg->cfs_bandwidth;
> +}
> +
> +static inline u64 default_cfs_period(void);
> +
> +static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
> +{
> +	raw_spin_lock_init(&cfs_b->lock);
> +	cfs_b->quota = RUNTIME_INF;
> +	cfs_b->period = ns_to_ktime(default_cfs_period());
> +}
> +
> +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> +{
> +	cfs_rq->runtime_enabled = 0;
> +}
> +
> +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
> +{}
> +#else
> +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
> +static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
> +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
> +
> +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
> +{
> +	return NULL;
> +}
> +#endif /* CONFIG_CFS_BANDWIDTH */
> +#endif /* CONFIG_FAIR_GROUP_SCHED */
> +
>  /* Real-Time classes' related field in a runqueue: */
>  struct rt_rq {
>  	struct rt_prio_array active;
> @@ -7795,6 +7844,7 @@ static void init_tg_cfs_entry(struct tas
>  	tg->cfs_rq[cpu] = cfs_rq;
>  	init_cfs_rq(cfs_rq, rq);
>  	cfs_rq->tg = tg;
> +	init_cfs_rq_runtime(cfs_rq);

This hunk fails to apply because of the changes introduced by commit
acb5a9ba3bd7 in the tip tree.
> 
>  	tg->se[cpu] = se;
>  	/* se could be NULL for root_task_group */
> @@ -7930,6 +7980,7 @@ void __init sched_init(void)
>  		 * We achieve this by letting root_task_group's tasks sit
>  		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
>  		 */
> +		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
>  		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
>  #endif /* CONFIG_FAIR_GROUP_SCHED */
> 
> @@ -8171,6 +8222,8 @@ static void free_fair_sched_group(struct
>  {
>  	int i;
> 
> +	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
> +
>  	for_each_possible_cpu(i) {
>  		if (tg->cfs_rq)
>  			kfree(tg->cfs_rq[i]);
> @@ -8198,6 +8251,8 @@ int alloc_fair_sched_group(struct task_g
> 
>  	tg->shares = NICE_0_LOAD;
> 
> +	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
> +
>  	for_each_possible_cpu(i) {
>  		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
>  				      GFP_KERNEL, cpu_to_node(i));
> @@ -8569,7 +8624,7 @@ static int __rt_schedulable(struct task_
>  	return walk_tg_tree(tg_schedulable, tg_nop, &data);
>  }
> 
> -static int tg_set_bandwidth(struct task_group *tg,
> +static int tg_set_rt_bandwidth(struct task_group *tg,
>  		u64 rt_period, u64 rt_runtime)
>  {
>  	int i, err = 0;
> @@ -8608,7 +8663,7 @@ int sched_group_set_rt_runtime(struct ta
>  	if (rt_runtime_us < 0)
>  		rt_runtime = RUNTIME_INF;
> 
> -	return tg_set_bandwidth(tg, rt_period, rt_runtime);
> +	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
>  }
> 
>  long sched_group_rt_runtime(struct task_group *tg)
> @@ -8633,7 +8688,7 @@ int sched_group_set_rt_period(struct tas
>  	if (rt_period == 0)
>  		return -EINVAL;
> 
> -	return tg_set_bandwidth(tg, rt_period, rt_runtime);
> +	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
>  }
> 
>  long sched_group_rt_period(struct task_group *tg)
> @@ -8823,6 +8878,128 @@ static u64 cpu_shares_read_u64(struct cg
> 
>  	return (u64) scale_load_down(tg->shares);
>  }
> +
> +#ifdef CONFIG_CFS_BANDWIDTH
> +const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
> +const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
> +
> +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
> +{
> +	int i;
> +	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
> +	static DEFINE_MUTEX(mutex);
> +
> +	if (tg == &root_task_group)
> +		return -EINVAL;
> +
> +	/*
> +	 * Ensure we have at least some bandwidth every period.  This is
> +	 * to prevent reaching a state of large arrears when throttled via
> +	 * entity_tick() resulting in prolonged exit starvation.
> +	 */
> +	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
> +		return -EINVAL;
> +
> +	/*
> +	 * Likewise, bound things on the other side by preventing insane quota
> +	 * periods.  This also allows us to normalize in computing quota
> +	 * feasibility.
> +	 */
> +	if (period > max_cfs_quota_period)
> +		return -EINVAL;
> +
> +	mutex_lock(&mutex);
> +	raw_spin_lock_irq(&cfs_b->lock);
> +	cfs_b->period = ns_to_ktime(period);
> +	cfs_b->quota = quota;
> +	raw_spin_unlock_irq(&cfs_b->lock);
> +
> +	for_each_possible_cpu(i) {
> +		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
> +		struct rq *rq = rq_of(cfs_rq);
> +
> +		raw_spin_lock_irq(&rq->lock);
> +		cfs_rq->runtime_enabled = quota != RUNTIME_INF;
> +		cfs_rq->runtime_remaining = 0;
> +		raw_spin_unlock_irq(&rq->lock);
> +	}
> +	mutex_unlock(&mutex);
> +
> +	return 0;
> +}
> +
> +int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
> +{
> +	u64 quota, period;
> +
> +	period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
> +	if (cfs_quota_us < 0)
> +		quota = RUNTIME_INF;
> +	else
> +		quota = (u64)cfs_quota_us * NSEC_PER_USEC;
> +
> +	return tg_set_cfs_bandwidth(tg, period, quota);
> +}
> +
> +long tg_get_cfs_quota(struct task_group *tg)
> +{
> +	u64 quota_us;
> +
> +	if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
> +		return -1;
> +
> +	quota_us = tg_cfs_bandwidth(tg)->quota;
> +	do_div(quota_us, NSEC_PER_USEC);
> +
> +	return quota_us;
> +}
> +
> +int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
> +{
> +	u64 quota, period;
> +
> +	period = (u64)cfs_period_us * NSEC_PER_USEC;
> +	quota = tg_cfs_bandwidth(tg)->quota;
> +
> +	if (period <= 0)
> +		return -EINVAL;
> +
> +	return tg_set_cfs_bandwidth(tg, period, quota);
> +}
> +
> +long tg_get_cfs_period(struct task_group *tg)
> +{
> +	u64 cfs_period_us;
> +
> +	cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
> +	do_div(cfs_period_us, NSEC_PER_USEC);
> +
> +	return cfs_period_us;
> +}
> +
> +static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
> +{
> +	return tg_get_cfs_quota(cgroup_tg(cgrp));
> +}
> +
> +static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
> +				s64 cfs_quota_us)
> +{
> +	return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
> +}
> +
> +static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
> +{
> +	return tg_get_cfs_period(cgroup_tg(cgrp));
> +}
> +
> +static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
> +				u64 cfs_period_us)
> +{
> +	return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
> +}
> +
> +#endif /* CONFIG_CFS_BANDWIDTH */
>  #endif /* CONFIG_FAIR_GROUP_SCHED */
> 
>  #ifdef CONFIG_RT_GROUP_SCHED
> @@ -8857,6 +9034,18 @@ static struct cftype cpu_files[] = {
>  		.write_u64 = cpu_shares_write_u64,
>  	},
>  #endif
> +#ifdef CONFIG_CFS_BANDWIDTH
> +	{
> +		.name = "cfs_quota_us",
> +		.read_s64 = cpu_cfs_quota_read_s64,
> +		.write_s64 = cpu_cfs_quota_write_s64,
> +	},
> +	{
> +		.name = "cfs_period_us",
> +		.read_u64 = cpu_cfs_period_read_u64,
> +		.write_u64 = cpu_cfs_period_write_u64,
> +	},
> +#endif
>  #ifdef CONFIG_RT_GROUP_SCHED
>  	{
>  		.name = "rt_runtime_us",
> @@ -9166,4 +9355,3 @@ struct cgroup_subsys cpuacct_subsys = {
>  	.subsys_id = cpuacct_subsys_id,
>  };
>  #endif	/* CONFIG_CGROUP_CPUACCT */
> -
> Index: tip/kernel/sched_fair.c
> ===================================================================
> --- tip.orig/kernel/sched_fair.c
> +++ tip/kernel/sched_fair.c
> @@ -1256,6 +1256,22 @@ entity_tick(struct cfs_rq *cfs_rq, struc
>  		check_preempt_tick(cfs_rq, curr);
>  }
> 
> +
> +/**************************************************
> + * CFS bandwidth control machinery
> + */
> +
> +#ifdef CONFIG_CFS_BANDWIDTH
> +/*
> + * default period for cfs group bandwidth.
> + * default: 0.1s, units: nanoseconds
> + */
> +static inline u64 default_cfs_period(void)
> +{
> +	return 100000000ULL;
> +}
> +#endif
> +
>  /**************************************************
>   * CFS operations on tasks:
>   */
> 
>

next prev parent reply	other threads:[~2011-07-22 11:14 UTC|newest]

Thread overview: 60+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-07-21 16:43 [patch 00/18] CFS Bandwidth Control v7.2 Paul Turner
2011-07-21 16:43 ` [patch 01/18] sched: (fixlet) dont update shares twice on on_rq parent Paul Turner
2011-07-22 11:06   ` Kamalesh Babulal
2011-07-21 16:43 ` [patch 02/18] sched: hierarchical task accounting for SCHED_OTHER Paul Turner
2011-08-14 16:15   ` [tip:sched/core] sched: Implement " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 03/18] sched: introduce primitives to account for CFS bandwidth tracking Paul Turner
2011-07-22 11:14   ` Kamalesh Babulal [this message]
2011-08-14 16:17   ` [tip:sched/core] sched: Introduce " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 04/18] sched: validate CFS quota hierarchies Paul Turner
2011-08-14 16:19   ` [tip:sched/core] sched: Validate " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 05/18] sched: accumulate per-cfs_rq cpu usage and charge against bandwidth Paul Turner
2011-08-14 16:21   ` [tip:sched/core] sched: Accumulate " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 06/18] sched: add a timer to handle CFS bandwidth refresh Paul Turner
2011-08-14 16:23   ` [tip:sched/core] sched: Add " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 07/18] sched: expire invalid runtime Paul Turner
2011-08-14 16:24   ` [tip:sched/core] sched: Expire " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 08/18] sched: add support for throttling group entities Paul Turner
2011-08-08 15:46   ` Lin Ming
2011-08-08 16:00     ` Peter Zijlstra
2011-08-08 16:16       ` Paul Turner
2011-08-14 16:26   ` [tip:sched/core] sched: Add " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 09/18] sched: add support for unthrottling " Paul Turner
2011-08-14 16:27   ` [tip:sched/core] sched: Add " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 10/18] sched: allow for positional tg_tree walks Paul Turner
2011-08-14 16:29   ` [tip:sched/core] sched: Allow " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 11/18] sched: prevent interactions with throttled entities Paul Turner
2011-07-22 11:26   ` Kamalesh Babulal
2011-07-22 11:37     ` Peter Zijlstra
2011-07-22 11:41   ` Kamalesh Babulal
2011-07-22 11:43     ` Peter Zijlstra
2011-07-22 18:16       ` Kamalesh Babulal
2011-08-14 16:30   ` [tip:sched/core] sched: Prevent " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 12/18] sched: prevent buddy " Paul Turner
2011-08-14 16:32   ` [tip:sched/core] sched: Prevent " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 13/18] sched: migrate throttled tasks on HOTPLUG Paul Turner
2011-08-14 16:34   ` [tip:sched/core] sched: Migrate " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 14/18] sched: throttle entities exceeding their allowed bandwidth Paul Turner
2011-08-14 16:35   ` [tip:sched/core] sched: Throttle " tip-bot for Paul Turner
2011-07-21 16:43 ` [patch 15/18] sched: add exports tracking cfs bandwidth control statistics Paul Turner
2011-08-14 16:37   ` [tip:sched/core] sched: Add " tip-bot for Nikhil Rao
2011-07-21 16:43 ` [patch 16/18] sched: return unused runtime on group dequeue Paul Turner
2011-08-14 16:39   ` [tip:sched/core] sched: Return " tip-bot for Paul Turner
2011-07-21 16:43 ` [RFT][patch 17/18] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner
2011-07-21 16:43 ` [patch 18/18] sched: add documentation for bandwidth control Paul Turner
2011-08-14 16:41   ` [tip:sched/core] sched: Add " tip-bot for Bharata B Rao
2011-07-21 23:01 ` [patch 00/18] CFS Bandwidth Control v7.2 Paul Turner
2011-07-25 14:58 ` Peter Zijlstra
2011-07-25 15:00   ` Peter Zijlstra
2011-07-25 16:21     ` Paul E. McKenney
2011-07-25 16:28       ` Peter Zijlstra
2011-07-25 16:46         ` Paul E. McKenney
2011-07-25 17:08           ` Peter Zijlstra
2011-07-25 17:11             ` Dhaval Giani
2011-07-25 17:35               ` Peter Zijlstra
2011-07-28  2:59     ` Paul Turner
2011-09-13 12:10 ` Vladimir Davydov
2011-09-13 14:00   ` Peter Zijlstra
2011-09-16  8:06   ` Paul Turner
2011-09-19  8:22     ` Vladimir Davydov
2011-09-19  8:33       ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20110722111450.GB20315@linux.vnet.ibm.com \
    --to=kamalesh@linux.vnet.ibm.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=bharata@linux.vnet.ibm.com \
    --cc=bsingharora@gmail.com \
    --cc=dhaval.giani@gmail.com \
    --cc=jbaron@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=ncrao@google.com \
    --cc=pjt@google.com \
    --cc=seto.hidetoshi@jp.fujitsu.com \
    --cc=svaidy@linux.vnet.ibm.com \
    --cc=vatsa@in.ibm.com \
    --cc=xemul@openvz.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox