Re: [RFC PATCH v2 3/3] sched: introduce synchronized idle injection

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

From: Peter Zijlstra <peterz@infradead.org>
To: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: LKML <linux-kernel@vger.kernel.org>,
	Rafael Wysocki <rafael.j.wysocki@intel.com>,
	Len Brown <len.brown@intel.com>,
	Andi Kleen <andi.kleen@intel.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Paul Turner <pjt@google.com>,
	Tim Chen <tim.c.chen@linux.intel.com>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Eduardo Valentin <edubezval@gmail.com>,
	Punit Agrawal <punit.agrawal@arm.com>,
	Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Subject: Re: [RFC PATCH v2 3/3] sched: introduce synchronized idle injection
Date: Tue, 10 Nov 2015 14:23:24 +0100	[thread overview]
Message-ID: <20151110132324.GC17308@twins.programming.kicks-ass.net> (raw)
In-Reply-To: <1447114883-23851-4-git-send-email-jacob.jun.pan@linux.intel.com>

On Mon, Nov 09, 2015 at 04:21:23PM -0800, Jacob Pan wrote:
> +++ b/include/trace/events/sched.h

> +/*
> + * Tracepoint for idle injection
> + */
> +TRACE_EVENT(sched_cfs_idle_inject,
> +
> +	TP_PROTO(char *msg, int throttled),
> +
> +	TP_ARGS(msg, throttled),
> +
> +	TP_STRUCT__entry(
> +		__string(msg, msg)
> +		__field(int, throttled)
> +	),
> +
> +	TP_fast_assign(
> +		__assign_str(msg, msg);
> +		__entry->throttled = throttled;
> +	),
> +
> +	TP_printk("%s: throttled=%d", __get_str(msg), __entry->throttled)
> +);

So I hate tracepoints.. and I'd rather not see them. But at the very
least kill that @msg field and replace it with an enum or so.


> +/*
> + * Knobs for controlling percentage of time when idle is forced across all
> + * CPUs. This is a power management feature intended for achieving deepest
> + * and broadest idle without lower CPU frequencies to less optimal level.
> + * No action is taken if CPUs are natually idle.
> + */
> +#ifdef CONFIG_CFS_IDLE_INJECT
> +unsigned int sysctl_sched_cfs_idle_inject_pct;
> +unsigned int sysctl_sched_cfs_idle_inject_duration = 10UL;

since you're playing the ifdef game, you might as well also do:

static inline void cfs_rq_nr_running_inc(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq->nr_running++ && !cfs_rq->forced_idle)
		cfs_rq->runnable = true;
}

static inline bool cfs_rq_runnable(struct cfs_rq *cfs_rq)
{
	return cfs_rq->runnable;
}

#else

static inline void cfs_rq_nr_running_inc(struct cfs_rq *cfs_rq)
{
	cfs_rq->nr_running++;
}

static inline bool cfs_rq_runnable(struct cfs_rq *cfs_rq)
{
	return !!cfs_rq->nr_running;
}

> +#endif
> +
>  static inline void update_load_add(struct load_weight *lw, unsigned long inc)
>  {
>  	lw->weight += inc;
> @@ -2334,7 +2346,9 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  		list_add(&se->group_node, &rq->cfs_tasks);
>  	}
>  #endif
> -	cfs_rq->nr_running++;
> +
> +	if (!cfs_rq->nr_running++ && !cfs_rq->forced_idle)
> +		cfs_rq->runnable = true;

which makes that:
	cfs_rq_nr_running_inc();

>  }
>  
>  static void
> @@ -2347,7 +2361,9 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
>  		list_del_init(&se->group_node);
>  	}
> -	cfs_rq->nr_running--;
> +
> +	if (!--cfs_rq->nr_running && !cfs_rq->forced_idle)
> +		cfs_rq->runnable = false;

	cfs_rq_nr_running_dec();

>  }
>  
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -5139,7 +5155,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev)
>  
>  again:
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> -	if (!cfs_rq->nr_running)
> +	if (!cfs_rq->runnable)

	if (!cfs_rq_runnable(cfs_rq))
>  		goto idle;
>  
>  	if (prev->sched_class != &fair_sched_class)

>  idle:
> +	if ((cfs_rq->forced_idle)) {
> +		if (unlikely(local_softirq_pending())) {
> +			trace_sched_cfs_idle_inject("softirq pending", 1);

> +			cfs_rq->forced_idle = false;
> +			cfs_rq->runnable = cfs_rq->nr_running;

maybe:
			__unthrottle_cfs_rq(cfs_rq); ?

> +			goto again;
> +		}
> +		trace_sched_cfs_idle_inject("forced idle", 1);
> +		return NULL;
> +	}
>  	/*
>  	 * This is OK, because current is on_cpu, which avoids it being picked
>  	 * for load-balance and preemption/IRQs are still disabled avoiding
> @@ -8318,3 +8344,350 @@ __init void init_sched_fair_class(void)
>  #endif /* SMP */
>  
>  }
> +
> +#ifdef CONFIG_CFS_IDLE_INJECT

> +static atomic_t idle_inject_active;

You only use atomic_{read,set} on this, therefore atomic_t is pointless.

> +static DEFINE_PER_CPU(struct hrtimer, idle_inject_timer);
> +static DEFINE_PER_CPU(bool, idle_injected);

I tend to prefer to not use bool as a storage class; its ill defined.

> +/* protect injection parameters from runtime changes */
> +static DEFINE_SPINLOCK(idle_inject_lock);

A global lock, yay :-), I think you want this to be a RAW_SPINLOCK
though. As on -RT this would want to actually run from IRQ context too.

> +static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *hrtimer)
> +{
> +	struct hrtimer *hrt = this_cpu_ptr(&idle_inject_timer);
> +	int cpu = smp_processor_id();
> +	ktime_t now, delta, period;
> +	bool status;
> +
> +	now = hrtimer_cb_get_time(hrt);

You're not interested in the current time.

> +
> +	status = raw_cpu_read(idle_injected);
> +	if (status) {
> +		/*
> +		 * We were injecting idle in the last phase, let's forward the
> +		 * timer to the next period
> +		 *
> +		 * status: 1             0                1        0
> +		 * ____          ____________________           _______
> +		 *     |________|                    |_________|
> +		 *
> +		 *     |duration|      interval      |
> +		 *
> +		 *              ^ we are here
> +		 *                  forward to here: ^
> +		 */
> +		delta = ktime_sub(now, inject_start_time);
> +		period = ktime_add(ms_to_ktime(duration),
> +				ms_to_ktime(inject_interval));
> +		delta = ktime_roundup(delta, period);
> +		hrtimer_set_expires(hrt, ktime_add(delta, inject_start_time));

This doesn't make any sense. Who cares what the current time is.

> +	} else {
> +		/*
> +		 * We were not injecting idle in the last phase, let's forward
> +		 * timer after forced idle duration
> +		 * ____          ____________________           _______
> +		 *     |________|                    |_________|
> +		 *
> +		 *     |duration|      interval      |
> +		 *
> +		 *     ^ we are here
> +		 *              ^ forward timer to here
> +		 */
> +		hrtimer_set_expires(hrt, ktime_add(ms_to_ktime(duration), now));

Same here, we don't care about the current time. The timer was at the
previous start of injection, just forward it a whole period to find the
next injection slot.

> +	}

It looks like what you want is:

	hrtimer_forward(hrt, period);

unconditionally.

> +	raw_cpu_write(idle_injected, !status);
> +	trace_sched_cfs_idle_inject("idle sync timer", !status);
> +	if (status)
> +		unthrottle_rq(cpu);
> +	else
> +		throttle_rq(cpu);
> +
> +	return HRTIMER_RESTART;
> +}
> +
> +static void idle_inject_timer_start(void *info)
> +{
> +	int cpu = smp_processor_id();
> +	struct hrtimer *hrt = this_cpu_ptr(&idle_inject_timer);
> +
> +	this_cpu_write(idle_injected, true);
> +	set_bit(cpu, idle_inject_cpumask);
> +	hrtimer_start(hrt, ms_to_ktime(duration), HRTIMER_MODE_ABS_PINNED);
> +	hrtimer_set_expires(hrt, *(ktime_t *)info);

This is broken, _first_ set an expiration time, then start the timer.

Now you insert the timer into the RB tree on a previous expiration time,
then you modify the expiration time under it, effectively wrecking the
RB tree.

> +}

> +static void stop_idle_inject(void)
> +{
> +	int i;
> +	struct hrtimer *hrt;
> +
> +	if (bitmap_weight(idle_inject_cpumask, num_possible_cpus())) {

I don't get the point of this bitmap; with the cpu notifier you
basically ensure this is equal to online_mask.

Also, this weight test is pointless, if the bitmap is empty the
for_each_set_bit() should be of equal cost -- and afaict nothing calling
this is performance critical in the first place.

> +		for_each_set_bit(i, idle_inject_cpumask, num_possible_cpus()) {

> +			hrt = &per_cpu(idle_inject_timer, i);
> +			hrtimer_cancel(hrt);
> +			unthrottle_rq(i);
> +		}
> +	}
> +}
> +
> +static int idle_inject_cpu_callback(struct notifier_block *nfb,
> +				unsigned long action, void *hcpu)
> +{
> +	unsigned long cpu = (unsigned long)hcpu;
> +	struct hrtimer *hrt = &per_cpu(idle_inject_timer, cpu);
> +	ktime_t now, delta, period;
> +
> +	if (!atomic_read(&idle_inject_active))
> +		goto exit_ok;

We should never get here if that weren't set, right? I mean you
register/unregister these callbacks around setting that variable.

> +
> +	switch (action) {
> +	case CPU_STARTING:
> +		raw_cpu_write(idle_injected, true);
> +
> +		hrtimer_init(hrt, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
> +		hrt->function = idle_inject_timer_fn;
> +		set_bit(cpu, idle_inject_cpumask);
> +
> +		now = hrtimer_cb_get_time(hrt);
> +		hrtimer_start(hrt, ms_to_ktime(duration),
> +			HRTIMER_MODE_ABS_PINNED);
> +		/*
> +		 * When a new CPU comes online, we need to make sure it aligns
> +		 * its phase with the rest of the CPUs. So we set the
> +		 * timer to the next period based on the common starting time,
> +		 * then start injecting idle time.
> +		 */
> +		spin_lock_irq(&idle_inject_lock);
> +		delta = ktime_sub(now, inject_start_time);
> +		period = ktime_add(ms_to_ktime(duration),
> +				ms_to_ktime(inject_interval));
> +		delta = ktime_roundup(delta, period);
> +		spin_unlock_irq(&idle_inject_lock);
> +		hrtimer_set_expires(hrt, ktime_add(delta, inject_start_time));

Same broken, you cannot call that on a timer you've already started.

> +		break;
> +	case CPU_DYING:
> +		clear_bit(cpu, idle_inject_cpumask);
> +		hrtimer_cancel(hrt);
> +		raw_cpu_write(idle_injected, false);
> +		unthrottle_rq(cpu);
> +		break;
> +	default:
> +		return NOTIFY_DONE;
> +	}
> +exit_ok:
> +	return NOTIFY_OK;
> +}
> +
> +static int idle_inject_pm_callback(struct notifier_block *self,
> +				unsigned long action, void *hcpu)
> +{
> +	switch (action) {
> +	case PM_HIBERNATION_PREPARE:
> +	case PM_SUSPEND_PREPARE:
> +		if (atomic_read(&idle_inject_active))
> +			stop_idle_inject();

As with the above, if that were false, this whole callback would not be
called, seeing how you unregister before actually clearing that
idle_inject_active thing.

> +		break;
> +	case PM_POST_HIBERNATION:
> +	case PM_POST_SUSPEND:
> +		printk("POST SUSPEND restart idle injection\n");

Seems a tad inconsistent, printing here but not when stopping it.

> +		start_idle_inject();
> +		break;
> +	default:
> +		break;
> +	}
> +	return NOTIFY_OK;
> +}
> +
> +static struct notifier_block idle_inject_pm_notifier = {
> +	.notifier_call = idle_inject_pm_callback,
> +};
> +
> +static struct notifier_block idle_inject_cpu_notifier = {
> +	.notifier_call = idle_inject_cpu_callback,
> +};
> +
> +static void end_idle_inject(void) {
> +	unregister_hotcpu_notifier(&idle_inject_cpu_notifier);
> +	unregister_pm_notifier(&idle_inject_pm_notifier);

As per the above, these callbacks will not happen hereafter, and will
this never see:

> +	atomic_set(&idle_inject_active, 0);
> +	kfree(idle_inject_cpumask);
> +}
> +
> +static int prepare_idle_inject(void)
> +{
> +	int retval = 0;
> +	int bitmap_size;
> +	int cpu;
> +	struct hrtimer *hrt;
> +
> +	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);

This is incorrect, you want nr_cpu_ids. There is no guarantee the CPU
space does not contain holes. But seeing I still don't see the point of
the mask, this might all fix itself by killing it alltogether.

> +	idle_inject_cpumask = kzalloc(bitmap_size, GFP_KERNEL);
> +	if (!idle_inject_cpumask)
> +		return -ENOMEM;
> +
> +	retval = register_pm_notifier(&idle_inject_pm_notifier);
> +	if (retval)
> +		goto exit_free;
> +	retval = register_hotcpu_notifier(&idle_inject_cpu_notifier);
> +	if (retval)
> +		goto exit_unregister_pm;
> +	get_online_cpus();
> +	for_each_online_cpu(cpu) {
> +		hrt = &per_cpu(idle_inject_timer, cpu);
> +		hrtimer_init(hrt, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
> +		hrt->function = idle_inject_timer_fn;
> +	}
> +	put_online_cpus();
> +
> +	if (!duration)
> +		duration = DEFAULT_DURATION_MSECS;
> +
> +	return 0;
> +exit_unregister_pm:
> +	unregister_pm_notifier(&idle_inject_pm_notifier);
> +exit_free:
> +	kfree(idle_inject_cpumask);
> +	return retval;
> +}
> +
> +int proc_sched_cfs_idle_inject_pct_handler(struct ctl_table *table,
> +					int write,
> +					void __user *buffer,
> +					size_t *length,	loff_t *ppos)
> +{
> +	int ret;
> +
> +	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
> +	if (ret)
> +		goto out;
> +
> +	if (idle_pct != sysctl_sched_cfs_idle_inject_pct) {
> +		if (!idle_pct)
> +			start_idle_inject();
> +		else if (!sysctl_sched_cfs_idle_inject_pct) {
> +			stop_idle_inject();
> +			end_idle_inject();
> +		}
> +
> +		/* recompute injection parameters */
> +		spin_lock_irq(&idle_inject_lock);
> +		idle_pct = sysctl_sched_cfs_idle_inject_pct;
> +		/*
> +		 * duration is fixed for each injection period, we adjust
> +		 * non idle interval to satisfy the idle percentage set
> +		 * by the user. e.g. if duration is 10 and we want 33% idle
> +		 * then interval is 20.
> +		 * 33% idle
> +		 * ____          ___________________          _________
> +		 *     |________|                   |________| 33% idle
> +		 * ____          ________          _______
> +		 *     |________|        |________|  50% idle
> +		 *
> +		 *     |duration|interval|
> +		 */
> +		if (idle_pct)
> +			inject_interval = (duration * (100 - idle_pct))
> +				/ idle_pct;

This needs {} (or just exceed the 80 char thing).

> +		spin_unlock_irq(&idle_inject_lock);
> +
> +	}
> +out:
> +	return ret;
> +}
> +
> +int proc_sched_cfs_idle_inject_duration_handler(struct ctl_table *table,
> +						int write,
> +						void __user *buffer,
> +						size_t *length,	loff_t *ppos)
> +{
> +	int ret;
> +
> +	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
> +	if (ret)
> +		goto out;
> +
> +	if (duration == sysctl_sched_cfs_idle_inject_duration)
> +		goto out;
> +	/* recompute injection parameters */
> +	spin_lock_irq(&idle_inject_lock);
> +	duration = jiffies_to_msecs(sysctl_sched_cfs_idle_inject_duration);
> +	if (idle_pct)
> +		inject_interval = (duration * (100 - idle_pct)) / idle_pct;
> +
> +	spin_unlock_irq(&idle_inject_lock);
> +out:
> +	return ret;
> +}

And since you have proc handlers for both these, why not convert to
ktime here and avoid the enless ms_to_ktime() calls ?

Also, maybe precompute the period, since that is what you really need.

next prev parent reply	other threads:[~2015-11-10 13:23 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-11-10  0:21 [RFC PATCH v2 0/3] CFS idle injection Jacob Pan
2015-11-10  0:21 ` [RFC PATCH v2 1/3] ktime: add a roundup function Jacob Pan
2015-11-10  0:21 ` [RFC PATCH v2 2/3] timer: relax tick stop in idle entry Jacob Pan
2015-11-10  0:21 ` [RFC PATCH v2 3/3] sched: introduce synchronized idle injection Jacob Pan
2015-11-10 13:23   ` Peter Zijlstra [this message]
2015-11-10 14:01     ` Jacob Pan
2015-11-10 14:58       ` Peter Zijlstra
2015-11-10 16:28         ` Jacob Pan
2015-11-10 16:36           ` Peter Zijlstra
2015-11-10 16:50             ` Jacob Pan
2015-11-10 17:00               ` Peter Zijlstra
2015-11-10 17:14                 ` Jacob Pan
2015-11-10 18:41     ` Jacob Pan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20151110132324.GC17308@twins.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=andi.kleen@intel.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=edubezval@gmail.com \
    --cc=jacob.jun.pan@linux.intel.com \
    --cc=len.brown@intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=pjt@google.com \
    --cc=punit.agrawal@arm.com \
    --cc=rafael.j.wysocki@intel.com \
    --cc=srinivas.pandruvada@linux.intel.com \
    --cc=tglx@linutronix.de \
    --cc=tim.c.chen@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox