Re: [RFC PATCH v2 3/3] sched: introduce synchronized idle injection

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Zijlstra <peterz@infradead.org>
To: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: LKML <linux-kernel@vger.kernel.org>,
	Rafael Wysocki <rafael.j.wysocki@intel.com>,
	Len Brown <len.brown@intel.com>,
	Andi Kleen <andi.kleen@intel.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Paul Turner <pjt@google.com>,
	Tim Chen <tim.c.chen@linux.intel.com>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Eduardo Valentin <edubezval@gmail.com>,
	Punit Agrawal <punit.agrawal@arm.com>,
	Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Subject: Re: [RFC PATCH v2 3/3] sched: introduce synchronized idle injection
Date: Tue, 10 Nov 2015 14:23:24 +0100	[thread overview]
Message-ID: <20151110132324.GC17308@twins.programming.kicks-ass.net> (raw)
In-Reply-To: <1447114883-23851-4-git-send-email-jacob.jun.pan@linux.intel.com>

On Mon, Nov 09, 2015 at 04:21:23PM -0800, Jacob Pan wrote:
> +++ b/include/trace/events/sched.h

> +/*
> + * Tracepoint for idle injection
> + */
> +TRACE_EVENT(sched_cfs_idle_inject,
> +
> +	TP_PROTO(char *msg, int throttled),
> +
> +	TP_ARGS(msg, throttled),
> +
> +	TP_STRUCT__entry(
> +		__string(msg, msg)
> +		__field(int, throttled)
> +	),
> +
> +	TP_fast_assign(
> +		__assign_str(msg, msg);
> +		__entry->throttled = throttled;
> +	),
> +
> +	TP_printk("%s: throttled=%d", __get_str(msg), __entry->throttled)
> +);

So I hate tracepoints.. and I'd rather not see them. But at the very
least kill that @msg field and replace it with an enum or so.


> +/*
> + * Knobs for controlling percentage of time when idle is forced across all
> + * CPUs. This is a power management feature intended for achieving deepest
> + * and broadest idle without lower CPU frequencies to less optimal level.
> + * No action is taken if CPUs are natually idle.
> + */
> +#ifdef CONFIG_CFS_IDLE_INJECT
> +unsigned int sysctl_sched_cfs_idle_inject_pct;
> +unsigned int sysctl_sched_cfs_idle_inject_duration = 10UL;

since you're playing the ifdef game, you might as well also do:

static inline void cfs_rq_nr_running_inc(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq->nr_running++ && !cfs_rq->forced_idle)
		cfs_rq->runnable = true;
}

static inline bool cfs_rq_runnable(struct cfs_rq *cfs_rq)
{
	return cfs_rq->runnable;
}

#else

static inline void cfs_rq_nr_running_inc(struct cfs_rq *cfs_rq)
{
	cfs_rq->nr_running++;
}

static inline bool cfs_rq_runnable(struct cfs_rq *cfs_rq)
{
	return !!cfs_rq->nr_running;
}

> +#endif
> +
>  static inline void update_load_add(struct load_weight *lw, unsigned long inc)
>  {
>  	lw->weight += inc;
> @@ -2334,7 +2346,9 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  		list_add(&se->group_node, &rq->cfs_tasks);
>  	}
>  #endif
> -	cfs_rq->nr_running++;
> +
> +	if (!cfs_rq->nr_running++ && !cfs_rq->forced_idle)
> +		cfs_rq->runnable = true;

which makes that:
	cfs_rq_nr_running_inc();

>  }
>  
>  static void
> @@ -2347,7 +2361,9 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
>  		list_del_init(&se->group_node);
>  	}
> -	cfs_rq->nr_running--;
> +
> +	if (!--cfs_rq->nr_running && !cfs_rq->forced_idle)
> +		cfs_rq->runnable = false;

	cfs_rq_nr_running_dec();

>  }
>  
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -5139,7 +5155,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev)
>  
>  again:
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> -	if (!cfs_rq->nr_running)
> +	if (!cfs_rq->runnable)

	if (!cfs_rq_runnable(cfs_rq))
>  		goto idle;
>  
>  	if (prev->sched_class != &fair_sched_class)

>  idle:
> +	if ((cfs_rq->forced_idle)) {
> +		if (unlikely(local_softirq_pending())) {
> +			trace_sched_cfs_idle_inject("softirq pending", 1);

> +			cfs_rq->forced_idle = false;
> +			cfs_rq->runnable = cfs_rq->nr_running;

maybe:
			__unthrottle_cfs_rq(cfs_rq); ?

> +			goto again;
> +		}
> +		trace_sched_cfs_idle_inject("forced idle", 1);
> +		return NULL;
> +	}
>  	/*
>  	 * This is OK, because current is on_cpu, which avoids it being picked
>  	 * for load-balance and preemption/IRQs are still disabled avoiding
> @@ -8318,3 +8344,350 @@ __init void init_sched_fair_class(void)
>  #endif /* SMP */
>  
>  }
> +
> +#ifdef CONFIG_CFS_IDLE_INJECT

> +static atomic_t idle_inject_active;

You only use atomic_{read,set} on this, therefore atomic_t is pointless.

> +static DEFINE_PER_CPU(struct hrtimer, idle_inject_timer);
> +static DEFINE_PER_CPU(bool, idle_injected);

I tend to prefer to not use bool as a storage class; its ill defined.

> +/* protect injection parameters from runtime changes */
> +static DEFINE_SPINLOCK(idle_inject_lock);

A global lock, yay :-), I think you want this to be a RAW_SPINLOCK
though. As on -RT this would want to actually run from IRQ context too.

> +static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *hrtimer)
> +{
> +	struct hrtimer *hrt = this_cpu_ptr(&idle_inject_timer);
> +	int cpu = smp_processor_id();
> +	ktime_t now, delta, period;
> +	bool status;
> +
> +	now = hrtimer_cb_get_time(hrt);

You're not interested in the current time.

> +
> +	status = raw_cpu_read(idle_injected);
> +	if (status) {
> +		/*
> +		 * We were injecting idle in the last phase, let's forward the
> +		 * timer to the next period
> +		 *
> +		 * status: 1             0                1        0
> +		 * ____          ____________________           _______
> +		 *     |________|                    |_________|
> +		 *
> +		 *     |duration|      interval      |
> +		 *
> +		 *              ^ we are here
> +		 *                  forward to here: ^
> +		 */
> +		delta = ktime_sub(now, inject_start_time);
> +		period = ktime_add(ms_to_ktime(duration),
> +				ms_to_ktime(inject_interval));
> +		delta = ktime_roundup(delta, period);
> +		hrtimer_set_expires(hrt, ktime_add(delta, inject_start_time));

This doesn't make any sense. Who cares what the current time is.

> +	} else {
> +		/*
> +		 * We were not injecting idle in the last phase, let's forward
> +		 * timer after forced idle duration
> +		 * ____          ____________________           _______
> +		 *     |________|                    |_________|
> +		 *
> +		 *     |duration|      interval      |
> +		 *
> +		 *     ^ we are here
> +		 *              ^ forward timer to here
> +		 */
> +		hrtimer_set_expires(hrt, ktime_add(ms_to_ktime(duration), now));

Same here, we don't care about the current time. The timer was at the
previous start of injection, just forward it a whole period to find the
next injection slot.

> +	}

It looks like what you want is:

	hrtimer_forward(hrt, period);

unconditionally.

> +	raw_cpu_write(idle_injected, !status);
> +	trace_sched_cfs_idle_inject("idle sync timer", !status);
> +	if (status)
> +		unthrottle_rq(cpu);
> +	else
> +		throttle_rq(cpu);
> +
> +	return HRTIMER_RESTART;
> +}
> +
> +static void idle_inject_timer_start(void *info)
> +{
> +	int cpu = smp_processor_id();
> +	struct hrtimer *hrt = this_cpu_ptr(&idle_inject_timer);
> +
> +	this_cpu_write(idle_injected, true);
> +	set_bit(cpu, idle_inject_cpumask);
> +	hrtimer_start(hrt, ms_to_ktime(duration), HRTIMER_MODE_ABS_PINNED);
> +	hrtimer_set_expires(hrt, *(ktime_t *)info);

This is broken, _first_ set an expiration time, then start the timer.

Now you insert the timer into the RB tree on a previous expiration time,
then you modify the expiration time under it, effectively wrecking the
RB tree.

> +}

> +static void stop_idle_inject(void)
> +{
> +	int i;
> +	struct hrtimer *hrt;
> +
> +	if (bitmap_weight(idle_inject_cpumask, num_possible_cpus())) {

I don't get the point of this bitmap; with the cpu notifier you
basically ensure this is equal to online_mask.

Also, this weight test is pointless, if the bitmap is empty the
for_each_set_bit() should be of equal cost -- and afaict nothing calling
this is performance critical in the first place.

> +		for_each_set_bit(i, idle_inject_cpumask, num_possible_cpus()) {

> +			hrt = &per_cpu(idle_inject_timer, i);
> +			hrtimer_cancel(hrt);
> +			unthrottle_rq(i);
> +		}
> +	}
> +}
> +
> +static int idle_inject_cpu_callback(struct notifier_block *nfb,
> +				unsigned long action, void *hcpu)
> +{
> +	unsigned long cpu = (unsigned long)hcpu;
> +	struct hrtimer *hrt = &per_cpu(idle_inject_timer, cpu);
> +	ktime_t now, delta, period;
> +
> +	if (!atomic_read(&idle_inject_active))
> +		goto exit_ok;

We should never get here if that weren't set, right? I mean you
register/unregister these callbacks around setting that variable.

> +
> +	switch (action) {
> +	case CPU_STARTING:
> +		raw_cpu_write(idle_injected, true);
> +
> +		hrtimer_init(hrt, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
> +		hrt->function = idle_inject_timer_fn;
> +		set_bit(cpu, idle_inject_cpumask);
> +
> +		now = hrtimer_cb_get_time(hrt);
> +		hrtimer_start(hrt, ms_to_ktime(duration),
> +			HRTIMER_MODE_ABS_PINNED);
> +		/*
> +		 * When a new CPU comes online, we need to make sure it aligns
> +		 * its phase with the rest of the CPUs. So we set the
> +		 * timer to the next period based on the common starting time,
> +		 * then start injecting idle time.
> +		 */
> +		spin_lock_irq(&idle_inject_lock);
> +		delta = ktime_sub(now, inject_start_time);
> +		period = ktime_add(ms_to_ktime(duration),
> +				ms_to_ktime(inject_interval));
> +		delta = ktime_roundup(delta, period);
> +		spin_unlock_irq(&idle_inject_lock);
> +		hrtimer_set_expires(hrt, ktime_add(delta, inject_start_time));

Same broken, you cannot call that on a timer you've already started.

> +		break;
> +	case CPU_DYING:
> +		clear_bit(cpu, idle_inject_cpumask);
> +		hrtimer_cancel(hrt);
> +		raw_cpu_write(idle_injected, false);
> +		unthrottle_rq(cpu);
> +		break;
> +	default:
> +		return NOTIFY_DONE;
> +	}
> +exit_ok:
> +	return NOTIFY_OK;
> +}
> +
> +static int idle_inject_pm_callback(struct notifier_block *self,
> +				unsigned long action, void *hcpu)
> +{
> +	switch (action) {
> +	case PM_HIBERNATION_PREPARE:
> +	case PM_SUSPEND_PREPARE:
> +		if (atomic_read(&idle_inject_active))
> +			stop_idle_inject();

As with the above, if that were false, this whole callback would not be
called, seeing how you unregister before actually clearing that
idle_inject_active thing.

> +		break;
> +	case PM_POST_HIBERNATION:
> +	case PM_POST_SUSPEND:
> +		printk("POST SUSPEND restart idle injection\n");

Seems a tad inconsistent, printing here but not when stopping it.

> +		start_idle_inject();
> +		break;
> +	default:
> +		break;
> +	}
> +	return NOTIFY_OK;
> +}
> +
> +static struct notifier_block idle_inject_pm_notifier = {
> +	.notifier_call = idle_inject_pm_callback,
> +};
> +
> +static struct notifier_block idle_inject_cpu_notifier = {
> +	.notifier_call = idle_inject_cpu_callback,
> +};
> +
> +static void end_idle_inject(void) {
> +	unregister_hotcpu_notifier(&idle_inject_cpu_notifier);
> +	unregister_pm_notifier(&idle_inject_pm_notifier);

As per the above, these callbacks will not happen hereafter, and will
this never see:

> +	atomic_set(&idle_inject_active, 0);
> +	kfree(idle_inject_cpumask);
> +}
> +
> +static int prepare_idle_inject(void)
> +{
> +	int retval = 0;
> +	int bitmap_size;
> +	int cpu;
> +	struct hrtimer *hrt;
> +
> +	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);

This is incorrect, you want nr_cpu_ids. There is no guarantee the CPU
space does not contain holes. But seeing I still don't see the point of
the mask, this might all fix itself by killing it alltogether.

> +	idle_inject_cpumask = kzalloc(bitmap_size, GFP_KERNEL);
> +	if (!idle_inject_cpumask)
> +		return -ENOMEM;
> +
> +	retval = register_pm_notifier(&idle_inject_pm_notifier);
> +	if (retval)
> +		goto exit_free;
> +	retval = register_hotcpu_notifier(&idle_inject_cpu_notifier);
> +	if (retval)
> +		goto exit_unregister_pm;
> +	get_online_cpus();
> +	for_each_online_cpu(cpu) {
> +		hrt = &per_cpu(idle_inject_timer, cpu);
> +		hrtimer_init(hrt, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
> +		hrt->function = idle_inject_timer_fn;
> +	}
> +	put_online_cpus();
> +
> +	if (!duration)
> +		duration = DEFAULT_DURATION_MSECS;
> +
> +	return 0;
> +exit_unregister_pm:
> +	unregister_pm_notifier(&idle_inject_pm_notifier);
> +exit_free:
> +	kfree(idle_inject_cpumask);
> +	return retval;
> +}
> +
> +int proc_sched_cfs_idle_inject_pct_handler(struct ctl_table *table,
> +					int write,
> +					void __user *buffer,
> +					size_t *length,	loff_t *ppos)
> +{
> +	int ret;
> +
> +	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
> +	if (ret)
> +		goto out;
> +
> +	if (idle_pct != sysctl_sched_cfs_idle_inject_pct) {
> +		if (!idle_pct)
> +			start_idle_inject();
> +		else if (!sysctl_sched_cfs_idle_inject_pct) {
> +			stop_idle_inject();
> +			end_idle_inject();
> +		}
> +
> +		/* recompute injection parameters */
> +		spin_lock_irq(&idle_inject_lock);
> +		idle_pct = sysctl_sched_cfs_idle_inject_pct;
> +		/*
> +		 * duration is fixed for each injection period, we adjust
> +		 * non idle interval to satisfy the idle percentage set
> +		 * by the user. e.g. if duration is 10 and we want 33% idle
> +		 * then interval is 20.
> +		 * 33% idle
> +		 * ____          ___________________          _________
> +		 *     |________|                   |________| 33% idle
> +		 * ____          ________          _______
> +		 *     |________|        |________|  50% idle
> +		 *
> +		 *     |duration|interval|
> +		 */
> +		if (idle_pct)
> +			inject_interval = (duration * (100 - idle_pct))
> +				/ idle_pct;

This needs {} (or just exceed the 80 char thing).

> +		spin_unlock_irq(&idle_inject_lock);
> +
> +	}
> +out:
> +	return ret;
> +}
> +
> +int proc_sched_cfs_idle_inject_duration_handler(struct ctl_table *table,
> +						int write,
> +						void __user *buffer,
> +						size_t *length,	loff_t *ppos)
> +{
> +	int ret;
> +
> +	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
> +	if (ret)
> +		goto out;
> +
> +	if (duration == sysctl_sched_cfs_idle_inject_duration)
> +		goto out;
> +	/* recompute injection parameters */
> +	spin_lock_irq(&idle_inject_lock);
> +	duration = jiffies_to_msecs(sysctl_sched_cfs_idle_inject_duration);
> +	if (idle_pct)
> +		inject_interval = (duration * (100 - idle_pct)) / idle_pct;
> +
> +	spin_unlock_irq(&idle_inject_lock);
> +out:
> +	return ret;
> +}

And since you have proc handlers for both these, why not convert to
ktime here and avoid the enless ms_to_ktime() calls ?

Also, maybe precompute the period, since that is what you really need.

next prev parent reply	other threads:[~2015-11-10 13:23 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-11-10  0:21 [RFC PATCH v2 0/3] CFS idle injection Jacob Pan
2015-11-10  0:21 ` [RFC PATCH v2 1/3] ktime: add a roundup function Jacob Pan
2015-11-10  0:21 ` [RFC PATCH v2 2/3] timer: relax tick stop in idle entry Jacob Pan
2015-11-10  0:21 ` [RFC PATCH v2 3/3] sched: introduce synchronized idle injection Jacob Pan
2015-11-10 13:23   ` Peter Zijlstra [this message]
2015-11-10 14:01     ` Jacob Pan
2015-11-10 14:58       ` Peter Zijlstra
2015-11-10 16:28         ` Jacob Pan
2015-11-10 16:36           ` Peter Zijlstra
2015-11-10 16:50             ` Jacob Pan
2015-11-10 17:00               ` Peter Zijlstra
2015-11-10 17:14                 ` Jacob Pan
2015-11-10 18:41     ` Jacob Pan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20151110132324.GC17308@twins.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=andi.kleen@intel.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=edubezval@gmail.com \
    --cc=jacob.jun.pan@linux.intel.com \
    --cc=len.brown@intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=pjt@google.com \
    --cc=punit.agrawal@arm.com \
    --cc=rafael.j.wysocki@intel.com \
    --cc=srinivas.pandruvada@linux.intel.com \
    --cc=tglx@linutronix.de \
    --cc=tim.c.chen@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.