All of lore.kernel.org
 help / color / mirror / Atom feed
From: Waiman Long <waiman.long@hp.com>
To: Jason Low <jason.low2@hp.com>
Cc: Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@kernel.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	linux-kernel@vger.kernel.org,
	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Oleg Nesterov <oleg@redhat.com>,
	Frederic Weisbecker <fweisbec@gmail.com>,
	Mel Gorman <mgorman@suse.de>, Rik van Riel <riel@redhat.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Preeti U Murthy <preeti@linux.vnet.ibm.com>,
	Mike Galbraith <umgwanakikbuti@gmail.com>,
	Davidlohr Bueso <dave@stgolabs.net>,
	Aswin Chandramouleeswaran <aswin@hp.com>,
	Scott J Norton <scott.norton@hp.com>
Subject: Re: [PATCH v2 3/5] sched, timer: Use atomics in thread_group_cputimer to improve scalability
Date: Wed, 29 Apr 2015 14:43:42 -0400	[thread overview]
Message-ID: <5541265E.1020005@hp.com> (raw)
In-Reply-To: <1430251224-5764-4-git-send-email-jason.low2@hp.com>

On 04/28/2015 04:00 PM, Jason Low wrote:
> While running a database workload, we found a scalability issue with itimers.
>
> Much of the problem was caused by the thread_group_cputimer spinlock.
> Each time we account for group system/user time, we need to obtain a
> thread_group_cputimer's spinlock to update the timers. On larger systems
> (such as a 16 socket machine), this caused more than 30% of total time
> spent trying to obtain this kernel lock to update these group timer stats.
>
> This patch converts the timers to 64 bit atomic variables and uses
> atomic add to update them without a lock. With this patch, the percent
> of total time spent updating thread group cputimer timers was reduced
> from 30% down to less than 1%.
>
> Note: On 32 bit systems using the generic 64 bit atomics, this causes
> sample_group_cputimer() to take locks 3 times instead of just 1 time.
> However, we tested this patch on a 32 bit ARM system using the
> generic atomics and did not find the overhead to be much of an issue.
> An explanation for why this isn't an issue is that 32 bit systems usually
> have small numbers of CPUs, and cacheline contention from extra spinlocks
> called periodically is not really apparent on smaller systems.
>
> Signed-off-by: Jason Low<jason.low2@hp.com>
> ---
>   include/linux/init_task.h      |    7 ++--
>   include/linux/sched.h          |   10 ++----
>   kernel/fork.c                  |    3 --
>   kernel/sched/stats.h           |   15 +++-----
>   kernel/time/posix-cpu-timers.c |   79 +++++++++++++++++++++++++---------------
>   5 files changed, 62 insertions(+), 52 deletions(-)
>
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 696d223..7b9d8b5 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
>   	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
>   	.rlim		= INIT_RLIMITS,					\
>   	.cputimer	= { 						\
> -		.cputime = INIT_CPUTIME,				\
> -		.running = 0,						\
> -		.lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
> +		.utime 		  = ATOMIC64_INIT(0),			\
> +		.stime		  = ATOMIC64_INIT(0),			\
> +		.sum_exec_runtime = ATOMIC64_INIT(0),			\
> +		.running 	  = 0					\
>   	},								\
>   	.cred_guard_mutex =						\
>   		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 604eb7c..c736a47 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -601,9 +601,10 @@ struct task_cputime {
>    * used for thread group CPU timer calculations.
>    */
>   struct thread_group_cputimer {
> -	struct task_cputime cputime;
> +	atomic64_t utime;
> +	atomic64_t stime;
> +	atomic64_t sum_exec_runtime;
>   	int running;
> -	raw_spinlock_t lock;
>   };
>
>   #include<linux/rwsem.h>
> @@ -2970,11 +2971,6 @@ static __always_inline bool need_resched(void)
>   void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
>   void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
>
> -static inline void thread_group_cputime_init(struct signal_struct *sig)
> -{
> -	raw_spin_lock_init(&sig->cputimer.lock);
> -}
> -
>   /*
>    * Reevaluate whether the task has signals pending delivery.
>    * Wake the task if so.
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 47c37a4..2e67086 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1091,9 +1091,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
>   {
>   	unsigned long cpu_limit;
>
> -	/* Thread group counters. */
> -	thread_group_cputime_init(sig);
> -
>   	cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
>   	if (cpu_limit != RLIM_INFINITY) {
>   		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
> diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
> index 4ab7043..c6d1c7d 100644
> --- a/kernel/sched/stats.h
> +++ b/kernel/sched/stats.h
> @@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
>   {
>   	struct thread_group_cputimer *cputimer =&tsk->signal->cputimer;
>
> -	if (!cputimer->running)
> +	/* Check if cputimer isn't running. This is accessed without locking. */
> +	if (!READ_ONCE(cputimer->running))
>   		return false;
>
>   	/*
> @@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
>   	if (!cputimer_running(tsk))
>   		return;
>
> -	raw_spin_lock(&cputimer->lock);
> -	cputimer->cputime.utime += cputime;
> -	raw_spin_unlock(&cputimer->lock);
> +	atomic64_add(cputime,&cputimer->utime);
>   }
>
>   /**
> @@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
>   	if (!cputimer_running(tsk))
>   		return;
>
> -	raw_spin_lock(&cputimer->lock);
> -	cputimer->cputime.stime += cputime;
> -	raw_spin_unlock(&cputimer->lock);
> +	atomic64_add(cputime,&cputimer->stime);
>   }
>
>   /**
> @@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
>   	if (!cputimer_running(tsk))
>   		return;
>
> -	raw_spin_lock(&cputimer->lock);
> -	cputimer->cputime.sum_exec_runtime += ns;
> -	raw_spin_unlock(&cputimer->lock);
> +	atomic64_add(ns,&cputimer->sum_exec_runtime);
>   }
> diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
> index e072d98..d857306 100644
> --- a/kernel/time/posix-cpu-timers.c
> +++ b/kernel/time/posix-cpu-timers.c
> @@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
>   	return 0;
>   }
>
> -static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
> +/*
> + * Set cputime to sum_cputime if sum_cputime>  cputime. Use cmpxchg
> + * to avoid race conditions with concurrent updates to cputime.
> + */
> +static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
>   {
> -	if (b->utime>  a->utime)
> -		a->utime = b->utime;
> +	u64 curr_cputime;
> +retry:
> +	curr_cputime = atomic64_read(cputime);
> +	if (sum_cputime>  curr_cputime) {
> +		if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
> +			goto retry;
> +	}
> +}
>
> -	if (b->stime>  a->stime)
> -		a->stime = b->stime;
> +static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
> +{
> +	__update_gt_cputime(&cputimer->utime, sum->utime);
> +	__update_gt_cputime(&cputimer->stime, sum->stime);
> +	__update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
> +}
>
> -	if (b->sum_exec_runtime>  a->sum_exec_runtime)
> -		a->sum_exec_runtime = b->sum_exec_runtime;
> +/* Sample thread_group_cputimer values in "cputimer", store results in "times". */
> +static inline void sample_group_cputimer(struct task_cputime *times,
> +					  struct thread_group_cputimer *cputimer)
> +{
> +	times->utime = atomic64_read(&cputimer->utime);
> +	times->stime = atomic64_read(&cputimer->stime);
> +	times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
>   }
>
>   void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
>   {
>   	struct thread_group_cputimer *cputimer =&tsk->signal->cputimer;
>   	struct task_cputime sum;
> -	unsigned long flags;
>
> -	if (!cputimer->running) {
> +	/* Check if cputimer isn't running. This is accessed without locking. */
> +	if (!READ_ONCE(cputimer->running)) {
>   		/*
>   		 * The POSIX timer interface allows for absolute time expiry
>   		 * values through the TIMER_ABSTIME flag, therefore we have
> -		 * to synchronize the timer to the clock every time we start
> -		 * it.
> +		 * to synchronize the timer to the clock every time we start it.
>   		 */
>   		thread_group_cputime(tsk,&sum);
> -		raw_spin_lock_irqsave(&cputimer->lock, flags);
> -		cputimer->running = 1;
> -		update_gt_cputime(&cputimer->cputime,&sum);
> -	} else
> -		raw_spin_lock_irqsave(&cputimer->lock, flags);
> -	*times = cputimer->cputime;
> -	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
> +		update_gt_cputime(cputimer,&sum);
> +
> +		/*
> +		 * We're setting cputimer->running without a lock. Ensure
> +		 * this only gets written to in one operation. We set
> +		 * running after update_gt_cputime() as a small optimization,
> +		 * but barriers are not required because update_gt_cputime()
> +		 * can handle concurrent updates.
> +		 */
> +		WRITE_ONCE(cputimer->running, 1);
> +	}
> +	sample_group_cputimer(times, cputimer);
>   }

If there is a possibility that more than one thread will be running this
code concurrently, I think it will be safer to use cmpxchg to set the
running flag:

     if (!READ_ONCE(cputimer->running) && !cmpxchg(&cputimer->running, 0, 1)) {
         ...

This will ensure that only one thread will update it.

Cheers,
Longman


  parent reply	other threads:[~2015-04-29 18:43 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-04-28 20:00 [PATCH v2 0/5] sched, timer: Improve scalability of itimers Jason Low
2015-04-28 20:00 ` [PATCH v2 1/5] sched, timer: Remove usages of ACCESS_ONCE in the scheduler Jason Low
2015-04-29 14:34   ` Rik van Riel
2015-04-29 17:05   ` Waiman Long
2015-04-29 17:15     ` Steven Rostedt
2015-04-29 18:25       ` Jason Low
2015-05-08 13:22   ` [tip:sched/core] sched, timer: Convert usages of ACCESS_ONCE() in the scheduler to READ_ONCE()/WRITE_ONCE() tip-bot for Jason Low
2015-04-28 20:00 ` [PATCH v2 2/5] sched, numa: Document usages of mm->numa_scan_seq Jason Low
2015-04-29 14:35   ` Rik van Riel
2015-04-29 18:14   ` Waiman Long
2015-04-29 18:45     ` Jason Low
2015-04-30 18:42       ` Waiman Long
2015-04-30 18:54         ` Davidlohr Bueso
2015-04-30 20:58           ` Waiman Long
2015-04-30 21:26           ` Jason Low
2015-04-30 21:13         ` Jason Low
2015-05-01  0:28           ` [PATCH v3 " Jason Low
2015-05-08 13:22             ` [tip:sched/core] sched/numa: " tip-bot for Jason Low
2015-05-01 15:21           ` [PATCH v2 2/5] sched, numa: " Paul E. McKenney
2015-05-01 17:40             ` Jason Low
2015-04-28 20:00 ` [PATCH v2 3/5] sched, timer: Use atomics in thread_group_cputimer to improve scalability Jason Low
2015-04-29 14:38   ` Rik van Riel
2015-04-29 20:45     ` Jason Low
2015-04-29 18:43   ` Waiman Long [this message]
2015-04-29 20:14     ` Jason Low
2015-05-08 13:22   ` [tip:sched/core] sched, timer: Replace spinlocks with atomics in thread_group_cputimer(), " tip-bot for Jason Low
2015-05-08 21:31     ` [PATCH] sched, timer: Fix documentation for 'struct thread_group_cputimer' Jason Low
2015-05-11  6:41       ` [tip:sched/core] sched, timer: Fix documentation for ' struct thread_group_cputimer' tip-bot for Jason Low
2015-04-28 20:00 ` [PATCH v2 4/5] sched, timer: Provide an atomic task_cputime data structure Jason Low
2015-04-29 14:47   ` Rik van Riel
2015-05-08 13:22   ` [tip:sched/core] sched, timer: Provide an atomic ' struct task_cputime' " tip-bot for Jason Low
2015-04-28 20:00 ` [PATCH v2 5/5] sched, timer: Use the atomic task_cputime in thread_group_cputimer Jason Low
2015-04-29 14:48   ` Rik van Riel
2015-05-08 13:23   ` [tip:sched/core] " tip-bot for Jason Low
     [not found] <016401d08246$0917f130$1b47d390$@alibaba-inc.com>
2015-04-29  6:38 ` [PATCH v2 3/5] sched, timer: Use atomics in thread_group_cputimer to improve scalability Hillf Danton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5541265E.1020005@hp.com \
    --to=waiman.long@hp.com \
    --cc=akpm@linux-foundation.org \
    --cc=aswin@hp.com \
    --cc=dave@stgolabs.net \
    --cc=fweisbec@gmail.com \
    --cc=jason.low2@hp.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@kernel.org \
    --cc=oleg@redhat.com \
    --cc=paulmck@linux.vnet.ibm.com \
    --cc=peterz@infradead.org \
    --cc=preeti@linux.vnet.ibm.com \
    --cc=riel@redhat.com \
    --cc=rostedt@goodmis.org \
    --cc=scott.norton@hp.com \
    --cc=tglx@linutronix.de \
    --cc=umgwanakikbuti@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.