Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

From: Frederic Weisbecker <fweisbec@gmail.com>
To: Jason Low <jason.low2@hp.com>
Cc: Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@kernel.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Oleg Nesterov <oleg@redhat.com>,
	Mike Galbraith <umgwanakikbuti@gmail.com>,
	Rik van Riel <riel@redhat.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Scott Norton <scott.norton@hp.com>,
	Aswin Chandramouleeswaran <aswin@hp.com>,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
Date: Thu, 5 Mar 2015 16:35:09 +0100	[thread overview]
Message-ID: <20150305153506.GD5074@lerouge> (raw)
In-Reply-To: <1425321731.5304.14.camel@j-VirtualBox>

On Mon, Mar 02, 2015 at 10:42:11AM -0800, Jason Low wrote:
> v1->v2:
> - Peter suggested that cputimer->running does not need to be atomic,
>   so we can leave it as an integer.
> - Address a race condition that could occur in update_gt_cputime().
> - Add helper functions to avoid repeating code.
> 
> While running a database workload, we found a scalability issue
> with itimers.
> 
> Much of the problem was caused by the thread_group_cputimer spinlock.
> Each time we account for group system/user time, we need to obtain a
> thread_group_cputimer's spinlock to update the timers. On larger
> systems (such as a 16 socket machine), this caused more than 30% of
> total time to be spent trying to obtain this kernel lock to update
> these group timer stats.
> 
> This patch converts the timers to 64-bit atomic variables and uses
> atomic add to update them without a lock. With this patch, the percent
> of total time spent updating thread group cputimer timers was reduced
> from 30% down to less than 1%.
> 
> Signed-off-by: Jason Low <jason.low2@hp.com>
> ---
>  include/linux/init_task.h      |    7 +++--
>  include/linux/sched.h          |   10 ++-----
>  kernel/fork.c                  |    3 --
>  kernel/sched/stats.h           |   12 ++------
>  kernel/time/posix-cpu-timers.c |   55 +++++++++++++++++++++++----------------
>  5 files changed, 42 insertions(+), 45 deletions(-)
> 
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 3037fc0..c4cdec7 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
>  	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
>  	.rlim		= INIT_RLIMITS,					\
>  	.cputimer	= { 						\
> -		.cputime = INIT_CPUTIME,				\
> -		.running = 0,						\
> -		.lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
> +		.utime = ATOMIC64_INIT(0),                              \
> +		.stime = ATOMIC64_INIT(0),                              \
> +		.sum_exec_runtime = ATOMIC64_INIT(0),                   \
> +		.running = 0						\
>  	},								\
>  	.cred_guard_mutex =						\
>  		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 8db31ef..d6b0f76 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -588,9 +588,10 @@ struct task_cputime {
>   * used for thread group CPU timer calculations.
>   */
>  struct thread_group_cputimer {
> -	struct task_cputime cputime;
> +	atomic64_t utime;
> +	atomic64_t stime;
> +	atomic64_t sum_exec_runtime;
>  	int running;
> -	raw_spinlock_t lock;
>  };
>  
>  #include <linux/rwsem.h>
> @@ -2942,11 +2943,6 @@ static __always_inline bool need_resched(void)
>  void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
>  void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
>  
> -static inline void thread_group_cputime_init(struct signal_struct *sig)
> -{
> -	raw_spin_lock_init(&sig->cputimer.lock);
> -}
> -
>  /*
>   * Reevaluate whether the task has signals pending delivery.
>   * Wake the task if so.
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 4dc2dda..df9dfe9 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1037,9 +1037,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
>  {
>  	unsigned long cpu_limit;
>  
> -	/* Thread group counters. */
> -	thread_group_cputime_init(sig);
> -
>  	cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
>  	if (cpu_limit != RLIM_INFINITY) {
>  		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
> diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
> index 4ab7043..adda94e 100644
> --- a/kernel/sched/stats.h
> +++ b/kernel/sched/stats.h
> @@ -215,9 +215,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
>  	if (!cputimer_running(tsk))
>  		return;
>  
> -	raw_spin_lock(&cputimer->lock);
> -	cputimer->cputime.utime += cputime;
> -	raw_spin_unlock(&cputimer->lock);
> +	atomic64_add(cputime, &cputimer->utime);
>  }
>  
>  /**
> @@ -238,9 +236,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
>  	if (!cputimer_running(tsk))
>  		return;
>  
> -	raw_spin_lock(&cputimer->lock);
> -	cputimer->cputime.stime += cputime;
> -	raw_spin_unlock(&cputimer->lock);
> +	atomic64_add(cputime, &cputimer->stime);
>  }
>  
>  /**
> @@ -261,7 +257,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
>  	if (!cputimer_running(tsk))
>  		return;
>  
> -	raw_spin_lock(&cputimer->lock);
> -	cputimer->cputime.sum_exec_runtime += ns;
> -	raw_spin_unlock(&cputimer->lock);
> +	atomic64_add(ns, &cputimer->sum_exec_runtime);
>  }
> diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
> index a16b678..ba93c23 100644
> --- a/kernel/time/posix-cpu-timers.c
> +++ b/kernel/time/posix-cpu-timers.c
> @@ -173,6 +173,14 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
>  	return error;
>  }
>  
> +/* Sample thread_group_cputimer values in "cputimer", copy results to "times" */
> +static inline void sample_group_cputimer(struct task_cputime *times,
> +			                 struct thread_group_cputimer *cputimer)
> +{
> +        times->utime = atomic64_read(&cputimer->utime);
> +        times->stime = atomic64_read(&cputimer->stime);
> +        times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);

So, in the case where we call this right after setting cputimer->running, I guess we are fine
because we have just updated cputimer with the freshest values.

But if we read this a while later, say several ticks later, there is a chance that
we read stale values since we no longer take the lock.

I don't know whether it matters or not; I guess it depends on how stale the values can be and how
much precision we expect from posix cpu timers. It probably doesn't matter.

But just in case, atomic64_add_return(0, &cputimer->utime) would make sure we get the freshest
value because it performs a full barrier, at the cost of more overhead of course.

next prev parent reply	other threads:[~2015-03-05 15:35 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-03-02 18:42 [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability Jason Low
2015-03-02 19:03 ` Linus Torvalds
2015-03-02 21:49   ` Jason Low
2015-03-19 17:21     ` Jason Low
2015-03-19 17:59       ` Linus Torvalds
2015-03-19 20:14         ` Jason Low
2015-03-02 19:40 ` Oleg Nesterov
2015-03-02 19:43   ` Oleg Nesterov
2015-03-02 21:16     ` Jason Low
2015-03-02 21:44       ` Linus Torvalds
2015-03-02 22:43         ` Jason Low
2015-03-05 15:20         ` Frederic Weisbecker
2015-03-05 20:02           ` Jason Low
2015-03-02 21:19   ` Jason Low
2015-03-05 15:35 ` Frederic Weisbecker [this message]
2015-03-05 15:56   ` Paul E. McKenney
2015-03-05 16:00     ` Frederic Weisbecker
2015-03-05 16:16       ` Paul E. McKenney
2015-03-06  0:06   ` Jason Low

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150305153506.GD5074@lerouge \
    --to=fweisbec@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=aswin@hp.com \
    --cc=jason.low2@hp.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=oleg@redhat.com \
    --cc=paulmck@linux.vnet.ibm.com \
    --cc=peterz@infradead.org \
    --cc=riel@redhat.com \
    --cc=rostedt@goodmis.org \
    --cc=scott.norton@hp.com \
    --cc=torvalds@linux-foundation.org \
    --cc=umgwanakikbuti@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox