Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability

All of lore.kernel.org
 help / color / mirror / Atom feed

From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
To: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jason Low <jason.low2@hp.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@kernel.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Oleg Nesterov <oleg@redhat.com>,
	Mike Galbraith <umgwanakikbuti@gmail.com>,
	Rik van Riel <riel@redhat.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Scott Norton <scott.norton@hp.com>,
	Aswin Chandramouleeswaran <aswin@hp.com>,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
Date: Thu, 5 Mar 2015 07:56:59 -0800	[thread overview]
Message-ID: <20150305155659.GD5773@linux.vnet.ibm.com> (raw)
In-Reply-To: <20150305153506.GD5074@lerouge>

On Thu, Mar 05, 2015 at 04:35:09PM +0100, Frederic Weisbecker wrote:
> On Mon, Mar 02, 2015 at 10:42:11AM -0800, Jason Low wrote:
> > v1->v2:
> > - Peter suggested that cputimer->running does not need to be atomic,
> >   so we can leave it as an integer.
> > - Address a race condition that could occur in update_gt_cputime().
> > - Add helper functions to avoid repeating code.
> > 
> > While running a database workload, we found a scalability issue
> > with itimers.
> > 
> > Much of the problem was caused by the thread_group_cputimer spinlock.
> > Each time we account for group system/user time, we need to obtain a
> > thread_group_cputimer's spinlock to update the timers. On larger
> > systems (such as a 16-socket machine), this caused more than 30% of
> > total time to be spent trying to obtain this kernel lock to update these
> > group timer stats.
> > 
> > This patch converts the timers to 64-bit atomic variables and uses
> > atomic add to update them without a lock. With this patch, the percent
> > of total time spent updating thread group cputimer timers was reduced
> > from 30% down to less than 1%.
> > 
> > Signed-off-by: Jason Low <jason.low2@hp.com>
> > ---
> >  include/linux/init_task.h      |    7 +++--
> >  include/linux/sched.h          |   10 ++-----
> >  kernel/fork.c                  |    3 --
> >  kernel/sched/stats.h           |   12 ++------
> >  kernel/time/posix-cpu-timers.c |   55 +++++++++++++++++++++++----------------
> >  5 files changed, 42 insertions(+), 45 deletions(-)
> > 
> > diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> > index 3037fc0..c4cdec7 100644
> > --- a/include/linux/init_task.h
> > +++ b/include/linux/init_task.h
> > @@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
> >  	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
> >  	.rlim		= INIT_RLIMITS,					\
> >  	.cputimer	= { 						\
> > -		.cputime = INIT_CPUTIME,				\
> > -		.running = 0,						\
> > -		.lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
> > +		.utime = ATOMIC64_INIT(0),                              \
> > +		.stime = ATOMIC64_INIT(0),                              \
> > +		.sum_exec_runtime = ATOMIC64_INIT(0),                   \
> > +		.running = 0						\
> >  	},								\
> >  	.cred_guard_mutex =						\
> >  		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 8db31ef..d6b0f76 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -588,9 +588,10 @@ struct task_cputime {
> >   * used for thread group CPU timer calculations.
> >   */
> >  struct thread_group_cputimer {
> > -	struct task_cputime cputime;
> > +	atomic64_t utime;
> > +	atomic64_t stime;
> > +	atomic64_t sum_exec_runtime;
> >  	int running;
> > -	raw_spinlock_t lock;
> >  };
> >  
> >  #include <linux/rwsem.h>
> > @@ -2942,11 +2943,6 @@ static __always_inline bool need_resched(void)
> >  void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
> >  void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
> >  
> > -static inline void thread_group_cputime_init(struct signal_struct *sig)
> > -{
> > -	raw_spin_lock_init(&sig->cputimer.lock);
> > -}
> > -
> >  /*
> >   * Reevaluate whether the task has signals pending delivery.
> >   * Wake the task if so.
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 4dc2dda..df9dfe9 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -1037,9 +1037,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
> >  {
> >  	unsigned long cpu_limit;
> >  
> > -	/* Thread group counters. */
> > -	thread_group_cputime_init(sig);
> > -
> >  	cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
> >  	if (cpu_limit != RLIM_INFINITY) {
> >  		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
> > diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
> > index 4ab7043..adda94e 100644
> > --- a/kernel/sched/stats.h
> > +++ b/kernel/sched/stats.h
> > @@ -215,9 +215,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
> >  	if (!cputimer_running(tsk))
> >  		return;
> >  
> > -	raw_spin_lock(&cputimer->lock);
> > -	cputimer->cputime.utime += cputime;
> > -	raw_spin_unlock(&cputimer->lock);
> > +	atomic64_add(cputime, &cputimer->utime);
> >  }
> >  
> >  /**
> > @@ -238,9 +236,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
> >  	if (!cputimer_running(tsk))
> >  		return;
> >  
> > -	raw_spin_lock(&cputimer->lock);
> > -	cputimer->cputime.stime += cputime;
> > -	raw_spin_unlock(&cputimer->lock);
> > +	atomic64_add(cputime, &cputimer->stime);
> >  }
> >  
> >  /**
> > @@ -261,7 +257,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
> >  	if (!cputimer_running(tsk))
> >  		return;
> >  
> > -	raw_spin_lock(&cputimer->lock);
> > -	cputimer->cputime.sum_exec_runtime += ns;
> > -	raw_spin_unlock(&cputimer->lock);
> > +	atomic64_add(ns, &cputimer->sum_exec_runtime);
> >  }
> > diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
> > index a16b678..ba93c23 100644
> > --- a/kernel/time/posix-cpu-timers.c
> > +++ b/kernel/time/posix-cpu-timers.c
> > @@ -173,6 +173,14 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
> >  	return error;
> >  }
> >  
> > +/* Sample thread_group_cputimer values in "cputimer", copy results to "times" */
> > +static inline void sample_group_cputimer(struct task_cputime *times,
> > +			                 struct thread_group_cputimer *cputimer)
> > +{
> > +        times->utime = atomic64_read(&cputimer->utime);
> > +        times->stime = atomic64_read(&cputimer->stime);
> > +        times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
> 
> So, in the case we are calling that right after setting cputimer->running, I guess we are fine
> because we just updated cputimer with the freshest values.
> 
> But if we are reading this a while later, say several ticks afterwards, there is a chance that
> we read stale values since we don't lock anymore.
> 
> I don't know if it matters or not, I guess it depends how stale it can be and how much precision
> we expect from posix cpu timers. It probably doesn't matter.
> 
> But just in case, atomic64_add_return(0, &cputimer->utime) would make sure we get the freshest
> value because it performs a full barrier, at the cost of more overhead of course.

Well, if we are running within a guest OS, we might be delayed at any point
for quite some time.  Even with interrupts disabled.

							Thanx, Paul

next prev parent reply	other threads:[~2015-03-05 15:57 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-03-02 18:42 [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability Jason Low
2015-03-02 19:03 ` Linus Torvalds
2015-03-02 21:49   ` Jason Low
2015-03-19 17:21     ` Jason Low
2015-03-19 17:59       ` Linus Torvalds
2015-03-19 20:14         ` Jason Low
2015-03-02 19:40 ` Oleg Nesterov
2015-03-02 19:43   ` Oleg Nesterov
2015-03-02 21:16     ` Jason Low
2015-03-02 21:44       ` Linus Torvalds
2015-03-02 22:43         ` Jason Low
2015-03-05 15:20         ` Frederic Weisbecker
2015-03-05 20:02           ` Jason Low
2015-03-02 21:19   ` Jason Low
2015-03-05 15:35 ` Frederic Weisbecker
2015-03-05 15:56   ` Paul E. McKenney [this message]
2015-03-05 16:00     ` Frederic Weisbecker
2015-03-05 16:16       ` Paul E. McKenney
2015-03-06  0:06   ` Jason Low

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150305155659.GD5773@linux.vnet.ibm.com \
    --to=paulmck@linux.vnet.ibm.com \
    --cc=akpm@linux-foundation.org \
    --cc=aswin@hp.com \
    --cc=fweisbec@gmail.com \
    --cc=jason.low2@hp.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=oleg@redhat.com \
    --cc=peterz@infradead.org \
    --cc=riel@redhat.com \
    --cc=rostedt@goodmis.org \
    --cc=scott.norton@hp.com \
    --cc=torvalds@linux-foundation.org \
    --cc=umgwanakikbuti@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.