Re: [PATCH RFC 01/16] prcu: Add PRCU implementation

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
To: lianglihao@huawei.com
Cc: guohanjun@huawei.com, heng.z@huawei.com, hb.chen@huawei.com,
	lihao.liang@gmail.com, linux-kernel@vger.kernel.org
Subject: Re: [PATCH RFC 01/16] prcu: Add PRCU implementation
Date: Wed, 24 Jan 2018 22:16:18 -0800	[thread overview]
Message-ID: <20180125061618.GU3741@linux.vnet.ibm.com> (raw)
In-Reply-To: <1516694381-20333-2-git-send-email-lianglihao@huawei.com>

On Tue, Jan 23, 2018 at 03:59:26PM +0800, lianglihao@huawei.com wrote:
> From: Heng Zhang <heng.z@huawei.com>
> 
> This RCU implementation (PRCU) is based on a fast consensus protocol
> published in the following paper:
> 
> Fast Consensus Using Bounded Staleness for Scalable Read-mostly Synchronization.
> Haibo Chen, Heng Zhang, Ran Liu, Binyu Zang, and Haibing Guan.
> IEEE Transactions on Parallel and Distributed Systems (TPDS), 2016.
> https://dl.acm.org/citation.cfm?id=3024114.3024143
> 
> Signed-off-by: Heng Zhang <heng.z@huawei.com>
> Signed-off-by: Lihao Liang <lianglihao@huawei.com>

A few comments and questions interspersed.

							Thanx, Paul

> ---
>  include/linux/prcu.h |  37 +++++++++++++++
>  kernel/rcu/Makefile  |   2 +-
>  kernel/rcu/prcu.c    | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  kernel/sched/core.c  |   2 +
>  4 files changed, 165 insertions(+), 1 deletion(-)
>  create mode 100644 include/linux/prcu.h
>  create mode 100644 kernel/rcu/prcu.c
> 
> diff --git a/include/linux/prcu.h b/include/linux/prcu.h
> new file mode 100644
> index 00000000..653b4633
> --- /dev/null
> +++ b/include/linux/prcu.h
> @@ -0,0 +1,37 @@
> +#ifndef __LINUX_PRCU_H
> +#define __LINUX_PRCU_H
> +
> +#include <linux/atomic.h>
> +#include <linux/mutex.h>
> +#include <linux/wait.h>
> +
> +#define CONFIG_PRCU
> +
> +struct prcu_local_struct {
> +	unsigned int locked;
> +	unsigned int online;
> +	unsigned long long version;
> +};
> +
> +struct prcu_struct {
> +	atomic64_t global_version;
> +	atomic_t active_ctr;
> +	struct mutex mtx;
> +	wait_queue_head_t wait_q;
> +};
> +
> +#ifdef CONFIG_PRCU
> +void prcu_read_lock(void);
> +void prcu_read_unlock(void);
> +void synchronize_prcu(void);
> +void prcu_note_context_switch(void);
> +
> +#else /* #ifdef CONFIG_PRCU */
> +
> +#define prcu_read_lock() do {} while (0)
> +#define prcu_read_unlock() do {} while (0)
> +#define synchronize_prcu() do {} while (0)
> +#define prcu_note_context_switch() do {} while (0)

If CONFIG_PRCU=n and some code is built that uses PRCU, shouldn't you
get a build error rather than an error-free but inoperative PRCU?

Of course, Peter's question about the purpose of the patch set applies
here as well.

> +
> +#endif /* #ifdef CONFIG_PRCU */
> +#endif /* __LINUX_PRCU_H */
> diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
> index 23803c7d..8791419c 100644
> --- a/kernel/rcu/Makefile
> +++ b/kernel/rcu/Makefile
> @@ -2,7 +2,7 @@
>  # and is generally not a function of system call inputs.
>  KCOV_INSTRUMENT := n
> 
> -obj-y += update.o sync.o
> +obj-y += update.o sync.o prcu.o
>  obj-$(CONFIG_CLASSIC_SRCU) += srcu.o
>  obj-$(CONFIG_TREE_SRCU) += srcutree.o
>  obj-$(CONFIG_TINY_SRCU) += srcutiny.o
> diff --git a/kernel/rcu/prcu.c b/kernel/rcu/prcu.c
> new file mode 100644
> index 00000000..a00b9420
> --- /dev/null
> +++ b/kernel/rcu/prcu.c
> @@ -0,0 +1,125 @@
> +#include <linux/smp.h>
> +#include <linux/prcu.h>
> +#include <linux/percpu.h>
> +#include <linux/compiler.h>
> +#include <linux/sched.h>
> +
> +#include <asm/barrier.h>
> +
> +DEFINE_PER_CPU_SHARED_ALIGNED(struct prcu_local_struct, prcu_local);
> +
> +struct prcu_struct global_prcu = {
> +	.global_version = ATOMIC64_INIT(0),
> +	.active_ctr = ATOMIC_INIT(0),
> +	.mtx = __MUTEX_INITIALIZER(global_prcu.mtx),
> +	.wait_q = __WAIT_QUEUE_HEAD_INITIALIZER(global_prcu.wait_q)
> +};
> +struct prcu_struct *prcu = &global_prcu;
> +
> +static inline void prcu_report(struct prcu_local_struct *local)
> +{
> +	unsigned long long global_version;
> +	unsigned long long local_version;
> +
> +	global_version = atomic64_read(&prcu->global_version);
> +	local_version = local->version;
> +	if (global_version > local_version)
> +		cmpxchg(&local->version, local_version, global_version);
> +}
> +
> +void prcu_read_lock(void)
> +{
> +	struct prcu_local_struct *local;
> +
> +	local = get_cpu_ptr(&prcu_local);
> +	if (!local->online) {
> +		WRITE_ONCE(local->online, 1);
> +		smp_mb();
> +	}
> +
> +	local->locked++;
> +	put_cpu_ptr(&prcu_local);
> +}
> +EXPORT_SYMBOL(prcu_read_lock);
> +
> +void prcu_read_unlock(void)
> +{
> +	int locked;
> +	struct prcu_local_struct *local;
> +
> +	barrier();
> +	local = get_cpu_ptr(&prcu_local);
> +	locked = local->locked;
> +	if (locked) {
> +		local->locked--;
> +		if (locked == 1)
> +			prcu_report(local);

Is ordering important here?  It looks to me that the compiler could
rearrange some of the accesses within prcu_report() with the local->locked
decrement.  There appears to be some potential for load and store tearing,
though perhaps you have verified that your compiler avoids this on
the architecture that you are using.

> +		put_cpu_ptr(&prcu_local);
> +	} else {

Hmmm...  We get here if the RCU read-side critical section was preempted.
If none of them are preempted, ->active_ctr remains zero.

> +		put_cpu_ptr(&prcu_local);
> +		if (!atomic_dec_return(&prcu->active_ctr))
> +			wake_up(&prcu->wait_q);
> +	}
> +}
> +EXPORT_SYMBOL(prcu_read_unlock);
> +
> +static void prcu_handler(void *info)
> +{
> +	struct prcu_local_struct *local;
> +
> +	local = this_cpu_ptr(&prcu_local);
> +	if (!local->locked)
> +		WRITE_ONCE(local->version, atomic64_read(&prcu->global_version));
> +}
> +
> +void synchronize_prcu(void)
> +{
> +	int cpu;
> +	cpumask_t cpus;
> +	unsigned long long version;
> +	struct prcu_local_struct *local;
> +
> +	version = atomic64_add_return(1, &prcu->global_version);
> +	mutex_lock(&prcu->mtx);
> +
> +	local = get_cpu_ptr(&prcu_local);
> +	local->version = version;
> +	put_cpu_ptr(&prcu_local);
> +
> +	cpumask_clear(&cpus);
> +	for_each_possible_cpu(cpu) {
> +		local = per_cpu_ptr(&prcu_local, cpu);
> +		if (!READ_ONCE(local->online))
> +			continue;
> +		if (READ_ONCE(local->version) < version) {

On 32-bit systems, given that ->version is long long, you might see
load tearing.  And on some 32-bit systems, the cmpxchg() in prcu_report()
might not build.

Or is the idea that only prcu_handler() updates ->version?  But in that
case, you wouldn't need the READ_ONCE() above.  What am I missing here?

> +			smp_call_function_single(cpu, prcu_handler, NULL, 0);
> +			cpumask_set_cpu(cpu, &cpus);
> +		}
> +	}
> +
> +	for_each_cpu(cpu, &cpus) {
> +		local = per_cpu_ptr(&prcu_local, cpu);
> +		while (READ_ONCE(local->version) < version)

This ->version read can also tear on some 32-bit systems, and this
one most definitely can race with the prcu_handler() above.  Does the
algorithm operate correctly in that case?  (It doesn't look that way
to me, but I might be missing something.) Or are 32-bit systems excluded?

> +			cpu_relax();
> +	}

I might be missing something, but I believe we need a memory barrier
here on non-TSO systems.  Without that, couldn't we miss a preemption?

> +
> +	if (atomic_read(&prcu->active_ctr))
> +		wait_event(prcu->wait_q, !atomic_read(&prcu->active_ctr));
> +
> +	mutex_unlock(&prcu->mtx);
> +}
> +EXPORT_SYMBOL(synchronize_prcu);
> +
> +void prcu_note_context_switch(void)
> +{
> +	struct prcu_local_struct *local;
> +
> +	local = get_cpu_ptr(&prcu_local);
> +	if (local->locked) {
> +		atomic_add(local->locked, &prcu->active_ctr);
> +		local->locked = 0;
> +	}
> +	local->online = 0;
> +	prcu_report(local);
> +	put_cpu_ptr(&prcu_local);
> +}
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 326d4f88..a308581b 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -15,6 +15,7 @@
>  #include <linux/init_task.h>
>  #include <linux/context_tracking.h>
>  #include <linux/rcupdate_wait.h>
> +#include <linux/prcu.h>
> 
>  #include <linux/blkdev.h>
>  #include <linux/kprobes.h>
> @@ -3383,6 +3384,7 @@ static void __sched notrace __schedule(bool preempt)
> 
>  	local_irq_disable();
>  	rcu_note_context_switch(preempt);
> +	prcu_note_context_switch();
> 
>  	/*
>  	 * Make sure that signal_pending_state()->signal_pending() below
> -- 
> 2.14.1.729.g59c0ea183
>

next prev parent reply	other threads:[~2018-01-25  6:30 UTC|newest]

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-01-23  7:59 [PATCH RFC 00/16] A new RCU implementation based on a fast consensus protocol lianglihao
2018-01-23  7:59 ` [PATCH RFC 01/16] prcu: Add PRCU implementation lianglihao
2018-01-24 11:26   ` Peter Zijlstra
2018-01-24 17:15     ` Lihao Liang
2018-01-24 20:19       ` Peter Zijlstra
2018-01-25  6:16   ` Paul E. McKenney [this message]
2018-01-25  7:30     ` Boqun Feng
2018-01-30  5:34       ` zhangheng (AC)
2018-01-30  6:40         ` Boqun Feng
2018-01-30 10:42           ` zhangheng (AC)
2018-01-27  7:35     ` Lihao Liang
2018-01-30  3:58     ` zhangheng (AC)
2018-01-29  9:10   ` Lai Jiangshan
2018-01-30  6:21     ` zhangheng (AC)
2018-01-23  7:59 ` [PATCH RFC 02/16] rcutorture: Add PRCU rcu_torture_ops lianglihao
2018-01-23  7:59 ` [PATCH RFC 03/16] rcutorture: Add PRCU test config files lianglihao
2018-01-25  6:27   ` Paul E. McKenney
2018-01-23  7:59 ` [PATCH RFC 04/16] rcuperf: Add PRCU rcu_perf_ops lianglihao
2018-01-23  7:59 ` [PATCH RFC 05/16] rcuperf: Add PRCU test config files lianglihao
2018-01-23  7:59 ` [PATCH RFC 06/16] rcuperf: Set gp_exp to true for tests to run lianglihao
2018-01-25  6:18   ` Paul E. McKenney
2018-01-26  8:33     ` Lihao Liang
2018-01-23  7:59 ` [PATCH RFC 07/16] prcu: Implement call_prcu() API lianglihao
2018-01-25  6:20   ` Paul E. McKenney
2018-01-26  8:44     ` Lihao Liang
2018-01-26 22:22       ` Paul E. McKenney
2018-01-23  7:59 ` [PATCH RFC 08/16] prcu: Implement PRCU callback processing lianglihao
2018-01-23  7:59 ` [PATCH RFC 09/16] prcu: Implement prcu_barrier() API lianglihao
2018-01-25  6:24   ` Paul E. McKenney
2018-01-23  7:59 ` [PATCH RFC 10/16] rcutorture: Test call_prcu() and prcu_barrier() lianglihao
2018-01-23  7:59 ` [PATCH RFC 11/16] rcutorture: Add basic ARM64 support to run scripts lianglihao
2018-01-23  7:59 ` [PATCH RFC 12/16] prcu: Add PRCU Kconfig parameter lianglihao
2018-01-23  7:59 ` [PATCH RFC 13/16] prcu: Comment source code lianglihao
2018-01-23  7:59 ` [PATCH RFC 14/16] rcuperf: Add config files with various CONFIG_NR_CPUS lianglihao
2018-01-23  7:59 ` [PATCH RFC 15/16] rcutorture: Add scripts to run experiments lianglihao
2018-01-25  6:28   ` Paul E. McKenney
2018-01-23  7:59 ` [PATCH RFC 16/16] Add GPLv2 license lianglihao
2018-01-25  5:53 ` [PATCH RFC 00/16] A new RCU implementation based on a fast consensus protocol Paul E. McKenney
2018-01-27  7:22   ` Lihao Liang
2018-01-27  7:57     ` Paul E. McKenney
2018-01-27  9:57       ` Lihao Liang
2018-01-27 23:46         ` Paul E. McKenney
2018-01-27 23:41       ` Paul E. McKenney

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180125061618.GU3741@linux.vnet.ibm.com \
    --to=paulmck@linux.vnet.ibm.com \
    --cc=guohanjun@huawei.com \
    --cc=hb.chen@huawei.com \
    --cc=heng.z@huawei.com \
    --cc=lianglihao@huawei.com \
    --cc=lihao.liang@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox