* [PATCH 2/2 v2] rcu,cleanup: simplify the code when cpu is dying
@ 2010-10-20 6:13 Lai Jiangshan
2010-10-20 19:25 ` Paul E. McKenney
0 siblings, 1 reply; 4+ messages in thread
From: Lai Jiangshan @ 2010-10-20 6:13 UTC (permalink / raw)
To: Paul E. McKenney, Ingo Molnar, LKML
When we handle cpu notify DYING, the whole system is stopped except
current CPU, so we can touch any data, and we remove the orphan_cbs_tail
and send the callbacks to the dest CPU directly.
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
rcutree.c | 81 ++++++++++++++-----------------------------------------
rcutree.h | 16 ++--------
rcutree_plugin.h | 8 ++---
rcutree_trace.c | 4 +-
4 files changed, 31 insertions(+), 78 deletions(-)
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ccdc04c..669d7fe 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
.gpnum = -300, \
.completed = -300, \
.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
- .orphan_cbs_list = NULL, \
- .orphan_cbs_tail = &structname.orphan_cbs_list, \
- .orphan_qlen = 0, \
.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
.n_force_qs = 0, \
.n_force_qs_ngp = 0, \
@@ -984,53 +981,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
- * specified flavor of RCU. The callbacks will be adopted by the next
- * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
- * comes first. Because this is invoked from the CPU_DYING notifier,
- * irqs are already disabled.
+ * Move a dying CPU's RCU callbacks to online CPU's callback list.
+ * Synchronization is not required because this function executes
+ * in stop_machine() context.
*/
-static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
+static void rcu_send_cbs_to_online(struct rcu_state *rsp)
{
int i;
+ /* current DYING CPU is cleared in the cpu_online_mask */
+ int receive_cpu = cpumask_any(cpu_online_mask);
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+ struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
if (rdp->nxtlist == NULL)
return; /* irqs disabled, so comparison is stable. */
- raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
- *rsp->orphan_cbs_tail = rdp->nxtlist;
- rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
+
+ *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+ receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+ receive_rdp->qlen += rdp->qlen;
+ receive_rdp->n_cbs_adopted += rdp->qlen;
+ rdp->n_cbs_orphaned += rdp->qlen;
+
rdp->nxtlist = NULL;
for (i = 0; i < RCU_NEXT_SIZE; i++)
rdp->nxttail[i] = &rdp->nxtlist;
- rsp->orphan_qlen += rdp->qlen;
- rdp->n_cbs_orphaned += rdp->qlen;
rdp->qlen = 0;
- raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
-}
-
-/*
- * Adopt previously orphaned RCU callbacks.
- */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
-{
- unsigned long flags;
- struct rcu_data *rdp;
-
- raw_spin_lock_irqsave(&rsp->onofflock, flags);
- rdp = this_cpu_ptr(rsp->rda);
- if (rsp->orphan_cbs_list == NULL) {
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
- return;
- }
- *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
- rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
- rdp->qlen += rsp->orphan_qlen;
- rdp->n_cbs_adopted += rsp->orphan_qlen;
- rsp->orphan_cbs_list = NULL;
- rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
- rsp->orphan_qlen = 0;
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
}
/*
@@ -1081,8 +1056,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (need_report & RCU_OFL_TASKS_EXP_GP)
rcu_report_exp_rnp(rsp, rnp);
-
- rcu_adopt_orphan_cbs(rsp);
}
/*
@@ -1100,11 +1073,7 @@ static void rcu_offline_cpu(int cpu)
#else /* #ifdef CONFIG_HOTPLUG_CPU */
-static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
-{
-}
-
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+static void rcu_send_cbs_to_online(struct rcu_state *rsp)
{
}
@@ -1702,10 +1671,7 @@ static void _rcu_barrier(struct rcu_state *rsp,
* early.
*/
atomic_set(&rcu_barrier_cpu_count, 1);
- preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
- rcu_adopt_orphan_cbs(rsp);
on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
- preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
complete(&rcu_barrier_completion);
wait_for_completion(&rcu_barrier_completion);
@@ -1831,18 +1797,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
case CPU_DYING:
case CPU_DYING_FROZEN:
/*
- * preempt_disable() in _rcu_barrier() prevents stop_machine(),
- * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
- * returns, all online cpus have queued rcu_barrier_func().
- * The dying CPU clears its cpu_online_mask bit and
- * moves all of its RCU callbacks to ->orphan_cbs_list
- * in the context of stop_machine(), so subsequent calls
- * to _rcu_barrier() will adopt these callbacks and only
- * then queue rcu_barrier_func() on all remaining CPUs.
+ * The whole machine is "stopped" except this cpu, so we can
+ * touch any data without introducing corruption. And we send
+ * the callbacks to an attribute chosen online cpu.
*/
- rcu_send_cbs_to_orphanage(&rcu_bh_state);
- rcu_send_cbs_to_orphanage(&rcu_sched_state);
- rcu_preempt_send_cbs_to_orphanage();
+ rcu_send_cbs_to_online(&rcu_bh_state);
+ rcu_send_cbs_to_online(&rcu_sched_state);
+ rcu_preempt_send_cbs_to_online();
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 91d4170..1a54be2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -203,8 +203,8 @@ struct rcu_data {
long qlen_last_fqs_check;
/* qlen at last check for QS forcing */
unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
- unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */
- unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */
+ unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
+ unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
@@ -309,15 +309,7 @@ struct rcu_state {
/* End of fields guarded by root rcu_node's lock. */
raw_spinlock_t onofflock; /* exclude on/offline and */
- /* starting new GP. Also */
- /* protects the following */
- /* orphan_cbs fields. */
- struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
- /* orphaned by all CPUs in */
- /* a given leaf rcu_node */
- /* going offline. */
- struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
- long orphan_qlen; /* Number of orphaned cbs. */
+ /* starting new GP. */
raw_spinlock_t fqslock; /* Only one task forcing */
/* quiescent states. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -390,7 +382,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
static int rcu_preempt_pending(int cpu);
static int rcu_preempt_needs_cpu(int cpu);
static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
-static void rcu_preempt_send_cbs_to_orphanage(void);
+static void rcu_preempt_send_cbs_to_online(void);
static void __init __rcu_init_preempt(void);
static void rcu_needs_cpu_flush(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 71a4147..0e75d60 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -773,11 +773,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
}
/*
- * Move preemptable RCU's callbacks to ->orphan_cbs_list.
+ * Move preemptable DYING RCU's callbacks to other online CPU.
*/
-static void rcu_preempt_send_cbs_to_orphanage(void)
+static void rcu_preempt_send_cbs_to_online(void)
{
- rcu_send_cbs_to_orphanage(&rcu_preempt_state);
+ rcu_send_cbs_to_online(&rcu_preempt_state);
}
/*
@@ -1001,7 +1001,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
/*
* Because there is no preemptable RCU, there are no callbacks to move.
*/
-static void rcu_preempt_send_cbs_to_orphanage(void)
+static void rcu_preempt_send_cbs_to_online(void)
{
}
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d15430b..cbead39 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
gpnum = rsp->gpnum;
seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
- "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
+ "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
rsp->completed, gpnum, rsp->signaled,
(long)(rsp->jiffies_force_qs - jiffies),
(int)(jiffies & 0xffff),
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
- rsp->n_force_qs_lh, rsp->orphan_qlen);
+ rsp->n_force_qs_lh);
for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH 2/2 v2] rcu,cleanup: simplify the code when cpu is dying
2010-10-20 6:13 [PATCH 2/2 v2] rcu,cleanup: simplify the code when cpu is dying Lai Jiangshan
@ 2010-10-20 19:25 ` Paul E. McKenney
2010-10-22 7:35 ` Lai Jiangshan
0 siblings, 1 reply; 4+ messages in thread
From: Paul E. McKenney @ 2010-10-20 19:25 UTC (permalink / raw)
To: Lai Jiangshan; +Cc: Ingo Molnar, LKML
On Wed, Oct 20, 2010 at 02:13:06PM +0800, Lai Jiangshan wrote:
> When we handle cpu notify DYING, the whole system is stopped except
> current CPU, so we can touch any data, and we remove the orphan_cbs_tail
> and send the callbacks to the dest CPU directly.
Queued along with the documentation/comment patch below, thank you!!!
(Of course, please let me know if you see problems with my patch.)
One remaining question... You use cpumask_any() to select the destination
CPU, which sounds good until you look at its definition. The problem
is that cpumask_any() always chooses the lowest-numbered online CPU.
So imagine a (say) 64-CPU system and suppose that CPU 0 remains online.
Suppose further that the other 63 CPUs each execute some workload that
generates lots of RCU callbacks (perhaps creating then removing a large
source tree), and periodically go offline and come back online.
All of the RCU callbacks from CPUs 1-63 could easily end up getting
dumped onto CPU 0's callback lists. It is easy to imagine that CPU 0
might not be able to invoke these callbacks as fast as the other CPUs
could generate them.
Or am I missing something?
If I am not missing something, could you please adjust so that the
adopting CPU varies, perhaps by scanning across the CPUs? Feel free
to update this patch or to send a separate patch, whichever is easiest
for you.
Thanx, Paul
> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
> ---
> rcutree.c | 81 ++++++++++++++-----------------------------------------
> rcutree.h | 16 ++--------
> rcutree_plugin.h | 8 ++---
> rcutree_trace.c | 4 +-
> 4 files changed, 31 insertions(+), 78 deletions(-)
> diff --git a/kernel/rcutree.c b/kernel/rcutree.c
> index ccdc04c..669d7fe 100644
> --- a/kernel/rcutree.c
> +++ b/kernel/rcutree.c
> @@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
> .gpnum = -300, \
> .completed = -300, \
> .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
> - .orphan_cbs_list = NULL, \
> - .orphan_cbs_tail = &structname.orphan_cbs_list, \
> - .orphan_qlen = 0, \
> .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
> .n_force_qs = 0, \
> .n_force_qs_ngp = 0, \
> @@ -984,53 +981,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
> #ifdef CONFIG_HOTPLUG_CPU
>
> /*
> - * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
> - * specified flavor of RCU. The callbacks will be adopted by the next
> - * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
> - * comes first. Because this is invoked from the CPU_DYING notifier,
> - * irqs are already disabled.
> + * Move a dying CPU's RCU callbacks to online CPU's callback list.
> + * Synchronization is not required because this function executes
> + * in stop_machine() context.
> */
> -static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
> +static void rcu_send_cbs_to_online(struct rcu_state *rsp)
> {
> int i;
> + /* current DYING CPU is cleared in the cpu_online_mask */
> + int receive_cpu = cpumask_any(cpu_online_mask);
> struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
> + struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
------------------------------------------------------------------------
rcu: update documentation/comments for Lai's adoption patch
Lai's RCU-callback immediate-adoption patch changes the RCU tracing
output, so update tracing.txt. Also update a few comments to clarify
the synchronization design.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 7f852db..3bb386d 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -122,7 +122,8 @@ o "ci" is the number of RCU callbacks that have been invoked for
been registered in absence of CPU-hotplug activity.
o "co" is the number of RCU callbacks that have been orphaned due to
- this CPU going offline.
+ this CPU going offline. These orphaned callbacks have been moved
+ to an arbitrarily chosen online CPU.
o "ca" is the number of RCU callbacks that have been adopted due to
other CPUs going offline. Note that ci+co-ca+ql is the number of
@@ -160,12 +161,12 @@ o "gpnum" is the number of grace periods that have started. It is
The output of "cat rcu/rcuhier" looks as follows, with very long lines:
-c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0
+c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
1/1 .>. 0:127 ^0
3/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
3/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
rcu_bh:
-c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0
+c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
0/1 .>. 0:127 ^0
0/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
0/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
@@ -204,11 +205,6 @@ o "fqlh" is the number of calls to force_quiescent_state() that
exited immediately (without even being counted in nfqs above)
due to contention on ->fqslock.
-o "oqlen" is the number of callbacks on the "orphan" callback
- list. RCU callbacks are placed on this list by CPUs going
- offline, and are "adopted" either by the CPU helping the outgoing
- CPU or by the next rcu_barrier*() call, whichever comes first.
-
o Each element of the form "1/1 0:127 ^0" represents one struct
rcu_node. Each line represents one level of the hierarchy, from
root to leaves. It is best to think of the rcu_data structures
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index a97a061..fa254c5 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1657,7 +1657,9 @@ static void _rcu_barrier(struct rcu_state *rsp,
* decrement rcu_barrier_cpu_count -- otherwise the first CPU
* might complete its grace period before all of the other CPUs
* did their increment, causing this function to return too
- * early.
+ * early. Note that on_each_cpu() disables irqs, which prevents
+ * any CPUs from coming online or going offline until each online
+ * CPU has queued its RCU-barrier callback.
*/
atomic_set(&rcu_barrier_cpu_count, 1);
on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
@@ -1786,9 +1788,9 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
case CPU_DYING:
case CPU_DYING_FROZEN:
/*
- * The whole machine is "stopped" except this cpu, so we can
- * touch any data without introducing corruption. And we send
- * the callbacks to an attribute chosen online cpu.
+ * The whole machine is "stopped" except this CPU, so we can
+ * touch any data without introducing corruption. We send the
+ * dying CPU's callbacks to an arbitrarily chosen online CPU.
*/
rcu_send_cbs_to_online(&rcu_bh_state);
rcu_send_cbs_to_online(&rcu_sched_state);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 0aa84ae..6341adf 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -844,7 +844,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
}
/*
- * Move preemptable DYING RCU's callbacks to other online CPU.
+ * Move preemptable RCU's callbacks from dying CPU to other online CPU.
*/
static void rcu_preempt_send_cbs_to_online(void)
{
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH 2/2 v2] rcu,cleanup: simplify the code when cpu is dying
2010-10-20 19:25 ` Paul E. McKenney
@ 2010-10-22 7:35 ` Lai Jiangshan
2010-10-22 16:10 ` Paul E. McKenney
0 siblings, 1 reply; 4+ messages in thread
From: Lai Jiangshan @ 2010-10-22 7:35 UTC (permalink / raw)
To: paulmck; +Cc: Ingo Molnar, LKML
On 10/21/2010 03:25 AM, Paul E. McKenney wrote:
> On Wed, Oct 20, 2010 at 02:13:06PM +0800, Lai Jiangshan wrote:
>> When we handle cpu notify DYING, the whole system is stopped except
>> current CPU, so we can touch any data, and we remove the orphan_cbs_tail
>> and send the callbacks to the dest CPU directly.
>
> Queued along with the documentation/comment patch below, thank you!!!
> (Of course, please let me know if you see problems with my patch.)
Your patch is good for me, please queue it, thanks.
>
> One remaining question... You use cpumask_any() to select the destination
> CPU, which sounds good until you look at its definition. The problem
> is that cpumask_any() always chooses the lowest-numbered online CPU.
> So imagine a (say) 64-CPU system and suppose that CPU 0 remains online.
> Suppose further that the other 63 CPUs each execute some workload that
> generates lots of RCU callbacks (perhaps creating then removing a large
> source tree), and periodically go offline and come back online.
>
> All of the RCU callbacks from CPUs 1-63 could easily end up getting
> dumped onto CPU 0's callback lists. It is easy to imagine that CPU 0
> might not be able to invoke these callbacks as fast as the other CPUs
> could generate them.
>
> Or am I missing something?
It happens in the worst case. It may also happen before this patch.
Before this patch, the callback move to the receive-CPU who handles the CPU_DEAD
event, and this CPU may be always cpu#0 in the worst case, the problem happens.
And it's not help if I introduce a choose_receive_cpu_very_smart(),
Suppose further that the other 63 CPUs each execute some workload that
generates lots of RCU callbacks (perhaps creating then removing a large
source tree), and periodically go offline and come back online. In the worst
case, in some period, only cpu#0 is online, so all of the RCU callbacks
from CPUs 1-63 could easily end up getting dumped onto CPU 0's callback lists.
It is easy to imagine that CPU 0 might not be able to invoke these callbacks
as fast as the other CPUs could generate them.
Another bad case (it may happen without this patch, with this patch,
or with choose_receive_cpu_very_smart()):
Live-Lock, suppose cpu#A and cpu#B periodically go offline and come
back online, the callback may be moved from A to B and from B to A
periodically, no callback is handled.
To fix these problems (though they very rarely happen), we must ensure
all adopted callbacks are invoked before the next cpu-offline, so we can use
work_on_cpu() or rcu_barrier() to do this. To make the code simpler, I will
use rcu_barrier().
Thanks.
Lai
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH 2/2 v2] rcu,cleanup: simplify the code when cpu is dying
2010-10-22 7:35 ` Lai Jiangshan
@ 2010-10-22 16:10 ` Paul E. McKenney
0 siblings, 0 replies; 4+ messages in thread
From: Paul E. McKenney @ 2010-10-22 16:10 UTC (permalink / raw)
To: Lai Jiangshan; +Cc: Ingo Molnar, LKML
On Fri, Oct 22, 2010 at 03:35:13PM +0800, Lai Jiangshan wrote:
> On 10/21/2010 03:25 AM, Paul E. McKenney wrote:
> > On Wed, Oct 20, 2010 at 02:13:06PM +0800, Lai Jiangshan wrote:
> >> When we handle cpu notify DYING, the whole system is stopped except
> >> current CPU, so we can touch any data, and we remove the orphan_cbs_tail
> >> and send the callbacks to the dest CPU directly.
> >
> > Queued along with the documentation/comment patch below, thank you!!!
> > (Of course, please let me know if you see problems with my patch.)
>
> Your patch is good for me, please queue it, thanks.
Very good, done!
> > One remaining question... You use cpumask_any() to select the destination
> > CPU, which sounds good until you look at its definition. The problem
> > is that cpumask_any() always chooses the lowest-numbered online CPU.
> > So imagine a (say) 64-CPU system and suppose that CPU 0 remains online.
> > Suppose further that the other 63 CPUs each execute some workload that
> > generates lots of RCU callbacks (perhaps creating then removing a large
> > source tree), and periodically go offline and come back online.
> >
> > All of the RCU callbacks from CPUs 1-63 could easily end up getting
> > dumped onto CPU 0's callback lists. It is easy to imagine that CPU 0
> > might not be able to invoke these callbacks as fast as the other CPUs
> > could generate them.
> >
> > Or am I missing something?
>
> It happens in the worst case. It may also happen before this patch.
>
> Before this patch, the callback move to the receive-CPU who handles the CPU_DEAD
> event, and this CPU may be always cpu#0 in the worst case, the problem happens.
>
> And it's not help if I introduce a choose_receive_cpu_very_smart(),
> Suppose further that the other 63 CPUs each execute some workload that
> generates lots of RCU callbacks (perhaps creating then removing a large
> source tree), and periodically go offline and come back online. In worse
> case, in some period, there is only cpu#0 online, So all of the RCU callbacks
> from CPUs 1-63 could easily end up getting dumped onto CPU 0's callback lists.
> It is easy to imagine that CPU 0 might not be able to invoke these callbacks
> as fast as the other CPUs could generate them.
>
> Another bad case (it may happen without this patch, with this patch,
> or with choose_receive_cpu_very_smart()):
> Live-Lock, suppose cpu#A and cpu#B periodically go offline and come
> back online, the callback may be moved from A to B and from B to A
> periodically, no callback is handled.
Agreed, it -could- happen before in the worst case, but it required very
bad luck for the task adopting the callbacks to always be the same.
In contrast, cpumask_any() will always pick on the same CPU.
That said, your approach called out below is intriguing...
> To fix these problems (though they very rarely happen), we must ensure
> all adopted callbacks are invoked before the next cpu-offline, so we can use
> work_on_cpu() or rcu_barrier() to do this. To make the code simpler, I will
> use rcu_barrier().
This approach is nice, but requires extensive testing -- a start would
be a script that randomly onlines and offlines CPUs while rcutorture
is running in the background. If you have not already done so, could
you please give this an over-the-weekend test on the largest system
you have access to?
Thanx, Paul
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2010-10-22 16:10 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-10-20 6:13 [PATCH 2/2 v2] rcu,cleanup: simplify the code when cpu is dying Lai Jiangshan
2010-10-20 19:25 ` Paul E. McKenney
2010-10-22 7:35 ` Lai Jiangshan
2010-10-22 16:10 ` Paul E. McKenney
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox