[RFC] RCU and CONFIG_PREEMPT_RT progress, part 2

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [RFC] RCU and CONFIG_PREEMPT_RT progress, part 2
@ 2005-06-25  1:38 Paul E. McKenney
  2005-07-11 15:07 ` Paul E. McKenney
  0 siblings, 1 reply; 4+ messages in thread
From: Paul E. McKenney @ 2005-06-25  1:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: mingo, dipankar

[-- Attachment #1: Type: text/plain, Size: 16965 bytes --]

Hello!

More progress on CONFIG_PREEMPT_RT-compatible RCU.

o	Earlier this week, declared my CONFIG_PREEMPT_RT education
	complete enough to take on a counter-based approach.

o	Selected a simple counter-based approach, and created
	a Promela model of it, which passes spin.  See attached
	"ctrflipRCU.spin" and "ctrflipRCU.sh" files.  The
	spin analyzer is open-source software, available at
	http://spinroot.com/spin/whatispin.html if you want to check
	it out.  Will be running the model by some people here early
	next week.

o	Started prototyping Linux-kernel implementation, initially
	in the CONFIG_PREEMPT environment.  (Yes, I am still being
	cowardly!)  The implementation is as follows:

	o	Use a per-CPU pair of counters to track the number
		of outstanding RCU read-side critical sections
		started on the corresponding CPU (though these may
		have migrated to some other CPU in the meantime).

	o	In case of a race with the flipping of the counters,
		just increment the other counter for the corresponding
		CPU.  This suppresses further counter flips.

	o	Grace-period advancement depends entirely on
		synchronize_rcu() calls.  If your workload avoids
		making any synchronize_kernel() calls (which is not
		hard to do given that call_rcu() is often used
		instead of synchronize_rcu()), then RCU will be
		a big memory leak.  (This will be fixed later.)

	o	In theory, you can "cat /proc/rcugp" to force the
		kernel to invoke synchronize_rcu(), but I have not tested
		this yet, and will not do so until it actually boots.  ;-)
		The long-term fix will likely involve periodic
		grace-period forcing from a kernel task or from the
		workqueue environment.

	o	The rcu_read_lock() and rcu_read_unlock() primitives
		still use heavy-weight atomic operations in the common
		case (avoided only in the case of nested rcu_read_lock()
		calls).  Longer term, I expect to make changes to avoid
		the atomics in the (hopefully) common case where there
		is no preemption of RCU read-side critical sections.
		But simple things first!

	o	The rcu_read_lock() and rcu_read_unlock() primitives
		use memory-barrier instructions.  In theory, it is
		possible to dispense with these, but doing so requires
		some very strange code.  Not clear that this is practical.

o	Started running tests in vanilla CONFIG_PREEMPT environment
	to check solidity with stock RCU.  Will need solid base when
	stress-test time arrives.

Extremely broken patch included below for your amusement.  Feel free to
take bets on how much it differs from the eventual working patch.  ;-)

						Thanx, Paul

Not-signed-off-by: paulmck@us.ibm.com

diff -urpN -X dontdiff linux-2.6.12-rc6/arch/i386/Kconfig linux-2.6.12-rc6-ctrRCU/arch/i386/Kconfig
--- linux-2.6.12-rc6/arch/i386/Kconfig	Fri Jun 17 16:34:16 2005
+++ linux-2.6.12-rc6-ctrRCU/arch/i386/Kconfig	Fri Jun 24 10:22:39 2005
@@ -523,6 +523,19 @@ config PREEMPT
 	  Say Y here if you are building a kernel for a desktop, embedded
 	  or real-time system.  Say N if you are unsure.
 
+config PREEMPT_RCU
+	bool "Preemptible RCU read-side critical sections"
+	default y
+	help
+	  This option reduces the latency of the kernel when reacting to
+	  real-time or interactive events by allowing a low priority process to
+	  be preempted even if it is in an RCU read-side critical section.
+	  This allows applications to run more reliably even when the system is
+	  under load.
+
+	  Say Y here if you enjoy debugging random oopses.
+	  Say N if, for whatever reason, you want your kernel to actually work.
+
 config PREEMPT_BKL
 	bool "Preempt The Big Kernel Lock"
 	depends on PREEMPT
diff -urpN -X dontdiff linux-2.6.12-rc6/fs/proc/proc_misc.c linux-2.6.12-rc6-ctrRCU/fs/proc/proc_misc.c
--- linux-2.6.12-rc6/fs/proc/proc_misc.c	Fri Jun 17 16:35:03 2005
+++ linux-2.6.12-rc6-ctrRCU/fs/proc/proc_misc.c	Thu Jun 23 21:42:30 2005
@@ -549,6 +549,20 @@ void create_seq_entry(char *name, mode_t
 		entry->proc_fops = f;
 }
 
+#define RCU_STATS
+
+#ifdef RCU_STATS
+int rcu_read_proc_gp(char *page, char **start, off_t off,
+		     int count, int *eof, void *data)
+{
+	int len;
+	extern int rcu_read_proc_gp_data(char *page);
+
+	len = rcu_read_proc_gp_data(page);
+	return proc_calc_metrics(page, start, off, count, eof, len);
+}
+#endif /* #ifdef RCU_STATS */
+
 void __init proc_misc_init(void)
 {
 	struct proc_dir_entry *entry;
@@ -571,6 +585,9 @@ void __init proc_misc_init(void)
 		{"cmdline",	cmdline_read_proc},
 		{"locks",	locks_read_proc},
 		{"execdomains",	execdomains_read_proc},
+#ifdef RCU_STATS
+		{"rcugp",	rcu_read_proc_gp},
+#endif /* #ifdef RCU_STATS */
 		{NULL,}
 	};
 	for (p = simple_ones; p->name; p++)
diff -urpN -X dontdiff linux-2.6.12-rc6/include/linux/rcupdate.h linux-2.6.12-rc6-ctrRCU/include/linux/rcupdate.h
--- linux-2.6.12-rc6/include/linux/rcupdate.h	Fri Jun 17 16:35:13 2005
+++ linux-2.6.12-rc6-ctrRCU/include/linux/rcupdate.h	Fri Jun 24 14:59:04 2005
@@ -59,6 +59,7 @@ struct rcu_head {
 } while (0)
 
 
+#ifndef CONFIG_PREEMPT_RCU
 
 /* Global control variables for rcupdate callback mechanism. */
 struct rcu_ctrlblk {
@@ -192,6 +193,18 @@ static inline int rcu_pending(int cpu)
  */
 #define rcu_read_unlock()	preempt_enable()
 
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+
+#define rcu_qsctr_inc(cpu)
+#define rcu_bh_qsctr_inc(cpu)
+#define call_rcu_bh(head, rcu) call_rcu(head, rcu)
+
+extern void rcu_read_lock(void);
+extern void rcu_read_unlock(void);
+extern int rcu_pending(int cpu);
+
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */
+
 /*
  * So where is rcu_write_lock()?  It does not exist, as there is no
  * way for writers to lock out RCU readers.  This is a feature, not
@@ -213,14 +226,22 @@ static inline int rcu_pending(int cpu)
  * can use just rcu_read_lock().
  *
  */
+#ifndef CONFIG_PREEMPT_RCU
 #define rcu_read_lock_bh()	local_bh_disable()
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+#define rcu_read_lock_bh()	{ rcu_read_lock(); local_bh_disable(); }
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */
 
 /*
  * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section
  *
  * See rcu_read_lock_bh() for more information.
  */
+#ifndef CONFIG_PREEMPT_RCU
 #define rcu_read_unlock_bh()	local_bh_enable()
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+#define rcu_read_unlock_bh()	{ local_bh_enable(); rcu_read_unlock(); }
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */
 
 /**
  * rcu_dereference - fetch an RCU-protected pointer in an
@@ -238,6 +259,7 @@ static inline int rcu_pending(int cpu)
 				(_________p1); \
 				})
 
+
 /**
  * rcu_assign_pointer - assign (publicize) a pointer to a newly
  * initialized structure that will be dereferenced by RCU read-side
@@ -256,6 +278,7 @@ static inline int rcu_pending(int cpu)
 						(p) = (v); \
 					})
 
+
 /**
  * synchronize_sched - block until all CPUs have exited any non-preemptive
  * kernel code sequences.
@@ -269,7 +292,11 @@ static inline int rcu_pending(int cpu)
  * synchronize_kernel() API.  In contrast, synchronize_rcu() only
  * guarantees that rcu_read_lock() sections will have completed.
  */
+#ifdef CONFIG_PREEMPT_RCU
+extern void synchronize_sched(void);
+#else /* #ifdef CONFIG_PREEMPT_RCU */
 #define synchronize_sched() synchronize_rcu()
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
 extern void rcu_init(void);
 extern void rcu_check_callbacks(int cpu, int user);
diff -urpN -X dontdiff linux-2.6.12-rc6/include/linux/sched.h linux-2.6.12-rc6-ctrRCU/include/linux/sched.h
--- linux-2.6.12-rc6/include/linux/sched.h	Fri Jun 17 16:35:13 2005
+++ linux-2.6.12-rc6-ctrRCU/include/linux/sched.h	Wed Jun 22 17:25:49 2005
@@ -740,6 +740,11 @@ struct task_struct {
 	nodemask_t mems_allowed;
 	int cpuset_mems_generation;
 #endif
+#ifdef CONFIG_PREEMPT_RCU
+	int rcu_read_lock_nesting;
+	atomic_t *rcu_flipctr1;
+	atomic_t *rcu_flipctr2;
+#endif
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
diff -urpN -X dontdiff linux-2.6.12-rc6/kernel/rcupdate.c linux-2.6.12-rc6-ctrRCU/kernel/rcupdate.c
--- linux-2.6.12-rc6/kernel/rcupdate.c	Fri Jun 17 16:35:15 2005
+++ linux-2.6.12-rc6-ctrRCU/kernel/rcupdate.c	Fri Jun 24 17:47:26 2005
@@ -47,6 +47,8 @@
 #include <linux/rcupdate.h>
 #include <linux/cpu.h>
 
+#ifndef CONFIG_PREEMPT_RCU
+
 /* Definition for rcupdate control block. */
 struct rcu_ctrlblk rcu_ctrlblk = 
 	{ .cur = -300, .completed = -300 };
@@ -480,3 +482,342 @@ EXPORT_SYMBOL(call_rcu);  /* WARNING: GP
 EXPORT_SYMBOL(call_rcu_bh);  /* WARNING: GPL-only in April 2006. */
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 EXPORT_SYMBOL(synchronize_kernel);  /* WARNING: GPL-only in April 2006. */
+
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+
+#ifdef CONFIG_PREEMPT_RT
+#error "Hey!!!  This hasn't been tested yet!!!  No way it works!!!"
+#endif /* #ifdef CONFIG_PREEMPT_RT */
+
+#define RCU_STATS
+
+#define raw_spinlock_t spinlock_t
+#define _raw_spin_lock(l) spin_lock(l)
+#define _raw_spin_unlock(l) spin_unlock(l)
+#define RAW_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED
+
+struct rcu_data {
+	raw_spinlock_t	lock;
+	long		batch;
+	struct tasklet_struct rcu_tasklet;
+	struct rcu_head *nextlist;
+	struct rcu_head **nexttail;
+	struct rcu_head *waitlist;
+	struct rcu_head **waittail;
+	struct rcu_head *donelist;
+	struct rcu_head **donetail;
+};
+struct rcu_ctrlblk {
+	long		batch;
+};
+static struct rcu_data rcu_data;
+static struct rcu_ctrlblk rcu_ctrlblk = {
+	.batch = 0,
+};
+DECLARE_MUTEX(rcu_flipmutex);
+static DEFINE_PER_CPU(atomic_t [2], rcu_flipctr) =
+	{ ATOMIC_INIT(0), ATOMIC_INIT(0) };
+
+void
+rcu_read_lock(void)
+{
+	int flipctr;
+
+	if (current->rcu_read_lock_nesting++ == 0) {
+
+		/*
+		 * Outermost nesting of rcu_read_lock(), so atomically
+		 * increment the current counter for the current CPU.
+		 */
+
+		preempt_disable();
+		flipctr = rcu_data.batch & 0x1;
+		smp_read_barrier_depends();
+		current->rcu_flipctr1 = &(__get_cpu_var(rcu_flipctr)[flipctr]);
+		/* Can optimize to non-atomic on fastpath, but start simple. */
+		atomic_inc(current->rcu_flipctr1);
+		smp_mb__after_atomic_inc();  /* might optimize out... */
+		if (flipctr != (rcu_data.batch & 0x1)) {
+
+			/*
+			 * We raced with grace-period processing (flip).
+			 * Although we cannot be preempted here, there
+			 * could be interrupts, ECC errors and the like,
+			 * so just nail down both sides of the rcu_flipctr
+			 * array for the duration of our RCU read-side
+			 * critical section, preventing a second flip
+			 * from racing with us.  At some point, it would
+			 * be safe to decrement one of the counter, but
+			 * we have no way of knowing when that would be.
+			 * So just decrement them both in rcu_read_unlock().
+			 */
+
+			current->rcu_flipctr2 =
+				&(__get_cpu_var(rcu_flipctr)[!flipctr]);
+			/* Can again optimize to non-atomic on fastpath. */
+			atomic_inc(current->rcu_flipctr2);
+			smp_mb__after_atomic_inc();  /* might optimize out... */
+		}
+		preempt_enable();
+	}
+}
+
+void
+rcu_read_unlock(void)
+{
+
+	if (--current->rcu_read_lock_nesting == 0) {
+
+		/*
+		 * Just atomically decrement whatever we incremented.
+		 * Might later want to awaken some task waiting for the
+		 * grace period to complete, but keep it simple for the
+		 * moment.
+		 */
+
+		smp_mb__before_atomic_dec();
+		atomic_dec(current->rcu_flipctr1);
+		current->rcu_flipctr1 = NULL;
+		if (current->rcu_flipctr2 != NULL) {
+			atomic_dec(current->rcu_flipctr2);
+			current->rcu_flipctr2 = NULL;
+		}
+	}
+}
+
+static void
+__rcu_advance_callbacks(void)
+{
+	if (rcu_data.batch != rcu_ctrlblk.batch) {
+		if (rcu_data.waitlist != NULL) {
+			*rcu_data.donetail = rcu_data.waitlist;
+			rcu_data.donetail = rcu_data.waittail;
+		}
+		if (rcu_data.nextlist != NULL) {
+			rcu_data.waitlist = rcu_data.nextlist;
+			rcu_data.waittail = rcu_data.nexttail;
+			rcu_data.nextlist = NULL;
+			rcu_data.nexttail = &rcu_data.nextlist;
+		} else {
+			rcu_data.waitlist = NULL;
+			rcu_data.waittail = &rcu_data.waitlist;
+		}
+		rcu_data.batch = rcu_ctrlblk.batch;
+	}
+}
+
+void
+call_rcu(struct rcu_head *head,
+	 void (*func)(struct rcu_head *rcu))
+{
+	head->func = func;
+	head->next = NULL;
+	_raw_spin_lock(&rcu_data.lock);
+	__rcu_advance_callbacks();
+	*rcu_data.waittail = head;
+	rcu_data.waittail = &head->next;
+	_raw_spin_unlock(&rcu_data.lock);
+}
+
+void
+rcu_check_callbacks(int cpu, int user)
+{
+	_raw_spin_lock(&rcu_data.lock);
+	__rcu_advance_callbacks();
+	if (rcu_data.donelist == NULL) {
+		_raw_spin_unlock(&rcu_data.lock);
+		return;
+	}
+	_raw_spin_unlock(&rcu_data.lock);
+	tasklet_schedule(&rcu_data.rcu_tasklet);
+}
+
+static
+void rcu_process_callbacks(unsigned long data)
+{
+	struct rcu_head *next, *list;
+
+	_raw_spin_lock(&rcu_data.lock);
+	list = rcu_data.donelist;
+	if (list == NULL) {
+		_raw_spin_unlock(&rcu_data.lock);
+		return;
+	}
+	rcu_data.donelist = NULL;
+	rcu_data.donetail = &rcu_data.donelist;
+	_raw_spin_unlock(&rcu_data.lock);
+	while (list) {
+		next = list->next;
+		list->func(list);
+		list = next;
+	}
+}
+
+/*
+ * Force a single flip of the counters.  Remember, a single flip does
+ * -not- constitute a grace period.  Instead, the interval between
+ * a pair of consecutive flips is a grace period.
+ *
+ * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
+ * on a large SMP, they might want to use a hierarchical organization of
+ * the per-CPU-counter pairs.
+ */
+static void
+__synchronize_rcu_flip(void)
+{
+	int cpu;
+	long flipctr;
+
+	flipctr = rcu_data.batch;
+	down(&rcu_flipmutex);
+	if (flipctr != rcu_data.batch) {
+	
+		/* Our work is done!  ;-) */
+
+		up(&rcu_flipmutex);
+		return;
+	}
+	flipctr &= 0x1;
+
+	/*
+	 * Wait for the completion of all RCU read-side critical sections
+	 * that started prior to the previous flip.
+	 */
+
+	for_each_cpu(cpu) {
+		while (atomic_read(&per_cpu(rcu_flipctr, cpu)[!flipctr]) != 0) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_timeout(HZ / 100);
+		}
+	}
+
+	/* Do the flip. */
+
+	smp_mb();
+	rcu_data.batch++;
+	smp_mb(); /* Not 100% certain that this one is required. */
+		  /* In fact, it seems likely that it can go. */
+	
+	/* A bit awkward -- looking ahead to per-CPU callback queues. */
+	/* Process callbacks... */
+
+	_raw_spin_lock(&rcu_data.lock);
+	__rcu_advance_callbacks();
+	_raw_spin_unlock(&rcu_data.lock);
+	tasklet_schedule(&rcu_data.rcu_tasklet);
+
+	up(&rcu_flipmutex);
+}
+
+/*
+ * Force a single flip of the counters, but do it lazily, waiting
+ * a bit to see if someone else will do it for us.
+ */
+static void
+synchronize_rcu_flip(void)
+{
+	long oldbatch = rcu_data.batch;
+
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule_timeout(HZ / 100);
+	if ((rcu_data.batch - oldbatch) >= 2) {
+		
+		/* Our work is done! */
+
+		return;
+	}
+
+	/* No one else did it for us, so force the flip. */
+
+	__synchronize_rcu_flip();
+}
+
+void
+synchronize_rcu(void)
+{
+	long oldbatch = rcu_data.batch;
+
+	/* Need two flips to get a grace period. */
+
+	synchronize_rcu_flip();
+
+	if ((rcu_data.batch - oldbatch) >= 2) {
+		
+		/* Someone else helped with the second flip, we are done! */
+
+		return;
+	}
+
+	synchronize_rcu_flip();
+}
+
+/*
+ * Crude hack, reduces but does not eliminate possibility of failure.
+ * Needs to wait for all CPUs to pass through a -voluntary- context
+ * switch to eliminate possibility of failure.  (Maybe just crank
+ * priority down...)
+ */
+void
+synchronize_sched(void)
+{
+	cpumask_t oldmask;
+	int cpu;
+
+	if (sched_getaffinity(0, &oldmask) < 0) {
+		oldmask = cpu_possible_map; 
+	}
+	for_each_cpu(cpu) {
+		sched_setaffinity(0, cpumask_of_cpu(cpu));
+		schedule();
+	}
+	sched_setaffinity(0, oldmask);
+}
+
+int
+rcu_pending(int cpu)
+{
+	return (rcu_data.donelist != NULL ||
+		rcu_data.waitlist != NULL ||
+		rcu_data.nextlist != NULL);
+}
+
+void __init rcu_init(void)
+{
+	rcu_data.lock = RAW_SPIN_LOCK_UNLOCKED;
+	rcu_data.batch = 0;
+	rcu_data.nextlist = NULL;
+	rcu_data.nexttail = &rcu_data.nextlist;
+	rcu_data.waitlist = NULL;
+	rcu_data.waittail = &rcu_data.waitlist;
+	rcu_data.donelist = NULL;
+	rcu_data.donetail = &rcu_data.donelist;
+	tasklet_init(&rcu_data.rcu_tasklet, rcu_process_callbacks, 0UL);
+}
+
+/*
+ * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
+ */
+void synchronize_kernel(void)
+{
+	synchronize_rcu();
+}
+
+#ifdef RCU_STATS
+int rcu_read_proc_gp_data(char *page)
+{
+	long oldgp = rcu_ctrlblk.batch;
+
+	synchronize_rcu();
+	return sprintf(page, "oldggp=%ld  newggp=%ld\n",
+		       oldgp, rcu_ctrlblk.batch);
+}
+#endif /* #ifdef RCU_STATS */
+
+EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+EXPORT_SYMBOL(synchronize_sched);  /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL(rcu_read_lock);  /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL(rcu_read_unlock);  /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL(synchronize_kernel);  /* WARNING: Removal in April 2006. */
+
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */

[-- Attachment #2: ctrflipRCU.spin --]
[-- Type: text/plain, Size: 1091 bytes --]

#define PASSES 10

bit flip = 0;
bit ctr[2];
byte passctr = 0;

proctype rdr()
{
	bit lcl_flip;
	bit both;

	do
	:: passctr < PASSES ->
		lcl_flip = flip;
		/* need smp_read_barrier_depends() */
		ctr[lcl_flip]++;
		/* need smp_mb__after_atomic_inc() */
		if
		:: lcl_flip == flip ->
			both = 0
		:: else ->
			ctr[!lcl_flip]++;
			/* need smp_mb__after_atomic_inc() */
			both = 1
		fi;
		passctr++;
		passctr++;
		/* need smp_mb__before_atomic_dec() */
		ctr[lcl_flip]--;
		if
		:: both ->
			ctr[!lcl_flip]--;
		:: else -> skip
		fi;
	:: passctr >= PASSES ->
		break
	od
}

proctype upd()
{
	byte lcl_passctr;

	do
	::	passctr < PASSES ->
		lcl_passctr = passctr;
		if
		::	lcl_passctr / 2 * 2 == lcl_passctr ->
			lcl_passctr = 255
		::	else -> skip
		fi;
		do
		::	ctr[!flip] == 0 -> break
		::	true -> skip
		od;
		/* Need smp_mb()? */
		flip = !flip;
		/* Need smp_mb()? */
		do
		::	ctr[!flip] == 0 -> break
		::	true -> skip
		od;
		assert(lcl_passctr != passctr);
	::	passctr >= PASSES ->
		break
	od
}

init {
	atomic {
		ctr[0] = 0;
		ctr[1] = 0;
		run rdr();
		run upd();
	}
}

[-- Attachment #3: ctrflipRCU.sh --]
[-- Type: application/x-sh, Size: 147 bytes --]

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC] RCU and CONFIG_PREEMPT_RT progress, part 2
  2005-06-25  1:38 [RFC] RCU and CONFIG_PREEMPT_RT progress, part 2 Paul E. McKenney
@ 2005-07-11 15:07 ` Paul E. McKenney
  2005-07-11 15:16   ` Ingo Molnar
  0 siblings, 1 reply; 4+ messages in thread
From: Paul E. McKenney @ 2005-07-11 15:07 UTC (permalink / raw)
  To: linux-kernel; +Cc: mingo, dipankar

Hello!

More progress on CONFIG_PREEMPT_RT-compatible RCU.

o	Continued prototyping Linux-kernel implementation, still
	in the CONFIG_PREEMPT environment.

o	Reasonably reliably boots, still fragile.

o	Using workqueues to ensure that grace periods advance
	even if synchronize_rcu() is never called.  This seems
	like a bad choice in retrospect, will be recoding to
	allow per-CPU scheduling-clock tick to advance the
	grace period.  Doing this should simplify the code
	and eliminate the need for workqueues.

o	Numerous boneheaded bugs fixed, a number remain.

o	A number of /proc entries provide debug info.  These
	will likely be removed in the final version.

o	Shortcomings remain, will fix once I have flushed out
	a few more bugs:

	o	The rcu_read_lock() and rcu_read_unlock() primitives
		still use heavy-weight atomic operations in the common
		case (avoided only in the case of nested rcu_read_lock()
		calls).  Longer term, I expect to make changes to avoid
		the atomics in the (hopefully) common case where there
		is no preemption of RCU read-side critical sections.
		But simple things first!

	o	The rcu_read_lock() and rcu_read_unlock() primitives
		use memory-barrier instructions.  In theory, it is
		possible to dispense with these, but doing so requires
		some very strange code.  Not clear that this is practical.

	o	Callbacks are queued globally.  Will be changing
		to per-CPU callbacks.

Broken patch included below for your amusement.

						Thanx, Paul

Not-signed-off-by: paulmck@us.ibm.com

diff -urpN -X dontdiff linux-2.6.12-rc6/arch/i386/Kconfig linux-2.6.12-rc6-ctrRCU/arch/i386/Kconfig
--- linux-2.6.12-rc6/arch/i386/Kconfig	2005-06-17 16:34:16.000000000 -0700
+++ linux-2.6.12-rc6-ctrRCU/arch/i386/Kconfig	2005-06-25 11:45:40.000000000 -0700
@@ -523,6 +523,34 @@ config PREEMPT
 	  Say Y here if you are building a kernel for a desktop, embedded
 	  or real-time system.  Say N if you are unsure.
 
+config PREEMPT_RCU
+	bool "Preemptible RCU read-side critical sections"
+	depends on PREEMPT
+	default y
+	help
+	  This option reduces the latency of the kernel when reacting to
+	  real-time or interactive events by allowing a low priority process to
+	  be preempted even if it is in an RCU read-side critical section.
+	  This allows applications to run more reliably even when the system is
+	  under load.
+
+	  Say Y here if you enjoy debugging random oopses.
+	  Say N if, for whatever reason, you want your kernel to actually work.
+
+config RCU_STATS
+	bool "/proc stats for preemptible RCU read-side critical sections"
+	depends on PREEMPT_RCU
+	default y
+	help
+	  This option provides /proc stats to provide debugging info.
+	  real-time or interactive events by allowing a low priority process to
+	  be preempted even if it is in an RCU read-side critical section.
+	  This allows applications to run more reliably even when the system is
+	  under load.
+
+	  Say Y here if you want to see RCU stats in /proc
+	  Say N if you are unsure.
+
 config PREEMPT_BKL
 	bool "Preempt The Big Kernel Lock"
 	depends on PREEMPT
diff -urpN -X dontdiff linux-2.6.12-rc6/fs/proc/proc_misc.c linux-2.6.12-rc6-ctrRCU/fs/proc/proc_misc.c
--- linux-2.6.12-rc6/fs/proc/proc_misc.c	2005-06-17 16:35:03.000000000 -0700
+++ linux-2.6.12-rc6-ctrRCU/fs/proc/proc_misc.c	2005-06-29 11:22:11.000000000 -0700
@@ -549,6 +549,48 @@ void create_seq_entry(char *name, mode_t
 		entry->proc_fops = f;
 }
 
+#ifdef CONFIG_RCU_STATS
+int rcu_read_proc(char *page, char **start, off_t off,
+		  int count, int *eof, void *data)
+{
+	int len;
+	extern int rcu_read_proc_data(char *page);
+
+	len = rcu_read_proc_data(page);
+	return proc_calc_metrics(page, start, off, count, eof, len);
+}
+
+int rcu_read_proc_gp(char *page, char **start, off_t off,
+		     int count, int *eof, void *data)
+{
+	int len;
+	extern int rcu_read_proc_gp_data(char *page);
+
+	len = rcu_read_proc_gp_data(page);
+	return proc_calc_metrics(page, start, off, count, eof, len);
+}
+
+int rcu_read_proc_wqgp(char *page, char **start, off_t off,
+		       int count, int *eof, void *data)
+{
+	int len;
+	extern int rcu_read_proc_wqgp_data(char *page);
+
+	len = rcu_read_proc_wqgp_data(page);
+	return proc_calc_metrics(page, start, off, count, eof, len);
+}
+
+int rcu_read_proc_ptrs(char *page, char **start, off_t off,
+		       int count, int *eof, void *data)
+{
+	int len;
+	extern int rcu_read_proc_ptrs_data(char *page);
+
+	len = rcu_read_proc_ptrs_data(page);
+	return proc_calc_metrics(page, start, off, count, eof, len);
+}
+#endif /* #ifdef CONFIG_RCU_STATS */
+
 void __init proc_misc_init(void)
 {
 	struct proc_dir_entry *entry;
@@ -571,6 +613,12 @@ void __init proc_misc_init(void)
 		{"cmdline",	cmdline_read_proc},
 		{"locks",	locks_read_proc},
 		{"execdomains",	execdomains_read_proc},
+#ifdef CONFIG_RCU_STATS
+		{"rcustats",	rcu_read_proc},
+		{"rcugp",	rcu_read_proc_gp},
+		{"rcuwqgp",	rcu_read_proc_wqgp},
+		{"rcuptrs",	rcu_read_proc_ptrs},
+#endif /* #ifdef CONFIG_RCU_STATS */
 		{NULL,}
 	};
 	for (p = simple_ones; p->name; p++)
diff -urpN -X dontdiff linux-2.6.12-rc6/include/linux/rcupdate.h linux-2.6.12-rc6-ctrRCU/include/linux/rcupdate.h
--- linux-2.6.12-rc6/include/linux/rcupdate.h	2005-06-17 16:35:13.000000000 -0700
+++ linux-2.6.12-rc6-ctrRCU/include/linux/rcupdate.h	2005-07-01 14:55:35.000000000 -0700
@@ -59,6 +59,7 @@ struct rcu_head {
 } while (0)
 
 
+#ifndef CONFIG_PREEMPT_RCU
 
 /* Global control variables for rcupdate callback mechanism. */
 struct rcu_ctrlblk {
@@ -192,6 +193,18 @@ static inline int rcu_pending(int cpu)
  */
 #define rcu_read_unlock()	preempt_enable()
 
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+
+#define rcu_qsctr_inc(cpu)
+#define rcu_bh_qsctr_inc(cpu)
+#define call_rcu_bh(head, rcu) call_rcu(head, rcu)
+
+extern void rcu_read_lock(void);
+extern void rcu_read_unlock(void);
+extern int rcu_pending(int cpu);
+
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */
+
 /*
  * So where is rcu_write_lock()?  It does not exist, as there is no
  * way for writers to lock out RCU readers.  This is a feature, not
@@ -213,14 +226,22 @@ static inline int rcu_pending(int cpu)
  * can use just rcu_read_lock().
  *
  */
+#ifndef CONFIG_PREEMPT_RCU
 #define rcu_read_lock_bh()	local_bh_disable()
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+#define rcu_read_lock_bh()	{ rcu_read_lock(); local_bh_disable(); }
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */
 
 /*
  * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section
  *
  * See rcu_read_lock_bh() for more information.
  */
+#ifndef CONFIG_PREEMPT_RCU
 #define rcu_read_unlock_bh()	local_bh_enable()
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+#define rcu_read_unlock_bh()	{ local_bh_enable(); rcu_read_unlock(); }
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */
 
 /**
  * rcu_dereference - fetch an RCU-protected pointer in an
@@ -269,7 +290,11 @@ static inline int rcu_pending(int cpu)
  * synchronize_kernel() API.  In contrast, synchronize_rcu() only
  * guarantees that rcu_read_lock() sections will have completed.
  */
+#ifndef CONFIG_PREEMPT_RCU
 #define synchronize_sched() synchronize_rcu()
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+extern void synchronize_sched(void);
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */
 
 extern void rcu_init(void);
 extern void rcu_check_callbacks(int cpu, int user);
diff -urpN -X dontdiff linux-2.6.12-rc6/include/linux/sched.h linux-2.6.12-rc6-ctrRCU/include/linux/sched.h
--- linux-2.6.12-rc6/include/linux/sched.h	2005-06-17 16:35:13.000000000 -0700
+++ linux-2.6.12-rc6-ctrRCU/include/linux/sched.h	2005-06-22 17:25:49.000000000 -0700
@@ -740,6 +740,11 @@ struct task_struct {
 	nodemask_t mems_allowed;
 	int cpuset_mems_generation;
 #endif
+#ifdef CONFIG_PREEMPT_RCU
+	int rcu_read_lock_nesting;
+	atomic_t *rcu_flipctr1;
+	atomic_t *rcu_flipctr2;
+#endif
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
diff -urpN -X dontdiff linux-2.6.12-rc6/kernel/rcupdate.c linux-2.6.12-rc6-ctrRCU/kernel/rcupdate.c
--- linux-2.6.12-rc6/kernel/rcupdate.c	2005-06-17 16:35:15.000000000 -0700
+++ linux-2.6.12-rc6-ctrRCU/kernel/rcupdate.c	2005-07-01 17:58:01.000000000 -0700
@@ -46,6 +46,9 @@
 #include <linux/notifier.h>
 #include <linux/rcupdate.h>
 #include <linux/cpu.h>
+#include <linux/workqueue.h>
+
+#ifndef CONFIG_PREEMPT_RCU
 
 /* Definition for rcupdate control block. */
 struct rcu_ctrlblk rcu_ctrlblk = 
@@ -480,3 +483,526 @@ EXPORT_SYMBOL(call_rcu);  /* WARNING: GP
 EXPORT_SYMBOL(call_rcu_bh);  /* WARNING: GPL-only in April 2006. */
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 EXPORT_SYMBOL(synchronize_kernel);  /* WARNING: GPL-only in April 2006. */
+
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+
+#ifdef CONFIG_PREEMPT_RT
+#error "Hey!!!  This hasn't been tested yet!!!  No way it works!!!"
+#endif /* #ifdef CONFIG_PREEMPT_RT */
+
+#define raw_spinlock_t spinlock_t
+#define _raw_spin_lock(l) spin_lock(l)
+#define _raw_spin_unlock(l) spin_unlock(l)
+#define RAW_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED
+
+struct rcu_data {
+	raw_spinlock_t	lock;
+	long		batch;
+	struct tasklet_struct rcu_tasklet;
+	struct rcu_head *nextlist;
+	struct rcu_head **nexttail;
+	struct rcu_head *waitlist;
+	struct rcu_head **waittail;
+	struct rcu_head *donelist;
+	struct rcu_head **donetail;
+	long		n_next_length;
+#ifdef CONFIG_RCU_STATS
+	long		n_next_add;
+	long		n_wait_length;
+	long		n_wait_add;
+	long		n_done_length;
+	long		n_done_add;
+	long		n_done_remove;
+	atomic_t	n_done_invoked;
+	long		n_rcu_process_callbacks;
+	long		n_rcu_check_callbacks;
+	long		n_rcu_advance_callbacks1;
+	long		n_rcu_advance_callbacks2;
+	long		n_rcu_advance_callbacks3;
+	long		n_rcu_advance_callbacks4;
+	long		n_rcu_advance_callbacks5;
+	atomic_t	n_synchronize_rcu;
+	atomic_t	n_synchronize_rcu_early1;
+	atomic_t	n_synchronize_rcu_flip1; /*&&&&*/
+	atomic_t	n_synchronize_rcu_flip2; /*&&&&*/
+	atomic_t	n_synchronize_rcu_flip3; /*&&&&*/
+	atomic_t	n_synchronize_rcu_flip4; /*&&&&*/
+	atomic_t	n_synchronize_rcu_flip5; /*&&&&*/
+	atomic_t	n_synchronize_rcu_flip6; /*&&&&*/
+	atomic_t	n_synchronize_rcu_flip_early1;
+	long		n_synchronize_rcu_flip_early2;
+#endif /* #ifdef CONFIG_RCU_STATS */
+};
+struct rcu_ctrlblk {
+	long		batch;
+};
+static struct rcu_data rcu_data;
+static struct rcu_ctrlblk rcu_ctrlblk = {
+	.batch = 0,
+};
+DECLARE_MUTEX(rcu_flipmutex);
+static DEFINE_PER_CPU(atomic_t [2], rcu_flipctr) =
+	{ ATOMIC_INIT(0), ATOMIC_INIT(0) };
+
+#define RCU_NEXT_LIST_TRIGGER_LENGTH 100
+
+void
+rcu_read_lock(void)
+{
+	int flipctr;
+	unsigned long oldirq;
+
+	local_irq_save(oldirq);	/* @@@ inside "if" for PREEMPT_RT... */
+	if (current->rcu_read_lock_nesting++ == 0) {
+
+		/*
+		 * Outermost nesting of rcu_read_lock(), so atomically
+		 * increment the current counter for the current CPU.
+		 */
+
+		flipctr = rcu_ctrlblk.batch & 0x1;
+		smp_read_barrier_depends();
+		current->rcu_flipctr1 = &(__get_cpu_var(rcu_flipctr)[flipctr]);
+		/* Can optimize to non-atomic on fastpath, but start simple. */
+		atomic_inc(current->rcu_flipctr1);
+		smp_mb__after_atomic_inc();  /* might optimize out... */
+		if (flipctr != (rcu_ctrlblk.batch & 0x1)) {
+
+			/*
+			 * We raced with grace-period processing (flip).
+			 * Although we cannot be preempted here, there
+			 * could be interrupts, ECC errors and the like,
+			 * so just nail down both sides of the rcu_flipctr
+			 * array for the duration of our RCU read-side
+			 * critical section, preventing a second flip
+			 * from racing with us.  At some point, it would
+			 * be safe to decrement one of the counters, but
+			 * we have no way of knowing when that would be.
+			 * So just decrement them both in rcu_read_unlock().
+			 */
+
+			current->rcu_flipctr2 =
+				&(__get_cpu_var(rcu_flipctr)[!flipctr]);
+			/* Can again optimize to non-atomic on fastpath. */
+			atomic_inc(current->rcu_flipctr2);
+			smp_mb__after_atomic_inc();  /* might optimize out... */
+		}
+	}
+#ifndef CONFIG_PREEMPT_RT
+	preempt_disable();
+#endif /* #ifndef CONFIG_PREEMPT_RT */
+	local_irq_restore(oldirq);  /*@@@ should be able to precede. */
+}
+
+void
+rcu_read_unlock(void)
+{
+#ifndef CONFIG_PREEMPT_RT
+	unsigned long oldirq;
+
+	local_irq_save(oldirq);  /* @@@ should be able to reverse... */
+	preempt_enable();
+#endif /* #ifndef CONFIG_PREEMPT_RT */
+	if (--current->rcu_read_lock_nesting == 0) {
+
+		/*
+		 * Just atomically decrement whatever we incremented.
+		 * Might later want to awaken some task waiting for the
+		 * grace period to complete, but keep it simple for the
+		 * moment.
+		 */
+
+		smp_mb__before_atomic_dec();
+		atomic_dec(current->rcu_flipctr1);
+		current->rcu_flipctr1 = NULL;
+		if (current->rcu_flipctr2 != NULL) {
+			atomic_dec(current->rcu_flipctr2);
+			current->rcu_flipctr2 = NULL;
+		}
+	}
+#ifndef CONFIG_PREEMPT_RT
+	local_irq_restore(oldirq);
+#endif /* #ifndef CONFIG_PREEMPT_RT */
+}
+
+static void
+__rcu_advance_callbacks(void)
+{
+
+#ifdef CONFIG_RCU_STATS
+	rcu_data.n_rcu_advance_callbacks1++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+	if (rcu_data.batch != rcu_ctrlblk.batch) {
+#ifdef CONFIG_RCU_STATS
+		rcu_data.n_rcu_advance_callbacks2++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+		if (rcu_data.waitlist != NULL) {
+#ifdef CONFIG_RCU_STATS
+			rcu_data.n_rcu_advance_callbacks3++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+			*rcu_data.donetail = rcu_data.waitlist;
+			rcu_data.donetail = rcu_data.waittail;
+#ifdef CONFIG_RCU_STATS
+			rcu_data.n_done_length += rcu_data.n_wait_length;
+			rcu_data.n_done_add += rcu_data.n_wait_length;
+			rcu_data.n_wait_length = 0;
+#endif /* #ifdef CONFIG_RCU_STATS */
+		}
+		if (rcu_data.nextlist != NULL) {
+#ifdef CONFIG_RCU_STATS
+			rcu_data.n_rcu_advance_callbacks4++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+			rcu_data.waitlist = rcu_data.nextlist;
+			rcu_data.waittail = rcu_data.nexttail;
+			rcu_data.nextlist = NULL;
+			rcu_data.nexttail = &rcu_data.nextlist;
+#ifdef CONFIG_RCU_STATS
+			rcu_data.n_wait_length += rcu_data.n_next_length;
+			rcu_data.n_wait_add += rcu_data.n_next_length;
+#endif /* #ifdef CONFIG_RCU_STATS */
+			rcu_data.n_next_length = 0;
+		} else {
+#ifdef CONFIG_RCU_STATS
+			rcu_data.n_rcu_advance_callbacks5++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+			rcu_data.waitlist = NULL;
+			rcu_data.waittail = &rcu_data.waitlist;
+		}
+		rcu_data.batch = rcu_ctrlblk.batch;
+	}
+}
+
+void
+synchronize_rcu_wqh(void *dummy)
+{
+	synchronize_rcu();  /* @@@ need non-blocking attempt to flip... */
+}
+
+DECLARE_WORK(synchronize_rcu_wq, synchronize_rcu_wqh, NULL);
+
+void
+rcu_check_callbacks(int cpu, int user)
+{
+	_raw_spin_lock(&rcu_data.lock);
+#ifdef CONFIG_RCU_STATS
+	rcu_data.n_rcu_check_callbacks++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+	__rcu_advance_callbacks();
+	if (rcu_data.donelist == NULL) {
+		_raw_spin_unlock(&rcu_data.lock);
+	} else {
+		_raw_spin_unlock(&rcu_data.lock);
+		tasklet_schedule(&rcu_data.rcu_tasklet);
+	}
+	if (rcu_data.n_next_length >= RCU_NEXT_LIST_TRIGGER_LENGTH) {
+		schedule_work(&synchronize_rcu_wq);
+	}
+}
+
+static
+void rcu_process_callbacks(unsigned long data)
+{
+	struct rcu_head *next, *list;
+
+	_raw_spin_lock(&rcu_data.lock);
+#ifdef CONFIG_RCU_STATS
+	rcu_data.n_rcu_process_callbacks++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+	list = rcu_data.donelist;
+	if (list == NULL) {
+		_raw_spin_unlock(&rcu_data.lock);
+		return;
+	}
+	rcu_data.donelist = NULL;
+	rcu_data.donetail = &rcu_data.donelist;
+#ifdef CONFIG_RCU_STATS
+	rcu_data.n_done_remove += rcu_data.n_done_length;
+	rcu_data.n_done_length = 0;
+#endif /* #ifdef CONFIG_RCU_STATS */
+	_raw_spin_unlock(&rcu_data.lock);
+	while (list) {
+		next = list->next;
+		list->func(list);
+		list = next;
+#ifdef CONFIG_RCU_STATS
+		atomic_inc(&rcu_data.n_done_invoked);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	}
+}
+
+/*
+ * Force a single flip of the counters.  Remember, a single flip does
+ * -not- constitute a grace period.  Instead, the interval between
+ * a pair of consecutive flips is a grace period.
+ *
+ * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
+ * on a large SMP, they might want to use a hierarchical organization of
+ * the per-CPU-counter pairs.
+ */
+static void
+__synchronize_rcu_flip(void)
+{
+	int cpu;
+	long flipctr;
+
+#ifdef CONFIG_RCU_STATS
+	atomic_inc(&rcu_data.n_synchronize_rcu_flip3);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	flipctr = rcu_data.batch;
+	down(&rcu_flipmutex);
+	if (flipctr != rcu_data.batch) {
+	
+		/* Our work is done!  ;-) */
+
+#ifdef CONFIG_RCU_STATS
+		rcu_data.n_synchronize_rcu_flip_early2++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+		up(&rcu_flipmutex);
+		return;
+	}
+	flipctr &= 0x1;
+
+	/*
+	 * Wait for the completion of all RCU read-side critical sections
+	 * that started prior to the previous flip.
+	 */
+
+#ifdef CONFIG_RCU_STATS
+	atomic_inc(&rcu_data.n_synchronize_rcu_flip4);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	for_each_cpu(cpu) {
+		while (atomic_read(&per_cpu(rcu_flipctr, cpu)[!flipctr]) != 0) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_timeout(HZ / 100);
+		}
+	}
+
+	/* Do the flip. */
+
+#ifdef CONFIG_RCU_STATS
+	atomic_inc(&rcu_data.n_synchronize_rcu_flip5);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	smp_mb();
+	rcu_ctrlblk.batch++;
+	smp_mb(); /* Not 100% certain that this one is required. */
+		  /* In fact, it seems likely that it can go. */
+#ifdef CONFIG_RCU_STATS
+	atomic_inc(&rcu_data.n_synchronize_rcu_flip6);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	
+	/* A bit awkward -- looking ahead to per-CPU callback queues. */
+	/* Process callbacks... */
+
+	_raw_spin_lock(&rcu_data.lock);
+	__rcu_advance_callbacks();
+	_raw_spin_unlock(&rcu_data.lock);
+	tasklet_schedule(&rcu_data.rcu_tasklet);
+
+	up(&rcu_flipmutex);
+}
+
+/*
+ * Force a single flip of the counters, but do it lazily, waiting
+ * a bit to see if someone else will do it for us.
+ */
+static void
+synchronize_rcu_flip(void)
+{
+	long oldbatch = rcu_data.batch;
+
+#ifdef CONFIG_RCU_STATS
+	atomic_inc(&rcu_data.n_synchronize_rcu_flip1);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule_timeout(HZ / 100);
+	if (rcu_data.batch != oldbatch) {
+		
+		/* Our work is done! */
+
+#ifdef CONFIG_RCU_STATS
+		atomic_inc(&rcu_data.n_synchronize_rcu_flip_early1);
+#endif /* #ifdef CONFIG_RCU_STATS */
+		return;
+	}
+
+	/* No one else did it for us, so force the flip. */
+
+#ifdef CONFIG_RCU_STATS
+	atomic_inc(&rcu_data.n_synchronize_rcu_flip2);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	__synchronize_rcu_flip();
+}
+
+void
+synchronize_rcu(void)
+{
+	long oldbatch = rcu_data.batch;
+
+	/* Need two flips to get a grace period. */
+
+/*&&&&  printk("Entering synchronize_rcu()\n"); */
+#ifdef CONFIG_RCU_STATS
+	atomic_inc(&rcu_data.n_synchronize_rcu);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	synchronize_rcu_flip();
+
+	if ((rcu_data.batch - oldbatch) >= 2) {
+
+		/* Someone else helped with the second flip, we are done! */
+
+#ifdef CONFIG_RCU_STATS
+		atomic_inc(&rcu_data.n_synchronize_rcu_early1);
+#endif /* #ifdef CONFIG_RCU_STATS */
+		return;
+	}
+
+	synchronize_rcu_flip();
+/*&&&&*/printk("Exiting synchronize_rcu()\n");
+}
+
+void fastcall
+call_rcu(struct rcu_head *head,
+	 void (*func)(struct rcu_head *rcu))
+{
+	head->func = func;
+	head->next = NULL;
+	_raw_spin_lock(&rcu_data.lock);
+	__rcu_advance_callbacks();
+	*rcu_data.nexttail = head;
+	rcu_data.nexttail = &head->next;
+#ifdef CONFIG_RCU_STATS
+	rcu_data.n_next_add++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+	rcu_data.n_next_length++;
+	_raw_spin_unlock(&rcu_data.lock);
+}
+
+/*
+ * Crude hack, reduces but does not eliminate possibility of failure.
+ * Needs to wait for all CPUs to pass through a -voluntary- context
+ * switch to eliminate possibility of failure.  (Maybe just crank
+ * priority down...)
+ */
+void
+synchronize_sched(void)
+{
+	cpumask_t oldmask;
+	int cpu;
+
+	if (sched_getaffinity(0, &oldmask) < 0) {
+		oldmask = cpu_possible_map; 
+	}
+	for_each_cpu(cpu) {
+		sched_setaffinity(0, cpumask_of_cpu(cpu));
+		schedule();
+	}
+	sched_setaffinity(0, oldmask);
+}
+
+int
+rcu_pending(int cpu)
+{
+	return (rcu_data.donelist != NULL ||
+		rcu_data.waitlist != NULL ||
+		rcu_data.nextlist != NULL);
+}
+
+void __init rcu_init(void)
+{
+/*&&&&*/printk("WARNING: experimental RCU implementation.\n");
+	rcu_data.lock = RAW_SPIN_LOCK_UNLOCKED;
+	rcu_data.batch = 0;
+	rcu_data.nextlist = NULL;
+	rcu_data.nexttail = &rcu_data.nextlist;
+	rcu_data.waitlist = NULL;
+	rcu_data.waittail = &rcu_data.waitlist;
+	rcu_data.donelist = NULL;
+	rcu_data.donetail = &rcu_data.donelist;
+	tasklet_init(&rcu_data.rcu_tasklet, rcu_process_callbacks, 0UL);
+}
+
+/*
+ * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
+ */
+void synchronize_kernel(void)
+{
+	synchronize_rcu();
+}
+
+#ifdef CONFIG_RCU_STATS
+int rcu_read_proc_data(char *page)
+{
+	return sprintf(page,
+		       "ggp=%ld lgp=%ld\n"
+		       "sr=%d srf1=%d srf2=%d srf3=%d srf4=%d srf5=%d srf6=%d\n"
+		       "ac1=%ld ac2=%ld ac3=%ld ac4=%ld ac5=%ld\n"
+		       "sre1=%d srfe1=%d srfe2=%ld\n"
+		       "na=%ld nl=%ld wa=%ld wl=%ld rcc=%ld rpc=%ld dl=%ld dr=%ld di=%d\n",
+		       rcu_ctrlblk.batch,
+		       rcu_data.batch,
+
+		       atomic_read(&rcu_data.n_synchronize_rcu),
+		       atomic_read(&rcu_data.n_synchronize_rcu_flip1),
+		       atomic_read(&rcu_data.n_synchronize_rcu_flip2),
+		       atomic_read(&rcu_data.n_synchronize_rcu_flip3),
+		       atomic_read(&rcu_data.n_synchronize_rcu_flip4),
+		       atomic_read(&rcu_data.n_synchronize_rcu_flip5),
+		       atomic_read(&rcu_data.n_synchronize_rcu_flip6),
+
+		       rcu_data.n_rcu_advance_callbacks1,
+		       rcu_data.n_rcu_advance_callbacks2,
+		       rcu_data.n_rcu_advance_callbacks3,
+		       rcu_data.n_rcu_advance_callbacks4,
+		       rcu_data.n_rcu_advance_callbacks5,
+
+		       atomic_read(&rcu_data.n_synchronize_rcu_early1),
+		       atomic_read(&rcu_data.n_synchronize_rcu_flip_early1),
+		       rcu_data.n_synchronize_rcu_flip_early2,
+
+		       rcu_data.n_next_add,
+		       rcu_data.n_next_length,
+		       rcu_data.n_wait_add,
+		       rcu_data.n_wait_length,
+		       rcu_data.n_rcu_check_callbacks,
+		       rcu_data.n_rcu_process_callbacks,
+		       rcu_data.n_done_length,
+		       rcu_data.n_done_remove,
+		       atomic_read(&rcu_data.n_done_invoked));
+}
+
+int rcu_read_proc_gp_data(char *page)
+{
+	long oldgp = rcu_ctrlblk.batch;
+
+	synchronize_rcu();
+	return sprintf(page, "oldggp=%ld  newggp=%ld\n",
+		       oldgp, rcu_ctrlblk.batch);
+}
+
+int rcu_read_proc_wqgp_data(char *page)
+{
+	long oldgp = rcu_ctrlblk.batch;
+
+	schedule_work(&synchronize_rcu_wq);
+	return sprintf(page, "oldggp=%ld\n", oldgp);
+}
+
+int rcu_read_proc_ptrs_data(char *page)
+{
+	return sprintf(page,
+		       "nl=%p/%p nt=%p wl=%p/%p wt=%p dl=%p/%p dt=%p\n",
+		       &rcu_data.nextlist, rcu_data.nextlist, rcu_data.nexttail,
+		       &rcu_data.waitlist, rcu_data.waitlist, rcu_data.waittail,
+		       &rcu_data.donelist, rcu_data.donelist, rcu_data.donetail
+		      );
+}
+#endif /* #ifdef CONFIG_RCU_STATS */
+
+EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+EXPORT_SYMBOL(synchronize_sched);  /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL(rcu_read_lock);  /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL(rcu_read_unlock);  /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL(synchronize_kernel);  /* WARNING: Removal in April 2006. */
+
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC] RCU and CONFIG_PREEMPT_RT progress, part 2
  2005-07-11 15:07 ` Paul E. McKenney
@ 2005-07-11 15:16   ` Ingo Molnar
  2005-07-11 15:51     ` Paul E. McKenney
  0 siblings, 1 reply; 4+ messages in thread
From: Ingo Molnar @ 2005-07-11 15:16 UTC (permalink / raw)
  To: Paul E. McKenney; +Cc: linux-kernel, dipankar

* Paul E. McKenney <paulmck@us.ibm.com> wrote:

> Hello!
> 
> More progress on CONFIG_PREEMPT_RT-compatible RCU.
> 
> o	Continued prototyping Linux-kernel implementation, still
> 	in the CONFIG_PREEMPT environment.

cool! With the debugging code removed it doesnt look all that complex.  
Do you think i can attempt to plug this into the -RT tree, or should i 
wait some more? (One observation: if you know some branch is slowpath in 
a common codepath then it's useful to mark the condition via unlikely().  
That results in better code layout and is also a guidance for the casual 
reader of the code.)

	Ingo

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC] RCU and CONFIG_PREEMPT_RT progress, part 2
  2005-07-11 15:16   ` Ingo Molnar
@ 2005-07-11 15:51     ` Paul E. McKenney
  0 siblings, 0 replies; 4+ messages in thread
From: Paul E. McKenney @ 2005-07-11 15:51 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, dipankar

On Mon, Jul 11, 2005 at 05:16:27PM +0200, Ingo Molnar wrote:
> 
> * Paul E. McKenney <paulmck@us.ibm.com> wrote:
> 
> > Hello!
> > 
> > More progress on CONFIG_PREEMPT_RT-compatible RCU.
> > 
> > o	Continued prototyping Linux-kernel implementation, still
> > 	in the CONFIG_PREEMPT environment.
> 
> cool! With the debugging code removed it doesnt look all that complex.  
> Do you think i can attempt to plug this into the -RT tree, or should i 
> wait some more? (One observation: if you know some branch is slowpath in 
> a common codepath then it's useful to mark the condition via unlikely().  
> That results in better code layout and is also a guidance for the casual 
> reader of the code.)

I would hold off, as it is still quite fragile.  I can make it break
quite easily, so it is not yet time to unleash it on all users of
CONFIG_PREEMPT_RT.

Good point on the unlikely(), updated the ones checking for races.
The checks in rcu_read_lock() and rcu_read_unlock() for racing
counter flips seem to be the most important, but I also added them
to the checks for concurrent synchronize_rcu() calls, see below.

							Thanx, Paul

diff -urpN -X dontdiff linux-2.6.12-rc6/arch/i386/Kconfig linux-2.6.12-rc6-ctrRCU/arch/i386/Kconfig
--- linux-2.6.12-rc6/arch/i386/Kconfig	2005-06-17 16:34:16.000000000 -0700
+++ linux-2.6.12-rc6-ctrRCU/arch/i386/Kconfig	2005-06-25 11:45:40.000000000 -0700
@@ -523,6 +523,34 @@ config PREEMPT
 	  Say Y here if you are building a kernel for a desktop, embedded
 	  or real-time system.  Say N if you are unsure.
 
+config PREEMPT_RCU
+	bool "Preemptible RCU read-side critical sections"
+	depends on PREEMPT
+	default y
+	help
+	  This option reduces the latency of the kernel when reacting to
+	  real-time or interactive events by allowing a low priority process to
+	  be preempted even if it is in an RCU read-side critical section.
+	  This allows applications to run more reliably even when the system is
+	  under load.
+
+	  Say Y here if you enjoy debugging random oopses.
+	  Say N if, for whatever reason, you want your kernel to actually work.
+
+config RCU_STATS
+	bool "/proc stats for preemptible RCU read-side critical sections"
+	depends on PREEMPT_RCU
+	default y
+	help
+	  This option provides /proc stats to provide debugging info.
+	  real-time or interactive events by allowing a low priority process to
+	  be preempted even if it is in an RCU read-side critical section.
+	  This allows applications to run more reliably even when the system is
+	  under load.
+
+	  Say Y here if you want to see RCU stats in /proc
+	  Say N if you are unsure.
+
 config PREEMPT_BKL
 	bool "Preempt The Big Kernel Lock"
 	depends on PREEMPT
diff -urpN -X dontdiff linux-2.6.12-rc6/fs/proc/proc_misc.c linux-2.6.12-rc6-ctrRCU/fs/proc/proc_misc.c
--- linux-2.6.12-rc6/fs/proc/proc_misc.c	2005-06-17 16:35:03.000000000 -0700
+++ linux-2.6.12-rc6-ctrRCU/fs/proc/proc_misc.c	2005-06-29 11:22:11.000000000 -0700
@@ -549,6 +549,48 @@ void create_seq_entry(char *name, mode_t
 		entry->proc_fops = f;
 }
 
+#ifdef CONFIG_RCU_STATS
+int rcu_read_proc(char *page, char **start, off_t off,
+		  int count, int *eof, void *data)
+{
+	int len;
+	extern int rcu_read_proc_data(char *page);
+
+	len = rcu_read_proc_data(page);
+	return proc_calc_metrics(page, start, off, count, eof, len);
+}
+
+int rcu_read_proc_gp(char *page, char **start, off_t off,
+		     int count, int *eof, void *data)
+{
+	int len;
+	extern int rcu_read_proc_gp_data(char *page);
+
+	len = rcu_read_proc_gp_data(page);
+	return proc_calc_metrics(page, start, off, count, eof, len);
+}
+
+int rcu_read_proc_wqgp(char *page, char **start, off_t off,
+		       int count, int *eof, void *data)
+{
+	int len;
+	extern int rcu_read_proc_wqgp_data(char *page);
+
+	len = rcu_read_proc_wqgp_data(page);
+	return proc_calc_metrics(page, start, off, count, eof, len);
+}
+
+int rcu_read_proc_ptrs(char *page, char **start, off_t off,
+		       int count, int *eof, void *data)
+{
+	int len;
+	extern int rcu_read_proc_ptrs_data(char *page);
+
+	len = rcu_read_proc_ptrs_data(page);
+	return proc_calc_metrics(page, start, off, count, eof, len);
+}
+#endif /* #ifdef CONFIG_RCU_STATS */
+
 void __init proc_misc_init(void)
 {
 	struct proc_dir_entry *entry;
@@ -571,6 +613,12 @@ void __init proc_misc_init(void)
 		{"cmdline",	cmdline_read_proc},
 		{"locks",	locks_read_proc},
 		{"execdomains",	execdomains_read_proc},
+#ifdef CONFIG_RCU_STATS
+		{"rcustats",	rcu_read_proc},
+		{"rcugp",	rcu_read_proc_gp},
+		{"rcuwqgp",	rcu_read_proc_wqgp},
+		{"rcuptrs",	rcu_read_proc_ptrs},
+#endif /* #ifdef CONFIG_RCU_STATS */
 		{NULL,}
 	};
 	for (p = simple_ones; p->name; p++)
diff -urpN -X dontdiff linux-2.6.12-rc6/include/linux/rcupdate.h linux-2.6.12-rc6-ctrRCU/include/linux/rcupdate.h
--- linux-2.6.12-rc6/include/linux/rcupdate.h	2005-06-17 16:35:13.000000000 -0700
+++ linux-2.6.12-rc6-ctrRCU/include/linux/rcupdate.h	2005-07-01 14:55:35.000000000 -0700
@@ -59,6 +59,7 @@ struct rcu_head {
 } while (0)
 
 
+#ifndef CONFIG_PREEMPT_RCU
 
 /* Global control variables for rcupdate callback mechanism. */
 struct rcu_ctrlblk {
@@ -192,6 +193,18 @@ static inline int rcu_pending(int cpu)
  */
 #define rcu_read_unlock()	preempt_enable()
 
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+
+#define rcu_qsctr_inc(cpu)
+#define rcu_bh_qsctr_inc(cpu)
+#define call_rcu_bh(head, rcu) call_rcu(head, rcu)
+
+extern void rcu_read_lock(void);
+extern void rcu_read_unlock(void);
+extern int rcu_pending(int cpu);
+
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */
+
 /*
  * So where is rcu_write_lock()?  It does not exist, as there is no
  * way for writers to lock out RCU readers.  This is a feature, not
@@ -213,14 +226,22 @@ static inline int rcu_pending(int cpu)
  * can use just rcu_read_lock().
  *
  */
+#ifndef CONFIG_PREEMPT_RCU
 #define rcu_read_lock_bh()	local_bh_disable()
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+#define rcu_read_lock_bh()	{ rcu_read_lock(); local_bh_disable(); }
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */
 
 /*
  * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section
  *
  * See rcu_read_lock_bh() for more information.
  */
+#ifndef CONFIG_PREEMPT_RCU
 #define rcu_read_unlock_bh()	local_bh_enable()
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+#define rcu_read_unlock_bh()	{ local_bh_enable(); rcu_read_unlock(); }
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */
 
 /**
  * rcu_dereference - fetch an RCU-protected pointer in an
@@ -269,7 +290,11 @@ static inline int rcu_pending(int cpu)
  * synchronize_kernel() API.  In contrast, synchronize_rcu() only
  * guarantees that rcu_read_lock() sections will have completed.
  */
+#ifndef CONFIG_PREEMPT_RCU
 #define synchronize_sched() synchronize_rcu()
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+extern void synchronize_sched(void);
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */
 
 extern void rcu_init(void);
 extern void rcu_check_callbacks(int cpu, int user);
diff -urpN -X dontdiff linux-2.6.12-rc6/include/linux/sched.h linux-2.6.12-rc6-ctrRCU/include/linux/sched.h
--- linux-2.6.12-rc6/include/linux/sched.h	2005-06-17 16:35:13.000000000 -0700
+++ linux-2.6.12-rc6-ctrRCU/include/linux/sched.h	2005-06-22 17:25:49.000000000 -0700
@@ -740,6 +740,11 @@ struct task_struct {
 	nodemask_t mems_allowed;
 	int cpuset_mems_generation;
 #endif
+#ifdef CONFIG_PREEMPT_RCU
+	int rcu_read_lock_nesting;
+	atomic_t *rcu_flipctr1;
+	atomic_t *rcu_flipctr2;
+#endif
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
diff -urpN -X dontdiff linux-2.6.12-rc6/kernel/rcupdate.c linux-2.6.12-rc6-ctrRCU/kernel/rcupdate.c
--- linux-2.6.12-rc6/kernel/rcupdate.c	2005-06-17 16:35:15.000000000 -0700
+++ linux-2.6.12-rc6-ctrRCU/kernel/rcupdate.c	2005-07-11 08:47:08.000000000 -0700
@@ -46,6 +46,9 @@
 #include <linux/notifier.h>
 #include <linux/rcupdate.h>
 #include <linux/cpu.h>
+#include <linux/workqueue.h>
+
+#ifndef CONFIG_PREEMPT_RCU
 
 /* Definition for rcupdate control block. */
 struct rcu_ctrlblk rcu_ctrlblk = 
@@ -480,3 +483,526 @@ EXPORT_SYMBOL(call_rcu);  /* WARNING: GP
 EXPORT_SYMBOL(call_rcu_bh);  /* WARNING: GPL-only in April 2006. */
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 EXPORT_SYMBOL(synchronize_kernel);  /* WARNING: GPL-only in April 2006. */
+
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+
+#ifdef CONFIG_PREEMPT_RT
+#error "Hey!!!  This fails miserably even in straightforward testing!!!"
+#endif /* #ifdef CONFIG_PREEMPT_RT */
+
+#define raw_spinlock_t spinlock_t
+#define _raw_spin_lock(l) spin_lock(l)
+#define _raw_spin_unlock(l) spin_unlock(l)
+#define RAW_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED
+
+struct rcu_data {
+	raw_spinlock_t	lock;
+	long		batch;
+	struct tasklet_struct rcu_tasklet;
+	struct rcu_head *nextlist;
+	struct rcu_head **nexttail;
+	struct rcu_head *waitlist;
+	struct rcu_head **waittail;
+	struct rcu_head *donelist;
+	struct rcu_head **donetail;
+	long		n_next_length;
+#ifdef CONFIG_RCU_STATS
+	long		n_next_add;
+	long		n_wait_length;
+	long		n_wait_add;
+	long		n_done_length;
+	long		n_done_add;
+	long		n_done_remove;
+	atomic_t	n_done_invoked;
+	long		n_rcu_process_callbacks;
+	long		n_rcu_check_callbacks;
+	long		n_rcu_advance_callbacks1;
+	long		n_rcu_advance_callbacks2;
+	long		n_rcu_advance_callbacks3;
+	long		n_rcu_advance_callbacks4;
+	long		n_rcu_advance_callbacks5;
+	atomic_t	n_synchronize_rcu;
+	atomic_t	n_synchronize_rcu_early1;
+	atomic_t	n_synchronize_rcu_flip1; /*&&&&*/
+	atomic_t	n_synchronize_rcu_flip2; /*&&&&*/
+	atomic_t	n_synchronize_rcu_flip3; /*&&&&*/
+	atomic_t	n_synchronize_rcu_flip4; /*&&&&*/
+	atomic_t	n_synchronize_rcu_flip5; /*&&&&*/
+	atomic_t	n_synchronize_rcu_flip6; /*&&&&*/
+	atomic_t	n_synchronize_rcu_flip_early1;
+	long		n_synchronize_rcu_flip_early2;
+#endif /* #ifdef CONFIG_RCU_STATS */
+};
+struct rcu_ctrlblk {
+	long		batch;
+};
+static struct rcu_data rcu_data;
+static struct rcu_ctrlblk rcu_ctrlblk = {
+	.batch = 0,
+};
+DECLARE_MUTEX(rcu_flipmutex);
+static DEFINE_PER_CPU(atomic_t [2], rcu_flipctr) =
+	{ ATOMIC_INIT(0), ATOMIC_INIT(0) };
+
+#define RCU_NEXT_LIST_TRIGGER_LENGTH 100
+
+void
+rcu_read_lock(void)
+{
+	int flipctr;
+	unsigned long oldirq;
+
+	local_irq_save(oldirq);	/* @@@ inside "if" for PREEMPT_RT... */
+	if (current->rcu_read_lock_nesting++ == 0) {
+
+		/*
+		 * Outermost nesting of rcu_read_lock(), so atomically
+		 * increment the current counter for the current CPU.
+		 */
+
+		flipctr = rcu_ctrlblk.batch & 0x1;
+		smp_read_barrier_depends();
+		current->rcu_flipctr1 = &(__get_cpu_var(rcu_flipctr)[flipctr]);
+		/* Can optimize to non-atomic on fastpath, but start simple. */
+		atomic_inc(current->rcu_flipctr1);
+		smp_mb__after_atomic_inc();  /* might optimize out... */
+		if (unlikely(flipctr != (rcu_ctrlblk.batch & 0x1))) {
+
+			/*
+			 * We raced with grace-period processing (flip).
+			 * Although we cannot be preempted here, there
+			 * could be interrupts, ECC errors and the like,
+			 * so just nail down both sides of the rcu_flipctr
+			 * array for the duration of our RCU read-side
+			 * critical section, preventing a second flip
+			 * from racing with us.  At some point, it would
+			 * be safe to decrement one of the counters, but
+			 * we have no way of knowing when that would be.
+			 * So just decrement them both in rcu_read_unlock().
+			 */
+
+			current->rcu_flipctr2 =
+				&(__get_cpu_var(rcu_flipctr)[!flipctr]);
+			/* Can again optimize to non-atomic on fastpath. */
+			atomic_inc(current->rcu_flipctr2);
+			smp_mb__after_atomic_inc();  /* might optimize out... */
+		}
+	}
+#ifndef CONFIG_PREEMPT_RT
+	preempt_disable();
+#endif /* #ifndef CONFIG_PREEMPT_RT */
+	local_irq_restore(oldirq);  /*@@@ should be able to precede. */
+}
+
+void
+rcu_read_unlock(void)
+{
+#ifndef CONFIG_PREEMPT_RT
+	unsigned long oldirq;
+
+	local_irq_save(oldirq);  /* @@@ should be able to reverse... */
+	preempt_enable();
+#endif /* #ifndef CONFIG_PREEMPT_RT */
+	if (--current->rcu_read_lock_nesting == 0) {
+
+		/*
+		 * Just atomically decrement whatever we incremented.
+		 * Might later want to awaken some task waiting for the
+		 * grace period to complete, but keep it simple for the
+		 * moment.
+		 */
+
+		smp_mb__before_atomic_dec();
+		atomic_dec(current->rcu_flipctr1);
+		current->rcu_flipctr1 = NULL;
+		if (unlikely(current->rcu_flipctr2 != NULL)) {
+			atomic_dec(current->rcu_flipctr2);
+			current->rcu_flipctr2 = NULL;
+		}
+	}
+#ifndef CONFIG_PREEMPT_RT
+	local_irq_restore(oldirq);
+#endif /* #ifndef CONFIG_PREEMPT_RT */
+}
+
+static void
+__rcu_advance_callbacks(void)
+{
+
+#ifdef CONFIG_RCU_STATS
+	rcu_data.n_rcu_advance_callbacks1++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+	if (rcu_data.batch != rcu_ctrlblk.batch) {
+#ifdef CONFIG_RCU_STATS
+		rcu_data.n_rcu_advance_callbacks2++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+		if (rcu_data.waitlist != NULL) {
+#ifdef CONFIG_RCU_STATS
+			rcu_data.n_rcu_advance_callbacks3++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+			*rcu_data.donetail = rcu_data.waitlist;
+			rcu_data.donetail = rcu_data.waittail;
+#ifdef CONFIG_RCU_STATS
+			rcu_data.n_done_length += rcu_data.n_wait_length;
+			rcu_data.n_done_add += rcu_data.n_wait_length;
+			rcu_data.n_wait_length = 0;
+#endif /* #ifdef CONFIG_RCU_STATS */
+		}
+		if (rcu_data.nextlist != NULL) {
+#ifdef CONFIG_RCU_STATS
+			rcu_data.n_rcu_advance_callbacks4++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+			rcu_data.waitlist = rcu_data.nextlist;
+			rcu_data.waittail = rcu_data.nexttail;
+			rcu_data.nextlist = NULL;
+			rcu_data.nexttail = &rcu_data.nextlist;
+#ifdef CONFIG_RCU_STATS
+			rcu_data.n_wait_length += rcu_data.n_next_length;
+			rcu_data.n_wait_add += rcu_data.n_next_length;
+#endif /* #ifdef CONFIG_RCU_STATS */
+			rcu_data.n_next_length = 0;
+		} else {
+#ifdef CONFIG_RCU_STATS
+			rcu_data.n_rcu_advance_callbacks5++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+			rcu_data.waitlist = NULL;
+			rcu_data.waittail = &rcu_data.waitlist;
+		}
+		rcu_data.batch = rcu_ctrlblk.batch;
+	}
+}
+
+void
+synchronize_rcu_wqh(void *dummy)
+{
+	synchronize_rcu();  /* @@@ need non-blocking attempt to flip... */
+}
+
+DECLARE_WORK(synchronize_rcu_wq, synchronize_rcu_wqh, NULL);
+
+void
+rcu_check_callbacks(int cpu, int user)
+{
+	_raw_spin_lock(&rcu_data.lock);
+#ifdef CONFIG_RCU_STATS
+	rcu_data.n_rcu_check_callbacks++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+	__rcu_advance_callbacks();
+	if (rcu_data.donelist == NULL) {
+		_raw_spin_unlock(&rcu_data.lock);
+	} else {
+		_raw_spin_unlock(&rcu_data.lock);
+		tasklet_schedule(&rcu_data.rcu_tasklet);
+	}
+	if (unlikely(rcu_data.n_next_length >= RCU_NEXT_LIST_TRIGGER_LENGTH)) {
+		schedule_work(&synchronize_rcu_wq);
+	}
+}
+
+static
+void rcu_process_callbacks(unsigned long data)
+{
+	struct rcu_head *next, *list;
+
+	_raw_spin_lock(&rcu_data.lock);
+#ifdef CONFIG_RCU_STATS
+	rcu_data.n_rcu_process_callbacks++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+	list = rcu_data.donelist;
+	if (list == NULL) {
+		_raw_spin_unlock(&rcu_data.lock);
+		return;
+	}
+	rcu_data.donelist = NULL;
+	rcu_data.donetail = &rcu_data.donelist;
+#ifdef CONFIG_RCU_STATS
+	rcu_data.n_done_remove += rcu_data.n_done_length;
+	rcu_data.n_done_length = 0;
+#endif /* #ifdef CONFIG_RCU_STATS */
+	_raw_spin_unlock(&rcu_data.lock);
+	while (list) {
+		next = list->next;
+		list->func(list);
+		list = next;
+#ifdef CONFIG_RCU_STATS
+		atomic_inc(&rcu_data.n_done_invoked);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	}
+}
+
+/*
+ * Force a single flip of the counters.  Remember, a single flip does
+ * -not- constitute a grace period.  Instead, the interval between
+ * a pair of consecutive flips is a grace period.
+ *
+ * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
+ * on a large SMP, they might want to use a hierarchical organization of
+ * the per-CPU-counter pairs.
+ */
+static void
+__synchronize_rcu_flip(void)
+{
+	int cpu;
+	long flipctr;
+
+#ifdef CONFIG_RCU_STATS
+	atomic_inc(&rcu_data.n_synchronize_rcu_flip3);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	flipctr = rcu_data.batch;
+	down(&rcu_flipmutex);
+	if (unlikely(flipctr != rcu_data.batch)) {
+	
+		/* Our work is done!  ;-) */
+
+#ifdef CONFIG_RCU_STATS
+		rcu_data.n_synchronize_rcu_flip_early2++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+		up(&rcu_flipmutex);
+		return;
+	}
+	flipctr &= 0x1;
+
+	/*
+	 * Wait for the completion of all RCU read-side critical sections
+	 * that started prior to the previous flip.
+	 */
+
+#ifdef CONFIG_RCU_STATS
+	atomic_inc(&rcu_data.n_synchronize_rcu_flip4);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	for_each_cpu(cpu) {
+		while (atomic_read(&per_cpu(rcu_flipctr, cpu)[!flipctr]) != 0) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_timeout(HZ / 100);
+		}
+	}
+
+	/* Do the flip. */
+
+#ifdef CONFIG_RCU_STATS
+	atomic_inc(&rcu_data.n_synchronize_rcu_flip5);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	smp_mb();
+	rcu_ctrlblk.batch++;
+	smp_mb(); /* Not 100% certain that this one is required. */
+		  /* In fact, it seems likely that it can go. */
+#ifdef CONFIG_RCU_STATS
+	atomic_inc(&rcu_data.n_synchronize_rcu_flip6);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	
+	/* A bit awkward -- looking ahead to per-CPU callback queues. */
+	/* Process callbacks... */
+
+	_raw_spin_lock(&rcu_data.lock);
+	__rcu_advance_callbacks();
+	_raw_spin_unlock(&rcu_data.lock);
+	tasklet_schedule(&rcu_data.rcu_tasklet);
+
+	up(&rcu_flipmutex);
+}
+
+/*
+ * Force a single flip of the counters, but do it lazily, waiting
+ * a bit to see if someone else will do it for us.
+ */
+static void
+synchronize_rcu_flip(void)
+{
+	long oldbatch = rcu_data.batch;
+
+#ifdef CONFIG_RCU_STATS
+	atomic_inc(&rcu_data.n_synchronize_rcu_flip1);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule_timeout(HZ / 100);
+	if (unlikely(rcu_data.batch != oldbatch)) {
+		
+		/* Our work is done! */
+
+#ifdef CONFIG_RCU_STATS
+		atomic_inc(&rcu_data.n_synchronize_rcu_flip_early1);
+#endif /* #ifdef CONFIG_RCU_STATS */
+		return;
+	}
+
+	/* No one else did it for us, so force the flip. */
+
+#ifdef CONFIG_RCU_STATS
+	atomic_inc(&rcu_data.n_synchronize_rcu_flip2);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	__synchronize_rcu_flip();
+}
+
+void
+synchronize_rcu(void)
+{
+	long oldbatch = rcu_data.batch;
+
+	/* Need two flips to get a grace period. */
+
+/*&&&&  printk("Entering synchronize_rcu()\n"); */
+#ifdef CONFIG_RCU_STATS
+	atomic_inc(&rcu_data.n_synchronize_rcu);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	synchronize_rcu_flip();
+
+	if (unlikely((rcu_data.batch - oldbatch) >= 2)) {
+
+		/* Someone else helped with the second flip, we are done! */
+
+#ifdef CONFIG_RCU_STATS
+		atomic_inc(&rcu_data.n_synchronize_rcu_early1);
+#endif /* #ifdef CONFIG_RCU_STATS */
+		return;
+	}
+
+	synchronize_rcu_flip();
+/*&&&&*/printk("Exiting synchronize_rcu()\n");
+}
+
+void fastcall
+call_rcu(struct rcu_head *head,
+	 void (*func)(struct rcu_head *rcu))
+{
+	head->func = func;
+	head->next = NULL;
+	_raw_spin_lock(&rcu_data.lock);
+	__rcu_advance_callbacks();
+	*rcu_data.nexttail = head;
+	rcu_data.nexttail = &head->next;
+#ifdef CONFIG_RCU_STATS
+	rcu_data.n_next_add++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+	rcu_data.n_next_length++;
+	_raw_spin_unlock(&rcu_data.lock);
+}
+
+/*
+ * Crude hack, reduces but does not eliminate possibility of failure.
+ * Needs to wait for all CPUs to pass through a -voluntary- context
+ * switch to eliminate possibility of failure.  (Maybe just crank
+ * priority down...)
+ */
+void
+synchronize_sched(void)
+{
+	cpumask_t oldmask;
+	int cpu;
+
+	if (sched_getaffinity(0, &oldmask) < 0) {
+		oldmask = cpu_possible_map; 
+	}
+	for_each_cpu(cpu) {
+		sched_setaffinity(0, cpumask_of_cpu(cpu));
+		schedule();
+	}
+	sched_setaffinity(0, oldmask);
+}
+
+int
+rcu_pending(int cpu)
+{
+	return (rcu_data.donelist != NULL ||
+		rcu_data.waitlist != NULL ||
+		rcu_data.nextlist != NULL);
+}
+
+void __init rcu_init(void)
+{
+/*&&&&*/printk("WARNING: experimental RCU implementation.\n");
+	rcu_data.lock = RAW_SPIN_LOCK_UNLOCKED;
+	rcu_data.batch = 0;
+	rcu_data.nextlist = NULL;
+	rcu_data.nexttail = &rcu_data.nextlist;
+	rcu_data.waitlist = NULL;
+	rcu_data.waittail = &rcu_data.waitlist;
+	rcu_data.donelist = NULL;
+	rcu_data.donetail = &rcu_data.donelist;
+	tasklet_init(&rcu_data.rcu_tasklet, rcu_process_callbacks, 0UL);
+}
+
+/*
+ * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
+ */
+void synchronize_kernel(void)
+{
+	synchronize_rcu();
+}
+
+#ifdef CONFIG_RCU_STATS
+int rcu_read_proc_data(char *page)
+{
+	return sprintf(page,
+		       "ggp=%ld lgp=%ld\n"
+		       "sr=%d srf1=%d srf2=%d srf3=%d srf4=%d srf5=%d srf6=%d\n"
+		       "ac1=%ld ac2=%ld ac3=%ld ac4=%ld ac5=%ld\n"
+		       "sre1=%d srfe1=%d srfe2=%ld\n"
+		       "na=%ld nl=%ld wa=%ld wl=%ld rcc=%ld rpc=%ld dl=%ld dr=%ld di=%d\n",
+		       rcu_ctrlblk.batch,
+		       rcu_data.batch,
+
+		       atomic_read(&rcu_data.n_synchronize_rcu),
+		       atomic_read(&rcu_data.n_synchronize_rcu_flip1),
+		       atomic_read(&rcu_data.n_synchronize_rcu_flip2),
+		       atomic_read(&rcu_data.n_synchronize_rcu_flip3),
+		       atomic_read(&rcu_data.n_synchronize_rcu_flip4),
+		       atomic_read(&rcu_data.n_synchronize_rcu_flip5),
+		       atomic_read(&rcu_data.n_synchronize_rcu_flip6),
+
+		       rcu_data.n_rcu_advance_callbacks1,
+		       rcu_data.n_rcu_advance_callbacks2,
+		       rcu_data.n_rcu_advance_callbacks3,
+		       rcu_data.n_rcu_advance_callbacks4,
+		       rcu_data.n_rcu_advance_callbacks5,
+
+		       atomic_read(&rcu_data.n_synchronize_rcu_early1),
+		       atomic_read(&rcu_data.n_synchronize_rcu_flip_early1),
+		       rcu_data.n_synchronize_rcu_flip_early2,
+
+		       rcu_data.n_next_add,
+		       rcu_data.n_next_length,
+		       rcu_data.n_wait_add,
+		       rcu_data.n_wait_length,
+		       rcu_data.n_rcu_check_callbacks,
+		       rcu_data.n_rcu_process_callbacks,
+		       rcu_data.n_done_length,
+		       rcu_data.n_done_remove,
+		       atomic_read(&rcu_data.n_done_invoked));
+}
+
+int rcu_read_proc_gp_data(char *page)
+{
+	long oldgp = rcu_ctrlblk.batch;
+
+	synchronize_rcu();
+	return sprintf(page, "oldggp=%ld  newggp=%ld\n",
+		       oldgp, rcu_ctrlblk.batch);
+}
+
+int rcu_read_proc_wqgp_data(char *page)
+{
+	long oldgp = rcu_ctrlblk.batch;
+
+	schedule_work(&synchronize_rcu_wq);
+	return sprintf(page, "oldggp=%ld\n", oldgp);
+}
+
+int rcu_read_proc_ptrs_data(char *page)
+{
+	return sprintf(page,
+		       "nl=%p/%p nt=%p wl=%p/%p wt=%p dl=%p/%p dt=%p\n",
+		       &rcu_data.nextlist, rcu_data.nextlist, rcu_data.nexttail,
+		       &rcu_data.waitlist, rcu_data.waitlist, rcu_data.waittail,
+		       &rcu_data.donelist, rcu_data.donelist, rcu_data.donetail
+		      );
+}
+#endif /* #ifdef CONFIG_RCU_STATS */
+
+EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+EXPORT_SYMBOL_GPL(synchronize_sched);
+EXPORT_SYMBOL(rcu_read_lock);  /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL(rcu_read_unlock);  /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL(synchronize_kernel);  /* WARNING: Removal in April 2006. */
+
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2005-07-11 18:33 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-06-25  1:38 [RFC] RCU and CONFIG_PREEMPT_RT progress, part 2 Paul E. McKenney
2005-07-11 15:07 ` Paul E. McKenney
2005-07-11 15:16   ` Ingo Molnar
2005-07-11 15:51     ` Paul E. McKenney

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox