All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] rcu_nest: Update description of rcu_nest.[hc]
@ 2019-05-29  4:47 Junchang Wang
  2019-05-29 13:30 ` Paul E. McKenney
  0 siblings, 1 reply; 2+ messages in thread
From: Junchang Wang @ 2019-05-29  4:47 UTC (permalink / raw)
  To: paulmck, akiyks; +Cc: perfbook, Junchang Wang

Commit 61e0eb9cdb89 ("rcu_nest: Fix concurrency issues") fixes a few
concurrency issues in rcu_nest.h and rcu_nest.c. This patch accordingly
updates the description in Appendix B Toy RCU Implementations. In addition, the
new scheme, which extracts snippets directly from rcu_nest.[hc], is applied.


Signed-off-by: Junchang Wang <junchangwang@gmail.com>
---

Thanks again for writing and sharing perfbook. I can always find great
pleasure in reading through a chapter of this book :-).

 CodeSamples/defer/rcu_nest.c |  35 ++++++++------
 CodeSamples/defer/rcu_nest.h |  36 +++++++++------
 appendix/toyrcu/toyrcu.tex   | 108 +++++++++++++++----------------------------
 3 files changed, 81 insertions(+), 98 deletions(-)

diff --git a/CodeSamples/defer/rcu_nest.c b/CodeSamples/defer/rcu_nest.c
index 362f466..7e36866 100644
--- a/CodeSamples/defer/rcu_nest.c
+++ b/CodeSamples/defer/rcu_nest.c
@@ -21,45 +21,52 @@
 #include "../api.h"
 #include "rcu_nest.h"
 
-void synchronize_rcu(void)
+//\begin{snippet}[labelbase=ln:defer:rcu_nest:synchronize,gobbleblank=yes,commandchars=\%\@\$]
+void synchronize_rcu(void)				//\lnlbl{syn:b}
 {
 	int t;
-
+								//\fcvblank
 	/* Memory barrier ensures mutation seen before grace period. */
 
-	smp_mb();
+	smp_mb();					//\lnlbl{syn:mb1}
 
 	/* Only one synchronize_rcu() at a time. */
 
-	spin_lock(&rcu_gp_lock);
+	spin_lock(&rcu_gp_lock);			//\lnlbl{syn:spinlock}
 
 	/* Advance to a new grace-period number, enforce ordering. */
 
-	WRITE_ONCE(rcu_gp_ctr, rcu_gp_ctr + RCU_GP_CTR_BOTTOM_BIT);
-	smp_mb();
+	WRITE_ONCE(rcu_gp_ctr, rcu_gp_ctr +		//\lnlbl{syn:incgp1}
+			RCU_GP_CTR_BOTTOM_BIT);		//\lnlbl{syn:incgp2}
+	smp_mb();					//\lnlbl{syn:mb2}
 
 	/*
 	 * Wait until all threads are either out of their RCU read-side
 	 * critical sections or are aware of the new grace period.
 	 */
 
-	for_each_thread(t) {
-		while (rcu_gp_ongoing(t) &&
-		       ((READ_ONCE(per_thread(rcu_reader_gp, t)) -
-			 rcu_gp_ctr) < 0)) {
+	for_each_thread(t) {				//\lnlbl{syn:scan:b}
+		while (rcu_gp_ongoing(t) &&		//\lnlbl{syn:ongoing}
+		       ((READ_ONCE(per_thread(rcu_reader_gp, t)) -//\lnlbl{syn:lt1}
+		          rcu_gp_ctr) < 0)) {		//\lnlbl{syn:lt2}
+#ifndef FCV_SNIPPET
 			/*@@@ poll(NULL, 0, 10); */
 			barrier();
+#else
+			poll(NULL, 0, 10);      	//\lnlbl{syn:poll}
+#endif
 		}
-	}
+	}						//\lnlbl{syn:scan:e}
 
 	/* Let other synchronize_rcu() instances move ahead. */
 
-	spin_unlock(&rcu_gp_lock);
+	spin_unlock(&rcu_gp_lock);			//\lnlbl{syn:spinunlock}
 
 	/* Ensure that any subsequent free()s happen -after- above checks. */
 
-	smp_mb();
-}
+	smp_mb();					//\lnlbl{syn:mb3}
+}							//\lnlbl{syn:e}
+//\end{snippet}
 
 #ifdef TEST
 #define RCU_READ_NESTABLE
diff --git a/CodeSamples/defer/rcu_nest.h b/CodeSamples/defer/rcu_nest.h
index 7e7b7de..19a0956 100644
--- a/CodeSamples/defer/rcu_nest.h
+++ b/CodeSamples/defer/rcu_nest.h
@@ -42,25 +42,28 @@ static void rcu_init(void)
 	init_per_thread(rcu_reader_gp, 0);
 }
 
-static void rcu_read_lock(void)
+//\begin{snippet}[labelbase=ln:defer:rcu_nest:read_lock_unlock,gobbleblank=yes,commandchars=\%\@\$]
+static void rcu_read_lock(void)				//\lnlbl{lock:b}
 {
 	unsigned long tmp;
 	unsigned long *rrgp;
-
+								//\fcvblank
 	/*
 	 * If this is the outermost RCU read-side critical section,
 	 * copy the global grace-period counter.  In either case,
 	 * increment the nesting count held in the low-order bits.
 	 */
 
-	rrgp = &__get_thread_var(rcu_reader_gp);
+	rrgp = &__get_thread_var(rcu_reader_gp);	//\lnlbl{lock:readgp}
+#ifndef FCV_SNIPPET
 retry:
-	tmp = *rrgp;
-	if ((tmp & RCU_GP_CTR_NEST_MASK) == 0)
-		tmp = READ_ONCE(rcu_gp_ctr);
-	tmp++;
-	WRITE_ONCE(*rrgp, tmp);
-	smp_mb();
+#endif
+	tmp = *rrgp;					//\lnlbl{lock:wtmp1}
+	if ((tmp & RCU_GP_CTR_NEST_MASK) == 0)		//\lnlbl{lock:checktmp}
+		tmp = READ_ONCE(rcu_gp_ctr);		//\lnlbl{lock:wtmp2}
+	tmp++;						//\lnlbl{lock:inctmp}
+	WRITE_ONCE(*rrgp, tmp);				//\lnlbl{lock:writegp}
+	smp_mb();					//\lnlbl{lock:mb1}
 
 	/*
 	 * A reader could be suspended in between fetching the value of *rrgp
@@ -71,14 +74,15 @@ retry:
 	 * ULONG_MAX. To handle this correctly, we adopt the helper function
 	 * ULONG_CMP_GE.
 	 */
-
+#ifndef FCV_SNIPPET
 	if (((tmp & RCU_GP_CTR_NEST_MASK) == 1) &&
 	     ULONG_CMP_GE(READ_ONCE(rcu_gp_ctr), tmp + MAX_GP_ADV_DISTANCE)) {
 		WRITE_ONCE(*rrgp, *rrgp - 1);
 		goto retry;
 	}
+#endif
 }
-
+								//\fcvblank
 static void rcu_read_unlock(void)
 {
 	/*
@@ -86,11 +90,15 @@ static void rcu_read_unlock(void)
 	 * which had better not initially be zero.
 	 */
 
-	smp_mb();
+	smp_mb();					//\lnlbl{unlock:mb1}
+#ifndef FCV_SNIPPET
 #ifdef DEBUG_EXTREME
 	BUG_ON((__get_thread_var(rcu_reader_gp) & RCU_GP_CTR_NEST_MASK) != 0);
-#endif /* #ifdef DEBUG_EXTREME */
-	__get_thread_var(rcu_reader_gp)--;
+#endif /* #ifdef DEBUG_EXTREME */				//\fcvexclude
+#endif
+	__get_thread_var(rcu_reader_gp)--;		//\lnlbl{unlock:decgp}
 }
+								//\fcvblank
+//\end{snippet}
 
 extern void synchronize_rcu(void);
diff --git a/appendix/toyrcu/toyrcu.tex b/appendix/toyrcu/toyrcu.tex
index ded754d..0e84b05 100644
--- a/appendix/toyrcu/toyrcu.tex
+++ b/appendix/toyrcu/toyrcu.tex
@@ -1395,60 +1395,15 @@ variables.
 \end{listing}
 
 \begin{listing}[tb]
-{ \scriptsize
-\begin{verbbox}
- 1 static void rcu_read_lock(void)
- 2 {
- 3   long tmp;
- 4   long *rrgp;
- 5 
- 6   rrgp = &__get_thread_var(rcu_reader_gp);
- 7   tmp = *rrgp;
- 8   if ((tmp & RCU_GP_CTR_NEST_MASK) == 0)
- 9     tmp = ACCESS_ONCE(rcu_gp_ctr);
-10   tmp++;
-11   *rrgp = tmp;
-12   smp_mb();
-13 }
-14 
-15 static void rcu_read_unlock(void)
-16 {
-17   long tmp;
-18 
-19   smp_mb();
-20   __get_thread_var(rcu_reader_gp)--;
-21 }
-22 
-23 void synchronize_rcu(void)
-24 {
-25   int t;
-26 
-27   smp_mb();
-28   spin_lock(&rcu_gp_lock);
-29   ACCESS_ONCE(rcu_gp_ctr) +=
-30     RCU_GP_CTR_BOTTOM_BIT;
-31   smp_mb();
-32   for_each_thread(t) {
-33     while (rcu_gp_ongoing(t) &&
-34            ((per_thread(rcu_reader_gp, t) -
-35              rcu_gp_ctr) < 0)) {
-36       poll(NULL, 0, 10);
-37     }
-38   }
-39   spin_unlock(&rcu_gp_lock);
-40   smp_mb();
-41 }
-\end{verbbox}
-}
-\centering
-\theverbbox
+\input{CodeSamples/defer/rcu_nest@read_lock_unlock.fcv}\vspace*{-11pt}\fvset{firstnumber=last}
+\input{CodeSamples/defer/rcu_nest@synchronize.fcv}\fvset{firstnumber=auto}
 \caption{Nestable RCU Using a Free-Running Counter}
 \label{lst:app:toyrcu:Nestable RCU Using a Free-Running Counter}
 \end{listing}
 
 Listing~\ref{lst:app:toyrcu:Nestable RCU Using a Free-Running Counter}
 (\path{rcu_nest.h} and \path{rcu_nest.c})
-show an RCU implementation based on a single global free-running counter,
+shows an RCU implementation based on a single global free-running counter,
 but that permits nesting of RCU read-side critical sections.
 This nestability is accomplished by reserving the low-order bits of the
 global \co{rcu_gp_ctr} to count nesting, using the definitions shown in
@@ -1472,23 +1427,28 @@ reserves seven bits, for a maximum RCU read-side critical-section
 nesting depth of 127, which should be well in excess of that needed
 by most applications.
 
+\begin{lineref}[ln:defer:rcu_nest:read_lock_unlock:lock]
 The resulting \co{rcu_read_lock()} implementation is still reasonably
 straightforward.
-Line~6 places a pointer to this thread's instance of \co{rcu_reader_gp}
+Line~\lnref{readgp} places a pointer to
+this thread's instance of \co{rcu_reader_gp}
 into the local variable \co{rrgp}, minimizing the number of expensive
 calls to the pthreads thread-local-state API.
-Line~7 records the current value of \co{rcu_reader_gp} into another
-local variable \co{tmp}, and line~8 checks to see if the low-order
-bits are zero, which would indicate that this is the outermost
-\co{rcu_read_lock()}.
-If so, line~9 places the global \co{rcu_gp_ctr} into \co{tmp} because
-the current value previously fetched by line~7 is likely to be obsolete.
-In either case, line~10 increments the nesting depth, which you will
-recall is stored in the seven low-order bits of the counter.
-Line~11 stores the updated counter back into this thread's instance
-of \co{rcu_reader_gp}, and, finally, line~12 executes a memory
-barrier to prevent the RCU read-side critical section from bleeding out
+Line~\lnref{wtmp1} records the current value of \co{rcu_reader_gp}
+into another local variable \co{tmp}, and line~\lnref{checktmp} checks
+to see if the low-order bits are zero, which would indicate that
+this is the outermost \co{rcu_read_lock()}.
+If so, line~\lnref{wtmp2} places the global \co{rcu_gp_ctr}
+into \co{tmp} because the current value previously fetched by
+line~\lnref{wtmp1} is likely to be obsolete.
+In either case, line~\lnref{inctmp} increments the nesting depth,
+which you will recall is stored in the seven low-order bits of the counter.
+Line~\lnref{writegp} stores the updated counter back into this thread's
+instance of \co{rcu_reader_gp}, and,
+finally, line~\lnref{mb1} executes a memory barrier
+to prevent the RCU read-side critical section from bleeding out
 into the code preceding the call to \co{rcu_read_lock()}.
+\end{lineref}
 
 In other words, this implementation of \co{rcu_read_lock()} picks up a copy
 of the global \co{rcu_gp_ctr} unless the current invocation of
@@ -1499,29 +1459,35 @@ Either way, it increments whatever value it fetched in order to record
 an additional nesting level, and stores the result in the current
 thread's instance of \co{rcu_reader_gp}.
 
+\begin{lineref}[ln:defer:rcu_nest:read_lock_unlock:unlock]
 Interestingly enough, despite their \co{rcu_read_lock()} differences,
 the implementation of \co{rcu_read_unlock()}
 is broadly similar to that shown in
 Section~\ref{sec:app:toyrcu:RCU Based on Free-Running Counter}.
-Line~19 executes a memory barrier in order to prevent the RCU read-side
+Line~\lnref{mb1} executes a memory barrier
+in order to prevent the RCU read-side
 critical section from bleeding out into code following the call
 to \co{rcu_read_unlock()}, and
-line~20 decrements this thread's instance of \co{rcu_reader_gp},
+line~\lnref{decgp} decrements this thread's instance of \co{rcu_reader_gp},
 which has the effect of decrementing the nesting count contained in
 \co{rcu_reader_gp}'s low-order bits.
 Debugging versions of this primitive would check (before decrementing!)
 that these low-order bits were non-zero.
+\end{lineref}
 
+\begin{lineref}[ln:defer:rcu_nest:synchronize:syn]
 The implementation of \co{synchronize_rcu()} is quite similar to
 that shown in
 Section~\ref{sec:app:toyrcu:RCU Based on Free-Running Counter}.
 There are two differences.
-The first is that lines~29 and~30 adds \co{RCU_GP_CTR_BOTTOM_BIT}
-to the global \co{rcu_gp_ctr} instead of adding the constant ``2'',
-and the second is that the comparison on line~33 has been abstracted
-out to a separate function, where it checks the bit indicated
-by \co{RCU_GP_CTR_BOTTOM_BIT} instead of unconditionally checking
-the low-order bit.
+The first is that lines~\lnref{incgp1} and~\lnref{incgp2}
+add \co{RCU_GP_CTR_BOTTOM_BIT} to the global \co{rcu_gp_ctr}
+instead of adding the constant ``2'',
+and the second is that the comparison on line~\lnref{ongoing}
+has been abstracted out to a separate function,
+where it checks the bit indicated by \co{RCU_GP_CTR_BOTTOM_BIT}
+instead of unconditionally checking the low-order bit.
+\end{lineref}
 
 This approach achieves read-side performance almost equal to that
 shown in
@@ -1562,10 +1528,12 @@ overhead.
 	how could you double the time required to overflow the global
 	\co{rcu_gp_ctr}?
 \QuickQuizAnswer{
+	\begin{lineref}[ln:defer:rcu_nest:synchronize:syn]
 	One way would be to replace the magnitude comparison on
-	lines~33 and 34 with an inequality check of the per-thread
-	\co{rcu_reader_gp} variable against
+	lines~\lnref{lt1} and~\lnref{lt2} with an inequality check of
+	the per-thread \co{rcu_reader_gp} variable against
 	\co{rcu_gp_ctr+RCU_GP_CTR_BOTTOM_BIT}.
+	\end{lineref}
 } \QuickQuizEnd
 
 \QuickQuiz{}
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH] rcu_nest: Update description of rcu_nest.[hc]
  2019-05-29  4:47 [PATCH] rcu_nest: Update description of rcu_nest.[hc] Junchang Wang
@ 2019-05-29 13:30 ` Paul E. McKenney
  0 siblings, 0 replies; 2+ messages in thread
From: Paul E. McKenney @ 2019-05-29 13:30 UTC (permalink / raw)
  To: Junchang Wang; +Cc: akiyks, perfbook

On Wed, May 29, 2019 at 12:47:42PM +0800, Junchang Wang wrote:
> Commit 61e0eb9cdb89 ("rcu_nest: Fix concurrency issues") fixes a few
> concurrency issues in rcu_nest.h and rcu_nest.c. This patch accordingly
> updates the description in Appendix B Toy RCU Implementations. Besides, the
> new scheme, which directly extracts snippet from rcu_nest.[hc], is applied.
> 
> 
> Signed-off-by: Junchang Wang <junchangwang@gmail.com>

Applied and pushed, thank you, Junchang!

							Thanx, Paul

> ---
> 
> Thanks again for writing and sharing perfbook. I can always find great
> pleasure in reading through a chapter of this book :-).
> 
>  CodeSamples/defer/rcu_nest.c |  35 ++++++++------
>  CodeSamples/defer/rcu_nest.h |  36 +++++++++------
>  appendix/toyrcu/toyrcu.tex   | 108 +++++++++++++++----------------------------
>  3 files changed, 81 insertions(+), 98 deletions(-)
> 
> diff --git a/CodeSamples/defer/rcu_nest.c b/CodeSamples/defer/rcu_nest.c
> index 362f466..7e36866 100644
> --- a/CodeSamples/defer/rcu_nest.c
> +++ b/CodeSamples/defer/rcu_nest.c
> @@ -21,45 +21,52 @@
>  #include "../api.h"
>  #include "rcu_nest.h"
>  
> -void synchronize_rcu(void)
> +//\begin{snippet}[labelbase=ln:defer:rcu_nest:synchronize,gobbleblank=yes,commandchars=\%\@\$]
> +void synchronize_rcu(void)				//\lnlbl{syn:b}
>  {
>  	int t;
> -
> +								//\fcvblank
>  	/* Memory barrier ensures mutation seen before grace period. */
>  
> -	smp_mb();
> +	smp_mb();					//\lnlbl{syn:mb1}
>  
>  	/* Only one synchronize_rcu() at a time. */
>  
> -	spin_lock(&rcu_gp_lock);
> +	spin_lock(&rcu_gp_lock);			//\lnlbl{syn:spinlock}
>  
>  	/* Advance to a new grace-period number, enforce ordering. */
>  
> -	WRITE_ONCE(rcu_gp_ctr, rcu_gp_ctr + RCU_GP_CTR_BOTTOM_BIT);
> -	smp_mb();
> +	WRITE_ONCE(rcu_gp_ctr, rcu_gp_ctr +		//\lnlbl{syn:incgp1}
> +			RCU_GP_CTR_BOTTOM_BIT);		//\lnlbl{syn:incgp2}
> +	smp_mb();					//\lnlbl{syn:mb2}
>  
>  	/*
>  	 * Wait until all threads are either out of their RCU read-side
>  	 * critical sections or are aware of the new grace period.
>  	 */
>  
> -	for_each_thread(t) {
> -		while (rcu_gp_ongoing(t) &&
> -		       ((READ_ONCE(per_thread(rcu_reader_gp, t)) -
> -			 rcu_gp_ctr) < 0)) {
> +	for_each_thread(t) {				//\lnlbl{syn:scan:b}
> +		while (rcu_gp_ongoing(t) &&		//\lnlbl{syn:ongoing}
> +		       ((READ_ONCE(per_thread(rcu_reader_gp, t)) -//\lnlbl{syn:lt1}
> +		          rcu_gp_ctr) < 0)) {		//\lnlbl{syn:lt2}
> +#ifndef FCV_SNIPPET
>  			/*@@@ poll(NULL, 0, 10); */
>  			barrier();
> +#else
> +			poll(NULL, 0, 10);      	//\lnlbl{syn:poll}
> +#endif
>  		}
> -	}
> +	}						//\lnlbl{syn:scan:e}
>  
>  	/* Let other synchronize_rcu() instances move ahead. */
>  
> -	spin_unlock(&rcu_gp_lock);
> +	spin_unlock(&rcu_gp_lock);			//\lnlbl{syn:spinunlock}
>  
>  	/* Ensure that any subsequent free()s happen -after- above checks. */
>  
> -	smp_mb();
> -}
> +	smp_mb();					//\lnlbl{syn:mb3}
> +}							//\lnlbl{syn:e}
> +//\end{snippet}
>  
>  #ifdef TEST
>  #define RCU_READ_NESTABLE
> diff --git a/CodeSamples/defer/rcu_nest.h b/CodeSamples/defer/rcu_nest.h
> index 7e7b7de..19a0956 100644
> --- a/CodeSamples/defer/rcu_nest.h
> +++ b/CodeSamples/defer/rcu_nest.h
> @@ -42,25 +42,28 @@ static void rcu_init(void)
>  	init_per_thread(rcu_reader_gp, 0);
>  }
>  
> -static void rcu_read_lock(void)
> +//\begin{snippet}[labelbase=ln:defer:rcu_nest:read_lock_unlock,gobbleblank=yes,commandchars=\%\@\$]
> +static void rcu_read_lock(void)				//\lnlbl{lock:b}
>  {
>  	unsigned long tmp;
>  	unsigned long *rrgp;
> -
> +								//\fcvblank
>  	/*
>  	 * If this is the outermost RCU read-side critical section,
>  	 * copy the global grace-period counter.  In either case,
>  	 * increment the nesting count held in the low-order bits.
>  	 */
>  
> -	rrgp = &__get_thread_var(rcu_reader_gp);
> +	rrgp = &__get_thread_var(rcu_reader_gp);	//\lnlbl{lock:readgp}
> +#ifndef FCV_SNIPPET
>  retry:
> -	tmp = *rrgp;
> -	if ((tmp & RCU_GP_CTR_NEST_MASK) == 0)
> -		tmp = READ_ONCE(rcu_gp_ctr);
> -	tmp++;
> -	WRITE_ONCE(*rrgp, tmp);
> -	smp_mb();
> +#endif
> +	tmp = *rrgp;					//\lnlbl{lock:wtmp1}
> +	if ((tmp & RCU_GP_CTR_NEST_MASK) == 0)		//\lnlbl{lock:checktmp}
> +		tmp = READ_ONCE(rcu_gp_ctr);		//\lnlbl{lock:wtmp2}
> +	tmp++;						//\lnlbl{lock:inctmp}
> +	WRITE_ONCE(*rrgp, tmp);				//\lnlbl{lock:writegp}
> +	smp_mb();					//\lnlbl{lock:mb1}
>  
>  	/*
>  	 * A reader could be suspended in between fetching the value of *rrgp
> @@ -71,14 +74,15 @@ retry:
>  	 * ULONG_MAX. To handle this correctly, we adopt the helper function
>  	 * ULONG_CMP_GE.
>  	 */
> -
> +#ifndef FCV_SNIPPET
>  	if (((tmp & RCU_GP_CTR_NEST_MASK) == 1) &&
>  	     ULONG_CMP_GE(READ_ONCE(rcu_gp_ctr), tmp + MAX_GP_ADV_DISTANCE)) {
>  		WRITE_ONCE(*rrgp, *rrgp - 1);
>  		goto retry;
>  	}
> +#endif
>  }
> -
> +								//\fcvblank
>  static void rcu_read_unlock(void)
>  {
>  	/*
> @@ -86,11 +90,15 @@ static void rcu_read_unlock(void)
>  	 * which had better not initially be zero.
>  	 */
>  
> -	smp_mb();
> +	smp_mb();					//\lnlbl{unlock:mb1}
> +#ifndef FCV_SNIPPET
>  #ifdef DEBUG_EXTREME
>  	BUG_ON((__get_thread_var(rcu_reader_gp) & RCU_GP_CTR_NEST_MASK) != 0);
> -#endif /* #ifdef DEBUG_EXTREME */
> -	__get_thread_var(rcu_reader_gp)--;
> +#endif /* #ifdef DEBUG_EXTREME */				//\fcvexclude
> +#endif
> +	__get_thread_var(rcu_reader_gp)--;		//\lnlbl{unlock:decgp}
>  }
> +								//\fcvblank
> +//\end{snippet}
>  
>  extern void synchronize_rcu(void);
> diff --git a/appendix/toyrcu/toyrcu.tex b/appendix/toyrcu/toyrcu.tex
> index ded754d..0e84b05 100644
> --- a/appendix/toyrcu/toyrcu.tex
> +++ b/appendix/toyrcu/toyrcu.tex
> @@ -1395,60 +1395,15 @@ variables.
>  \end{listing}
>  
>  \begin{listing}[tb]
> -{ \scriptsize
> -\begin{verbbox}
> - 1 static void rcu_read_lock(void)
> - 2 {
> - 3   long tmp;
> - 4   long *rrgp;
> - 5 
> - 6   rrgp = &__get_thread_var(rcu_reader_gp);
> - 7   tmp = *rrgp;
> - 8   if ((tmp & RCU_GP_CTR_NEST_MASK) == 0)
> - 9     tmp = ACCESS_ONCE(rcu_gp_ctr);
> -10   tmp++;
> -11   *rrgp = tmp;
> -12   smp_mb();
> -13 }
> -14 
> -15 static void rcu_read_unlock(void)
> -16 {
> -17   long tmp;
> -18 
> -19   smp_mb();
> -20   __get_thread_var(rcu_reader_gp)--;
> -21 }
> -22 
> -23 void synchronize_rcu(void)
> -24 {
> -25   int t;
> -26 
> -27   smp_mb();
> -28   spin_lock(&rcu_gp_lock);
> -29   ACCESS_ONCE(rcu_gp_ctr) +=
> -30     RCU_GP_CTR_BOTTOM_BIT;
> -31   smp_mb();
> -32   for_each_thread(t) {
> -33     while (rcu_gp_ongoing(t) &&
> -34            ((per_thread(rcu_reader_gp, t) -
> -35              rcu_gp_ctr) < 0)) {
> -36       poll(NULL, 0, 10);
> -37     }
> -38   }
> -39   spin_unlock(&rcu_gp_lock);
> -40   smp_mb();
> -41 }
> -\end{verbbox}
> -}
> -\centering
> -\theverbbox
> +\input{CodeSamples/defer/rcu_nest@read_lock_unlock.fcv}\vspace*{-11pt}\fvset{firstnumber=last}
> +\input{CodeSamples/defer/rcu_nest@synchronize.fcv}\fvset{firstnumber=auto}
>  \caption{Nestable RCU Using a Free-Running Counter}
>  \label{lst:app:toyrcu:Nestable RCU Using a Free-Running Counter}
>  \end{listing}
>  
>  Listing~\ref{lst:app:toyrcu:Nestable RCU Using a Free-Running Counter}
>  (\path{rcu_nest.h} and \path{rcu_nest.c})
> -show an RCU implementation based on a single global free-running counter,
> +shows an RCU implementation based on a single global free-running counter,
>  but that permits nesting of RCU read-side critical sections.
>  This nestability is accomplished by reserving the low-order bits of the
>  global \co{rcu_gp_ctr} to count nesting, using the definitions shown in
> @@ -1472,23 +1427,28 @@ reserves seven bits, for a maximum RCU read-side critical-section
>  nesting depth of 127, which should be well in excess of that needed
>  by most applications.
>  
> +\begin{lineref}[ln:defer:rcu_nest:read_lock_unlock:lock]
>  The resulting \co{rcu_read_lock()} implementation is still reasonably
>  straightforward.
> -Line~6 places a pointer to this thread's instance of \co{rcu_reader_gp}
> +Line~\lnref{readgp} places a pointer to
> +this thread's instance of \co{rcu_reader_gp}
>  into the local variable \co{rrgp}, minimizing the number of expensive
>  calls to the pthreads thread-local-state API.
> -Line~7 records the current value of \co{rcu_reader_gp} into another
> -local variable \co{tmp}, and line~8 checks to see if the low-order
> -bits are zero, which would indicate that this is the outermost
> -\co{rcu_read_lock()}.
> -If so, line~9 places the global \co{rcu_gp_ctr} into \co{tmp} because
> -the current value previously fetched by line~7 is likely to be obsolete.
> -In either case, line~10 increments the nesting depth, which you will
> -recall is stored in the seven low-order bits of the counter.
> -Line~11 stores the updated counter back into this thread's instance
> -of \co{rcu_reader_gp}, and, finally, line~12 executes a memory
> -barrier to prevent the RCU read-side critical section from bleeding out
> +Line~\lnref{wtmp1} records the current value of \co{rcu_reader_gp}
> +into another local variable \co{tmp}, and line~\lnref{checktmp} checks
> +to see if the low-order bits are zero, which would indicate that
> +this is the outermost \co{rcu_read_lock()}.
> +If so, line~\lnref{wtmp2} places the global \co{rcu_gp_ctr}
> +into \co{tmp} because the current value previously fetched by
> +line~\lnref{wtmp1} is likely to be obsolete.
> +In either case, line~\lnref{inctmp} increments the nesting depth,
> +which you will recall is stored in the seven low-order bits of the counter.
> +Line~\lnref{writegp} stores the updated counter back into this thread's
> +instance of \co{rcu_reader_gp}, and,
> +finally, line~\lnref{mb1} executes a memory barrier
> +to prevent the RCU read-side critical section from bleeding out
>  into the code preceding the call to \co{rcu_read_lock()}.
> +\end{lineref}
>  
>  In other words, this implementation of \co{rcu_read_lock()} picks up a copy
>  of the global \co{rcu_gp_ctr} unless the current invocation of
> @@ -1499,29 +1459,35 @@ Either way, it increments whatever value it fetched in order to record
>  an additional nesting level, and stores the result in the current
>  thread's instance of \co{rcu_reader_gp}.
>  
> +\begin{lineref}[ln:defer:rcu_nest:read_lock_unlock:unlock]
>  Interestingly enough, despite their \co{rcu_read_lock()} differences,
>  the implementation of \co{rcu_read_unlock()}
>  is broadly similar to that shown in
>  Section~\ref{sec:app:toyrcu:RCU Based on Free-Running Counter}.
> -Line~19 executes a memory barrier in order to prevent the RCU read-side
> +Line~\lnref{mb1} executes a memory barrier
> +in order to prevent the RCU read-side
>  critical section from bleeding out into code following the call
>  to \co{rcu_read_unlock()}, and
> -line~20 decrements this thread's instance of \co{rcu_reader_gp},
> +line~\lnref{decgp} decrements this thread's instance of \co{rcu_reader_gp},
>  which has the effect of decrementing the nesting count contained in
>  \co{rcu_reader_gp}'s low-order bits.
>  Debugging versions of this primitive would check (before decrementing!)
>  that these low-order bits were non-zero.
> +\end{lineref}
>  
> +\begin{lineref}[ln:defer:rcu_nest:synchronize:syn]
>  The implementation of \co{synchronize_rcu()} is quite similar to
>  that shown in
>  Section~\ref{sec:app:toyrcu:RCU Based on Free-Running Counter}.
>  There are two differences.
> -The first is that lines~29 and~30 adds \co{RCU_GP_CTR_BOTTOM_BIT}
> -to the global \co{rcu_gp_ctr} instead of adding the constant ``2'',
> -and the second is that the comparison on line~33 has been abstracted
> -out to a separate function, where it checks the bit indicated
> -by \co{RCU_GP_CTR_BOTTOM_BIT} instead of unconditionally checking
> -the low-order bit.
> +The first is that lines~\lnref{incgp1} and~\lnref{incgp2}
> +adds \co{RCU_GP_CTR_BOTTOM_BIT} to the global \co{rcu_gp_ctr}
> +instead of adding the constant ``2'',
> +and the second is that the comparison on line~\lnref{ongoing}
> +has been abstracted out to a separate function,
> +where it checks the bit indicated by \co{RCU_GP_CTR_BOTTOM_BIT}
> +instead of unconditionally checking the low-order bit.
> +\end{lineref}
>  
>  This approach achieves read-side performance almost equal to that
>  shown in
> @@ -1562,10 +1528,12 @@ overhead.
>  	how could you double the time required to overflow the global
>  	\co{rcu_gp_ctr}?
>  \QuickQuizAnswer{
> +	\begin{lineref}[ln:defer:rcu_nest:synchronize:syn]
>  	One way would be to replace the magnitude comparison on
> -	lines~33 and 34 with an inequality check of the per-thread
> -	\co{rcu_reader_gp} variable against
> +	lines~\lnref{lt1} and \lnref{lt2} with an inequality check of
> +	the per-thread \co{rcu_reader_gp} variable against
>  	\co{rcu_gp_ctr+RCU_GP_CTR_BOTTOM_BIT}.
> +	\end{lineref}
>  } \QuickQuizEnd
>  
>  \QuickQuiz{}
> -- 
> 2.7.4
> 


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2019-05-29 13:30 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-29  4:47 [PATCH] rcu_nest: Update description of rcu_nest.[hc] Junchang Wang
2019-05-29 13:30 ` Paul E. McKenney

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.