[PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context

All of lore.kernel.org
 help / color / mirror / Atom feed

From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
To: LKML <linux-kernel@vger.kernel.org>, RCU <rcu@vger.kernel.org>,
	"Paul E . McKenney" <paulmck@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>,
	Peter Zijlstra <peterz@infradead.org>,
	Michal Hocko <mhocko@suse.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	"Theodore Y . Ts'o" <tytso@mit.edu>,
	Joel Fernandes <joel@joelfernandes.org>,
	Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
	Uladzislau Rezki <urezki@gmail.com>,
	Oleksiy Avramchenko <oleksiy.avramchenko@sonymobile.com>
Subject: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context
Date: Thu, 29 Oct 2020 17:50:04 +0100	[thread overview]
Message-ID: <20201029165019.14218-1-urezki@gmail.com> (raw)

The current memmory-allocation interface presents to following
difficulties that this patch is designed to overcome:

a) If built with CONFIG_PROVE_RAW_LOCK_NESTING, the lockdep will
   complain about violation("BUG: Invalid wait context") of the
   nesting rules. It does the raw_spinlock vs. spinlock nesting
   checks, i.e. it is not legal to acquire a spinlock_t while
   holding a raw_spinlock_t.

   Internally the kfree_rcu() uses raw_spinlock_t whereas the
   "page allocator" internally deals with spinlock_t to access
   to its zones. The code also can be broken from higher level
   of view:
   <snip>
       raw_spin_lock(&some_lock);
       kfree_rcu(some_pointer, some_field_offset);
   <snip>

b) If built with CONFIG_PREEMPT_RT. Please note, in that case spinlock_t
   is converted into sleepable variant. Invoking the page allocator from
   atomic contexts leads to "BUG: scheduling while atomic".

c) call_rcu() is invoked from raw atomic context and kfree_rcu()
   and kvfree_rcu() are expected to be called from atomic raw context
   as well.

Move out a page allocation from contexts which trigger kvfree_rcu()
function to the separate worker. When a k[v]free_rcu() per-cpu page
cache is empty a fallback mechanism is used and a special job is
scheduled to refill the per-cpu cache.

Link: https://lore.kernel.org/lkml/20200630164543.4mdcf6zb4zfclhln@linutronix.de/
Fixes: 3042f83f19be ("rcu: Support reclaim for head-less object")
Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 kernel/rcu/tree.c | 109 ++++++++++++++++++++++++++++------------------
 1 file changed, 66 insertions(+), 43 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 06895ef85d69..f2da2a1cc716 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -177,7 +177,7 @@ module_param(rcu_unlock_delay, int, 0444);
  * per-CPU. Object size is equal to one page. This value
  * can be changed at boot time.
  */
-static int rcu_min_cached_objs = 2;
+static int rcu_min_cached_objs = 5;
 module_param(rcu_min_cached_objs, int, 0444);
 
 /* Retrieve RCU kthreads priority for rcutorture */
@@ -3084,6 +3084,9 @@ struct kfree_rcu_cpu_work {
  *	In order to save some per-cpu space the list is singular.
  *	Even though it is lockless an access has to be protected by the
  *	per-cpu lock.
+ * @page_cache_work: A work to refill the cache when it is empty
+ * @work_in_progress: Indicates that page_cache_work is running
+ * @hrtimer: A hrtimer for scheduling a page_cache_work
  * @nr_bkv_objs: number of allocated objects at @bkvcache.
  *
  * This is a per-CPU structure.  The reason that it is not included in
@@ -3100,6 +3103,11 @@ struct kfree_rcu_cpu {
 	bool monitor_todo;
 	bool initialized;
 	int count;
+
+	struct work_struct page_cache_work;
+	atomic_t work_in_progress;
+	struct hrtimer hrtimer;
+
 	struct llist_head bkvcache;
 	int nr_bkv_objs;
 };
@@ -3217,10 +3225,10 @@ static void kfree_rcu_work(struct work_struct *work)
 			}
 			rcu_lock_release(&rcu_callback_map);
 
-			krcp = krc_this_cpu_lock(&flags);
+			raw_spin_lock_irqsave(&krcp->lock, flags);
 			if (put_cached_bnode(krcp, bkvhead[i]))
 				bkvhead[i] = NULL;
-			krc_this_cpu_unlock(krcp, flags);
+			raw_spin_unlock_irqrestore(&krcp->lock, flags);
 
 			if (bkvhead[i])
 				free_page((unsigned long) bkvhead[i]);
@@ -3347,6 +3355,57 @@ static void kfree_rcu_monitor(struct work_struct *work)
 		raw_spin_unlock_irqrestore(&krcp->lock, flags);
 }
 
+static enum hrtimer_restart
+schedule_page_work_fn(struct hrtimer *t)
+{
+	struct kfree_rcu_cpu *krcp =
+		container_of(t, struct kfree_rcu_cpu, hrtimer);
+
+	queue_work(system_highpri_wq, &krcp->page_cache_work);
+	return HRTIMER_NORESTART;
+}
+
+static void fill_page_cache_func(struct work_struct *work)
+{
+	struct kvfree_rcu_bulk_data *bnode;
+	struct kfree_rcu_cpu *krcp =
+		container_of(work, struct kfree_rcu_cpu,
+			page_cache_work);
+	unsigned long flags;
+	bool pushed;
+	int i;
+
+	for (i = 0; i < rcu_min_cached_objs; i++) {
+		bnode = (struct kvfree_rcu_bulk_data *)
+			__get_free_page(GFP_KERNEL | __GFP_NOWARN);
+
+		if (bnode) {
+			raw_spin_lock_irqsave(&krcp->lock, flags);
+			pushed = put_cached_bnode(krcp, bnode);
+			raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+			if (!pushed) {
+				free_page((unsigned long) bnode);
+				break;
+			}
+		}
+	}
+
+	atomic_set(&krcp->work_in_progress, 0);
+}
+
+static void
+run_page_cache_worker(struct kfree_rcu_cpu *krcp)
+{
+	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
+			!atomic_xchg(&krcp->work_in_progress, 1)) {
+		hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
+			HRTIMER_MODE_REL);
+		krcp->hrtimer.function = schedule_page_work_fn;
+		hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+	}
+}
+
 static inline bool
 kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
 {
@@ -3363,32 +3422,8 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
 	if (!krcp->bkvhead[idx] ||
 			krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
 		bnode = get_cached_bnode(krcp);
-		if (!bnode) {
-			/*
-			 * To keep this path working on raw non-preemptible
-			 * sections, prevent the optional entry into the
-			 * allocator as it uses sleeping locks. In fact, even
-			 * if the caller of kfree_rcu() is preemptible, this
-			 * path still is not, as krcp->lock is a raw spinlock.
-			 * With additional page pre-allocation in the works,
-			 * hitting this return is going to be much less likely.
-			 */
-			if (IS_ENABLED(CONFIG_PREEMPT_RT))
-				return false;
-
-			/*
-			 * NOTE: For one argument of kvfree_rcu() we can
-			 * drop the lock and get the page in sleepable
-			 * context. That would allow to maintain an array
-			 * for the CONFIG_PREEMPT_RT as well if no cached
-			 * pages are available.
-			 */
-			bnode = (struct kvfree_rcu_bulk_data *)
-				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
-		}
-
 		/* Switch to emergency path. */
-		if (unlikely(!bnode))
+		if (!bnode)
 			return false;
 
 		/* Initialize the new block. */
@@ -3452,12 +3487,10 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 		goto unlock_return;
 	}
 
-	/*
-	 * Under high memory pressure GFP_NOWAIT can fail,
-	 * in that case the emergency path is maintained.
-	 */
 	success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
 	if (!success) {
+		run_page_cache_worker(krcp);
+
 		if (head == NULL)
 			// Inline if kvfree_rcu(one_arg) call.
 			goto unlock_return;
@@ -4449,24 +4482,14 @@ static void __init kfree_rcu_batch_init(void)
 
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-		struct kvfree_rcu_bulk_data *bnode;
 
 		for (i = 0; i < KFREE_N_BATCHES; i++) {
 			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
 			krcp->krw_arr[i].krcp = krcp;
 		}
 
-		for (i = 0; i < rcu_min_cached_objs; i++) {
-			bnode = (struct kvfree_rcu_bulk_data *)
-				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
-
-			if (bnode)
-				put_cached_bnode(krcp, bnode);
-			else
-				pr_err("Failed to preallocate for %d CPU!\n", cpu);
-		}
-
 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+		INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
 		krcp->initialized = true;
 	}
 	if (register_shrinker(&kfree_rcu_shrinker))
-- 
2.20.1

next             reply	other threads:[~2020-10-29 16:50 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-10-29 16:50 Uladzislau Rezki (Sony) [this message]
2020-10-29 16:50 ` [PATCH 02/16] lib/debug: Remove pointless ARCH_NO_PREEMPT dependencies Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 03/16] preempt: Make preempt count unconditional Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 04/16] preempt: Cleanup PREEMPT_COUNT leftovers Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 05/16] lockdep: " Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 06/16] mm/pagemap: " Uladzislau Rezki (Sony)
2020-10-29 20:57   ` Uladzislau Rezki
2020-10-29 21:26     ` Paul E. McKenney
2020-10-29 21:03   ` kernel test robot
2020-10-29 21:21   ` kernel test robot
2020-10-30  2:36   ` kernel test robot
2020-10-29 16:50 ` [PATCH 07/16] locking/bitspinlock: " Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 08/16] uaccess: " Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 09/16] sched: " Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 10/16] ARM: " Uladzislau Rezki (Sony)
2020-10-29 16:50   ` Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 11/16] xtensa: " Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [Intel-gfx] [PATCH 12/16] drm/i915: " Uladzislau Rezki (Sony)
2020-10-29 16:50   ` Uladzislau Rezki (Sony)
2020-10-29 16:50   ` Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 13/16] rcutorture: " Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 14/16] preempt: Remove PREEMPT_COUNT from Kconfig Uladzislau Rezki (Sony)
2020-10-29 16:50 ` [PATCH 15/16] rcu/tree: Allocate a page when caller is preemptible Uladzislau Rezki (Sony)
2020-11-03 18:03   ` Joel Fernandes
2020-11-04 11:39     ` Uladzislau Rezki
2020-11-04 14:36       ` Joel Fernandes
2020-10-29 16:50 ` [PATCH 16/16] rcu/tree: Use delayed work instead of hrtimer to refill the cache Uladzislau Rezki (Sony)
2020-10-29 19:47   ` Paul E. McKenney
2020-10-29 20:13     ` Uladzislau Rezki
2020-10-29 20:22       ` Uladzislau Rezki
2020-10-29 20:33         ` Paul E. McKenney
2020-10-29 21:00           ` Uladzislau Rezki
2020-11-03 15:47 ` [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context Joel Fernandes
2020-11-03 16:33   ` Uladzislau Rezki
2020-11-03 19:18     ` Paul E. McKenney
2020-11-04 12:35       ` Uladzislau Rezki
2020-11-04 14:12         ` Paul E. McKenney
2020-11-04 14:40           ` Uladzislau Rezki
2020-11-03 17:54 ` Joel Fernandes
2020-11-04 12:12   ` Uladzislau Rezki
2020-11-04 15:01     ` Joel Fernandes
2020-11-04 18:38       ` Uladzislau Rezki

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:06895ef85d6 dfblob:f2da2a1cc71 )
 OR (
bs:"[PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201029165019.14218-1-urezki@gmail.com \
    --to=urezki@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=bigeasy@linutronix.de \
    --cc=joel@joelfernandes.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mhocko@suse.com \
    --cc=oleksiy.avramchenko@sonymobile.com \
    --cc=paulmck@kernel.org \
    --cc=peterz@infradead.org \
    --cc=rcu@vger.kernel.org \
    --cc=tglx@linutronix.de \
    --cc=tytso@mit.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.