public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [ANNOUNCE] v5.14-rc4-rt4
@ 2021-08-02 16:27 Sebastian Andrzej Siewior
  2021-08-04  8:24 ` Daniel Wagner
  0 siblings, 1 reply; 34+ messages in thread
From: Sebastian Andrzej Siewior @ 2021-08-02 16:27 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, linux-rt-users, Steven Rostedt

Dear RT folks!

I'm pleased to announce the v5.14-rc4-rt4 patch set. 

Changes since v5.14-rc4-rt3:

  - Updating the locking bits:

    - Use proper task state in ww_mutex_lock_interruptible(), reported
      by Mike Galbraith.

    - Fix the wake_q handling for a task which blocks simultaneously as
      a regular task and additionally as a sleeper on a sleeping lock.
      Regression introduced in the v5.14 cycle, reported by Mike
      Galbraith, patched by Thomas Gleixner.

    - Address the futex related review comments by Peter Zijlstra.

Known issues
     - netconsole triggers WARN.

     - The "Memory controller" (CONFIG_MEMCG) has been disabled.

     - A RCU and ARM64 warning has been fixed by Valentin Schneider. It is
       still not clear if the RCU related change is correct.

The delta patch against v5.14-rc4-rt3 is appended below and can be found here:
 
     https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.14/incr/patch-5.14-rc4-rt3-rt4.patch.xz

You can get this release via the git tree at:

    git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.14-rc4-rt4

The RT patch against v5.14-rc4 can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.14/older/patch-5.14-rc4-rt4.patch.xz

The split quilt queue is available at:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.14/older/patches-5.14-rc4-rt4.tar.xz

Sebastian

diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
index cdd98d57c0e6a..8e57a6660aad1 100644
--- a/include/linux/sched/wake_q.h
+++ b/include/linux/sched/wake_q.h
@@ -61,11 +61,5 @@ static inline bool wake_q_empty(struct wake_q_head *head)
 
 extern void wake_q_add(struct wake_q_head *head, struct task_struct *task);
 extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task);
-extern void __wake_up_q(struct wake_q_head *head, unsigned int state);
-
-static inline void wake_up_q(struct wake_q_head *head)
-{
-	__wake_up_q(head, TASK_NORMAL);
-}
-
+extern void wake_up_q(struct wake_q_head *head);
 #endif /* _LINUX_SCHED_WAKE_Q_H */
diff --git a/kernel/futex.c b/kernel/futex.c
index 7fc061ee5f2d4..c4e28bd37abcb 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1853,15 +1853,16 @@ enum {
 static inline bool futex_requeue_pi_prepare(struct futex_q *q,
 					    struct futex_pi_state *pi_state)
 {
-	int cur, res, new;
+	int old, new;
 
 	/*
 	 * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has
 	 * already set Q_REQUEUE_PI_IGNORE to signal that requeue should
 	 * ignore the waiter.
 	 */
-	for (cur = atomic_read(&q->requeue_state);; cur = res) {
-		if (cur == Q_REQUEUE_PI_IGNORE)
+	old = atomic_read_acquire(&q->requeue_state);
+	do {
+		if (old == Q_REQUEUE_PI_IGNORE)
 			return false;
 
 		/*
@@ -1872,74 +1873,68 @@ static inline bool futex_requeue_pi_prepare(struct futex_q *q,
 		 * trylock, but that would just add more conditionals
 		 * all over the place for a dubious value.
 		 */
-		if (cur != Q_REQUEUE_PI_NONE)
+		if (old != Q_REQUEUE_PI_NONE)
 			break;
 
 		new = Q_REQUEUE_PI_IN_PROGRESS;
-		res = atomic_cmpxchg(&q->requeue_state, cur, new);
-		if (likely(cur == res))
-			break;
-	}
+	} while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
 	q->pi_state = pi_state;
 	return true;
 }
 
 static inline void futex_requeue_pi_complete(struct futex_q *q, int locked)
 {
-	int cur, res, new;
+	int old, new;
 
-	for (cur = atomic_read(&q->requeue_state);; cur = res) {
+	old = atomic_read_acquire(&q->requeue_state);
+	do {
 		if (locked >= 0) {
 			/* Requeue succeeded. Set DONE or LOCKED */
 			new = Q_REQUEUE_PI_DONE + locked;
-		} else if (cur == Q_REQUEUE_PI_IN_PROGRESS) {
+		} else if (old == Q_REQUEUE_PI_IN_PROGRESS) {
 			/* Deadlock, no early wakeup interleave */
 			new = Q_REQUEUE_PI_NONE;
 		} else {
 			/* Deadlock, early wakeup interleave. */
 			new = Q_REQUEUE_PI_IGNORE;
 		}
-
-		res = atomic_cmpxchg(&q->requeue_state, cur, new);
-		if (likely(cur == res))
-			break;
-	}
+	} while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
 
 #ifdef CONFIG_PREEMPT_RT
 	/* If the waiter interleaved with the requeue let it know */
-	if (unlikely(cur == Q_REQUEUE_PI_WAIT))
+	if (unlikely(old == Q_REQUEUE_PI_WAIT))
 		rcuwait_wake_up(&q->requeue_wait);
 #endif
 }
 
 static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)
 {
-	int cur, new, res;
+	int old, new;
 
-	for (cur = atomic_read(&q->requeue_state);; cur = res) {
+	old = atomic_read_acquire(&q->requeue_state);
+	do {
 		/* Is requeue done already? */
-		if (cur >= Q_REQUEUE_PI_DONE)
-			break;
+		if (old >= Q_REQUEUE_PI_DONE)
+			return old;
 
 		/*
 		 * If not done, then tell the requeue code to either ignore
 		 * the waiter or to wake it up once the requeue is done.
 		 */
-		new = !cur ? Q_REQUEUE_PI_IGNORE : Q_REQUEUE_PI_WAIT;
-		res = atomic_cmpxchg(&q->requeue_state, cur, new);
-		if (likely(cur == res))
-			break;
-	}
+		new = Q_REQUEUE_PI_WAIT;
+		if (old == Q_REQUEUE_PI_NONE)
+			new = Q_REQUEUE_PI_IGNORE;
+	} while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
 
 	/* If the requeue was in progress, wait for it to complete */
-	if (cur == Q_REQUEUE_PI_IN_PROGRESS) {
+	if (old == Q_REQUEUE_PI_IN_PROGRESS) {
 #ifdef CONFIG_PREEMPT_RT
 		rcuwait_wait_event(&q->requeue_wait,
 				   atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT,
 				   TASK_UNINTERRUPTIBLE);
 #else
-		while (atomic_read(&q->requeue_state) == Q_REQUEUE_PI_WAIT)
-			cpu_relax();
+		(void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT);
 #endif
 	}
 
@@ -2039,7 +2034,10 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
 	if (!top_waiter)
 		return 0;
 
-	/* Ensure that this is a waiter sitting in futex_wait_requeue_pi() */
+	/*
+	 * Ensure that this is a waiter sitting in futex_wait_requeue_pi()
+	 * and waiting on the 'waitqueue' futex which is always !PI.
+	 */
 	if (!top_waiter->rt_waiter || top_waiter->pi_state)
 		ret = -EINVAL;
 
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index e347bbc12641d..eadaface1fd29 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -429,7 +429,10 @@ static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,
 						struct rt_mutex_waiter *w)
 {
 	if (IS_ENABLED(CONFIG_PREEMPT_RT) && w->wake_state != TASK_NORMAL) {
-		wake_q_add(&wqh->rt_head, w->task);
+		if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+			WARN_ON_ONCE(wqh->rtlock_task);
+		get_task_struct(w->task);
+		wqh->rtlock_task = w->task;
 	} else {
 		wake_q_add(&wqh->head, w->task);
 	}
@@ -437,8 +440,11 @@ static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,
 
 static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh)
 {
-	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !wake_q_empty(&wqh->rt_head))
-		__wake_up_q(&wqh->rt_head, TASK_RTLOCK_WAIT);
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) {
+		wake_up_state(wqh->rtlock_task, TASK_RTLOCK_WAIT);
+		put_task_struct(wqh->rtlock_task);
+		wqh->rtlock_task = NULL;
+	}
 
 	if (!wake_q_empty(&wqh->head))
 		wake_up_q(&wqh->head);
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 9697b04c40e68..61256de5bd66a 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -44,18 +44,18 @@ struct rt_mutex_waiter {
 /**
  * rt_wake_q_head - Wrapper around regular wake_q_head to support
  *		    "sleeping" spinlocks on RT
- * @head:	The regular wake_q_head for sleeping lock variants
- * @rt_head:	The wake_q_head for RT lock (spin/rwlock) variants
+ * @head:		The regular wake_q_head for sleeping lock variants
+ * @rtlock_task:	Task pointer for RT lock (spin/rwlock) wakeups
  */
 struct rt_wake_q_head {
 	struct wake_q_head	head;
-	struct wake_q_head	rt_head;
+	struct task_struct	*rtlock_task;
 };
 
 #define DEFINE_RT_WAKE_Q(name)						\
 	struct rt_wake_q_head name = {					\
 		.head		= WAKE_Q_HEAD_INITIALIZER(name.head),	\
-		.rt_head	= WAKE_Q_HEAD_INITIALIZER(name.rt_head),\
+		.rtlock_task	= NULL,					\
 	}
 
 /*
diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c
index 9b7e3f413a828..519092ee5e88e 100644
--- a/kernel/locking/ww_rt_mutex.c
+++ b/kernel/locking/ww_rt_mutex.c
@@ -60,7 +60,7 @@ EXPORT_SYMBOL(ww_mutex_lock);
 int __sched
 ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
-	return __ww_rt_mutex_lock(lock, ctx, TASK_UNINTERRUPTIBLE, _RET_IP_);
+	return __ww_rt_mutex_lock(lock, ctx, TASK_INTERRUPTIBLE, _RET_IP_);
 }
 EXPORT_SYMBOL(ww_mutex_lock_interruptible);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 87e0ad6665260..a742e9dc9a7cb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -920,7 +920,7 @@ void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
 		put_task_struct(task);
 }
 
-void __wake_up_q(struct wake_q_head *head, unsigned int state)
+void wake_up_q(struct wake_q_head *head)
 {
 	struct wake_q_node *node = head->first;
 
@@ -936,7 +936,7 @@ void __wake_up_q(struct wake_q_head *head, unsigned int state)
 		 * wake_up_process() executes a full barrier, which pairs with
 		 * the queueing in wake_q_add() so as not to miss wakeups.
 		 */
-		wake_up_state(task, state);
+		wake_up_process(task);
 		put_task_struct(task);
 	}
 }
diff --git a/localversion-rt b/localversion-rt
index 1445cd65885cd..ad3da1bcab7e8 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt3
+-rt4

^ permalink raw reply related	[flat|nested] 34+ messages in thread
* [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path
@ 2021-08-04 14:43 Jens Axboe
  2021-08-04 15:33 ` Daniel Wagner
  2021-08-05  9:17 ` Peter Zijlstra
  0 siblings, 2 replies; 34+ messages in thread
From: Jens Axboe @ 2021-08-04 14:43 UTC (permalink / raw)
  To: io-uring, LKML; +Cc: Daniel Wagner, Peter Zijlstra

Daniel reports that the v5.14-rc4-rt4 kernel throws a BUG when running
stress-ng:

| [   90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
| [   90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
| [   90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G        W         5.14.0-rc4-rt4+ #89
| [   90.202559] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
| [   90.202561] Call Trace:
| [   90.202577]  dump_stack_lvl+0x34/0x44
| [   90.202584]  ___might_sleep.cold+0x87/0x94
| [   90.202588]  rt_spin_lock+0x19/0x70
| [   90.202593]  ___slab_alloc+0xcb/0x7d0
| [   90.202598]  ? newidle_balance.constprop.0+0xf5/0x3b0
| [   90.202603]  ? dequeue_entity+0xc3/0x290
| [   90.202605]  ? io_wqe_dec_running.isra.0+0x98/0xe0
| [   90.202610]  ? pick_next_task_fair+0xb9/0x330
| [   90.202612]  ? __schedule+0x670/0x1410
| [   90.202615]  ? io_wqe_dec_running.isra.0+0x98/0xe0
| [   90.202618]  kmem_cache_alloc_trace+0x79/0x1f0
| [   90.202621]  io_wqe_dec_running.isra.0+0x98/0xe0
| [   90.202625]  io_wq_worker_sleeping+0x37/0x50
| [   90.202628]  schedule+0x30/0xd0
| [   90.202630]  schedule_timeout+0x8f/0x1a0
| [   90.202634]  ? __bpf_trace_tick_stop+0x10/0x10
| [   90.202637]  io_wqe_worker+0xfd/0x320
| [   90.202641]  ? finish_task_switch.isra.0+0xd3/0x290
| [   90.202644]  ? io_worker_handle_work+0x670/0x670
| [   90.202646]  ? io_worker_handle_work+0x670/0x670
| [   90.202649]  ret_from_fork+0x22/0x30

which is due to the RT kernel not liking a GFP_ATOMIC allocation inside
a raw spinlock. Besides that not working on RT, doing any kind of
allocation from inside schedule() is kind of nasty and should be avoided
if at all possible.

This particular path happens when an io-wq worker goes to sleep, and we
need a new worker to handle pending work. We currently allocate a small
data item to hold the information we need to create a new worker, but we
can instead include this data in the io_worker struct itself and just
protect it with a single bit lock. We only really need one per worker
anyway, as we will have run pending work between to sleep cycles.

https://lore.kernel.org/lkml/20210804082418.fbibprcwtzyt5qax@beryllium.lan/
Reported-by: Daniel Wagner <dwagner@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

---

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 50dc93ffc153..d5f56d41d59b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -51,6 +51,10 @@ struct io_worker {
 
 	struct completion ref_done;
 
+	unsigned long create_state;
+	struct callback_head create_work;
+	int create_index;
+
 	struct rcu_head rcu;
 };
 
@@ -261,42 +265,46 @@ static void io_wqe_inc_running(struct io_worker *worker)
 	atomic_inc(&acct->nr_running);
 }
 
-struct create_worker_data {
-	struct callback_head work;
-	struct io_wqe *wqe;
-	int index;
-};
-
 static void create_worker_cb(struct callback_head *cb)
 {
-	struct create_worker_data *cwd;
+	struct io_worker *worker;
 	struct io_wq *wq;
 
-	cwd = container_of(cb, struct create_worker_data, work);
-	wq = cwd->wqe->wq;
-	create_io_worker(wq, cwd->wqe, cwd->index);
-	kfree(cwd);
+	worker = container_of(cb, struct io_worker, create_work);
+	wq = worker->wqe->wq;
+	create_io_worker(wq, worker->wqe, worker->create_index);
+	clear_bit_unlock(0, &worker->create_state);
+	io_worker_release(worker);
 }
 
-static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker,
+				   struct io_wqe_acct *acct)
 {
-	struct create_worker_data *cwd;
 	struct io_wq *wq = wqe->wq;
 
 	/* raced with exit, just ignore create call */
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
 		goto fail;
+	if (!io_worker_get(worker))
+		goto fail;
+	/*
+	 * create_state manages ownership of create_work/index. We should
+	 * only need one entry per worker, as the worker going to sleep
+	 * will trigger the condition, and waking will clear it once it
+	 * runs the task_work.
+	 */
+	if (test_bit(0, &worker->create_state) ||
+	    test_and_set_bit_lock(0, &worker->create_state))
+		goto fail_release;
 
-	cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC);
-	if (cwd) {
-		init_task_work(&cwd->work, create_worker_cb);
-		cwd->wqe = wqe;
-		cwd->index = acct->index;
-		if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL))
-			return;
+	init_task_work(&worker->create_work, create_worker_cb);
+	worker->create_index = acct->index;
+	if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+		return;
 
-		kfree(cwd);
-	}
+	clear_bit_unlock(0, &worker->create_state);
+fail_release:
+	io_worker_release(worker);
 fail:
 	atomic_dec(&acct->nr_running);
 	io_worker_ref_put(wq);
@@ -314,7 +322,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
 	if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
 		atomic_inc(&acct->nr_running);
 		atomic_inc(&wqe->wq->worker_refs);
-		io_queue_worker_create(wqe, acct);
+		io_queue_worker_create(wqe, worker, acct);
 	}
 }
 
@@ -973,12 +981,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 
 static bool io_task_work_match(struct callback_head *cb, void *data)
 {
-	struct create_worker_data *cwd;
+	struct io_worker *worker;
 
 	if (cb->func != create_worker_cb)
 		return false;
-	cwd = container_of(cb, struct create_worker_data, work);
-	return cwd->wqe->wq == data;
+	worker = container_of(cb, struct io_worker, create_work);
+	return worker->wqe->wq == data;
 }
 
 void io_wq_exit_start(struct io_wq *wq)
@@ -995,12 +1003,13 @@ static void io_wq_exit_workers(struct io_wq *wq)
 		return;
 
 	while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
-		struct create_worker_data *cwd;
+		struct io_worker *worker;
 
-		cwd = container_of(cb, struct create_worker_data, work);
-		atomic_dec(&cwd->wqe->acct[cwd->index].nr_running);
+		worker = container_of(cb, struct io_worker, create_work);
+		atomic_dec(&worker->wqe->acct[worker->create_index].nr_running);
 		io_worker_ref_put(wq);
-		kfree(cwd);
+		clear_bit_unlock(0, &worker->create_state);
+		io_worker_release(worker);
 	}
 
 	rcu_read_lock();

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2021-08-10 15:23 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2021-08-02 16:27 [ANNOUNCE] v5.14-rc4-rt4 Sebastian Andrzej Siewior
2021-08-04  8:24 ` Daniel Wagner
2021-08-04 10:43   ` Sebastian Andrzej Siewior
2021-08-04 10:48     ` Sebastian Andrzej Siewior
2021-08-04 11:00       ` Sebastian Andrzej Siewior
2021-08-04 13:17         ` Peter Zijlstra
2021-08-04 13:32           ` Jens Axboe
2021-08-04 14:23             ` Jens Axboe
2021-08-04 15:33               ` Sebastian Andrzej Siewior
2021-08-04 15:39                 ` Jens Axboe
2021-08-04 15:47                   ` Sebastian Andrzej Siewior
2021-08-04 15:49                     ` Jens Axboe
2021-08-04 15:57                       ` Sebastian Andrzej Siewior
2021-08-04 16:05                         ` Jens Axboe
2021-08-04 16:20                           ` Sebastian Andrzej Siewior
2021-08-04 16:20                             ` Jens Axboe
2021-08-04 16:20                           ` Steven Rostedt
2021-08-04 16:22                             ` Jens Axboe
2021-08-04 16:47                               ` Sebastian Andrzej Siewior
2021-08-04 16:57                                 ` Jens Axboe
2021-08-04 17:02                                   ` Sebastian Andrzej Siewior
2021-08-10  7:40                                   ` Sebastian Andrzej Siewior
2021-08-10 11:22                                     ` [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path kernel test robot
2021-08-10 15:22                                     ` kernel test robot
2021-08-04 16:17                       ` [ANNOUNCE] v5.14-rc4-rt4 Steven Rostedt
2021-08-04 16:22                         ` Sebastian Andrzej Siewior
2021-08-04 16:25                           ` Steven Rostedt
2021-08-04 16:31                             ` Sebastian Andrzej Siewior
2021-08-04 16:47                               ` Steven Rostedt
2021-08-04 16:57                                 ` Sebastian Andrzej Siewior
  -- strict thread matches above, loose matches on Subject: below --
2021-08-04 14:43 [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path Jens Axboe
2021-08-04 15:33 ` Daniel Wagner
2021-08-04 15:39   ` Jens Axboe
2021-08-05  9:17 ` Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox