All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] locking/rtmutex: Fix task->pi_waiters integrity
@ 2023-07-07 16:10 Peter Zijlstra
  2023-07-14 17:12 ` Henry Wu
  2023-07-26 14:12 ` Sebastian Andrzej Siewior
  0 siblings, 2 replies; 4+ messages in thread
From: Peter Zijlstra @ 2023-07-07 16:10 UTC (permalink / raw)
  To: triangletrap12
  Cc: Ingo Molnar, Thomas Gleixner, Will Deacon, Waiman Long,
	linux-kernel, Boqun Feng, Mike Galbraith,
	Sebastian Andrzej Siewior


Henry reported that rt_mutex_adjust_prio_check() has an ordering
problem and puts the lie to the comment in [7]. Sharing the sort key
between lock->waiters and owner->pi_waiters *does* create problems,
since unlike what the comment claims, holding [L] is insufficient.

Notably, consider:

	A
      /   \
     M1   M2
     |     |
     B     C

That is, task A owns both M1 and M2, B and C block on them. In this
case a concurrent chain walk (B & C) will modify their resp. sort keys
in [7] while holding M1->wait_lock and M2->wait_lock. So holding [L]
is meaningless, they're different Ls.

This then gives rise to a race condition between [7] and [11], where
the requeue of pi_waiters will observe an inconsistent tree order.

	B				C

  (holds M1->wait_lock,		(holds M2->wait_lock,
   holds B->pi_lock)		 holds A->pi_lock)

  [7]
  waiter_update_prio();
  ...
  [8]
  raw_spin_unlock(B->pi_lock);
  ...
  [10]
  raw_spin_lock(A->pi_lock);

				[11]
				rt_mutex_enqueue_pi();
				// observes inconsistent A->pi_waiters
				// tree order

Fixing this means either extending the range of the owner lock from
[10-13] to [6-13], with the immediate problem that this means [6-8]
hold both blocked and owner locks, or duplicating the sort key.

Since the locking in chain walk is horrible enough without having to
consider pi_lock nesting rules, duplicate the sort key instead.

By giving each tree their own sort key, the above race becomes
harmless, if C sees B at the old location, then B will correct things
(if they need correcting) when it walks up the chain and reaches A.

Fixes: fb00aca47440 ("rtmutex: Turn the plist into an rb-tree")
Reported-by: Henry Wu <triangletrap12@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/locking/rtmutex.c        |  170 ++++++++++++++++++++++++++--------------
 kernel/locking/rtmutex_api.c    |    2 
 kernel/locking/rtmutex_common.h |   47 ++++++++---
 kernel/locking/ww_mutex.h       |   12 +-
 4 files changed, 155 insertions(+), 76 deletions(-)

--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -333,21 +333,43 @@ static __always_inline int __waiter_prio
 	return prio;
 }
 
+/*
+ * Update the waiter->tree copy of the sort keys.
+ */
 static __always_inline void
 waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
 {
-	waiter->prio = __waiter_prio(task);
-	waiter->deadline = task->dl.deadline;
+	lockdep_assert_held(&waiter->lock->wait_lock);
+	lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry));
+
+	waiter->tree.prio = __waiter_prio(task);
+	waiter->tree.deadline = task->dl.deadline;
+}
+
+/*
+ * Update the waiter->pi_tree copy of the sort keys (from the tree copy).
+ */
+static __always_inline void
+waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
+{
+	lockdep_assert_held(&waiter->lock->wait_lock);
+	lockdep_assert_held(&task->pi_lock);
+	lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry));
+
+	waiter->pi_tree.prio = waiter->tree.prio;
+	waiter->pi_tree.deadline = waiter->tree.deadline;
 }
 
 /*
- * Only use with rt_mutex_waiter_{less,equal}()
+ * Only use with rt_waiter_node_{less,equal}()
  */
+#define task_to_waiter_node(p)	\
+	&(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
 #define task_to_waiter(p)	\
-	&(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
+	&(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) }
 
-static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
-						struct rt_mutex_waiter *right)
+static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left,
+					       struct rt_waiter_node *right)
 {
 	if (left->prio < right->prio)
 		return 1;
@@ -364,8 +386,8 @@ static __always_inline int rt_mutex_wait
 	return 0;
 }
 
-static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
-						 struct rt_mutex_waiter *right)
+static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left,
+						 struct rt_waiter_node *right)
 {
 	if (left->prio != right->prio)
 		return 0;
@@ -385,7 +407,7 @@ static __always_inline int rt_mutex_wait
 static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
 				  struct rt_mutex_waiter *top_waiter)
 {
-	if (rt_mutex_waiter_less(waiter, top_waiter))
+	if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree))
 		return true;
 
 #ifdef RT_MUTEX_BUILD_SPINLOCKS
@@ -393,30 +415,30 @@ static inline bool rt_mutex_steal(struct
 	 * Note that RT tasks are excluded from same priority (lateral)
 	 * steals to prevent the introduction of an unbounded latency.
 	 */
-	if (rt_prio(waiter->prio) || dl_prio(waiter->prio))
+	if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio))
 		return false;
 
-	return rt_mutex_waiter_equal(waiter, top_waiter);
+	return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
 #else
 	return false;
 #endif
 }
 
 #define __node_2_waiter(node) \
-	rb_entry((node), struct rt_mutex_waiter, tree_entry)
+	rb_entry((node), struct rt_mutex_waiter, tree.entry)
 
 static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
 {
 	struct rt_mutex_waiter *aw = __node_2_waiter(a);
 	struct rt_mutex_waiter *bw = __node_2_waiter(b);
 
-	if (rt_mutex_waiter_less(aw, bw))
+	if (rt_waiter_node_less(&aw->tree, &bw->tree))
 		return 1;
 
 	if (!build_ww_mutex())
 		return 0;
 
-	if (rt_mutex_waiter_less(bw, aw))
+	if (rt_waiter_node_less(&bw->tree, &aw->tree))
 		return 0;
 
 	/* NOTE: relies on waiter->ww_ctx being set before insertion */
@@ -434,48 +456,58 @@ static __always_inline bool __waiter_les
 static __always_inline void
 rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
 {
-	rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less);
+	lockdep_assert_held(&lock->wait_lock);
+
+	rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less);
 }
 
 static __always_inline void
 rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
 {
-	if (RB_EMPTY_NODE(&waiter->tree_entry))
+	lockdep_assert_held(&lock->wait_lock);
+
+	if (RB_EMPTY_NODE(&waiter->tree.entry))
 		return;
 
-	rb_erase_cached(&waiter->tree_entry, &lock->waiters);
-	RB_CLEAR_NODE(&waiter->tree_entry);
+	rb_erase_cached(&waiter->tree.entry, &lock->waiters);
+	RB_CLEAR_NODE(&waiter->tree.entry);
 }
 
-#define __node_2_pi_waiter(node) \
-	rb_entry((node), struct rt_mutex_waiter, pi_tree_entry)
+#define __node_2_rt_node(node) \
+	rb_entry((node), struct rt_waiter_node, entry)
 
-static __always_inline bool
-__pi_waiter_less(struct rb_node *a, const struct rb_node *b)
+static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b)
 {
-	return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b));
+	return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b));
 }
 
 static __always_inline void
 rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
 {
-	rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less);
+	lockdep_assert_held(&task->pi_lock);
+
+	rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less);
 }
 
 static __always_inline void
 rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
 {
-	if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
+	lockdep_assert_held(&task->pi_lock);
+
+	if (RB_EMPTY_NODE(&waiter->pi_tree.entry))
 		return;
 
-	rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters);
-	RB_CLEAR_NODE(&waiter->pi_tree_entry);
+	rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters);
+	RB_CLEAR_NODE(&waiter->pi_tree.entry);
 }
 
-static __always_inline void rt_mutex_adjust_prio(struct task_struct *p)
+static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock,
+						 struct task_struct *p)
 {
 	struct task_struct *pi_task = NULL;
 
+	lockdep_assert_held(&lock->wait_lock);
+	lockdep_assert(rt_mutex_owner(lock) == p);
 	lockdep_assert_held(&p->pi_lock);
 
 	if (task_has_pi_waiters(p))
@@ -571,9 +603,14 @@ static __always_inline struct rt_mutex_b
  * Chain walk basics and protection scope
  *
  * [R] refcount on task
- * [P] task->pi_lock held
+ * [Pn] task->pi_lock held
  * [L] rtmutex->wait_lock held
  *
+ * Normal locking order:
+ *
+ *   rtmutex->wait_lock
+ *     task->pi_lock
+ *
  * Step	Description				Protected by
  *	function arguments:
  *	@task					[R]
@@ -588,27 +625,32 @@ static __always_inline struct rt_mutex_b
  *	again:
  *	  loop_sanity_check();
  *	retry:
- * [1]	  lock(task->pi_lock);			[R] acquire [P]
- * [2]	  waiter = task->pi_blocked_on;		[P]
- * [3]	  check_exit_conditions_1();		[P]
- * [4]	  lock = waiter->lock;			[P]
- * [5]	  if (!try_lock(lock->wait_lock)) {	[P] try to acquire [L]
- *	    unlock(task->pi_lock);		release [P]
+ * [1]	  lock(task->pi_lock);			[R] acquire [P1]
+ * [2]	  waiter = task->pi_blocked_on;		[P1]
+ * [3]	  check_exit_conditions_1();		[P1]
+ * [4]	  lock = waiter->lock;			[P1]
+ * [5]	  if (!try_lock(lock->wait_lock)) {	[P1] try to acquire [L]
+ *	    unlock(task->pi_lock);		release [P1]
  *	    goto retry;
  *	  }
- * [6]	  check_exit_conditions_2();		[P] + [L]
- * [7]	  requeue_lock_waiter(lock, waiter);	[P] + [L]
- * [8]	  unlock(task->pi_lock);		release [P]
+ * [6]	  check_exit_conditions_2();		[P1] + [L]
+ * [7]	  requeue_lock_waiter(lock, waiter);	[P1] + [L]
+ * [8]	  unlock(task->pi_lock);		release [P1]
  *	  put_task_struct(task);		release [R]
  * [9]	  check_exit_conditions_3();		[L]
  * [10]	  task = owner(lock);			[L]
  *	  get_task_struct(task);		[L] acquire [R]
- *	  lock(task->pi_lock);			[L] acquire [P]
- * [11]	  requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
- * [12]	  check_exit_conditions_4();		[P] + [L]
- * [13]	  unlock(task->pi_lock);		release [P]
+ *	  lock(task->pi_lock);			[L] acquire [P2]
+ * [11]	  requeue_pi_waiter(tsk, waiters(lock));[P2] + [L]
+ * [12]	  check_exit_conditions_4();		[P2] + [L]
+ * [13]	  unlock(task->pi_lock);		release [P2]
  *	  unlock(lock->wait_lock);		release [L]
  *	  goto again;
+ *
+ * Where P1 is the blocking task and P2 is the lock owner; going up one step
+ * the owner becomes the next blocked task etc..
+ *
+*
  */
 static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
 					      enum rtmutex_chainwalk chwalk,
@@ -756,7 +798,7 @@ static int __sched rt_mutex_adjust_prio_
 	 * enabled we continue, but stop the requeueing in the chain
 	 * walk.
 	 */
-	if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+	if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
 		if (!detect_deadlock)
 			goto out_unlock_pi;
 		else
@@ -764,13 +806,18 @@ static int __sched rt_mutex_adjust_prio_
 	}
 
 	/*
-	 * [4] Get the next lock
+	 * [4] Get the next lock; per holding task->pi_lock we can't unblock
+	 * and guarantee @lock's existence.
 	 */
 	lock = waiter->lock;
 	/*
 	 * [5] We need to trylock here as we are holding task->pi_lock,
 	 * which is the reverse lock order versus the other rtmutex
 	 * operations.
+	 *
+	 * Per the above, holding task->pi_lock guarantees lock exists, so
+	 * inverting this lock order is infeasible from a life-time
+	 * perspective.
 	 */
 	if (!raw_spin_trylock(&lock->wait_lock)) {
 		raw_spin_unlock_irq(&task->pi_lock);
@@ -874,17 +921,18 @@ static int __sched rt_mutex_adjust_prio_
 	 * or
 	 *
 	 *   DL CBS enforcement advancing the effective deadline.
-	 *
-	 * Even though pi_waiters also uses these fields, and that tree is only
-	 * updated in [11], we can do this here, since we hold [L], which
-	 * serializes all pi_waiters access and rb_erase() does not care about
-	 * the values of the node being removed.
 	 */
 	waiter_update_prio(waiter, task);
 
 	rt_mutex_enqueue(lock, waiter);
 
-	/* [8] Release the task */
+	/*
+	 * [8] Release the (blocking) task in preparation for
+	 * taking the owner task in [10].
+	 *
+	 * Since we hold lock->waiter_lock, task cannot unblock, even if we
+	 * release task->pi_lock.
+	 */
 	raw_spin_unlock(&task->pi_lock);
 	put_task_struct(task);
 
@@ -908,7 +956,12 @@ static int __sched rt_mutex_adjust_prio_
 		return 0;
 	}
 
-	/* [10] Grab the next task, i.e. the owner of @lock */
+	/*
+	 * [10] Grab the next task, i.e. the owner of @lock
+	 *
+	 * Per holding lock->wait_lock and checking for !owner above, there
+	 * must be an owner and it cannot go away.
+	 */
 	task = get_task_struct(rt_mutex_owner(lock));
 	raw_spin_lock(&task->pi_lock);
 
@@ -921,8 +974,9 @@ static int __sched rt_mutex_adjust_prio_
 		 * and adjust the priority of the owner.
 		 */
 		rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
+		waiter_clone_prio(waiter, task);
 		rt_mutex_enqueue_pi(task, waiter);
-		rt_mutex_adjust_prio(task);
+		rt_mutex_adjust_prio(lock, task);
 
 	} else if (prerequeue_top_waiter == waiter) {
 		/*
@@ -937,8 +991,9 @@ static int __sched rt_mutex_adjust_prio_
 		 */
 		rt_mutex_dequeue_pi(task, waiter);
 		waiter = rt_mutex_top_waiter(lock);
+		waiter_clone_prio(waiter, task);
 		rt_mutex_enqueue_pi(task, waiter);
-		rt_mutex_adjust_prio(task);
+		rt_mutex_adjust_prio(lock, task);
 	} else {
 		/*
 		 * Nothing changed. No need to do any priority
@@ -1154,6 +1209,7 @@ static int __sched task_blocks_on_rt_mut
 	waiter->task = task;
 	waiter->lock = lock;
 	waiter_update_prio(waiter, task);
+	waiter_clone_prio(waiter, task);
 
 	/* Get the top priority waiter on the lock */
 	if (rt_mutex_has_waiters(lock))
@@ -1187,7 +1243,7 @@ static int __sched task_blocks_on_rt_mut
 		rt_mutex_dequeue_pi(owner, top_waiter);
 		rt_mutex_enqueue_pi(owner, waiter);
 
-		rt_mutex_adjust_prio(owner);
+		rt_mutex_adjust_prio(lock, owner);
 		if (owner->pi_blocked_on)
 			chain_walk = 1;
 	} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -1234,6 +1290,8 @@ static void __sched mark_wakeup_next_wai
 {
 	struct rt_mutex_waiter *waiter;
 
+	lockdep_assert_held(&lock->wait_lock);
+
 	raw_spin_lock(&current->pi_lock);
 
 	waiter = rt_mutex_top_waiter(lock);
@@ -1246,7 +1304,7 @@ static void __sched mark_wakeup_next_wai
 	 * task unblocks.
 	 */
 	rt_mutex_dequeue_pi(current, waiter);
-	rt_mutex_adjust_prio(current);
+	rt_mutex_adjust_prio(lock, current);
 
 	/*
 	 * As we are waking up the top waiter, and the waiter stays
@@ -1482,7 +1540,7 @@ static void __sched remove_waiter(struct
 	if (rt_mutex_has_waiters(lock))
 		rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
 
-	rt_mutex_adjust_prio(owner);
+	rt_mutex_adjust_prio(lock, owner);
 
 	/* Store the lock on which owner is blocked or NULL */
 	next_lock = task_blocked_on_lock(owner);
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -459,7 +459,7 @@ void __sched rt_mutex_adjust_pi(struct t
 	raw_spin_lock_irqsave(&task->pi_lock, flags);
 
 	waiter = task->pi_blocked_on;
-	if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+	if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
 		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
 		return;
 	}
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -17,27 +17,44 @@
 #include <linux/rtmutex.h>
 #include <linux/sched/wake_q.h>
 
+
+/*
+ * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two
+ * separate trees and they need their own copy of the sort keys because of
+ * different locking requirements.
+ *
+ * @entry:		rbtree node to enqueue into the waiters tree
+ * @prio:		Priority of the waiter
+ * @deadline:		Deadline of the waiter if applicable
+ *
+ * See rt_waiter_node_less() and waiter_*_prio().
+ */
+struct rt_waiter_node {
+	struct rb_node	entry;
+	int		prio;
+	u64		deadline;
+};
+
 /*
  * This is the control structure for tasks blocked on a rt_mutex,
  * which is allocated on the kernel stack on of the blocked task.
  *
- * @tree_entry:		pi node to enqueue into the mutex waiters tree
- * @pi_tree_entry:	pi node to enqueue into the mutex owner waiters tree
+ * @tree:		node to enqueue into the mutex waiters tree
+ * @pi_tree:		node to enqueue into the mutex owner waiters tree
  * @task:		task reference to the blocked task
  * @lock:		Pointer to the rt_mutex on which the waiter blocks
  * @wake_state:		Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT)
- * @prio:		Priority of the waiter
- * @deadline:		Deadline of the waiter if applicable
  * @ww_ctx:		WW context pointer
+ *
+ * @tree is ordered by @lock->wait_lock
+ * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock
  */
 struct rt_mutex_waiter {
-	struct rb_node		tree_entry;
-	struct rb_node		pi_tree_entry;
+	struct rt_waiter_node	tree;
+	struct rt_waiter_node	pi_tree;
 	struct task_struct	*task;
 	struct rt_mutex_base	*lock;
 	unsigned int		wake_state;
-	int			prio;
-	u64			deadline;
 	struct ww_acquire_ctx	*ww_ctx;
 };
 
@@ -105,7 +122,7 @@ static inline bool rt_mutex_waiter_is_to
 {
 	struct rb_node *leftmost = rb_first_cached(&lock->waiters);
 
-	return rb_entry(leftmost, struct rt_mutex_waiter, tree_entry) == waiter;
+	return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter;
 }
 
 static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
@@ -113,8 +130,10 @@ static inline struct rt_mutex_waiter *rt
 	struct rb_node *leftmost = rb_first_cached(&lock->waiters);
 	struct rt_mutex_waiter *w = NULL;
 
+	lockdep_assert_held(&lock->wait_lock);
+
 	if (leftmost) {
-		w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry);
+		w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry);
 		BUG_ON(w->lock != lock);
 	}
 	return w;
@@ -127,8 +146,10 @@ static inline int task_has_pi_waiters(st
 
 static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p)
 {
+	lockdep_assert_held(&p->pi_lock);
+
 	return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter,
-			pi_tree_entry);
+			pi_tree.entry);
 }
 
 #define RT_MUTEX_HAS_WAITERS	1UL
@@ -190,8 +211,8 @@ static inline void debug_rt_mutex_free_w
 static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
 {
 	debug_rt_mutex_init_waiter(waiter);
-	RB_CLEAR_NODE(&waiter->pi_tree_entry);
-	RB_CLEAR_NODE(&waiter->tree_entry);
+	RB_CLEAR_NODE(&waiter->pi_tree.entry);
+	RB_CLEAR_NODE(&waiter->tree.entry);
 	waiter->wake_state = TASK_NORMAL;
 	waiter->task = NULL;
 }
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -96,25 +96,25 @@ __ww_waiter_first(struct rt_mutex *lock)
 	struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root);
 	if (!n)
 		return NULL;
-	return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+	return rb_entry(n, struct rt_mutex_waiter, tree.entry);
 }
 
 static inline struct rt_mutex_waiter *
 __ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w)
 {
-	struct rb_node *n = rb_next(&w->tree_entry);
+	struct rb_node *n = rb_next(&w->tree.entry);
 	if (!n)
 		return NULL;
-	return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+	return rb_entry(n, struct rt_mutex_waiter, tree.entry);
 }
 
 static inline struct rt_mutex_waiter *
 __ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w)
 {
-	struct rb_node *n = rb_prev(&w->tree_entry);
+	struct rb_node *n = rb_prev(&w->tree.entry);
 	if (!n)
 		return NULL;
-	return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+	return rb_entry(n, struct rt_mutex_waiter, tree.entry);
 }
 
 static inline struct rt_mutex_waiter *
@@ -123,7 +123,7 @@ __ww_waiter_last(struct rt_mutex *lock)
 	struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root);
 	if (!n)
 		return NULL;
-	return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+	return rb_entry(n, struct rt_mutex_waiter, tree.entry);
 }
 
 static inline void

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] locking/rtmutex: Fix task->pi_waiters integrity
  2023-07-07 16:10 [PATCH] locking/rtmutex: Fix task->pi_waiters integrity Peter Zijlstra
@ 2023-07-14 17:12 ` Henry Wu
  2023-07-26 14:12 ` Sebastian Andrzej Siewior
  1 sibling, 0 replies; 4+ messages in thread
From: Henry Wu @ 2023-07-14 17:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Thomas Gleixner, Will Deacon, Waiman Long,
	linux-kernel, Boqun Feng, Mike Galbraith,
	Sebastian Andrzej Siewior

[-- Attachment #1: Type: text/plain, Size: 8006 bytes --]

Hi, all.

Peter Zijlstra <peterz@infradead.org> 于2023年7月8日周六 00:10写道:
>
> Henry reported that rt_mutex_adjust_prio_check() has an ordering
> problem and puts the lie to the comment in [7]. Sharing the sort key
> between lock->waiters and owner->pi_waiters *does* create problems,
> since unlike what the comment claims, holding [L] is insufficient.
>
> Notably, consider:
>
>         A
>       /   \
>      M1   M2
>      |     |
>      B     C
>
> That is, task A owns both M1 and M2, B and C block on them. In this
> case a concurrent chain walk (B & C) will modify their resp. sort keys
> in [7] while holding M1->wait_lock and M2->wait_lock. So holding [L]
> is meaningless, they're different Ls.
>
> This then gives rise to a race condition between [7] and [11], where
> the requeue of pi_waiters will observe an inconsistent tree order.
>
>         B                               C
>
>   (holds M1->wait_lock,         (holds M2->wait_lock,
>    holds B->pi_lock)             holds A->pi_lock)
>
>   [7]
>   waiter_update_prio();
>   ...
>   [8]
>   raw_spin_unlock(B->pi_lock);
>   ...
>   [10]
>   raw_spin_lock(A->pi_lock);
>
>                                 [11]
>                                 rt_mutex_enqueue_pi();
>                                 // observes inconsistent A->pi_waiters
>                                 // tree order
>
> Fixing this means either extending the range of the owner lock from
> [10-13] to [6-13], with the immediate problem that this means [6-8]
> hold both blocked and owner locks, or duplicating the sort key.
>
> Since the locking in chain walk is horrible enough without having to
> consider pi_lock nesting rules, duplicate the sort key instead.
>
> By giving each tree their own sort key, the above race becomes
> harmless, if C sees B at the old location, then B will correct things
> (if they need correcting) when it walks up the chain and reaches A.
>
> Fixes: fb00aca47440 ("rtmutex: Turn the plist into an rb-tree")
> Reported-by: Henry Wu <triangletrap12@gmail.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/locking/rtmutex.c        |  170 ++++++++++++++++++++++++++--------------
>  kernel/locking/rtmutex_api.c    |    2
>  kernel/locking/rtmutex_common.h |   47 ++++++++---
>  kernel/locking/ww_mutex.h       |   12 +-
>  4 files changed, 155 insertions(+), 76 deletions(-)
>

Sorry for late reply due to overlooked your patch.

I tested your patch on a 6 core machine with kernel 6.5.0-rc1. It
works well and my test program can't find any race on patched kernel.

I tested it with non-debug kernel for almost 3 hours and tested with
debug kernel for 6 hours. I checked my dmesg and there was no
irregular log. I checked my test program on kernel 6.3.12 that it can
reproduce the race we discussed previously.

You can find my test program in attachment and below is test output.
You can add "Tested-by" in commit msg.

Thanks

Henry

For non-debug kernel:

$ sudo cat /sys/kernel/debug/sched/preempt
none voluntary (full)
$ sudo ./a.out
pid: 23845
Before: prio: 20
    PID     LWP TTY          TIME CMD
  23845   23845 pts/0    00:00:00 a.out
  23845   23846 pts/0    00:00:00 waiter-0
  23845   23847 pts/0    00:00:00 waiter-1
  23845   23848 pts/0    00:00:00 waiter-2
  23845   23849 pts/0    00:00:00 waiter-3
  23845   23850 pts/0    00:00:00 waiter-4
  23845   23851 pts/0    00:00:00 waiter-5
  23845   23852 pts/0    00:00:00 waiter-6
  23845   23853 pts/0    00:00:00 waiter-7
  23845   23854 pts/0    00:00:00 waiter-8
  23845   23855 pts/0    00:00:00 waiter-9
  23845   23856 pts/0    00:00:00 waiter-10
  23845   23857 pts/0    00:00:00 waiter-11
  23845   23858 pts/0    00:00:00 waiter-12
  23845   23859 pts/0    00:00:00 waiter-13
  23845   23860 pts/0    00:00:00 waiter-14
  23845   23861 pts/0    00:00:00 waiter-15
  23845   23862 pts/0    00:00:00 waiter-16
  23845   23863 pts/0    00:00:00 waiter-17
  23845   23864 pts/0    00:00:00 waiter-18
  23845   23865 pts/0    00:00:00 waiter-19
  23845   23866 pts/0    00:00:00 changer-0
  23845   23867 pts/0    00:00:00 changer-1
  23845   23868 pts/0    00:00:00 changer-2
  23845   23869 pts/0    00:00:00 changer-3
  23845   23870 pts/0    00:00:00 changer-4
  23845   23871 pts/0    00:00:00 changer-5
  23845   23872 pts/0    00:00:00 changer-6
  23845   23873 pts/0    00:00:00 changer-7
  23845   23874 pts/0    00:00:00 changer-8
  23845   23875 pts/0    00:00:00 changer-9
  23845   23876 pts/0    00:00:00 changer-10
  23845   23877 pts/0    00:00:00 changer-11
  23845   23878 pts/0    00:00:00 changer-12
  23845   23879 pts/0    00:00:00 changer-13
  23845   23880 pts/0    00:00:00 changer-14
  23845   23881 pts/0    00:00:00 changer-15
  23845   23882 pts/0    00:00:00 changer-16
  23845   23883 pts/0    00:00:00 changer-17
  23845   23884 pts/0    00:00:00 changer-18
  23845   23885 pts/0    00:00:00 changer-19
Prio: -21, elapsed: 2.0 minutes.
Prio: -21, elapsed: 4.0 minutes.
Prio: -21, elapsed: 6.0 minutes.
.............
Prio: -21, elapsed: 166.0 minutes.
Prio: -21, elapsed: 168.0 minutes.
Prio: -21, elapsed: 170.0 minutes.
Prio: -21, elapsed: 172.0 minutes.
Prio: -21, elapsed: 174.0 minutes.

For debug kernel:

$ uname -a
Linux fedora 6.5.0-0.rc1.20230713gteb26cbb1.213.rtmutex.fc38.x86_64+debug
#1 SMP PREEMPT_DYNAMIC Fri Jul 14 14:49:04 CST 2023 x86_64 GNU/Linux
$ sudo cat /sys/kernel/debug/sched/preempt
none voluntary (full)
$ gcc -Wall -g ./pi_650_rc1.c
$ sudo ./a.out
pid: 3141
Before: prio: 20
    PID     LWP TTY          TIME CMD
   3141    3141 pts/0    00:00:00 a.out
   3141    3142 pts/0    00:00:00 waiter-0
   3141    3143 pts/0    00:00:00 waiter-1
   3141    3144 pts/0    00:00:00 waiter-2
   3141    3145 pts/0    00:00:00 waiter-3
   3141    3146 pts/0    00:00:00 waiter-4
   3141    3147 pts/0    00:00:00 waiter-5
   3141    3148 pts/0    00:00:00 waiter-6
   3141    3149 pts/0    00:00:00 waiter-7
   3141    3150 pts/0    00:00:00 waiter-8
   3141    3151 pts/0    00:00:00 waiter-9
   3141    3152 pts/0    00:00:00 waiter-10
   3141    3153 pts/0    00:00:00 waiter-11
   3141    3154 pts/0    00:00:00 waiter-12
   3141    3155 pts/0    00:00:00 waiter-13
   3141    3156 pts/0    00:00:00 waiter-14
   3141    3157 pts/0    00:00:00 waiter-15
   3141    3158 pts/0    00:00:00 waiter-16
   3141    3159 pts/0    00:00:00 waiter-17
   3141    3160 pts/0    00:00:00 waiter-18
   3141    3161 pts/0    00:00:00 waiter-19
   3141    3162 pts/0    00:00:00 changer-0
   3141    3163 pts/0    00:00:00 changer-1
   3141    3164 pts/0    00:00:00 changer-2
   3141    3165 pts/0    00:00:00 changer-3
   3141    3166 pts/0    00:00:00 changer-4
   3141    3167 pts/0    00:00:00 changer-5
   3141    3168 pts/0    00:00:00 changer-6
   3141    3169 pts/0    00:00:00 changer-7
   3141    3170 pts/0    00:00:00 changer-8
   3141    3171 pts/0    00:00:00 changer-9
   3141    3172 pts/0    00:00:00 changer-10
   3141    3173 pts/0    00:00:00 changer-11
   3141    3174 pts/0    00:00:00 changer-12
   3141    3175 pts/0    00:00:00 changer-13
   3141    3176 pts/0    00:00:00 changer-14
   3141    3177 pts/0    00:00:00 changer-15
   3141    3178 pts/0    00:00:00 changer-16
   3141    3179 pts/0    00:00:00 changer-17
   3141    3180 pts/0    00:00:00 changer-18
   3141    3181 pts/0    00:00:00 changer-19
Prio: -21, elapsed: 2.0 minutes.
Prio: -21, elapsed: 4.0 minutes.
Prio: -21, elapsed: 6.0 minutes.
Prio: -21, elapsed: 8.0 minutes.
Prio: -21, elapsed: 10.0 minutes.
Prio: -21, elapsed: 12.0 minutes.
Prio: -21, elapsed: 14.0 minutes.
....................
Prio: -21, elapsed: 388.0 minutes.
Prio: -21, elapsed: 390.0 minutes.
Prio: -21, elapsed: 392.0 minutes.
Prio: -21, elapsed: 394.0 minutes.
Prio: -21, elapsed: 396.0 minutes.
^C
$

[-- Attachment #2: pi_650_rc1.c --]
[-- Type: text/x-csrc, Size: 5641 bytes --]

#define _GNU_SOURCE

#include <assert.h>
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <sched.h>
#include <errno.h>
#include <sys/syscall.h>

#define PCOUNT 20
#define EXPECTED_PRIO -21

// Copied from related header.
struct sched_attr {
	uint32_t size;

	uint32_t sched_policy;
	uint64_t sched_flags;

	/* SCHED_NORMAL, SCHED_BATCH */
	int32_t sched_nice;

	/* SCHED_FIFO, SCHED_RR */
	uint32_t sched_priority;

	/* SCHED_DEADLINE */
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;

	/* Utilization hints */
	uint32_t sched_util_min;
	uint32_t sched_util_max;
};

static pthread_mutex_t mutexes[PCOUNT];

static void init_mutex(pthread_mutex_t *mutex) {
	pthread_mutexattr_t attr;
	int ret;

	ret = pthread_mutexattr_init(&attr);
	assert(!ret);
	ret = pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	assert(!ret);

	ret = pthread_mutex_init(mutex, &attr);
	assert(!ret);

	pthread_mutexattr_destroy(&attr);
}

enum thread_type {
	WAITER,
	PRIO_CHANGER,
};

static struct thread_spec {
	enum thread_type type;
	int name_index;
	pthread_barrier_t *barrier;

	// Used by waiter.
	pthread_mutex_t *wait_mutex;
	pid_t pid_save;
	
	// Used by prio changer.
	pid_t *waiter_pid;
	uint32_t prio;
} thread_specs[PCOUNT * 2];

static pthread_t threads[PCOUNT * 2];
static int thread_count = 0;

static int barrier_wait(pthread_barrier_t *barrier) {
	if (!barrier) {
		return 0;
	}

	int ret = pthread_barrier_wait(barrier);
	assert(!ret || ret == PTHREAD_BARRIER_SERIAL_THREAD);

	return ret;
}

static int sched_getattr(pid_t pid, struct sched_attr *attr, int flags) {
	return syscall(SYS_sched_getattr, pid, attr, sizeof(*attr), flags);
}

static int sched_setattr(pid_t pid, struct sched_attr *attr, int flags) {
	return syscall(SYS_sched_setattr, pid, attr, flags);
}

static void *prio_change_loop(struct thread_spec *spec) {
	struct sched_attr attr;
	int ret;

	for (;;) {
		barrier_wait(spec->barrier);

		ret = sched_getattr(*spec->waiter_pid, &attr, 0);
		if (ret < 0) {
			perror("sched_getattr");
			assert(0);
		}
		
		attr.sched_policy = SCHED_RR;
		attr.sched_priority = spec->prio;
	
		ret = sched_setattr(*spec->waiter_pid, &attr, 0);
		if (ret < 0) {
			perror("sched_setattr");
			exit(1);
		}

		barrier_wait(spec->barrier);
	}
}

static void *thread(void *arg) {
	struct thread_spec *spec = arg;
	char name[64];
	int ret;
	
	snprintf(name, sizeof(name), "%s-%d", 
			spec->type == WAITER ? "waiter" : "changer", 
			spec->name_index);
	ret = pthread_setname_np(pthread_self(), name);
	assert(!ret);

	switch (spec->type) {
	case WAITER:
		spec->pid_save = gettid();

		barrier_wait(spec->barrier);

		ret = pthread_mutex_lock(spec->wait_mutex);
		assert(!ret);
		break;
	case PRIO_CHANGER:
		prio_change_loop(spec);
		break;
	default:
		assert(0);
		break;
	}

	return NULL;
}

static void create_thread(struct thread_spec *spec) {
	int ret;

	ret = pthread_create(&threads[thread_count++], NULL, thread, spec);
	assert(!ret);
}

static int get_prio(int need_print) {
	FILE *file = fopen("/proc/self/stat", "r");
	assert(file);

	char comm[64];
	int tmp, prio, ret;
	ret = fscanf(file, "%d %s %c %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d",
			&tmp, comm, comm, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp,
			&tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &tmp, &prio);
	assert(ret == 18);

	if (need_print) {
		printf("prio: %d\n", prio);
	}

	fclose(file);

	return prio;
}

static void print_threads(pid_t process_pid) {
	char cmd[128];
	snprintf(cmd, sizeof(cmd), "ps -L -p %d", process_pid);
	system(cmd);
}

int main(void) {
	pid_t pid;
	int ret;

	pid = getpid();
	printf("pid: %d\n", pid);

	for (int i = 0; i < sizeof(mutexes) / sizeof(mutexes[0]); ++i) {
		init_mutex(&mutexes[i]);
		ret = pthread_mutex_lock(&mutexes[i]);
		assert(!ret);
	}

	pthread_barrier_t barrier;
	ret = pthread_barrier_init(&barrier, NULL, PCOUNT + 1);
	assert(!ret);

	for (int i = 0; i < PCOUNT; ++i) {
		struct thread_spec *spec = &thread_specs[i];
		
		spec->type = WAITER;
		spec->name_index = i;
		spec->wait_mutex = &mutexes[i];
		spec->barrier = &barrier;

		create_thread(spec);
	}

	// Wait for filling of pid_save.
	barrier_wait(&barrier);

	for (int i = 0; i < PCOUNT; ++i) {
		struct thread_spec *spec = &thread_specs[i + PCOUNT];

		spec->type = PRIO_CHANGER; 
		spec->name_index = i;
		spec->prio = 99;
		spec->barrier = &barrier;
		spec->waiter_pid = &thread_specs[i].pid_save;

		create_thread(spec);
	}

	printf("Before: ");
	get_prio(1);
	print_threads(pid);

	time_t ts, start_ts;
	srandom(time(&ts));
	start_ts = ts;
	int iter = 0;
	for (;;) {
		++iter;
		for (int i = 0; i < PCOUNT; ++i) {
			thread_specs[i + PCOUNT].prio = (iter % 2) ? i + 1 : PCOUNT - i;
		}

		for (int i = 0; i < PCOUNT; ++i) {
			int pos = random() % PCOUNT;
			int tmp = thread_specs[pos + PCOUNT].prio;
			thread_specs[pos + PCOUNT].prio = thread_specs[PCOUNT].prio;
			thread_specs[PCOUNT].prio = tmp;
		}

		barrier_wait(&barrier);
		barrier_wait(&barrier);

		//system("echo iteration > /sys/kernel/debug/tracing/trace_marker");

		for (int try = 0; ; ++try) {
			int prio = get_prio(0);
			time_t now = time(NULL);
			if (now - ts >= 120) {
				printf("Prio: %d, elapsed: %.1f minutes.\n", prio, (double)(now - start_ts) / 60);
				ts = now;
			}

			if (prio == EXPECTED_PRIO) {
				// Try a new iteration.
				break;
			}

			if (try >= 10) {
				print_threads(pid);
				printf("found race, hang...\n");
				while (1) {
					sleep(3600);
				}
			}
		}
	}

	for (int i = 0; i < thread_count; ++i) {
		pthread_join(threads[i], NULL);
	}

	return 0;
}


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] locking/rtmutex: Fix task->pi_waiters integrity
  2023-07-07 16:10 [PATCH] locking/rtmutex: Fix task->pi_waiters integrity Peter Zijlstra
  2023-07-14 17:12 ` Henry Wu
@ 2023-07-26 14:12 ` Sebastian Andrzej Siewior
  2023-07-26 14:40   ` Peter Zijlstra
  1 sibling, 1 reply; 4+ messages in thread
From: Sebastian Andrzej Siewior @ 2023-07-26 14:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: triangletrap12, Ingo Molnar, Thomas Gleixner, Will Deacon,
	Waiman Long, linux-kernel, Boqun Feng, Mike Galbraith

On 2023-07-07 18:10:52 [+0200], Peter Zijlstra wrote:
> 
> Henry reported that rt_mutex_adjust_prio_check() has an ordering

I'm going to apply it to the RT tree so it does not get lost :)

Sebastian

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] locking/rtmutex: Fix task->pi_waiters integrity
  2023-07-26 14:12 ` Sebastian Andrzej Siewior
@ 2023-07-26 14:40   ` Peter Zijlstra
  0 siblings, 0 replies; 4+ messages in thread
From: Peter Zijlstra @ 2023-07-26 14:40 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: triangletrap12, Ingo Molnar, Thomas Gleixner, Will Deacon,
	Waiman Long, linux-kernel, Boqun Feng, Mike Galbraith

On Wed, Jul 26, 2023 at 04:12:53PM +0200, Sebastian Andrzej Siewior wrote:
> On 2023-07-07 18:10:52 [+0200], Peter Zijlstra wrote:
> > 
> > Henry reported that rt_mutex_adjust_prio_check() has an ordering
> 
> I'm going to apply it to the RT tree so it does not get lost :)

It sits in tip/locking/urgent and should make it's way to Linus
soon-ish.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-07-26 19:54 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-07-07 16:10 [PATCH] locking/rtmutex: Fix task->pi_waiters integrity Peter Zijlstra
2023-07-14 17:12 ` Henry Wu
2023-07-26 14:12 ` Sebastian Andrzej Siewior
2023-07-26 14:40   ` Peter Zijlstra

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.