From: gregkh@linuxfoundation.org
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
stable@vger.kernel.org,
"Peter Zijlstra (Intel)" <peterz@infradead.org>,
juri.lelli@arm.com, bigeasy@linutronix.de, xlpang@redhat.com,
rostedt@goodmis.org, mathieu.desnoyers@efficios.com,
jdesfossez@efficios.com, dvhart@infradead.org,
bristot@redhat.com, Thomas Gleixner <tglx@linutronix.de>,
Lee Jones <lee.jones@linaro.org>,
Zheng Yejian <zhengyejian1@huawei.com>
Subject: [PATCH 4.4 11/75] futex: Change locking rules
Date: Mon, 15 Mar 2021 14:51:25 +0100 [thread overview]
Message-ID: <20210315135208.632919306@linuxfoundation.org> (raw)
In-Reply-To: <20210315135208.252034256@linuxfoundation.org>
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
From: Peter Zijlstra <peterz@infradead.org>
commit 734009e96d1983ad739e5b656e03430b3660c913 upstream.
This patch comes directly from an origin patch (commit
dc3f2ff11740159080f2e8e359ae0ab57c8e74b6) in v4.9.
Currently futex-pi relies on hb->lock to serialize everything. But hb->lock
creates another set of problems, especially priority inversions on RT where
hb->lock becomes a rt_mutex itself.
The rt_mutex::wait_lock is the most obvious protection for keeping the
futex user space value and the kernel internal pi_state in sync.
Rework and document the locking so rt_mutex::wait_lock is held accross all
operations which modify the user space value and the pi state.
This allows to invoke rt_mutex_unlock() (including deboost) without holding
hb->lock as a next step.
Nothing yet relies on the new locking rules.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.751993333@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[Lee: Back-ported in support of a previous futex back-port attempt]
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
kernel/futex.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 112 insertions(+), 26 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1016,6 +1016,39 @@ static void exit_pi_state_list(struct ta
* [10] There is no transient state which leaves owner and user space
* TID out of sync. Except one error case where the kernel is denied
* write access to the user address, see fixup_pi_state_owner().
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ * hb -> futex_q, relation
+ * futex_q -> pi_state, relation
+ *
+ * (cannot be raw because hb can contain arbitrary amount
+ * of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ * {uval, pi_state}
+ *
+ * (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ * p->pi_state_list -> pi_state->list, relation
+ *
+ * pi_state->refcount:
+ *
+ * pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ * hb->lock
+ * pi_mutex->wait_lock
+ * p->pi_lock
+ *
*/
/*
@@ -1023,10 +1056,12 @@ static void exit_pi_state_list(struct ta
* the pi_state against the user space value. If correct, attach to
* it.
*/
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
struct futex_pi_state **ps)
{
pid_t pid = uval & FUTEX_TID_MASK;
+ int ret, uval2;
/*
* Userspace might have messed up non-PI and PI futexes [3]
@@ -1034,9 +1069,34 @@ static int attach_to_pi_state(u32 uval,
if (unlikely(!pi_state))
return -EINVAL;
+ /*
+ * We get here with hb->lock held, and having found a
+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+ * which in turn means that futex_lock_pi() still has a reference on
+ * our pi_state.
+ */
WARN_ON(!atomic_read(&pi_state->refcount));
/*
+ * Now that we have a pi_state, we can acquire wait_lock
+ * and do the state validation.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Since {uval, pi_state} is serialized by wait_lock, and our current
+ * uval was read without holding it, it can have changed. Verify it
+ * still is what we expect it to be, otherwise retry the entire
+ * operation.
+ */
+ if (get_futex_value_locked(&uval2, uaddr))
+ goto out_efault;
+
+ if (uval != uval2)
+ goto out_eagain;
+
+ /*
* Handle the owner died case:
*/
if (uval & FUTEX_OWNER_DIED) {
@@ -1051,11 +1111,11 @@ static int attach_to_pi_state(u32 uval,
* is not 0. Inconsistent state. [5]
*/
if (pid)
- return -EINVAL;
+ goto out_einval;
/*
* Take a ref on the state and return success. [4]
*/
- goto out_state;
+ goto out_attach;
}
/*
@@ -1067,14 +1127,14 @@ static int attach_to_pi_state(u32 uval,
* Take a ref on the state and return success. [6]
*/
if (!pid)
- goto out_state;
+ goto out_attach;
} else {
/*
* If the owner died bit is not set, then the pi_state
* must have an owner. [7]
*/
if (!pi_state->owner)
- return -EINVAL;
+ goto out_einval;
}
/*
@@ -1083,11 +1143,29 @@ static int attach_to_pi_state(u32 uval,
* user space TID. [9/10]
*/
if (pid != task_pid_vnr(pi_state->owner))
- return -EINVAL;
-out_state:
+ goto out_einval;
+
+out_attach:
atomic_inc(&pi_state->refcount);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
*ps = pi_state;
return 0;
+
+out_einval:
+ ret = -EINVAL;
+ goto out_error;
+
+out_eagain:
+ ret = -EAGAIN;
+ goto out_error;
+
+out_efault:
+ ret = -EFAULT;
+ goto out_error;
+
+out_error:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
/**
@@ -1180,6 +1258,9 @@ static int attach_to_pi_owner(u32 uval,
/*
* No existing pi state. First waiter. [2]
+ *
+ * This creates pi_state, we have hb->lock held, this means nothing can
+ * observe this state, wait_lock is irrelevant.
*/
pi_state = alloc_pi_state();
@@ -1204,7 +1285,8 @@ static int attach_to_pi_owner(u32 uval,
return 0;
}
-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_hash_bucket *hb,
union futex_key *key, struct futex_pi_state **ps,
struct task_struct **exiting)
{
@@ -1215,7 +1297,7 @@ static int lookup_pi_state(u32 uval, str
* attach to the pi_state when the validation succeeds.
*/
if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ return attach_to_pi_state(uaddr, uval, match->pi_state, ps);
/*
* We are the first waiter - try to look up the owner based on
@@ -1234,7 +1316,7 @@ static int lock_pi_update_atomic(u32 __u
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
- /*If user space value changed, let the caller retry */
+ /* If user space value changed, let the caller retry */
return curval != uval ? -EAGAIN : 0;
}
@@ -1298,7 +1380,7 @@ static int futex_lock_pi_atomic(u32 __us
*/
match = futex_top_waiter(hb, key);
if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ return attach_to_pi_state(uaddr, uval, match->pi_state, ps);
/*
* No waiter and user TID is 0. We are here because the
@@ -1438,6 +1520,7 @@ static int wake_futex_pi(u32 __user *uad
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
ret = -EFAULT;
+
} else if (curval != uval) {
/*
* If a unconditional UNLOCK_PI operation (user space did not
@@ -1971,7 +2054,7 @@ retry_private:
* rereading and handing potential crap to
* lookup_pi_state.
*/
- ret = lookup_pi_state(ret, hb2, &key2,
+ ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
&pi_state, &exiting);
}
@@ -2249,7 +2332,6 @@ static int __fixup_pi_state_owner(u32 __
int err = 0;
oldowner = pi_state->owner;
-
/*
* We are here because either:
*
@@ -2268,11 +2350,10 @@ static int __fixup_pi_state_owner(u32 __
* because we can fault here. Imagine swapped out pages or a fork
* that marked all the anonymous memory readonly for cow.
*
- * Modifying pi_state _before_ the user space value would
- * leave the pi_state in an inconsistent state when we fault
- * here, because we need to drop the hash bucket lock to
- * handle the fault. This might be observed in the PID check
- * in lookup_pi_state.
+ * Modifying pi_state _before_ the user space value would leave the
+ * pi_state in an inconsistent state when we fault here, because we
+ * need to drop the locks to handle the fault. This might be observed
+ * in the PID check in lookup_pi_state.
*/
retry:
if (!argowner) {
@@ -2333,21 +2414,26 @@ retry:
return argowner == current;
/*
- * To handle the page fault we need to drop the hash bucket
- * lock here. That gives the other task (either the highest priority
- * waiter itself or the task which stole the rtmutex) the
- * chance to try the fixup of the pi_state. So once we are
- * back from handling the fault we need to check the pi_state
- * after reacquiring the hash bucket lock and before trying to
- * do another fixup. When the fixup has been done already we
- * simply return.
+ * To handle the page fault we need to drop the locks here. That gives
+ * the other task (either the highest priority waiter itself or the
+ * task which stole the rtmutex) the chance to try the fixup of the
+ * pi_state. So once we are back from handling the fault we need to
+ * check the pi_state after reacquiring the locks and before trying to
+ * do another fixup. When the fixup has been done already we simply
+ * return.
+ *
+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+ * drop hb->lock since the caller owns the hb -> futex_q relation.
+ * Dropping the pi_mutex->wait_lock requires the state revalidate.
*/
handle_fault:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(q->lock_ptr);
err = fault_in_user_writeable(uaddr);
spin_lock(q->lock_ptr);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/*
* Check if someone else fixed it for us:
next prev parent reply other threads:[~2021-03-15 13:53 UTC|newest]
Thread overview: 80+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-03-15 13:51 [PATCH 4.4 00/75] 4.4.262-rc1 review gregkh
2021-03-15 13:51 ` [PATCH 4.4 01/75] uapi: nfnetlink_cthelper.h: fix userspace compilation error gregkh
2021-03-15 13:51 ` [PATCH 4.4 02/75] ath9k: fix transmitting to stations in dynamic SMPS mode gregkh
2021-03-15 13:51 ` [PATCH 4.4 03/75] net: Fix gro aggregation for udp encaps with zero csum gregkh
2021-03-15 13:51 ` [PATCH 4.4 04/75] can: skb: can_skb_set_owner(): fix ref counting if socket was closed before setting skb ownership gregkh
2021-03-15 13:51 ` [PATCH 4.4 05/75] can: flexcan: assert FRZ bit in flexcan_chip_freeze() gregkh
2021-03-15 13:51 ` [PATCH 4.4 06/75] can: flexcan: enable RX FIFO after FRZ/HALT valid gregkh
2021-03-15 13:51 ` [PATCH 4.4 07/75] netfilter: x_tables: gpf inside xt_find_revision() gregkh
2021-03-15 13:51 ` [PATCH 4.4 08/75] cifs: return proper error code in statfs(2) gregkh
2021-03-15 13:51 ` [PATCH 4.4 09/75] floppy: fix lock_fdc() signal handling gregkh
2021-03-15 13:51 ` [PATCH 4.4 10/75] Revert "mm, slub: consider rest of partial list if acquire_slab() fails" gregkh
2021-03-15 13:51 ` gregkh [this message]
2021-03-15 13:51 ` [PATCH 4.4 12/75] futex: Cure exit race gregkh
2021-03-15 13:51 ` [PATCH 4.4 13/75] futex: fix dead code in attach_to_pi_owner() gregkh
2021-03-15 13:51 ` [PATCH 4.4 14/75] net/mlx4_en: update moderation when config reset gregkh
2021-03-15 13:51 ` [PATCH 4.4 15/75] net: lapbether: Remove netif_start_queue / netif_stop_queue gregkh
2021-03-15 13:51 ` [PATCH 4.4 16/75] net: davicom: Fix regulator not turned off on failed probe gregkh
2021-03-15 13:51 ` [PATCH 4.4 17/75] net: davicom: Fix regulator not turned off on driver removal gregkh
2021-03-15 13:51 ` [PATCH 4.4 18/75] media: usbtv: Fix deadlock on suspend gregkh
2021-03-15 13:51 ` [PATCH 4.4 19/75] mmc: mxs-mmc: Fix a resource leak in an error handling path in mxs_mmc_probe() gregkh
2021-03-15 13:51 ` [PATCH 4.4 20/75] mmc: mediatek: fix race condition between msdc_request_timeout and irq gregkh
2021-03-15 13:51 ` [PATCH 4.4 21/75] powerpc/perf: Record counter overflow always if SAMPLE_IP is unset gregkh
2021-03-15 13:51 ` [PATCH 4.4 22/75] PCI: xgene-msi: Fix race in installing chained irq handler gregkh
2021-03-15 13:51 ` [PATCH 4.4 23/75] s390/smp: __smp_rescan_cpus() - move cpumask away from stack gregkh
2021-03-15 13:51 ` [PATCH 4.4 24/75] scsi: libiscsi: Fix iscsi_prep_scsi_cmd_pdu() error handling gregkh
2021-03-15 13:51 ` [PATCH 4.4 25/75] ALSA: hda/hdmi: Cancel pending works before suspend gregkh
2021-03-15 13:51 ` [PATCH 4.4 26/75] ALSA: hda: Avoid spurious unsol event handling during S3/S4 gregkh
2021-03-15 13:51 ` [PATCH 4.4 27/75] ALSA: usb-audio: Fix "cannot get freq eq" errors on Dell AE515 sound bar gregkh
2021-03-15 13:51 ` [PATCH 4.4 28/75] s390/dasd: fix hanging DASD driver unbind gregkh
2021-03-15 13:51 ` [PATCH 4.4 29/75] mmc: core: Fix partition switch time for eMMC gregkh
2021-03-15 13:51 ` [PATCH 4.4 30/75] scripts/recordmcount.{c,pl}: support -ffunction-sections .text.* section names gregkh
2021-03-15 13:51 ` [PATCH 4.4 31/75] libertas: fix a potential NULL pointer dereference gregkh
2021-03-15 13:51 ` [PATCH 4.4 32/75] Goodix Fingerprint device is not a modem gregkh
2021-03-15 13:51 ` [PATCH 4.4 33/75] usb: gadget: f_uac2: always increase endpoint max_packet_size by one audio slot gregkh
2021-03-15 13:51 ` [PATCH 4.4 34/75] usb: renesas_usbhs: Clear PIPECFG for re-enabling pipe with other EPNUM gregkh
2021-03-15 13:51 ` [PATCH 4.4 35/75] xhci: Improve detection of device initiated wake signal gregkh
2021-03-15 13:51 ` [PATCH 4.4 36/75] USB: serial: io_edgeport: fix memory leak in edge_startup gregkh
2021-03-15 13:51 ` [PATCH 4.4 37/75] USB: serial: ch341: add new Product ID gregkh
2021-03-15 13:51 ` [PATCH 4.4 38/75] USB: serial: cp210x: add ID for Acuity Brands nLight Air Adapter gregkh
2021-03-15 13:51 ` [PATCH 4.4 39/75] USB: serial: cp210x: add some more GE USB IDs gregkh
2021-03-15 13:51 ` [PATCH 4.4 40/75] usbip: fix stub_dev to check for stream socket gregkh
2021-03-15 13:51 ` [PATCH 4.4 41/75] usbip: fix vhci_hcd " gregkh
2021-03-15 13:51 ` [PATCH 4.4 42/75] usbip: fix stub_dev usbip_sockfd_store() races leading to gpf gregkh
2021-03-15 13:51 ` [PATCH 4.4 43/75] staging: rtl8192u: fix ->ssid overflow in r8192_wx_set_scan() gregkh
2021-03-15 13:51 ` [PATCH 4.4 44/75] staging: rtl8188eu: prevent ->ssid overflow in rtw_wx_set_scan() gregkh
2021-03-15 13:51 ` [PATCH 4.4 45/75] staging: rtl8712: unterminated string leads to read overflow gregkh
2021-03-15 13:52 ` [PATCH 4.4 46/75] staging: rtl8188eu: fix potential memory corruption in rtw_check_beacon_data() gregkh
2021-03-15 13:52 ` [PATCH 4.4 47/75] staging: rtl8712: Fix possible buffer overflow in r8712_sitesurvey_cmd gregkh
2021-03-15 13:52 ` [PATCH 4.4 48/75] staging: rtl8192e: Fix possible buffer overflow in _rtl92e_wx_set_scan gregkh
2021-03-15 13:52 ` [PATCH 4.4 49/75] staging: comedi: addi_apci_1032: Fix endian problem for COS sample gregkh
2021-03-15 13:52 ` [PATCH 4.4 50/75] staging: comedi: addi_apci_1500: Fix endian problem for command sample gregkh
2021-03-15 13:52 ` [PATCH 4.4 51/75] staging: comedi: adv_pci1710: Fix endian problem for AI command data gregkh
2021-03-15 13:52 ` [PATCH 4.4 52/75] staging: comedi: das6402: " gregkh
2021-03-15 13:52 ` [PATCH 4.4 53/75] staging: comedi: das800: " gregkh
2021-03-15 13:52 ` [PATCH 4.4 54/75] staging: comedi: dmm32at: " gregkh
2021-03-15 13:52 ` [PATCH 4.4 55/75] staging: comedi: me4000: " gregkh
2021-03-15 13:52 ` [PATCH 4.4 56/75] staging: comedi: pcl711: " gregkh
2021-03-15 13:52 ` [PATCH 4.4 57/75] staging: comedi: pcl818: " gregkh
2021-03-15 13:52 ` [PATCH 4.4 58/75] NFSv4.2: fix return value of _nfs4_get_security_label() gregkh
2021-03-15 13:52 ` [PATCH 4.4 59/75] block: rsxx: fix error return code of rsxx_pci_probe() gregkh
2021-03-15 13:52 ` [PATCH 4.4 60/75] prctl: fix PR_SET_MM_AUXV kernel stack leak gregkh
2021-03-15 13:52 ` [PATCH 4.4 61/75] alpha: add $(src)/ rather than $(obj)/ to make source file path gregkh
2021-03-15 13:52 ` [PATCH 4.4 62/75] alpha: merge build rules of division routines gregkh
2021-03-15 13:52 ` [PATCH 4.4 63/75] alpha: make short build log available for " gregkh
2021-03-15 13:52 ` [PATCH 4.4 64/75] alpha: Package string routines together gregkh
2021-03-15 13:52 ` [PATCH 4.4 65/75] alpha: move exports to actual definitions gregkh
2021-03-15 13:52 ` [PATCH 4.4 66/75] alpha: get rid of tail-zeroing in __copy_user() gregkh
2021-03-15 13:52 ` [PATCH 4.4 67/75] alpha: switch __copy_user() and __do_clean_user() to normal calling conventions gregkh
2021-03-15 13:52 ` [PATCH 4.4 68/75] powerpc/64s: Fix instruction encoding for lis in ppc_function_entry() gregkh
2021-03-15 13:52 ` [PATCH 4.4 69/75] media: hdpvr: Fix an error handling path in hdpvr_probe() gregkh
2021-03-15 13:52 ` [PATCH 4.4 70/75] KVM: arm64: Fix exclusive limit for IPA size gregkh
2021-03-15 13:52 ` [PATCH 4.4 71/75] iio: imu: adis16400: release allocated memory on failure gregkh
2021-03-15 13:52 ` [PATCH 4.4 72/75] iio: imu: adis16400: fix memory leak gregkh
2021-03-15 13:52 ` [PATCH 4.4 73/75] xen/events: reset affinity of 2-level event when tearing it down gregkh
2021-03-15 13:52 ` [PATCH 4.4 74/75] xen/events: dont unmask an event channel when an eoi is pending gregkh
2021-03-15 13:52 ` [PATCH 4.4 75/75] xen/events: avoid handling the same event on two cpus at the same time gregkh
2021-03-15 21:05 ` [PATCH 4.4 00/75] 4.4.262-rc1 review Pavel Machek
2021-03-15 21:29 ` Guenter Roeck
2021-03-15 22:57 ` Jason Self
2021-03-16 12:07 ` Naresh Kamboju
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210315135208.632919306@linuxfoundation.org \
--to=gregkh@linuxfoundation.org \
--cc=bigeasy@linutronix.de \
--cc=bristot@redhat.com \
--cc=dvhart@infradead.org \
--cc=jdesfossez@efficios.com \
--cc=juri.lelli@arm.com \
--cc=lee.jones@linaro.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mathieu.desnoyers@efficios.com \
--cc=peterz@infradead.org \
--cc=rostedt@goodmis.org \
--cc=stable@vger.kernel.org \
--cc=tglx@linutronix.de \
--cc=xlpang@redhat.com \
--cc=zhengyejian1@huawei.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox