From: gregkh@linuxfoundation.org
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
stable@vger.kernel.org,
"Peter Zijlstra (Intel)" <peterz@infradead.org>,
juri.lelli@arm.com, bigeasy@linutronix.de, xlpang@redhat.com,
rostedt@goodmis.org, mathieu.desnoyers@efficios.com,
jdesfossez@efficios.com, dvhart@infradead.org,
bristot@redhat.com, Thomas Gleixner <tglx@linutronix.de>,
Lee Jones <lee.jones@linaro.org>,
Zheng Yejian <zhengyejian1@huawei.com>
Subject: [PATCH 4.4 11/75] futex: Change locking rules
Date: Mon, 15 Mar 2021 14:51:25 +0100 [thread overview]
Message-ID: <20210315135208.632919306@linuxfoundation.org> (raw)
In-Reply-To: <20210315135208.252034256@linuxfoundation.org>
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
From: Peter Zijlstra <peterz@infradead.org>
commit 734009e96d1983ad739e5b656e03430b3660c913 upstream.
This patch comes directly from the original patch (commit
dc3f2ff11740159080f2e8e359ae0ab57c8e74b6) in v4.9.
Currently futex-pi relies on hb->lock to serialize everything. But hb->lock
creates another set of problems, especially priority inversions on RT where
hb->lock becomes a rt_mutex itself.
The rt_mutex::wait_lock is the most obvious protection for keeping the
futex user space value and the kernel internal pi_state in sync.
Rework and document the locking so rt_mutex::wait_lock is held across all
operations which modify the user space value and the pi state.
This allows invoking rt_mutex_unlock() (including deboost) without holding
hb->lock as a next step.
Nothing yet relies on the new locking rules.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.751993333@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[Lee: Back-ported in support of a previous futex back-port attempt]
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
kernel/futex.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 112 insertions(+), 26 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1016,6 +1016,39 @@ static void exit_pi_state_list(struct ta
* [10] There is no transient state which leaves owner and user space
* TID out of sync. Except one error case where the kernel is denied
* write access to the user address, see fixup_pi_state_owner().
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ * hb -> futex_q, relation
+ * futex_q -> pi_state, relation
+ *
+ * (cannot be raw because hb can contain arbitrary amount
+ * of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ * {uval, pi_state}
+ *
+ * (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ * p->pi_state_list -> pi_state->list, relation
+ *
+ * pi_state->refcount:
+ *
+ * pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ * hb->lock
+ * pi_mutex->wait_lock
+ * p->pi_lock
+ *
*/
/*
@@ -1023,10 +1056,12 @@ static void exit_pi_state_list(struct ta
* the pi_state against the user space value. If correct, attach to
* it.
*/
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
struct futex_pi_state **ps)
{
pid_t pid = uval & FUTEX_TID_MASK;
+ int ret, uval2;
/*
* Userspace might have messed up non-PI and PI futexes [3]
@@ -1034,9 +1069,34 @@ static int attach_to_pi_state(u32 uval,
if (unlikely(!pi_state))
return -EINVAL;
+ /*
+ * We get here with hb->lock held, and having found a
+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+ * which in turn means that futex_lock_pi() still has a reference on
+ * our pi_state.
+ */
WARN_ON(!atomic_read(&pi_state->refcount));
/*
+ * Now that we have a pi_state, we can acquire wait_lock
+ * and do the state validation.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Since {uval, pi_state} is serialized by wait_lock, and our current
+ * uval was read without holding it, it can have changed. Verify it
+ * still is what we expect it to be, otherwise retry the entire
+ * operation.
+ */
+ if (get_futex_value_locked(&uval2, uaddr))
+ goto out_efault;
+
+ if (uval != uval2)
+ goto out_eagain;
+
+ /*
* Handle the owner died case:
*/
if (uval & FUTEX_OWNER_DIED) {
@@ -1051,11 +1111,11 @@ static int attach_to_pi_state(u32 uval,
* is not 0. Inconsistent state. [5]
*/
if (pid)
- return -EINVAL;
+ goto out_einval;
/*
* Take a ref on the state and return success. [4]
*/
- goto out_state;
+ goto out_attach;
}
/*
@@ -1067,14 +1127,14 @@ static int attach_to_pi_state(u32 uval,
* Take a ref on the state and return success. [6]
*/
if (!pid)
- goto out_state;
+ goto out_attach;
} else {
/*
* If the owner died bit is not set, then the pi_state
* must have an owner. [7]
*/
if (!pi_state->owner)
- return -EINVAL;
+ goto out_einval;
}
/*
@@ -1083,11 +1143,29 @@ static int attach_to_pi_state(u32 uval,
* user space TID. [9/10]
*/
if (pid != task_pid_vnr(pi_state->owner))
- return -EINVAL;
-out_state:
+ goto out_einval;
+
+out_attach:
atomic_inc(&pi_state->refcount);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
*ps = pi_state;
return 0;
+
+out_einval:
+ ret = -EINVAL;
+ goto out_error;
+
+out_eagain:
+ ret = -EAGAIN;
+ goto out_error;
+
+out_efault:
+ ret = -EFAULT;
+ goto out_error;
+
+out_error:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
/**
@@ -1180,6 +1258,9 @@ static int attach_to_pi_owner(u32 uval,
/*
* No existing pi state. First waiter. [2]
+ *
+ * This creates pi_state, we have hb->lock held, this means nothing can
+ * observe this state, wait_lock is irrelevant.
*/
pi_state = alloc_pi_state();
@@ -1204,7 +1285,8 @@ static int attach_to_pi_owner(u32 uval,
return 0;
}
-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_hash_bucket *hb,
union futex_key *key, struct futex_pi_state **ps,
struct task_struct **exiting)
{
@@ -1215,7 +1297,7 @@ static int lookup_pi_state(u32 uval, str
* attach to the pi_state when the validation succeeds.
*/
if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ return attach_to_pi_state(uaddr, uval, match->pi_state, ps);
/*
* We are the first waiter - try to look up the owner based on
@@ -1234,7 +1316,7 @@ static int lock_pi_update_atomic(u32 __u
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
- /*If user space value changed, let the caller retry */
+ /* If user space value changed, let the caller retry */
return curval != uval ? -EAGAIN : 0;
}
@@ -1298,7 +1380,7 @@ static int futex_lock_pi_atomic(u32 __us
*/
match = futex_top_waiter(hb, key);
if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ return attach_to_pi_state(uaddr, uval, match->pi_state, ps);
/*
* No waiter and user TID is 0. We are here because the
@@ -1438,6 +1520,7 @@ static int wake_futex_pi(u32 __user *uad
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
ret = -EFAULT;
+
} else if (curval != uval) {
/*
* If a unconditional UNLOCK_PI operation (user space did not
@@ -1971,7 +2054,7 @@ retry_private:
* rereading and handing potential crap to
* lookup_pi_state.
*/
- ret = lookup_pi_state(ret, hb2, &key2,
+ ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
&pi_state, &exiting);
}
@@ -2249,7 +2332,6 @@ static int __fixup_pi_state_owner(u32 __
int err = 0;
oldowner = pi_state->owner;
-
/*
* We are here because either:
*
@@ -2268,11 +2350,10 @@ static int __fixup_pi_state_owner(u32 __
* because we can fault here. Imagine swapped out pages or a fork
* that marked all the anonymous memory readonly for cow.
*
- * Modifying pi_state _before_ the user space value would
- * leave the pi_state in an inconsistent state when we fault
- * here, because we need to drop the hash bucket lock to
- * handle the fault. This might be observed in the PID check
- * in lookup_pi_state.
+ * Modifying pi_state _before_ the user space value would leave the
+ * pi_state in an inconsistent state when we fault here, because we
+ * need to drop the locks to handle the fault. This might be observed
+ * in the PID check in lookup_pi_state.
*/
retry:
if (!argowner) {
@@ -2333,21 +2414,26 @@ retry:
return argowner == current;
/*
- * To handle the page fault we need to drop the hash bucket
- * lock here. That gives the other task (either the highest priority
- * waiter itself or the task which stole the rtmutex) the
- * chance to try the fixup of the pi_state. So once we are
- * back from handling the fault we need to check the pi_state
- * after reacquiring the hash bucket lock and before trying to
- * do another fixup. When the fixup has been done already we
- * simply return.
+ * To handle the page fault we need to drop the locks here. That gives
+ * the other task (either the highest priority waiter itself or the
+ * task which stole the rtmutex) the chance to try the fixup of the
+ * pi_state. So once we are back from handling the fault we need to
+ * check the pi_state after reacquiring the locks and before trying to
+ * do another fixup. When the fixup has been done already we simply
+ * return.
+ *
+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+ * drop hb->lock since the caller owns the hb -> futex_q relation.
+ * Dropping the pi_mutex->wait_lock requires the state revalidate.
*/
handle_fault:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(q->lock_ptr);
err = fault_in_user_writeable(uaddr);
spin_lock(q->lock_ptr);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/*
* Check if someone else fixed it for us:
next prev parent reply other threads:[~2021-03-15 13:53 UTC|newest]
Thread overview: 81+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-03-15 13:51 [PATCH 4.4 00/75] 4.4.262-rc1 review gregkh
2021-03-15 13:51 ` [PATCH 4.4 01/75] uapi: nfnetlink_cthelper.h: fix userspace compilation error gregkh
2021-03-15 13:51 ` [PATCH 4.4 02/75] ath9k: fix transmitting to stations in dynamic SMPS mode gregkh
2021-03-15 13:51 ` [PATCH 4.4 03/75] net: Fix gro aggregation for udp encaps with zero csum gregkh
2021-03-15 13:51 ` [PATCH 4.4 04/75] can: skb: can_skb_set_owner(): fix ref counting if socket was closed before setting skb ownership gregkh
2021-03-15 13:51 ` [PATCH 4.4 05/75] can: flexcan: assert FRZ bit in flexcan_chip_freeze() gregkh
2021-03-15 13:51 ` [PATCH 4.4 06/75] can: flexcan: enable RX FIFO after FRZ/HALT valid gregkh
2021-03-15 13:51 ` [PATCH 4.4 07/75] netfilter: x_tables: gpf inside xt_find_revision() gregkh
2021-03-15 13:51 ` [PATCH 4.4 08/75] cifs: return proper error code in statfs(2) gregkh
2021-03-15 13:51 ` [PATCH 4.4 09/75] floppy: fix lock_fdc() signal handling gregkh
2021-03-15 13:51 ` [PATCH 4.4 10/75] Revert "mm, slub: consider rest of partial list if acquire_slab() fails" gregkh
2021-03-15 13:51 ` gregkh [this message]
2021-03-15 13:51 ` [PATCH 4.4 12/75] futex: Cure exit race gregkh
2021-03-15 13:51 ` [PATCH 4.4 13/75] futex: fix dead code in attach_to_pi_owner() gregkh
2021-03-15 13:51 ` [PATCH 4.4 14/75] net/mlx4_en: update moderation when config reset gregkh
2021-03-15 13:51 ` [PATCH 4.4 15/75] net: lapbether: Remove netif_start_queue / netif_stop_queue gregkh
2021-03-15 13:51 ` [PATCH 4.4 16/75] net: davicom: Fix regulator not turned off on failed probe gregkh
2021-03-15 13:51 ` [PATCH 4.4 17/75] net: davicom: Fix regulator not turned off on driver removal gregkh
2021-03-15 13:51 ` [PATCH 4.4 18/75] media: usbtv: Fix deadlock on suspend gregkh
2021-03-15 13:51 ` [PATCH 4.4 19/75] mmc: mxs-mmc: Fix a resource leak in an error handling path in mxs_mmc_probe() gregkh
2021-03-15 13:51 ` [PATCH 4.4 20/75] mmc: mediatek: fix race condition between msdc_request_timeout and irq gregkh
2021-03-15 13:51 ` [PATCH 4.4 21/75] powerpc/perf: Record counter overflow always if SAMPLE_IP is unset gregkh
2021-03-15 13:51 ` [PATCH 4.4 22/75] PCI: xgene-msi: Fix race in installing chained irq handler gregkh
2021-03-15 13:51 ` [PATCH 4.4 23/75] s390/smp: __smp_rescan_cpus() - move cpumask away from stack gregkh
2021-03-15 13:51 ` [PATCH 4.4 24/75] scsi: libiscsi: Fix iscsi_prep_scsi_cmd_pdu() error handling gregkh
2021-03-15 13:51 ` [PATCH 4.4 25/75] ALSA: hda/hdmi: Cancel pending works before suspend gregkh
2021-03-15 13:51 ` [PATCH 4.4 26/75] ALSA: hda: Avoid spurious unsol event handling during S3/S4 gregkh
2021-03-15 13:51 ` [PATCH 4.4 27/75] ALSA: usb-audio: Fix "cannot get freq eq" errors on Dell AE515 sound bar gregkh
2021-03-15 13:51 ` [PATCH 4.4 28/75] s390/dasd: fix hanging DASD driver unbind gregkh
2021-03-15 13:51 ` [PATCH 4.4 29/75] mmc: core: Fix partition switch time for eMMC gregkh
2021-03-15 13:51 ` [PATCH 4.4 30/75] scripts/recordmcount.{c,pl}: support -ffunction-sections .text.* section names gregkh
2021-03-15 13:51 ` [PATCH 4.4 31/75] libertas: fix a potential NULL pointer dereference gregkh
2021-03-15 13:51 ` [PATCH 4.4 32/75] Goodix Fingerprint device is not a modem gregkh
2021-03-15 13:51 ` [PATCH 4.4 33/75] usb: gadget: f_uac2: always increase endpoint max_packet_size by one audio slot gregkh
2021-03-15 13:51 ` [PATCH 4.4 34/75] usb: renesas_usbhs: Clear PIPECFG for re-enabling pipe with other EPNUM gregkh
2021-03-15 13:51 ` [PATCH 4.4 35/75] xhci: Improve detection of device initiated wake signal gregkh
2021-03-15 13:51 ` [PATCH 4.4 36/75] USB: serial: io_edgeport: fix memory leak in edge_startup gregkh
2021-03-15 13:51 ` [PATCH 4.4 37/75] USB: serial: ch341: add new Product ID gregkh
2021-03-15 13:51 ` [PATCH 4.4 38/75] USB: serial: cp210x: add ID for Acuity Brands nLight Air Adapter gregkh
2021-03-15 13:51 ` [PATCH 4.4 39/75] USB: serial: cp210x: add some more GE USB IDs gregkh
2021-03-15 13:51 ` [PATCH 4.4 40/75] usbip: fix stub_dev to check for stream socket gregkh
2021-03-15 13:51 ` [PATCH 4.4 41/75] usbip: fix vhci_hcd " gregkh
2021-03-15 13:51 ` [PATCH 4.4 42/75] usbip: fix stub_dev usbip_sockfd_store() races leading to gpf gregkh
2021-03-15 13:51 ` [PATCH 4.4 43/75] staging: rtl8192u: fix ->ssid overflow in r8192_wx_set_scan() gregkh
2021-03-15 13:51 ` [PATCH 4.4 44/75] staging: rtl8188eu: prevent ->ssid overflow in rtw_wx_set_scan() gregkh
2021-03-15 13:51 ` [PATCH 4.4 45/75] staging: rtl8712: unterminated string leads to read overflow gregkh
2021-03-15 13:52 ` [PATCH 4.4 46/75] staging: rtl8188eu: fix potential memory corruption in rtw_check_beacon_data() gregkh
2021-03-15 13:52 ` [PATCH 4.4 47/75] staging: rtl8712: Fix possible buffer overflow in r8712_sitesurvey_cmd gregkh
2021-03-15 13:52 ` [PATCH 4.4 48/75] staging: rtl8192e: Fix possible buffer overflow in _rtl92e_wx_set_scan gregkh
2021-03-15 13:52 ` [PATCH 4.4 49/75] staging: comedi: addi_apci_1032: Fix endian problem for COS sample gregkh
2021-03-15 13:52 ` [PATCH 4.4 50/75] staging: comedi: addi_apci_1500: Fix endian problem for command sample gregkh
2021-03-15 13:52 ` [PATCH 4.4 51/75] staging: comedi: adv_pci1710: Fix endian problem for AI command data gregkh
2021-03-15 13:52 ` [PATCH 4.4 52/75] staging: comedi: das6402: " gregkh
2021-03-15 13:52 ` [PATCH 4.4 53/75] staging: comedi: das800: " gregkh
2021-03-15 13:52 ` [PATCH 4.4 54/75] staging: comedi: dmm32at: " gregkh
2021-03-15 13:52 ` [PATCH 4.4 55/75] staging: comedi: me4000: " gregkh
2021-03-15 13:52 ` [PATCH 4.4 56/75] staging: comedi: pcl711: " gregkh
2021-03-15 13:52 ` [PATCH 4.4 57/75] staging: comedi: pcl818: " gregkh
2021-03-15 13:52 ` [PATCH 4.4 58/75] NFSv4.2: fix return value of _nfs4_get_security_label() gregkh
2021-03-15 13:52 ` [PATCH 4.4 59/75] block: rsxx: fix error return code of rsxx_pci_probe() gregkh
2021-03-15 13:52 ` [PATCH 4.4 60/75] prctl: fix PR_SET_MM_AUXV kernel stack leak gregkh
2021-03-15 13:52 ` [PATCH 4.4 61/75] alpha: add $(src)/ rather than $(obj)/ to make source file path gregkh
2021-03-15 13:52 ` [PATCH 4.4 62/75] alpha: merge build rules of division routines gregkh
2021-03-15 13:52 ` [PATCH 4.4 63/75] alpha: make short build log available for " gregkh
2021-03-15 13:52 ` [PATCH 4.4 64/75] alpha: Package string routines together gregkh
2021-03-15 13:52 ` [PATCH 4.4 65/75] alpha: move exports to actual definitions gregkh
2021-03-15 13:52 ` [PATCH 4.4 66/75] alpha: get rid of tail-zeroing in __copy_user() gregkh
2021-03-15 13:52 ` [PATCH 4.4 67/75] alpha: switch __copy_user() and __do_clean_user() to normal calling conventions gregkh
2021-03-15 13:52 ` [PATCH 4.4 68/75] powerpc/64s: Fix instruction encoding for lis in ppc_function_entry() gregkh
2021-03-15 13:52 ` [PATCH 4.4 69/75] media: hdpvr: Fix an error handling path in hdpvr_probe() gregkh
2021-03-15 13:52 ` [PATCH 4.4 70/75] KVM: arm64: Fix exclusive limit for IPA size gregkh
2021-03-15 13:52 ` [PATCH 4.4 71/75] iio: imu: adis16400: release allocated memory on failure gregkh
2021-03-15 13:52 ` [PATCH 4.4 72/75] iio: imu: adis16400: fix memory leak gregkh
2021-03-15 13:52 ` [PATCH 4.4 73/75] xen/events: reset affinity of 2-level event when tearing it down gregkh
2021-03-15 13:52 ` [PATCH 4.4 74/75] xen/events: dont unmask an event channel when an eoi is pending gregkh
2021-03-15 13:52 ` [PATCH 4.4 75/75] xen/events: avoid handling the same event on two cpus at the same time gregkh
2021-03-15 17:07 ` [PATCH 4.4 00/75] 4.4.262-rc1 review Jon Hunter
2021-03-15 21:05 ` Pavel Machek
2021-03-15 21:29 ` Guenter Roeck
2021-03-15 22:57 ` Jason Self
2021-03-16 12:07 ` Naresh Kamboju
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210315135208.632919306@linuxfoundation.org \
--to=gregkh@linuxfoundation.org \
--cc=bigeasy@linutronix.de \
--cc=bristot@redhat.com \
--cc=dvhart@infradead.org \
--cc=jdesfossez@efficios.com \
--cc=juri.lelli@arm.com \
--cc=lee.jones@linaro.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mathieu.desnoyers@efficios.com \
--cc=peterz@infradead.org \
--cc=rostedt@goodmis.org \
--cc=stable@vger.kernel.org \
--cc=tglx@linutronix.de \
--cc=xlpang@redhat.com \
--cc=zhengyejian1@huawei.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.