FAILED: patch "[PATCH] cxl/pmem: Fix module reload vs workqueue state" failed to apply to 5.15-stable tree

All of lore.kernel.org
 help / color / mirror / Atom feed

From: <gregkh@linuxfoundation.org>
To: dan.j.williams@intel.com, stable@vger.kernel.org,
	vishal.l.verma@intel.com
Cc: <stable@vger.kernel.org>
Subject: FAILED: patch "[PATCH] cxl/pmem: Fix module reload vs workqueue state" failed to apply to 5.15-stable tree
Date: Sun, 23 Jan 2022 15:25:33 +0100	[thread overview]
Message-ID: <164294793385141@kroah.com> (raw)


The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.

thanks,

greg k-h

------------------ original commit in Linus's tree ------------------

From 53989fad1286e652ea3655ae3367ba698da8d2ff Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 11 Nov 2021 10:19:05 -0800
Subject: [PATCH] cxl/pmem: Fix module reload vs workqueue state

A test of the form:

    while true; do modprobe -r cxl_pmem; modprobe cxl_pmem; done

May lead to a crash signature of the form:

    BUG: unable to handle page fault for address: ffffffffc0660030
    #PF: supervisor instruction fetch in kernel mode
    #PF: error_code(0x0010) - not-present page
    [..]
    Workqueue: cxl_pmem 0xffffffffc0660030
    RIP: 0010:0xffffffffc0660030
    Code: Unable to access opcode bytes at RIP 0xffffffffc0660006.
    [..]
    Call Trace:
     ? process_one_work+0x4ec/0x9c0
     ? pwq_dec_nr_in_flight+0x100/0x100
     ? rwlock_bug.part.0+0x60/0x60
     ? worker_thread+0x2eb/0x700

In that report the 0xffffffffc0660030 address corresponds to the former
function address of cxl_nvb_update_state() from a previous load of the
module, not the current address. Fix that by arranging for ->state_work
in the 'struct cxl_nvdimm_bridge' object to be reinitialized on cxl_pmem
module reload.

Details:

Recall that CXL subsystem wants to link a CXL memory expander device to
an NVDIMM sub-hierarchy when both a persistent memory range has been
registered by the CXL platform driver (cxl_acpi) *and* when that CXL
memory expander has published persistent memory capacity (Get Partition
Info). To this end the cxl_nvdimm_bridge driver arranges to rescan the
CXL bus when either of those conditions change. The helper
bus_rescan_devices() can not be called underneath the device_lock() for
any device on that bus, so the cxl_nvdimm_bridge driver uses a workqueue
for the rescan.

Typically a driver allocates driver data to hold a 'struct work_struct'
for a driven device, but for a workqueue that may run after ->remove()
returns, driver data will have been freed. The 'struct
cxl_nvdimm_bridge' object holds the state and work_struct directly.
Unfortunately it was only arranging for that infrastructure to be
initialized once per device creation rather than the necessary once per
workqueue (cxl_pmem_wq) creation.

Introduce is_cxl_nvdimm_bridge() and cxl_nvdimm_bridge_reset() in
support of invalidating stale references to a recently destroyed
cxl_pmem_wq.

Cc: <stable@vger.kernel.org>
Fixes: 8fdcb1704f61 ("cxl/pmem: Add initial infrastructure for pmem support")
Reported-by: Vishal Verma <vishal.l.verma@intel.com>
Tested-by: Vishal Verma <vishal.l.verma@intel.com>
Link: https://lore.kernel.org/r/163665474585.3505991.8397182770066720755.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c
index 2e402f0b2a10..b5fca97b0a07 100644
--- a/drivers/cxl/core/pmem.c
+++ b/drivers/cxl/core/pmem.c
@@ -51,10 +51,16 @@ struct cxl_nvdimm_bridge *to_cxl_nvdimm_bridge(struct device *dev)
 }
 EXPORT_SYMBOL_NS_GPL(to_cxl_nvdimm_bridge, CXL);
 
-__mock int match_nvdimm_bridge(struct device *dev, const void *data)
+bool is_cxl_nvdimm_bridge(struct device *dev)
 {
 	return dev->type == &cxl_nvdimm_bridge_type;
 }
+EXPORT_SYMBOL_NS_GPL(is_cxl_nvdimm_bridge, CXL);
+
+__mock int match_nvdimm_bridge(struct device *dev, const void *data)
+{
+	return is_cxl_nvdimm_bridge(dev);
+}
 
 struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd)
 {
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 3af704e9b448..ab4596f0b751 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -196,6 +196,13 @@ struct cxl_decoder {
 };
 
 
+/**
+ * enum cxl_nvdimm_brige_state - state machine for managing bus rescans
+ * @CXL_NVB_NEW: Set at bridge create and after cxl_pmem_wq is destroyed
+ * @CXL_NVB_DEAD: Set at brige unregistration to preclude async probing
+ * @CXL_NVB_ONLINE: Target state after successful ->probe()
+ * @CXL_NVB_OFFLINE: Target state after ->remove() or failed ->probe()
+ */
 enum cxl_nvdimm_brige_state {
 	CXL_NVB_NEW,
 	CXL_NVB_DEAD,
@@ -308,6 +315,7 @@ struct cxl_nvdimm_bridge *devm_cxl_add_nvdimm_bridge(struct device *host,
 						     struct cxl_port *port);
 struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev);
 bool is_cxl_nvdimm(struct device *dev);
+bool is_cxl_nvdimm_bridge(struct device *dev);
 int devm_cxl_add_nvdimm(struct device *host, struct cxl_memdev *cxlmd);
 struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd);
 
diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c
index 17e82ae90456..b65a272a2d6d 100644
--- a/drivers/cxl/pmem.c
+++ b/drivers/cxl/pmem.c
@@ -315,6 +315,31 @@ static struct cxl_driver cxl_nvdimm_bridge_driver = {
 	.id = CXL_DEVICE_NVDIMM_BRIDGE,
 };
 
+/*
+ * Return all bridges to the CXL_NVB_NEW state to invalidate any
+ * ->state_work referring to the now destroyed cxl_pmem_wq.
+ */
+static int cxl_nvdimm_bridge_reset(struct device *dev, void *data)
+{
+	struct cxl_nvdimm_bridge *cxl_nvb;
+
+	if (!is_cxl_nvdimm_bridge(dev))
+		return 0;
+
+	cxl_nvb = to_cxl_nvdimm_bridge(dev);
+	device_lock(dev);
+	cxl_nvb->state = CXL_NVB_NEW;
+	device_unlock(dev);
+
+	return 0;
+}
+
+static void destroy_cxl_pmem_wq(void)
+{
+	destroy_workqueue(cxl_pmem_wq);
+	bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_nvdimm_bridge_reset);
+}
+
 static __init int cxl_pmem_init(void)
 {
 	int rc;
@@ -340,7 +365,7 @@ static __init int cxl_pmem_init(void)
 err_nvdimm:
 	cxl_driver_unregister(&cxl_nvdimm_bridge_driver);
 err_bridge:
-	destroy_workqueue(cxl_pmem_wq);
+	destroy_cxl_pmem_wq();
 	return rc;
 }
 
@@ -348,7 +373,7 @@ static __exit void cxl_pmem_exit(void)
 {
 	cxl_driver_unregister(&cxl_nvdimm_driver);
 	cxl_driver_unregister(&cxl_nvdimm_bridge_driver);
-	destroy_workqueue(cxl_pmem_wq);
+	destroy_cxl_pmem_wq();
 }
 
 MODULE_LICENSE("GPL v2");

                 reply	other threads:[~2022-01-23 14:25 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:2e402f0b2a1 dfblob:b5fca97b0a0 dfblob:3af704e9b44
dfblob:ab4596f0b75 dfblob:17e82ae9045 dfblob:b65a272a2d6 )
 OR (
bs:"cxl/pmem: Fix module reload vs workqueue state" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=164294793385141@kroah.com \
    --to=gregkh@linuxfoundation.org \
    --cc=dan.j.williams@intel.com \
    --cc=stable@vger.kernel.org \
    --cc=vishal.l.verma@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.