linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next,v7] net: mana: Add handler for hardware servicing events
@ 2025-06-10 18:42 Haiyang Zhang
  2025-06-13  1:21 ` Jakub Kicinski
  0 siblings, 1 reply; 3+ messages in thread
From: Haiyang Zhang @ 2025-06-10 18:42 UTC (permalink / raw)
  To: linux-hyperv, netdev
  Cc: haiyangz, decui, stephen, kys, paulros, olaf, vkuznets, davem,
	wei.liu, edumazet, kuba, pabeni, leon, longli, ssengar,
	linux-rdma, daniel, john.fastabend, bpf, ast, hawk, tglx,
	shradhagupta, andrew+netdev, kotaranov, horms, linux-kernel

From: Haiyang Zhang <haiyangz@microsoft.com>

To cooperate with hardware servicing events, upon receiving the special
EQE notification from the HW channel, remove the devices on this bus.
Then, after a waiting period based on the device specs, rescan the parent
bus to recover the devices.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
Reviewed-by: Simon Horman <horms@kernel.org>
---
v7:
rebased.

v6:
Not acquiring module refcnt as suggested by Paolo Abeni.

v5:
Get refcnt of the pdev struct to avoid removal before running the work
as suggested by Jakub Kicinski.

v4:
Renamed EQE type 135 to GDMA_EQE_HWC_RESET_REQUEST, since there can
be multiple cases of this reset request.

v3:
Updated for checkpatch warnings as suggested by Simon Horman.

v2:
Added dev_dbg for service type as suggested by Shradha Gupta.
Added driver cap bit.
---
 .../net/ethernet/microsoft/mana/gdma_main.c   | 67 +++++++++++++++++++
 include/net/mana/gdma.h                       | 10 ++-
 2 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 3504507477c6..c75184519fe4 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -352,11 +352,58 @@ void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit)
 }
 EXPORT_SYMBOL_NS(mana_gd_ring_cq, "NET_MANA");
 
+#define MANA_SERVICE_PERIOD 10 /* seconds; scaled by 1000 for msleep() */
+
+struct mana_serv_work { /* context for one run of mana_serv_func() */
+	struct work_struct serv_work; /* embedded work item (container_of anchor) */
+	struct pci_dev *pdev; /* ref taken by the scheduler, put by the worker */
+};
+
+static void mana_serv_func(struct work_struct *w) /* remove bus device, wait, rescan parent */
+{
+	struct mana_serv_work *mns_wk;
+	struct pci_bus *bus, *parent;
+	struct pci_dev *pdev;
+
+	mns_wk = container_of(w, struct mana_serv_work, serv_work);
+	pdev = mns_wk->pdev;
+
+	pci_lock_rescan_remove(); /* held across remove, sleep and rescan */
+
+	if (!pdev)
+		goto out;
+
+	bus = pdev->bus;
+	if (!bus) {
+		dev_err(&pdev->dev, "MANA service: no bus\n");
+		goto out; /* NOTE(review): gc->in_service is not cleared in this function */
+	}
+
+	parent = bus->parent;
+	if (!parent) {
+		dev_err(&pdev->dev, "MANA service: no parent bus\n");
+		goto out;
+	}
+
+	pci_stop_and_remove_bus_device(bus->self); /* removes bus->self, not pdev directly */
+
+	msleep(MANA_SERVICE_PERIOD * 1000); /* NOTE(review): module not pinned; rmmod here was reported to hit freed memory */
+
+	pci_rescan_bus(parent); /* rescan from the parent bus saved before the removal */
+
+out:
+	pci_unlock_rescan_remove();
+
+	pci_dev_put(pdev); /* drops the pci_dev_get() reference taken before scheduling */
+	kfree(mns_wk); /* allocated by the scheduler; freed here by the worker */
+}
+
 static void mana_gd_process_eqe(struct gdma_queue *eq)
 {
 	u32 head = eq->head % (eq->queue_size / GDMA_EQE_SIZE);
 	struct gdma_context *gc = eq->gdma_dev->gdma_context;
 	struct gdma_eqe *eq_eqe_ptr = eq->queue_mem_ptr;
+	struct mana_serv_work *mns_wk;
 	union gdma_eqe_info eqe_info;
 	enum gdma_eqe_type type;
 	struct gdma_event event;
@@ -401,6 +448,26 @@ static void mana_gd_process_eqe(struct gdma_queue *eq)
 		eq->eq.callback(eq->eq.context, eq, &event);
 		break;
 
+	case GDMA_EQE_HWC_FPGA_RECONFIG:
+		dev_info(gc->dev, "Recv MANA service type:%d\n", type);
+
+		if (gc->in_service) {
+			dev_info(gc->dev, "Already in service\n");
+			break;
+		}
+
+		mns_wk = kzalloc(sizeof(*mns_wk), GFP_ATOMIC);
+		if (!mns_wk)
+			break;
+
+		dev_info(gc->dev, "Start MANA service type:%d\n", type);
+		gc->in_service = true;
+		mns_wk->pdev = to_pci_dev(gc->dev);
+		pci_dev_get(mns_wk->pdev);
+		INIT_WORK(&mns_wk->serv_work, mana_serv_func);
+		schedule_work(&mns_wk->serv_work);
+		break;
+
 	default:
 		break;
 	}
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 3ce56a816425..bfae59202669 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -58,7 +58,7 @@ enum gdma_eqe_type {
 	GDMA_EQE_HWC_INIT_EQ_ID_DB	= 129,
 	GDMA_EQE_HWC_INIT_DATA		= 130,
 	GDMA_EQE_HWC_INIT_DONE		= 131,
-	GDMA_EQE_HWC_SOC_RECONFIG	= 132,
+	GDMA_EQE_HWC_FPGA_RECONFIG	= 132,
 	GDMA_EQE_HWC_SOC_RECONFIG_DATA	= 133,
 	GDMA_EQE_HWC_SOC_SERVICE	= 134,
 	GDMA_EQE_RNIC_QP_FATAL		= 176,
@@ -403,6 +403,8 @@ struct gdma_context {
 	u32			test_event_eq_id;
 
 	bool			is_pf;
+	bool			in_service;
+
 	phys_addr_t		bar0_pa;
 	void __iomem		*bar0_va;
 	void __iomem		*shm_base;
@@ -578,12 +580,16 @@ enum {
 /* Driver can handle holes (zeros) in the device list */
 #define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11)
 
+/* Driver can self reset on FPGA Reconfig EQE notification */
+#define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17)
+
 #define GDMA_DRV_CAP_FLAGS1 \
 	(GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \
 	 GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \
 	 GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \
 	 GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \
-	 GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP)
+	 GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \
+	 GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE)
 
 #define GDMA_DRV_CAP_FLAGS2 0
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH net-next,v7] net: mana: Add handler for hardware servicing events
  2025-06-10 18:42 [PATCH net-next,v7] net: mana: Add handler for hardware servicing events Haiyang Zhang
@ 2025-06-13  1:21 ` Jakub Kicinski
  2025-06-13 17:06   ` Haiyang Zhang
  0 siblings, 1 reply; 3+ messages in thread
From: Jakub Kicinski @ 2025-06-13  1:21 UTC (permalink / raw)
  To: Haiyang Zhang
  Cc: linux-hyperv, netdev, haiyangz, decui, stephen, kys, paulros,
	olaf, vkuznets, davem, wei.liu, edumazet, pabeni, leon, longli,
	ssengar, linux-rdma, daniel, john.fastabend, bpf, ast, hawk, tglx,
	shradhagupta, andrew+netdev, kotaranov, horms, linux-kernel

On Tue, 10 Jun 2025 11:42:22 -0700 Haiyang Zhang wrote:
> v6:
> Not acquiring module refcnt as suggested by Paolo Abeni.

TBH I'm not 100% sure this is correct.
If the service worker operations end up unbinding the driver from
the device, holding the device ref may not prevent the module from
being unloaded.

Could you try to trigger that condition? Make that msleep() in the work
even longer and try to remove the module while the work is sleeping
there?

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH net-next,v7] net: mana: Add handler for hardware servicing events
  2025-06-13  1:21 ` Jakub Kicinski
@ 2025-06-13 17:06   ` Haiyang Zhang
  0 siblings, 0 replies; 3+ messages in thread
From: Haiyang Zhang @ 2025-06-13 17:06 UTC (permalink / raw)
  To: Jakub Kicinski, Haiyang Zhang
  Cc: linux-hyperv@vger.kernel.org, netdev@vger.kernel.org, Dexuan Cui,
	stephen@networkplumber.org, KY Srinivasan, Paul Rosswurm,
	olaf@aepfle.de, vkuznets@redhat.com, davem@davemloft.net,
	wei.liu@kernel.org, edumazet@google.com, pabeni@redhat.com,
	leon@kernel.org, Long Li, ssengar@linux.microsoft.com,
	linux-rdma@vger.kernel.org, daniel@iogearbox.net,
	john.fastabend@gmail.com, bpf@vger.kernel.org, ast@kernel.org,
	hawk@kernel.org, tglx@linutronix.de,
	shradhagupta@linux.microsoft.com, andrew+netdev@lunn.ch,
	Konstantin Taranov, horms@kernel.org,
	linux-kernel@vger.kernel.org



> -----Original Message-----
> From: Jakub Kicinski <kuba@kernel.org>
> Sent: Thursday, June 12, 2025 9:22 PM
> To: Haiyang Zhang <haiyangz@linux.microsoft.com>
> Cc: linux-hyperv@vger.kernel.org; netdev@vger.kernel.org; Haiyang Zhang
> <haiyangz@microsoft.com>; Dexuan Cui <decui@microsoft.com>;
> stephen@networkplumber.org; KY Srinivasan <kys@microsoft.com>; Paul
> Rosswurm <paulros@microsoft.com>; olaf@aepfle.de; vkuznets@redhat.com;
> davem@davemloft.net; wei.liu@kernel.org; edumazet@google.com;
> pabeni@redhat.com; leon@kernel.org; Long Li <longli@microsoft.com>;
> ssengar@linux.microsoft.com; linux-rdma@vger.kernel.org;
> daniel@iogearbox.net; john.fastabend@gmail.com; bpf@vger.kernel.org;
> ast@kernel.org; hawk@kernel.org; tglx@linutronix.de;
> shradhagupta@linux.microsoft.com; andrew+netdev@lunn.ch; Konstantin
> Taranov <kotaranov@microsoft.com>; horms@kernel.org; linux-
> kernel@vger.kernel.org
> Subject: [EXTERNAL] Re: [PATCH net-next,v7] net: mana: Add handler for
> hardware servicing events
> 
> On Tue, 10 Jun 2025 11:42:22 -0700 Haiyang Zhang wrote:
> > v6:
> > Not acquiring module refcnt as suggested by Paolo Abeni.
> 
> TBH I'm not 100% sure this is correct.
> If the service worker operations end up unbinding the driver from
> the device, holding the device ref may not prevent the module from
> being unloaded.
> 
> Could you try to trigger that condition? Make that msleep() in the work
> even longer and try to remove the module while the work is sleeping
> there?

Thanks for your suggestion! I tested and found that I can rmmod mana
during the sleep, which causes accesses to freed memory. Getting the extra
module refcnt fixed this (it prevented rmmod during the sleep). So, I added
back the module refcnt holding, and submitted v8.

Thanks,
- Haiyang


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2025-06-13 17:06 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-06-10 18:42 [PATCH net-next,v7] net: mana: Add handler for hardware servicing events Haiyang Zhang
2025-06-13  1:21 ` Jakub Kicinski
2025-06-13 17:06   ` Haiyang Zhang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).