Netdev List
 help / color / mirror / Atom feed
* [PATCH v2 net 2/2] net: enetc: fix NTMP DMA use-after-free issue
From: Wei Fang @ 2026-04-15  6:08 UTC (permalink / raw)
  To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
	davem, edumazet, kuba, pabeni, chleroy
  Cc: netdev, linux-kernel, imx, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20260415060833.2303846-1-wei.fang@nxp.com>

The AI-generated review reported a potential DMA use-after-free issue
[1]. If netc_xmit_ntmp_cmd() times out and returns an error, the pending
command is not explicitly aborted, while ntmp_free_data_mem()
unconditionally frees the DMA buffer. If the buffer has already been
reallocated elsewhere, this may lead to silent memory corruption, because
the hardware eventually processes the pending command and performs a DMA
write of the response to the physical address of the freed buffer.

To resolve this issue, this patch does the following modifications:

1. Convert cbdr->ring_lock from a spinlock to a mutex

The lock was originally a spinlock in case NTMP operations might be
invoked from atomic context. After downstream support for all NTMP
tables, no such usage has materialized. A mutex lock is now required
because the driver now needs to reclaim used BDs and release associated
DMA memory within the lock's context, while dma_free_coherent() might
sleep.

2. Introduce software command BD (struct netc_swcbd)

The hardware write-back overwrites the addr and len fields of the BD,
so the driver cannot rely on the hardware BD to free the associated DMA
memory. The driver now maintains a software shadow BD storing the DMA
buffer pointer, DMA address, and size. And netc_xmit_ntmp_cmd() only
reclaims older BDs when the number of used BDs reaches
NETC_CBDR_CLEAN_WORK (16). The software BD enables correct DMA memory
release. With this, struct ntmp_dma_buf and ntmp_free_data_mem() are no
longer needed and are removed.

3. Require callers to hold ring_lock across netc_xmit_ntmp_cmd()

netc_xmit_ntmp_cmd() releases the ring_lock before the caller finishes
consuming the response. At this point, if a concurrent thread submits
a new command, it may trigger ntmp_clean_cbdr() and free the DMA buffer
while it is still in use. Move ring_lock ownership to the caller to
ensure the response buffer cannot be reclaimed prematurely. So the
helpers ntmp_select_and_lock_cbdr() and ntmp_unlock_cbdr() are added.

These changes eliminate the DMA use-after-free condition and ensure safe
and consistent BD reclamation and DMA buffer lifecycle management.

Fixes: 4701073c3deb ("net: enetc: add initial netc-lib driver to support NTMP")
Link: https://lore.kernel.org/netdev/20260403011729.1795413-1-kuba@kernel.org/ # [1]
Signed-off-by: Wei Fang <wei.fang@nxp.com>
---
 drivers/net/ethernet/freescale/enetc/ntmp.c   | 214 ++++++++++--------
 .../ethernet/freescale/enetc/ntmp_private.h   |   8 +-
 include/linux/fsl/ntmp.h                      |   9 +-
 3 files changed, 134 insertions(+), 97 deletions(-)

diff --git a/drivers/net/ethernet/freescale/enetc/ntmp.c b/drivers/net/ethernet/freescale/enetc/ntmp.c
index b188eb2d40c0..70bbc5d2d5d4 100644
--- a/drivers/net/ethernet/freescale/enetc/ntmp.c
+++ b/drivers/net/ethernet/freescale/enetc/ntmp.c
@@ -7,6 +7,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/fsl/netc_global.h>
 #include <linux/iopoll.h>
+#include <linux/vmalloc.h>
 
 #include "ntmp_private.h"
 
@@ -42,6 +43,12 @@ int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev,
 	if (!cbdr->addr_base)
 		return -ENOMEM;
 
+	cbdr->swcbd = vcalloc(cbd_num, sizeof(struct netc_swcbd));
+	if (!cbdr->swcbd) {
+		dma_free_coherent(dev, size, cbdr->addr_base, cbdr->dma_base);
+		return -ENOMEM;
+	}
+
 	cbdr->dma_size = size;
 	cbdr->bd_num = cbd_num;
 	cbdr->regs = *regs;
@@ -52,7 +59,7 @@ int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev,
 	cbdr->addr_base_align = PTR_ALIGN(cbdr->addr_base,
 					  NTMP_BASE_ADDR_ALIGN);
 
-	spin_lock_init(&cbdr->ring_lock);
+	mutex_init(&cbdr->ring_lock);
 
 	cbdr->next_to_use = netc_read(cbdr->regs.pir);
 	cbdr->next_to_clean = netc_read(cbdr->regs.cir) & NETC_CBDRCIR_INDEX;
@@ -71,10 +78,24 @@ int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev,
 }
 EXPORT_SYMBOL_GPL(ntmp_init_cbdr);
 
+static void ntmp_free_data_mem(struct device *dev, struct netc_swcbd *swcbd)
+{
+	if (unlikely(!swcbd->buf))
+		return;
+
+	dma_free_coherent(dev, swcbd->size + NTMP_DATA_ADDR_ALIGN,
+			  swcbd->buf, swcbd->dma);
+}
+
 void ntmp_free_cbdr(struct netc_cbdr *cbdr)
 {
 	/* Disable the Control BD Ring */
 	netc_write(cbdr->regs.mr, 0);
+
+	for (int i = 0; i < cbdr->bd_num; i++)
+		ntmp_free_data_mem(cbdr->dev, &cbdr->swcbd[i]);
+
+	vfree(cbdr->swcbd);
 	dma_free_coherent(cbdr->dev, cbdr->dma_size, cbdr->addr_base,
 			  cbdr->dma_base);
 	memset(cbdr, 0, sizeof(*cbdr));
@@ -94,40 +115,59 @@ static union netc_cbd *ntmp_get_cbd(struct netc_cbdr *cbdr, int index)
 
 static void ntmp_clean_cbdr(struct netc_cbdr *cbdr)
 {
-	union netc_cbd *cbd;
-	int i;
+	int i = cbdr->next_to_clean;
 
-	i = cbdr->next_to_clean;
 	while ((netc_read(cbdr->regs.cir) & NETC_CBDRCIR_INDEX) != i) {
-		cbd = ntmp_get_cbd(cbdr, i);
+		union netc_cbd *cbd = ntmp_get_cbd(cbdr, i);
+		struct netc_swcbd *swcbd = &cbdr->swcbd[i];
+
+		ntmp_free_data_mem(cbdr->dev, swcbd);
+		memset(swcbd, 0, sizeof(*swcbd));
 		memset(cbd, 0, sizeof(*cbd));
 		i = (i + 1) % cbdr->bd_num;
 	}
 
+	dma_wmb();
 	cbdr->next_to_clean = i;
 }
 
-static int netc_xmit_ntmp_cmd(struct ntmp_user *user, union netc_cbd *cbd)
+static void ntmp_select_and_lock_cbdr(struct ntmp_user *user,
+				      struct netc_cbdr **cbdr)
+{
+	/* Currently only ENETC is supported, and it has only one command
+	 * BD ring.
+	 */
+	*cbdr = &user->ring[0];
+
+	mutex_lock(&(*cbdr)->ring_lock);
+}
+
+static void ntmp_unlock_cbdr(struct netc_cbdr *cbdr)
+{
+	mutex_unlock(&cbdr->ring_lock);
+}
+
+static int netc_xmit_ntmp_cmd(struct netc_cbdr *cbdr, union netc_cbd *cbd,
+			      struct netc_swcbd *swcbd)
 {
 	union netc_cbd *cur_cbd;
-	struct netc_cbdr *cbdr;
-	int i, err;
+	int i, err, used_bds;
 	u16 status;
 	u32 val;
 
-	/* Currently only i.MX95 ENETC is supported, and it only has one
-	 * command BD ring
-	 */
-	cbdr = &user->ring[0];
-
-	spin_lock_bh(&cbdr->ring_lock);
-
-	if (unlikely(!ntmp_get_free_cbd_num(cbdr)))
+	used_bds = cbdr->bd_num - ntmp_get_free_cbd_num(cbdr);
+	if (unlikely(used_bds >= NETC_CBDR_CLEAN_WORK)) {
 		ntmp_clean_cbdr(cbdr);
+		if (unlikely(!ntmp_get_free_cbd_num(cbdr))) {
+			ntmp_free_data_mem(cbdr->dev, swcbd);
+			return -EBUSY;
+		}
+	}
 
 	i = cbdr->next_to_use;
 	cur_cbd = ntmp_get_cbd(cbdr, i);
 	*cur_cbd = *cbd;
+	cbdr->swcbd[i] = *swcbd;
 	dma_wmb();
 
 	/* Update producer index of both software and hardware */
@@ -135,17 +175,16 @@ static int netc_xmit_ntmp_cmd(struct ntmp_user *user, union netc_cbd *cbd)
 	cbdr->next_to_use = i;
 	netc_write(cbdr->regs.pir, i);
 
-	err = read_poll_timeout_atomic(netc_read, val,
-				       (val & NETC_CBDRCIR_INDEX) == i,
-				       NETC_CBDR_DELAY_US, NETC_CBDR_TIMEOUT,
-				       true, cbdr->regs.cir);
+	err = read_poll_timeout(netc_read, val,
+				(val & NETC_CBDRCIR_INDEX) == i,
+				NETC_CBDR_DELAY_US, NETC_CBDR_TIMEOUT,
+				true, cbdr->regs.cir);
 	if (unlikely(err))
-		goto cbdr_unlock;
+		return err;
 
 	if (unlikely(val & NETC_CBDRCIR_SBE)) {
-		dev_err(user->dev, "Command BD system bus error\n");
-		err = -EIO;
-		goto cbdr_unlock;
+		dev_err(cbdr->dev, "Command BD system bus error\n");
+		return -EIO;
 	}
 
 	dma_rmb();
@@ -157,40 +196,29 @@ static int netc_xmit_ntmp_cmd(struct ntmp_user *user, union netc_cbd *cbd)
 	/* Check the writeback error status */
 	status = le16_to_cpu(cbd->resp_hdr.error_rr) & NTMP_RESP_ERROR;
 	if (unlikely(status)) {
-		err = -EIO;
-		dev_err(user->dev, "Command BD error: 0x%04x\n", status);
+		dev_err(cbdr->dev, "Command BD error: 0x%04x\n", status);
+		return -EIO;
 	}
 
-	ntmp_clean_cbdr(cbdr);
-	dma_wmb();
-
-cbdr_unlock:
-	spin_unlock_bh(&cbdr->ring_lock);
-
-	return err;
+	return 0;
 }
 
-static int ntmp_alloc_data_mem(struct ntmp_dma_buf *data, void **buf_align)
+static int ntmp_alloc_data_mem(struct device *dev, struct netc_swcbd *swcbd,
+			       void **buf_align)
 {
 	void *buf;
 
-	buf = dma_alloc_coherent(data->dev, data->size + NTMP_DATA_ADDR_ALIGN,
-				 &data->dma, GFP_KERNEL);
+	buf = dma_alloc_coherent(dev, swcbd->size + NTMP_DATA_ADDR_ALIGN,
+				 &swcbd->dma, GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
 
-	data->buf = buf;
+	swcbd->buf = buf;
 	*buf_align = PTR_ALIGN(buf, NTMP_DATA_ADDR_ALIGN);
 
 	return 0;
 }
 
-static void ntmp_free_data_mem(struct ntmp_dma_buf *data)
-{
-	dma_free_coherent(data->dev, data->size + NTMP_DATA_ADDR_ALIGN,
-			  data->buf, data->dma);
-}
-
 static void ntmp_fill_request_hdr(union netc_cbd *cbd, dma_addr_t dma,
 				  int len, int table_id, int cmd,
 				  int access_method)
@@ -241,37 +269,39 @@ static int ntmp_delete_entry_by_id(struct ntmp_user *user, int tbl_id,
 				   u8 tbl_ver, u32 entry_id, u32 req_len,
 				   u32 resp_len)
 {
-	struct ntmp_dma_buf data = {
-		.dev = user->dev,
+	struct netc_swcbd swcbd = {
 		.size = max(req_len, resp_len),
 	};
 	struct ntmp_req_by_eid *req;
+	struct netc_cbdr *cbdr;
 	union netc_cbd cbd;
 	int err;
 
-	err = ntmp_alloc_data_mem(&data, (void **)&req);
+	err = ntmp_alloc_data_mem(user->dev, &swcbd, (void **)&req);
 	if (err)
 		return err;
 
 	ntmp_fill_crd_eid(req, tbl_ver, 0, 0, entry_id);
-	ntmp_fill_request_hdr(&cbd, data.dma, NTMP_LEN(req_len, resp_len),
+	ntmp_fill_request_hdr(&cbd, swcbd.dma, NTMP_LEN(req_len, resp_len),
 			      tbl_id, NTMP_CMD_DELETE, NTMP_AM_ENTRY_ID);
 
-	err = netc_xmit_ntmp_cmd(user, &cbd);
+	ntmp_select_and_lock_cbdr(user, &cbdr);
+	err = netc_xmit_ntmp_cmd(cbdr, &cbd, &swcbd);
 	if (err)
 		dev_err(user->dev,
 			"Failed to delete entry 0x%x of %s, err: %pe",
 			entry_id, ntmp_table_name(tbl_id), ERR_PTR(err));
-
-	ntmp_free_data_mem(&data);
+	ntmp_unlock_cbdr(cbdr);
 
 	return err;
 }
 
-static int ntmp_query_entry_by_id(struct ntmp_user *user, int tbl_id,
-				  u32 len, struct ntmp_req_by_eid *req,
-				  dma_addr_t dma, bool compare_eid)
+static int ntmp_query_entry_by_id(struct netc_cbdr *cbdr, int tbl_id,
+				  struct ntmp_req_by_eid *req,
+				  struct netc_swcbd *swcbd,
+				  bool compare_eid)
 {
+	u32 len = NTMP_LEN(sizeof(*req), swcbd->size);
 	struct ntmp_cmn_resp_query *resp;
 	int cmd = NTMP_CMD_QUERY;
 	union netc_cbd cbd;
@@ -283,10 +313,11 @@ static int ntmp_query_entry_by_id(struct ntmp_user *user, int tbl_id,
 		cmd = NTMP_CMD_QU;
 
 	/* Request header */
-	ntmp_fill_request_hdr(&cbd, dma, len, tbl_id, cmd, NTMP_AM_ENTRY_ID);
-	err = netc_xmit_ntmp_cmd(user, &cbd);
+	ntmp_fill_request_hdr(&cbd, swcbd->dma, len, tbl_id, cmd,
+			      NTMP_AM_ENTRY_ID);
+	err = netc_xmit_ntmp_cmd(cbdr, &cbd, swcbd);
 	if (err) {
-		dev_err(user->dev,
+		dev_err(cbdr->dev,
 			"Failed to query entry 0x%x of %s, err: %pe\n",
 			entry_id, ntmp_table_name(tbl_id), ERR_PTR(err));
 		return err;
@@ -300,7 +331,7 @@ static int ntmp_query_entry_by_id(struct ntmp_user *user, int tbl_id,
 
 	resp = (struct ntmp_cmn_resp_query *)req;
 	if (unlikely(le32_to_cpu(resp->entry_id) != entry_id)) {
-		dev_err(user->dev,
+		dev_err(cbdr->dev,
 			"%s: query EID 0x%x doesn't match response EID 0x%x\n",
 			ntmp_table_name(tbl_id), entry_id, le32_to_cpu(resp->entry_id));
 		return -EIO;
@@ -312,15 +343,15 @@ static int ntmp_query_entry_by_id(struct ntmp_user *user, int tbl_id,
 int ntmp_maft_add_entry(struct ntmp_user *user, u32 entry_id,
 			struct maft_entry_data *maft)
 {
-	struct ntmp_dma_buf data = {
-		.dev = user->dev,
+	struct netc_swcbd swcbd = {
 		.size = sizeof(struct maft_req_add),
 	};
 	struct maft_req_add *req;
+	struct netc_cbdr *cbdr;
 	union netc_cbd cbd;
 	int err;
 
-	err = ntmp_alloc_data_mem(&data, (void **)&req);
+	err = ntmp_alloc_data_mem(user->dev, &swcbd, (void **)&req);
 	if (err)
 		return err;
 
@@ -329,14 +360,15 @@ int ntmp_maft_add_entry(struct ntmp_user *user, u32 entry_id,
 	req->keye = maft->keye;
 	req->cfge = maft->cfge;
 
-	ntmp_fill_request_hdr(&cbd, data.dma, NTMP_LEN(data.size, 0),
+	ntmp_fill_request_hdr(&cbd, swcbd.dma, NTMP_LEN(swcbd.size, 0),
 			      NTMP_MAFT_ID, NTMP_CMD_ADD, NTMP_AM_ENTRY_ID);
-	err = netc_xmit_ntmp_cmd(user, &cbd);
+
+	ntmp_select_and_lock_cbdr(user, &cbdr);
+	err = netc_xmit_ntmp_cmd(cbdr, &cbd, &swcbd);
 	if (err)
 		dev_err(user->dev, "Failed to add MAFT entry 0x%x, err: %pe\n",
 			entry_id, ERR_PTR(err));
-
-	ntmp_free_data_mem(&data);
+	ntmp_unlock_cbdr(cbdr);
 
 	return err;
 }
@@ -345,31 +377,31 @@ EXPORT_SYMBOL_GPL(ntmp_maft_add_entry);
 int ntmp_maft_query_entry(struct ntmp_user *user, u32 entry_id,
 			  struct maft_entry_data *maft)
 {
-	struct ntmp_dma_buf data = {
-		.dev = user->dev,
+	struct netc_swcbd swcbd = {
 		.size = sizeof(struct maft_resp_query),
 	};
 	struct maft_resp_query *resp;
 	struct ntmp_req_by_eid *req;
+	struct netc_cbdr *cbdr;
 	int err;
 
-	err = ntmp_alloc_data_mem(&data, (void **)&req);
+	err = ntmp_alloc_data_mem(user->dev, &swcbd, (void **)&req);
 	if (err)
 		return err;
 
 	ntmp_fill_crd_eid(req, user->tbl.maft_ver, 0, 0, entry_id);
-	err = ntmp_query_entry_by_id(user, NTMP_MAFT_ID,
-				     NTMP_LEN(sizeof(*req), data.size),
-				     req, data.dma, true);
+
+	ntmp_select_and_lock_cbdr(user, &cbdr);
+	err = ntmp_query_entry_by_id(cbdr, NTMP_MAFT_ID, req, &swcbd, true);
 	if (err)
-		goto end;
+		goto unlock_cbdr;
 
 	resp = (struct maft_resp_query *)req;
 	maft->keye = resp->keye;
 	maft->cfge = resp->cfge;
 
-end:
-	ntmp_free_data_mem(&data);
+unlock_cbdr:
+	ntmp_unlock_cbdr(cbdr);
 
 	return err;
 }
@@ -385,8 +417,9 @@ EXPORT_SYMBOL_GPL(ntmp_maft_delete_entry);
 int ntmp_rsst_update_entry(struct ntmp_user *user, const u32 *table,
 			   int count)
 {
-	struct ntmp_dma_buf data = {.dev = user->dev};
 	struct rsst_req_update *req;
+	struct netc_swcbd swcbd;
+	struct netc_cbdr *cbdr;
 	union netc_cbd cbd;
 	int err, i;
 
@@ -394,8 +427,8 @@ int ntmp_rsst_update_entry(struct ntmp_user *user, const u32 *table,
 		/* HW only takes in a full 64 entry table */
 		return -EINVAL;
 
-	data.size = struct_size(req, groups, count);
-	err = ntmp_alloc_data_mem(&data, (void **)&req);
+	swcbd.size = struct_size(req, groups, count);
+	err = ntmp_alloc_data_mem(user->dev, &swcbd, (void **)&req);
 	if (err)
 		return err;
 
@@ -405,15 +438,15 @@ int ntmp_rsst_update_entry(struct ntmp_user *user, const u32 *table,
 	for (i = 0; i < count; i++)
 		req->groups[i] = (u8)(table[i]);
 
-	ntmp_fill_request_hdr(&cbd, data.dma, NTMP_LEN(data.size, 0),
+	ntmp_fill_request_hdr(&cbd, swcbd.dma, NTMP_LEN(swcbd.size, 0),
 			      NTMP_RSST_ID, NTMP_CMD_UPDATE, NTMP_AM_ENTRY_ID);
 
-	err = netc_xmit_ntmp_cmd(user, &cbd);
+	ntmp_select_and_lock_cbdr(user, &cbdr);
+	err = netc_xmit_ntmp_cmd(cbdr, &cbd, &swcbd);
 	if (err)
 		dev_err(user->dev, "Failed to update RSST entry, err: %pe\n",
 			ERR_PTR(err));
-
-	ntmp_free_data_mem(&data);
+	ntmp_unlock_cbdr(cbdr);
 
 	return err;
 }
@@ -421,8 +454,9 @@ EXPORT_SYMBOL_GPL(ntmp_rsst_update_entry);
 
 int ntmp_rsst_query_entry(struct ntmp_user *user, u32 *table, int count)
 {
-	struct ntmp_dma_buf data = {.dev = user->dev};
 	struct ntmp_req_by_eid *req;
+	struct netc_swcbd swcbd;
+	struct netc_cbdr *cbdr;
 	union netc_cbd cbd;
 	int err, i;
 	u8 *group;
@@ -431,21 +465,23 @@ int ntmp_rsst_query_entry(struct ntmp_user *user, u32 *table, int count)
 		/* HW only takes in a full 64 entry table */
 		return -EINVAL;
 
-	data.size = NTMP_ENTRY_ID_SIZE + RSST_STSE_DATA_SIZE(count) +
-		    RSST_CFGE_DATA_SIZE(count);
-	err = ntmp_alloc_data_mem(&data, (void **)&req);
+	swcbd.size = NTMP_ENTRY_ID_SIZE + RSST_STSE_DATA_SIZE(count) +
+		     RSST_CFGE_DATA_SIZE(count);
+	err = ntmp_alloc_data_mem(user->dev, &swcbd, (void **)&req);
 	if (err)
 		return err;
 
 	/* Set the request data buffer */
 	ntmp_fill_crd_eid(req, user->tbl.rsst_ver, 0, 0, 0);
-	ntmp_fill_request_hdr(&cbd, data.dma, NTMP_LEN(sizeof(*req), data.size),
+	ntmp_fill_request_hdr(&cbd, swcbd.dma, NTMP_LEN(sizeof(*req), swcbd.size),
 			      NTMP_RSST_ID, NTMP_CMD_QUERY, NTMP_AM_ENTRY_ID);
-	err = netc_xmit_ntmp_cmd(user, &cbd);
+
+	ntmp_select_and_lock_cbdr(user, &cbdr);
+	err = netc_xmit_ntmp_cmd(cbdr, &cbd, &swcbd);
 	if (err) {
 		dev_err(user->dev, "Failed to query RSST entry, err: %pe\n",
 			ERR_PTR(err));
-		goto end;
+		goto unlock_cbdr;
 	}
 
 	group = (u8 *)req;
@@ -453,8 +489,8 @@ int ntmp_rsst_query_entry(struct ntmp_user *user, u32 *table, int count)
 	for (i = 0; i < count; i++)
 		table[i] = group[i];
 
-end:
-	ntmp_free_data_mem(&data);
+unlock_cbdr:
+	ntmp_unlock_cbdr(cbdr);
 
 	return err;
 }
diff --git a/drivers/net/ethernet/freescale/enetc/ntmp_private.h b/drivers/net/ethernet/freescale/enetc/ntmp_private.h
index 3459cc45b610..f8dff3ba2c28 100644
--- a/drivers/net/ethernet/freescale/enetc/ntmp_private.h
+++ b/drivers/net/ethernet/freescale/enetc/ntmp_private.h
@@ -14,6 +14,7 @@
 #define NETC_CBDR_BD_NUM	256
 #define NETC_CBDRCIR_INDEX	GENMASK(9, 0)
 #define NETC_CBDRCIR_SBE	BIT(31)
+#define NETC_CBDR_CLEAN_WORK	16
 
 union netc_cbd {
 	struct {
@@ -56,13 +57,6 @@ union netc_cbd {
 	} resp_hdr; /* NTMP Response Message Header Format */
 };
 
-struct ntmp_dma_buf {
-	struct device *dev;
-	size_t size;
-	void *buf;
-	dma_addr_t dma;
-};
-
 struct ntmp_cmn_req_data {
 	__le16 update_act;
 	u8 dbg_opt;
diff --git a/include/linux/fsl/ntmp.h b/include/linux/fsl/ntmp.h
index 916dc4fe7de3..83a449b4d6ec 100644
--- a/include/linux/fsl/ntmp.h
+++ b/include/linux/fsl/ntmp.h
@@ -31,6 +31,12 @@ struct netc_tbl_vers {
 	u8 rsst_ver;
 };
 
+struct netc_swcbd {
+	void *buf;
+	dma_addr_t dma;
+	size_t size;
+};
+
 struct netc_cbdr {
 	struct device *dev;
 	struct netc_cbdr_regs regs;
@@ -44,9 +50,10 @@ struct netc_cbdr {
 	void *addr_base_align;
 	dma_addr_t dma_base;
 	dma_addr_t dma_base_align;
+	struct netc_swcbd *swcbd;
 
 	/* Serialize the order of command BD ring */
-	spinlock_t ring_lock;
+	struct mutex ring_lock;
 };
 
 struct ntmp_user {
-- 
2.34.1


^ permalink raw reply related

* [PATCH v2 net 1/2] net: enetc: correct the command BD ring consumer index
From: Wei Fang @ 2026-04-15  6:08 UTC (permalink / raw)
  To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
	davem, edumazet, kuba, pabeni, chleroy
  Cc: netdev, linux-kernel, imx, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20260415060833.2303846-1-wei.fang@nxp.com>

The command BD ring consumer index register has the consumer index as
the lower 10 bits, and bit 31 is SBE, which indicates whether a
system bus error occurred during execution of the CBD command. So if a
system bus error occurs, reading the register will get the SBE bit set.

However, the current implementation directly uses the register value as
the consumer index without masking it. Therefore, if a system bus error
occurs, an incorrect consumer index will be obtained, causing errors in
the processing of the command BD ring. Thus, we need to mask out the
other bits to obtain the correct consumer index.

In addition, this patch adds a check for the SBE bit after the polling
loop and returns an error if the bit is set.

Fixes: 4701073c3deb ("net: enetc: add initial netc-lib driver to support NTMP")
Signed-off-by: Wei Fang <wei.fang@nxp.com>
---
 drivers/net/ethernet/freescale/enetc/ntmp.c         | 13 ++++++++++---
 drivers/net/ethernet/freescale/enetc/ntmp_private.h |  2 ++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/freescale/enetc/ntmp.c b/drivers/net/ethernet/freescale/enetc/ntmp.c
index 0c1d343253bf..b188eb2d40c0 100644
--- a/drivers/net/ethernet/freescale/enetc/ntmp.c
+++ b/drivers/net/ethernet/freescale/enetc/ntmp.c
@@ -55,7 +55,7 @@ int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev,
 	spin_lock_init(&cbdr->ring_lock);
 
 	cbdr->next_to_use = netc_read(cbdr->regs.pir);
-	cbdr->next_to_clean = netc_read(cbdr->regs.cir);
+	cbdr->next_to_clean = netc_read(cbdr->regs.cir) & NETC_CBDRCIR_INDEX;
 
 	/* Step 1: Configure the base address of the Control BD Ring */
 	netc_write(cbdr->regs.bar0, lower_32_bits(cbdr->dma_base_align));
@@ -98,7 +98,7 @@ static void ntmp_clean_cbdr(struct netc_cbdr *cbdr)
 	int i;
 
 	i = cbdr->next_to_clean;
-	while (netc_read(cbdr->regs.cir) != i) {
+	while ((netc_read(cbdr->regs.cir) & NETC_CBDRCIR_INDEX) != i) {
 		cbd = ntmp_get_cbd(cbdr, i);
 		memset(cbd, 0, sizeof(*cbd));
 		i = (i + 1) % cbdr->bd_num;
@@ -135,12 +135,19 @@ static int netc_xmit_ntmp_cmd(struct ntmp_user *user, union netc_cbd *cbd)
 	cbdr->next_to_use = i;
 	netc_write(cbdr->regs.pir, i);
 
-	err = read_poll_timeout_atomic(netc_read, val, val == i,
+	err = read_poll_timeout_atomic(netc_read, val,
+				       (val & NETC_CBDRCIR_INDEX) == i,
 				       NETC_CBDR_DELAY_US, NETC_CBDR_TIMEOUT,
 				       true, cbdr->regs.cir);
 	if (unlikely(err))
 		goto cbdr_unlock;
 
+	if (unlikely(val & NETC_CBDRCIR_SBE)) {
+		dev_err(user->dev, "Command BD system bus error\n");
+		err = -EIO;
+		goto cbdr_unlock;
+	}
+
 	dma_rmb();
 	/* Get the writeback command BD, because the caller may need
 	 * to check some other fields of the response header.
diff --git a/drivers/net/ethernet/freescale/enetc/ntmp_private.h b/drivers/net/ethernet/freescale/enetc/ntmp_private.h
index 34394e40fddd..3459cc45b610 100644
--- a/drivers/net/ethernet/freescale/enetc/ntmp_private.h
+++ b/drivers/net/ethernet/freescale/enetc/ntmp_private.h
@@ -12,6 +12,8 @@
 
 #define NTMP_EID_REQ_LEN	8
 #define NETC_CBDR_BD_NUM	256
+#define NETC_CBDRCIR_INDEX	GENMASK(9, 0)
+#define NETC_CBDRCIR_SBE	BIT(31)
 
 union netc_cbd {
 	struct {
-- 
2.34.1


^ permalink raw reply related

* [PATCH v2 net 0/2] net: enetc: fix command BD ring issues
From: Wei Fang @ 2026-04-15  6:08 UTC (permalink / raw)
  To: claudiu.manoil, vladimir.oltean, xiaoning.wang, andrew+netdev,
	davem, edumazet, kuba, pabeni, chleroy
  Cc: netdev, linux-kernel, imx, linuxppc-dev, linux-arm-kernel

Currently, the implementation of command BD ring has two issues, one is
that the driver may obtain wrong consumer index of the ring, because the
driver does not mask out the SBE bit of the CIR value, so a wrong index
will be obtained when an SBE error occurs. The other one is that the DMA
buffer may be used after free. If netc_xmit_ntmp_cmd() times out and
returns an error, the pending command is not explicitly aborted, while
ntmp_free_data_mem() unconditionally frees the DMA buffer. If the buffer
has already been reallocated elsewhere, this may lead to silent memory
corruption, because the hardware eventually processes the pending command
and performs a DMA write of the response to the physical address of the
freed buffer. This patch set fixes these two issues.

---
v2:
1. Check the SBE bit in netc_xmit_ntmp_cmd().
2. Fix DMA buffer leak issue when netc_xmit_ntmp_cmd returns -EBUSY.
3. Check swcbd->buf in ntmp_free_data_mem().
4. Move ring_lock ownership to the caller to ensure the response buffer
cannot be reclaimed prematurely. So add the helpers ntmp_unlock_cbdr()
and ntmp_select_and_lock_cbdr().
---

Wei Fang (2):
  net: enetc: correct the command BD ring consumer index
  net: enetc: fix NTMP DMA use-after-free issue

 drivers/net/ethernet/freescale/enetc/ntmp.c   | 217 +++++++++++-------
 .../ethernet/freescale/enetc/ntmp_private.h   |  10 +-
 include/linux/fsl/ntmp.h                      |   9 +-
 3 files changed, 141 insertions(+), 95 deletions(-)

-- 
2.34.1


^ permalink raw reply

* Re: [PATCH v2] rose: fix OOB reads on short CLEAR REQUEST frames
From: Ashutosh Desai @ 2026-04-15  6:03 UTC (permalink / raw)
  To: edumazet; +Cc: netdev, linux-hams, davem, kuba, pabeni, horms, linux-kernel
In-Reply-To: <CANn89iLXG5ZMNeHkcLW+Ug9PxNnw_EKtpGVkPw1qeXEjJNtA0g@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 323 bytes --]

Hi Eric,

On Mon, Apr 13, 2026 at 11:11 PM Eric Dumazet wrote:
> rose_process_rx_frame() callers already call kfree_skb(skb) if
> rose_process_rx_frame() returns a 0. Your patch would add double-frees.
>
> Your patch is white-space mangled.

Thanks for the review. Sent v3 with all issues addressed.

Best regards,
Ashutosh

^ permalink raw reply

* [PATCH v3 net] rose: fix OOB reads on short CLEAR REQUEST frames
From: Ashutosh Desai @ 2026-04-15  5:57 UTC (permalink / raw)
  To: netdev
  Cc: linux-hams, davem, edumazet, kuba, pabeni, horms, stable,
	linux-kernel, Ashutosh Desai

rose_process_rx_frame() calls rose_decode() which reads skb->data[2]
without any prior length check. For CLEAR REQUEST frames the state
machines then read skb->data[3] and skb->data[4] as the cause and
diagnostic bytes.

A crafted 3-byte ROSE CLEAR REQUEST frame passes the minimum length
gate in rose_route_frame() and reaches rose_process_rx_frame(), where
rose_decode() reads one byte past the header and the state machines
read two bytes past the valid buffer. A remote peer can exploit this
to leak kernel memory contents or trigger a kernel panic.

Add a pskb_may_pull(skb, 3) check before rose_decode() to cover its
skb->data[2] access, and a pskb_may_pull(skb, 5) check afterwards for
the CLEAR REQUEST path to cover the cause and diagnostic reads.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org
Signed-off-by: Ashutosh Desai <ashutoshdesai993@gmail.com>
---
V2 -> V3: drop kfree_skb() calls to fix double-free; add end-user
          visible symptom to commit log; use [net] subject prefix
V1 -> V2: switch skb->len check to pskb_may_pull; add pskb_may_pull(skb, 3)
          before rose_decode() to cover its skb->data[2] access

v2: https://lore.kernel.org/netdev/177614667427.3606651.8700070406932922261@gmail.com/
v1: https://lore.kernel.org/netdev/20260409013246.2051746-1-ashutoshdesai993@gmail.com/

 net/rose/rose_in.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c
index 0276b393f0e5..8e60dc562b4a 100644
--- a/net/rose/rose_in.c
+++ b/net/rose/rose_in.c
@@ -269,8 +269,14 @@ int rose_process_rx_frame(struct sock *sk, struct sk_buff *skb)
 	if (rose->state == ROSE_STATE_0)
 		return 0;
 
+	if (!pskb_may_pull(skb, 3))
+		return 0;
+
 	frametype = rose_decode(skb, &ns, &nr, &q, &d, &m);
 
+	if (frametype == ROSE_CLEAR_REQUEST && !pskb_may_pull(skb, 5))
+		return 0;
+
 	switch (rose->state) {
 	case ROSE_STATE_1:
 		queued = rose_state1_machine(sk, skb, frametype);
-- 
2.34.1


^ permalink raw reply related

* [PATCH net 13/13] e1000e: Unroll PTP in probe error handling
From: Jacob Keller @ 2026-04-15  5:48 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, Jacob Keller, Matt Vollrath, Avigail Dahan
In-Reply-To: <20260414-iwl-net-submission-2026-04-14-v1-0-852f38e7da39@intel.com>

From: Matt Vollrath <tactii@gmail.com>

If probe fails after registering the PTP clock and its delayed work,
these resources must be released.

This was not an issue until a 2016 fix moved the e1000e_ptp_init() call
before the jump to err_register.

Fixes: aa524b66c5ef ("e1000e: don't modify SYSTIM registers during SIOCSHWTSTAMP ioctl")
Signed-off-by: Matt Vollrath <tactii@gmail.com>
Tested-by: Avigail Dahan <avigailx.dahan@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/e1000e/netdev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 9befdacd6730..7ce0cc8ab8f4 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -7706,6 +7706,7 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 err_register:
 	if (!(adapter->flags & FLAG_HAS_AMT))
 		e1000e_release_hw_control(adapter);
+	e1000e_ptp_remove(adapter);
 err_eeprom:
 	if (hw->phy.ops.check_reset_block && !hw->phy.ops.check_reset_block(hw))
 		e1000_phy_hw_reset(&adapter->hw);

-- 
2.53.0.1066.g1eceb487f285


^ permalink raw reply related

* [PATCH net 11/13] iavf: fix wrong VLAN mask for legacy Rx descriptors L2TAG2
From: Jacob Keller @ 2026-04-15  5:48 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, Jacob Keller, Petr Oros, Aleksandr Loktionov, Paul Menzel,
	Rafal Romanowski
In-Reply-To: <20260414-iwl-net-submission-2026-04-14-v1-0-852f38e7da39@intel.com>

From: Petr Oros <poros@redhat.com>

The IAVF_RXD_LEGACY_L2TAG2_M mask was incorrectly defined as
GENMASK_ULL(63, 32), extracting 32 bits from qw2 instead of the
16-bit VLAN tag. In the legacy Rx descriptor layout, the 2nd L2TAG2
(VLAN tag) occupies bits 63:48 of qw2, not 63:32.

The oversized mask causes FIELD_GET to return a 32-bit value where the
actual VLAN tag sits in bits 31:16. When this value is passed to
iavf_receive_skb() as a u16 parameter, it gets truncated to the lower
16 bits (which contain the 1st L2TAG2, typically zero). As a result,
__vlan_hwaccel_put_tag() is never called and software VLAN interfaces
on VFs receive no traffic.

This affects VFs behind ice PF (VIRTCHNL VLAN v2) when the PF
advertises VLAN stripping into L2TAG2_2 and legacy descriptors are
used.

The flex descriptor path already uses the correct mask
(IAVF_RXD_FLEX_L2TAG2_2_M = GENMASK_ULL(63, 48)).

Reproducer:
 1. Create 2 VFs on ice PF (echo 2 > sriov_numvfs)
 2. Disable spoofchk on both VFs
 3. Move each VF into a separate network namespace
 4. On each VF: create VLAN interface (e.g. vlan 198), assign IP,
    bring up
 5. Set rx-vlan-offload OFF on both VFs
 6. Ping between VLAN interfaces -> expect PASS
    (VLAN tag stays in packet data, kernel matches in-band)
 7. Set rx-vlan-offload ON on both VFs
 8. Ping between VLAN interfaces -> expect FAIL if bug present
    (HW strips VLAN tag into descriptor L2TAG2 field, wrong mask
    extracts bits 47:32 instead of 63:48, truncated to u16 -> zero,
    __vlan_hwaccel_put_tag() never called, packet delivered to parent
    interface, not VLAN interface)

The reproducer requires legacy Rx descriptors. On modern ice + iavf
with full PTP support, flex descriptors are always negotiated and the
buggy legacy path is never reached. Flex descriptors require all of:
 - CONFIG_PTP_1588_CLOCK enabled
 - VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC granted by PF
 - PTP capabilities negotiated (VIRTCHNL_VF_CAP_PTP)
 - VIRTCHNL_1588_PTP_CAP_RX_TSTAMP supported
 - VIRTCHNL_RXDID_2_FLEX_SQ_NIC present in DDP profile

If any condition is not met, iavf_select_rx_desc_format() falls back
to legacy descriptors (RXDID=1) and the wrong L2TAG2 mask is hit.

Fixes: 2dc8e7c36d80 ("iavf: refactor iavf_clean_rx_irq to support legacy and flex descriptors")
Signed-off-by: Petr Oros <poros@redhat.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Rafal Romanowski <rafal.romanowski@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf_type.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_type.h b/drivers/net/ethernet/intel/iavf/iavf_type.h
index 1d8cf29cb65a..5bb1de1cfd33 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_type.h
+++ b/drivers/net/ethernet/intel/iavf/iavf_type.h
@@ -277,7 +277,7 @@ struct iavf_rx_desc {
 /* L2 Tag 2 Presence */
 #define IAVF_RXD_LEGACY_L2TAG2P_M		BIT(0)
 /* Stripped S-TAG VLAN from the receive packet */
-#define IAVF_RXD_LEGACY_L2TAG2_M		GENMASK_ULL(63, 32)
+#define IAVF_RXD_LEGACY_L2TAG2_M		GENMASK_ULL(63, 48)
 /* Stripped S-TAG VLAN from the receive packet */
 #define IAVF_RXD_FLEX_L2TAG2_2_M		GENMASK_ULL(63, 48)
 /* The packet is a UDP tunneled packet */

-- 
2.53.0.1066.g1eceb487f285


^ permalink raw reply related

* [PATCH net 12/13] idpf: fix xdp crash in soft reset error path
From: Jacob Keller @ 2026-04-15  5:48 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, Jacob Keller, Emil Tantilov, stable, Aleksandr Loktionov,
	Patryk Holda
In-Reply-To: <20260414-iwl-net-submission-2026-04-14-v1-0-852f38e7da39@intel.com>

From: Emil Tantilov <emil.s.tantilov@intel.com>

NULL pointer dereference is reported in cases where idpf_vport_open()
fails during soft reset:

./xdpsock -i <inf> -q -r -N

[ 3179.186687] idpf 0000:83:00.0: Failed to initialize queue ids for vport 0: -12
[ 3179.276739] BUG: kernel NULL pointer dereference, address: 0000000000000010
[ 3179.277636] #PF: supervisor read access in kernel mode
[ 3179.278470] #PF: error_code(0x0000) - not-present page
[ 3179.279285] PGD 0
[ 3179.280083] Oops: Oops: 0000 [#1] SMP NOPTI
...
[ 3179.283997] Workqueue: events xp_release_deferred
[ 3179.284770] RIP: 0010:idpf_find_rxq_vec+0x17/0x30 [idpf]
...
[ 3179.291937] Call Trace:
[ 3179.292392]  <TASK>
[ 3179.292843]  idpf_qp_switch+0x25/0x820 [idpf]
[ 3179.293325]  idpf_xsk_pool_setup+0x7c/0x520 [idpf]
[ 3179.293803]  idpf_xdp+0x59/0x240 [idpf]
[ 3179.294275]  xp_disable_drv_zc+0x62/0xb0
[ 3179.294743]  xp_clear_dev+0x40/0xb0
[ 3179.295198]  xp_release_deferred+0x1f/0xa0
[ 3179.295648]  process_one_work+0x226/0x730
[ 3179.296106]  worker_thread+0x19e/0x340
[ 3179.296557]  ? __pfx_worker_thread+0x10/0x10
[ 3179.297009]  kthread+0xf4/0x130
[ 3179.297459]  ? __pfx_kthread+0x10/0x10
[ 3179.297910]  ret_from_fork+0x32c/0x410
[ 3179.298361]  ? __pfx_kthread+0x10/0x10
[ 3179.298702]  ret_from_fork_asm+0x1a/0x30

Fix the error handling of the soft reset in idpf_xdp_setup_prog() by
restoring vport->xdp_prog to the old value. This avoids referencing
the orphaned prog that was copied to vport->xdp_prog in the soft reset
and prevents a subsequent false positive from idpf_xdp_enabled().

Update the restart check in idpf_xsk_pool_setup() to use IDPF_VPORT_UP bit
instead of netif_running(). The idpf_vport_stop/start() calls will not
update the __LINK_STATE_START bit, making this test a false positive
should the soft reset fail.

Fixes: 3d57b2c00f09 ("idpf: add XSk pool initialization")
Cc: stable@vger.kernel.org
Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Patryk Holda <patryk.holda@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/idpf/xdp.c | 1 +
 drivers/net/ethernet/intel/idpf/xsk.c | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/idpf/xdp.c b/drivers/net/ethernet/intel/idpf/xdp.c
index cbccd4546768..18a6e7062863 100644
--- a/drivers/net/ethernet/intel/idpf/xdp.c
+++ b/drivers/net/ethernet/intel/idpf/xdp.c
@@ -488,6 +488,7 @@ static int idpf_xdp_setup_prog(struct idpf_vport *vport,
 				   "Could not reopen the vport after XDP setup");
 
 		cfg->user_config.xdp_prog = old;
+		vport->xdp_prog = old;
 		old = prog;
 	}
 
diff --git a/drivers/net/ethernet/intel/idpf/xsk.c b/drivers/net/ethernet/intel/idpf/xsk.c
index d95d3efdfd36..3d8c430efd2b 100644
--- a/drivers/net/ethernet/intel/idpf/xsk.c
+++ b/drivers/net/ethernet/intel/idpf/xsk.c
@@ -553,6 +553,7 @@ int idpf_xskrq_poll(struct idpf_rx_queue *rxq, u32 budget)
 
 int idpf_xsk_pool_setup(struct idpf_vport *vport, struct netdev_bpf *bpf)
 {
+	const struct idpf_netdev_priv *np = netdev_priv(vport->netdev);
 	struct xsk_buff_pool *pool = bpf->xsk.pool;
 	u32 qid = bpf->xsk.queue_id;
 	bool restart;
@@ -568,7 +569,8 @@ int idpf_xsk_pool_setup(struct idpf_vport *vport, struct netdev_bpf *bpf)
 		return -EINVAL;
 	}
 
-	restart = idpf_xdp_enabled(vport) && netif_running(vport->netdev);
+	restart = idpf_xdp_enabled(vport) &&
+		  test_bit(IDPF_VPORT_UP, np->state);
 	if (!restart)
 		goto pool;
 

-- 
2.53.0.1066.g1eceb487f285


^ permalink raw reply related

* [PATCH net 10/13] i40e: fix napi_enable/disable skipping ringless q_vectors
From: Jacob Keller @ 2026-04-15  5:48 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, Jacob Keller, Aleksandr Loktionov, stable, Sunitha Mekala
In-Reply-To: <20260414-iwl-net-submission-2026-04-14-v1-0-852f38e7da39@intel.com>

From: Aleksandr Loktionov <aleksandr.loktionov@intel.com>

After ethtool -L reduces the queue count, i40e_napi_disable_all() sets
NAPI_STATE_SCHED on all q_vectors, then i40e_vsi_map_rings_to_vectors()
clears ring pointers on the excess ones.  i40e_napi_enable_all() skips
those with:

	if (q_vector->rx.ring || q_vector->tx.ring)
		napi_enable(&q_vector->napi);

leaving them on dev->napi_list with NAPI_STATE_SCHED permanently set.

Writing to /sys/class/net/<iface>/threaded calls napi_stop_kthread()
on every entry in dev->napi_list.  The function loops on msleep(20)
waiting for NAPI_STATE_SCHED to clear -- which never happens for the
stale q_vectors.  The task hangs in D state forever; a concurrent write
deadlocks on dev->lock held by the first.

Commit 13a8cd191a2b ("i40e: Do not enable NAPI on q_vectors that have no
rings") added the guard to prevent a divide-by-zero in i40e_napi_poll()
when epoll busy-poll iterated all device NAPIs (4.x era). Since
7adc3d57fe2b ("net: Introduce preferred busy-polling"), from v5.11,
napi_busy_loop() polls by napi_id keyed to the socket, so ringless
q_vectors are never selected.  i40e_msix_clean_rings() also independently
avoids scheduling NAPI for them.  The guard is safe to remove.

Add an early return in i40e_napi_poll() for num_ringpairs == 0 so the
function is self-defending against a NULL tx.ring dereference at the
WB_ON_ITR check, should the NAPI ever fire through an unexpected path.

Reported-by: Jakub Kicinski <kuba@kernel.org>
Closes: https://lore.kernel.org/intel-wired-lan/20260316133100.6054a11f@kernel.org/
Fixes: 13a8cd191a2b ("i40e: Do not enable NAPI on q_vectors that have no rings")
Cc: stable@vger.kernel.org
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Sunitha Mekala <sunithax.d.mekala@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 28 ++++++++++++++++------------
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 10 ++++++++++
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 028bd500603a..b4ca8485f4b5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -5182,6 +5182,14 @@ static void i40e_clear_interrupt_scheme(struct i40e_pf *pf)
 /**
  * i40e_napi_enable_all - Enable NAPI for all q_vectors in the VSI
  * @vsi: the VSI being configured
+ *
+ * Enable NAPI on every q_vector that is registered with the netdev,
+ * regardless of whether it currently has rings assigned.  After a queue-
+ * count reduction (e.g. ethtool -L combined 1) the excess q_vectors lose
+ * their ring pointers inside i40e_vsi_map_rings_to_vectors but remain on
+ * dev->napi_list.  Leaving them in the napi_disable()-ed state
+ * (NAPI_STATE_SCHED set) causes napi_set_threaded() to spin forever on
+ * msleep(20) waiting for that bit to clear.
  **/
 static void i40e_napi_enable_all(struct i40e_vsi *vsi)
 {
@@ -5190,17 +5198,17 @@ static void i40e_napi_enable_all(struct i40e_vsi *vsi)
 	if (!vsi->netdev)
 		return;
 
-	for (q_idx = 0; q_idx < vsi->num_q_vectors; q_idx++) {
-		struct i40e_q_vector *q_vector = vsi->q_vectors[q_idx];
-
-		if (q_vector->rx.ring || q_vector->tx.ring)
-			napi_enable(&q_vector->napi);
-	}
+	for (q_idx = 0; q_idx < vsi->num_q_vectors; q_idx++)
+		napi_enable(&vsi->q_vectors[q_idx]->napi);
 }
 
 /**
  * i40e_napi_disable_all - Disable NAPI for all q_vectors in the VSI
  * @vsi: the VSI being configured
+ *
+ * Mirror of i40e_napi_enable_all: operate on every registered q_vector so
+ * enable/disable calls are always balanced, even when some q_vectors carry
+ * no rings (as happens after a queue-count reduction).
  **/
 static void i40e_napi_disable_all(struct i40e_vsi *vsi)
 {
@@ -5209,12 +5217,8 @@ static void i40e_napi_disable_all(struct i40e_vsi *vsi)
 	if (!vsi->netdev)
 		return;
 
-	for (q_idx = 0; q_idx < vsi->num_q_vectors; q_idx++) {
-		struct i40e_q_vector *q_vector = vsi->q_vectors[q_idx];
-
-		if (q_vector->rx.ring || q_vector->tx.ring)
-			napi_disable(&q_vector->napi);
-	}
+	for (q_idx = 0; q_idx < vsi->num_q_vectors; q_idx++)
+		napi_disable(&vsi->q_vectors[q_idx]->napi);
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 894f2d06d39d..3123459208d3 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2760,6 +2760,16 @@ int i40e_napi_poll(struct napi_struct *napi, int budget)
 		return 0;
 	}
 
+	/* A q_vector can have its ring pointers cleared after a queue-count
+	 * reduction (ethtool -L combined N) while napi_enable() was already
+	 * called on it.  Complete immediately so the poll loop exits cleanly
+	 * and we never dereference the NULL ring pointer below.
+	 */
+	if (unlikely(!q_vector->num_ringpairs)) {
+		napi_complete_done(napi, 0);
+		return 0;
+	}
+
 	/* Since the actual Tx work is minimal, we can give the Tx a larger
 	 * budget and be more aggressive about cleaning up the Tx descriptors.
 	 */

-- 
2.53.0.1066.g1eceb487f285


^ permalink raw reply related

* [PATCH net 09/13] i40e: don't advertise IFF_SUPP_NOFCS
From: Jacob Keller @ 2026-04-15  5:48 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, Jacob Keller, Kohei Enju, Aleksandr Loktionov,
	Sunitha Mekala
In-Reply-To: <20260414-iwl-net-submission-2026-04-14-v1-0-852f38e7da39@intel.com>

From: Kohei Enju <kohei@enjuk.jp>

i40e advertises IFF_SUPP_NOFCS, allowing users to use the SO_NOFCS
socket option. However, this option is silently ignored, as the driver
does not check skb->no_fcs, and always enables FCS insertion offload.

Fix this by removing the advertisement of IFF_SUPP_NOFCS.

This behavior can be reproduced with a simple AF_PACKET socket:

  import socket
  s = socket.socket(socket.AF_PACKET, socket.SOCK_RAW)
  s.setsockopt(socket.SOL_SOCKET, 43, 1) # SO_NOFCS
  s.bind(("eth0", 0))
  s.send(b'\xff' * 64)

Previously, send() succeeds but the driver ignores SO_NOFCS.
With this change, send() fails with -EPROTONOSUPPORT, as expected.

Fixes: 41c445ff0f48 ("i40e: main driver core")
Signed-off-by: Kohei Enju <kohei@enjuk.jp>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Sunitha Mekala <sunithax.d.mekala@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 926d001b2150..028bd500603a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -13783,7 +13783,6 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)
 	netdev->neigh_priv_len = sizeof(u32) * 4;
 
 	netdev->priv_flags |= IFF_UNICAST_FLT;
-	netdev->priv_flags |= IFF_SUPP_NOFCS;
 	/* Setup netdev TC information */
 	i40e_vsi_config_netdev_tc(vsi, vsi->tc_config.enabled_tc);
 

-- 
2.53.0.1066.g1eceb487f285


^ permalink raw reply related

* [PATCH net 08/13] ice: fix potential NULL pointer deref in error path of ice_set_ringparam()
From: Jacob Keller @ 2026-04-15  5:48 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, Jacob Keller, Kohei Enju, Paul Greenwalt, Rinitha S
In-Reply-To: <20260414-iwl-net-submission-2026-04-14-v1-0-852f38e7da39@intel.com>

From: Kohei Enju <kohei@enjuk.jp>

ice_set_ringparam() nullifies the tstamp_ring of the temporary tx_rings
without clearing the ICE_TX_RING_FLAGS_TXTIME bit.
When ICE_TX_RING_FLAGS_TXTIME is set and the subsequent
ice_setup_tx_ring() call fails, a NULL pointer dereference could happen
in the unwinding sequence:

ice_clean_tx_ring()
-> ice_is_txtime_cfg() == true (ICE_TX_RING_FLAGS_TXTIME is set)
-> ice_free_tx_tstamp_ring()
  -> ice_free_tstamp_ring()
    -> tstamp_ring->desc (NULL deref)

Clear ICE_TX_RING_FLAGS_TXTIME bit to avoid the potential issue.

Note that this potential issue was found by manual code review.
Compile tested only, since unfortunately I don't have E830 devices.

Fixes: ccde82e90946 ("ice: add E830 Earliest TxTime First Offload support")
Signed-off-by: Kohei Enju <kohei@enjuk.jp>
Reviewed-by: Paul Greenwalt <paul.greenwalt@intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_ethtool.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
index e6a20af6f63d..f28416a707d7 100644
--- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -3290,6 +3290,7 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring,
 		tx_rings[i].desc = NULL;
 		tx_rings[i].tx_buf = NULL;
 		tx_rings[i].tstamp_ring = NULL;
+		clear_bit(ICE_TX_RING_FLAGS_TXTIME, tx_rings[i].flags);
 		tx_rings[i].tx_tstamps = &pf->ptp.port.tx;
 		err = ice_setup_tx_ring(&tx_rings[i]);
 		if (err) {

-- 
2.53.0.1066.g1eceb487f285


^ permalink raw reply related

* [PATCH net 07/13] ice: fix race condition in TX timestamp ring cleanup
From: Jacob Keller @ 2026-04-15  5:48 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, Jacob Keller, Keita Morisaki, Aleksandr Loktionov,
	Rinitha S
In-Reply-To: <20260414-iwl-net-submission-2026-04-14-v1-0-852f38e7da39@intel.com>

From: Keita Morisaki <kmta1236@gmail.com>

Fix a race condition between ice_free_tx_tstamp_ring() and ice_tx_map()
that can cause a NULL pointer dereference.

ice_free_tx_tstamp_ring currently clears the ICE_TX_FLAGS_TXTIME flag
after NULLing the tstamp_ring. This could allow a concurrent ice_tx_map
call on another CPU to dereference the tstamp_ring, which could lead to
a NULL pointer dereference.

  CPU A:ice_free_tx_tstamp_ring() | CPU B:ice_tx_map()
  --------------------------------|---------------------------------
  tx_ring->tstamp_ring = NULL     |
                                  | ice_is_txtime_cfg() -> true
                                  | tstamp_ring = tx_ring->tstamp_ring
                                  | tstamp_ring->count  // NULL deref!
  flags &= ~ICE_TX_FLAGS_TXTIME   |

Fix by:
1. Reordering ice_free_tx_tstamp_ring() to clear the flag before
   NULLing the pointer, with smp_wmb() to ensure proper ordering.
2. Adding smp_rmb() in ice_tx_map() after the flag check to order the
   flag read before the pointer read, using READ_ONCE() for the
   pointer, and adding a NULL check as a safety net.
3. Converting tx_ring->flags from u8 to DECLARE_BITMAP() and using
   atomic bitops (set_bit(), clear_bit(), test_bit()) for all flag
   operations throughout the driver:
   - ICE_TX_RING_FLAGS_XDP
   - ICE_TX_RING_FLAGS_VLAN_L2TAG1
   - ICE_TX_RING_FLAGS_VLAN_L2TAG2
   - ICE_TX_RING_FLAGS_TXTIME

Fixes: ccde82e909467 ("ice: add E830 Earliest TxTime First Offload support")
Signed-off-by: Keita Morisaki <kmta1236@gmail.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/ice/ice.h         |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.h    | 16 ++++++++++------
 drivers/net/ethernet/intel/ice/ice_dcb_lib.c |  2 +-
 drivers/net/ethernet/intel/ice/ice_lib.c     |  4 ++--
 drivers/net/ethernet/intel/ice/ice_txrx.c    | 23 ++++++++++++++++-------
 5 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index eb3a48330cc1..725b130dd3a2 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -753,7 +753,7 @@ static inline bool ice_is_xdp_ena_vsi(struct ice_vsi *vsi)
 
 static inline void ice_set_ring_xdp(struct ice_tx_ring *ring)
 {
-	ring->flags |= ICE_TX_FLAGS_RING_XDP;
+	set_bit(ICE_TX_RING_FLAGS_XDP, ring->flags);
 }
 
 /**
@@ -778,7 +778,7 @@ static inline bool ice_is_txtime_ena(const struct ice_tx_ring *ring)
  */
 static inline bool ice_is_txtime_cfg(const struct ice_tx_ring *ring)
 {
-	return !!(ring->flags & ICE_TX_FLAGS_TXTIME);
+	return test_bit(ICE_TX_RING_FLAGS_TXTIME, ring->flags);
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
index b6547e1b7c42..5e517f219379 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -212,6 +212,14 @@ enum ice_rx_dtype {
 	ICE_RX_DTYPE_SPLIT_ALWAYS	= 2,
 };
 
+enum ice_tx_ring_flags {
+	ICE_TX_RING_FLAGS_XDP,
+	ICE_TX_RING_FLAGS_VLAN_L2TAG1,
+	ICE_TX_RING_FLAGS_VLAN_L2TAG2,
+	ICE_TX_RING_FLAGS_TXTIME,
+	ICE_TX_RING_FLAGS_NBITS,
+};
+
 struct ice_pkt_ctx {
 	u64 cached_phctime;
 	__be16 vlan_proto;
@@ -352,11 +360,7 @@ struct ice_tx_ring {
 	u16 count;			/* Number of descriptors */
 	u16 q_index;			/* Queue number of ring */
 
-	u8 flags;
-#define ICE_TX_FLAGS_RING_XDP		BIT(0)
-#define ICE_TX_FLAGS_RING_VLAN_L2TAG1	BIT(1)
-#define ICE_TX_FLAGS_RING_VLAN_L2TAG2	BIT(2)
-#define ICE_TX_FLAGS_TXTIME		BIT(3)
+	DECLARE_BITMAP(flags, ICE_TX_RING_FLAGS_NBITS);
 
 	struct xsk_buff_pool *xsk_pool;
 
@@ -398,7 +402,7 @@ static inline bool ice_ring_ch_enabled(struct ice_tx_ring *ring)
 
 static inline bool ice_ring_is_xdp(struct ice_tx_ring *ring)
 {
-	return !!(ring->flags & ICE_TX_FLAGS_RING_XDP);
+	return test_bit(ICE_TX_RING_FLAGS_XDP, ring->flags);
 }
 
 enum ice_container_type {
diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
index bd77f1c001ee..16aa25535152 100644
--- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
@@ -943,7 +943,7 @@ ice_tx_prepare_vlan_flags_dcb(struct ice_tx_ring *tx_ring,
 		/* if this is not already set it means a VLAN 0 + priority needs
 		 * to be offloaded
 		 */
-		if (tx_ring->flags & ICE_TX_FLAGS_RING_VLAN_L2TAG2)
+		if (test_bit(ICE_TX_RING_FLAGS_VLAN_L2TAG2, tx_ring->flags))
 			first->tx_flags |= ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN;
 		else
 			first->tx_flags |= ICE_TX_FLAGS_HW_VLAN;
diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index 689c6025ea82..837b71b7b2b7 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -1412,9 +1412,9 @@ static int ice_vsi_alloc_rings(struct ice_vsi *vsi)
 		ring->count = vsi->num_tx_desc;
 		ring->txq_teid = ICE_INVAL_TEID;
 		if (dvm_ena)
-			ring->flags |= ICE_TX_FLAGS_RING_VLAN_L2TAG2;
+			set_bit(ICE_TX_RING_FLAGS_VLAN_L2TAG2, ring->flags);
 		else
-			ring->flags |= ICE_TX_FLAGS_RING_VLAN_L2TAG1;
+			set_bit(ICE_TX_RING_FLAGS_VLAN_L2TAG1, ring->flags);
 		WRITE_ONCE(vsi->tx_rings[i], ring);
 	}
 
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 7be9c062949b..4ca1a0602307 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -190,9 +190,10 @@ void ice_free_tstamp_ring(struct ice_tx_ring *tx_ring)
 void ice_free_tx_tstamp_ring(struct ice_tx_ring *tx_ring)
 {
 	ice_free_tstamp_ring(tx_ring);
+	clear_bit(ICE_TX_RING_FLAGS_TXTIME, tx_ring->flags);
+	smp_wmb();	/* order flag clear before pointer NULL */
 	kfree_rcu(tx_ring->tstamp_ring, rcu);
-	tx_ring->tstamp_ring = NULL;
-	tx_ring->flags &= ~ICE_TX_FLAGS_TXTIME;
+	WRITE_ONCE(tx_ring->tstamp_ring, NULL);
 }
 
 /**
@@ -405,7 +406,7 @@ static int ice_alloc_tstamp_ring(struct ice_tx_ring *tx_ring)
 	tx_ring->tstamp_ring = tstamp_ring;
 	tstamp_ring->desc = NULL;
 	tstamp_ring->count = ice_calc_ts_ring_count(tx_ring);
-	tx_ring->flags |= ICE_TX_FLAGS_TXTIME;
+	set_bit(ICE_TX_RING_FLAGS_TXTIME, tx_ring->flags);
 	return 0;
 }
 
@@ -1521,13 +1522,20 @@ ice_tx_map(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first,
 		return;
 
 	if (ice_is_txtime_cfg(tx_ring)) {
-		struct ice_tstamp_ring *tstamp_ring = tx_ring->tstamp_ring;
-		u32 tstamp_count = tstamp_ring->count;
-		u32 j = tstamp_ring->next_to_use;
+		struct ice_tstamp_ring *tstamp_ring;
+		u32 tstamp_count, j;
 		struct ice_ts_desc *ts_desc;
 		struct timespec64 ts;
 		u32 tstamp;
 
+		smp_rmb();	/* order flag read before pointer read */
+		tstamp_ring = READ_ONCE(tx_ring->tstamp_ring);
+		if (unlikely(!tstamp_ring))
+			goto ring_kick;
+
+		tstamp_count = tstamp_ring->count;
+		j = tstamp_ring->next_to_use;
+
 		ts = ktime_to_timespec64(first->skb->tstamp);
 		tstamp = ts.tv_nsec >> ICE_TXTIME_CTX_RESOLUTION_128NS;
 
@@ -1555,6 +1563,7 @@ ice_tx_map(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first,
 		tstamp_ring->next_to_use = j;
 		writel_relaxed(j, tstamp_ring->tail);
 	} else {
+ring_kick:
 		writel_relaxed(i, tx_ring->tail);
 	}
 	return;
@@ -1814,7 +1823,7 @@ ice_tx_prepare_vlan_flags(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first)
 	 */
 	if (skb_vlan_tag_present(skb)) {
 		first->vid = skb_vlan_tag_get(skb);
-		if (tx_ring->flags & ICE_TX_FLAGS_RING_VLAN_L2TAG2)
+		if (test_bit(ICE_TX_RING_FLAGS_VLAN_L2TAG2, tx_ring->flags))
 			first->tx_flags |= ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN;
 		else
 			first->tx_flags |= ICE_TX_FLAGS_HW_VLAN;

-- 
2.53.0.1066.g1eceb487f285


^ permalink raw reply related

* [PATCH net 06/13] ice: fix ICE_AQ_LINK_SPEED_M for 200G
From: Jacob Keller @ 2026-04-15  5:48 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, Jacob Keller, Paul Greenwalt, Aleksandr Loktionov,
	Simon Horman, Sunitha Mekala
In-Reply-To: <20260414-iwl-net-submission-2026-04-14-v1-0-852f38e7da39@intel.com>

From: Paul Greenwalt <paul.greenwalt@intel.com>

When setting PHY configuration during driver initialization, 200G link
speed is not being advertised even when the PHY is capable. This is
because the get PHY capabilities link speed response is being masked by
ICE_AQ_LINK_SPEED_M, which does not include the 200G link speed bit.

ICE_AQ_LINK_SPEED_200GB is defined as BIT(11), but the mask 0x7FF only
covers bits 0-10. Fix ICE_AQ_LINK_SPEED_M to use GENMASK(11, 0) so
that it covers all defined link speed bits including 200G.

Fixes: 24407a01e57c ("ice: Add 200G speed/phy type use")
Signed-off-by: Paul Greenwalt <paul.greenwalt@intel.com>
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Tested-by: Sunitha Mekala <sunithax.d.mekala@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_adminq_cmd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index 859e9c66f3e7..3cbb1b0582e3 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -1252,7 +1252,7 @@ struct ice_aqc_get_link_status_data {
 #define ICE_AQ_LINK_PWR_QSFP_CLASS_3	2
 #define ICE_AQ_LINK_PWR_QSFP_CLASS_4	3
 	__le16 link_speed;
-#define ICE_AQ_LINK_SPEED_M		0x7FF
+#define ICE_AQ_LINK_SPEED_M		GENMASK(11, 0)
 #define ICE_AQ_LINK_SPEED_10MB		BIT(0)
 #define ICE_AQ_LINK_SPEED_100MB		BIT(1)
 #define ICE_AQ_LINK_SPEED_1000MB	BIT(2)

-- 
2.53.0.1066.g1eceb487f285


^ permalink raw reply related

* [PATCH net 05/13] ice: fix PHY config on media change with link-down-on-close
From: Jacob Keller @ 2026-04-15  5:48 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, Jacob Keller, Paul Greenwalt, Przemek Kitszel,
	Aleksandr Loktionov, Sunitha Mekala
In-Reply-To: <20260414-iwl-net-submission-2026-04-14-v1-0-852f38e7da39@intel.com>

From: Paul Greenwalt <paul.greenwalt@intel.com>

Commit 1a3571b5938c ("ice: restore PHY settings on media insertion")
introduced separate flows for setting PHY configuration on media
present: ice_configure_phy() when link-down-on-close is disabled, and
ice_force_phys_link_state() when enabled. The latter incorrectly uses
the previous configuration even after module change, causing link
issues such as wrong speed or no link.

Unify PHY configuration into a single ice_phy_cfg() function with a
link_en parameter, ensuring PHY capabilities are always fetched fresh
from hardware.

Fixes: 1a3571b5938c ("ice: restore PHY settings on media insertion")
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Paul Greenwalt <paul.greenwalt@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Tested-by: Sunitha Mekala <sunithax.d.mekala@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_main.c | 121 +++++++-----------------------
 1 file changed, 27 insertions(+), 94 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 3c36e3641b9e..ce3a0afe302d 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -1922,82 +1922,6 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
 	ice_print_vfs_mdd_events(pf);
 }
 
-/**
- * ice_force_phys_link_state - Force the physical link state
- * @vsi: VSI to force the physical link state to up/down
- * @link_up: true/false indicates to set the physical link to up/down
- *
- * Force the physical link state by getting the current PHY capabilities from
- * hardware and setting the PHY config based on the determined capabilities. If
- * link changes a link event will be triggered because both the Enable Automatic
- * Link Update and LESM Enable bits are set when setting the PHY capabilities.
- *
- * Returns 0 on success, negative on failure
- */
-static int ice_force_phys_link_state(struct ice_vsi *vsi, bool link_up)
-{
-	struct ice_aqc_get_phy_caps_data *pcaps;
-	struct ice_aqc_set_phy_cfg_data *cfg;
-	struct ice_port_info *pi;
-	struct device *dev;
-	int retcode;
-
-	if (!vsi || !vsi->port_info || !vsi->back)
-		return -EINVAL;
-	if (vsi->type != ICE_VSI_PF)
-		return 0;
-
-	dev = ice_pf_to_dev(vsi->back);
-
-	pi = vsi->port_info;
-
-	pcaps = kzalloc_obj(*pcaps);
-	if (!pcaps)
-		return -ENOMEM;
-
-	retcode = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_ACTIVE_CFG, pcaps,
-				      NULL);
-	if (retcode) {
-		dev_err(dev, "Failed to get phy capabilities, VSI %d error %d\n",
-			vsi->vsi_num, retcode);
-		retcode = -EIO;
-		goto out;
-	}
-
-	/* No change in link */
-	if (link_up == !!(pcaps->caps & ICE_AQC_PHY_EN_LINK) &&
-	    link_up == !!(pi->phy.link_info.link_info & ICE_AQ_LINK_UP))
-		goto out;
-
-	/* Use the current user PHY configuration. The current user PHY
-	 * configuration is initialized during probe from PHY capabilities
-	 * software mode, and updated on set PHY configuration.
-	 */
-	cfg = kmemdup(&pi->phy.curr_user_phy_cfg, sizeof(*cfg), GFP_KERNEL);
-	if (!cfg) {
-		retcode = -ENOMEM;
-		goto out;
-	}
-
-	cfg->caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT;
-	if (link_up)
-		cfg->caps |= ICE_AQ_PHY_ENA_LINK;
-	else
-		cfg->caps &= ~ICE_AQ_PHY_ENA_LINK;
-
-	retcode = ice_aq_set_phy_cfg(&vsi->back->hw, pi, cfg, NULL);
-	if (retcode) {
-		dev_err(dev, "Failed to set phy config, VSI %d error %d\n",
-			vsi->vsi_num, retcode);
-		retcode = -EIO;
-	}
-
-	kfree(cfg);
-out:
-	kfree(pcaps);
-	return retcode;
-}
-
 /**
  * ice_init_nvm_phy_type - Initialize the NVM PHY type
  * @pi: port info structure
@@ -2066,7 +1990,7 @@ static void ice_init_link_dflt_override(struct ice_port_info *pi)
  * first time media is available. The ICE_LINK_DEFAULT_OVERRIDE_PENDING state
  * is used to indicate that the user PHY cfg default override is initialized
  * and the PHY has not been configured with the default override settings. The
- * state is set here, and cleared in ice_configure_phy the first time the PHY is
+ * state is set here, and cleared in ice_phy_cfg the first time the PHY is
  * configured.
  *
  * This function should be called only if the FW doesn't support default
@@ -2172,14 +2096,18 @@ static int ice_init_phy_user_cfg(struct ice_port_info *pi)
 }
 
 /**
- * ice_configure_phy - configure PHY
+ * ice_phy_cfg - configure PHY
  * @vsi: VSI of PHY
+ * @link_en: true/false indicates to set link to enable/disable
  *
  * Set the PHY configuration. If the current PHY configuration is the same as
- * the curr_user_phy_cfg, then do nothing to avoid link flap. Otherwise
- * configure the based get PHY capabilities for topology with media.
+ * the curr_user_phy_cfg and link_en hasn't changed, then do nothing to avoid
+ * link flap. Otherwise configure the PHY based get PHY capabilities for
+ * topology with media and link_en.
+ *
+ * Return: 0 on success, negative on failure
  */
-static int ice_configure_phy(struct ice_vsi *vsi)
+static int ice_phy_cfg(struct ice_vsi *vsi, bool link_en)
 {
 	struct device *dev = ice_pf_to_dev(vsi->back);
 	struct ice_port_info *pi = vsi->port_info;
@@ -2199,9 +2127,6 @@ static int ice_configure_phy(struct ice_vsi *vsi)
 	    phy->link_info.topo_media_conflict == ICE_AQ_LINK_TOPO_UNSUPP_MEDIA)
 		return -EPERM;
 
-	if (test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, pf->flags))
-		return ice_force_phys_link_state(vsi, true);
-
 	pcaps = kzalloc_obj(*pcaps);
 	if (!pcaps)
 		return -ENOMEM;
@@ -2215,10 +2140,8 @@ static int ice_configure_phy(struct ice_vsi *vsi)
 		goto done;
 	}
 
-	/* If PHY enable link is configured and configuration has not changed,
-	 * there's nothing to do
-	 */
-	if (pcaps->caps & ICE_AQC_PHY_EN_LINK &&
+	/* Configuration has not changed. There's nothing to do. */
+	if (link_en == !!(pcaps->caps & ICE_AQC_PHY_EN_LINK) &&
 	    ice_phy_caps_equals_cfg(pcaps, &phy->curr_user_phy_cfg))
 		goto done;
 
@@ -2282,8 +2205,12 @@ static int ice_configure_phy(struct ice_vsi *vsi)
 	 */
 	ice_cfg_phy_fc(pi, cfg, phy->curr_user_fc_req);
 
-	/* Enable link and link update */
-	cfg->caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT | ICE_AQ_PHY_ENA_LINK;
+	/* Enable/Disable link and link update */
+	cfg->caps |= ICE_AQ_PHY_ENA_AUTO_LINK_UPDT;
+	if (link_en)
+		cfg->caps |= ICE_AQ_PHY_ENA_LINK;
+	else
+		cfg->caps &= ~ICE_AQ_PHY_ENA_LINK;
 
 	err = ice_aq_set_phy_cfg(&pf->hw, pi, cfg, NULL);
 	if (err)
@@ -2336,7 +2263,7 @@ static void ice_check_media_subtask(struct ice_pf *pf)
 		    test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, vsi->back->flags))
 			return;
 
-		err = ice_configure_phy(vsi);
+		err = ice_phy_cfg(vsi, true);
 		if (!err)
 			clear_bit(ICE_FLAG_NO_MEDIA, pf->flags);
 
@@ -4892,9 +4819,15 @@ static int ice_init_link(struct ice_pf *pf)
 
 		if (!test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, pf->flags)) {
 			struct ice_vsi *vsi = ice_get_main_vsi(pf);
+			struct ice_link_default_override_tlv *ldo;
+			bool link_en;
+
+			ldo = &pf->link_dflt_override;
+			link_en = !(ldo->options &
+				    ICE_LINK_OVERRIDE_AUTO_LINK_DIS);
 
 			if (vsi)
-				ice_configure_phy(vsi);
+				ice_phy_cfg(vsi, link_en);
 		}
 	} else {
 		set_bit(ICE_FLAG_NO_MEDIA, pf->flags);
@@ -9707,7 +9640,7 @@ int ice_open_internal(struct net_device *netdev)
 			}
 		}
 
-		err = ice_configure_phy(vsi);
+		err = ice_phy_cfg(vsi, true);
 		if (err) {
 			netdev_err(netdev, "Failed to set physical link up, error %d\n",
 				   err);
@@ -9748,7 +9681,7 @@ int ice_stop(struct net_device *netdev)
 	}
 
 	if (test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, vsi->back->flags)) {
-		int link_err = ice_force_phys_link_state(vsi, false);
+		int link_err = ice_phy_cfg(vsi, false);
 
 		if (link_err) {
 			if (link_err == -ENOMEDIUM)

-- 
2.53.0.1066.g1eceb487f285


^ permalink raw reply related

* [PATCH net 04/13] ice: fix double-free of tx_buf skb
From: Jacob Keller @ 2026-04-15  5:47 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, Jacob Keller, Michal Schmidt
In-Reply-To: <20260414-iwl-net-submission-2026-04-14-v1-0-852f38e7da39@intel.com>

From: Michal Schmidt <mschmidt@redhat.com>

If ice_tso() or ice_tx_csum() fail, the error path in
ice_xmit_frame_ring() frees the skb, but the 'first' tx_buf still points
to it and is marked as valid (ICE_TX_BUF_SKB).
'next_to_use' remains unchanged, so the potential problem will
likely fix itself when the next packet is transmitted and the tx_buf
gets overwritten. But if there is no next packet and the interface is
brought down instead, ice_clean_tx_ring() -> ice_unmap_and_free_tx_buf()
will find the tx_buf and free the skb for the second time.

The fix is to reset the tx_buf type to ICE_TX_BUF_EMPTY in the error
path, so that ice_unmap_and_free_tx_buf() does not free the skb again.
Move the initialization of 'first' up, to ensure it's already valid in
case we hit the linearization error path.

The bug was spotted by AI while I had it looking for something else.
It also proposed an initial version of the patch.

I reproduced the bug and tested the fix by adding code to inject
failures, on a build with KASAN.

I looked for similar bugs in related Intel drivers and did not find any.

Fixes: d76a60ba7afb ("ice: Add support for VLANs and offloads")
Assisted-by: Claude:claude-4.6-opus-high Cursor
Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_txrx.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index a2cd4cf37734..7be9c062949b 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -2158,6 +2158,9 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_tx_ring *tx_ring)
 
 	ice_trace(xmit_frame_ring, tx_ring, skb);
 
+	/* record the location of the first descriptor for this packet */
+	first = &tx_ring->tx_buf[tx_ring->next_to_use];
+
 	count = ice_xmit_desc_count(skb);
 	if (ice_chk_linearize(skb, count)) {
 		if (__skb_linearize(skb))
@@ -2183,8 +2186,6 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_tx_ring *tx_ring)
 
 	offload.tx_ring = tx_ring;
 
-	/* record the location of the first descriptor for this packet */
-	first = &tx_ring->tx_buf[tx_ring->next_to_use];
 	first->skb = skb;
 	first->type = ICE_TX_BUF_SKB;
 	first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN);
@@ -2249,6 +2250,7 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_tx_ring *tx_ring)
 out_drop:
 	ice_trace(xmit_frame_ring_drop, tx_ring, skb);
 	dev_kfree_skb_any(skb);
+	first->type = ICE_TX_BUF_EMPTY;
 	return NETDEV_TX_OK;
 }
 

-- 
2.53.0.1066.g1eceb487f285


^ permalink raw reply related

* [PATCH net 03/13] ice: fix double free in ice_sf_eth_activate() error path
From: Jacob Keller @ 2026-04-15  5:47 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, Jacob Keller, Guangshuo Li, stable, Aleksandr Loktionov,
	Simon Horman
In-Reply-To: <20260414-iwl-net-submission-2026-04-14-v1-0-852f38e7da39@intel.com>

From: Guangshuo Li <lgs201920130244@gmail.com>

When auxiliary_device_add() fails, ice_sf_eth_activate() jumps to
aux_dev_uninit and calls auxiliary_device_uninit(&sf_dev->adev).

The device release callback ice_sf_dev_release() frees sf_dev, but
the current error path falls through to sf_dev_free and calls
kfree(sf_dev) again, causing a double free.

Keep kfree(sf_dev) for the auxiliary_device_init() failure path, but
avoid falling through to sf_dev_free after auxiliary_device_uninit().

Fixes: 13acc5c4cdbe ("ice: subfunction activation and base devlink ops")
Cc: stable@vger.kernel.org
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Guangshuo Li <lgs201920130244@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_sf_eth.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/intel/ice/ice_sf_eth.c b/drivers/net/ethernet/intel/ice/ice_sf_eth.c
index 2cf04bc6edce..a730aa368c92 100644
--- a/drivers/net/ethernet/intel/ice/ice_sf_eth.c
+++ b/drivers/net/ethernet/intel/ice/ice_sf_eth.c
@@ -305,6 +305,8 @@ ice_sf_eth_activate(struct ice_dynamic_port *dyn_port,
 
 aux_dev_uninit:
 	auxiliary_device_uninit(&sf_dev->adev);
+	return err;
+
 sf_dev_free:
 	kfree(sf_dev);
 xa_erase:

-- 
2.53.0.1066.g1eceb487f285


^ permalink raw reply related

* [PATCH net 02/13] ice: update PCS latency settings for E825 10G/25Gb modes
From: Jacob Keller @ 2026-04-15  5:47 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, Jacob Keller, Grzegorz Nitka, Zoltan Fodor,
	Aleksandr Loktionov, Sunitha Mekala
In-Reply-To: <20260414-iwl-net-submission-2026-04-14-v1-0-852f38e7da39@intel.com>

From: Grzegorz Nitka <grzegorz.nitka@intel.com>

Update MAC Rx/Tx offset registers settings (PHY_MAC_[RX|TX]_OFFSET
registers) with the data obtained with the latest research. It applies
to PCS latency settings for the following speeds/modes:
* 10Gb NO-FEC
        - TX latency changed from 71.25 ns to 73 ns
        - RX latency changed from -25.6 ns to -28 ns
* 25Gb NO-FEC
	- TX latency changed from 28.17 ns to 33 ns
        - RX latency changed from -12.45 ns to -12 ns
* 25Gb RS-FEC
        - TX latency changed from 64.5 ns to 69 ns
        - RX latency changed from -3.6 ns to -3 ns

The original data came from simulation and pre-production hardware.
The new data measures the actual delays and as such is more accurate.

Fixes: 7cab44f1c35f ("ice: Introduce ETH56G PHY model for E825C products")
Co-developed-by: Zoltan Fodor <zoltan.fodor@intel.com>
Signed-off-by: Zoltan Fodor <zoltan.fodor@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Grzegorz Nitka <grzegorz.nitka@intel.com>
Tested-by: Sunitha Mekala <sunithax.d.mekala@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_ptp_consts.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_consts.h b/drivers/net/ethernet/intel/ice/ice_ptp_consts.h
index 19dddd9b53dd..4d298c27bfb2 100644
--- a/drivers/net/ethernet/intel/ice/ice_ptp_consts.h
+++ b/drivers/net/ethernet/intel/ice/ice_ptp_consts.h
@@ -78,14 +78,14 @@ struct ice_eth56g_mac_reg_cfg eth56g_mac_cfg[NUM_ICE_ETH56G_LNK_SPD] = {
 		.blktime = 0x666, /* 3.2 */
 		.tx_offset = {
 			.serdes = 0x234c, /* 17.6484848 */
-			.no_fec = 0x8e80, /* 71.25 */
+			.no_fec = 0x93d9, /* 73 */
 			.fc = 0xb4a4, /* 90.32 */
 			.sfd = 0x4a4, /* 2.32 */
 			.onestep = 0x4ccd /* 38.4 */
 		},
 		.rx_offset = {
 			.serdes = 0xffffeb27, /* -10.42424 */
-			.no_fec = 0xffffcccd, /* -25.6 */
+			.no_fec = 0xffffc7b6, /* -28 */
 			.fc = 0xfffc557b, /* -469.26 */
 			.sfd = 0x4a4, /* 2.32 */
 			.bs_ds = 0x32 /* 0.0969697 */
@@ -118,17 +118,17 @@ struct ice_eth56g_mac_reg_cfg eth56g_mac_cfg[NUM_ICE_ETH56G_LNK_SPD] = {
 		.mktime = 0x147b, /* 10.24, only if RS-FEC enabled */
 		.tx_offset = {
 			.serdes = 0xe1e, /* 7.0593939 */
-			.no_fec = 0x3857, /* 28.17 */
+			.no_fec = 0x4266, /* 33 */
 			.fc = 0x48c3, /* 36.38 */
-			.rs = 0x8100, /* 64.5 */
+			.rs = 0x8a00, /* 69 */
 			.sfd = 0x1dc, /* 0.93 */
 			.onestep = 0x1eb8 /* 15.36 */
 		},
 		.rx_offset = {
 			.serdes = 0xfffff7a9, /* -4.1697 */
-			.no_fec = 0xffffe71a, /* -12.45 */
+			.no_fec = 0xffffe700, /* -12 */
 			.fc = 0xfffe894d, /* -187.35 */
-			.rs = 0xfffff8cd, /* -3.6 */
+			.rs = 0xfffff8cc, /* -3 */
 			.sfd = 0x1dc, /* 0.93 */
 			.bs_ds = 0x14 /* 0.0387879, RS-FEC 0 */
 		}

-- 
2.53.0.1066.g1eceb487f285


^ permalink raw reply related

* [PATCH net 01/13] ice: fix 'adjust' timer programming for E830 devices
From: Jacob Keller @ 2026-04-15  5:47 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, Jacob Keller, Grzegorz Nitka, Aleksandr Loktionov,
	Simon Horman, Rinitha S
In-Reply-To: <20260414-iwl-net-submission-2026-04-14-v1-0-852f38e7da39@intel.com>

From: Grzegorz Nitka <grzegorz.nitka@intel.com>

Fix the incorrect 'adjust the timer' programming sequence for E830 series
devices. Only the GLTSYN_SHADJ shadow registers were programmed in the
current implementation. According to the specification [1], a write to
the GLTSYN_CMD command register, with the CMD field set to the
"Adjust the Time" value, is also required for the timer adjustment to
take effect.

The flow was broken for adjustments within the S32_MAX/MIN range
(around +/- 2 seconds). For bigger adjustments, a non-atomic programming
flow is used, involving set timer programming. The non-atomic flow is
implemented correctly.

Testing hints:
Run command:
	phc_ctl /dev/ptpX get adj 2 get
Expected result:
	Returned timestamps differ at least by 2 seconds

[1] Intel® Ethernet Controller E830 Datasheet rev 1.3, chapter 9.7.5.4
https://cdrdv2.intel.com/v1/dl/getContent/787353?explicitVersion=true

Fixes: f00307522786 ("ice: Implement PTP support for E830 devices")
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Grzegorz Nitka <grzegorz.nitka@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Tested-by: Rinitha S <sx.rinitha@intel.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_ptp_hw.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
index 61c0a0d93ea8..5a5c511ccbb6 100644
--- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
+++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
@@ -5381,8 +5381,8 @@ int ice_ptp_write_incval_locked(struct ice_hw *hw, u64 incval)
  */
 int ice_ptp_adj_clock(struct ice_hw *hw, s32 adj)
 {
+	int err = 0;
 	u8 tmr_idx;
-	int err;
 
 	tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned;
 
@@ -5399,8 +5399,8 @@ int ice_ptp_adj_clock(struct ice_hw *hw, s32 adj)
 		err = ice_ptp_prep_phy_adj_e810(hw, adj);
 		break;
 	case ICE_MAC_E830:
-		/* E830 sync PHYs automatically after setting GLTSYN_SHADJ */
-		return 0;
+		/* E830 sync PHYs automatically after setting cmd register */
+		break;
 	case ICE_MAC_GENERIC:
 		err = ice_ptp_prep_phy_adj_e82x(hw, adj);
 		break;

-- 
2.53.0.1066.g1eceb487f285


^ permalink raw reply related

* [PATCH net 00/13] Intel Wired LAN Driver Updates 2026-04-14 (ice, i40e, iavf, idpf, e1000e)
From: Jacob Keller @ 2026-04-15  5:47 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, Jacob Keller, Grzegorz Nitka, Aleksandr Loktionov,
	Simon Horman, Rinitha S, Zoltan Fodor, Sunitha Mekala,
	Guangshuo Li, stable, Michal Schmidt, Paul Greenwalt,
	Przemek Kitszel, Keita Morisaki, Kohei Enju, Petr Oros,
	Paul Menzel, Rafal Romanowski, Emil Tantilov, Patryk Holda,
	Matt Vollrath, Avigail Dahan

Grzegorz updates the logic for adjusting the PTP hardware clock on E830,
fixing a bug that prevented adjustments below S32_MAX/MIN nanoseconds.

Grzegorz and Zoli update the PCS latency settings for E825 devices at 10GbE
and 25GbE, improving the accuracy of timestamps based on data from
production hardware.

Michal Schmidt fixes a double-free that could happen if a particular error
path is taken in ice_xmit_frame_ring().

Guangshuo fixes a double-free that could happen during error paths in the
ice_sf_eth_activate() function.

Paul Greenwalt fixes the PHY link configuration when the link-down-on-close
driver parameter is enabled and new media is inserted.

Paul Greenwalt fixes the ICE_AQ_LINK_SPEED_M macro for 200G, enabling 200G
link speed advertisement.

Keita Morisaki fixes a race condition in the ice Tx timestamp ring cleanup,
preventing a possible NULL pointer dereference.

Kohei Enju fixes a potential NULL pointer dereference in ice_set_ringparam().

Kohei Enju fixes i40e to stop advertising IFF_SUPP_NOFCS, when the driver
does not actually support the feature.

Aleksandr fixes i40e napi_enable/disable for q_vectors that no longer have
rings.

Petr fixes the VLAN L2TAG2 mask when the iAVF VF and a PF negotiate use of
the legacy Rx descriptor format.

Emil fixes a NULL pointer dereference that can happen in the soft reset if
a particular error path is taken.

Matt fixes the unrolling logic for PTP when the e1000e probe fails after
the PTP clock has been registered.

 **A note to stable backports**

  The patches [7/13] ("ice: fix race condition in TX timestamp ring
  cleanup") and [8/13] ("ice: fix potential NULL pointer deref in error
  path of ice_set_ringparam()") must be backported together. Otherwise the
  fix in patch 8 will not work properly.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
Aleksandr Loktionov (1):
      i40e: fix napi_enable/disable skipping ringless q_vectors

Emil Tantilov (1):
      idpf: fix xdp crash in soft reset error path

Grzegorz Nitka (2):
      ice: fix 'adjust' timer programming for E830 devices
      ice: update PCS latency settings for E825 10G/25Gb modes

Guangshuo Li (1):
      ice: fix double free in ice_sf_eth_activate() error path

Keita Morisaki (1):
      ice: fix race condition in TX timestamp ring cleanup

Kohei Enju (2):
      ice: fix potential NULL pointer deref in error path of ice_set_ringparam()
      i40e: don't advertise IFF_SUPP_NOFCS

Matt Vollrath (1):
      e1000e: Unroll PTP in probe error handling

Michal Schmidt (1):
      ice: fix double-free of tx_buf skb

Paul Greenwalt (2):
      ice: fix PHY config on media change with link-down-on-close
      ice: fix ICE_AQ_LINK_SPEED_M for 200G

Petr Oros (1):
      iavf: fix wrong VLAN mask for legacy Rx descriptors L2TAG2

 drivers/net/ethernet/intel/iavf/iavf_type.h     |   2 +-
 drivers/net/ethernet/intel/ice/ice.h            |   4 +-
 drivers/net/ethernet/intel/ice/ice_adminq_cmd.h |   2 +-
 drivers/net/ethernet/intel/ice/ice_ptp_consts.h |  12 +--
 drivers/net/ethernet/intel/ice/ice_txrx.h       |  16 ++--
 drivers/net/ethernet/intel/e1000e/netdev.c      |   1 +
 drivers/net/ethernet/intel/i40e/i40e_main.c     |  29 +++---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c     |  10 ++
 drivers/net/ethernet/intel/ice/ice_dcb_lib.c    |   2 +-
 drivers/net/ethernet/intel/ice/ice_ethtool.c    |   1 +
 drivers/net/ethernet/intel/ice/ice_lib.c        |   4 +-
 drivers/net/ethernet/intel/ice/ice_main.c       | 121 ++++++------------------
 drivers/net/ethernet/intel/ice/ice_ptp_hw.c     |   6 +-
 drivers/net/ethernet/intel/ice/ice_sf_eth.c     |   2 +
 drivers/net/ethernet/intel/ice/ice_txrx.c       |  29 ++++--
 drivers/net/ethernet/intel/idpf/xdp.c           |   1 +
 drivers/net/ethernet/intel/idpf/xsk.c           |   4 +-
 17 files changed, 107 insertions(+), 139 deletions(-)
---
base-commit: b9d8b856689d2b968495d79fe653d87fcb8ad98c
change-id: 20260414-iwl-net-submission-2026-04-14-6203e1860df3

Best regards,
--  
Jacob Keller <jacob.e.keller@intel.com>


^ permalink raw reply

* Re: [PATCH] net: ipv4: fix alignment fault in sysctl_fib_multipath_hash_seed on ARM64 with Clang
From: Eric Dumazet @ 2026-04-15  5:43 UTC (permalink / raw)
  To: Juno Choii; +Cc: netdev, davem, kuba, pabeni, horms, linux-kernel
In-Reply-To: <20260415051343.1190626-1-juno.choi@lge.com>

On Tue, Apr 14, 2026 at 10:13 PM Juno Choii <juno.choi@lge.com> wrote:
>
> From: Juno Choi <juno.choi@lge.com>
>
> On ARM64, Clang may generate ldaxr (64-bit exclusive load) for
> READ_ONCE() on 8-byte structs. ldaxr requires 8-byte natural
> alignment, but sysctl_fib_multipath_hash_seed (two u32 members)
> only has 4-byte natural alignment.
>
> When this struct lands at a 4-byte-aligned but not 8-byte-aligned
> offset within struct netns_ipv4, the ldaxr triggers an alignment
> fault in rt6_multipath_hash(), causing a kernel panic in the IPv6
> packet receive path (rtl8168_poll -> ipv6_list_rcv ->
> rt6_multipath_hash).
>
> Add __aligned(8) to the struct definition when building for ARM64
> with Clang to ensure proper alignment for atomic 8-byte loads.
>
> Signed-off-by: Juno Choi <juno.choi@lge.com>
> ---

It seems you missed

commit 4ee7fa6cf78ff26d783d39e2949d14c4c1cd5e7f
Author: Yung Chih Su <yuuchihsu@gmail.com>
Date:   Mon Mar 2 14:02:47 2026 +0800

    net: ipv4: fix ARM64 alignment fault in multipath hash seed

    `struct sysctl_fib_multipath_hash_seed` contains two u32 fields
    (user_seed and mp_seed), making it an 8-byte structure with a 4-byte
    alignment requirement.

    In `fib_multipath_hash_from_keys()`, the code evaluates the entire
    struct atomically via `READ_ONCE()`:

        mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed).mp_seed;

    While this silently works on GCC by falling back to unaligned regular
    loads which the ARM64 kernel tolerates, it causes a fatal kernel panic
    when compiled with Clang and LTO enabled.

    Commit e35123d83ee3 ("arm64: lto: Strengthen READ_ONCE() to acquire
    when CONFIG_LTO=y") strengthens `READ_ONCE()` to use Load-Acquire
    instructions (`ldar` / `ldapr`) to prevent compiler reordering bugs
    under Clang LTO. Since the macro evaluates the full 8-byte struct,
    Clang emits a 64-bit `ldar` instruction. ARM64 architecture strictly
    requires `ldar` to be naturally aligned, thus executing it on a 4-byte
    aligned address triggers a strict Alignment Fault (FSC = 0x21).

    Fix the read side by moving the `READ_ONCE()` directly to the `u32`
    member, which emits a safe 32-bit `ldar Wn`.

    Furthermore, Eric Dumazet pointed out that `WRITE_ONCE()` on the entire
    struct in `proc_fib_multipath_hash_set_seed()` is also flawed. Analysis
    shows that Clang splits this 8-byte write into two separate 32-bit
    `str` instructions. While this avoids an alignment fault, it destroys
    atomicity and exposes a tear-write vulnerability. Fix this by
    explicitly splitting the write into two 32-bit `WRITE_ONCE()`
    operations.

    Finally, add the missing `READ_ONCE()` when reading `user_seed` in
    `proc_fib_multipath_hash_seed()` to ensure proper pairing and
    concurrency safety.

    Fixes: 4ee2a8cace3f ("net: ipv4: Add a sysctl to set multipath hash seed")
    Signed-off-by: Yung Chih Su <yuuchihsu@gmail.com>
    Reviewed-by: Eric Dumazet <edumazet@google.com>
    Link: https://patch.msgid.link/20260302060247.7066-1-yuuchihsu@gmail.com
    Signed-off-by: Jakub Kicinski <kuba@kernel.org>

So perhaps you only want this followup:

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 7bd87d0547d8af40ca8736e97eac9e3d8a069052..5de5fd9465b8c5ea81d92dc74d7c6e50e3a94c73
100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -11404,7 +11404,7 @@ static int mlxsw_sp_mp_hash_init(struct
mlxsw_sp *mlxsw_sp)
        u32 seed;
        int err;

-       seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed).user_seed;
+       seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.user_seed);
        if (!seed)
                seed = jhash(mlxsw_sp->base_mac, sizeof(mlxsw_sp->base_mac), 0);

^ permalink raw reply

* Re: [RFC net-next 1/3] net/tls_sw: support randomized zero padding
From: Wilfred Mallawa @ 2026-04-15  5:40 UTC (permalink / raw)
  To: Alistair Francis, Wilfred Mallawa, kuba@kernel.org
  Cc: corbet@lwn.net, dlemoal@kernel.org, davem@davemloft.net,
	linux-kselftest@vger.kernel.org, john.fastabend@gmail.com,
	sd@queasysnail.net, linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org, pabeni@redhat.com,
	skhan@linuxfoundation.org, edumazet@google.com, horms@kernel.org,
	netdev@vger.kernel.org
In-Reply-To: <49513ee4347536e7c8419e9e65b8c619a8c665bb.camel@wdc.com>

>>> Sorry, I realized when i hit "send" that I phrased my previous
>>> message
>>> poorly. When I say "potential" I mean someone actually presenting a
>>> PoC
>>> and a CVE is issued for it. Have we seen any of those?
> In 2014 a group at UC Berkeley used HTTPS traffic analysis to identify:
>
> "individual pages in the same web-site with 90% accuracy, exposing
> personal details including medical conditions, financial and legal
> affairs and sexual orientation."
>
> They used machine learning to help and that was over 10 years ago. So I
> suspect modern day machine learning would make this even easier to do
> today.
>
> Obviously that is HTTP traffic, which is different to the NVMe-TCP
> traffic this series is targeting, but it does still seem like a real
> concern.
>
> They talk about a range of defences in the paper, with tradeoffs
> between all of them. But the linear defence seems like the one that is
> applicable here:
>
> "linear defense pads all packet sizes up to multiples of 128"
>
> The linear defence seems to reduce the Pan attack from 60% to around
> 25% and the BoG attack from 90% to around 60%.
>
> On top of that the
>
> "Burst defense offers greater protection, operating between the TCP
> layer and application layer to pad contiguous bursts of traffic up to 
> predefined thresholds uniquely determined for each website"
>
> Which to me sounds like the random padding proposed in this series
> would provide more protection than the basic linear padding used in the
> paper.
>
> To me analysing TLS traffic does seem like a plausible threat and
> something that randomised padding would help with. Leaving it up to
> userspace to decide based on their threat model seems like a good
> approach as well.
>
> 1: https://secml.cs.berkeley.edu/pets2014/
>
> Alistair

gentle ping. Are there any further thoughts on adding this support?

Wilfred


^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH v2] dpf: fix UAF and double free in idpf_plug_vport_aux_dev() error path
From: Jacob Keller @ 2026-04-15  5:37 UTC (permalink / raw)
  To: Guangshuo Li
  Cc: Tony Nguyen, Przemek Kitszel, Andrew Lunn, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Joshua Hay,
	Tatyana Nikolova, Madhu Chittim, intel-wired-lan, netdev,
	linux-kernel, Greg Kroah-Hartman, stable
In-Reply-To: <CANUHTR8uNVWR48xs90s+MtGQ6J-1j5R0+64MKVGin0cf-FjRWA@mail.gmail.com>

On 4/14/2026 6:47 PM, Guangshuo Li wrote:
> Hi Jacob,
> 
> Thanks for reviewing.
> 
> On Wed, 15 Apr 2026 at 05:03, Jacob Keller <jacob.e.keller@intel.com> wrote:
>>
>>
>> This doesn't look right. The commit message analysis seems to match this
>> fix from Greg KH:
>>
>> https://lore.kernel.org/intel-wired-lan/2026041432-tapestry-condition-22ff@gregkh/
>>
>> But the changes do not make any sense to me. It looks like a poorly done
>> AI-generated "fix" which is not correct. Greg's version does look like
>> it properly resolves this.
>>
>>> v2:
>>>   - note that the issue was identified by my static analysis tool
>>>   - and confirmed by manual review
>>>
>>
>> What even is this change log?? I see that version was sent and everyone
>> else was sane enough to just silently reject or ignore the v1...
>>
>>>  drivers/net/ethernet/intel/idpf/idpf_idc.c | 6 +++++-
>>>  1 file changed, 5 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/net/ethernet/intel/idpf/idpf_idc.c b/drivers/net/ethernet/intel/idpf/idpf_idc.c
>>> index 6dad0593f7f2..2a18907643fc 100644
>>> --- a/drivers/net/ethernet/intel/idpf/idpf_idc.c
>>> +++ b/drivers/net/ethernet/intel/idpf/idpf_idc.c
>>> @@ -59,6 +59,7 @@ static int idpf_plug_vport_aux_dev(struct iidc_rdma_core_dev_info *cdev_info,
>>>       char name[IDPF_IDC_MAX_ADEV_NAME_LEN];
>>>       struct auxiliary_device *adev;
>>>       int ret;
>>> +     int adev_id;
>>>
>>
>> You create a local variable here...
>>
>>>       iadev = kzalloc(sizeof(*iadev), GFP_KERNEL);
>>>       if (!iadev)
>>> @@ -74,11 +75,14 @@ static int idpf_plug_vport_aux_dev(struct iidc_rdma_core_dev_info *cdev_info,
>>>               goto err_ida_alloc;
>>>       }
>>>       adev->id = ret;
>>> +     adev->id = adev_id;
>>
>> adev_id is never initialized, so you assign a random garbage
>> uninitialized value. This is obviously wrong and will lead to worse
>> errors than the failed cleanup.
>>
>> I'm rejecting this patch in favor of the clearly appropriate fix from Greg.
>>
>>>       adev->dev.release = idpf_vport_adev_release;
>>>       adev->dev.parent = &cdev_info->pdev->dev;
>>>       sprintf(name, "%04x.rdma.vdev", cdev_info->pdev->vendor);
>>>       adev->name = name;
>>>
>>> +     /* iadev is owned by the auxiliary device */
>>> +     iadev = NULL;
>>>       ret = auxiliary_device_init(adev);
>>>       if (ret)
>>>               goto err_aux_dev_init;
>>> @@ -92,7 +96,7 @@ static int idpf_plug_vport_aux_dev(struct iidc_rdma_core_dev_info *cdev_info,
>>>  err_aux_dev_add:
>>>       auxiliary_device_uninit(adev);
>>>  err_aux_dev_init:
>>> -     ida_free(&idpf_idc_ida, adev->id);
>>> +     ida_free(&idpf_idc_ida, adev_id);
>>>  err_ida_alloc:
>>>       vdev_info->adev = NULL;
>>>       kfree(iadev);
>>
> 
> You are right that the v2 patch as sent is incomplete. That was my
> mistake when preparing/sending v2: it accidentally dropped the adev_id
> = ret; assignment, which made that version incorrect.
> 
> For reference, the original v1 patch is here:
> 
> https://lkml.org/lkml/2026/3/21/421
> 
> In v1, adev_id was assigned from ret before use, so I believe that
> particular uninitialized-variable issue was introduced in the v2
> posting.
> 
> Sorry for the confusion caused by the broken v2 posting.

No problem. I had missed the other version, which explains my confusion.
Still, to my eyes, this fix looks equivalent to the one
submitted by GregKH:

https://lore.kernel.org/intel-wired-lan/2026041116-retail-bagginess-250f@gregkh/

Do you agree this is effectively a different fix for the same problem?
Or are there really two different double-free issues here that both need
patching? I haven't been able to fully convince myself either way, but
I am leaning toward this being one problem, and I think Greg's solution
feels simpler to understand.

Thanks,
Jake

> 
> Thanks,
> Guangshuo


^ permalink raw reply

* Re: [ovs-dev] [PATCH net-next v2] net: openvswitch: decouple flow_table from ovs_mutex
From: Adrián Moreno @ 2026-04-15  5:32 UTC (permalink / raw)
  To: Aaron Conole
  Cc: Adrian Moreno via dev, netdev, open list:OPENVSWITCH, Paolo Abeni,
	open list, Ilya Maximets, Eric Dumazet, Simon Horman,
	Jakub Kicinski, David S. Miller
In-Reply-To: <f7twlyeabra.fsf@redhat.com>

On Fri, Apr 10, 2026 at 02:52:41PM -0400, Aaron Conole wrote:
> Hi Adrian,
>
> Thanks for the patch.  A few questions inline.
>
> Adrian Moreno via dev <ovs-dev@openvswitch.org> writes:
>
> > Currently the entire ovs module is write-protected using the global
> > ovs_mutex. While this simple approach works fine for control-plane
> > operations (such as vport configurations), requiring the global mutex
> > for flow modifications can be problematic.
> >
> > During periods of high control-plane operations, e.g: netdevs (vports)
> > coming and going, RTNL can suffer contention. This contention is easily
> > transferred to the ovs_mutex as RTNL nests inside ovs_mutex. Flow
> > modifications, however, are done as part of packet processing and having
> > them wait for RTNL pressure to go away can lead to packet drops.
> >
> > This patch decouples flow_table modifications from ovs_mutex by means of
> > the following:
> >
> > 1 - Make flow_table an rcu-protected pointer inside the datapath.
> > This allows both objects to be protected independently while reducing the
> > amount of changes required in "flow_table.c".
> >
> > 2 - Create a new mutex inside the flow_table that protects it from
> > concurrent modifications.
> > Putting the mutex inside flow_table makes it easier to consume for
> > functions inside flow_table.c that do not currently take pointers to the
> > datapath.
> > Some function signatures need to be changed to accept flow_table so that
> > lockdep checks can be performed.
> >
> > 3 - Create a reference count to temporarily extend rcu protection from
> > the datapath to the flow_table.
> > In order to use the flow_table without locking ovs_mutex, the flow_table
> > pointer must be first dereferenced within an rcu-protected region.
> > Next, the table->mutex needs to be locked to protect it from
> > concurrent writes but mutexes must not be locked inside an rcu-protected
> > region, so the rcu-protected region must be left at which point the
> > datapath can be concurrently freed.
> > To extend the protection beyond the rcu region, a reference count is used.
> > One reference is held by the datapath, the other is temporarily
> > increased during flow modifications. For example:
> >
> > Datapath deletion:
> >
> >   ovs_lock();
> >   table = rcu_dereference_protected(dp->table, ...);
> >   rcu_assign_pointer(dp->table, NULL);
> >   ovs_flow_tbl_put(table);
> >   ovs_unlock();
>
> I guess it's possible now to have flow operations succeed on
> 'removed-but-not-yet-freed' tables.  That's probably worth documenting
> somewhere, since it is a slight behavior change.  More below
>

You are right. That corner case is kind of weird as we could be adding a
flow to a table that has been detached from the datapath and will be
freed immediately after. I can add a comment in __dp_destroy about this.


> > Flow modification:
> >
> >   rcu_read_lock();
> >   dp = get_dp(...);
> >   table = rcu_dereference(dp->table);
> >   ovs_flow_tbl_get(table);
> >   rcu_read_unlock();
> >
> >   mutex_lock(&table->lock);
> >   /* Perform modifications on the flow_table */
> >   mutex_unlock(&table->lock);
> >   ovs_flow_tbl_put(table);
> >
> > Signed-off-by: Adrian Moreno <amorenoz@redhat.com>
> > ---
> > v2: Fix argument in ovs_flow_tbl_put (sparse)
> >     Remove rcu checks in ovs_dp_masks_rebalance
> > ---
> >  net/openvswitch/datapath.c   | 285 ++++++++++++++++++++++++-----------
> >  net/openvswitch/datapath.h   |   2 +-
> >  net/openvswitch/flow.c       |  13 +-
> >  net/openvswitch/flow.h       |   9 +-
> >  net/openvswitch/flow_table.c | 180 ++++++++++++++--------
> >  net/openvswitch/flow_table.h |  51 ++++++-
> >  6 files changed, 380 insertions(+), 160 deletions(-)
> >
> > diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> > index e209099218b4..9c234993520c 100644
> > --- a/net/openvswitch/datapath.c
> > +++ b/net/openvswitch/datapath.c
> > @@ -88,13 +88,17 @@ static void ovs_notify(struct genl_family *family,
> >   * DOC: Locking:
> >   *
> >   * All writes e.g. Writes to device state (add/remove datapath, port, set
> > - * operations on vports, etc.), Writes to other state (flow table
> > - * modifications, set miscellaneous datapath parameters, etc.) are protected
> > - * by ovs_lock.
> > + * operations on vports, etc.) and writes to other datapath parameters
> > + * are protected by ovs_lock.
> > + *
> > + * Writes to the flow table are NOT protected by ovs_lock. Instead, a per-table
> > + * mutex and reference count are used (see comment above "struct flow_table"
> > + * definition). On some few occasions, the per-flow table mutex is nested
> > + * inside ovs_mutex.
> >   *
> >   * Reads are protected by RCU.
> >   *
> > - * There are a few special cases (mostly stats) that have their own
> > + * There are a few other special cases (mostly stats) that have their own
> >   * synchronization but they nest under all of above and don't interact with
> >   * each other.
> >   *
> > @@ -166,7 +170,6 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
> >  {
> >  	struct datapath *dp = container_of(rcu, struct datapath, rcu);
> >
> > -	ovs_flow_tbl_destroy(&dp->table);
> >  	free_percpu(dp->stats_percpu);
> >  	kfree(dp->ports);
> >  	ovs_meters_exit(dp);
> > @@ -247,6 +250,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
> >  	struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(ovs_pcpu_storage);
> >  	const struct vport *p = OVS_CB(skb)->input_vport;
> >  	struct datapath *dp = p->dp;
> > +	struct flow_table *table;
> >  	struct sw_flow *flow;
> >  	struct sw_flow_actions *sf_acts;
> >  	struct dp_stats_percpu *stats;
> > @@ -257,9 +261,16 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
> >  	int error;
> >
> >  	stats = this_cpu_ptr(dp->stats_percpu);
> > +	table = rcu_dereference(dp->table);
> > +	if (!table) {
> > +		net_dbg_ratelimited("ovs: no flow table on datapath %s\n",
> > +				    ovs_dp_name(dp));
> > +		kfree_skb(skb);
> > +		return;
> > +	}
> >
> >  	/* Look up flow. */
> > -	flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb),
> > +	flow = ovs_flow_tbl_lookup_stats(table, key, skb_get_hash(skb),
> >  					 &n_mask_hit, &n_cache_hit);
> >  	if (unlikely(!flow)) {
> >  		struct dp_upcall_info upcall;
> > @@ -752,12 +763,16 @@ static struct genl_family dp_packet_genl_family __ro_after_init = {
> >  static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats,
> >  			 struct ovs_dp_megaflow_stats *mega_stats)
> >  {
> > +	struct flow_table *table = ovsl_dereference(dp->table);
> >  	int i;
> >
> >  	memset(mega_stats, 0, sizeof(*mega_stats));
> >
> > -	stats->n_flows = ovs_flow_tbl_count(&dp->table);
> > -	mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table);
> > +	if (table) {
> > +		stats->n_flows = ovs_flow_tbl_count(table);
>
> Previously, when calling this we'd be under the ovs_mutex and the read
> on table->count would be somewhat coherent (for some definition of
> coherent).  BUT we are now doing a bare read.  I'm not sure if we should
> take the lock here, or at least give some kind of barrier (READ_ONCE and
> update the count setting sites with WRITE_ONCEs)?  WDYT?
>

I think you are right, this call can now happen in parallel with
statistic updates on the flow table side.

IIUC, datapath operations such as this still hold the ovs_mutex;
"ovsl_dereference()" above should splat if that's not true. And taking
"table->lock" here would force us to also hold it while updating the
stats, which undermines the purpose of this patch. So
READ_ONCE/WRITE_ONCE seems like a good solution here.

> > +		mega_stats->n_masks = ovs_flow_tbl_num_masks(table);
> > +	}
> > +
> >
> >  	stats->n_hit = stats->n_missed = stats->n_lost = 0;
> >
> > @@ -829,15 +844,16 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts,
> >  		+ nla_total_size_64bit(8); /* OVS_FLOW_ATTR_USED */
> >  }
> >
> > -/* Called with ovs_mutex or RCU read lock. */
> > +/* Called with table->lock or RCU read lock. */
> >  static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow,
> > +				   const struct flow_table *table,
> >  				   struct sk_buff *skb)
> >  {
> >  	struct ovs_flow_stats stats;
> >  	__be16 tcp_flags;
> >  	unsigned long used;
> >
> > -	ovs_flow_stats_get(flow, &stats, &used, &tcp_flags);
> > +	ovs_flow_stats_get(flow, table, &stats, &used, &tcp_flags);
> >
> >  	if (used &&
> >  	    nla_put_u64_64bit(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used),
> > @@ -857,8 +873,9 @@ static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow,
> >  	return 0;
> >  }
> >
> > -/* Called with ovs_mutex or RCU read lock. */
> > +/* Called with RCU read lock or table->lock held. */
> >  static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow,
> > +				     const struct flow_table *table,
> >  				     struct sk_buff *skb, int skb_orig_len)
> >  {
> >  	struct nlattr *start;
> > @@ -878,7 +895,7 @@ static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow,
> >  	if (start) {
> >  		const struct sw_flow_actions *sf_acts;
> >
> > -		sf_acts = rcu_dereference_ovsl(flow->sf_acts);
> > +		sf_acts = rcu_dereference_ovs_tbl(flow->sf_acts, table);
> >  		err = ovs_nla_put_actions(sf_acts->actions,
> >  					  sf_acts->actions_len, skb);
> >
> > @@ -897,8 +914,10 @@ static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow,
> >  	return 0;
> >  }
> >
> > -/* Called with ovs_mutex or RCU read lock. */
> > -static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
> > +/* Called with table->lock or RCU read lock. */
> > +static int ovs_flow_cmd_fill_info(const struct sw_flow *flow,
> > +				  const struct flow_table *table,
> > +				  int dp_ifindex,
> >  				  struct sk_buff *skb, u32 portid,
> >  				  u32 seq, u32 flags, u8 cmd, u32 ufid_flags)
> >  {
> > @@ -929,12 +948,12 @@ static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex,
> >  			goto error;
> >  	}
> >
> > -	err = ovs_flow_cmd_fill_stats(flow, skb);
> > +	err = ovs_flow_cmd_fill_stats(flow, table, skb);
> >  	if (err)
> >  		goto error;
> >
> >  	if (should_fill_actions(ufid_flags)) {
> > -		err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len);
> > +		err = ovs_flow_cmd_fill_actions(flow, table, skb, skb_orig_len);
> >  		if (err)
> >  			goto error;
> >  	}
> > @@ -968,8 +987,9 @@ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *act
> >  	return skb;
> >  }
> >
> > -/* Called with ovs_mutex. */
> > +/* Called with table->lock. */
> >  static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow,
> > +					       const struct flow_table *table,
> >  					       int dp_ifindex,
> >  					       struct genl_info *info, u8 cmd,
> >  					       bool always, u32 ufid_flags)
> > @@ -977,12 +997,12 @@ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow,
> >  	struct sk_buff *skb;
> >  	int retval;
> >
> > -	skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts),
> > +	skb = ovs_flow_cmd_alloc_info(ovs_tbl_dereference(flow->sf_acts, table),
> >  				      &flow->id, info, always, ufid_flags);
> >  	if (IS_ERR_OR_NULL(skb))
> >  		return skb;
> >
> > -	retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb,
> > +	retval = ovs_flow_cmd_fill_info(flow, table, dp_ifindex, skb,
> >  					info->snd_portid, info->snd_seq, 0,
> >  					cmd, ufid_flags);
> >  	if (WARN_ON_ONCE(retval < 0)) {
> > @@ -998,6 +1018,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
> >  	struct nlattr **a = info->attrs;
> >  	struct ovs_header *ovs_header = genl_info_userhdr(info);
> >  	struct sw_flow *flow = NULL, *new_flow;
> > +	struct flow_table *table;
> >  	struct sw_flow_mask mask;
> >  	struct sk_buff *reply;
> >  	struct datapath *dp;
> > @@ -1064,30 +1085,43 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
> >  		goto err_kfree_acts;
> >  	}
> >
>
> I think this can lead to a weird(?) behavior:
>
> thread A (dp_destroy):                   thread b (ovs_flow_cmd_new):
> rcu_assign_pointer(dp->table, NULL)
>                                          rcu_read_lock();
>                                          table =
>                                          rcu_dereference(dp->table);
>                                            [old table]
>                                          ovs_flow_tbl_get(table)
>                                              //refcnt change
>                                          rcu_read_unlock()
> ovs_flow_tbl_put(table) // refcnt chg
>                                          mutex_lock(table->lock)
>                                          ovs_flow_table_insert(...)
>                                          [success reply]
>                                          mutex_unlock(table->lock)
>                                          ovs_flow_tbl_put(table)
>                                          // table flow flush, etc.
>
> I guess it isn't a huge deal (installing flow while deleting table would
> be weird from a userspace perspective), and I think it is safe, but it
> is worth mentioning that we can have such scenario now.
>

I completely agree, this was not documented (it will be in the next
version of the patch) but it's the inevitable side effect of this design.

> > -	ovs_lock();
> > +	rcu_read_lock();
> >  	dp = get_dp(net, ovs_header->dp_ifindex);
> >  	if (unlikely(!dp)) {
> >  		error = -ENODEV;
> > -		goto err_unlock_ovs;
> > +		rcu_read_unlock();
> > +		goto err_kfree_reply;
> >  	}
> > +	table = rcu_dereference(dp->table);
> > +	if (!table || !ovs_flow_tbl_get(table)) {
> > +		error = -ENODEV;
> > +		rcu_read_unlock();
> > +		goto err_kfree_reply;
> > +	}
> > +	rcu_read_unlock();
> > +
> > +	/* It is safe to dereference "table" after leaving rcu read-protected
> > +	 * region because it's pinned by refcount.
> > +	 */
> > +	mutex_lock(&table->lock);
> >
> >  	/* Check if this is a duplicate flow */
> >  	if (ovs_identifier_is_ufid(&new_flow->id))
> > -		flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id);
> > +		flow = ovs_flow_tbl_lookup_ufid(table, &new_flow->id);
> >  	if (!flow)
> > -		flow = ovs_flow_tbl_lookup(&dp->table, key);
> > +		flow = ovs_flow_tbl_lookup(table, key);
> >  	if (likely(!flow)) {
> >  		rcu_assign_pointer(new_flow->sf_acts, acts);
> >
> >  		/* Put flow in bucket. */
> > -		error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask);
> > +		error = ovs_flow_tbl_insert(table, new_flow, &mask);
> >  		if (unlikely(error)) {
> >  			acts = NULL;
> > -			goto err_unlock_ovs;
> > +			goto err_unlock_tbl;
> >  		}
> >
> >  		if (unlikely(reply)) {
> > -			error = ovs_flow_cmd_fill_info(new_flow,
> > +			error = ovs_flow_cmd_fill_info(new_flow, table,
> >  						       ovs_header->dp_ifindex,
> >  						       reply, info->snd_portid,
> >  						       info->snd_seq, 0,
> > @@ -1095,7 +1129,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
> >  						       ufid_flags);
> >  			BUG_ON(error < 0);
> >  		}
> > -		ovs_unlock();
> > +		mutex_unlock(&table->lock);
> > +		ovs_flow_tbl_put(table);
> >  	} else {
> >  		struct sw_flow_actions *old_acts;
> >
> > @@ -1108,28 +1143,28 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
> >  		if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE
> >  							 | NLM_F_EXCL))) {
> >  			error = -EEXIST;
> > -			goto err_unlock_ovs;
> > +			goto err_unlock_tbl;
> >  		}
> >  		/* The flow identifier has to be the same for flow updates.
> >  		 * Look for any overlapping flow.
> >  		 */
> >  		if (unlikely(!ovs_flow_cmp(flow, &match))) {
> >  			if (ovs_identifier_is_key(&flow->id))
> > -				flow = ovs_flow_tbl_lookup_exact(&dp->table,
> > +				flow = ovs_flow_tbl_lookup_exact(table,
> >  								 &match);
> >  			else /* UFID matches but key is different */
> >  				flow = NULL;
> >  			if (!flow) {
> >  				error = -ENOENT;
> > -				goto err_unlock_ovs;
> > +				goto err_unlock_tbl;
> >  			}
> >  		}
> >  		/* Update actions. */
> > -		old_acts = ovsl_dereference(flow->sf_acts);
> > +		old_acts = ovs_tbl_dereference(flow->sf_acts, table);
> >  		rcu_assign_pointer(flow->sf_acts, acts);
> >
> >  		if (unlikely(reply)) {
> > -			error = ovs_flow_cmd_fill_info(flow,
> > +			error = ovs_flow_cmd_fill_info(flow, table,
> >  						       ovs_header->dp_ifindex,
> >  						       reply, info->snd_portid,
> >  						       info->snd_seq, 0,
> > @@ -1137,7 +1172,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
> >  						       ufid_flags);
> >  			BUG_ON(error < 0);
> >  		}
> > -		ovs_unlock();
> > +		mutex_unlock(&table->lock);
> > +		ovs_flow_tbl_put(table);
> >
> >  		ovs_nla_free_flow_actions_rcu(old_acts);
> >  		ovs_flow_free(new_flow, false);
> > @@ -1149,8 +1185,10 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
> >  	kfree(key);
> >  	return 0;
> >
> > -err_unlock_ovs:
> > -	ovs_unlock();
> > +err_unlock_tbl:
> > +	mutex_unlock(&table->lock);
> > +	ovs_flow_tbl_put(table);
> > +err_kfree_reply:
> >  	kfree_skb(reply);
> >  err_kfree_acts:
> >  	ovs_nla_free_flow_actions(acts);
> > @@ -1244,6 +1282,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
> >  	struct net *net = sock_net(skb->sk);
> >  	struct nlattr **a = info->attrs;
> >  	struct ovs_header *ovs_header = genl_info_userhdr(info);
> > +	struct flow_table *table;
> >  	struct sw_flow_key key;
> >  	struct sw_flow *flow;
> >  	struct sk_buff *reply = NULL;
> > @@ -1278,29 +1317,43 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
> >  		}
> >  	}
> >
> > -	ovs_lock();
> > +	rcu_read_lock();
> >  	dp = get_dp(net, ovs_header->dp_ifindex);
> >  	if (unlikely(!dp)) {
> >  		error = -ENODEV;
> > -		goto err_unlock_ovs;
> > +		rcu_read_unlock();
> > +		goto err_free_reply;
> >  	}
> > +	table = rcu_dereference(dp->table);
> > +	if (!table || !ovs_flow_tbl_get(table)) {
> > +		rcu_read_unlock();
> > +		error = -ENODEV;
> > +		goto err_free_reply;
> > +	}
> > +	rcu_read_unlock();
> > +
> > +	/* It is safe to dereference "table" after leaving rcu read-protected
> > +	 * region because it's pinned by refcount.
> > +	 */
> > +	mutex_lock(&table->lock);
> > +
> >  	/* Check that the flow exists. */
> >  	if (ufid_present)
> > -		flow = ovs_flow_tbl_lookup_ufid(&dp->table, &sfid);
> > +		flow = ovs_flow_tbl_lookup_ufid(table, &sfid);
> >  	else
> > -		flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
> > +		flow = ovs_flow_tbl_lookup_exact(table, &match);
> >  	if (unlikely(!flow)) {
> >  		error = -ENOENT;
> > -		goto err_unlock_ovs;
> > +		goto err_unlock_tbl;
> >  	}
> >
> >  	/* Update actions, if present. */
> >  	if (likely(acts)) {
> > -		old_acts = ovsl_dereference(flow->sf_acts);
> > +		old_acts = ovs_tbl_dereference(flow->sf_acts, table);
> >  		rcu_assign_pointer(flow->sf_acts, acts);
> >
> >  		if (unlikely(reply)) {
> > -			error = ovs_flow_cmd_fill_info(flow,
> > +			error = ovs_flow_cmd_fill_info(flow, table,
> >  						       ovs_header->dp_ifindex,
> >  						       reply, info->snd_portid,
> >  						       info->snd_seq, 0,
> > @@ -1310,20 +1363,22 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
> >  		}
> >  	} else {
> >  		/* Could not alloc without acts before locking. */
> > -		reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex,
> > +		reply = ovs_flow_cmd_build_info(flow, table,
> > +						ovs_header->dp_ifindex,
> >  						info, OVS_FLOW_CMD_SET, false,
> >  						ufid_flags);
> >
> >  		if (IS_ERR(reply)) {
> >  			error = PTR_ERR(reply);
> > -			goto err_unlock_ovs;
> > +			goto err_unlock_tbl;
> >  		}
> >  	}
> >
> >  	/* Clear stats. */
> >  	if (a[OVS_FLOW_ATTR_CLEAR])
> > -		ovs_flow_stats_clear(flow);
> > -	ovs_unlock();
> > +		ovs_flow_stats_clear(flow, table);
> > +	mutex_unlock(&table->lock);
> > +	ovs_flow_tbl_put(table);
> >
> >  	if (reply)
> >  		ovs_notify(&dp_flow_genl_family, reply, info);
> > @@ -1332,8 +1387,10 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
> >
> >  	return 0;
> >
> > -err_unlock_ovs:
> > -	ovs_unlock();
> > +err_unlock_tbl:
> > +	mutex_unlock(&table->lock);
> > +	ovs_flow_tbl_put(table);
> > +err_free_reply:
> >  	kfree_skb(reply);
> >  err_kfree_acts:
> >  	ovs_nla_free_flow_actions(acts);
> > @@ -1346,6 +1403,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
> >  	struct nlattr **a = info->attrs;
> >  	struct ovs_header *ovs_header = genl_info_userhdr(info);
> >  	struct net *net = sock_net(skb->sk);
> > +	struct flow_table *table;
> >  	struct sw_flow_key key;
> >  	struct sk_buff *reply;
> >  	struct sw_flow *flow;
> > @@ -1370,33 +1428,48 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
> >  	if (err)
> >  		return err;
> >
> > -	ovs_lock();
> > +	rcu_read_lock();
> >  	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
> >  	if (!dp) {
> > -		err = -ENODEV;
> > -		goto unlock;
> > +		rcu_read_unlock();
> > +		return -ENODEV;
> >  	}
> > +	table = rcu_dereference(dp->table);
> > +	if (!table || !ovs_flow_tbl_get(table)) {
> > +		rcu_read_unlock();
> > +		return -ENODEV;
> > +	}
> > +	rcu_read_unlock();
> > +
> > +	/* It is safe to dereference "table" after leaving rcu read-protected
> > +	 * region because it's pinned by refcount.
> > +	 */
> > +	mutex_lock(&table->lock);
> > +
> >
> >  	if (ufid_present)
> > -		flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid);
> > +		flow = ovs_flow_tbl_lookup_ufid(table, &ufid);
> >  	else
> > -		flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
> > +		flow = ovs_flow_tbl_lookup_exact(table, &match);
> >  	if (!flow) {
> >  		err = -ENOENT;
> >  		goto unlock;
> >  	}
> >
> > -	reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info,
> > -					OVS_FLOW_CMD_GET, true, ufid_flags);
> > +	reply = ovs_flow_cmd_build_info(flow, table, ovs_header->dp_ifindex,
> > +					info, OVS_FLOW_CMD_GET, true,
> > +					ufid_flags);
> >  	if (IS_ERR(reply)) {
> >  		err = PTR_ERR(reply);
> >  		goto unlock;
> >  	}
> >
> > -	ovs_unlock();
> > +	mutex_unlock(&table->lock);
> > +	ovs_flow_tbl_put(table);
> >  	return genlmsg_reply(reply, info);
> >  unlock:
> > -	ovs_unlock();
> > +	mutex_unlock(&table->lock);
> > +	ovs_flow_tbl_put(table);
> >  	return err;
> >  }
> >
> > @@ -1405,6 +1478,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
> >  	struct nlattr **a = info->attrs;
> >  	struct ovs_header *ovs_header = genl_info_userhdr(info);
> >  	struct net *net = sock_net(skb->sk);
> > +	struct flow_table *table;
> >  	struct sw_flow_key key;
> >  	struct sk_buff *reply;
> >  	struct sw_flow *flow = NULL;
> > @@ -1425,36 +1499,49 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
> >  			return err;
> >  	}
> >
> > -	ovs_lock();
> > +	rcu_read_lock();
> >  	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
> >  	if (unlikely(!dp)) {
> > -		err = -ENODEV;
> > -		goto unlock;
> > +		rcu_read_unlock();
> > +		return -ENODEV;
> >  	}
> > +	table = rcu_dereference(dp->table);
> > +	if (!table || !ovs_flow_tbl_get(table)) {
> > +		rcu_read_unlock();
> > +		return -ENODEV;
> > +	}
> > +	rcu_read_unlock();
> > +
> > +	/* It is safe to dereference "table" after leaving rcu read-protected
> > +	 * region because it's pinned by refcount.
> > +	 */
> > +	mutex_lock(&table->lock);
> > +
> >
> >  	if (unlikely(!a[OVS_FLOW_ATTR_KEY] && !ufid_present)) {
> > -		err = ovs_flow_tbl_flush(&dp->table);
> > +		err = ovs_flow_tbl_flush(table);
> >  		goto unlock;
> >  	}
> >
> >  	if (ufid_present)
> > -		flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid);
> > +		flow = ovs_flow_tbl_lookup_ufid(table, &ufid);
> >  	else
> > -		flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
> > +		flow = ovs_flow_tbl_lookup_exact(table, &match);
> >  	if (unlikely(!flow)) {
> >  		err = -ENOENT;
> >  		goto unlock;
> >  	}
> >
> > -	ovs_flow_tbl_remove(&dp->table, flow);
> > -	ovs_unlock();
> > +	ovs_flow_tbl_remove(table, flow);
> > +	mutex_unlock(&table->lock);
> >
> >  	reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts,
> >  					&flow->id, info, false, ufid_flags);
> >  	if (likely(reply)) {
> >  		if (!IS_ERR(reply)) {
> >  			rcu_read_lock();	/*To keep RCU checker happy. */
> > -			err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex,
> > +			err = ovs_flow_cmd_fill_info(flow, table,
> > +						     ovs_header->dp_ifindex,
> >  						     reply, info->snd_portid,
> >  						     info->snd_seq, 0,
> >  						     OVS_FLOW_CMD_DEL,
> > @@ -1473,10 +1560,12 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
> >  	}
> >
> >  out_free:
> > +	ovs_flow_tbl_put(table);
> >  	ovs_flow_free(flow, true);
> >  	return 0;
> >  unlock:
> > -	ovs_unlock();
> > +	mutex_unlock(&table->lock);
> > +	ovs_flow_tbl_put(table);
> >  	return err;
> >  }
> >
> > @@ -1485,6 +1574,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
> >  	struct nlattr *a[__OVS_FLOW_ATTR_MAX];
> >  	struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
> >  	struct table_instance *ti;
> > +	struct flow_table *table;
> >  	struct datapath *dp;
> >  	u32 ufid_flags;
> >  	int err;
> > @@ -1501,8 +1591,13 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
> >  		rcu_read_unlock();
> >  		return -ENODEV;
> >  	}
> > +	table = rcu_dereference(dp->table);
> > +	if (!table) {
> > +		rcu_read_unlock();
> > +		return -ENODEV;
> > +	}
> >
> > -	ti = rcu_dereference(dp->table.ti);
> > +	ti = rcu_dereference(table->ti);
> >  	for (;;) {
> >  		struct sw_flow *flow;
> >  		u32 bucket, obj;
> > @@ -1513,8 +1608,8 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
> >  		if (!flow)
> >  			break;
> >
> > -		if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb,
> > -					   NETLINK_CB(cb->skb).portid,
> > +		if (ovs_flow_cmd_fill_info(flow, table, ovs_header->dp_ifindex,
> > +					   skb, NETLINK_CB(cb->skb).portid,
> >  					   cb->nlh->nlmsg_seq, NLM_F_MULTI,
> >  					   OVS_FLOW_CMD_GET, ufid_flags) < 0)
> >  			break;
> > @@ -1598,8 +1693,13 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
> >  	struct ovs_dp_stats dp_stats;
> >  	struct ovs_dp_megaflow_stats dp_megaflow_stats;
> >  	struct dp_nlsk_pids *pids = ovsl_dereference(dp->upcall_portids);
> > +	struct flow_table *table;
> >  	int err, pids_len;
> >
> > +	table = ovsl_dereference(dp->table);
> > +	if (!table)
> > +		return -ENODEV;
> > +
> >  	ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family,
> >  				 flags, cmd);
> >  	if (!ovs_header)
> > @@ -1625,7 +1725,7 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
> >  		goto nla_put_failure;
> >
> >  	if (nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE,
> > -			ovs_flow_tbl_masks_cache_size(&dp->table)))
> > +			ovs_flow_tbl_masks_cache_size(table)))
> >  		goto nla_put_failure;
> >
> >  	if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && pids) {
> > @@ -1736,6 +1836,7 @@ u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id)
> >  static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
> >  {
> >  	u32 user_features = 0, old_features = dp->user_features;
> > +	struct flow_table *table;
> >  	int err;
> >
> >  	if (a[OVS_DP_ATTR_USER_FEATURES]) {
> > @@ -1757,8 +1858,12 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
> >  		int err;
> >  		u32 cache_size;
> >
> > +		table = ovsl_dereference(dp->table);
> > +		if (!table)
> > +			return -ENODEV;
> > +
> >  		cache_size = nla_get_u32(a[OVS_DP_ATTR_MASKS_CACHE_SIZE]);
> > -		err = ovs_flow_tbl_masks_cache_resize(&dp->table, cache_size);
> > +		err = ovs_flow_tbl_masks_cache_resize(table, cache_size);
> >  		if (err)
> >  			return err;
> >  	}
> > @@ -1810,6 +1915,7 @@ static int ovs_dp_vport_init(struct datapath *dp)
> >  static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
> >  {
> >  	struct nlattr **a = info->attrs;
> > +	struct flow_table *table;
> >  	struct vport_parms parms;
> >  	struct sk_buff *reply;
> >  	struct datapath *dp;
> > @@ -1833,9 +1939,12 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
> >  	ovs_dp_set_net(dp, sock_net(skb->sk));
> >
> >  	/* Allocate table. */
> > -	err = ovs_flow_tbl_init(&dp->table);
> > -	if (err)
> > +	table = ovs_flow_tbl_alloc();
> > +	if (IS_ERR(table)) {
> > +		err = PTR_ERR(table);
> >  		goto err_destroy_dp;
> > +	}
> > +	rcu_assign_pointer(dp->table, table);
> >
> >  	err = ovs_dp_stats_init(dp);
> >  	if (err)
> > @@ -1905,7 +2014,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
> >  err_destroy_stats:
> >  	free_percpu(dp->stats_percpu);
> >  err_destroy_table:
> > -	ovs_flow_tbl_destroy(&dp->table);
> > +	ovs_flow_tbl_put(table);
> >  err_destroy_dp:
> >  	kfree(dp);
> >  err_destroy_reply:
> > @@ -1917,7 +2026,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
> >  /* Called with ovs_mutex. */
> >  static void __dp_destroy(struct datapath *dp)
> >  {
> > -	struct flow_table *table = &dp->table;
> > +	struct flow_table *table = rcu_dereference_protected(dp->table,
> > +					lockdep_ovsl_is_held());
> >  	int i;
> >
> >  	if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING)
> > @@ -1939,14 +2049,10 @@ static void __dp_destroy(struct datapath *dp)
> >  	 */
> >  	ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL));
> >
> > -	/* Flush sw_flow in the tables. RCU cb only releases resource
> > -	 * such as dp, ports and tables. That may avoid some issues
> > -	 * such as RCU usage warning.
> > -	 */
> > -	table_instance_flow_flush(table, ovsl_dereference(table->ti),
> > -				  ovsl_dereference(table->ufid_ti));
> > +	rcu_assign_pointer(dp->table, NULL);
> > +	ovs_flow_tbl_put(table);
> >
> > -	/* RCU destroy the ports, meters and flow tables. */
> > +	/* RCU destroy the ports and meters. */
> >  	call_rcu(&dp->rcu, destroy_dp_rcu);
> >  }
> >
> > @@ -2554,13 +2660,18 @@ static void ovs_dp_masks_rebalance(struct work_struct *work)
> >  {
> >  	struct ovs_net *ovs_net = container_of(work, struct ovs_net,
> >  					       masks_rebalance.work);
> > +	struct flow_table *table;
> >  	struct datapath *dp;
> >
> >  	ovs_lock();
> > -
> > -	list_for_each_entry(dp, &ovs_net->dps, list_node)
> > -		ovs_flow_masks_rebalance(&dp->table);
> > -
> > +	list_for_each_entry(dp, &ovs_net->dps, list_node) {
> > +		table = ovsl_dereference(dp->table);
> > +		if (!table)
> > +			continue;
>
> Should we take a reference for table here?  I guess it's kindof safe
> because of the ovs_lock() above, but if that gets removed it's possible
> someone misses that there isn't a refcnt pin here (but everywhere else
> has a ovs_flow_tbl_get before it).
>

Good point. As you say, it is safe but still we should probably do it.
I'll change this.

Just for reference:
I actually contemplated the possibility of removing the lock here, or at
least reducing its scope. We still need it to serialize access to
"&ovs_net->dps" but we could then take a reference on the table and
release the lock. The code would then look bad because we'd be releasing
the lock in the middle of the loop. After some thought, all this
complexity didn't feel necessary for something that happens every 4s and
that is not affected by RTNL contention.

Thanks.
Adrián

> > +		mutex_lock(&table->lock);
> > +		ovs_flow_masks_rebalance(table);
> > +		mutex_unlock(&table->lock);
> > +	}
> >  	ovs_unlock();
> >
> >  	schedule_delayed_work(&ovs_net->masks_rebalance,
> > diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
> > index db0c3e69d66c..44773bf9f645 100644
> > --- a/net/openvswitch/datapath.h
> > +++ b/net/openvswitch/datapath.h
> > @@ -90,7 +90,7 @@ struct datapath {
> >  	struct list_head list_node;
> >
> >  	/* Flow table. */
> > -	struct flow_table table;
> > +	struct flow_table __rcu *table;
> >
> >  	/* Switch ports. */
> >  	struct hlist_head *ports;
> > diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
> > index 66366982f604..0a748cf20f53 100644
> > --- a/net/openvswitch/flow.c
> > +++ b/net/openvswitch/flow.c
> > @@ -124,8 +124,9 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
> >  	spin_unlock(&stats->lock);
> >  }
> >
> > -/* Must be called with rcu_read_lock or ovs_mutex. */
> > +/* Must be called with rcu_read_lock or table->lock held. */
> >  void ovs_flow_stats_get(const struct sw_flow *flow,
> > +			const struct flow_table *table,
> >  			struct ovs_flow_stats *ovs_stats,
> >  			unsigned long *used, __be16 *tcp_flags)
> >  {
> > @@ -136,7 +137,8 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
> >  	memset(ovs_stats, 0, sizeof(*ovs_stats));
> >
> >  	for_each_cpu(cpu, flow->cpu_used_mask) {
> > -		struct sw_flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]);
> > +		struct sw_flow_stats *stats =
> > +			rcu_dereference_ovs_tbl(flow->stats[cpu], table);
> >
> >  		if (stats) {
> >  			/* Local CPU may write on non-local stats, so we must
> > @@ -153,13 +155,14 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
> >  	}
> >  }
> >
> > -/* Called with ovs_mutex. */
> > -void ovs_flow_stats_clear(struct sw_flow *flow)
> > +/* Called with table->lock held. */
> > +void ovs_flow_stats_clear(struct sw_flow *flow, struct flow_table *table)
> >  {
> >  	unsigned int cpu;
> >
> >  	for_each_cpu(cpu, flow->cpu_used_mask) {
> > -		struct sw_flow_stats *stats = ovsl_dereference(flow->stats[cpu]);
> > +		struct sw_flow_stats *stats =
> > +			ovs_tbl_dereference(flow->stats[cpu], table);
> >
> >  		if (stats) {
> >  			spin_lock_bh(&stats->lock);
> > diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
> > index b5711aff6e76..e05ed6796e4e 100644
> > --- a/net/openvswitch/flow.h
> > +++ b/net/openvswitch/flow.h
> > @@ -23,6 +23,7 @@
> >  #include <net/dst_metadata.h>
> >  #include <net/nsh.h>
> >
> > +struct flow_table;
> >  struct sk_buff;
> >
> >  enum sw_flow_mac_proto {
> > @@ -280,9 +281,11 @@ static inline bool ovs_identifier_is_key(const struct sw_flow_id *sfid)
> >
> >  void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags,
> >  			   const struct sk_buff *);
> > -void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *,
> > -			unsigned long *used, __be16 *tcp_flags);
> > -void ovs_flow_stats_clear(struct sw_flow *);
> > +void ovs_flow_stats_get(const struct sw_flow *flow,
> > +			const struct flow_table *table,
> > +			struct ovs_flow_stats *stats, unsigned long *used,
> > +			__be16 *tcp_flags);
> > +void ovs_flow_stats_clear(struct sw_flow *flow, struct flow_table *table);
> >  u64 ovs_flow_used_time(unsigned long flow_jiffies);
> >
> >  int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key);
> > diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
> > index 61c6a5f77c2e..d9dbe4b4807c 100644
> > --- a/net/openvswitch/flow_table.c
> > +++ b/net/openvswitch/flow_table.c
> > @@ -45,6 +45,16 @@
> >  static struct kmem_cache *flow_cache;
> >  struct kmem_cache *flow_stats_cache __read_mostly;
> >
> > +#ifdef CONFIG_LOCKDEP
> > +int lockdep_ovs_tbl_is_held(const struct flow_table *table)
> > +{
> > +	if (debug_locks)
> > +		return lockdep_is_held(&table->lock);
> > +	else
> > +		return 1;
> > +}
> > +#endif
> > +
> >  static u16 range_n_bytes(const struct sw_flow_key_range *range)
> >  {
> >  	return range->end - range->start;
> > @@ -249,12 +259,12 @@ static int tbl_mask_array_realloc(struct flow_table *tbl, int size)
> >  	if (!new)
> >  		return -ENOMEM;
> >
> > -	old = ovsl_dereference(tbl->mask_array);
> > +	old = ovs_tbl_dereference(tbl->mask_array, tbl);
> >  	if (old) {
> >  		int i;
> >
> >  		for (i = 0; i < old->max; i++) {
> > -			if (ovsl_dereference(old->masks[i]))
> > +			if (ovs_tbl_dereference(old->masks[i], tbl))
> >  				new->masks[new->count++] = old->masks[i];
> >  		}
> >  		call_rcu(&old->rcu, mask_array_rcu_cb);
> > @@ -268,7 +278,7 @@ static int tbl_mask_array_realloc(struct flow_table *tbl, int size)
> >  static int tbl_mask_array_add_mask(struct flow_table *tbl,
> >  				   struct sw_flow_mask *new)
> >  {
> > -	struct mask_array *ma = ovsl_dereference(tbl->mask_array);
> > +	struct mask_array *ma = ovs_tbl_dereference(tbl->mask_array, tbl);
> >  	int err, ma_count = READ_ONCE(ma->count);
> >
> >  	if (ma_count >= ma->max) {
> > @@ -277,7 +287,7 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl,
> >  		if (err)
> >  			return err;
> >
> > -		ma = ovsl_dereference(tbl->mask_array);
> > +		ma = ovs_tbl_dereference(tbl->mask_array, tbl);
> >  	} else {
> >  		/* On every add or delete we need to reset the counters so
> >  		 * every new mask gets a fair chance of being prioritized.
> > @@ -285,7 +295,7 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl,
> >  		tbl_mask_array_reset_counters(ma);
> >  	}
> >
> > -	BUG_ON(ovsl_dereference(ma->masks[ma_count]));
> > +	WARN_ON_ONCE(ovs_tbl_dereference(ma->masks[ma_count], tbl));
> >
> >  	rcu_assign_pointer(ma->masks[ma_count], new);
> >  	WRITE_ONCE(ma->count, ma_count + 1);
> > @@ -296,12 +306,12 @@ static int tbl_mask_array_add_mask(struct flow_table *tbl,
> >  static void tbl_mask_array_del_mask(struct flow_table *tbl,
> >  				    struct sw_flow_mask *mask)
> >  {
> > -	struct mask_array *ma = ovsl_dereference(tbl->mask_array);
> > +	struct mask_array *ma = ovs_tbl_dereference(tbl->mask_array, tbl);
> >  	int i, ma_count = READ_ONCE(ma->count);
> >
> >  	/* Remove the deleted mask pointers from the array */
> >  	for (i = 0; i < ma_count; i++) {
> > -		if (mask == ovsl_dereference(ma->masks[i]))
> > +		if (mask == ovs_tbl_dereference(ma->masks[i], tbl))
> >  			goto found;
> >  	}
> >
> > @@ -329,10 +339,10 @@ static void tbl_mask_array_del_mask(struct flow_table *tbl,
> >  static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
> >  {
> >  	if (mask) {
> > -		/* ovs-lock is required to protect mask-refcount and
> > +		/* table lock is required to protect mask-refcount and
> >  		 * mask list.
> >  		 */
> > -		ASSERT_OVSL();
> > +		ASSERT_OVS_TBL(tbl);
> >  		BUG_ON(!mask->ref_count);
> >  		mask->ref_count--;
> >
> > @@ -386,7 +396,8 @@ static struct mask_cache *tbl_mask_cache_alloc(u32 size)
> >  }
> >  int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size)
> >  {
> > -	struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache);
> > +	struct mask_cache *mc = rcu_dereference_ovs_tbl(table->mask_cache,
> > +							table);
> >  	struct mask_cache *new;
> >
> >  	if (size == mc->cache_size)
> > @@ -406,15 +417,23 @@ int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size)
> >  	return 0;
> >  }
> >
> > -int ovs_flow_tbl_init(struct flow_table *table)
> > +struct flow_table *ovs_flow_tbl_alloc(void)
> >  {
> >  	struct table_instance *ti, *ufid_ti;
> > +	struct flow_table *table;
> >  	struct mask_cache *mc;
> >  	struct mask_array *ma;
> >
> > +	table = kzalloc_obj(*table, GFP_KERNEL);
> > +	if (!table)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	mutex_init(&table->lock);
> > +	refcount_set(&table->refcnt, 1);
> > +
> >  	mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES);
> >  	if (!mc)
> > -		return -ENOMEM;
> > +		goto free_table;
> >
> >  	ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN);
> >  	if (!ma)
> > @@ -435,7 +454,7 @@ int ovs_flow_tbl_init(struct flow_table *table)
> >  	table->last_rehash = jiffies;
> >  	table->count = 0;
> >  	table->ufid_count = 0;
> > -	return 0;
> > +	return table;
> >
> >  free_ti:
> >  	__table_instance_destroy(ti);
> > @@ -443,7 +462,10 @@ int ovs_flow_tbl_init(struct flow_table *table)
> >  	__mask_array_destroy(ma);
> >  free_mask_cache:
> >  	__mask_cache_destroy(mc);
> > -	return -ENOMEM;
> > +free_table:
> > +	mutex_destroy(&table->lock);
> > +	kfree(table);
> > +	return ERR_PTR(-ENOMEM);
> >  }
> >
> >  static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
> > @@ -470,7 +492,7 @@ static void table_instance_flow_free(struct flow_table *table,
> >  	flow_mask_remove(table, flow->mask);
> >  }
> >
> > -/* Must be called with OVS mutex held. */
> > +/* Must be called with table mutex held. */
> >  void table_instance_flow_flush(struct flow_table *table,
> >  			       struct table_instance *ti,
> >  			       struct table_instance *ufid_ti)
> > @@ -505,11 +527,11 @@ static void table_instance_destroy(struct table_instance *ti,
> >  	call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb);
> >  }
> >
> > -/* No need for locking this function is called from RCU callback or
> > - * error path.
> > - */
> > -void ovs_flow_tbl_destroy(struct flow_table *table)
> > +/* No need for locking this function is called from RCU callback. */
> > +static void ovs_flow_tbl_destroy_rcu(struct rcu_head *rcu)
> >  {
> > +	struct flow_table *table = container_of(rcu, struct flow_table, rcu);
> > +
> >  	struct table_instance *ti = rcu_dereference_raw(table->ti);
> >  	struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti);
> >  	struct mask_cache *mc = rcu_dereference_raw(table->mask_cache);
> > @@ -518,6 +540,20 @@ void ovs_flow_tbl_destroy(struct flow_table *table)
> >  	call_rcu(&mc->rcu, mask_cache_rcu_cb);
> >  	call_rcu(&ma->rcu, mask_array_rcu_cb);
> >  	table_instance_destroy(ti, ufid_ti);
> > +	mutex_destroy(&table->lock);
> > +	kfree(table);
> > +}
> > +
> > +void ovs_flow_tbl_put(struct flow_table *table)
> > +{
> > +	if (refcount_dec_and_test(&table->refcnt)) {
> > +		mutex_lock(&table->lock);
> > +		table_instance_flow_flush(table,
> > +					  ovs_tbl_dereference(table->ti, table),
> > +					  ovs_tbl_dereference(table->ufid_ti, table));
> > +		mutex_unlock(&table->lock);
> > +		call_rcu(&table->rcu, ovs_flow_tbl_destroy_rcu);
> > +	}
> >  }
> >
> >  struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
> > @@ -571,7 +607,8 @@ static void ufid_table_instance_insert(struct table_instance *ti,
> >  	hlist_add_head_rcu(&flow->ufid_table.node[ti->node_ver], head);
> >  }
> >
> > -static void flow_table_copy_flows(struct table_instance *old,
> > +static void flow_table_copy_flows(struct flow_table *table,
> > +				  struct table_instance *old,
> >  				  struct table_instance *new, bool ufid)
> >  {
> >  	int old_ver;
> > @@ -588,17 +625,18 @@ static void flow_table_copy_flows(struct table_instance *old,
> >  		if (ufid)
> >  			hlist_for_each_entry_rcu(flow, head,
> >  						 ufid_table.node[old_ver],
> > -						 lockdep_ovsl_is_held())
> > +						 lockdep_ovs_tbl_is_held(table))
> >  				ufid_table_instance_insert(new, flow);
> >  		else
> >  			hlist_for_each_entry_rcu(flow, head,
> >  						 flow_table.node[old_ver],
> > -						 lockdep_ovsl_is_held())
> > +						 lockdep_ovs_tbl_is_held(table))
> >  				table_instance_insert(new, flow);
> >  	}
> >  }
> >
> > -static struct table_instance *table_instance_rehash(struct table_instance *ti,
> > +static struct table_instance *table_instance_rehash(struct flow_table *table,
> > +						    struct table_instance *ti,
> >  						    int n_buckets, bool ufid)
> >  {
> >  	struct table_instance *new_ti;
> > @@ -607,16 +645,19 @@ static struct table_instance *table_instance_rehash(struct table_instance *ti,
> >  	if (!new_ti)
> >  		return NULL;
> >
> > -	flow_table_copy_flows(ti, new_ti, ufid);
> > +	flow_table_copy_flows(table, ti, new_ti, ufid);
> >
> >  	return new_ti;
> >  }
> >
> > +/* Must be called with flow_table->lock held. */
> >  int ovs_flow_tbl_flush(struct flow_table *flow_table)
> >  {
> >  	struct table_instance *old_ti, *new_ti;
> >  	struct table_instance *old_ufid_ti, *new_ufid_ti;
> >
> > +	ASSERT_OVS_TBL(flow_table);
> > +
> >  	new_ti = table_instance_alloc(TBL_MIN_BUCKETS);
> >  	if (!new_ti)
> >  		return -ENOMEM;
> > @@ -624,8 +665,8 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table)
> >  	if (!new_ufid_ti)
> >  		goto err_free_ti;
> >
> > -	old_ti = ovsl_dereference(flow_table->ti);
> > -	old_ufid_ti = ovsl_dereference(flow_table->ufid_ti);
> > +	old_ti = ovs_tbl_dereference(flow_table->ti, flow_table);
> > +	old_ufid_ti = ovs_tbl_dereference(flow_table->ufid_ti, flow_table);
> >
> >  	rcu_assign_pointer(flow_table->ti, new_ti);
> >  	rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti);
> > @@ -693,7 +734,8 @@ static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
> >  	return cmp_key(flow->id.unmasked_key, key, key_start, key_end);
> >  }
> >
> > -static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
> > +static struct sw_flow *masked_flow_lookup(struct flow_table *tbl,
> > +					  struct table_instance *ti,
> >  					  const struct sw_flow_key *unmasked,
> >  					  const struct sw_flow_mask *mask,
> >  					  u32 *n_mask_hit)
> > @@ -709,7 +751,7 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
> >  	(*n_mask_hit)++;
> >
> >  	hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver],
> > -				 lockdep_ovsl_is_held()) {
> > +				 lockdep_ovs_tbl_is_held(tbl)) {
> >  		if (flow->mask == mask && flow->flow_table.hash == hash &&
> >  		    flow_cmp_masked_key(flow, &masked_key, &mask->range))
> >  			return flow;
> > @@ -736,9 +778,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
> >  	int i;
> >
> >  	if (likely(*index < ma->max)) {
> > -		mask = rcu_dereference_ovsl(ma->masks[*index]);
> > +		mask = rcu_dereference_ovs_tbl(ma->masks[*index], tbl);
> >  		if (mask) {
> > -			flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
> > +			flow = masked_flow_lookup(tbl, ti, key, mask, n_mask_hit);
> >  			if (flow) {
> >  				u64_stats_update_begin(&stats->syncp);
> >  				stats->usage_cntrs[*index]++;
> > @@ -754,11 +796,11 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
> >  		if (i == *index)
> >  			continue;
> >
> > -		mask = rcu_dereference_ovsl(ma->masks[i]);
> > +		mask = rcu_dereference_ovs_tbl(ma->masks[i], tbl);
> >  		if (unlikely(!mask))
> >  			break;
> >
> > -		flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
> > +		flow = masked_flow_lookup(tbl, ti, key, mask, n_mask_hit);
> >  		if (flow) { /* Found */
> >  			*index = i;
> >  			u64_stats_update_begin(&stats->syncp);
> > @@ -845,8 +887,8 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
> >  struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
> >  				    const struct sw_flow_key *key)
> >  {
> > -	struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
> > -	struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
> > +	struct table_instance *ti = rcu_dereference_ovs_tbl(tbl->ti, tbl);
> > +	struct mask_array *ma = rcu_dereference_ovs_tbl(tbl->mask_array, tbl);
> >  	u32 __always_unused n_mask_hit;
> >  	u32 __always_unused n_cache_hit;
> >  	struct sw_flow *flow;
> > @@ -865,21 +907,22 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
> >  struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
> >  					  const struct sw_flow_match *match)
> >  {
> > -	struct mask_array *ma = ovsl_dereference(tbl->mask_array);
> > +	struct mask_array *ma = ovs_tbl_dereference(tbl->mask_array, tbl);
> >  	int i;
> >
> > -	/* Always called under ovs-mutex. */
> > +	/* Always called under tbl->lock. */
> >  	for (i = 0; i < ma->max; i++) {
> > -		struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
> > +		struct table_instance *ti =
> > +				rcu_dereference_ovs_tbl(tbl->ti, tbl);
> >  		u32 __always_unused n_mask_hit;
> >  		struct sw_flow_mask *mask;
> >  		struct sw_flow *flow;
> >
> > -		mask = ovsl_dereference(ma->masks[i]);
> > +		mask = ovs_tbl_dereference(ma->masks[i], tbl);
> >  		if (!mask)
> >  			continue;
> >
> > -		flow = masked_flow_lookup(ti, match->key, mask, &n_mask_hit);
> > +		flow = masked_flow_lookup(tbl, ti, match->key, mask, &n_mask_hit);
> >  		if (flow && ovs_identifier_is_key(&flow->id) &&
> >  		    ovs_flow_cmp_unmasked_key(flow, match)) {
> >  			return flow;
> > @@ -915,7 +958,7 @@ bool ovs_flow_cmp(const struct sw_flow *flow,
> >  struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl,
> >  					 const struct sw_flow_id *ufid)
> >  {
> > -	struct table_instance *ti = rcu_dereference_ovsl(tbl->ufid_ti);
> > +	struct table_instance *ti = rcu_dereference_ovs_tbl(tbl->ufid_ti, tbl);
> >  	struct sw_flow *flow;
> >  	struct hlist_head *head;
> >  	u32 hash;
> > @@ -923,7 +966,7 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl,
> >  	hash = ufid_hash(ufid);
> >  	head = find_bucket(ti, hash);
> >  	hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver],
> > -				 lockdep_ovsl_is_held()) {
> > +				 lockdep_ovs_tbl_is_held(tbl)) {
> >  		if (flow->ufid_table.hash == hash &&
> >  		    ovs_flow_cmp_ufid(flow, ufid))
> >  			return flow;
> > @@ -933,28 +976,33 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl,
> >
> >  int ovs_flow_tbl_num_masks(const struct flow_table *table)
> >  {
> > -	struct mask_array *ma = rcu_dereference_ovsl(table->mask_array);
> > +	struct mask_array *ma = rcu_dereference_ovs_tbl(table->mask_array,
> > +							table);
> >  	return READ_ONCE(ma->count);
> >  }
> >
> >  u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table)
> >  {
> > -	struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache);
> > +	struct mask_cache *mc = rcu_dereference_ovs_tbl(table->mask_cache,
> > +							table);
> >
> >  	return READ_ONCE(mc->cache_size);
> >  }
> >
> > -static struct table_instance *table_instance_expand(struct table_instance *ti,
> > +static struct table_instance *table_instance_expand(struct flow_table *table,
> > +						    struct table_instance *ti,
> >  						    bool ufid)
> >  {
> > -	return table_instance_rehash(ti, ti->n_buckets * 2, ufid);
> > +	return table_instance_rehash(table, ti, ti->n_buckets * 2, ufid);
> >  }
> >
> > -/* Must be called with OVS mutex held. */
> > +/* Must be called with table mutex held. */
> >  void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
> >  {
> > -	struct table_instance *ti = ovsl_dereference(table->ti);
> > -	struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti);
> > +	struct table_instance *ti = ovs_tbl_dereference(table->ti,
> > +							table);
> > +	struct table_instance *ufid_ti = ovs_tbl_dereference(table->ufid_ti,
> > +							     table);
> >
> >  	BUG_ON(table->count == 0);
> >  	table_instance_flow_free(table, ti, ufid_ti, flow);
> > @@ -988,10 +1036,10 @@ static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl,
> >  	struct mask_array *ma;
> >  	int i;
> >
> > -	ma = ovsl_dereference(tbl->mask_array);
> > +	ma = ovs_tbl_dereference(tbl->mask_array, tbl);
> >  	for (i = 0; i < ma->max; i++) {
> >  		struct sw_flow_mask *t;
> > -		t = ovsl_dereference(ma->masks[i]);
> > +		t = ovs_tbl_dereference(ma->masks[i], tbl);
> >
> >  		if (t && mask_equal(mask, t))
> >  			return t;
> > @@ -1029,22 +1077,25 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow,
> >  	return 0;
> >  }
> >
> > -/* Must be called with OVS mutex held. */
> > +/* Must be called with table mutex held. */
> >  static void flow_key_insert(struct flow_table *table, struct sw_flow *flow)
> >  {
> >  	struct table_instance *new_ti = NULL;
> >  	struct table_instance *ti;
> >
> > +	ASSERT_OVS_TBL(table);
> > +
> >  	flow->flow_table.hash = flow_hash(&flow->key, &flow->mask->range);
> > -	ti = ovsl_dereference(table->ti);
> > +	ti = ovs_tbl_dereference(table->ti, table);
> >  	table_instance_insert(ti, flow);
> >  	table->count++;
> >
> >  	/* Expand table, if necessary, to make room. */
> >  	if (table->count > ti->n_buckets)
> > -		new_ti = table_instance_expand(ti, false);
> > +		new_ti = table_instance_expand(table, ti, false);
> >  	else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL))
> > -		new_ti = table_instance_rehash(ti, ti->n_buckets, false);
> > +		new_ti = table_instance_rehash(table, ti, ti->n_buckets,
> > +					       false);
> >
> >  	if (new_ti) {
> >  		rcu_assign_pointer(table->ti, new_ti);
> > @@ -1053,13 +1104,15 @@ static void flow_key_insert(struct flow_table *table, struct sw_flow *flow)
> >  	}
> >  }
> >
> > -/* Must be called with OVS mutex held. */
> > +/* Must be called with table mutex held. */
> >  static void flow_ufid_insert(struct flow_table *table, struct sw_flow *flow)
> >  {
> >  	struct table_instance *ti;
> >
> > +	ASSERT_OVS_TBL(table);
> > +
> >  	flow->ufid_table.hash = ufid_hash(&flow->id);
> > -	ti = ovsl_dereference(table->ufid_ti);
> > +	ti = ovs_tbl_dereference(table->ufid_ti, table);
> >  	ufid_table_instance_insert(ti, flow);
> >  	table->ufid_count++;
> >
> > @@ -1067,7 +1120,7 @@ static void flow_ufid_insert(struct flow_table *table, struct sw_flow *flow)
> >  	if (table->ufid_count > ti->n_buckets) {
> >  		struct table_instance *new_ti;
> >
> > -		new_ti = table_instance_expand(ti, true);
> > +		new_ti = table_instance_expand(table, ti, true);
> >  		if (new_ti) {
> >  			rcu_assign_pointer(table->ufid_ti, new_ti);
> >  			call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
> > @@ -1075,12 +1128,14 @@ static void flow_ufid_insert(struct flow_table *table, struct sw_flow *flow)
> >  	}
> >  }
> >
> > -/* Must be called with OVS mutex held. */
> > +/* Must be called with table mutex held. */
> >  int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
> >  			const struct sw_flow_mask *mask)
> >  {
> >  	int err;
> >
> > +	ASSERT_OVS_TBL(table);
> > +
> >  	err = flow_mask_insert(table, flow, mask);
> >  	if (err)
> >  		return err;
> > @@ -1099,10 +1154,11 @@ static int compare_mask_and_count(const void *a, const void *b)
> >  	return (s64)mc_b->counter - (s64)mc_a->counter;
> >  }
> >
> > -/* Must be called with OVS mutex held. */
> > +/* Must be called with table->lock held. */
> >  void ovs_flow_masks_rebalance(struct flow_table *table)
> >  {
> > -	struct mask_array *ma = rcu_dereference_ovsl(table->mask_array);
> > +	struct mask_array *ma = rcu_dereference_ovs_tbl(table->mask_array,
> > +							table);
> >  	struct mask_count *masks_and_count;
> >  	struct mask_array *new;
> >  	int masks_entries = 0;
> > @@ -1117,7 +1173,7 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
> >  		struct sw_flow_mask *mask;
> >  		int cpu;
> >
> > -		mask = rcu_dereference_ovsl(ma->masks[i]);
> > +		mask = rcu_dereference_ovs_tbl(ma->masks[i], table);
> >  		if (unlikely(!mask))
> >  			break;
> >
> > @@ -1171,7 +1227,7 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
> >  	for (i = 0; i < masks_entries; i++) {
> >  		int index = masks_and_count[i].index;
> >
> > -		if (ovsl_dereference(ma->masks[index]))
> > +		if (ovs_tbl_dereference(ma->masks[index], table))
> >  			new->masks[new->count++] = ma->masks[index];
> >  	}
> >
> > diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
> > index f524dc3e4862..cffd412c9045 100644
> > --- a/net/openvswitch/flow_table.h
> > +++ b/net/openvswitch/flow_table.h
> > @@ -59,7 +59,29 @@ struct table_instance {
> >  	u32 hash_seed;
> >  };
> >
> > +/* Locking:
> > + *
> > + * flow_table is _not_ protected by ovs_lock (see comment above ovs_mutex
> > + * in datapath.c).
> > + *
> > + * All writes to flow_table are protected by the embedded "lock".
> > + * In order to ensure datapath destruction does not trigger the destruction
> > + * of the flow_table, "refcnt" is used. Therefore, writers must:
> > + * 1 - Enter rcu read-protected section
> > + * 2 - Increase "table->refcnt"
> > + * 3 - Leave rcu read-protected section (to avoid using mutexes inside rcu)
> > + * 4 - Lock "table->lock"
> > + * 5 - Perform modifications
> > + * 6 - Release "table->lock"
> > + * 7 - Decrease "table->refcnt"
> > + *
> > + * Reads are protected by RCU.
> > + */
> >  struct flow_table {
> > +	/* Locks flow table writes. */
> > +	struct mutex lock;
> > +	refcount_t refcnt;
> > +	struct rcu_head rcu;
> >  	struct table_instance __rcu *ti;
> >  	struct table_instance __rcu *ufid_ti;
> >  	struct mask_cache __rcu *mask_cache;
> > @@ -71,15 +93,40 @@ struct flow_table {
> >
> >  extern struct kmem_cache *flow_stats_cache;
> >
> > +#ifdef CONFIG_LOCKDEP
> > +int lockdep_ovs_tbl_is_held(const struct flow_table *table);
> > +#else
> > +static inline int lockdep_ovs_tbl_is_held(const struct flow_table *table)
> > +{
> > +	(void)table;
> > +	return 1;
> > +}
> > +#endif
> > +
> > +#define ASSERT_OVS_TBL(tbl)   WARN_ON(!lockdep_ovs_tbl_is_held(tbl))
> > +
> > +/* Lock-protected update-allowed dereferences.*/
> > +#define ovs_tbl_dereference(p, tbl)	\
> > +	rcu_dereference_protected(p, lockdep_ovs_tbl_is_held(tbl))
> > +
> > +/* Read dereferences can be protected by either RCU, table lock or ovs_mutex. */
> > +#define rcu_dereference_ovs_tbl(p, tbl) \
> > +	rcu_dereference_check(p,		\
> > +		lockdep_ovs_tbl_is_held(tbl) || lockdep_ovsl_is_held())
> > +
> >  int ovs_flow_init(void);
> >  void ovs_flow_exit(void);
> >
> >  struct sw_flow *ovs_flow_alloc(void);
> >  void ovs_flow_free(struct sw_flow *, bool deferred);
> >
> > -int ovs_flow_tbl_init(struct flow_table *);
> > +struct flow_table *ovs_flow_tbl_alloc(void);
> > +void ovs_flow_tbl_put(struct flow_table *table);
> > +static inline bool ovs_flow_tbl_get(struct flow_table *table)
> > +{
> > +	return refcount_inc_not_zero(&table->refcnt);
> > +}
> >  int ovs_flow_tbl_count(const struct flow_table *table);
> > -void ovs_flow_tbl_destroy(struct flow_table *table);
> >  int ovs_flow_tbl_flush(struct flow_table *flow_table);
> >
> >  int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
>


^ permalink raw reply

* [PATCH] net: ipv4: fix alignment fault in sysctl_fib_multipath_hash_seed on ARM64 with Clang
From: Juno Choii @ 2026-04-15  5:13 UTC (permalink / raw)
  To: netdev; +Cc: davem, edumazet, kuba, pabeni, horms, linux-kernel, Juno Choi

From: Juno Choi <juno.choi@lge.com>

On ARM64, Clang may generate ldaxr (64-bit exclusive load) for
READ_ONCE() on 8-byte structs. ldaxr requires 8-byte natural
alignment, but sysctl_fib_multipath_hash_seed (two u32 members)
only has 4-byte natural alignment.

When this struct lands at a 4-byte-aligned but not 8-byte-aligned
offset within struct netns_ipv4, the ldaxr triggers an alignment
fault in rt6_multipath_hash(), causing a kernel panic in the IPv6
packet receive path (rtl8168_poll -> ipv6_list_rcv ->
rt6_multipath_hash).

Add __aligned(8) to the struct definition when building for ARM64
with Clang to ensure proper alignment for atomic 8-byte loads.

Signed-off-by: Juno Choi <juno.choi@lge.com>
---
 include/net/netns/ipv4.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 276f622f3516..4366ab26512d 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -41,11 +41,18 @@ struct inet_timewait_death_row {
 struct tcp_fastopen_context;
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
+#if defined(CONFIG_ARM64) && defined(CONFIG_CC_IS_CLANG)
+struct sysctl_fib_multipath_hash_seed {
+	u32 user_seed;
+	u32 mp_seed;
+} __aligned(8);
+#else
 struct sysctl_fib_multipath_hash_seed {
 	u32 user_seed;
 	u32 mp_seed;
 };
 #endif
+#endif
 
 struct netns_ipv4 {
 	/* Cacheline organization can be found documented in
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH bpf v4 5/5] bpf, sockmap: Take state lock for af_unix iter
From: Kuniyuki Iwashima @ 2026-04-15  5:02 UTC (permalink / raw)
  To: Michal Luczaj
  Cc: John Fastabend, Jakub Sitnicki, Eric Dumazet, Paolo Abeni,
	Willem de Bruijn, David S. Miller, Jakub Kicinski, Simon Horman,
	Yonghong Song, Andrii Nakryiko, Alexei Starovoitov,
	Daniel Borkmann, Martin KaFai Lau, Eduard Zingerman, Song Liu,
	Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
	Shuah Khan, Cong Wang, netdev, bpf, linux-kernel, linux-kselftest
In-Reply-To: <20260414-unix-proto-update-null-ptr-deref-v4-5-2af6fe97918e@rbox.co>

On Tue, Apr 14, 2026 at 7:13 AM Michal Luczaj <mhal@rbox.co> wrote:
>
> When a BPF iterator program updates a sockmap, there is a race condition in
> unix_stream_bpf_update_proto() where the `peer` pointer can become stale[1]
> during a state transition TCP_ESTABLISHED -> TCP_CLOSE.
>
>         CPU0 bpf                          CPU1 close
>         --------                          ----------
> // unix_stream_bpf_update_proto()
> sk_pair = unix_peer(sk)
> if (unlikely(!sk_pair))
>    return -EINVAL;
>                                      // unix_release_sock()
>                                      skpair = unix_peer(sk);
>                                      unix_peer(sk) = NULL;
>                                      sock_put(skpair)
> sock_hold(sk_pair) // UaF
>
> More practically, this fix guarantees that the iterator program is
> consistently provided with a unix socket that remains stable during
> iterator execution.
>
> [1]:
> BUG: KASAN: slab-use-after-free in unix_stream_bpf_update_proto+0x155/0x490
> Write of size 4 at addr ffff8881178c9a00 by task test_progs/2231
> Call Trace:
>  dump_stack_lvl+0x5d/0x80
>  print_report+0x170/0x4f3
>  kasan_report+0xe4/0x1c0
>  kasan_check_range+0x125/0x200
>  unix_stream_bpf_update_proto+0x155/0x490
>  sock_map_link+0x71c/0xec0
>  sock_map_update_common+0xbc/0x600
>  sock_map_update_elem+0x19a/0x1f0
>  bpf_prog_bbbf56096cdd4f01_selective_dump_unix+0x20c/0x217
>  bpf_iter_run_prog+0x21e/0xae0
>  bpf_iter_unix_seq_show+0x1e0/0x2a0
>  bpf_seq_read+0x42c/0x10d0
>  vfs_read+0x171/0xb20
>  ksys_read+0xff/0x200
>  do_syscall_64+0xf7/0x5e0
>  entry_SYSCALL_64_after_hwframe+0x76/0x7e
>
> Allocated by task 2236:
>  kasan_save_stack+0x30/0x50
>  kasan_save_track+0x14/0x30
>  __kasan_slab_alloc+0x63/0x80
>  kmem_cache_alloc_noprof+0x1d5/0x680
>  sk_prot_alloc+0x59/0x210
>  sk_alloc+0x34/0x470
>  unix_create1+0x86/0x8a0
>  unix_stream_connect+0x318/0x15b0
>  __sys_connect+0xfd/0x130
>  __x64_sys_connect+0x72/0xd0
>  do_syscall_64+0xf7/0x5e0
>  entry_SYSCALL_64_after_hwframe+0x76/0x7e
>
> Freed by task 2236:
>  kasan_save_stack+0x30/0x50
>  kasan_save_track+0x14/0x30
>  kasan_save_free_info+0x3b/0x70
>  __kasan_slab_free+0x47/0x70
>  kmem_cache_free+0x11c/0x590
>  __sk_destruct+0x432/0x6e0
>  unix_release_sock+0x9b3/0xf60
>  unix_release+0x8a/0xf0
>  __sock_release+0xb0/0x270
>  sock_close+0x18/0x20
>  __fput+0x36e/0xac0
>  fput_close_sync+0xe5/0x1a0
>  __x64_sys_close+0x7d/0xd0
>  do_syscall_64+0xf7/0x5e0
>  entry_SYSCALL_64_after_hwframe+0x76/0x7e
>
> Suggested-by: Kuniyuki Iwashima <kuniyu@google.com>
> Fixes: 2c860a43dd77 ("bpf: af_unix: Implement BPF iterator for UNIX domain socket.")
> Signed-off-by: Michal Luczaj <mhal@rbox.co>

Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>

Thanks for the fixes, Michal!

^ permalink raw reply


This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.