* [PATCH v12 net-next 3/5] psp: add a new netdev event for dev unregister
From: Wei Wang @ 2026-04-18 17:00 UTC (permalink / raw)
To: netdev, Jakub Kicinski, Daniel Zahka, Willem de Bruijn, David Wei,
Andrew Lunn, David S . Miller, Eric Dumazet, Simon Horman,
Paolo Abeni
Cc: Wei Wang
In-Reply-To: <20260418170056.3490525-1-weibunny.kernel@gmail.com>
From: Wei Wang <weibunny@fb.com>
Add a netdev notifier that handles NETDEV_UNREGISTER by removing the
unregistering dev from psd->assoc_dev_list. The notifier is registered
upon the first dev-assoc operation.
Signed-off-by: Wei Wang <weibunny@fb.com>
Reviewed-by: Daniel Zahka <daniel.zahka@gmail.com>
---
Documentation/netlink/specs/psp.yaml | 2 +-
net/psp/psp-nl-gen.c | 2 +-
net/psp/psp-nl-gen.h | 3 ++
net/psp/psp.h | 1 +
net/psp/psp_main.c | 76 ++++++++++++++++++++++++++++
net/psp/psp_nl.c | 29 +++++++++++
6 files changed, 111 insertions(+), 2 deletions(-)
diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml
index 3d1b7223e084..538ed9184965 100644
--- a/Documentation/netlink/specs/psp.yaml
+++ b/Documentation/netlink/specs/psp.yaml
@@ -328,7 +328,7 @@ operations:
- nsid
reply:
attributes: []
- pre: psp-device-get-locked
+ pre: psp-device-get-locked-dev-assoc
post: psp-device-unlock
-
name: dev-disassoc
diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c
index 114299c64423..389a8480cc3d 100644
--- a/net/psp/psp-nl-gen.c
+++ b/net/psp/psp-nl-gen.c
@@ -135,7 +135,7 @@ static const struct genl_split_ops psp_nl_ops[] = {
},
{
.cmd = PSP_CMD_DEV_ASSOC,
- .pre_doit = psp_device_get_locked,
+ .pre_doit = psp_device_get_locked_dev_assoc,
.doit = psp_nl_dev_assoc_doit,
.post_doit = psp_device_unlock,
.policy = psp_dev_assoc_nl_policy,
diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h
index 4dd0f0f23053..24d51bff997f 100644
--- a/net/psp/psp-nl-gen.h
+++ b/net/psp/psp-nl-gen.h
@@ -21,6 +21,9 @@ int psp_device_get_locked_admin(const struct genl_split_ops *ops,
struct sk_buff *skb, struct genl_info *info);
int psp_assoc_device_get_locked(const struct genl_split_ops *ops,
struct sk_buff *skb, struct genl_info *info);
+int psp_device_get_locked_dev_assoc(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
+ struct genl_info *info);
void
psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb,
struct genl_info *info);
diff --git a/net/psp/psp.h b/net/psp/psp.h
index 0f9c4e4e52cb..c82b21bae240 100644
--- a/net/psp/psp.h
+++ b/net/psp/psp.h
@@ -15,6 +15,7 @@ extern struct mutex psp_devs_lock;
void psp_dev_free(struct psp_dev *psd);
int psp_dev_check_access(struct psp_dev *psd, struct net *net, bool admin);
+int psp_attach_netdev_notifier(void);
void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd);
diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c
index 9049f1d2ff02..5a134b72f320 100644
--- a/net/psp/psp_main.c
+++ b/net/psp/psp_main.c
@@ -376,6 +376,82 @@ int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv)
}
EXPORT_SYMBOL(psp_dev_rcv);
+static void psp_dev_disassoc_one(struct psp_dev *psd, struct net_device *dev)
+{
+ struct psp_assoc_dev *entry, *tmp;
+
+ list_for_each_entry_safe(entry, tmp, &psd->assoc_dev_list, dev_list) {
+ if (entry->assoc_dev == dev) {
+ list_del(&entry->dev_list);
+ psd->assoc_dev_cnt--;
+ rcu_assign_pointer(entry->assoc_dev->psp_dev, NULL);
+ netdev_put(entry->assoc_dev, &entry->dev_tracker);
+ kfree(entry);
+ return;
+ }
+ }
+}
+
+static int psp_netdev_event(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct psp_dev *psd;
+
+ if (event != NETDEV_UNREGISTER)
+ return NOTIFY_DONE;
+
+ rcu_read_lock();
+ psd = rcu_dereference(dev->psp_dev);
+ if (psd && psp_dev_tryget(psd)) {
+ rcu_read_unlock();
+ mutex_lock(&psd->lock);
+ psp_dev_disassoc_one(psd, dev);
+ mutex_unlock(&psd->lock);
+ psp_dev_put(psd);
+ } else {
+ rcu_read_unlock();
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block psp_netdev_notifier = {
+ .notifier_call = psp_netdev_event,
+};
+
+static DEFINE_MUTEX(psp_notifier_lock);
+static bool psp_notifier_registered;
+
+/**
+ * psp_attach_netdev_notifier() - register netdev notifier on first use
+ *
+ * Register the netdevice notifier when the first device association
+ * is created. In many installations no associations will be created and
+ * the notifier won't be needed.
+ *
+ * Must be called without psd->lock held, due to lock ordering:
+ * rtnl_lock -> psd->lock (the notifier callback runs under rtnl_lock
+ * and takes psd->lock).
+ */
+int psp_attach_netdev_notifier(void)
+{
+ int err = 0;
+
+ if (READ_ONCE(psp_notifier_registered))
+ return 0;
+
+ mutex_lock(&psp_notifier_lock);
+ if (!psp_notifier_registered) {
+ err = register_netdevice_notifier(&psp_netdev_notifier);
+ if (!err)
+ WRITE_ONCE(psp_notifier_registered, true);
+ }
+ mutex_unlock(&psp_notifier_lock);
+
+ return err;
+}
+
static int __init psp_init(void)
{
mutex_init(&psp_devs_lock);
diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c
index 75ca32821d28..d622f91a979e 100644
--- a/net/psp/psp_nl.c
+++ b/net/psp/psp_nl.c
@@ -167,6 +167,22 @@ int psp_device_get_locked(const struct genl_split_ops *ops,
return __psp_device_get_locked(ops, skb, info, false);
}
+/*
+ * Non-admin psp_device_get_locked() variant that first calls
+ * psp_attach_netdev_notifier(); only used for dev-assoc.
+ */
+int psp_device_get_locked_dev_assoc(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ int err;
+
+ err = psp_attach_netdev_notifier();
+ if (err)
+ return err;
+
+ return __psp_device_get_locked(ops, skb, info, false);
+}
+
static struct net *psp_nl_resolve_assoc_dev_ns(struct psp_dev *psd,
struct genl_info *info)
{
@@ -532,6 +548,19 @@ int psp_nl_dev_assoc_doit(struct sk_buff *skb, struct genl_info *info)
}
psp_assoc_dev->assoc_dev = assoc_dev;
+
+ /* Check for race with NETDEV_UNREGISTER. The cmpxchg above is a
+ * full barrier, and the unregister path has synchronize_net()
+ * between setting NETREG_UNREGISTERING and reading psp_dev in the
+ * notifier. So at least one side would do the clean-up if we are in
+ * the middle of unregistering assoc_dev.
+ * And the clean-up is serialized by psd->lock.
+ */
+ if (READ_ONCE(assoc_dev->reg_state) != NETREG_REGISTERED) {
+ err = -ENODEV;
+ goto rsp_err;
+ }
+
rsp = psp_nl_reply_new(info);
if (!rsp) {
err = -ENOMEM;
--
2.52.0
^ permalink raw reply related
* [PATCH v12 net-next 2/5] psp: add new netlink cmd for dev-assoc and dev-disassoc
From: Wei Wang @ 2026-04-18 17:00 UTC (permalink / raw)
To: netdev, Jakub Kicinski, Daniel Zahka, Willem de Bruijn, David Wei,
Andrew Lunn, David S . Miller, Eric Dumazet, Simon Horman,
Paolo Abeni
Cc: Wei Wang
In-Reply-To: <20260418170056.3490525-1-weibunny.kernel@gmail.com>
From: Wei Wang <weibunny@fb.com>
The main purpose of this cmd is to be able to associate a
non-psp-capable device (e.g. veth or netkit) with a psp device.
One use case is if we create a pair of veth/netkit, and assign 1 end
inside a netns, while leaving the other end within the default netns,
with a real PSP device, e.g. netdevsim or a physical PSP-capable NIC.
With this command, we can associate the veth/netkit inside the netns
with the PSP device, so that the virtual device can act as a PSP-capable
device to initiate PSP connections, with PSP encryption/decryption
performed on the real PSP device.
Signed-off-by: Wei Wang <weibunny@fb.com>
Reviewed-by: Daniel Zahka <daniel.zahka@gmail.com>
---
Documentation/netlink/specs/psp.yaml | 67 +++++-
include/net/psp/types.h | 23 ++
include/uapi/linux/psp.h | 13 +
net/psp/psp-nl-gen.c | 32 +++
net/psp/psp-nl-gen.h | 2 +
net/psp/psp_main.c | 21 ++
net/psp/psp_nl.c | 340 ++++++++++++++++++++++++++-
7 files changed, 486 insertions(+), 12 deletions(-)
diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml
index c54e1202cbe0..3d1b7223e084 100644
--- a/Documentation/netlink/specs/psp.yaml
+++ b/Documentation/netlink/specs/psp.yaml
@@ -13,6 +13,17 @@ definitions:
hdr0-aes-gmac-128, hdr0-aes-gmac-256]
attribute-sets:
+ -
+ name: assoc-dev-info
+ attributes:
+ -
+ name: ifindex
+ doc: ifindex of an associated network device.
+ type: u32
+ -
+ name: nsid
+ doc: Network namespace ID of the associated device.
+ type: s32
-
name: dev
attributes:
@@ -24,7 +35,9 @@ attribute-sets:
min: 1
-
name: ifindex
- doc: ifindex of the main netdevice linked to the PSP device.
+ doc: |
+ ifindex of the main netdevice linked to the PSP device,
+ or the ifindex to associate with the PSP device.
type: u32
-
name: psp-versions-cap
@@ -38,6 +51,28 @@ attribute-sets:
type: u32
enum: version
enum-as-flags: true
+ -
+ name: assoc-list
+ doc: List of associated virtual devices.
+ type: nest
+ nested-attributes: assoc-dev-info
+ multi-attr: true
+ -
+ name: nsid
+ doc: |
+ Network namespace ID for the device to associate/disassociate.
+ Optional for dev-assoc and dev-disassoc; if not present, the
+ device is looked up in the caller's network namespace.
+ type: s32
+ -
+ name: by-association
+ doc: |
+ Flag indicating the PSP device is an associated device from a
+ different network namespace.
+ Present when in associated namespace, absent when in primary/host
+ namespace.
+ type: flag
+
-
name: assoc
attributes:
@@ -170,6 +205,8 @@ operations:
- ifindex
- psp-versions-cap
- psp-versions-ena
+ - assoc-list
+ - by-association
pre: psp-device-get-locked
post: psp-device-unlock
dump:
@@ -279,6 +316,34 @@ operations:
post: psp-device-unlock
dump:
reply: *stats-all
+ -
+ name: dev-assoc
+ doc: Associate a network device with a PSP device.
+ attribute-set: dev
+ do:
+ request:
+ attributes:
+ - id
+ - ifindex
+ - nsid
+ reply:
+ attributes: []
+ pre: psp-device-get-locked
+ post: psp-device-unlock
+ -
+ name: dev-disassoc
+ doc: Disassociate a network device from a PSP device.
+ attribute-set: dev
+ do:
+ request:
+ attributes:
+ - id
+ - ifindex
+ - nsid
+ reply:
+ attributes: []
+ pre: psp-device-get-locked
+ post: psp-device-unlock
mcast-groups:
list:
diff --git a/include/net/psp/types.h b/include/net/psp/types.h
index 25a9096d4e7d..87991a1ea02d 100644
--- a/include/net/psp/types.h
+++ b/include/net/psp/types.h
@@ -5,6 +5,7 @@
#include <linux/mutex.h>
#include <linux/refcount.h>
+#include <net/net_trackers.h>
struct netlink_ext_ack;
@@ -43,9 +44,29 @@ struct psp_dev_config {
u32 versions;
};
+/* Max number of devices that can be associated with a single PSP device.
+ * Each entry consumes ~24 bytes in the netlink dev-get response, and the
+ * response must fit in GENLMSG_DEFAULT_SIZE (~3.7KB).
+ */
+#define PSP_ASSOC_DEV_MAX 128
+
+/**
+ * struct psp_assoc_dev - wrapper for associated net_device
+ * @dev_list: list node for psp_dev::assoc_dev_list
+ * @assoc_dev: the associated net_device
+ * @dev_tracker: tracker for the net_device reference
+ */
+struct psp_assoc_dev {
+ struct list_head dev_list;
+ struct net_device *assoc_dev;
+ netdevice_tracker dev_tracker;
+};
+
/**
* struct psp_dev - PSP device struct
* @main_netdev: original netdevice of this PSP device
+ * @assoc_dev_list: list of psp_assoc_dev entries associated with this PSP device
+ * @assoc_dev_cnt: number of entries in @assoc_dev_list
* @ops: driver callbacks
* @caps: device capabilities
* @drv_priv: driver priv pointer
@@ -67,6 +88,8 @@ struct psp_dev_config {
*/
struct psp_dev {
struct net_device *main_netdev;
+ struct list_head assoc_dev_list;
+ int assoc_dev_cnt;
struct psp_dev_ops *ops;
struct psp_dev_caps *caps;
diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h
index a3a336488dc3..1c8899cd4da5 100644
--- a/include/uapi/linux/psp.h
+++ b/include/uapi/linux/psp.h
@@ -17,11 +17,22 @@ enum psp_version {
PSP_VERSION_HDR0_AES_GMAC_256,
};
+enum {
+ PSP_A_ASSOC_DEV_INFO_IFINDEX = 1,
+ PSP_A_ASSOC_DEV_INFO_NSID,
+
+ __PSP_A_ASSOC_DEV_INFO_MAX,
+ PSP_A_ASSOC_DEV_INFO_MAX = (__PSP_A_ASSOC_DEV_INFO_MAX - 1)
+};
+
enum {
PSP_A_DEV_ID = 1,
PSP_A_DEV_IFINDEX,
PSP_A_DEV_PSP_VERSIONS_CAP,
PSP_A_DEV_PSP_VERSIONS_ENA,
+ PSP_A_DEV_ASSOC_LIST,
+ PSP_A_DEV_NSID,
+ PSP_A_DEV_BY_ASSOCIATION,
__PSP_A_DEV_MAX,
PSP_A_DEV_MAX = (__PSP_A_DEV_MAX - 1)
@@ -74,6 +85,8 @@ enum {
PSP_CMD_RX_ASSOC,
PSP_CMD_TX_ASSOC,
PSP_CMD_GET_STATS,
+ PSP_CMD_DEV_ASSOC,
+ PSP_CMD_DEV_DISASSOC,
__PSP_CMD_MAX,
PSP_CMD_MAX = (__PSP_CMD_MAX - 1)
diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c
index 1f5e73e7ccc1..114299c64423 100644
--- a/net/psp/psp-nl-gen.c
+++ b/net/psp/psp-nl-gen.c
@@ -53,6 +53,20 @@ static const struct nla_policy psp_get_stats_nl_policy[PSP_A_STATS_DEV_ID + 1] =
[PSP_A_STATS_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1),
};
+/* PSP_CMD_DEV_ASSOC - do */
+static const struct nla_policy psp_dev_assoc_nl_policy[PSP_A_DEV_NSID + 1] = {
+ [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1),
+ [PSP_A_DEV_IFINDEX] = { .type = NLA_U32, },
+ [PSP_A_DEV_NSID] = { .type = NLA_S32, },
+};
+
+/* PSP_CMD_DEV_DISASSOC - do */
+static const struct nla_policy psp_dev_disassoc_nl_policy[PSP_A_DEV_NSID + 1] = {
+ [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1),
+ [PSP_A_DEV_IFINDEX] = { .type = NLA_U32, },
+ [PSP_A_DEV_NSID] = { .type = NLA_S32, },
+};
+
/* Ops table for psp */
static const struct genl_split_ops psp_nl_ops[] = {
{
@@ -119,6 +133,24 @@ static const struct genl_split_ops psp_nl_ops[] = {
.dumpit = psp_nl_get_stats_dumpit,
.flags = GENL_CMD_CAP_DUMP,
},
+ {
+ .cmd = PSP_CMD_DEV_ASSOC,
+ .pre_doit = psp_device_get_locked,
+ .doit = psp_nl_dev_assoc_doit,
+ .post_doit = psp_device_unlock,
+ .policy = psp_dev_assoc_nl_policy,
+ .maxattr = PSP_A_DEV_NSID,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = PSP_CMD_DEV_DISASSOC,
+ .pre_doit = psp_device_get_locked,
+ .doit = psp_nl_dev_disassoc_doit,
+ .post_doit = psp_device_unlock,
+ .policy = psp_dev_disassoc_nl_policy,
+ .maxattr = PSP_A_DEV_NSID,
+ .flags = GENL_CMD_CAP_DO,
+ },
};
static const struct genl_multicast_group psp_nl_mcgrps[] = {
diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h
index 977355455395..4dd0f0f23053 100644
--- a/net/psp/psp-nl-gen.h
+++ b/net/psp/psp-nl-gen.h
@@ -33,6 +33,8 @@ int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info);
int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info);
int psp_nl_get_stats_doit(struct sk_buff *skb, struct genl_info *info);
int psp_nl_get_stats_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int psp_nl_dev_assoc_doit(struct sk_buff *skb, struct genl_info *info);
+int psp_nl_dev_disassoc_doit(struct sk_buff *skb, struct genl_info *info);
enum {
PSP_NLGRP_MGMT,
diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c
index 82de78a1d6bd..9049f1d2ff02 100644
--- a/net/psp/psp_main.c
+++ b/net/psp/psp_main.c
@@ -37,8 +37,18 @@ struct mutex psp_devs_lock;
*/
int psp_dev_check_access(struct psp_dev *psd, struct net *net, bool admin)
{
+ struct psp_assoc_dev *entry;
+
if (dev_net(psd->main_netdev) == net)
return 0;
+
+ if (!admin) {
+ list_for_each_entry(entry, &psd->assoc_dev_list, dev_list) {
+ if (dev_net(entry->assoc_dev) == net)
+ return 0;
+ }
+ }
+
return -ENOENT;
}
@@ -74,6 +84,7 @@ psp_dev_create(struct net_device *netdev,
return ERR_PTR(-ENOMEM);
psd->main_netdev = netdev;
+ INIT_LIST_HEAD(&psd->assoc_dev_list);
psd->ops = psd_ops;
psd->caps = psd_caps;
psd->drv_priv = priv_ptr;
@@ -121,6 +132,7 @@ void psp_dev_free(struct psp_dev *psd)
*/
void psp_dev_unregister(struct psp_dev *psd)
{
+ struct psp_assoc_dev *entry, *entry_tmp;
struct psp_assoc *pas, *next;
mutex_lock(&psp_devs_lock);
@@ -140,6 +152,15 @@ void psp_dev_unregister(struct psp_dev *psd)
list_for_each_entry_safe(pas, next, &psd->stale_assocs, assocs_list)
psp_dev_tx_key_del(psd, pas);
+ list_for_each_entry_safe(entry, entry_tmp, &psd->assoc_dev_list,
+ dev_list) {
+ list_del(&entry->dev_list);
+ rcu_assign_pointer(entry->assoc_dev->psp_dev, NULL);
+ netdev_put(entry->assoc_dev, &entry->dev_tracker);
+ kfree(entry);
+ }
+ psd->assoc_dev_cnt = 0;
+
rcu_assign_pointer(psd->main_netdev->psp_dev, NULL);
psd->ops = NULL;
diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c
index eb47a9ee4438..75ca32821d28 100644
--- a/net/psp/psp_nl.c
+++ b/net/psp/psp_nl.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/ethtool.h>
+#include <linux/net_namespace.h>
#include <linux/skbuff.h>
#include <linux/xarray.h>
#include <net/genetlink.h>
@@ -38,6 +39,73 @@ static int psp_nl_reply_send(struct sk_buff *rsp, struct genl_info *info)
return genlmsg_reply(rsp, info);
}
+/**
+ * psp_nl_multicast_per_ns() - multicast a notification to each unique netns
+ * @psd: PSP device (must be locked)
+ * @group: multicast group
+ * @build_ntf: callback to build an skb for a given netns, or NULL on failure
+ * @ctx: opaque context passed to @build_ntf
+ *
+ * Iterates all unique network namespaces from the associated device list
+ * plus the main device's netns. For each unique netns, calls @build_ntf
+ * to construct a notification skb and multicasts it.
+ */
+static void psp_nl_multicast_per_ns(struct psp_dev *psd, unsigned int group,
+ struct sk_buff *(*build_ntf)(struct psp_dev *,
+ struct net *,
+ void *),
+ void *ctx)
+{
+ struct psp_assoc_dev *entry;
+ struct xarray sent_nets;
+ struct net *main_net;
+ struct sk_buff *ntf;
+
+ main_net = dev_net(psd->main_netdev);
+ xa_init(&sent_nets);
+
+ list_for_each_entry(entry, &psd->assoc_dev_list, dev_list) {
+ struct net *assoc_net = dev_net(entry->assoc_dev);
+ int ret;
+
+ if (net_eq(assoc_net, main_net))
+ continue;
+
+ ret = xa_insert(&sent_nets, (unsigned long)assoc_net, assoc_net,
+ GFP_KERNEL);
+ if (ret == -EBUSY)
+ continue;
+
+ ntf = build_ntf(psd, assoc_net, ctx);
+ if (!ntf)
+ continue;
+
+ genlmsg_multicast_netns(&psp_nl_family, assoc_net, ntf, 0,
+ group, GFP_KERNEL);
+ }
+ xa_destroy(&sent_nets);
+
+ /* Send to main device netns */
+ ntf = build_ntf(psd, main_net, ctx);
+ if (!ntf)
+ return;
+ genlmsg_multicast_netns(&psp_nl_family, main_net, ntf, 0, group,
+ GFP_KERNEL);
+}
+
+static struct sk_buff *psp_nl_clone_ntf(struct psp_dev *psd, struct net *net,
+ void *ctx)
+{
+ return skb_clone(ctx, GFP_KERNEL);
+}
+
+static void psp_nl_multicast_all_ns(struct psp_dev *psd, struct sk_buff *ntf,
+ unsigned int group)
+{
+ psp_nl_multicast_per_ns(psd, group, psp_nl_clone_ntf, ntf);
+ nlmsg_free(ntf);
+}
+
/* Device stuff */
static struct psp_dev *
@@ -79,18 +147,58 @@ static int __psp_device_get_locked(const struct genl_split_ops *ops,
return PTR_ERR_OR_ZERO(info->user_ptr[0]);
}
+/*
+ * Admin version of psp_device_get_locked() where it returns psd only if
+ * current netns is the same as psd->main_netdev's netns.
+ */
int psp_device_get_locked_admin(const struct genl_split_ops *ops,
struct sk_buff *skb, struct genl_info *info)
{
return __psp_device_get_locked(ops, skb, info, true);
}
+/*
+ * Non-admin version of psp_device_get_locked(): returns psd if current netns
+ * is the netns of psd->main_netdev or of any netdev in psd->assoc_dev_list.
+ */
int psp_device_get_locked(const struct genl_split_ops *ops,
struct sk_buff *skb, struct genl_info *info)
{
return __psp_device_get_locked(ops, skb, info, false);
}
+static struct net *psp_nl_resolve_assoc_dev_ns(struct psp_dev *psd,
+ struct genl_info *info)
+{
+ struct net *net;
+ int nsid;
+
+ if (GENL_REQ_ATTR_CHECK(info, PSP_A_DEV_IFINDEX))
+ return ERR_PTR(-EINVAL);
+
+ if (info->attrs[PSP_A_DEV_NSID]) {
+ /* Only callers in the main netns may specify nsid */
+ if (dev_net(psd->main_netdev) != genl_info_net(info)) {
+ NL_SET_BAD_ATTR(info->extack,
+ info->attrs[PSP_A_DEV_NSID]);
+ return ERR_PTR(-EPERM);
+ }
+
+ nsid = nla_get_s32(info->attrs[PSP_A_DEV_NSID]);
+
+ net = get_net_ns_by_id(genl_info_net(info), nsid);
+ if (!net) {
+ NL_SET_BAD_ATTR(info->extack,
+ info->attrs[PSP_A_DEV_NSID]);
+ return ERR_PTR(-EINVAL);
+ }
+ } else {
+ net = get_net(genl_info_net(info));
+ }
+
+ return net;
+}
+
void
psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb,
struct genl_info *info)
@@ -103,11 +211,74 @@ psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb,
sockfd_put(socket);
}
+static bool psp_has_assoc_dev_in_ns(struct psp_dev *psd, struct net *net)
+{
+ struct psp_assoc_dev *entry;
+
+ list_for_each_entry(entry, &psd->assoc_dev_list, dev_list) {
+ if (dev_net(entry->assoc_dev) == net)
+ return true;
+ }
+
+ return false;
+}
+
+static int psp_nl_fill_assoc_dev_list(struct psp_dev *psd, struct sk_buff *rsp,
+ struct net *cur_net,
+ struct net *filter_net)
+{
+ struct psp_assoc_dev *entry;
+ struct net *dev_net_ns;
+ struct nlattr *nest;
+ int nsid;
+
+ list_for_each_entry(entry, &psd->assoc_dev_list, dev_list) {
+ dev_net_ns = dev_net(entry->assoc_dev);
+
+ if (filter_net && dev_net_ns != filter_net)
+ continue;
+
+ /* When filtering by namespace, all devices are in the caller's
+ * namespace so nsid is always NETNSA_NSID_NOT_ASSIGNED (-1).
+ * Otherwise, calculate the nsid relative to cur_net.
+ */
+ nsid = filter_net ? NETNSA_NSID_NOT_ASSIGNED :
+ peernet2id_alloc(cur_net, dev_net_ns,
+ GFP_KERNEL);
+
+ nest = nla_nest_start(rsp, PSP_A_DEV_ASSOC_LIST);
+ if (!nest)
+ return -1;
+
+ if (nla_put_u32(rsp, PSP_A_ASSOC_DEV_INFO_IFINDEX,
+ entry->assoc_dev->ifindex) ||
+ nla_put_s32(rsp, PSP_A_ASSOC_DEV_INFO_NSID, nsid)) {
+ nla_nest_cancel(rsp, nest);
+ return -1;
+ }
+
+ nla_nest_end(rsp, nest);
+ }
+
+ return 0;
+}
+
static int
psp_nl_dev_fill(struct psp_dev *psd, struct sk_buff *rsp,
const struct genl_info *info)
{
+ struct net *cur_net;
void *hdr;
+ int err;
+
+ cur_net = genl_info_net(info);
+
+ /* Skip this device if we're in an associated netns but have no
+ * associated devices in cur_net
+ */
+ if (cur_net != dev_net(psd->main_netdev) &&
+ !psp_has_assoc_dev_in_ns(psd, cur_net))
+ return 0;
hdr = genlmsg_iput(rsp, info);
if (!hdr)
@@ -119,6 +290,22 @@ psp_nl_dev_fill(struct psp_dev *psd, struct sk_buff *rsp,
nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_ENA, psd->config.versions))
goto err_cancel_msg;
+ if (cur_net == dev_net(psd->main_netdev)) {
+ /* Primary device - dump assoc list */
+ err = psp_nl_fill_assoc_dev_list(psd, rsp, cur_net, NULL);
+ if (err)
+ goto err_cancel_msg;
+ } else {
+ /* In netns: set by-association flag and dump filtered
+ * assoc list containing only devices in cur_net
+ */
+ if (nla_put_flag(rsp, PSP_A_DEV_BY_ASSOCIATION))
+ goto err_cancel_msg;
+ err = psp_nl_fill_assoc_dev_list(psd, rsp, cur_net, cur_net);
+ if (err)
+ goto err_cancel_msg;
+ }
+
genlmsg_end(rsp, hdr);
return 0;
@@ -127,27 +314,34 @@ psp_nl_dev_fill(struct psp_dev *psd, struct sk_buff *rsp,
return -EMSGSIZE;
}
-void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd)
+static struct sk_buff *psp_nl_build_dev_ntf(struct psp_dev *psd,
+ struct net *net, void *ctx)
{
+ u32 cmd = *(u32 *)ctx;
struct genl_info info;
struct sk_buff *ntf;
- if (!genl_has_listeners(&psp_nl_family, dev_net(psd->main_netdev),
- PSP_NLGRP_MGMT))
- return;
+ if (!genl_has_listeners(&psp_nl_family, net, PSP_NLGRP_MGMT))
+ return NULL;
ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!ntf)
- return;
+ return NULL;
genl_info_init_ntf(&info, &psp_nl_family, cmd);
+ genl_info_net_set(&info, net);
if (psp_nl_dev_fill(psd, ntf, &info)) {
nlmsg_free(ntf);
- return;
+ return NULL;
}
- genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf,
- 0, PSP_NLGRP_MGMT, GFP_KERNEL);
+ return ntf;
+}
+
+void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd)
+{
+ psp_nl_multicast_per_ns(psd, PSP_NLGRP_MGMT,
+ psp_nl_build_dev_ntf, &cmd);
}
int psp_nl_dev_get_doit(struct sk_buff *req, struct genl_info *info)
@@ -281,8 +475,9 @@ int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info)
psd->stats.rotations++;
nlmsg_end(ntf, (struct nlmsghdr *)ntf->data);
- genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf,
- 0, PSP_NLGRP_USE, GFP_KERNEL);
+
+ psp_nl_multicast_all_ns(psd, ntf, PSP_NLGRP_USE);
+
return psp_nl_reply_send(rsp, info);
err_free_ntf:
@@ -292,6 +487,128 @@ int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info)
return err;
}
+int psp_nl_dev_assoc_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct psp_dev *psd = info->user_ptr[0];
+ struct psp_assoc_dev *psp_assoc_dev;
+ struct net_device *assoc_dev;
+ struct sk_buff *rsp;
+ u32 assoc_ifindex;
+ struct net *net;
+ int err;
+
+ if (psd->assoc_dev_cnt >= PSP_ASSOC_DEV_MAX) {
+ NL_SET_ERR_MSG(info->extack,
+ "Maximum number of associated devices reached");
+ return -ENOSPC;
+ }
+
+ net = psp_nl_resolve_assoc_dev_ns(psd, info);
+ if (IS_ERR(net))
+ return PTR_ERR(net);
+
+ psp_assoc_dev = kzalloc_obj(*psp_assoc_dev, GFP_KERNEL);
+ if (!psp_assoc_dev) {
+ err = -ENOMEM;
+ goto err_put_net;
+ }
+
+ assoc_ifindex = nla_get_u32(info->attrs[PSP_A_DEV_IFINDEX]);
+ assoc_dev = netdev_get_by_index(net, assoc_ifindex,
+ &psp_assoc_dev->dev_tracker,
+ GFP_KERNEL);
+ if (!assoc_dev) {
+ NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_DEV_IFINDEX]);
+ err = -ENODEV;
+ goto assoc_dev_err;
+ }
+
+ /* Check if device is already associated with a PSP device */
+ if (cmpxchg(&assoc_dev->psp_dev, NULL, RCU_INITIALIZER(psd))) {
+ NL_SET_ERR_MSG(info->extack,
+ "Device already associated with a PSP device");
+ err = -EBUSY;
+ goto cmpxchg_err;
+ }
+
+ psp_assoc_dev->assoc_dev = assoc_dev;
+ rsp = psp_nl_reply_new(info);
+ if (!rsp) {
+ err = -ENOMEM;
+ goto rsp_err;
+ }
+
+ list_add_tail(&psp_assoc_dev->dev_list, &psd->assoc_dev_list);
+ psd->assoc_dev_cnt++;
+
+ put_net(net);
+
+ psp_nl_notify_dev(psd, PSP_CMD_DEV_CHANGE_NTF);
+
+ return psp_nl_reply_send(rsp, info);
+
+rsp_err:
+ rcu_assign_pointer(assoc_dev->psp_dev, NULL);
+cmpxchg_err:
+ netdev_put(assoc_dev, &psp_assoc_dev->dev_tracker);
+assoc_dev_err:
+ kfree(psp_assoc_dev);
+err_put_net:
+ put_net(net);
+
+ return err;
+}
+
+int psp_nl_dev_disassoc_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct psp_assoc_dev *entry, *found = NULL;
+ struct psp_dev *psd = info->user_ptr[0];
+ struct sk_buff *rsp;
+ u32 assoc_ifindex;
+ struct net *net;
+
+ net = psp_nl_resolve_assoc_dev_ns(psd, info);
+ if (IS_ERR(net))
+ return PTR_ERR(net);
+
+ assoc_ifindex = nla_get_u32(info->attrs[PSP_A_DEV_IFINDEX]);
+
+ /* Search the association list by ifindex and netns */
+ list_for_each_entry(entry, &psd->assoc_dev_list, dev_list) {
+ if (entry->assoc_dev->ifindex == assoc_ifindex &&
+ dev_net(entry->assoc_dev) == net) {
+ found = entry;
+ break;
+ }
+ }
+
+ if (!found) {
+ put_net(net);
+ NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_DEV_IFINDEX]);
+ return -ENODEV;
+ }
+
+ rsp = psp_nl_reply_new(info);
+ if (!rsp) {
+ put_net(net);
+ return -ENOMEM;
+ }
+
+ /* Notify before removal */
+ psp_nl_notify_dev(psd, PSP_CMD_DEV_CHANGE_NTF);
+
+ /* Remove from the association list */
+ list_del(&found->dev_list);
+ psd->assoc_dev_cnt--;
+ rcu_assign_pointer(found->assoc_dev->psp_dev, NULL);
+ netdev_put(found->assoc_dev, &found->dev_tracker);
+ kfree(found);
+
+ put_net(net);
+
+ return psp_nl_reply_send(rsp, info);
+}
+
/* Key etc. */
int psp_assoc_device_get_locked(const struct genl_split_ops *ops,
@@ -320,8 +637,10 @@ int psp_assoc_device_get_locked(const struct genl_split_ops *ops,
psd = psp_dev_get_for_sock(socket->sk);
if (psd) {
+ mutex_lock(&psd->lock);
err = psp_dev_check_access(psd, genl_info_net(info), false);
if (err) {
+ mutex_unlock(&psd->lock);
psp_dev_put(psd);
psd = NULL;
}
@@ -334,7 +653,6 @@ int psp_assoc_device_get_locked(const struct genl_split_ops *ops,
id = info->attrs[PSP_A_ASSOC_DEV_ID];
if (psd) {
- mutex_lock(&psd->lock);
if (id && psd->id != nla_get_u32(id)) {
mutex_unlock(&psd->lock);
NL_SET_ERR_MSG_ATTR(info->extack, id,
--
2.52.0
^ permalink raw reply related
* [PATCH v12 net-next 1/5] psp: add admin/non-admin version of psp_device_get_locked
From: Wei Wang @ 2026-04-18 17:00 UTC (permalink / raw)
To: netdev, Jakub Kicinski, Daniel Zahka, Willem de Bruijn, David Wei,
Andrew Lunn, David S . Miller, Eric Dumazet, Simon Horman,
Paolo Abeni
Cc: Wei Wang
In-Reply-To: <20260418170056.3490525-1-weibunny.kernel@gmail.com>
From: Wei Wang <weibunny@fb.com>
Introduce 2 versions of psp_device_get_locked:
1. psp_device_get_locked_admin(): This version is used for operations
that would change the status of the psd, and is currently used for
dev-set and key-rotation.
2. psp_device_get_locked(): This is the non-admin version, which is
used for broader user-issued operations including: dev-get, rx-assoc,
tx-assoc, get-stats.
A following commit will implement both of the checks.
Signed-off-by: Wei Wang <weibunny@fb.com>
Reviewed-by: Daniel Zahka <daniel.zahka@gmail.com>
---
Documentation/netlink/specs/psp.yaml | 4 ++--
net/psp/psp-nl-gen.c | 4 ++--
net/psp/psp-nl-gen.h | 2 ++
net/psp/psp.h | 2 +-
net/psp/psp_main.c | 7 +++++-
net/psp/psp_nl.c | 33 ++++++++++++++++++++--------
6 files changed, 37 insertions(+), 15 deletions(-)
diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml
index 100c36cda8e5..c54e1202cbe0 100644
--- a/Documentation/netlink/specs/psp.yaml
+++ b/Documentation/netlink/specs/psp.yaml
@@ -195,7 +195,7 @@ operations:
- psp-versions-ena
reply:
attributes: []
- pre: psp-device-get-locked
+ pre: psp-device-get-locked-admin
post: psp-device-unlock
-
name: dev-change-ntf
@@ -214,7 +214,7 @@ operations:
reply:
attributes:
- id
- pre: psp-device-get-locked
+ pre: psp-device-get-locked-admin
post: psp-device-unlock
-
name: key-rotate-ntf
diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c
index 22a48d0fa378..1f5e73e7ccc1 100644
--- a/net/psp/psp-nl-gen.c
+++ b/net/psp/psp-nl-gen.c
@@ -71,7 +71,7 @@ static const struct genl_split_ops psp_nl_ops[] = {
},
{
.cmd = PSP_CMD_DEV_SET,
- .pre_doit = psp_device_get_locked,
+ .pre_doit = psp_device_get_locked_admin,
.doit = psp_nl_dev_set_doit,
.post_doit = psp_device_unlock,
.policy = psp_dev_set_nl_policy,
@@ -80,7 +80,7 @@ static const struct genl_split_ops psp_nl_ops[] = {
},
{
.cmd = PSP_CMD_KEY_ROTATE,
- .pre_doit = psp_device_get_locked,
+ .pre_doit = psp_device_get_locked_admin,
.doit = psp_nl_key_rotate_doit,
.post_doit = psp_device_unlock,
.policy = psp_key_rotate_nl_policy,
diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h
index 599c5f1c82f2..977355455395 100644
--- a/net/psp/psp-nl-gen.h
+++ b/net/psp/psp-nl-gen.h
@@ -17,6 +17,8 @@ extern const struct nla_policy psp_keys_nl_policy[PSP_A_KEYS_SPI + 1];
int psp_device_get_locked(const struct genl_split_ops *ops,
struct sk_buff *skb, struct genl_info *info);
+int psp_device_get_locked_admin(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info);
int psp_assoc_device_get_locked(const struct genl_split_ops *ops,
struct sk_buff *skb, struct genl_info *info);
void
diff --git a/net/psp/psp.h b/net/psp/psp.h
index 9f19137593a0..0f9c4e4e52cb 100644
--- a/net/psp/psp.h
+++ b/net/psp/psp.h
@@ -14,7 +14,7 @@ extern struct xarray psp_devs;
extern struct mutex psp_devs_lock;
void psp_dev_free(struct psp_dev *psd);
-int psp_dev_check_access(struct psp_dev *psd, struct net *net);
+int psp_dev_check_access(struct psp_dev *psd, struct net *net, bool admin);
void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd);
diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c
index 9508b6c38003..82de78a1d6bd 100644
--- a/net/psp/psp_main.c
+++ b/net/psp/psp_main.c
@@ -27,10 +27,15 @@ struct mutex psp_devs_lock;
* psp_dev_check_access() - check if user in a given net ns can access PSP dev
* @psd: PSP device structure user is trying to access
* @net: net namespace user is in
+ * @admin: If true, only allow access from @psd's main device's netns,
+ * for admin operations like config changes and key rotation.
+ * If false, also allow access from network namespaces that have
+ * an associated device with @psd, for read-only and association
+ * management operations.
*
* Return: 0 if PSP device should be visible in @net, errno otherwise.
*/
-int psp_dev_check_access(struct psp_dev *psd, struct net *net)
+int psp_dev_check_access(struct psp_dev *psd, struct net *net, bool admin)
{
if (dev_net(psd->main_netdev) == net)
return 0;
diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c
index 6afd7707ec12..eb47a9ee4438 100644
--- a/net/psp/psp_nl.c
+++ b/net/psp/psp_nl.c
@@ -41,7 +41,8 @@ static int psp_nl_reply_send(struct sk_buff *rsp, struct genl_info *info)
/* Device stuff */
static struct psp_dev *
-psp_device_get_and_lock(struct net *net, struct nlattr *dev_id)
+psp_device_get_and_lock(struct net *net, struct nlattr *dev_id,
+ bool admin)
{
struct psp_dev *psd;
int err;
@@ -56,7 +57,7 @@ psp_device_get_and_lock(struct net *net, struct nlattr *dev_id)
mutex_lock(&psd->lock);
mutex_unlock(&psp_devs_lock);
- err = psp_dev_check_access(psd, net);
+ err = psp_dev_check_access(psd, net, admin);
if (err) {
mutex_unlock(&psd->lock);
return ERR_PTR(err);
@@ -65,17 +66,31 @@ psp_device_get_and_lock(struct net *net, struct nlattr *dev_id)
return psd;
}
-int psp_device_get_locked(const struct genl_split_ops *ops,
- struct sk_buff *skb, struct genl_info *info)
+static int __psp_device_get_locked(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info,
+ bool admin)
{
if (GENL_REQ_ATTR_CHECK(info, PSP_A_DEV_ID))
return -EINVAL;
info->user_ptr[0] = psp_device_get_and_lock(genl_info_net(info),
- info->attrs[PSP_A_DEV_ID]);
+ info->attrs[PSP_A_DEV_ID],
+ admin);
return PTR_ERR_OR_ZERO(info->user_ptr[0]);
}
+int psp_device_get_locked_admin(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ return __psp_device_get_locked(ops, skb, info, true);
+}
+
+int psp_device_get_locked(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ return __psp_device_get_locked(ops, skb, info, false);
+}
+
void
psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb,
struct genl_info *info)
@@ -160,7 +175,7 @@ static int
psp_nl_dev_get_dumpit_one(struct sk_buff *rsp, struct netlink_callback *cb,
struct psp_dev *psd)
{
- if (psp_dev_check_access(psd, sock_net(rsp->sk)))
+ if (psp_dev_check_access(psd, sock_net(rsp->sk), false))
return 0;
return psp_nl_dev_fill(psd, rsp, genl_info_dump(cb));
@@ -305,7 +320,7 @@ int psp_assoc_device_get_locked(const struct genl_split_ops *ops,
psd = psp_dev_get_for_sock(socket->sk);
if (psd) {
- err = psp_dev_check_access(psd, genl_info_net(info));
+ err = psp_dev_check_access(psd, genl_info_net(info), false);
if (err) {
psp_dev_put(psd);
psd = NULL;
@@ -330,7 +345,7 @@ int psp_assoc_device_get_locked(const struct genl_split_ops *ops,
psp_dev_put(psd);
} else {
- psd = psp_device_get_and_lock(genl_info_net(info), id);
+ psd = psp_device_get_and_lock(genl_info_net(info), id, false);
if (IS_ERR(psd)) {
err = PTR_ERR(psd);
goto err_sock_put;
@@ -573,7 +588,7 @@ static int
psp_nl_stats_get_dumpit_one(struct sk_buff *rsp, struct netlink_callback *cb,
struct psp_dev *psd)
{
- if (psp_dev_check_access(psd, sock_net(rsp->sk)))
+ if (psp_dev_check_access(psd, sock_net(rsp->sk), false))
return 0;
return psp_nl_stats_fill(psd, rsp, genl_info_dump(cb));
--
2.52.0
^ permalink raw reply related
* [PATCH v12 net-next 0/5] psp: Add support for dev-assoc/disassoc
From: Wei Wang @ 2026-04-18 17:00 UTC (permalink / raw)
To: netdev, Jakub Kicinski, Daniel Zahka, Willem de Bruijn, David Wei,
Andrew Lunn, David S . Miller, Eric Dumazet, Simon Horman,
Paolo Abeni
Cc: Wei Wang
From: Wei Wang <weibunny@fb.com>
The main purpose of this feature is to associate virtual devices like
veth or netkit with a real PSP device, so we could provide PSP
functionality to the application running with virtual devices.
A typical deployment that works with this feature is as follows:
Host Namespace:
psp_dev_local ←──physically linked──→ psp_dev_peer
(PSP device)
│
│ BPF on psp_dev_local ingress: bpf_redirect_peer() to nk_guest
│
nk_host / veth_host
│
│ BPF on nk_host ingress: bpf_redirect_neigh() to psp_dev_local
│
Guest Namespace (netns):
│
nk_guest / veth_guest
★ PSP application runs here
Remote Namespace (_netns):
psp_dev_peer
★ PSP server application runs here
Note:
The general requirement for this feature to work:
For PSP to work correctly, the egress device at validate_xmit_skb()
time must have psp_dev matching the association's psd. Any device
stacking or traffic redirection that changes the egress device will
cause either:
1. TX validation failure (SKB_DROP_REASON_PSP_OUTPUT) - fail-safe
2. RX policy failure after tx-assoc - packets without PSP extension
are rejected by receiver expecting encrypted traffic
Here are a few examples where this feature would not work:
- Bonding with load balancing in round-robin, XOR, 802.3ad mode across
multiple PSP devices, or mixed PSP and non-PSP devices
- Bonding with active-backup mode might work without PSP migration for
failover case.
- ipvlan/macvlan in bridge mode would not work, since packets are
looped back locally without going through the PSP device.
Changes since v11:
- Cap max number of associated devs per psd to 128 to avoid overflowing
GENLMSG_DEFAULT_SIZE for netlink msg in patch 2
- Make common function for getting net in psp_nl_dev_assoc_doit() and
psp_nl_dev_disassoc_doit() in patch 2
- Only allow NSID to be passed in when user is in dev_net(psd->main_netdev)
to avoid manipulation of dev-assoc/dev-disassoc from netns other than
its own in patch 2
- Fixed the race between netdev_unregister() and psp_nl_dev_assoc_doit()
by adding devreg status check in psp_nl_dev_assoc_doit().
Changes since v10:
- Corrected typo on patch 1
- Removed the kdoc style comments, used goto style for the cleanup code
in psp_nl_dev_assoc_doit(), and resolved the "TOCTOU" issue in
psp_assoc_device_get_locked() in patch 2
- Replaced psp_devs_lock with a new mutex in
psp_attach_netdev_notifier(), Fixed kdoc style comments in patch 3
Changes since v9:
- Added comments for psp_device_get_locked(), fixed lint issue, fixed
rcu warning in patch 2
- Return error if register_netdevice_notifier() fails in
psp_device_get_locked_dev_assoc() in patch 3
- Removed psp version and ip version for unnecessary test cases in
patch 5
Changes since v8:
- Rebase
Changes since v7:
- Refactor in patch 1 to have a common helper for
psp_device_get_locked_admin() and psp_device_get_locked()
- Take psd->lock in psp_assoc_device_get_locked() before
psp_dev_check_access() in patch 2
- Use cmpxchg() for assoc_dev->psp_dev assignment when doing dev-assoc
in patch 2
- Check for err for register_netdevice_notifier() in patch 3
- Call psp_attach_netdev_notifier() in pre_doit handler for dev-assoc to
avoid releasing of psd->lock in patch 3
Changes since v6:
- Remove the unused remote_addr, nk_guest_addr and import cmd in patch 5
Changes since v5:
- Remove module_exit() in patch 3
Changes since v4:
- Address compilation warning in patch 3
- Removed the call to psp_nl_has_listeners_any_ns() and check listeners
when looping through netns in psp_nl_notify_dev() in patch 2. This
makes sure we only send notification to netns that has listeners.
Changes since v3:
- Make nsid optional for dev-assoc/dev-disassoc operation, and use
the ns user is in when it's not specified. Also added a test for this.
- Fix psp_nl_notify_dev() to compute the correct nsid relative to the
listener's netns.
- Only register the new netdev event for psp dev cleanup upon the first
successful dev-assoc operation.
- Change the following in selftest:
- Add CONFIG_NETKIT to driver/net's config
- Fall back to NetDrvEpEnv and run basic test cases if NetDrvContEnv
does not load
- Use ksft_variants instead of psp_ip_ver_test_builder
Changes since v2:
- Change the newly added parameter to psp_device_get_and_lock() to
admin in patch 1. Introduce 2 device check functions:
- psp_device_get_locked_admin() for dev-set and key-rotate
- psp_device_get_locked() for all other operations
Flip the logic for checking the dev_assoc_list accordingly in patch 2.
- Move psp_nl_notify_dev() before removing the dev from assoc_dev_list
in psp_nl_dev_disassoc_doit() and correct the typo in commit msg in
patch 2.
- Remove the threading and subprocess and some comment updates in patch 5.
Changes since v1:
- Update the first 4 patches to reflect the latest changes in
https://lore.kernel.org/netdev/20260302053315.1919859-1-dw@davidwei.uk/
- Update patch 9 to add a param to NetDrvContEnv to control the loading
of the tx forwarding bpf program
Wei Wang (5):
psp: add admin/non-admin version of psp_device_get_locked
psp: add new netlink cmd for dev-assoc and dev-disassoc
psp: add a new netdev event for dev unregister
selftests/net: Add bpf skb forwarding program
selftest/net: psp: Add test for dev-assoc/disassoc
Documentation/netlink/specs/psp.yaml | 71 ++-
include/net/psp/types.h | 23 +
include/uapi/linux/psp.h | 13 +
net/psp/psp-nl-gen.c | 36 +-
net/psp/psp-nl-gen.h | 7 +
net/psp/psp.h | 3 +-
net/psp/psp_main.c | 104 +++-
net/psp/psp_nl.c | 402 ++++++++++++++-
tools/testing/selftests/drivers/net/config | 1 +
.../drivers/net/hw/nk_redirect.bpf.c | 60 +++
.../selftests/drivers/net/lib/py/env.py | 54 ++-
tools/testing/selftests/drivers/net/psp.py | 457 ++++++++++++++++--
12 files changed, 1170 insertions(+), 61 deletions(-)
create mode 100644 tools/testing/selftests/drivers/net/hw/nk_redirect.bpf.c
--
2.52.0
^ permalink raw reply
* Re: [PATCH iwl-next v2 3/3] igc: add support for forcing link speed without autonegotiation
From: Simon Horman @ 2026-04-18 16:48 UTC (permalink / raw)
To: khai.wen.tan
Cc: 'Simon Horman', anthony.l.nguyen, przemyslaw.kitszel,
andrew+netdev, davem, edumazet, kuba, pabeni, intel-wired-lan,
netdev, linux-kernel, faizal.abdul.rahim, hong.aun.looi,
khai.wen.tan, faizal.abdul.rahim
In-Reply-To: <20260416015520.6090-4-khai.wen.tan@linux.intel.com>
From: 'Simon Horman' <horms@kernel.org>
This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://sashiko.dev
---
igc: add support for forcing link speed without autonegotiation
This commit adds support to allow users to force 10/100 Mb/s link speed
and duplex via ethtool when autonegotiation is disabled.
> diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c
> index cfcbf2fdad6ea..5bd37d1be1688 100644
> --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c
> +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c
[ ... ]
> @@ -2000,6 +2013,41 @@ static int igc_ethtool_get_link_ksettings(struct net_device *netdev,
> return 0;
> }
>
> +/**
> + * igc_handle_autoneg_disabled - Configure forced speed/duplex settings
> + * @adapter: private driver structure
> + * @speed: requested speed (must be SPEED_10 or SPEED_100)
> + * @duplex: requested duplex
> + *
> + * Records forced speed/duplex when autoneg is disabled.
> + * Caller must validate speed before calling this function.
> + */
> +static void igc_handle_autoneg_disabled(struct igc_adapter *adapter, u32 speed,
> + u8 duplex)
> +{
> + struct igc_mac_info *mac = &adapter->hw.mac;
> +
> + switch (speed) {
> + case SPEED_10:
> + mac->forced_speed_duplex = (duplex == DUPLEX_FULL) ?
> + IGC_FORCED_10F : IGC_FORCED_10H;
> + break;
> + case SPEED_100:
> + mac->forced_speed_duplex = (duplex == DUPLEX_FULL) ?
> + IGC_FORCED_100F : IGC_FORCED_100H;
> + break;
> + default:
> + WARN_ONCE(1, "Unsupported speed %u\n", speed);
> + return;
> + }
> +
> + mac->autoneg_enabled = false;
> +
> + /* Half-duplex cannot support flow control per IEEE 802.3 */
> + if (duplex == DUPLEX_HALF)
> + adapter->hw.fc.requested_mode = igc_fc_none;
> +}
If userspace issues an ethtool command without specifying duplex, the duplex
value can be DUPLEX_UNKNOWN. In this case, forced_speed_duplex correctly
defaults to half-duplex since duplex == DUPLEX_FULL evaluates to false.
However, since DUPLEX_UNKNOWN is not equal to DUPLEX_HALF, the check above
fails, and the requested flow control mode remains unchanged. Could this leave
flow control enabled on a forced half-duplex link, violating IEEE 802.3?
Should this check be based on the resolved mac->forced_speed_duplex state
instead?
[ ... ]
> @@ -2071,14 +2120,20 @@ igc_ethtool_set_link_ksettings(struct net_device *netdev,
> }
> }
>
> + if (cmd->base.autoneg == AUTONEG_DISABLE &&
> + cmd->base.speed != SPEED_10 && cmd->base.speed != SPEED_100) {
> + netdev_info(dev, "Unsupported speed for forced link\n");
> + return -EINVAL;
> + }
> +
> while (test_and_set_bit(__IGC_RESETTING, &adapter->state))
> usleep_range(1000, 2000);
This isn't a bug introduced by this patch, but does this ad-hoc
synchronization using bit flags circumvent proper locking guarantees? The
networking subsystem guidelines forbid using bit flags as ad-hoc locks.
Using a bit flag in a polling loop bypasses lockdep analysis, fairness, and
strict memory ordering. Could this be replaced with a real synchronization
mechanism like a mutex?
^ permalink raw reply
* Re: [PATCH v2 0/6] selftests: net: multithread + rss_multiqueue support for iou-zcrx
From: Jakub Kicinski @ 2026-04-18 16:49 UTC (permalink / raw)
To: Juanlu Herrero; +Cc: dw, netdev
In-Reply-To: <cover.1776444379.git.juanlu@fastmail.com>
On Fri, 17 Apr 2026 09:49:46 -0700 Juanlu Herrero wrote:
> Add multithread support to the iou-zcrx selftest, plus a new
> rss_multiqueue Python variant that exercises multi-queue zero-copy
> receive on a single listening socket with NAPI-ID-based dispatch.
## Form letter - net-next-closed
We have already submitted our pull request with net-next material for v7.1,
and therefore net-next is closed for new drivers, features, code refactoring
and optimizations. We are currently accepting bug fixes only.
Please repost when net-next reopens after Apr 27th.
RFC patches sent for review only are obviously welcome at any time.
See: https://www.kernel.org/doc/html/next/process/maintainer-netdev.html#development-cycle
--
pw-bot: defer
pv-bot: closed
^ permalink raw reply
* Re: [net,PATCH v3 1/2] net: ks8851: Reinstate disabling of BHs around IRQ handler
From: Marek Vasut @ 2026-04-18 16:46 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: netdev, stable, David S. Miller, Andrew Lunn, Eric Dumazet,
Jakub Kicinski, Nicolai Buchwitz, Paolo Abeni, Ronald Wahl,
Yicong Hui, linux-kernel
In-Reply-To: <20260416104818._EDbo9hA@linutronix.de>
On 4/16/26 12:48 PM, Sebastian Andrzej Siewior wrote:
> On 2026-04-16 11:26:00 [+0200], Marek Vasut wrote:
>>> memory allocation. Therefore I am saying this backtrace is from an older
>>> kernel.
>>
>> I actually did update the backtrace in V3 with the one from next 20260413
>> that contained b44596ffe1b4 ("ARM: Allow to enable RT") from
>> stable-rt/v6.12-rt-rebase branch [1] .
>>
>> I think I misunderstood the usage of "softirq is raised" vs. "softirq is
>> invoked" above . Is it possible that there was an already raised softirq
>> before the threaded IRQ handler was invoked, and __netdev_alloc_skb() is
>> what invoked that softirq ?
>
> It is not impossible. Something needs to netif_wake_queue() and
> ks8851_irq() must only report IRQ_RXI (not IRQ_TXI). Then it can happen.
> But usually the driver "stops" the queue if it can't process any new
> packets and resumes it once a packet has been sent so it has room again.
This driver's .start_xmit is very simple, if there is space in the 6 kiB
TX FIFO, then the packet is written into it, otherwise the .start_xmit
returns NETDEV_TX_BUSY . There does not seem to be any
netif_{start,stop,wake}_queue() in the .start_xmit path.
^ permalink raw reply
* [PATCH net] seg6: fix seg6 lwtunnel output redirect for L2 reduced encap mode
From: Andrea Mayer @ 2026-04-18 16:28 UTC (permalink / raw)
To: davem, dsahern, edumazet, kuba, pabeni, horms
Cc: anton.makarov11235, stefano.salsano, netdev, linux-kernel,
Andrea Mayer, stable
When SEG6_IPTUN_MODE_L2ENCAP_RED (L2ENCAP_RED) was introduced, the
condition in seg6_build_state() that excludes L2 encap modes from
setting LWTUNNEL_STATE_OUTPUT_REDIRECT was not updated to account for
the new mode.
As a consequence, L2ENCAP_RED routes incorrectly trigger seg6_output()
on the output path, where the packet is silently dropped because
skb_mac_header_was_set() fails on L3 packets.
Extend the check to also exclude L2ENCAP_RED, consistent with L2ENCAP.
Fixes: 13f0296be8ec ("seg6: add support for SRv6 H.L2Encaps.Red behavior")
Cc: stable@vger.kernel.org
Signed-off-by: Andrea Mayer <andrea.mayer@uniroma2.it>
---
net/ipv6/seg6_iptunnel.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 97b50d9b1365..9b64343ebad6 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -746,7 +746,8 @@ static int seg6_build_state(struct net *net, struct nlattr *nla,
newts->type = LWTUNNEL_ENCAP_SEG6;
newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
- if (tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP)
+ if (tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP &&
+ tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP_RED)
newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
newts->headroom = seg6_lwt_headroom(tuninfo);
--
2.20.1
^ permalink raw reply related
* [PATCH nf] netfilter: xt_TCPMSS: check skb_dst before path-MTU clamping
From: Weiming Shi @ 2026-04-18 16:30 UTC (permalink / raw)
To: Pablo Neira Ayuso, Florian Westphal, David S . Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Phil Sutter, Simon Horman, netfilter-devel, coreteam, netdev,
Xiang Mei, Weiming Shi
When TCPMSS with CLAMP_PMTU is used via nft_compat in a non-base
chain, par->hook_mask is set to 0, bypassing the checkentry hook
validation. The target can then run at PRE_ROUTING where skb_dst is
NULL, causing a null-ptr-deref in tcpmss_mangle_packet():
KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f]
RIP: 0010:tcpmss_mangle_packet (include/net/dst.h:219 net/netfilter/xt_TCPMSS.c:105)
tcpmss_tg4 (net/netfilter/xt_TCPMSS.c:202)
nft_target_eval_xt (net/netfilter/nft_compat.c:87)
nft_do_chain (net/netfilter/nf_tables_core.c:287)
nf_hook_slow (net/netfilter/core.c:623)
Check skb_dst() for NULL before calling dst_mtu().
Fixes: 493618a92c6a ("netfilter: nft_compat: fix hook validation for non-base chains")
Reported-by: Xiang Mei <xmei5@asu.edu>
Signed-off-by: Weiming Shi <bestswngs@gmail.com>
---
net/netfilter/xt_TCPMSS.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 116a885adb3c..79b5e475e23e 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -102,7 +102,12 @@ tcpmss_mangle_packet(struct sk_buff *skb,
if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
struct net *net = xt_net(par);
unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family);
- unsigned int min_mtu = min(dst_mtu(skb_dst(skb)), in_mtu);
+ unsigned int min_mtu;
+
+ if (!skb_dst(skb))
+ return -1;
+
+ min_mtu = min(dst_mtu(skb_dst(skb)), in_mtu);
if (min_mtu <= minlen) {
net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
--
2.43.0
^ permalink raw reply related
* Re: [PATCH v5 net] nfc: hci: fix out-of-bounds read in HCP header parsing
From: Simon Horman @ 2026-04-18 16:30 UTC (permalink / raw)
To: Ashutosh Desai
Cc: netdev, kuba, edumazet, davem, pabeni, stable, linux-kernel
In-Reply-To: <20260416051522.4154698-1-ashutoshdesai993@gmail.com>
On Thu, Apr 16, 2026 at 05:15:22AM +0000, Ashutosh Desai wrote:
> nfc_hci_recv_from_llc() and nci_hci_data_received_cb() cast skb->data
> to struct hcp_packet and read the message header byte without checking
> that enough data is present in the linear sk_buff area. A malicious NFC
> peer can send a 1-byte HCP frame that passes through the SHDLC layer
> and reaches these functions, causing an out-of-bounds heap read.
>
> Fix this by adding pskb_may_pull() before each cast to ensure the full
> 2-byte HCP header is pulled into the linear area before it is accessed.
>
> Fixes: 8b8d2e08bf0d ("NFC: HCI support")
> Fixes: 11f54f228643 ("NFC: nci: Add HCI over NCI protocol support")
> Cc: stable@vger.kernel.org
> Signed-off-by: Ashutosh Desai <ashutoshdesai993@gmail.com>
> ---
> V4 -> V5: fix whitespace damage
> V3 -> V4: add Fixes tags
> V2 -> V3: drop redundant checks from nfc_hci_msg_rx_work/nci_hci_msg_rx_work;
> remove incorrect Suggested-by tag
> V1 -> V2: use pskb_may_pull() instead of skb->len check
>
> v4: https://lore.kernel.org/netdev/177614425081.3600288.2536320552978506086@gmail.com/
> v3: https://lore.kernel.org/netdev/20260413024329.3293075-1-ashutoshdesai993@gmail.com/
> v2: https://lore.kernel.org/netdev/20260409150825.2217133-1-ashutoshdesai993@gmail.com/
> v1: https://lore.kernel.org/netdev/20260408223113.2009304-1-ashutoshdesai993@gmail.com/
>
> net/nfc/hci/core.c | 5 +++++
> net/nfc/nci/hci.c | 5 +++++
> 2 files changed, 10 insertions(+)
Reviewed-by: Simon Horman <horms@kernel.org>
Review of this patch at Sashiko.dev flags a number of related problems in
this code. I believe none of them were introduced by this patch, and that
they can all be treated as areas for possible follow-up.
^ permalink raw reply
* Re: [PATCH net 2/3] octeontx2-af: npc: cn20k: Drop debugfs_create_file() error checks in init
From: Simon Horman @ 2026-04-18 16:20 UTC (permalink / raw)
To: Ratheesh Kannoth
Cc: netdev, linux-kernel, sgoutham, davem, edumazet, kuba, pabeni,
andrew+netdev, dan.carpenter, Dan Carpenter
In-Reply-To: <20260416035352.333808-3-rkannoth@marvell.com>
On Thu, Apr 16, 2026 at 09:23:51AM +0530, Ratheesh Kannoth wrote:
> debugfs is not intended to be checked for allocation failures the way
> other kernel APIs are: callers should not fail probe or subsystem init
> because a debugfs node could not be created, including when debugfs is
> disabled in Kconfig. Replacing NULL checks with IS_ERR() checks is
> similarly wrong for optional debugfs.
>
> Remove dentry checks and -EFAULT returns from npc_cn20k_debugfs_init().
> https://staticthinking.wordpress.com/2023/07/24/debugfs-functions-are-not-supposed-to-be-checked/
>
> CC: Dan Carpenter <error27@gmail.com>
> Link: https://lore.kernel.org/netdev/adjNGPWKMOk3KgWL@stanley.mountain/
> Fixes: 528530dff56b ("octeontx2-af: npc: cn20k: add debugfs support")
> Signed-off-by: Ratheesh Kannoth <rkannoth@marvell.com>
Reviewed-by: Simon Horman <horms@kernel.org>
^ permalink raw reply
* Re: [PATCH net 3/3] octeontx2-af: npc: cn20k: Return error when defrag rollback free fails
From: Simon Horman @ 2026-04-18 16:18 UTC (permalink / raw)
To: rkannoth
Cc: 'Simon Horman', netdev, linux-kernel, sgoutham, davem,
edumazet, kuba, pabeni, andrew+netdev, dan.carpenter, error27
In-Reply-To: <20260416035352.333808-4-rkannoth@marvell.com>
From: 'Simon Horman' <horms@kernel.org>
This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://sashiko.dev
---
octeontx2-af: npc: cn20k: Return error when defrag rollback free fails
This commit attempts to fix a bug where an error code wasn't returned
when a rollback failure occurred during partial allocation in
npc_defrag_alloc_free_slots(). It sets rc to -EFAULT if the
__npc_subbank_free() call fails.
Link: https://lore.kernel.org/netdev/adjNJEpILRZATB2N@stanley.mountain/
> diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c
> index 2fcd0ee2b1e10..df192729ac1d4 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c
> +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c
> @@ -3541,6 +3541,7 @@ static int npc_defrag_alloc_free_slots(struct rvu *rvu,
> dev_err(rvu->dev,
> "%s: Error to free mcam idx=%u\n",
> __func__, save[i]);
> + rc = -EFAULT;
> break;
> }
> }
While this ensures an error is returned if the rollback fails, does the
function still incorrectly return success when the rollback succeeds?
Earlier in the function, if the allocation cannot fulfill the requested slots,
the code jumps to the rollback loop:
if (alloc_cnt1 + alloc_cnt2 != cnt) {
...
goto fail_free_alloc;
}
Inside the fail_free_alloc loop, the rc variable is overwritten by the
subbank lookup:
fail_free_alloc:
for (i = 0; i < alloc_cnt1 + alloc_cnt2; i++) {
rc = npc_mcam_idx_2_subbank_idx(rvu, save[i], &sb, &sb_off);
If the subbank lookup and the subsequent __npc_subbank_free() succeed,
rc will be 0.
Since the loop finishes normally without hitting the newly added rc = -EFAULT
path, does the function end up returning 0 at the end of the fail_free_alloc
block?
Could this cause the caller to incorrectly assume that all slots were
successfully allocated, leading to operations on the freed or uninitialized
slots in the save array? Using a separate variable for the loop operations
or explicitly setting an error code like -ENOSPC before the goto might help
preserve the failure state.
^ permalink raw reply
* Re: [PATCH net 1/3] octeontx2-af: npc: cn20k: Handle npc_mcam_idx_2_key_type() failures
From: Simon Horman @ 2026-04-18 16:17 UTC (permalink / raw)
To: rkannoth
Cc: 'Simon Horman', netdev, linux-kernel, sgoutham, davem,
edumazet, kuba, pabeni, andrew+netdev, dan.carpenter, error27
In-Reply-To: <20260416035352.333808-2-rkannoth@marvell.com>
From: 'Simon Horman' <horms@kernel.org>
This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://sashiko.dev
---
octeontx2-af: npc: cn20k: Handle npc_mcam_idx_2_key_type() failures
This commit adds error checking to npc_mcam_idx_2_key_type() calls.
It returns early from several void functions to prevent programming
or interpreting MCAM state with an invalid key type.
Link: https://lore.kernel.org/netdev/adiQJvuKlEhq2ILx@stanley.mountain/
> diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c
> index 7291fdb89b03f..2fcd0ee2b1e10 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c
> +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c
[ ... ]
> @@ -1052,10 +1054,12 @@ void npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index,
> int kw = 0;
> u8 kw_type;
>
> + if (npc_mcam_idx_2_key_type(rvu, index, &kw_type))
> + return;
> +
> /* Disable before mcam entry update */
> npc_cn20k_enable_mcam_entry(rvu, blkaddr, index, false);
>
Does returning early from a void function here cause state desynchronization?
For example, in mailbox handlers like
rvu_mbox_handler_npc_cn20k_mcam_write_entry(), the caller assumes success
and returns 0, but the hardware programming would be silently aborted.
[ ... ]
> @@ -1132,8 +1136,13 @@ void npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, u16 src, u16 dest)
>
> dbank = npc_get_bank(mcam, dest);
> sbank = npc_get_bank(mcam, src);
> - npc_mcam_idx_2_key_type(rvu, src, &src_kwtype);
> - npc_mcam_idx_2_key_type(rvu, dest, &dest_kwtype);
> +
> + if (npc_mcam_idx_2_key_type(rvu, src, &src_kwtype))
> + return;
> +
If this copy is skipped, could callers like npc_cn20k_defrag() unconditionally
enable the target entry with stale hardware state?
[ ... ]
> @@ -1188,7 +1197,8 @@ void npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index,
> int kw = 0, bank;
> u8 kw_type;
>
> - npc_mcam_idx_2_key_type(rvu, index, &kw_type);
> + if (npc_mcam_idx_2_key_type(rvu, index, &kw_type))
> + return;
Could this leave caller variables uninitialized?
If a function like npc_update_dmac_value() passes an uninitialized u8 intf
parameter on the stack, and this function returns early, the caller might
branch on garbage data when later calling is_npc_intf_tx(intf).
Would it make sense to refactor these functions to return an int so that
failures can be propagated and gracefully handled by the callers?
^ permalink raw reply
* Re: [PATCH net v2] vxlan: fix NULL vn6_sock dereference in vxlan_igmp_join() and vxlan_igmp_leave()
From: Weiming Shi @ 2026-04-18 16:08 UTC (permalink / raw)
To: Ido Schimmel
Cc: Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Roopa Prabhu, netdev, Xiang Mei
In-Reply-To: <20260418155843.GA808294@shredder>
On 26-04-18 18:58, Ido Schimmel wrote:
> On Sat, Apr 18, 2026 at 04:41:12AM -0700, Weiming Shi wrote:
> > vxlan_sock_add() tolerates IPv6 socket creation failure with
> > -EAFNOSUPPORT (e.g. ipv6.disable=1), leaving vn6_sock as NULL while
> > successfully creating vn4_sock. vxlan_igmp_join() and
> > vxlan_igmp_leave() then crash when they dereference the NULL vn6_sock
> > for VNI filter entries with IPv6 multicast groups:
> >
> > Oops: general protection fault, probably for non-canonical address
> > 0xdffffc0000000002: 0000 [#1] SMP KASAN NOPTI
> > KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017]
> > RIP: 0010:vxlan_igmp_join (drivers/net/vxlan/vxlan_multicast.c:40)
> > Call Trace:
> > vxlan_multicast_join (drivers/net/vxlan/vxlan_multicast.c:195)
> > vxlan_open (drivers/net/vxlan/vxlan_core.c:2965)
> > __dev_open (net/core/dev.c:1704)
> > __dev_change_flags (net/core/dev.c:9781)
> > do_setlink.isra.0 (net/core/rtnetlink.c:3180)
> > rtnl_newlink (net/core/rtnetlink.c:4238)
> > rtnetlink_rcv_msg (net/core/rtnetlink.c:6921)
> >
> > Skip the IPv6 multicast join/leave when vn6_sock is NULL, consistent
> > with how vxlan_sock_add() tolerates missing IPv6 support.
> >
> > Fixes: f9c4bb0b245c ("vxlan: vni filtering support on collect metadata device")
> > Reported-by: Xiang Mei <xmei5@asu.edu>
> > Signed-off-by: Weiming Shi <bestswngs@gmail.com>
>
> AFAICT, this is the same patch as:
>
> https://lore.kernel.org/netdev/20260323095544.3311285-4-bestswngs@gmail.com/
>
> If you disagree with the feedback, then please comment there instead of
> reposting the patch.
>
Apologies for the duplicate posting - I should have followed up on the
original.
Thanks,
Weiming Shi
> > ---
> > v2:
> > - drop sock4 NULL checks
> >
> > drivers/net/vxlan/vxlan_multicast.c | 6 ++++++
> > 1 file changed, 6 insertions(+)
> >
> > diff --git a/drivers/net/vxlan/vxlan_multicast.c b/drivers/net/vxlan/vxlan_multicast.c
> > index a7f2d67dc61b..e6aa5ab1c939 100644
> > --- a/drivers/net/vxlan/vxlan_multicast.c
> > +++ b/drivers/net/vxlan/vxlan_multicast.c
> > @@ -37,6 +37,9 @@ int vxlan_igmp_join(struct vxlan_dev *vxlan, union vxlan_addr *rip,
> > } else {
> > struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);
> >
> > + if (!sock6)
> > + return 0;
> > +
> > sk = sock6->sock->sk;
> > lock_sock(sk);
> > ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
>
> This line changed in commit 29ae61b2fe7e ("drivers: net: drop ipv6_stub
> usage and use direct function calls")
>
> > @@ -71,6 +74,9 @@ int vxlan_igmp_leave(struct vxlan_dev *vxlan, union vxlan_addr *rip,
> > } else {
> > struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);
> >
> > + if (!sock6)
> > + return 0;
> > +
> > sk = sock6->sock->sk;
> > lock_sock(sk);
> > ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
> > --
> > 2.43.0
> >
^ permalink raw reply
* Re: [PATCH net v2] vxlan: fix NULL vn6_sock dereference in vxlan_igmp_join() and vxlan_igmp_leave()
From: Ido Schimmel @ 2026-04-18 15:58 UTC (permalink / raw)
To: Weiming Shi
Cc: Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Roopa Prabhu, netdev, Xiang Mei
In-Reply-To: <20260418114110.2602784-3-bestswngs@gmail.com>
On Sat, Apr 18, 2026 at 04:41:12AM -0700, Weiming Shi wrote:
> vxlan_sock_add() tolerates IPv6 socket creation failure with
> -EAFNOSUPPORT (e.g. ipv6.disable=1), leaving vn6_sock as NULL while
> successfully creating vn4_sock. vxlan_igmp_join() and
> vxlan_igmp_leave() then crash when they dereference the NULL vn6_sock
> for VNI filter entries with IPv6 multicast groups:
>
> Oops: general protection fault, probably for non-canonical address
> 0xdffffc0000000002: 0000 [#1] SMP KASAN NOPTI
> KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017]
> RIP: 0010:vxlan_igmp_join (drivers/net/vxlan/vxlan_multicast.c:40)
> Call Trace:
> vxlan_multicast_join (drivers/net/vxlan/vxlan_multicast.c:195)
> vxlan_open (drivers/net/vxlan/vxlan_core.c:2965)
> __dev_open (net/core/dev.c:1704)
> __dev_change_flags (net/core/dev.c:9781)
> do_setlink.isra.0 (net/core/rtnetlink.c:3180)
> rtnl_newlink (net/core/rtnetlink.c:4238)
> rtnetlink_rcv_msg (net/core/rtnetlink.c:6921)
>
> Skip the IPv6 multicast join/leave when vn6_sock is NULL, consistent
> with how vxlan_sock_add() tolerates missing IPv6 support.
>
> Fixes: f9c4bb0b245c ("vxlan: vni filtering support on collect metadata device")
> Reported-by: Xiang Mei <xmei5@asu.edu>
> Signed-off-by: Weiming Shi <bestswngs@gmail.com>
AFAICT, this is the same patch as:
https://lore.kernel.org/netdev/20260323095544.3311285-4-bestswngs@gmail.com/
If you disagree with the feedback, then please comment there instead of
reposting the patch.
> ---
> v2:
> - drop sock4 NULL checks
>
> drivers/net/vxlan/vxlan_multicast.c | 6 ++++++
> 1 file changed, 6 insertions(+)
>
> diff --git a/drivers/net/vxlan/vxlan_multicast.c b/drivers/net/vxlan/vxlan_multicast.c
> index a7f2d67dc61b..e6aa5ab1c939 100644
> --- a/drivers/net/vxlan/vxlan_multicast.c
> +++ b/drivers/net/vxlan/vxlan_multicast.c
> @@ -37,6 +37,9 @@ int vxlan_igmp_join(struct vxlan_dev *vxlan, union vxlan_addr *rip,
> } else {
> struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);
>
> + if (!sock6)
> + return 0;
> +
> sk = sock6->sock->sk;
> lock_sock(sk);
> ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
This line changed in commit 29ae61b2fe7e ("drivers: net: drop ipv6_stub
usage and use direct function calls")
> @@ -71,6 +74,9 @@ int vxlan_igmp_leave(struct vxlan_dev *vxlan, union vxlan_addr *rip,
> } else {
> struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);
>
> + if (!sock6)
> + return 0;
> +
> sk = sock6->sock->sk;
> lock_sock(sk);
> ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
> --
> 2.43.0
>
^ permalink raw reply
* Re: [PATCH net] ipv6: fix possible UAF in icmpv6_rcv()
From: Ido Schimmel @ 2026-04-18 15:47 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
David Ahern, netdev, eric.dumazet
In-Reply-To: <20260416103505.2380753-1-edumazet@google.com>
On Thu, Apr 16, 2026 at 10:35:05AM +0000, Eric Dumazet wrote:
> Caching saddr and daddr before pskb_pull() is problematic
> since skb->head can change.
>
> Remove these temporary variables:
>
> - We only access &ipv6_hdr(skb)->saddr and &ipv6_hdr(skb)->daddr
> when net_dbg_ratelimited() is called in the slow path.
>
> - Avoid potential future misuse after pskb_pull() call.
>
> Fixes: 4b3418fba0fe ("ipv6: icmp: include addresses in debug messages")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
^ permalink raw reply
* Re: [PATCH] net: hamachi: fix divide by zero in hamachi_init_one
From: Andrew Lunn @ 2026-04-18 15:34 UTC (permalink / raw)
To: Mingyu Wang
Cc: andrew+netdev, davem, edumazet, kuba, pabeni, tglx, mingo, netdev,
linux-kernel
In-Reply-To: <20260418121804.149171-1-25181214217@stu.xidian.edu.cn>
On Sat, Apr 18, 2026 at 08:18:04PM +0800, Mingyu Wang wrote:
> During the hardware initialization phase in hamachi_init_one(), the driver
> reads the PCIClkMeas register to calculate the PCI bus frequency.
>
> The current code attempts to prevent a divide-by-zero error using a ternary
> operator: `i ? 2000/(i&0x7f) : 0`. However, this check is flawed. The highest
> bit of `i` (0x80) acts as a ready flag. If unreliable hardware or a malicious
> virtual device returns a value where the ready bit is set but the lower 7 bits
> are zero (e.g., 0x80), the condition `i` evaluates to true, but `(i & 0x7f)`
> evaluates to 0. This results in a fatal divide-by-zero exception.
>
> This bug was discovered during an automated virtual device fuzzing campaign
> testing the hardware-software trust boundary. When the hardware returns 0x80,
> it bypassed the readiness while-loop but triggered the divide error. In our
> tests, this panic interrupted the module loading process, further triggering
> a KASAN slab-out-of-bounds in the module error path, and ultimately leading
> to a multi-core soft lockup and RCU stall.
Isn't that a good result of somebody trying to use emulated hardware
with bad behaviour? The machine grinds to a halt? So it is not
exploitable.
What happens with your patch in place? How are you reporting the
hardware is attacking the machine, and the hardware should not be
trusted?
Andrew
^ permalink raw reply
* Re: [PATCH] net: ipv4: igmp: add sysctl option to ignore inbound llm_reports
From: Ido Schimmel @ 2026-04-18 15:29 UTC (permalink / raw)
To: Steffen Trumtrar
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Jonathan Corbet, Shuah Khan, David Ahern, netdev,
linux-doc, linux-kernel
In-Reply-To: <20260415-v7-0-topic-igmp-llm-drop-v1-1-1367bfbb898e@pengutronix.de>
On Wed, Apr 15, 2026 at 12:26:13PM +0200, Steffen Trumtrar wrote:
> Add a new sysctl option 'igmp_link_local_mcast_reports_drop' that allows
> dropping inbound IGMP reports for link-local multicast groups in the
> 224.0.0.X range. This can be used to prevent the local system from
> processing IGMP reports for link local multicast groups and therefore
> let the kernel still send the own outbound IGMP reports.
OK, but what is the motivation to keep sending IGMP reports for
link-local multicast groups when the host already received such reports
from other hosts on the network? Why are link-local groups special in
this case?
AFAICT, igmp_heard_report() implements report suppression according to
RFC 2236 and it doesn't mention special behavior for link-local groups:
"If the host receives another host's Report (version 1 or 2) while it
has a timer running, it stops its timer for the specified group and does
not send a Report, in order to suppress duplicate Reports."
Also, I'm not convinced we need a new sysctl (that we will need to keep
forever) for this. It should be possible to drop such packets using tc
(tc-u32 / tc-bpf) or netfilter.
[...]
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index 6921d8594b849..2da4cd6ac7202 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -2306,6 +2306,18 @@ igmp_link_local_mcast_reports - BOOLEAN
>
> Default TRUE
>
> +igmp_link_local_mcast_reports_drop - BOOLEAN
> + Drop inbound IGMP reports for link local multicast groups in
> + the 224.0.0.X range. When enabled, IGMP membership reports for
> + link local multicast addresses are silently dropped without
> + processing.
> + When the kernel gets inbound IGMP reports it stops sending its own
> + IGMP reports. By dropping the inbound reports instead of processing
> + them, the kernel will not stop sending its own reports, even when
> + IGMP reports from other hosts are seen on the network.
> +
> + Default FALSE
[...]
> diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
> index a674fb44ec25b..3a4932e4108bd 100644
> --- a/net/ipv4/igmp.c
> +++ b/net/ipv4/igmp.c
> @@ -931,6 +931,8 @@ static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
> if (ipv4_is_local_multicast(group) &&
> !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
> return false;
> + if (READ_ONCE(net->ipv4.sysctl_igmp_llm_reports_drop))
> + return true;
>
> rcu_read_lock();
> for_each_pmc_rcu(in_dev, im) {
The documentation says that this sysctl is specifically about link-local
groups, but it drops reports from all groups...
^ permalink raw reply
* Re: pre-boot plugged SFP autoneg advertisement
From: Andrew Lunn @ 2026-04-18 15:25 UTC (permalink / raw)
To: markus.stockhausen
Cc: linux, hkallweit1, netdev, 'Jonas Jelonek', jan
In-Reply-To: <007c01dccf15$9b4622c0$d1d26840$@gmx.de>
On Sat, Apr 18, 2026 at 11:27:40AM +0200, markus.stockhausen@gmx.de wrote:
> Hi,
>
> I'm currently analyzing an issue where a pre-boot-plugged SFP module
> comes up with autoneg=no advertisement during boot. After an
> unplug/replug autoneg=yes advertisement is chosen.
>
> The following addition in phylink_start() just before the call to
> phylink_mac_initial_config() mitigates this.
>
> + /* If an SFP module was already present before phylink_start() was
> + * called, phylink_sfp_set_config() was unable to call
> + * phylink_mac_initial_config() as phylink was not yet started.
> + * Ensure the SFP capabilities are reflected in advertising.
> + */
> + if (pl->sfp_bus && !linkmode_empty(pl->sfp_support))
> + linkmode_copy(pl->link_config.advertising, pl->sfp_support);
Let me see if I have the call chain correct. This is net-next/main
from today.
phylink_sfp_connect_phy() ->
phylink_sfp_config_phy
if (changed && !test_bit(PHYLINK_DISABLE_STOPPED,
&pl->phylink_disable_state))
phylink_mac_initial_config(pl, false);
You are saying PHYLINK_DISABLE_STOPPED is set, so
phylink_mac_initial_config() is not called.
What I don't see is how phylink_mac_initial_config() does the
linkmode_copy() you are adding.
Andrew
^ permalink raw reply
* Re: [PATCH net v2] slip: reject VJ receive packets on instances with no rstate array
From: Simon Horman @ 2026-04-18 15:19 UTC (permalink / raw)
To: Weiming Shi
Cc: Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, netdev, Xiang Mei
In-Reply-To: <aeOXJoeq6VkCnqAH@SLSGDTSWING002>
On Sat, Apr 18, 2026 at 10:37:26PM +0800, Weiming Shi wrote:
> On 26-04-18 13:39, Simon Horman wrote:
> > On Thu, Apr 16, 2026 at 04:41:31AM +0800, Weiming Shi wrote:
...
> > I do note that Sashiko flags some other problems in this code.
> > I do not think that needs to delay progress of this patch.
> > But you may wish to look into them as follow-up work.
>
> Thanks for your review.
>
> I've already sent two follow-up patches for the decode()/pull16()
> bounds-checking issues:
>
> [PATCH net] slip: fix slab-out-of-bounds write in slhc_uncompress()
> https://lore.kernel.org/netdev/20260415213359.335657-2-bestswngs@gmail.com/
>
> [PATCH net] slip: bound decode() reads against the compressed packet length
> https://lore.kernel.org/netdev/20260416100147.531855-5-bestswngs@gmail.com/
Great, thanks!
^ permalink raw reply
* Re: [PATCH net-next v4 5/5] selftests: net: bridge: add MRC and QQIC field encoding tests
From: Ido Schimmel @ 2026-04-18 14:49 UTC (permalink / raw)
To: Ujjal Roy
Cc: David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Nikolay Aleksandrov, David Ahern, Shuah Khan,
Andy Roulin, Yong Wang, Petr Machata, Ujjal Roy, bridge, netdev,
linux-kernel, linux-kselftest
In-Reply-To: <CAE2MWkmvdAVMBvJ9xKgEzjJZ010=oY_ZoG==FBjHEisHEMrS8Q@mail.gmail.com>
On Fri, Apr 17, 2026 at 11:27:06AM +0530, Ujjal Roy wrote:
> On Mon, Apr 13, 2026 at 2:18 PM Ido Schimmel <idosch@nvidia.com> wrote:
> >
> > See some comments below, but note that net-next is closed:
> >
> > https://lore.kernel.org/netdev/20260412142250.131bf997@kernel.org/
> >
> > So you can either wait with v5 until it is open again or post it as RFC
> > so that we can at least review (but not merge) it while net-next is
> > closed.
>
> Let me clear the changes asked here inline, so that I will be prepared
> with v5 until net-next is open. You can ask me to send it as RFC v5,
> if you have doubts about inline answers.
I checked the proposed changes and they look fine to me.
Thanks
^ permalink raw reply
* Re: [PATCH net v2] slip: reject VJ receive packets on instances with no rstate array
From: Weiming Shi @ 2026-04-18 14:37 UTC (permalink / raw)
To: Simon Horman
Cc: Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, netdev, Xiang Mei
In-Reply-To: <20260418123929.GE280379@horms.kernel.org>
On 26-04-18 13:39, Simon Horman wrote:
> On Thu, Apr 16, 2026 at 04:41:31AM +0800, Weiming Shi wrote:
> > slhc_init() accepts rslots == 0 as a valid configuration, with the
> > documented meaning of 'no receive compression'. In that case the
> > allocation loop in slhc_init() is skipped, so comp->rstate stays
> > NULL and comp->rslot_limit stays 0 (from the kzalloc of struct
> > slcompress).
> >
> > The receive helpers do not defend against that configuration.
> > slhc_uncompress() dereferences comp->rstate[x] when the VJ header
> > carries an explicit connection ID, and slhc_remember() later assigns
> > cs = &comp->rstate[...] after only comparing the packet's slot number
> > to comp->rslot_limit. Because rslot_limit is 0, slot 0 passes the
> > range check, and the code dereferences a NULL rstate.
> >
> > The configuration is reachable in-tree through PPP. PPPIOCSMAXCID
> > stores its argument in a signed int, and (val >> 16) uses arithmetic
> > shift. Passing 0xffff0000 therefore sign-extends to -1, so val2 + 1
> > is 0 and ppp_generic.c ends up calling slhc_init(0, 1). Because
> > /dev/ppp open is gated by ns_capable(CAP_NET_ADMIN), the whole path
> > is reachable from an unprivileged user namespace. Once the malformed
> > VJ state is installed, any inbound VJ-compressed or VJ-uncompressed
> > frame that selects slot 0 crashes the kernel in softirq context:
> >
> > Oops: general protection fault, probably for non-canonical
> > address 0xdffffc0000000000: 0000 [#1] SMP KASAN NOPTI
> > KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
> > RIP: 0010:slhc_uncompress (drivers/net/slip/slhc.c:519)
> > Call Trace:
> > <TASK>
> > ppp_receive_nonmp_frame (drivers/net/ppp/ppp_generic.c:2466)
> > ppp_input (drivers/net/ppp/ppp_generic.c:2359)
> > ppp_async_process (drivers/net/ppp/ppp_async.c:492)
> > tasklet_action_common (kernel/softirq.c:926)
> > handle_softirqs (kernel/softirq.c:623)
> > run_ksoftirqd (kernel/softirq.c:1055)
> > smpboot_thread_fn (kernel/smpboot.c:160)
> > kthread (kernel/kthread.c:436)
> > ret_from_fork (arch/x86/kernel/process.c:164)
> > </TASK>
> >
> > Reject the receive side on such instances instead of touching rstate.
> > slhc_uncompress() falls through to its existing 'bad' label, which
> > bumps sls_i_error and enters the toss state. slhc_remember() mirrors
> > that with an explicit sls_i_error increment followed by slhc_toss();
> > the sls_i_runt counter is not used here because a missing rstate is
> > an internal configuration state, not a runt packet.
> >
> > The transmit path is unaffected: the only in-tree caller that picks
> > rslots from userspace (ppp_generic.c) still supplies tslots >= 1, and
> > slip.c always calls slhc_init(16, 16), so comp->tstate remains valid
> > and slhc_compress() continues to work.
> >
> > Fixes: b5451d783ade ("slip: Move the SLIP drivers")
>
> AI review points out that the cited commit moves code but doesn't
> add this bug.
>
> It seems to me that this bug has existed since the beginning of git
> history. If so, the Fixes tag should be:
>
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
>
> > Reported-by: Xiang Mei <xmei5@asu.edu>
> > Signed-off-by: Weiming Shi <bestswngs@gmail.com>
> > ---
> > v2:
> > - slhc_remember(): use sls_i_error instead of sls_i_runt for the
> > missing-rstate case; it is a configuration error, not a runt packet
> > (Simon).
> > - slhc_uncompress(): goto bad instead of returning 0, so the instance
> > also enters SLF_TOSS on the first rejected frame.
>
> Otherwise this looks good to me:
>
> Reviewed-by: Simon Horman <horms@kernel.org>
>
>
> I do note that Sashiko flags some other problems in this code.
> I do not think that needs to delay progress of this patch.
> But you may wish to look into them as follow-up work.
Thanks for your review.
I've already sent two follow-up patches for the decode()/pull16()
bounds-checking issues:
[PATCH net] slip: fix slab-out-of-bounds write in slhc_uncompress()
https://lore.kernel.org/netdev/20260415213359.335657-2-bestswngs@gmail.com/
[PATCH net] slip: bound decode() reads against the compressed packet length
https://lore.kernel.org/netdev/20260416100147.531855-5-bestswngs@gmail.com/
Best regards,
Weiming Shi
^ permalink raw reply
* Re: [PATCH net] ipv6: Implement limits on extension header parsing
From: Justin Iurman @ 2026-04-18 14:15 UTC (permalink / raw)
To: Eric Dumazet
Cc: Daniel Borkmann, kuba, dsahern, tom, willemdebruijn.kernel,
idosch, pabeni, netdev
In-Reply-To: <75d98880-afcd-43f9-8bd5-b874fa5690f5@gmail.com>
On 4/18/26 15:46, Justin Iurman wrote:
> On 4/18/26 15:15, Eric Dumazet wrote:
>> On Sat, Apr 18, 2026 at 5:50 AM Justin Iurman
>> <justin.iurman@gmail.com> wrote:
>>>
>>> On 4/18/26 14:26, Daniel Borkmann wrote:
>>>> Hi Justin,
>>>>
>>>> On 4/18/26 1:45 PM, Justin Iurman wrote:
>>>>> On 4/17/26 19:18, Daniel Borkmann wrote:
>>>> [...]
>>>>>> diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
>>>>>> index d2cd33e2698d..93f865545a7c 100644
>>>>>> --- a/net/ipv6/sysctl_net_ipv6.c
>>>>>> +++ b/net/ipv6/sysctl_net_ipv6.c
>>>>>> @@ -135,6 +135,14 @@ static struct ctl_table ipv6_table_template[]
>>>>>> = {
>>>>>> .extra1 = SYSCTL_ZERO,
>>>>>> .extra2 = &flowlabel_reflect_max,
>>>>>> },
>>>>>> + {
>>>>>> + .procname = "max_ext_hdrs_number",
>>>>>> + .data = &init_net.ipv6.sysctl.max_ext_hdrs_cnt,
>>>>>> + .maxlen = sizeof(int),
>>>>>> + .mode = 0644,
>>>>>> + .proc_handler = proc_dointvec_minmax,
>>>>>> + .extra1 = SYSCTL_ONE,
>>>>>> + },
>>>>>> {
>>>>>> .procname = "max_dst_opts_number",
>>>>>> .data = &init_net.ipv6.sysctl.max_dst_opts_cnt,
>>>>>
>>>>> NACKed-by: Justin Iurman <justin.iurman@gmail.com>
>>>>>
>>>>> +1000 on the need, but NAK on the way it is done. IMO, we don't want
>>>>> yet-another-sysctl for that. Instead, we have (well, not yet, but it's
>>>>> about time) this series [1] to enforce ordering and occurrences of
>>>>> Extension Headers, which is based on an IETF draft [2] (FYI, draft-
>>>>> ietf-6man-eh-limits is dead). I think we should enforce ordering and
>>>>> occurrences in this code path too, instead of relying on a sysctl.
>>>>> Let's keep both code paths consistent.
>>>
>>> Hi Daniel,
>>>
>>>> Hm, that series [1] should probably go to net instead of net-next,
>>>> but atm
>>>
>>> +1, would make sense.
>>>
>>>> hasn't moved since a month. I'd still think max_ext_hdrs_number
>>>> would be
>>>> useful given it has less complexity also for stable, but I guess
>>>> ultimately
>>>> up to maintainers..
>>>
>>> In the short term, I agree. What worries me is that we end up with a
>>> redundant, or even useless, sysctl once the other series is applied,
>>> which will only increase user confusion.
>>
>> Given the amount of bugs in this code, a sysctl is safe and quite
>> reasonable.
>>
>> No one will object when it is eventually removed (or has no action)
>>
>> For the record, I approve Daniel's patch.
>
> Fair enough. If there is consensus on this patch, then let me just
> suggest two changes:
>
> - make it clear in the sysctl description that it mainly applies to TX
> (as opposed to the other series [1] discussed earlier that applies to RX)
Sorry, I meant it does not apply to core RX (ip6_rcv()), which is what
series [1] does.
> - set the default to 8 (which should be the max value) instead of 32, as
> per RFC8200, Sec. 4.1
^ permalink raw reply
* [PATCH net v2] net/rds: zero per-item info buffer before handing it to visitors
From: Michael Bommarito @ 2026-04-18 14:10 UTC (permalink / raw)
To: Allison Henderson, David S . Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Sharath Srinivasan, Simon Horman, netdev, linux-rdma, rds-devel,
linux-kernel, stable
In-Reply-To: <20260417141916.494761-1-michael.bommarito@gmail.com>
rds_for_each_conn_info() and rds_walk_conn_path_info() both hand a
caller-allocated on-stack u64 buffer to a per-connection visitor and
then copy the full item_len bytes back to user space via
rds_info_copy() regardless of how much of the buffer the visitor
actually wrote.
rds_ib_conn_info_visitor() and rds6_ib_conn_info_visitor() only
write a subset of their output struct when the underlying
rds_connection is not in state RDS_CONN_UP (src/dst addr, tos, sl
and the two GIDs via explicit memsets). Several u32 fields
(max_send_wr, max_recv_wr, max_send_sge, rdma_mr_max, rdma_mr_size,
cache_allocs) and the 2-byte alignment hole between sl and
cache_allocs remain as whatever stack contents preceded the visitor
call and are then memcpy_to_user()'d out to user space.
struct rds_info_rdma_connection and struct rds6_info_rdma_connection
are the only rds_info_* structs in include/uapi/linux/rds.h that are
not marked __attribute__((packed)), so they have a real alignment
hole. The other info visitors (rds_conn_info_visitor,
rds6_conn_info_visitor, rds_tcp_tc_info, ...) write all fields of
their packed output struct today and are not known to be vulnerable,
but a future visitor that adds a conditional write-path would have
the same bug.
Reproduction on a kernel built without CONFIG_INIT_STACK_ALL_ZERO=y:
a local unprivileged user opens AF_RDS, sets SO_RDS_TRANSPORT=IB,
binds to a local address on an RDMA-capable netdev (rxe soft-RoCE on
any netdev is sufficient), sendto()'s any peer on the same subnet
(fails cleanly but installs an rds_connection in the global hash in
RDS_CONN_CONNECTING), then calls getsockopt(SOL_RDS,
RDS_INFO_IB_CONNECTIONS). The returned 68-byte item contains 26
bytes of stack garbage including kernel text/data pointers:
0..7 0a 63 00 01 0a 63 00 02 src=10.99.0.1 dst=10.99.0.2
8..39 00 ... gids (memset-zeroed)
40..47 e0 92 a3 81 ff ff ff ff kernel pointer (max_send_wr)
48..55 7f 37 b5 81 ff ff ff ff kernel pointer (rdma_mr_max)
56..59 01 00 08 00 rdma_mr_size (garbage)
60..61 00 00 tos, sl
62..63 00 00 alignment padding
64..67 18 00 00 00 cache_allocs (garbage)
Fix by zeroing the per-item buffer in both rds_for_each_conn_info()
and rds_walk_conn_path_info() before invoking the visitor. This
covers the IPv4/IPv6 IB visitors and hardens all current and future
visitors against the same class of bug.
No functional change for visitors that fully populate their output.
Changes in v2:
- retarget at the net tree (subject prefix "[PATCH net v2]",
net/rds: prefix in the title)
- add Cc: stable@vger.kernel.org
- pick up Reviewed-by tags from Sharath Srinivasan and
Allison Henderson
Fixes: ec16227e1414 ("RDS/IB: Infiniband transport")
Cc: stable@vger.kernel.org
Signed-off-by: Michael Bommarito <michael.bommarito@gmail.com>
Reviewed-by: Sharath Srinivasan <sharath.srinivasan@oracle.com>
Reviewed-by: Allison Henderson <achender@kernel.org>
Assisted-by: Claude:claude-opus-4-7
---
net/rds/connection.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 412441aaa298..c10b7ed06c49 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -701,6 +701,13 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
i++, head++) {
hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+ /* Zero the per-item buffer before handing it to the
+ * visitor so any field the visitor does not write -
+ * including implicit alignment padding - cannot leak
+ * stack contents to user space via rds_info_copy().
+ */
+ memset(buffer, 0, item_len);
+
/* XXX no c_lock usage.. */
if (!visitor(conn, buffer))
continue;
@@ -750,6 +757,13 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
*/
cp = conn->c_path;
+ /* Zero the per-item buffer for the same reason as
+ * rds_for_each_conn_info(): any byte the visitor
+ * does not write (including alignment padding) must
+ * not leak stack contents via rds_info_copy().
+ */
+ memset(buffer, 0, item_len);
+
/* XXX no cp_lock usage.. */
if (!visitor(cp, buffer))
continue;
--
2.53.0
^ permalink raw reply related
* Re: [PATCH net] ipv6: Implement limits on extension header parsing
From: Justin Iurman @ 2026-04-18 13:46 UTC (permalink / raw)
To: Eric Dumazet
Cc: Daniel Borkmann, kuba, dsahern, tom, willemdebruijn.kernel,
idosch, pabeni, netdev
In-Reply-To: <CANn89i+Y0jctj8=tCHFP5jDSJBAWR=RvNfagammc-WqU6EdPRw@mail.gmail.com>
On 4/18/26 15:15, Eric Dumazet wrote:
> On Sat, Apr 18, 2026 at 5:50 AM Justin Iurman <justin.iurman@gmail.com> wrote:
>>
>> On 4/18/26 14:26, Daniel Borkmann wrote:
>>> Hi Justin,
>>>
>>> On 4/18/26 1:45 PM, Justin Iurman wrote:
>>>> On 4/17/26 19:18, Daniel Borkmann wrote:
>>> [...]
>>>>> diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
>>>>> index d2cd33e2698d..93f865545a7c 100644
>>>>> --- a/net/ipv6/sysctl_net_ipv6.c
>>>>> +++ b/net/ipv6/sysctl_net_ipv6.c
>>>>> @@ -135,6 +135,14 @@ static struct ctl_table ipv6_table_template[] = {
>>>>> .extra1 = SYSCTL_ZERO,
>>>>> .extra2 = &flowlabel_reflect_max,
>>>>> },
>>>>> + {
>>>>> + .procname = "max_ext_hdrs_number",
>>>>> + .data = &init_net.ipv6.sysctl.max_ext_hdrs_cnt,
>>>>> + .maxlen = sizeof(int),
>>>>> + .mode = 0644,
>>>>> + .proc_handler = proc_dointvec_minmax,
>>>>> + .extra1 = SYSCTL_ONE,
>>>>> + },
>>>>> {
>>>>> .procname = "max_dst_opts_number",
>>>>> .data = &init_net.ipv6.sysctl.max_dst_opts_cnt,
>>>>
>>>> NACKed-by: Justin Iurman <justin.iurman@gmail.com>
>>>>
>>>> +1000 on the need, but NAK on the way it is done. IMO, we don't want
>>>> yet-another-sysctl for that. Instead, we have (well, not yet, but it's
>>>> about time) this series [1] to enforce ordering and occurrences of
>>>> Extension Headers, which is based on an IETF draft [2] (FYI, draft-
>>>> ietf-6man-eh-limits is dead). I think we should enforce ordering and
>>>> occurrences in this code path too, instead of relying on a sysctl.
>>>> Let's keep both code paths consistent.
>>
>> Hi Daniel,
>>
>>> Hm, that series [1] should probably go to net instead of net-next, but atm
>>
>> +1, would make sense.
>>
>>> hasn't moved since a month. I'd still think max_ext_hdrs_number would be
>>> useful given it has less complexity also for stable, but I guess ultimately
>>> up to maintainers..
>>
>> In the short term, I agree. What worries me is that we end up with a
>> redundant, or even useless, sysctl once the other series is applied,
>> which will only increase user confusion.
>
> Given the amount of bugs in this code, a sysctl is safe and quite reasonable.
>
> No one will object when it is eventually removed (or has no action)
>
> For the record, I approve Daniel's patch.
Fair enough. If there is consensus on this patch, then let me just
suggest two changes:
- make it clear in the sysctl description that it mainly applies to TX
(as opposed to the other series [1] discussed earlier that applies to RX)
- set the default to 8 (which should be the max value) instead of 32, as
per RFC8200, Sec. 4.1
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox