Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v2 net-next 05/15] ip6mr: Convert ip6mr_rtm_getroute() to RCU.
From: Kuniyuki Iwashima @ 2026-04-10 21:17 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

ip6mr_rtm_getroute() calls __ip6mr_get_table(), ip6mr_cache_find(),
and ip6mr_fill_mroute().

Once created, struct mr_table is not freed until netns dismantle,
so it's safe under RCU.

ip6mr_cache_find() iterates mrt->mfc_hash with rhl_for_each_entry_rcu().
struct mr_mfc is freed with call_rcu(), so this is also safe under
RCU.

ip6mr_fill_mroute() calls mr_fill_mroute(), which properly uses
RCU helpers.

Let's call them under RCU and register ip6mr_rtm_getroute() with
RTNL_FLAG_DOIT_UNLOCKED.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv6/ip6mr.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 5356957bfe94..0054db00fadf 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1388,7 +1388,8 @@ static struct pernet_operations ip6mr_net_ops = {
 static const struct rtnl_msg_handler ip6mr_rtnl_msg_handlers[] __initconst_or_module = {
 	{.owner = THIS_MODULE, .protocol = RTNL_FAMILY_IP6MR,
 	 .msgtype = RTM_GETROUTE,
-	 .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute},
+	 .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute,
+	 .flags = RTNL_FLAG_DOIT_UNLOCKED},
 };
 
 int __init ip6_mr_init(void)
@@ -2712,6 +2713,8 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		grp = nla_get_in6_addr(tb[RTA_DST]);
 	tableid = nla_get_u32_default(tb[RTA_TABLE], 0);
 
+	rcu_read_lock();
+
 	mrt = __ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT);
 	if (!mrt) {
 		NL_SET_ERR_MSG_MOD(extack, "MR table does not exist");
@@ -2719,10 +2722,7 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		goto err;
 	}
 
-	/* entries are added/deleted only under RTNL */
-	rcu_read_lock();
 	cache = ip6mr_cache_find(mrt, &src, &grp);
-	rcu_read_unlock();
 	if (!cache) {
 		NL_SET_ERR_MSG_MOD(extack, "MR cache entry not found");
 		err = -ENOENT;
@@ -2734,9 +2734,12 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		goto err;
 
+	rcu_read_unlock();
+
 	return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
 
 err:
+	rcu_read_unlock();
 	kfree_skb(skb);
 	return err;
 }
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 04/15] ip6mr: Allocate skb earlier in ip6mr_rtm_getroute().
From: Kuniyuki Iwashima @ 2026-04-10 21:17 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

We will convert ip6mr_rtm_getroute() to RCU in the following patch,
where __ip6mr_get_table() will be called under RCU.

nlmsg_new() uses GFP_KERNEL and needs to be called before holding
rcu_read_lock().

As a prep, let's move nlmsg_new() before __ip6mr_get_table().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv6/ip6mr.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 7ea572db9075..5356957bfe94 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2702,6 +2702,10 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		return err;
 
+	skb = nlmsg_new(mr6_msgsize(false), GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+
 	if (tb[RTA_SRC])
 		src = nla_get_in6_addr(tb[RTA_SRC]);
 	if (tb[RTA_DST])
@@ -2711,7 +2715,8 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	mrt = __ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT);
 	if (!mrt) {
 		NL_SET_ERR_MSG_MOD(extack, "MR table does not exist");
-		return -ENOENT;
+		err = -ENOENT;
+		goto err;
 	}
 
 	/* entries are added/deleted only under RTNL */
@@ -2720,21 +2725,20 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	rcu_read_unlock();
 	if (!cache) {
 		NL_SET_ERR_MSG_MOD(extack, "MR cache entry not found");
-		return -ENOENT;
+		err = -ENOENT;
+		goto err;
 	}
 
-	skb = nlmsg_new(mr6_msgsize(false), GFP_KERNEL);
-	if (!skb)
-		return -ENOBUFS;
-
 	err = ip6mr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid,
 				nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0);
-	if (err < 0) {
-		kfree_skb(skb);
-		return err;
-	}
+	if (err < 0)
+		goto err;
 
 	return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+
+err:
+	kfree_skb(skb);
+	return err;
 }
 
 static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 03/15] ip6mr: Use MAXMIFS in mr6_msgsize().
From: Kuniyuki Iwashima @ 2026-04-10 21:16 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

mr6_msgsize() calculates skb size needed for ip6mr_fill_mroute().

The size differs based on mrt->maxvif.

We will drop RTNL for ip6mr_rtm_getroute() and mrt->maxvif may
change under RCU.

To avoid -EMSGSIZE, let's calculate the size with the maximum
value of mrt->maxvif, MAXMIFS.

struct rtnexthop is 8 bytes and MAXMIFS is 32, so the maximum delta
is 256 bytes, which is small enough.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv6/ip6mr.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index b263d3c69a5a..7ea572db9075 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2539,7 +2539,7 @@ static int _ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 				 cmd, flags);
 }
 
-static int mr6_msgsize(bool unresolved, int maxvif)
+static int mr6_msgsize(bool unresolved)
 {
 	size_t len =
 		NLMSG_ALIGN(sizeof(struct rtmsg))
@@ -2552,7 +2552,7 @@ static int mr6_msgsize(bool unresolved, int maxvif)
 		len = len
 		      + nla_total_size(4)	/* RTA_IIF */
 		      + nla_total_size(0)	/* RTA_MULTIPATH */
-		      + maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
+		      + MAXMIFS * NLA_ALIGN(sizeof(struct rtnexthop))
 						/* RTA_MFC_STATS */
 		      + nla_total_size_64bit(sizeof(struct rta_mfc_stats))
 		;
@@ -2567,8 +2567,7 @@ static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
 	struct sk_buff *skb;
 	int err = -ENOBUFS;
 
-	skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS, mrt->maxvif),
-			GFP_ATOMIC);
+	skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS), GFP_ATOMIC);
 	if (!skb)
 		goto errout;
 
@@ -2724,7 +2723,7 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		return -ENOENT;
 	}
 
-	skb = nlmsg_new(mr6_msgsize(false, mrt->maxvif), GFP_KERNEL);
+	skb = nlmsg_new(mr6_msgsize(false), GFP_KERNEL);
 	if (!skb)
 		return -ENOBUFS;
 
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 02/15] ip6mr: Annotate access to mrt->mroute_do_{pim,assert,wrvifwhole}.
From: Kuniyuki Iwashima @ 2026-04-10 21:16 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

These fields in struct mr_table are updated in ip6_mroute_setsockopt()
under RTNL:

  * mroute_do_pim
  * mroute_do_assert (MRT6_PIM is under RTNL while MRT6_ASSERT is lockless)
  * mroute_do_wrvifwhole

However, ip6_mroute_getsockopt() does not hold RTNL and reads the first
two fields locklessly, and ip6_mr_forward() reads all three under
RCU.

Let's use WRITE_ONCE() and READ_ONCE() for them.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv6/ip6mr.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 85010ff21c98..b263d3c69a5a 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1780,7 +1780,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval,
 			return -EINVAL;
 		if (copy_from_sockptr(&v, optval, sizeof(v)))
 			return -EFAULT;
-		mrt->mroute_do_assert = v;
+		WRITE_ONCE(mrt->mroute_do_assert, v);
 		return 0;
 	}
 
@@ -1800,9 +1800,9 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval,
 		rtnl_lock();
 		ret = 0;
 		if (v != mrt->mroute_do_pim) {
-			mrt->mroute_do_pim = v;
-			mrt->mroute_do_assert = v;
-			mrt->mroute_do_wrvifwhole = do_wrmifwhole;
+			WRITE_ONCE(mrt->mroute_do_pim, v);
+			WRITE_ONCE(mrt->mroute_do_assert, v);
+			WRITE_ONCE(mrt->mroute_do_wrvifwhole, do_wrmifwhole);
 		}
 		rtnl_unlock();
 		return ret;
@@ -1870,11 +1870,11 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
 		break;
 #ifdef CONFIG_IPV6_PIMSM_V2
 	case MRT6_PIM:
-		val = mrt->mroute_do_pim;
+		val = READ_ONCE(mrt->mroute_do_pim);
 		break;
 #endif
 	case MRT6_ASSERT:
-		val = mrt->mroute_do_assert;
+		val = READ_ONCE(mrt->mroute_do_assert);
 		break;
 	default:
 		return -ENOPROTOOPT;
@@ -2177,20 +2177,20 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
 	if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) {
 		atomic_long_inc(&c->_c.mfc_un.res.wrong_if);
 
-		if (true_vifi >= 0 && mrt->mroute_do_assert &&
+		if (true_vifi >= 0 && READ_ONCE(mrt->mroute_do_assert) &&
 		    /* pimsm uses asserts, when switching from RPT to SPT,
 		       so that we cannot check that packet arrived on an oif.
 		       It is bad, but otherwise we would need to move pretty
 		       large chunk of pimd to kernel. Ough... --ANK
 		     */
-		    (mrt->mroute_do_pim ||
+		    (READ_ONCE(mrt->mroute_do_pim) ||
 		     c->_c.mfc_un.res.ttls[true_vifi] < 255) &&
 		    time_after(jiffies,
 			       c->_c.mfc_un.res.last_assert +
 			       MFC_ASSERT_THRESH)) {
 			c->_c.mfc_un.res.last_assert = jiffies;
 			ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF);
-			if (mrt->mroute_do_wrvifwhole)
+			if (READ_ONCE(mrt->mroute_do_wrvifwhole))
 				ip6mr_cache_report(mrt, skb, true_vifi,
 						   MRT6MSG_WRMIFWHOLE);
 		}
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 01/15] selftest: net: Extend ipmr.c for IP6MR.
From: Kuniyuki Iwashima @ 2026-04-10 21:16 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260410211726.1668756-1-kuniyu@google.com>

This commit extends most test cases in ipmr.c for IP6MR.

Note that IP6MR does not provide an rtnetlink interface for MFC,
so such tests will be skipped.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 tools/testing/selftests/net/forwarding/ipmr.c | 163 ++++++++++++------
 1 file changed, 110 insertions(+), 53 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/ipmr.c b/tools/testing/selftests/net/forwarding/ipmr.c
index df870aad9ead..cfd00173bcd6 100644
--- a/tools/testing/selftests/net/forwarding/ipmr.c
+++ b/tools/testing/selftests/net/forwarding/ipmr.c
@@ -2,7 +2,9 @@
 /* Copyright 2026 Google LLC */
 
 #include <linux/if.h>
+#include <linux/in6.h>
 #include <linux/mroute.h>
+#include <linux/mroute6.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
 #include <linux/socket.h>
@@ -17,6 +19,14 @@ FIXTURE(ipmr)
 	int netlink_sk;
 	int raw_sk;
 	int veth_ifindex;
+	union {
+		struct vifctl vif;
+		struct mif6ctl vif6;
+	};
+	union {
+		struct mfcctl mfc;
+		struct mf6cctl mfc6;
+	};
 };
 
 FIXTURE_VARIANT(ipmr)
@@ -25,6 +35,11 @@ FIXTURE_VARIANT(ipmr)
 	int protocol;
 	int level;
 	int opts[MRT_MAX - MRT_BASE + 1];
+	int vif_size;
+	char vif_check_cmd_pimreg[64];
+	char vif_check_cmd_veth[64];
+	int mfc_size;
+	char mfc_check_cmd[1024];
 };
 
 FIXTURE_VARIANT_ADD(ipmr, ipv4)
@@ -47,6 +62,39 @@ FIXTURE_VARIANT_ADD(ipmr, ipv4)
 		MRT_DEL_MFC_PROXY,
 		MRT_FLUSH,
 	},
+	.vif_size = sizeof(struct vifctl),
+	.vif_check_cmd_pimreg = "cat /proc/net/ip_mr_vif | grep -q pimreg",
+	.vif_check_cmd_veth = "cat /proc/net/ip_mr_vif | grep -q veth",
+	.mfc_size = sizeof(struct mfcctl),
+	.mfc_check_cmd = "cat /proc/net/ip_mr_cache | grep -q '00000000 00000000'",
+};
+
+FIXTURE_VARIANT_ADD(ipmr, ipv6)
+{
+	.family = AF_INET6,
+	.protocol = IPPROTO_ICMPV6,
+	.level = IPPROTO_IPV6,
+	.opts = {
+		MRT6_INIT,
+		MRT6_DONE,
+		MRT6_ADD_MIF,
+		MRT6_DEL_MIF,
+		MRT6_ADD_MFC,
+		MRT6_DEL_MFC,
+		MRT6_VERSION,
+		MRT6_ASSERT,
+		MRT6_PIM,
+		MRT6_TABLE,
+		MRT6_ADD_MFC_PROXY,
+		MRT6_DEL_MFC_PROXY,
+		MRT_FLUSH,
+	},
+	.vif_size = sizeof(struct mif6ctl),
+	.vif_check_cmd_pimreg = "cat /proc/net/ip6_mr_vif | grep -q pim6reg",
+	.vif_check_cmd_veth = "cat /proc/net/ip6_mr_vif | grep -q veth",
+	.mfc_size = sizeof(struct mf6cctl),
+	.mfc_check_cmd = "cat /proc/net/ip6_mr_cache | "
+		"grep -q '0000:0000:0000:0000:0000:0000:0000:0000 0000:0000:0000:0000:0000:0000:0000:0000'",
 };
 
 struct mfc_attr {
@@ -144,6 +192,18 @@ FIXTURE_SETUP(ipmr)
 	ASSERT_EQ(0, err);
 
 	self->veth_ifindex = ifr.ifr_ifindex;
+
+	if (variant->family == AF_INET) {
+		self->vif = (struct vifctl){
+			.vifc_flags = VIFF_USE_IFINDEX,
+			.vifc_lcl_ifindex = self->veth_ifindex,
+		};
+	} else {
+		self->vif6 = (struct mif6ctl){
+			.mif6c_flags = 0,
+			.mif6c_pifi = self->veth_ifindex,
+		};
+	}
 }
 
 FIXTURE_TEARDOWN(ipmr)
@@ -169,41 +229,39 @@ TEST_F(ipmr, mrt_init)
 
 TEST_F(ipmr, mrt_add_vif_register)
 {
-	struct vifctl vif = {
-		.vifc_vifi = 0,
-		.vifc_flags = VIFF_REGISTER,
-	};
 	int err;
 
+	memset(&self->vif, 0, variant->vif_size);
+
+	if (variant->family == AF_INET)
+		self->vif.vifc_flags = VIFF_REGISTER;
+	else
+		self->vif6.mif6c_flags = MIFF_REGISTER;
+
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE],
-			 &vif,  sizeof(vif));
+			 &self->vif,  variant->vif_size);
 	ASSERT_EQ(0, err);
 
-	err = system("cat /proc/net/ip_mr_vif | grep -q pimreg");
+	err = system(variant->vif_check_cmd_pimreg);
 	ASSERT_EQ(0, err);
 
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_DEL_VIF - MRT_BASE],
-			 &vif,  sizeof(vif));
+			 &self->vif,  variant->vif_size);
 	ASSERT_EQ(0, err);
 }
 
 TEST_F(ipmr, mrt_del_vif_unreg)
 {
-	struct vifctl vif = {
-		.vifc_vifi = 0,
-		.vifc_flags = VIFF_USE_IFINDEX,
-		.vifc_lcl_ifindex = self->veth_ifindex,
-	};
 	int err;
 
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE],
-			 &vif,  sizeof(vif));
+			 &self->vif,  variant->vif_size);
 	ASSERT_EQ(0, err);
 
-	err = system("cat /proc/net/ip_mr_vif | grep -q veth0");
+	err = system(variant->vif_check_cmd_veth);
 	ASSERT_EQ(0, err);
 
 	/* VIF is removed along with its device. */
@@ -213,23 +271,18 @@ TEST_F(ipmr, mrt_del_vif_unreg)
 	/* mrt->vif_table[veth_ifindex]->dev is NULL. */
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_DEL_VIF - MRT_BASE],
-			 &vif,  sizeof(vif));
+			 &self->vif,  variant->vif_size);
 	ASSERT_EQ(-1, err);
 	ASSERT_EQ(EADDRNOTAVAIL, errno);
 }
 
 TEST_F(ipmr, mrt_del_vif_netns_dismantle)
 {
-	struct vifctl vif = {
-		.vifc_vifi = 0,
-		.vifc_flags = VIFF_USE_IFINDEX,
-		.vifc_lcl_ifindex = self->veth_ifindex,
-	};
 	int err;
 
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE],
-			 &vif,  sizeof(vif));
+			 &self->vif,  variant->vif_size);
 	ASSERT_EQ(0, err);
 
 	/* Let cleanup_net() remove veth0 and VIF. */
@@ -237,49 +290,49 @@ TEST_F(ipmr, mrt_del_vif_netns_dismantle)
 
 TEST_F(ipmr, mrt_add_mfc)
 {
-	struct mfcctl mfc = {};
 	int err;
 
 	/* MRT_ADD_MFC / MRT_ADD_MFC_PROXY does not need vif to exist (unlike netlink). */
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_ADD_MFC - MRT_BASE],
-			 &mfc,  sizeof(mfc));
+			 &self->mfc, variant->mfc_size);
 	ASSERT_EQ(0, err);
 
 	/* (0.0.0.0 -> 0.0.0.0) */
-	err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' ");
+	err = system(variant->mfc_check_cmd);
 	ASSERT_EQ(0, err);
 
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_DEL_MFC - MRT_BASE],
-			 &mfc,  sizeof(mfc));
+			 &self->mfc, variant->mfc_size);
 }
 
 TEST_F(ipmr, mrt_add_mfc_proxy)
 {
-	struct mfcctl mfc = {};
 	int err;
 
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_ADD_MFC_PROXY - MRT_BASE],
-			 &mfc,  sizeof(mfc));
+			 &self->mfc, variant->mfc_size);
 	ASSERT_EQ(0, err);
 
-	err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' ");
+	err = system(variant->mfc_check_cmd);
 	ASSERT_EQ(0, err);
 
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_DEL_MFC_PROXY - MRT_BASE],
-			 &mfc,  sizeof(mfc));
+			 &self->mfc, variant->mfc_size);
 }
 
+#define SKIP_IPV6()						\
+	do {							\
+		if (variant->family == AF_INET6)		\
+			SKIP(return,				\
+			     "no netlink MFC interface");	\
+	} while (0)
+
 TEST_F(ipmr, mrt_add_mfc_netlink)
 {
-	struct vifctl vif = {
-		.vifc_vifi = 0,
-		.vifc_flags = VIFF_USE_IFINDEX,
-		.vifc_lcl_ifindex = self->veth_ifindex,
-	};
 	struct mfc_attr mfc_attr = {
 		.table = RT_TABLE_DEFAULT,
 		.origin = 0,
@@ -289,15 +342,17 @@ TEST_F(ipmr, mrt_add_mfc_netlink)
 	};
 	int err;
 
+	SKIP_IPV6();
+
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE],
-			 &vif,  sizeof(vif));
+			 &self->vif, variant->vif_size);
 	ASSERT_EQ(0, err);
 
 	err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr);
 	ASSERT_EQ(0, err);
 
-	err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' ");
+	err = system(variant->mfc_check_cmd);
 	ASSERT_EQ(0, err);
 
 	err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr);
@@ -306,11 +361,6 @@ TEST_F(ipmr, mrt_add_mfc_netlink)
 
 TEST_F(ipmr, mrt_add_mfc_netlink_proxy)
 {
-	struct vifctl vif = {
-		.vifc_vifi = 0,
-		.vifc_flags = VIFF_USE_IFINDEX,
-		.vifc_lcl_ifindex = self->veth_ifindex,
-	};
 	struct mfc_attr mfc_attr = {
 		.table = RT_TABLE_DEFAULT,
 		.origin = 0,
@@ -320,15 +370,17 @@ TEST_F(ipmr, mrt_add_mfc_netlink_proxy)
 	};
 	int err;
 
+	SKIP_IPV6();
+
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE],
-			 &vif,  sizeof(vif));
+			 &self->vif, variant->vif_size);
 	ASSERT_EQ(0, err);
 
 	err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr);
 	ASSERT_EQ(0, err);
 
-	err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' ");
+	err = system(variant->mfc_check_cmd);
 	ASSERT_EQ(0, err);
 
 	err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr);
@@ -345,6 +397,8 @@ TEST_F(ipmr, mrt_add_mfc_netlink_no_vif)
 	};
 	int err;
 
+	SKIP_IPV6();
+
 	/* netlink always requires RTA_IIF of an existing vif. */
 	mfc_attr.ifindex = 0;
 	err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr);
@@ -378,6 +432,8 @@ TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle)
 	};
 	int i, err;
 
+	SKIP_IPV6();
+
 	for (i = 0; i < 2; i++) {
 		/* Create 2 VIFs just to avoid -ENFILE later. */
 		err = setsockopt(self->raw_sk,
@@ -390,7 +446,7 @@ TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle)
 	err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr);
 	ASSERT_EQ(0, err);
 
-	err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' ");
+	err = system(variant->mfc_check_cmd);
 	ASSERT_EQ(0, err);
 
 	/* Remove mrt->vif_table[0]. */
@@ -398,7 +454,7 @@ TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle)
 	ASSERT_EQ(0, err);
 
 	/* MFC entry is NOT removed even if the tied VIF is removed... */
-	err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' ");
+	err = system(variant->mfc_check_cmd);
 	ASSERT_EQ(0, err);
 
 	/* ... and netlink is not capable of removing such an entry
@@ -412,11 +468,6 @@ TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle)
 
 TEST_F(ipmr, mrt_table_flush)
 {
-	struct vifctl vif = {
-		.vifc_vifi = 0,
-		.vifc_flags = VIFF_USE_IFINDEX,
-		.vifc_lcl_ifindex = self->veth_ifindex,
-	};
 	struct mfc_attr mfc_attr = {
 		.origin = 0,
 		.group = 0,
@@ -436,11 +487,17 @@ TEST_F(ipmr, mrt_table_flush)
 
 	err = setsockopt(self->raw_sk,
 			 variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE],
-			 &vif,  sizeof(vif));
+			 &self->vif,  variant->vif_size);
 	ASSERT_EQ(0, err);
 
-	mfc_attr.table = table_id;
-	err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr);
+	if (variant->family == AF_INET) {
+		mfc_attr.table = table_id;
+		err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr);
+	} else {
+		err = setsockopt(self->raw_sk,
+				 variant->level, variant->opts[MRT_ADD_MFC - MRT_BASE],
+				 &self->mfc, variant->mfc_size);
+	}
 	ASSERT_EQ(0, err);
 
 	/* Flush mrt->vif_table[] and all caches. */
-- 
2.53.0.1213.gd9a14994de-goog


^ permalink raw reply related

* [PATCH v2 net-next 00/15] ip6mr: No RTNL for RTNL_FAMILY_IP6MR rtnetlink.
From: Kuniyuki Iwashima @ 2026-04-10 21:16 UTC (permalink / raw)
  To: David S . Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev

This series is the IPv6 version of

  https://lore.kernel.org/netdev/20260228221800.1082070-1-kuniyu@google.com/

and removes RTNL from ip6mr rtnetlink handlers.

After this series, there are a few RTNL users left in net/ipv6/ip6mr.c,
and such users will be converted to per-netns RTNL in another
series.

Patch 1 extends the ipmr selftest to exercise most of the RTNL
 paths in net/ipv6/ip6mr.c

Patches 2 - 6 convert RTM_GETROUTE handlers to RCU.

Patch 7 removes struct fib_dump_filter.rtnl_held.

Patches 8 - 9 use RCU for mr_table for CONFIG_IP_MROUTE_MULTIPLE_TABLES=n
 and CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=n for ->exit_rtnl().

Patches 10 - 12 convert ->exit_batch() to ->exit_rtnl() to
 save one RTNL in cleanup_net().

Patches 13 - 14 remove unnecessary RTNL during setup_net()
 failure.

Patch 15 drops RTNL for MRT6_(ADD|DEL)_MFC(_PROXY)?.

Changes:
  v2:
    Add patch 8 & 9 for ->exit_rtnl() conversion and
     drop 2 trivial patches (patch 2 & 14 in v1)

  v1: https://lore.kernel.org/netdev/20260407212001.2368593-1-kuniyu@google.com/

Kuniyuki Iwashima (15):
  selftest: net: Extend ipmr.c for IP6MR.
  ip6mr: Annotate access to mrt->mroute_do_{pim,assert,wrvifwhole}.
  ip6mr: Use MAXMIFS in mr6_msgsize().
  ip6mr: Allocate skb earlier in ip6mr_rtm_getroute().
  ip6mr: Convert ip6mr_rtm_getroute() to RCU.
  ip6mr: Convert ip6mr_rtm_dumproute() to RCU.
  net: Remove rtnl_held of struct fib_dump_filter.
  ipmr: Free mr_table after RCU grace period.
  ip6mr: Free mr_table after RCU grace period.
  ip6mr: Move unregister_netdevice_many() out of mroute_clean_tables().
  ip6mr: Move unregister_netdevice_many() out of ip6mr_free_table().
  ip6mr: Convert ip6mr_net_exit_batch() to ->exit_rtnl().
  ip6mr: Remove RTNL in ip6mr_rules_init() and ip6mr_net_init().
  ip6mr: Call fib_rules_unregister() without RTNL.
  ip6mr: Replace RTNL with a dedicated mutex for MFC.

 include/linux/mroute_base.h                   |   2 +
 include/net/ip_fib.h                          |   1 -
 include/net/netns/ipv6.h                      |   1 +
 net/ipv4/fib_frontend.c                       |  19 +-
 net/ipv4/ipmr.c                               |  55 +++--
 net/ipv6/ip6_fib.c                            |   1 -
 net/ipv6/ip6mr.c                              | 233 +++++++++++-------
 net/mpls/af_mpls.c                            |   6 +-
 tools/testing/selftests/net/forwarding/ipmr.c | 163 ++++++++----
 9 files changed, 289 insertions(+), 192 deletions(-)

-- 
2.53.0.1213.gd9a14994de-goog

^ permalink raw reply

* Re: [PATCH RFC net-next 02/10] net: stmmac: rename dev_id to userver
From: Jitendra Vegiraju @ 2026-04-10 21:04 UTC (permalink / raw)
  To: Russell King (Oracle)
  Cc: Andrew Lunn, Alexandre Torgue, Andrew Lunn, Chen-Yu Tsai,
	David S. Miller, Eric Dumazet, Jakub Kicinski, linux-arm-kernel,
	linux-stm32, linux-sunxi, netdev, Paolo Abeni, Samuel Holland
In-Reply-To: <adi3Vks-N0a83ylE@shell.armlinux.org.uk>

[-- Attachment #1: Type: text/plain, Size: 2139 bytes --]

On Fri, Apr 10, 2026 at 1:39 AM Russell King (Oracle)
<linux@armlinux.org.uk> wrote:
>
> On Thu, Apr 09, 2026 at 04:07:42PM -0700, Jitendra Vegiraju wrote:
> > Hi Russell,
> >
> > On Wed, Apr 8, 2026 at 2:27 AM Russell King (Oracle)
> > <rmk+kernel@armlinux.org.uk> wrote:
> > >
> > > The Synopsys Databook and several implementation TRMs identify bits
> > > 15:8 of the version register in dwmac v3.xx and v4.xx as "userver".
> > > We even print its value with "User ID". Rather than using "dev_id",
> > > use "userver" instead.
> > >
> > > Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
> > > ---
> > >  drivers/net/ethernet/stmicro/stmmac/hwif.c | 18 +++++++++---------
> > >  1 file changed, 9 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c
> > > index 3774af66db48..830ff816ab4f 100644
> > > --- a/drivers/net/ethernet/stmicro/stmmac/hwif.c
> > > +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c
> > > @@ -15,7 +15,7 @@
> > >
> > >  struct stmmac_version {
> > >         u8 snpsver;
> > > -       u8 dev_id;
> > > +       u8 userver;
> > >  };
> > From the XGMAC databook that I have access to bits(15:8) identify the
> > DEVID field of MAC_version register.
> > The userver field is from bits(23:16) of the same register. This is a
> > customer defined field (configured with coreConsultant).
> > Currently stmmac doesn't care about bits(23:16).
>
> Thanks for the additional information.
>
> I don't have any XGMAC documentation, but this indicates that it differs
> between XGMAC and previous cores - GMAC and GMAC4 cores, 15:8 are
> documented as userver, and 31:16 are marked as reserved.
>
> Note that the dev_info() also prints 15:8 as "User ID" not "Device ID".
>
> To confirm, is the XGMAC version register at offset 0x20 ? Later GMAC
> cores moved it to 0x110.
The XGMAC version register is at offset 0x110.

>
> --
> RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
> FTTP is here! 80Mbps down 10Mbps up. Decent connectivity at last!

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 5435 bytes --]

^ permalink raw reply

* Re: [PATCH net-next] tcp: add indirect call wrapper in tcp_conn_request()
From: Kuniyuki Iwashima @ 2026-04-10 20:57 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Neal Cardwell, netdev, eric.dumazet
In-Reply-To: <20260410174950.745670-1-edumazet@google.com>

On Fri, Apr 10, 2026 at 10:49 AM Eric Dumazet <edumazet@google.com> wrote:
>
> Small improvement in SYN processing, to directly call
> tcp_v6_init_seq_and_ts_off() or tcp_v4_init_seq_and_ts_off().
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>

^ permalink raw reply

* Re: [PATCH net-next 1/3] psp: add crypt-offset and spi-threshold get/set attributes
From: Jakub Kicinski @ 2026-04-10 20:57 UTC (permalink / raw)
  To: Akhilesh Samineni
  Cc: Willem de Bruijn, davem, edumazet, pabeni, andrew+netdev, horms,
	willemb, daniel.zahka, netdev, linux-kernel,
	jayakrishnan.udayavarma, ajit.khaparde, kiran.kella, sachin.suman
In-Reply-To: <CANQF7iDj1E6=3pod078-++5wbweyMg0H7gpeq0kbGFZHNytDdA@mail.gmail.com>

On Sat, 11 Apr 2026 01:06:06 +0530 Akhilesh Samineni wrote:
> On Wed, Apr 8, 2026 at 6:34 AM Jakub Kicinski <kuba@kernel.org> wrote:
> > On Tue, 07 Apr 2026 17:37:41 -0400 Willem de Bruijn wrote:  
> > > PSP defines a 6-bit field in 4 octet units. Does this need bounds checking?  
> >
> > More fundamentally, were we to support this -- is it a device property
> > or an assoc property?  
> 
> It's a device property. All associations under the device will share
> the same crypt-offset.

I don't think there's anything in the spec that says the crypto
offset is device level.
At the very least every L4 proto may want to have a different offset.
We should probably hold off adding this until a real user appears.

^ permalink raw reply

* [patch V1.1 11/38] misc: sgi-gru: Remove get_cycles() [ab]use
From: Thomas Gleixner @ 2026-04-10 20:56 UTC (permalink / raw)
  To: LKML
  Cc: Arnd Bergmann, x86, Lu Baolu, iommu, Michael Grzeschik, netdev,
	linux-wireless, Herbert Xu, linux-crypto, Vlastimil Babka,
	linux-mm, David Woodhouse, Bernie Thompson, linux-fbdev,
	Theodore Tso, linux-ext4, Andrew Morton, Uladzislau Rezki,
	Marco Elver, Dmitry Vyukov, kasan-dev, Andrey Ryabinin,
	Thomas Sailer, linux-hams, Jason A. Donenfeld, Richard Henderson,
	linux-alpha, Russell King, linux-arm-kernel, Catalin Marinas,
	Huacai Chen, loongarch, Geert Uytterhoeven, linux-m68k,
	Dinh Nguyen, Jonas Bonn, linux-openrisc, Helge Deller,
	linux-parisc, Michael Ellerman, linuxppc-dev, Paul Walmsley,
	linux-riscv, Heiko Carstens, linux-s390, David S. Miller,
	sparclinux
In-Reply-To: <20260410120318.320727701@kernel.org>

Calculating a timeout from get_cycles() is a historical leftover without
any functional requirement.

Use ktime_get() instead.

Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
V2: Fix typo
---
 drivers/misc/sgi-gru/gruhandles.c   |   20 ++++++++------------
 drivers/misc/sgi-gru/grukservices.c |    3 ++-
 drivers/misc/sgi-gru/grutlbpurge.c  |    5 ++---
 3 files changed, 12 insertions(+), 16 deletions(-)

--- a/drivers/misc/sgi-gru/gruhandles.c
+++ b/drivers/misc/sgi-gru/gruhandles.c
@@ -6,26 +6,22 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/timekeeping.h>
 #include "gru.h"
 #include "grulib.h"
 #include "grutables.h"
 
-/* 10 sec */
 #include <linux/sync_core.h>
-#include <asm/tsc.h>
-#define GRU_OPERATION_TIMEOUT	((cycles_t) tsc_khz*10*1000)
-#define CLKS2NSEC(c)		((c) * 1000000 / tsc_khz)
+
+#define GRU_OPERATION_TIMEOUT_NSEC	(((ktime_t)10 * NSEC_PER_SEC))
 
 /* Extract the status field from a kernel handle */
 #define GET_MSEG_HANDLE_STATUS(h)	(((*(unsigned long *)(h)) >> 16) & 3)
 
 struct mcs_op_statistic mcs_op_statistics[mcsop_last];
 
-static void update_mcs_stats(enum mcs_op op, unsigned long clks)
+static void update_mcs_stats(enum mcs_op op, unsigned long nsec)
 {
-	unsigned long nsec;
-
-	nsec = CLKS2NSEC(clks);
 	atomic_long_inc(&mcs_op_statistics[op].count);
 	atomic_long_add(nsec, &mcs_op_statistics[op].total);
 	if (mcs_op_statistics[op].max < nsec)
@@ -58,21 +54,21 @@ static void report_instruction_timeout(v
 
 static int wait_instruction_complete(void *h, enum mcs_op opc)
 {
+	ktime_t start_time = ktime_get();
 	int status;
-	unsigned long start_time = get_cycles();
 
 	while (1) {
 		cpu_relax();
 		status = GET_MSEG_HANDLE_STATUS(h);
 		if (status != CCHSTATUS_ACTIVE)
 			break;
-		if (GRU_OPERATION_TIMEOUT < (get_cycles() - start_time)) {
+		if (GRU_OPERATION_TIMEOUT_NSEC < (ktime_get() - start_time)) {
 			report_instruction_timeout(h);
-			start_time = get_cycles();
+			start_time = ktime_get();
 		}
 	}
 	if (gru_options & OPT_STATS)
-		update_mcs_stats(opc, get_cycles() - start_time);
+		update_mcs_stats(opc, (unsigned long)(ktime_get() - start_time));
 	return status;
 }
 
--- a/drivers/misc/sgi-gru/grukservices.c
+++ b/drivers/misc/sgi-gru/grukservices.c
@@ -20,6 +20,7 @@
 #include <linux/uaccess.h>
 #include <linux/delay.h>
 #include <linux/export.h>
+#include <linux/random.h>
 #include <asm/io_apic.h>
 #include "gru.h"
 #include "grulib.h"
@@ -1106,7 +1107,7 @@ static int quicktest3(unsigned long arg)
 	int ret = 0;
 
 	memset(buf2, 0, sizeof(buf2));
-	memset(buf1, get_cycles() & 255, sizeof(buf1));
+	memset(buf1, get_random_u32() & 255, sizeof(buf1));
 	gru_copy_gpa(uv_gpa(buf2), uv_gpa(buf1), BUFSIZE);
 	if (memcmp(buf1, buf2, BUFSIZE)) {
 		printk(KERN_DEBUG "GRU:%d quicktest3 error\n", smp_processor_id());
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -22,13 +22,12 @@
 #include <linux/delay.h>
 #include <linux/timex.h>
 #include <linux/srcu.h>
+#include <linux/random.h>
 #include <asm/processor.h>
 #include "gru.h"
 #include "grutables.h"
 #include <asm/uv/uv_hub.h>
 
-#define gru_random()	get_cycles()
-
 /* ---------------------------------- TLB Invalidation functions --------
  * get_tgh_handle
  *
@@ -49,7 +48,7 @@ static inline int get_off_blade_tgh(stru
 	int n;
 
 	n = GRU_NUM_TGH - gru->gs_tgh_first_remote;
-	n = gru_random() % n;
+	n = get_random_u32() % n;
 	n += gru->gs_tgh_first_remote;
 	return n;
 }

^ permalink raw reply

* [patch V1.1 02/38] x86: Cleanup include recursion hell
From: Thomas Gleixner @ 2026-04-10 20:55 UTC (permalink / raw)
  To: LKML
  Cc: Arnd Bergmann, x86, Lu Baolu, iommu, Michael Grzeschik, netdev,
	linux-wireless, Herbert Xu, linux-crypto, Vlastimil Babka,
	linux-mm, David Woodhouse, Bernie Thompson, linux-fbdev,
	Theodore Tso, linux-ext4, Andrew Morton, Uladzislau Rezki,
	Marco Elver, Dmitry Vyukov, kasan-dev, Andrey Ryabinin,
	Thomas Sailer, linux-hams, Jason A. Donenfeld, Richard Henderson,
	linux-alpha, Russell King, linux-arm-kernel, Catalin Marinas,
	Huacai Chen, loongarch, Geert Uytterhoeven, linux-m68k,
	Dinh Nguyen, Jonas Bonn, linux-openrisc, Helge Deller,
	linux-parisc, Michael Ellerman, linuxppc-dev, Paul Walmsley,
	linux-riscv, Heiko Carstens, linux-s390, David S. Miller,
	sparclinux
In-Reply-To: <20260410120317.709923681@kernel.org>

Including a random architecture specific header which requires global
headers just to avoid including that header at the two usage sites is
really beyond lazy and tasteless. Including global headers just to get the
__percpu macro from linux/compiler_types.h falls into the same category.

Remove the linux/percpu.h and asm/cpumask.h includes from msr.h and smp.h
and fix the resulting fallout by a simple forward struct declaration and by
including the x86 specific asm/cpumask.h header where it is actually
required.

Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
V1.1: Fix PARAVIRT_XXL fallout....
---
 arch/x86/include/asm/cache.h             |    1 +
 arch/x86/include/asm/msr.h               |    5 +++--
 arch/x86/include/asm/paravirt.h          |    3 ++-
 arch/x86/include/asm/pvclock.h           |    1 +
 arch/x86/include/asm/smp.h               |    2 --
 arch/x86/include/asm/vdso/gettimeofday.h |    5 ++---
 arch/x86/kernel/cpu/mce/core.c           |    1 +
 arch/x86/kernel/nmi.c                    |    1 +
 arch/x86/kernel/smpboot.c                |    1 +
 9 files changed, 12 insertions(+), 8 deletions(-)

--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -2,6 +2,7 @@
 #ifndef _ASM_X86_CACHE_H
 #define _ASM_X86_CACHE_H
 
+#include <vdso/page.h>
 #include <linux/linkage.h>
 
 /* L1 cache line size */
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -8,12 +8,11 @@
 
 #include <asm/asm.h>
 #include <asm/errno.h>
-#include <asm/cpumask.h>
 #include <uapi/asm/msr.h>
 #include <asm/shared/msr.h>
 
+#include <linux/compiler_types.h>
 #include <linux/types.h>
-#include <linux/percpu.h>
 
 struct msr_info {
 	u32			msr_no;
@@ -256,6 +255,8 @@ int msr_set_bit(u32 msr, u8 bit);
 int msr_clear_bit(u32 msr, u8 bit);
 
 #ifdef CONFIG_SMP
+struct cpumask;
+
 int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
 int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
 int rdmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -16,9 +16,10 @@
 
 #ifndef __ASSEMBLER__
 #include <linux/types.h>
-#include <linux/cpumask.h>
 #include <asm/frame.h>
 
+struct cpumask;
+
 /* The paravirtualized I/O functions */
 static inline void slow_down_io(void)
 {
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -2,6 +2,7 @@
 #ifndef _ASM_X86_PVCLOCK_H
 #define _ASM_X86_PVCLOCK_H
 
+#include <asm/barrier.h>
 #include <asm/clocksource.h>
 #include <asm/pvclock-abi.h>
 
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -5,8 +5,6 @@
 #include <linux/cpumask.h>
 #include <linux/thread_info.h>
 
-#include <asm/cpumask.h>
-
 DECLARE_PER_CPU_CACHE_HOT(int, cpu_number);
 
 DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -11,13 +11,12 @@
 #define __ASM_VDSO_GETTIMEOFDAY_H
 
 #ifndef __ASSEMBLER__
-
+#include <clocksource/hyperv_timer.h>
 #include <uapi/linux/time.h>
+
 #include <asm/vgtod.h>
 #include <asm/unistd.h>
-#include <asm/msr.h>
 #include <asm/pvclock.h>
-#include <clocksource/hyperv_timer.h>
 #include <asm/vdso/sys_call.h>
 
 #define VDSO_HAS_TIME 1
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -48,6 +48,7 @@
 #include <linux/vmcore_info.h>
 
 #include <asm/fred.h>
+#include <asm/cpumask.h>
 #include <asm/cpu_device_id.h>
 #include <asm/processor.h>
 #include <asm/traps.h>
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -26,6 +26,7 @@
 #include <linux/sched/clock.h>
 #include <linux/kvm_types.h>
 
+#include <asm/cpumask.h>
 #include <asm/cpu_entry_area.h>
 #include <asm/traps.h>
 #include <asm/mach_traps.h>
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -70,6 +70,7 @@
 #include <asm/irq.h>
 #include <asm/realmode.h>
 #include <asm/cpu.h>
+#include <asm/cpumask.h>
 #include <asm/numa.h>
 #include <asm/tlbflush.h>
 #include <asm/mtrr.h>

^ permalink raw reply

* Re: [PATCH net-next v3 00/12] net: airoha: Support multiple net_devices connected to the same GDM port
From: Jakub Kicinski @ 2026-04-10 20:49 UTC (permalink / raw)
  To: Lorenzo Bianconi
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
	Rob Herring, Krzysztof Kozlowski, Conor Dooley, Christian Marangi,
	Benjamin Larsson, linux-arm-kernel, linux-mediatek, netdev,
	devicetree, Xuegang Lu
In-Reply-To: <adiop-9Mo4ADfvfw@lore-desk>

On Fri, 10 Apr 2026 09:37:11 +0200 Lorenzo Bianconi wrote:
> > On Mon, 06 Apr 2026 12:34:05 +0200 Lorenzo Bianconi wrote:  
> > > EN7581 or AN7583 SoCs support connecting multiple external SerDes (e.g.
> > > Ethernet or USB SerDes) to GDM3 or GDM4 ports via a hw arbiter that
> > > manages the traffic in a TDM manner. As a result multiple net_devices can
> > > connect to the same GDM{3,4} port and there is a theoretical "1:n"
> > > relation between GDM ports and net_devices.  
> > 
> > Looks like this driver uses page pool.
> > If you're sharing the same page pool across multiple netdevs
> > it must not be linked to a netdev.  
> 
> are you referring to slow.netdev pointer? If so, this is not set in airoha_eth
> driver.

Yes. Alright, thanks for checking. Pretty sure I saw it set somewhere 
in a file called airoha* but must be another component :)

^ permalink raw reply

* Re: [PATCH v3] selftests: vsock: avoid races creating Unix socket paths
From: Jakub Kicinski @ 2026-04-10 20:47 UTC (permalink / raw)
  To: Cao Ruichuang
  Cc: Stefano Garzarella, Shuah Khan, Stefano Garzarella, Simon Horman,
	Bobby Eshleman, virtualization, netdev, linux-kselftest,
	linux-kernel
In-Reply-To: <177581562073.13887.468247298173578281@163.com>

On Fri, 10 Apr 2026 18:07:00 +0800 Cao Ruichuang wrote:
> vmtest.sh currently uses mktemp -u to precompute Unix socket paths for the
> namespace bridge helpers. That only returns an unused pathname and leaves a
> time-of-check/time-of-use window before socat binds or connects to it.
> 
> Create a private temporary directory with mktemp -d and place the
> socket path inside it instead. This removes the pathname race while
> keeping cleanup straightforward.

And you actually run into this as a real problem?
How do you repro the failure?

Basic netdev rules:
 - don't post new versions of patches in reply to the old ones
 - no more than 1 posting in a 24h period

^ permalink raw reply

* Re: [PATCH 2/4] net: ionic: Add PHC state page for user space access
From: Jakub Kicinski @ 2026-04-10 20:43 UTC (permalink / raw)
  To: Allen Hubbe
  Cc: Abhijit Gangurde, jgg, leon, brett.creeley, andrew+netdev, davem,
	edumazet, pabeni, nikhil.agarwal, linux-rdma, netdev,
	linux-kernel
In-Reply-To: <52cee89f-50e2-4569-a622-b03e711ab26b@amd.com>

On Fri, 10 Apr 2026 09:10:09 -0400 Allen Hubbe wrote:
> >> +struct ionic_phc_state {
> >> +     __u32 seq;
> >> +     __u32 rsvd;
> >> +     __aligned_u64 mask;
> >> +     __aligned_u64 tick;
> >> +     __aligned_u64 nsec;
> >> +     __aligned_u64 frac;
> >> +     __u32 mult;
> >> +     __u32 shift;
> >> +};  
> > 
> > You're just exposing kernel timecounter internals.
> > Why is this ionic uAPI and not something reusable by other drivers?  
> 
> The simple answer is just following the same approach as an existing 
> implementation.  See struct mlx5_ib_clock_info and 
> mlx5_update_clock_info_page().
> 
> Making this common might risk presuming that other implementations will 
> be a similar design.  Compare these to the sfc driver.  The clock is 
> quite different from ionic and mlx5, not using timecounter, because 
> instead of a free-running cycle counter the hardware itself provides an 
> adjustable clock for timestamping.

So your argument is basically that drivers which don't use sw timecounter
exist so we shouldn't bother creating common definitions for drivers
that do? Why do we have common implementation of timecounter in the
kernel at all then?

These are rhetorical questions.

^ permalink raw reply

* Re: [PATCH v5 net-next 0/8] dpll/ice: Add TXC DPLL type and full TX reference clock control for E825
From: Jakub Kicinski @ 2026-04-10 20:38 UTC (permalink / raw)
  To: Nitka, Grzegorz
  Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	intel-wired-lan@lists.osuosl.org, Oros, Petr,
	richardcochran@gmail.com, andrew+netdev@lunn.ch,
	Kitszel, Przemyslaw, Nguyen, Anthony L,
	Prathosh.Satish@microchip.com, Vecera, Ivan, jiri@resnulli.us,
	Kubalewski, Arkadiusz, vadim.fedorenko@linux.dev,
	donald.hunter@gmail.com, horms@kernel.org, pabeni@redhat.com,
	davem@davemloft.net, edumazet@google.com
In-Reply-To: <IA1PR11MB62194BF52262FCEB7FD5E76D92592@IA1PR11MB6219.namprd11.prod.outlook.com>

On Fri, 10 Apr 2026 14:23:58 +0000 Nitka, Grzegorz wrote:
> Here is the high-level connection diagram for E825 device. I hope you find it helpful:
> [..]

It does, thanks a lot.

> Before this series, we tried different approaches.
> One of them was to create MUX pin associated with netdev interface.
> EXT_REF and SYNCE pins were registered with this MUX pin.
> However I recall there were at least two issues with this solution:
> - when using DPLL subsystem not all the connections/relations were visible
>   from DPLL pin-get perspective. RT netlink was required
> - due to mixing pins from different modules (like fwnode based pin from zl driver
>   and the pins from ice), we were not able to safely clean the references between
>   pins and dpll (basically .. we observed crashes)
> 
> Proposed solution just seems to be clean and fully reflects current
> connection topology.

Do you have the link to the old proposal that was adding stuff to
rtnetlink? I remember some discussion long-ish ago, maybe I was wrong.

> What's actually your biggest concern?
> The fact we introduce a new DPLL type? Or multiply DPLL instances? Or both?
> Do you prefer to see "one big" DPLL with 16 pins in our case (8 ports x 2 tx-clk pins)?
> Each pin with the name like, for example, PF0-SyncE/PF0-eRef etc.?

My concern is that I think this is a pretty run of the mill SyncE
design. If we need to pretend we have two DPLLs here when we really
only have one and a mux - then our APIs are mis-designed :(

^ permalink raw reply

* Re: [PATCH net 1/1] af_unix: Hold receive queue lock in ioctl(SIOCATMARK)
From: Kuniyuki Iwashima @ 2026-04-10 20:06 UTC (permalink / raw)
  To: n05ec
  Cc: bird, davem, edumazet, enjou1224z, horms, kuba, kuniyu, netdev,
	pabeni, rao.shoaib, tomapufckgml, wangjiexun2025, yifanwucs,
	yuantan098
In-Reply-To: <f6cbbc8da90e95584847b5ceb60aae830d1631c2.1775731983.git.wangjiexun2025@gmail.com>

From: Ren Wei <n05ec@lzu.edu.cn>
Date: Fri, 10 Apr 2026 14:31:57 +0800
> From: Jiexun Wang <wangjiexun2025@gmail.com>
> 
> unix_ioctl() peeks at the receive queue and may check both the head skb
> and its successor while deciding whether SIOCATMARK should report the
> mark. However, u->iolock does not stabilize receive-queue element
> lifetime. Queue teardown paths can purge or splice the queue under

Please be more specific here.


> sk->sk_receive_queue.lock and free the skb while unix_ioctl() still
> uses it.
> 
> Take sk->sk_receive_queue.lock while inspecting the queue so the skb
> and next_skb stay alive for the whole decision.
> 
> Fixes: 314001f0bf92 ("af_unix: Add OOB support")
> Reported-by: Yifan Wu <yifanwucs@gmail.com>
> Reported-by: Juefei Pu <tomapufckgml@gmail.com>
> Co-developed-by: Yuan Tan <yuantan098@gmail.com>
> Signed-off-by: Yuan Tan <yuantan098@gmail.com>
> Suggested-by: Xin Liu <bird@lzu.edu.cn>
> Tested-by: Ren Wei <enjou1224z@gmail.com>
> Signed-off-by: Jiexun Wang <wangjiexun2025@gmail.com>
> Signed-off-by: Ren Wei <n05ec@lzu.edu.cn>
> ---
>  net/unix/af_unix.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index b23c33df8b46..54f12d5cda37 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -3301,6 +3301,8 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
>  			int answ = 0;
>  
>  			mutex_lock(&u->iolock);
> +			/* The receive queue lock keeps skb and next_skb alive. */
> +			spin_lock(&sk->sk_receive_queue.lock);

I think this is not the correct fix.

SIOCATMARK is apparently for MSG_OOB,

  $ man 3 sockatmark

and non SOCK_STREAM sockets do not support MSG_OOB
and should not abuse SIOCATMARK.

---8<---
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index eebabf0bd850..868b26c963ab 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -3299,6 +3299,9 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 			struct sk_buff *skb;
 			int answ = 0;
 
+			if (sk->sk_type != SOCK_STREAM)
+				return -EOPNOTSUPP;
+
 			mutex_lock(&u->iolock);
 
 			skb = skb_peek(&sk->sk_receive_queue);
---8<---

^ permalink raw reply related

* Re: [PATCH net-next v04 3/6] hinic3: Add ethtool coalesce ops
From: Simon Horman @ 2026-04-10 20:12 UTC (permalink / raw)
  To: gongfan1
  Cc: Simon Horman, edumazet, zhengjiezhen, guoxin09, netdev,
	andrew+netdev, maxime.chevallier, zhoushuai28, mohsin.bashr,
	ioana.ciornei, zhuyikai1, luosifu, linux-kernel, kuba, davem,
	linux-doc, pabeni, shijing34, wulike1
In-Reply-To: <f9f5c1239d17675283a2da8b307f5aca8f46864c.1775618798.git.zhuyikai1@h-partners.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.

For local reproduction steps see https://netdev-ai.bots.linux.dev/ai-local.html
---
hinic3: Add ethtool coalesce ops

This patch implements ethtool .get_coalesce and .set_coalesce callback
functions to allow users to configure and monitor RX coalesce settings.

> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
> index be26698fc658..a4b2d5ba81f8 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c

[ ... ]

> @@ -985,9 +990,230 @@ static void hinic3_get_pause_stats(struct net_device *netdev,
>  	kfree(ps);
>  }
>
> +static int hinic3_set_queue_coalesce(struct net_device *netdev, u16 q_id,
> +				     struct hinic3_intr_coal_info *coal)
> +{
> +	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
> +	struct hinic3_intr_coal_info *intr_coal;
> +	struct hinic3_interrupt_info info = {};
> +	int err;
> +
> +	intr_coal = &nic_dev->intr_coalesce[q_id];
> +
> +	intr_coal->coalesce_timer_cfg = coal->coalesce_timer_cfg;
> +	intr_coal->pending_limit = coal->pending_limit;
> +	intr_coal->rx_pending_limit_low = coal->rx_pending_limit_low;
> +	intr_coal->rx_pending_limit_high = coal->rx_pending_limit_high;
> +
> +	if (!test_bit(HINIC3_INTF_UP, &nic_dev->flags) ||
> +	    q_id >= nic_dev->q_params.num_qps || nic_dev->adaptive_rx_coal)
> +		return 0;
> +
> +	info.msix_index = nic_dev->q_params.irq_cfg[q_id].msix_entry_idx;
> +	info.interrupt_coalesc_set = 1;
> +	info.coalesc_timer_cfg = intr_coal->coalesce_timer_cfg;
> +	info.pending_limit = intr_coal->pending_limit;
> +	info.resend_timer_cfg = intr_coal->resend_timer_cfg;
> +	err = hinic3_set_interrupt_cfg(nic_dev->hwdev, info);
> +	if (err) {
> +		netdev_warn(netdev, "Failed to set queue%u coalesce\n", q_id);
> +		return err;
> +	}
> +
> +	return 0;
> +}
> +
> +static int is_coalesce_exceed_limit(struct net_device *netdev,
> +				    const struct ethtool_coalesce *coal)
> +{
> +	const struct {
> +		const char *name;
> +		u32 value;
> +		u32 limit;
> +	} coalesce_limits[] = {
> +		{"rx_coalesce_usecs",
> +		 coal->rx_coalesce_usecs,
> +		 COALESCE_MAX_TIMER_CFG},
> +		{"rx_max_coalesced_frames",
> +		 coal->rx_max_coalesced_frames,
> +		 COALESCE_MAX_PENDING_LIMIT},
> +		{"rx_max_coalesced_frames_low",
> +		 coal->rx_max_coalesced_frames_low,
> +		 COALESCE_MAX_PENDING_LIMIT},
> +		{"rx_max_coalesced_frames_high",
> +		 coal->rx_max_coalesced_frames_high,
> +		 COALESCE_MAX_PENDING_LIMIT},
> +	};
> +
> +	for (int i = 0; i < ARRAY_SIZE(coalesce_limits); i++) {
> +		if (coalesce_limits[i].value > coalesce_limits[i].limit) {
> +			netdev_err(netdev, "%s out of range %d-%d\n",
> +				   coalesce_limits[i].name, 0,
> +				   coalesce_limits[i].limit);
> +			return -ERANGE;
> +		}
> +	}
> +	return 0;
> +}
> +
> +static int is_coalesce_legal(struct net_device *netdev,
> +			     const struct ethtool_coalesce *coal)
> +{
> +	int err;
> +
> +	err = is_coalesce_exceed_limit(netdev, coal);
> +	if (err)
> +		return err;
> +
> +	if (coal->rx_max_coalesced_frames_low >
> +	    coal->rx_max_coalesced_frames_high) {
> +		netdev_err(netdev, "invalid coalesce frame high %u, low %u, unit %d\n",
> +			   coal->rx_max_coalesced_frames_high,
> +			   coal->rx_max_coalesced_frames_low,
> +			   COALESCE_PENDING_LIMIT_UNIT);
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}

Does this validation allow non-zero low with zero high? For example, if
rx_max_coalesced_frames_low=10 and rx_max_coalesced_frames_high=0, the
condition 'low > high' would be true and return -EINVAL, which is correct.

However, this validation still accepts the case where low equals high, including
when both are zero. In v03, a reviewer asked: "So this would allow non-zero low
and zero high. For example, low = 10, high = 0. Is this expected?" The original
v03 logic was 'low >= high && high > 0' which rejected the case where low
equals high. In v04, this was changed to 'low > high' only, but the reviewer's
concern about whether allowing low=high (including low=0, high=0) is expected
behavior was not directly addressed.

Reference: https://lore.kernel.org/netdev/18d5a9b7-9c8d-4fc4-9332-fab27449dc58@gmail.com/

> +
> +static void check_coalesce_align(struct net_device *netdev,
> +				 u32 item, u32 unit, const char *str)
> +{
> +	if (item % unit)
> +		netdev_warn(netdev, "%s in %d units, change to %u\n",
> +			    str, unit, item - item % unit);
> +}

[ ... ]

> @@ -1003,6 +1229,8 @@ static const struct ethtool_ops hinic3_ethtool_ops = {
>  	.get_eth_ctrl_stats             = hinic3_get_eth_ctrl_stats,
>  	.get_rmon_stats                 = hinic3_get_rmon_stats,
>  	.get_pause_stats                = hinic3_get_pause_stats,
> +	.get_coalesce                   = hinic3_get_coalesce,
> +	.set_coalesce                   = hinic3_set_coalesce,
>  };
>
>  void hinic3_set_ethtool_ops(struct net_device *netdev)

[ ... ]

> @@ -985,9 +990,230 @@ static void hinic3_get_pause_stats(struct net_device *netdev,
>
>  static const struct ethtool_ops hinic3_ethtool_ops = {
> -	.supported_coalesce_params      = ETHTOOL_COALESCE_USECS |
> -					  ETHTOOL_COALESCE_PKT_RATE_RX_USECS,
> +	.supported_coalesce_params      = ETHTOOL_COALESCE_RX_USECS |
> +					  ETHTOOL_COALESCE_RX_MAX_FRAMES |
> +					  ETHTOOL_COALESCE_USE_ADAPTIVE_RX |
> +					  ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW |
> +					  ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH,

Was the removal of ETHTOOL_COALESCE_TX_USECS intentional? In v02, the code
included 'ETHTOOL_COALESCE_USECS | ETHTOOL_COALESCE_PKT_RATE_RX_USECS', but
v03 and v04 changed this to only RX-specific parameters. A reviewer asked if
this was intentional, but there was no response from the author and the v04
changelog does not mention this concern.

Reference: https://lore.kernel.org/netdev/18d5a9b7-9c8d-4fc4-9332-fab27449dc58@gmail.com/

^ permalink raw reply

* Re: [PATCH net-next v04 1/6] hinic3: Add ethtool queue ops
From: Simon Horman @ 2026-04-10 20:10 UTC (permalink / raw)
  To: gongfan1
  Cc: Simon Horman, linux-kernel, maxime.chevallier, zhoushuai28, kuba,
	shijing34, pabeni, wulike1, mohsin.bashr, zhuyikai1, edumazet,
	guoxin09, netdev, zhengjiezhen, linux-doc, ioana.ciornei, luosifu,
	andrew+netdev, davem
In-Reply-To: <7deda08d48b7f894c5aa0537f70897871e11d012.1775618798.git.zhuyikai1@h-partners.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.

For local reproduction steps see https://netdev-ai.bots.linux.dev/ai-local.html
---
hinic3: Add ethtool queue ops

This commit implements .get_ringparam and .set_ringparam ethtool callbacks,
allowing users to configure and monitor queue depths.

> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
> index 90fc16288de9..e47c3f43e7b9 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c

[ ... ]

> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
> index e7d6c2033b45..d3b3927b5408 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c

[ ... ]

> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
> index 0a888fe4c975..3b470978714a 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c

[ ... ]

> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c b/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
> index da73811641a9..cec501a9dd43 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
> @@ -428,6 +428,84 @@ static void hinic3_vport_down(struct net_device *netdev)
>  	}
>  }
>
> +int
> +hinic3_change_channel_settings(struct net_device *netdev,
> +			       struct hinic3_dyna_txrxq_params *trxq_params)
> +{
> +	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
> +	struct hinic3_dyna_txrxq_params old_qp_params = {};
> +	struct hinic3_dyna_qp_params new_qp_params = {};
> +	struct hinic3_dyna_qp_params cur_qp_params = {};
> +	bool need_teardown = false;
> +	unsigned long flags;
> +	int err;
> +
> +	mutex_lock(&nic_dev->channel_cfg_lock);
> +
> +	hinic3_config_num_qps(netdev, trxq_params);
> +
> +	err = hinic3_alloc_channel_resources(netdev, &new_qp_params,
> +					     trxq_params);
> +	if (err) {
> +		netdev_err(netdev, "Failed to alloc channel resources\n");
> +		mutex_unlock(&nic_dev->channel_cfg_lock);
> +		return err;
> +	}
> +
> +	spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
> +	if (!test_and_set_bit(HINIC3_CHANGE_RES_INVALID, &nic_dev->flags))
> +		need_teardown = true;
> +	spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
> +
> +	if (need_teardown) {
> +		hinic3_vport_down(netdev);
> +		hinic3_close_channel(netdev);
> +		hinic3_uninit_qps(nic_dev, &cur_qp_params);
> +		hinic3_free_channel_resources(netdev, &cur_qp_params,
> +					      &nic_dev->q_params);
> +	}
> +
> +	if (nic_dev->num_qp_irq > trxq_params->num_qps)
> +		hinic3_qp_irq_change(netdev, trxq_params->num_qps);
> +
> +	spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
> +	old_qp_params = nic_dev->q_params;
> +	nic_dev->q_params = *trxq_params;
> +	spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
> +
> +	hinic3_init_qps(nic_dev, &new_qp_params);
> +
> +	err = hinic3_open_channel(netdev);
> +	if (err)
> +		goto err_uninit_qps;
> +
> +	err = hinic3_vport_up(netdev);
> +	if (err)
> +		goto err_close_channel;
> +
> +	spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
> +	clear_bit(HINIC3_CHANGE_RES_INVALID, &nic_dev->flags);
> +	spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
> +
> +	mutex_unlock(&nic_dev->channel_cfg_lock);
> +
> +	return 0;
> +
> +err_close_channel:
> +	hinic3_close_channel(netdev);
> +err_uninit_qps:
> +	spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
> +	nic_dev->q_params = old_qp_params;
> +	spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
> +
> +	hinic3_uninit_qps(nic_dev, &new_qp_params);
> +	hinic3_free_channel_resources(netdev, &new_qp_params, trxq_params);
> +
> +	mutex_unlock(&nic_dev->channel_cfg_lock);
> +
> +	return err;
> +}

Does the error path properly clear the HINIC3_CHANGE_RES_INVALID flag?

On the success path, clear_bit(HINIC3_CHANGE_RES_INVALID) is called under
channel_res_lock. However, both error paths (err_close_channel and
err_uninit_qps) skip this, leaving the flag permanently set.

After a failed ring parameter change, the device enters an unrecoverable
state:

1. HINIC3_INTF_UP remains set, HINIC3_CHANGE_RES_INVALID remains set.

2. User does ifdown: hinic3_close() clears HINIC3_INTF_UP but skips
   teardown since HINIC3_CHANGE_RES_INVALID is already set:

   hinic3_close() {
       ...
       spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
       if (!test_and_set_bit(HINIC3_CHANGE_RES_INVALID, &nic_dev->flags))
           need_teardown = true;   // Already set, need_teardown = false
       spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);

       if (need_teardown) {
           // Skipped
       }
   }

3. User does ifup: hinic3_open() allocates fresh resources and sets
   HINIC3_INTF_UP, but never clears HINIC3_CHANGE_RES_INVALID.

4. HINIC3_CHANNEL_RES_VALID() now returns false permanently:

   #define HINIC3_CHANNEL_RES_VALID(nic_dev) \
       (test_bit(HINIC3_INTF_UP, &(nic_dev)->flags) && \
        !test_bit(HINIC3_CHANGE_RES_INVALID, &(nic_dev)->flags))

5. hinic3_link_status_change() in hinic3_main.c silently returns, so the
   link never comes up:

   static void hinic3_link_status_change(struct net_device *netdev,
                                         bool link_status_up)
   {
       ...
       spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
       valid = HINIC3_CHANNEL_RES_VALID(nic_dev);
       spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);

       if (!valid)
           return;   // Always returns here
       ...
   }

6. hinic3_set_interrupt_moder() in hinic3_irq.c returns 0 without
   configuring interrupts:

   static int hinic3_set_interrupt_moder(struct net_device *netdev, ...)
   {
       ...
       spin_lock_irqsave(&nic_dev->channel_res_lock, flags);

       if (!HINIC3_CHANNEL_RES_VALID(nic_dev) ||
           q_id >= nic_dev->q_params.num_qps) {
           spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
           return 0;   // Always returns here
       }
       ...
   }

The device appears up but is permanently non-functional until module reload.

Should the error paths clear HINIC3_CHANGE_RES_INVALID under
channel_res_lock before returning?

> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h
> index 9502293ff710..55b280888ad8 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h

[ ... ]

> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h
> index 12eefabcf1db..3791b9bc865b 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h

[ ... ]

^ permalink raw reply

* Re: [PATCH net-next v6 13/14] selftests: net: add team_bridge_macvlan rx_mode test
From: Stanislav Fomichev @ 2026-04-10 20:09 UTC (permalink / raw)
  To: Breno Leitao; +Cc: Stanislav Fomichev, netdev, davem, edumazet, kuba, pabeni
In-Reply-To: <adkmmM6cCKWqogIx@gmail.com>

On 04/10, Breno Leitao wrote:
> On Tue, Apr 07, 2026 at 08:31:00AM -0700, Stanislav Fomichev wrote:
> > Add a test that exercises the ndo_change_rx_flags path through a
> > macvlan -> bridge -> team -> dummy stack. This triggers dev_uc_add
> > under addr_list_lock which flips promiscuity on the lower device.
> > With the new work queue approach, this must not deadlock.
> > 
> > Link: https://lore.kernel.org/netdev/20260214033859.43857-1-jiayuan.chen@linux.dev/
> > Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
> 
> Reviewed-by: Breno Leitao <leitao@debian.org>
> >
> > ---
> >  tools/testing/selftests/net/config       |  3 ++
> >  tools/testing/selftests/net/rtnetlink.sh | 44 ++++++++++++++++++++++++
> >  2 files changed, 47 insertions(+)
> > 
> > diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
> > index 2a390cae41bf..38611ea11c6b 100644
> > --- a/tools/testing/selftests/net/config
> > +++ b/tools/testing/selftests/net/config
> > @@ -101,6 +101,9 @@ CONFIG_NET_SCH_HTB=m
> >  CONFIG_NET_SCH_INGRESS=m
> >  CONFIG_NET_SCH_NETEM=y
> >  CONFIG_NET_SCH_PRIO=m
> > +CONFIG_NET_TEAM=y
> > +CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=y
> > +CONFIG_NET_TEAM_MODE_LOADBALANCE=y
> 
> Why do you need LOADBALANCE enabled for the test?

Not sure, I copy-pasted from team/config :-[ let me double check if it's
really needed.

^ permalink raw reply

* Re: [PATCH net-next 3/3] selftests: net: psp: add crypt-offset and spi-threshold test cases
From: Akhilesh Samineni @ 2026-04-10 20:07 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: davem, edumazet, kuba, pabeni, andrew+netdev, horms, willemb,
	daniel.zahka, netdev, linux-kernel, jayakrishnan.udayavarma,
	ajit.khaparde, kiran.kella, sachin.suman
In-Reply-To: <willemdebruijn.kernel.9cf8a81ff43@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 3419 bytes --]

On Wed, Apr 8, 2026 at 3:22 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Akhilesh Samineni wrote:
> > Add test cases to set and get crypt-offset and spi-threshold attributes,
> > verifying both the applied value and the restored prior value.
> >
> > Signed-off-by: Akhilesh Samineni <akhilesh.samineni@broadcom.com>
> > Reviewed-by: Kiran Kella <kiran.kella@broadcom.com>
> > Reviewed-by: Ajit Kumar Khaparde <ajit.khaparde@broadcom.com>
> > ---
> >  tools/testing/selftests/drivers/net/psp.py | 32 ++++++++++++++++++++++
> >  1 file changed, 32 insertions(+)
> >
> > diff --git a/tools/testing/selftests/drivers/net/psp.py b/tools/testing/selftests/drivers/net/psp.py
> > index 864d9fce1094..9253aab29ded 100755
> > --- a/tools/testing/selftests/drivers/net/psp.py
> > +++ b/tools/testing/selftests/drivers/net/psp.py
> > @@ -171,6 +171,38 @@ def dev_get_device_bad(cfg):
> >      ksft_true(raised)
> >
> >
> > +def dev_set_crypt_offset(cfg):
> > +    """ Set and get the crypt-offset """
> > +    _init_psp_dev(cfg)
> > +
> > +    dev = cfg.pspnl.dev_get({'id': cfg.psp_dev_id})
> > +    orig = dev['crypt-offset']
> > +    cfg.pspnl.dev_set({"id": cfg.psp_dev_id,
> > +                       "crypt-offset": 5})
> > +    dev = cfg.pspnl.dev_get({'id': cfg.psp_dev_id})
> > +    ksft_eq(dev['crypt-offset'], 5)
> > +    cfg.pspnl.dev_set({"id": cfg.psp_dev_id,
> > +                       "crypt-offset": orig})
> > +    dev = cfg.pspnl.dev_get({'id': cfg.psp_dev_id})
> > +    ksft_eq(dev['crypt-offset'], orig)
> > +
> > +
> > +def dev_set_spi_threshold(cfg):
> > +    """ Set and get the spi-threshold """
> > +    _init_psp_dev(cfg)
> > +
> > +    dev = cfg.pspnl.dev_get({'id': cfg.psp_dev_id})
> > +    orig = dev['spi-threshold']
> > +    cfg.pspnl.dev_set({"id": cfg.psp_dev_id,
> > +                       "spi-threshold": 10})
> > +    dev = cfg.pspnl.dev_get({'id': cfg.psp_dev_id})
> > +    ksft_eq(dev['spi-threshold'], 10)
> > +    cfg.pspnl.dev_set({"id": cfg.psp_dev_id,
> > +                       "spi-threshold": orig})
> > +    dev = cfg.pspnl.dev_get({'id': cfg.psp_dev_id})
> > +    ksft_eq(dev['spi-threshold'], orig)
>
> These tests mainly verify that netlink works as intended. Not sure how
> much value that brings.
>
> Once crypt-offset requires bounds checking (say), such control ops
> functional tests may become more valuable.
Agree. I will include bounds checking for the crypt-offset in the v2
submission. This will ensure the control path is properly validated as
suggested.

>
> More interesting would be to see the effect on the datapath. E.g.,
> a crypt-offset that actually leaves plaintext. Not sure how easy or
> hard this is, so don't take this as a requirement. But maybe something
> that's achievable with PSP packetdrill (eventually)?

Regarding datapath verification, because PSP encryption and decryption
occur within the hardware offload path, the host cannot directly
observe the encrypted/plaintext transition for self-received packets.

We verified the crypt-offset effect by capturing and inspecting
traffic on an external switch between the hosts. While integrating
this into packetdrill is a valuable suggestion for long-term testing,
the current hardware-based offload makes host-side snooping difficult.
We will investigate packetdrill support for a future update.

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 4211 bytes --]

^ permalink raw reply

* [PATCH net-next v18 15/15] net: homa: create Makefile and Kconfig
From: John Ousterhout @ 2026-04-10 20:03 UTC (permalink / raw)
  To: netdev; +Cc: pabeni, edumazet, horms, kuba, John Ousterhout
In-Reply-To: <20260410200310.1915-1-ouster@cs.stanford.edu>

Before this commit the Homa code is "inert": it won't be compiled
in kernel builds. This commit adds Homa's Makefile and Kconfig, and
also links Homa into net/Makefile and net/Kconfig, so that Homa
will be built during kernel builds if enabled (it is disabled by
default).

Signed-off-by: John Ousterhout <ouster@cs.stanford.edu>
---
 net/Kconfig       |  1 +
 net/Makefile      |  1 +
 net/homa/Kconfig  | 21 +++++++++++++++++++++
 net/homa/Makefile | 11 +++++++++++
 4 files changed, 34 insertions(+)
 create mode 100644 net/homa/Kconfig
 create mode 100644 net/homa/Makefile

diff --git a/net/Kconfig b/net/Kconfig
index 62266eaf0e95..41bcc80ee9ec 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -252,6 +252,7 @@ source "net/bridge/netfilter/Kconfig"
 endif # if NETFILTER
 
 source "net/sctp/Kconfig"
+source "net/homa/Kconfig"
 source "net/rds/Kconfig"
 source "net/tipc/Kconfig"
 source "net/atm/Kconfig"
diff --git a/net/Makefile b/net/Makefile
index 90e3d72bf58b..17e0ba84bd15 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -44,6 +44,7 @@ ifneq ($(CONFIG_VLAN_8021Q),)
 obj-y				+= 8021q/
 endif
 obj-$(CONFIG_IP_SCTP)		+= sctp/
+obj-$(CONFIG_HOMA)		+= homa/
 obj-$(CONFIG_RDS)		+= rds/
 obj-$(CONFIG_WIRELESS)		+= wireless/
 obj-$(CONFIG_MAC80211)		+= mac80211/
diff --git a/net/homa/Kconfig b/net/homa/Kconfig
new file mode 100644
index 000000000000..16fec3fd52ba
--- /dev/null
+++ b/net/homa/Kconfig
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0+
+#
+# Homa transport protocol
+#
+
+menuconfig HOMA
+	tristate "The Homa transport protocol"
+	depends on INET
+	depends on IPV6
+
+	help
+	  Homa is a network transport protocol for communication within
+	  a datacenter. It provides significantly lower latency than TCP,
+	  particularly for workloads containing a mixture of large and small
+	  messages operating at high network utilization. At present, Homa
+	  has been only partially upstreamed; this version provides bare-bones
+	  functionality but is not performant. For more information see the
+	  homa(7) man page or check out the Homa Wiki at
+	  https://homa-transport.atlassian.net/wiki/spaces/HOMA/overview.
+
+	  If unsure, say N.
diff --git a/net/homa/Makefile b/net/homa/Makefile
new file mode 100644
index 000000000000..57f051d44c6b
--- /dev/null
+++ b/net/homa/Makefile
@@ -0,0 +1,11 @@
+obj-$(CONFIG_HOMA) := homa.o
+homa-y:=        homa_incoming.o \
+                homa_interest.o \
+                homa_outgoing.o \
+                homa_peer.o \
+		homa_plumbing.o \
+                homa_pool.o \
+		homa_rpc.o \
+		homa_sock.o \
+		homa_timer.o \
+		homa_utils.o
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v18 14/15] net: homa: create homa_plumbing.c
From: John Ousterhout @ 2026-04-10 20:03 UTC (permalink / raw)
  To: netdev; +Cc: pabeni, edumazet, horms, kuba, John Ousterhout
In-Reply-To: <20260410200310.1915-1-ouster@cs.stanford.edu>

homa_plumbing.c contains functions that connect Homa to the rest of
the Linux kernel, such as dispatch tables used by Linux and the
top-level functions that Linux invokes from those dispatch tables.

Signed-off-by: John Ousterhout <ouster@cs.stanford.edu>

---
Changes for v18:
* Add missing RCU locks.
* Cleanup homa_socket (eliminate redundant error handling)

Changes for v16:
* Implement HOMAIOCINFO ioctl
* Set hsk->error_msg (for HOMAIOCINFO)
* Use consume_skb and kfree_skb_reason instead of kfree_skb
* Use set_bit, clear_bit, etc. for flag bits
* Remove global_homa variable

Changes for v13:
* Fix bug in is_homa_pkt: didn't properly handle packets where the
  network header hadn't yet been set.

Changes for v12:
* Fix deadlock in homa_recvmsg (homa_rpc_reap was invoked while holding
  an RPC lock).

Changes for v11:
* Move link_mbps variable from struct homa_pacer back to struct homa.
* Clean up error handling in homa_load.
* Cleanup and simplify use of RPC reference counts.
* Add explicit padding to struct homa_recvmsg_args to fix problems compiling
  on 32-bit machines.

Changes for v10:
* Use the destroy function from struct proto properly (fixes bugs in
  socket cleanup)
* Fix issues from sparse, xmastree, etc.
* Replace __u16 with u16, __u8 with u8, etc.

Changes for v9:
* Add support for homa_net objects
* Various name improvements (e.g. use "alloc" instead of "new" for functions
  that allocate memory)
* Add BUILD_BUG_ON statements to replace _Static_asserts removed from
  header files
* Remove unnecessary/unused functions such as homa_get_port, homa_disconnect,
  and homa_backlog_rcv.

Changes for v8:
* Accommodate homa_pacer and homa_pool refactorings

Changes for v7:
* Remove extraneous code
* Make Homa a pernet subsystem
* Block Homa senders if insufficient tx buffer memory
* Check for missing buffer pool in homa_recvmsg
* Refactor waiting mechanism for incoming packets: simplify wait
  criteria and use standard Linux mechanisms for waiting
* Implement SO_HOMA_SERVER option for setsockopt
* Rename UNKNOWN packet type to RPC_UNKNOWN
* Remove locker argument from locking functions
* Use u64 and __u64 properly
* Use new homa_make_header_avl function
---
 net/homa/homa_impl.h     |   28 +
 net/homa/homa_plumbing.c | 1254 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 1282 insertions(+)
 create mode 100644 net/homa/homa_plumbing.c

diff --git a/net/homa/homa_impl.h b/net/homa/homa_impl.h
index 4a8df43003de..9f11ec49e2fe 100644
--- a/net/homa/homa_impl.h
+++ b/net/homa/homa_impl.h
@@ -362,28 +362,54 @@ extern unsigned int homa_net_id;
 void     homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
 		      struct homa_rpc *rpc);
 void     homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb);
+int      homa_bind(struct socket *sk, struct sockaddr_unsized *addr,
+		   int addr_len);
+void     homa_close(struct sock *sock, long timeout);
 void     homa_consume_rx_skb(struct sk_buff *skb);
 int      homa_copy_to_user(struct homa_rpc *rpc);
 void     homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc);
 void     homa_destroy(struct homa *homa);
 void     homa_dispatch_pkts(struct sk_buff *skb);
+int      homa_err_handler_v4(struct sk_buff *skb, u32 info);
+int      homa_err_handler_v6(struct sk_buff *skb,
+			     struct inet6_skb_parm *opt, u8 type,  u8 code,
+			     int offset, __be32 info);
 int      homa_fill_data_interleaved(struct homa_rpc *rpc,
 				    struct sk_buff *skb, struct iov_iter *iter);
 struct homa_gap *homa_gap_alloc(struct list_head *next, int start, int end);
+int      homa_getsockopt(struct sock *sk, int level, int optname,
+			 char __user *optval, int __user *optlen);
+int      homa_hash(struct sock *sk);
+enum hrtimer_restart homa_hrtimer(struct hrtimer *timer);
 int      homa_init(struct homa *homa);
+int      homa_ioc_info(struct socket *sock, unsigned long arg);
+int      homa_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
+int      homa_load(void);
 int      homa_message_out_fill(struct homa_rpc *rpc,
 			       struct iov_iter *iter, int xmit);
 void     homa_message_out_init(struct homa_rpc *rpc, int length);
 void     homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
 			   struct homa_rpc *rpc);
 void     homa_net_destroy(struct homa_net *hnet);
+void     homa_net_exit(struct net *net);
 int      homa_net_init(struct homa_net *hnet, struct net *net,
 		       struct homa *homa);
+int      homa_net_start(struct net *net);
+__poll_t homa_poll(struct file *file, struct socket *sock,
+		   struct poll_table_struct *wait);
+int      homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+		      int flags);
 void     homa_request_retrans(struct homa_rpc *rpc);
 void     homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc,
 			 struct homa_sock *hsk);
 void     homa_rpc_handoff(struct homa_rpc *rpc);
 int      homa_rpc_tx_end(struct homa_rpc *rpc);
+int      homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
+int      homa_setsockopt(struct sock *sk, int level, int optname,
+			 sockptr_t optval, unsigned int optlen);
+int      homa_shutdown(struct socket *sock, int how);
+int      homa_socket(struct sock *sk);
+int      homa_softirq(struct sk_buff *skb);
 void     homa_spin(int ns);
 void     homa_timer(struct homa *homa);
 void     homa_timer_check_rpc(struct homa_rpc *rpc);
@@ -391,7 +417,9 @@ int      homa_timer_main(void *transport);
 struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc,
 				       struct iov_iter *iter, int offset,
 				       int length, int max_seg_data);
+void     homa_unhash(struct sock *sk);
 void     homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc);
+void     homa_unload(void);
 int      homa_wait_private(struct homa_rpc *rpc, int nonblocking);
 struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking);
 int      homa_xmit_control(enum homa_packet_type type, void *contents,
diff --git a/net/homa/homa_plumbing.c b/net/homa/homa_plumbing.c
new file mode 100644
index 000000000000..67cc5bd347b0
--- /dev/null
+++ b/net/homa/homa_plumbing.c
@@ -0,0 +1,1254 @@
+// SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0+
+
+/* This file consists mostly of "glue" that hooks Homa into the rest of
+ * the Linux kernel. The guts of the protocol are in other files.
+ */
+
+#include "homa_impl.h"
+#include "homa_peer.h"
+#include "homa_pool.h"
+
+/* Identifier for retrieving Homa-specific data for a struct net. */
+unsigned int homa_net_id;
+
+/* This structure defines functions that allow Homa to be used as a
+ * pernet subsystem.
+ */
+static struct pernet_operations homa_net_ops = {
+	.init = homa_net_start,
+	.exit = homa_net_exit,
+	.id = &homa_net_id,
+	.size = sizeof(struct homa_net)
+};
+
+/* Global data for Homa. Avoid referencing directly except when there is
+ * no alternative (instead, use a homa pointer stored in a struct or
+ * passed via a parameter). This allows overriding during unit tests.
+ */
+static struct homa homa_data;
+
+/* This structure defines functions that handle various operations on
+ * Homa sockets. These functions are relatively generic: they are called
+ * to implement top-level system calls. Many of these operations can
+ * be implemented by PF_INET6 functions that are independent of the
+ * Homa protocol.
+ */
+static const struct proto_ops homa_proto_ops = {
+	.family		   = PF_INET,
+	.owner		   = THIS_MODULE,
+	.release	   = inet_release,
+	.bind		   = homa_bind,
+	.connect	   = inet_dgram_connect,
+	.socketpair	   = sock_no_socketpair,
+	.accept		   = sock_no_accept,
+	.getname	   = inet_getname,
+	.poll		   = homa_poll,
+	.ioctl		   = homa_ioctl,
+	.listen		   = sock_no_listen,
+	.shutdown	   = homa_shutdown,
+	.setsockopt	   = sock_common_setsockopt,
+	.getsockopt	   = sock_common_getsockopt,
+	.sendmsg	   = inet_sendmsg,
+	.recvmsg	   = inet_recvmsg,
+	.mmap		   = sock_no_mmap,
+	.set_peek_off	   = sk_set_peek_off,
+};
+
+static const struct proto_ops homav6_proto_ops = {
+	.family		   = PF_INET6,
+	.owner		   = THIS_MODULE,
+	.release	   = inet6_release,
+	.bind		   = homa_bind,
+	.connect	   = inet_dgram_connect,
+	.socketpair	   = sock_no_socketpair,
+	.accept		   = sock_no_accept,
+	.getname	   = inet6_getname,
+	.poll		   = homa_poll,
+	.ioctl		   = homa_ioctl,
+	.listen		   = sock_no_listen,
+	.shutdown	   = homa_shutdown,
+	.setsockopt	   = sock_common_setsockopt,
+	.getsockopt	   = sock_common_getsockopt,
+	.sendmsg	   = inet_sendmsg,
+	.recvmsg	   = inet_recvmsg,
+	.mmap		   = sock_no_mmap,
+	.set_peek_off	   = sk_set_peek_off,
+};
+
+/* This structure also defines functions that handle various operations
+ * on Homa sockets. However, these functions are lower-level than those
+ * in homa_proto_ops: they are specific to the PF_INET or PF_INET6
+ * protocol family, and in many cases they are invoked by functions in
+ * homa_proto_ops. Most of these functions have Homa-specific implementations.
+ */
+static struct proto homa_prot = {
+	.name		   = "HOMA",
+	.owner		   = THIS_MODULE,
+	.close		   = homa_close,
+	.connect	   = ip4_datagram_connect,
+	.init		   = homa_socket,
+	.destroy	   = homa_sock_destroy,
+	.setsockopt	   = homa_setsockopt,
+	.getsockopt	   = homa_getsockopt,
+	.sendmsg	   = homa_sendmsg,
+	.recvmsg	   = homa_recvmsg,
+	.hash		   = homa_hash,
+	.unhash		   = homa_unhash,
+	.obj_size	   = sizeof(struct homa_sock),
+	.no_autobind       = 1,
+};
+
+static struct proto homav6_prot = {
+	.name		   = "HOMAv6",
+	.owner		   = THIS_MODULE,
+	.close		   = homa_close,
+	.connect	   = ip6_datagram_connect,
+	.init		   = homa_socket,
+	.destroy	   = homa_sock_destroy,
+	.setsockopt	   = homa_setsockopt,
+	.getsockopt	   = homa_getsockopt,
+	.sendmsg	   = homa_sendmsg,
+	.recvmsg	   = homa_recvmsg,
+	.hash		   = homa_hash,
+	.unhash		   = homa_unhash,
+	.obj_size	   = sizeof(struct homa_v6_sock),
+	.ipv6_pinfo_offset = offsetof(struct homa_v6_sock, inet6),
+	.no_autobind       = 1,
+};
+
+/* Top-level structure describing the Homa protocol. */
+static struct inet_protosw homa_protosw = {
+	.type              = SOCK_DGRAM,
+	.protocol          = IPPROTO_HOMA,
+	.prot              = &homa_prot,
+	.ops               = &homa_proto_ops,
+	.flags             = INET_PROTOSW_REUSE,
+};
+
+static struct inet_protosw homav6_protosw = {
+	.type              = SOCK_DGRAM,
+	.protocol          = IPPROTO_HOMA,
+	.prot              = &homav6_prot,
+	.ops               = &homav6_proto_ops,
+	.flags             = INET_PROTOSW_REUSE,
+};
+
+/* This structure is used by IP to deliver incoming Homa packets to us. */
+static struct net_protocol homa_protocol = {
+	.handler =	homa_softirq,
+	.err_handler =	homa_err_handler_v4,
+	.no_policy =     1,
+};
+
+static struct inet6_protocol homav6_protocol = {
+	.handler =	homa_softirq,
+	.err_handler =	homa_err_handler_v6,
+	.flags =        INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+};
+
+/* Sizes of the headers for each Homa packet type, in bytes. */
+static u16 header_lengths[] = {
+	sizeof(struct homa_data_hdr),
+	0,
+	sizeof(struct homa_resend_hdr),
+	sizeof(struct homa_rpc_unknown_hdr),
+	sizeof(struct homa_busy_hdr),
+	0,
+	0,
+	sizeof(struct homa_need_ack_hdr),
+	sizeof(struct homa_ack_hdr)
+};
+
+/* Thread that runs timer code to detect lost packets and crashed peers. */
+static struct task_struct *timer_kthread;
+static DECLARE_COMPLETION(timer_thread_done);
+
+/* Used to wakeup timer_kthread at regular intervals. */
+static struct hrtimer hrtimer;
+
+/* Nonzero is an indication to the timer thread that it should exit. */
+static int timer_thread_exit;
+
+/**
+ * homa_load() - invoked when this module is loaded into the Linux kernel.
+ * Initializes the global Homa state, registers the IPv4 and IPv6 proto,
+ * protosw, and protocol handlers plus the pernet subsystem, and starts
+ * the timer thread. On failure, everything that was set up is undone.
+ * Return: 0 on success, otherwise a negative errno.
+ */
+int __init homa_load(void)
+{
+	struct homa *homa = &homa_data;
+	bool init_protocol6 = false;
+	bool init_protosw6 = false;
+	bool init_protocol = false;
+	bool init_protosw = false;
+	bool init_net_ops = false;
+	bool init_proto6 = false;
+	bool init_proto = false;
+	bool init_homa = false;
+	int status;
+
+	/* Compile-time validations that no packet header is longer
+	 * than HOMA_MAX_HEADER.
+	 */
+	BUILD_BUG_ON(sizeof(struct homa_data_hdr) > HOMA_MAX_HEADER);
+	BUILD_BUG_ON(sizeof(struct homa_resend_hdr) > HOMA_MAX_HEADER);
+	BUILD_BUG_ON(sizeof(struct homa_rpc_unknown_hdr) > HOMA_MAX_HEADER);
+	BUILD_BUG_ON(sizeof(struct homa_busy_hdr) > HOMA_MAX_HEADER);
+	BUILD_BUG_ON(sizeof(struct homa_need_ack_hdr) > HOMA_MAX_HEADER);
+	BUILD_BUG_ON(sizeof(struct homa_ack_hdr) > HOMA_MAX_HEADER);
+
+	/* Extra constraints on data packets:
+	 * - Ensure minimum header length so Homa doesn't have to worry about
+	 *   padding data packets.
+	 * - Make sure data packet headers are a multiple of 4 bytes (needed
+	 *   for TCP/TSO compatibility).
+	 */
+	BUILD_BUG_ON(sizeof(struct homa_data_hdr) < HOMA_MIN_PKT_LENGTH);
+	BUILD_BUG_ON((sizeof(struct homa_data_hdr) -
+		      sizeof(struct homa_seg_hdr)) & 0x3);
+
+	/* Detect size changes in uAPI structs. */
+	BUILD_BUG_ON(sizeof(struct homa_sendmsg_args) != 24);
+	BUILD_BUG_ON(sizeof(struct homa_recvmsg_args) != 88);
+
+	status = homa_init(homa);
+	if (status)
+		goto error;
+	init_homa = true;
+
+	status = proto_register(&homa_prot, 1);
+	if (status != 0) {
+		pr_err("proto_register failed for homa_prot: %d\n", status);
+		goto error;
+	}
+	init_proto = true;
+
+	status = proto_register(&homav6_prot, 1);
+	if (status != 0) {
+		pr_err("proto_register failed for homav6_prot: %d\n", status);
+		goto error;
+	}
+	init_proto6 = true;
+
+	inet_register_protosw(&homa_protosw);
+	init_protosw = true;
+
+	status = inet6_register_protosw(&homav6_protosw);
+	if (status != 0) {
+		pr_err("inet6_register_protosw failed in %s: %d\n", __func__,
+		       status);
+		goto error;
+	}
+	init_protosw6 = true;
+
+	status = inet_add_protocol(&homa_protocol, IPPROTO_HOMA);
+	if (status != 0) {
+		pr_err("inet_add_protocol failed in %s: %d\n", __func__,
+		       status);
+		goto error;
+	}
+	init_protocol = true;
+
+	status = inet6_add_protocol(&homav6_protocol, IPPROTO_HOMA);
+	if (status != 0) {
+		pr_err("inet6_add_protocol failed in %s: %d\n",  __func__,
+		       status);
+		goto error;
+	}
+	init_protocol6 = true;
+
+	status = register_pernet_subsys(&homa_net_ops);
+	if (status != 0) {
+		pr_err("Homa got error from register_pernet_subsys: %d\n",
+		       status);
+		goto error;
+	}
+	init_net_ops = true;
+
+	timer_kthread = kthread_run(homa_timer_main, homa, "homa_timer");
+	if (IS_ERR(timer_kthread)) {
+		status = PTR_ERR(timer_kthread);
+		pr_err("couldn't create Homa timer thread: error %d\n",
+		       status);
+		timer_kthread = NULL;
+		goto error;
+	}
+
+	return 0;
+
+error:
+	/* Unwind in the reverse order of initialization. In particular,
+	 * homa_destroy() runs last (as in homa_unload()) so that the global
+	 * Homa state is not torn down while the protocol handlers are still
+	 * registered.
+	 */
+	if (timer_kthread) {
+		timer_thread_exit = 1;
+		wake_up_process(timer_kthread);
+		wait_for_completion(&timer_thread_done);
+	}
+	if (init_net_ops)
+		unregister_pernet_subsys(&homa_net_ops);
+	if (init_protocol6)
+		inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA);
+	if (init_protocol)
+		inet_del_protocol(&homa_protocol, IPPROTO_HOMA);
+	if (init_protosw6)
+		inet6_unregister_protosw(&homav6_protosw);
+	if (init_protosw)
+		inet_unregister_protosw(&homa_protosw);
+	if (init_proto6)
+		proto_unregister(&homav6_prot);
+	if (init_proto)
+		proto_unregister(&homa_prot);
+	if (init_homa)
+		homa_destroy(homa);
+	return status;
+}
+
+/**
+ * homa_unload() - invoked when this module is unloaded from the Linux kernel.
+ */
+void __exit homa_unload(void)
+{
+	struct homa *homa = &homa_data;
+
+	pr_notice("Homa module unloading\n");
+
+	if (timer_kthread) {
+		timer_thread_exit = 1;
+		wake_up_process(timer_kthread);
+		wait_for_completion(&timer_thread_done);
+	}
+	unregister_pernet_subsys(&homa_net_ops);
+	inet_del_protocol(&homa_protocol, IPPROTO_HOMA);
+	inet_unregister_protosw(&homa_protosw);
+	inet6_del_protocol(&homav6_protocol, IPPROTO_HOMA);
+	inet6_unregister_protosw(&homav6_protosw);
+	proto_unregister(&homa_prot);
+	proto_unregister(&homav6_prot);
+	homa_destroy(homa);
+}
+
+module_init(homa_load);
+module_exit(homa_unload);
+
+/**
+ * homa_net_start() - Initialize Homa for a new network namespace.
+ * @net:    The net that Homa will be associated with.
+ * Return:  0 on success, otherwise a negative errno.
+ */
+int homa_net_start(struct net *net)
+{
+	pr_notice("Homa attaching to net namespace\n");
+	return homa_net_init(homa_net(net), net, &homa_data);
+}
+
+/**
+ * homa_net_exit() - Perform Homa cleanup needed when a network namespace
+ * is destroyed.
+ * @net:    The net from which Homa should be removed.
+ */
+void homa_net_exit(struct net *net)
+{
+	pr_notice("Homa detaching from net namespace\n");
+	homa_net_destroy(homa_net(net));
+}
+
+/**
+ * homa_bind() - Implements the bind system call for Homa sockets: associates
+ * a well-known service port with a socket. Unlike other AF_INET6 protocols,
+ * there is no need to invoke this system call for sockets that are only
+ * used as clients.
+ * @sock:     Socket on which the system call was invoked.
+ * @addr:     Contains the desired port number.
+ * @addr_len: Number of bytes in @addr.
+ * Return:    0 on success, otherwise a negative errno. Sets hsk->error_msg
+ *            on errors.
+ */
+int homa_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len)
+{
+	union sockaddr_in_union *addr_in = (union sockaddr_in_union *)addr;
+	struct homa_sock *hsk = homa_sk(sock->sk);
+	int port = 0;
+
+	if (unlikely(addr->sa_family != sock->sk->sk_family)) {
+		hsk->error_msg = "address family in bind address didn't match socket";
+		return -EAFNOSUPPORT;
+	}
+	if (addr_in->in6.sin6_family == AF_INET6) {
+		if (addr_len < sizeof(struct sockaddr_in6)) {
+			hsk->error_msg = "ipv6 address too short";
+			return -EINVAL;
+		}
+		port = ntohs(addr_in->in6.sin6_port);
+	} else if (addr_in->in4.sin_family == AF_INET) {
+		if (addr_len < sizeof(struct sockaddr_in)) {
+			hsk->error_msg = "ipv4 address too short";
+			return -EINVAL;
+		}
+		port = ntohs(addr_in->in4.sin_port);
+	}
+	return homa_sock_bind(hsk->hnet, hsk, port);
+}
+
+/**
+ * homa_close() - Invoked when close system call is invoked on a Homa socket.
+ * @sk:      Socket being closed
+ * @timeout: Not used by Homa.
+ */
+void homa_close(struct sock *sk, long timeout)
+{
+	struct homa_sock *hsk = homa_sk(sk);
+
+	homa_sock_shutdown(hsk);
+	sk_common_release(sk);
+}
+
+/**
+ * homa_shutdown() - Implements the shutdown system call for Homa sockets.
+ * @sock:    Socket to shut down.
+ * @how:     Ignored: for other sockets, can independently shut down
+ *           sending and receiving, but for Homa any shutdown will
+ *           shut down everything.
+ *
+ * Return: 0 on success, otherwise a negative errno.
+ */
+int homa_shutdown(struct socket *sock, int how)
+{
+	homa_sock_shutdown(homa_sk(sock->sk));
+	return 0;
+}
+
+/**
+ * homa_ioc_info() - The top-level function that implements the
+ * HOMAIOCINFO ioctl for Homa sockets. Reads a homa_info struct from user
+ * space, walks the socket's active RPCs, copies one homa_rpc_info record
+ * per non-dead RPC into the user buffer at hinfo.rpc_info for as long as
+ * space remains, and then writes the updated homa_info (including the
+ * socket's most recent error message, if any) back to user space.
+ * @sock:  Socket for this request
+ * @arg:   The address in user space of the argument to ioctl, which
+ *          is a homa_info struct.
+ *
+ * Return:  0 on success, otherwise a negative errno. Sets hsk->error_msg
+ *          on errors.
+ */
+int homa_ioc_info(struct socket *sock, unsigned long arg)
+{
+	struct homa_sock *hsk = homa_sk(sock->sk);
+	/* NOTE(review): rinfo lives on the stack and is copied to user space
+	 * below; confirm that homa_rpc_get_info() fills every byte of it
+	 * (including any padding).
+	 */
+	struct homa_rpc_info rinfo;
+	struct homa_info hinfo;
+	struct homa_rpc *rpc;
+	int bytes_avl;
+	char *dst;
+
+	if (unlikely(copy_from_user(&hinfo, (void __user *)arg,
+				    sizeof(hinfo)))) {
+		hsk->error_msg = "invalid address for homa_info";
+		return -EFAULT;
+	}
+
+	if (!homa_protect_rpcs(hsk)) {
+		hsk->error_msg = "socket has been shut down";
+		return -ESHUTDOWN;
+	}
+	rcu_read_lock();
+	hinfo.bpool_avail_bytes = homa_pool_avail_bytes(hsk->buffer_pool);
+	hinfo.port = hsk->port;
+	/* dst and bytes_avl track the unused part of the caller's buffer. */
+	dst = (char *)hinfo.rpc_info;
+	bytes_avl = hinfo.rpc_info_length;
+	hinfo.num_rpcs = 0;
+	list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) {
+		homa_rpc_lock(rpc);
+		if (rpc->state == RPC_DEAD) {
+			homa_rpc_unlock(rpc);
+			continue;
+		}
+		homa_rpc_get_info(rpc, &rinfo);
+		homa_rpc_unlock(rpc);
+		/* NOTE(review): bytes_avl is a signed int compared against a
+		 * size_t; a negative rpc_info_length from user space would
+		 * convert to a large unsigned value and pass this test.
+		 * Confirm whether that needs to be rejected.
+		 */
+		if (dst && bytes_avl >= sizeof(rinfo)) {
+			/* NOTE(review): copy_to_user() may sleep on a page
+			 * fault, and this call is made between
+			 * rcu_read_lock() and rcu_read_unlock(), where
+			 * sleeping is normally not allowed. Confirm that
+			 * this is safe here.
+			 */
+			if (copy_to_user((void __user *)dst, &rinfo,
+					 sizeof(rinfo))) {
+				rcu_read_unlock();
+				homa_unprotect_rpcs(hsk);
+				hsk->error_msg = "couldn't copy homa_rpc_info to user space: invalid or read-only address?";
+				return -EFAULT;
+			}
+			dst += sizeof(rinfo);
+			bytes_avl -= sizeof(rinfo);
+		}
+		/* Counted even when the record did not fit in the buffer. */
+		hinfo.num_rpcs++;
+	}
+	rcu_read_unlock();
+	homa_unprotect_rpcs(hsk);
+
+	/* Report the error message currently recorded for the socket. */
+	if (hsk->error_msg)
+		snprintf(hinfo.error_msg, HOMA_ERROR_MSG_SIZE, "%s",
+			 hsk->error_msg);
+	else
+		hinfo.error_msg[0] = 0;
+
+	if (copy_to_user((void __user *)arg, &hinfo, sizeof(hinfo))) {
+		hsk->error_msg = "couldn't copy homa_info to user space: read-only address?";
+		return -EFAULT;
+	}
+	return 0;
+}
+
+/**
+ * homa_ioctl() - Implements the ioctl system call for Homa sockets.
+ * @sock:  Socket on which the system call was invoked.
+ * @cmd:   Identifier for a particular ioctl operation.
+ * @arg:   Operation-specific argument; typically the address of a block
+ *         of data in user address space.
+ *
+ * Return: 0 on success, otherwise a negative errno. Sets hsk->error_msg
+ *         on errors.
+ */
+int homa_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	if (cmd == HOMAIOCINFO)
+		return homa_ioc_info(sock, arg);
+	homa_sk(sock->sk)->error_msg = "ioctl opcode isn't supported by Homa";
+	return -EINVAL;
+}
+
+/**
+ * homa_socket() - Implements the socket(2) system call for Homa sockets.
+ * @sk:    Socket on which the system call was invoked. The non-Homa
+ *         parts have already been initialized.
+ *
+ * Return: whatever homa_sock_init() returns (0 on success).
+ */
+int homa_socket(struct sock *sk)
+{
+	return homa_sock_init(homa_sk(sk));
+}
+
+/**
+ * homa_setsockopt() - Implements the setsockopt system call for Homa sockets.
+ * @sk:      Socket on which the system call was invoked.
+ * @level:   Level at which the operation should be handled; will always
+ *           be IPPROTO_HOMA.
+ * @optname: Identifies a particular setsockopt operation.
+ * @optval:  Address in user space of information about the option.
+ * @optlen:  Number of bytes of data at @optval.
+ * Return:   0 on success, otherwise a negative errno. Sets hsk->error_msg
+ *           on errors.
+ */
+int homa_setsockopt(struct sock *sk, int level, int optname,
+		    sockptr_t optval, unsigned int optlen)
+{
+	struct homa_sock *hsk = homa_sk(sk);
+	int ret;
+
+	if (level != IPPROTO_HOMA) {
+		hsk->error_msg = "homa_setsockopt invoked with level not IPPROTO_HOMA";
+		return -ENOPROTOOPT;
+	}
+
+	if (optname == SO_HOMA_RCVBUF) {
+		struct homa_rcvbuf_args args;
+
+		if (optlen != sizeof(struct homa_rcvbuf_args)) {
+			hsk->error_msg = "invalid optlen argument: must be sizeof(struct homa_rcvbuf_args)";
+			return -EINVAL;
+		}
+
+		if (copy_from_sockptr(&args, optval, optlen)) {
+			hsk->error_msg = "invalid address for homa_rcvbuf_args";
+			return -EFAULT;
+		}
+
+		/* Do a trivial test to make sure we can at least write the
+		 * first page of the region.
+		 */
+		if (copy_to_user(u64_to_user_ptr(args.start), &args,
+				 sizeof(args))) {
+			hsk->error_msg = "receive buffer region is not writable";
+			return -EFAULT;
+		}
+
+		ret = homa_pool_set_region(hsk, u64_to_user_ptr(args.start),
+					   args.length);
+	} else if (optname == SO_HOMA_SERVER) {
+		int arg;
+
+		if (optlen != sizeof(arg)) {
+			hsk->error_msg = "invalid optlen argument: must be sizeof(int)";
+			return -EINVAL;
+		}
+
+		if (copy_from_sockptr(&arg, optval, optlen)) {
+			hsk->error_msg = "invalid address for SO_HOMA_SERVER value";
+			return -EFAULT;
+		}
+
+		if (arg)
+			hsk->is_server = true;
+		else
+			hsk->is_server = false;
+		ret = 0;
+	} else {
+		hsk->error_msg = "setsockopt option not supported by Homa";
+		ret = -ENOPROTOOPT;
+	}
+	return ret;
+}
+
+/**
+ * homa_getsockopt() - Implements the getsockopt system call for Homa sockets.
+ * @sk:      Socket on which the system call was invoked.
+ * @level:   Selects level in the network stack to handle the request;
+ *           must be IPPROTO_HOMA.
+ * @optname: Identifies a particular getsockopt operation.
+ * @optval:  Address in user space where the option's value should be stored.
+ * @optlen:  Number of bytes available at optval; will be overwritten with
+ *           actual number of bytes stored.
+ * Return:   0 on success, otherwise a negative errno. Sets hsk->error_msg
+ *           on errors.
+ */
+int homa_getsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int __user *optlen)
+{
+	struct homa_sock *hsk = homa_sk(sk);
+	struct homa_rcvbuf_args rcvbuf_args;
+	int is_server;
+	void *result;
+	int len;
+
+	if (copy_from_sockptr(&len, USER_SOCKPTR(optlen), sizeof(int))) {
+		hsk->error_msg = "invalid address for optlen argument to getsockopt";
+		return -EFAULT;
+	}
+
+	if (level != IPPROTO_HOMA) {
+		hsk->error_msg = "homa_getsockopt invoked with level not IPPROTO_HOMA";
+		return -ENOPROTOOPT;
+	}
+	if (optname == SO_HOMA_RCVBUF) {
+		if (len < sizeof(rcvbuf_args)) {
+			hsk->error_msg = "invalid optlen argument: must be sizeof(struct homa_rcvbuf_args)";
+			return -EINVAL;
+		}
+
+		homa_sock_lock(hsk);
+		homa_pool_get_rcvbuf(hsk->buffer_pool, &rcvbuf_args);
+		homa_sock_unlock(hsk);
+		len = sizeof(rcvbuf_args);
+		result = &rcvbuf_args;
+	} else if (optname == SO_HOMA_SERVER) {
+		if (len < sizeof(is_server)) {
+			hsk->error_msg = "invalid optlen argument: must be sizeof(int)";
+			return -EINVAL;
+		}
+
+		is_server = hsk->is_server;
+		len = sizeof(is_server);
+		result = &is_server;
+	} else {
+		hsk->error_msg = "getsockopt option not supported by Homa";
+		return -ENOPROTOOPT;
+	}
+
+	if (copy_to_sockptr(USER_SOCKPTR(optlen), &len, sizeof(int))) {
+		hsk->error_msg = "couldn't update optlen argument to getsockopt: read-only?";
+		return -EFAULT;
+	}
+
+	if (copy_to_sockptr(USER_SOCKPTR(optval), result, len)) {
+		hsk->error_msg = "couldn't update optval argument to getsockopt: read-only?";
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+/**
+ * homa_sendmsg() - Send a request or response message on a Homa socket.
+ * @sk:     Socket on which the system call was invoked.
+ * @msg:    Structure describing the message to send; the msg_control
+ *          field points to additional information.
+ * @length: Number of bytes of the message.
+ * Return:  0 on success, otherwise a negative errno. Sets hsk->error_msg
+ *          on errors.
+ */
+int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length)
+{
+	DECLARE_SOCKADDR(union sockaddr_in_union *, addr, msg->msg_name);
+	struct homa_sock *hsk = homa_sk(sk);
+	struct homa_sendmsg_args args;
+	struct homa_rpc *rpc = NULL;
+	int result = 0;
+
+	if (!addr) {
+		hsk->error_msg = "no msg_name passed to sendmsg";
+		result = -EINVAL;
+		goto error;
+	}
+
+	if (unlikely(!msg->msg_control_is_user)) {
+		hsk->error_msg = "msg_control argument for sendmsg isn't in user space";
+		result = -EINVAL;
+		goto error;
+	}
+	if (unlikely(copy_from_user(&args, (void __user *)msg->msg_control,
+				    sizeof(args)))) {
+		hsk->error_msg = "invalid address for msg_control argument to sendmsg";
+		result = -EFAULT;
+		goto error;
+	}
+	if (args.flags & ~HOMA_SENDMSG_VALID_FLAGS ||
+	    args.reserved != 0) {
+		hsk->error_msg = "reserved fields in homa_sendmsg_args must be zero";
+		result = -EINVAL;
+		goto error;
+	}
+
+	if (!homa_sock_wmem_avl(hsk)) {
+		result = homa_sock_wait_wmem(hsk,
+					     msg->msg_flags & MSG_DONTWAIT);
+		if (result != 0)
+			goto error;
+	}
+
+	if (addr->sa.sa_family != sk->sk_family) {
+		hsk->error_msg = "address family in sendmsg address must match the socket";
+		result = -EAFNOSUPPORT;
+		goto error;
+	}
+	if (msg->msg_namelen < sizeof(struct sockaddr_in) ||
+	    (msg->msg_namelen < sizeof(struct sockaddr_in6) &&
+	     addr->in6.sin6_family == AF_INET6)) {
+		hsk->error_msg = "msg_namelen too short";
+		result = -EINVAL;
+		goto error;
+	}
+
+	if (!args.id) {
+		/* This is a request message. */
+		rpc = homa_rpc_alloc_client(hsk, addr);
+		if (IS_ERR(rpc)) {
+			result = PTR_ERR(rpc);
+			rpc = NULL;
+			goto error;
+		}
+		homa_rpc_hold(rpc);
+		if (args.flags & HOMA_SENDMSG_PRIVATE)
+			set_bit(RPC_PRIVATE, &rpc->flags);
+		rpc->completion_cookie = args.completion_cookie;
+		result = homa_message_out_fill(rpc, &msg->msg_iter, 1);
+		if (result)
+			goto error;
+		args.id = rpc->id;
+		homa_rpc_unlock(rpc); /* Locked by homa_rpc_alloc_client. */
+
+		if (unlikely(copy_to_user((void __user *)msg->msg_control,
+					  &args, sizeof(args)))) {
+			homa_rpc_lock(rpc);
+			hsk->error_msg = "couldn't update homa_sendmsg_args argument to sendmsg: read-only?";
+			result = -EFAULT;
+			goto error;
+		}
+		homa_rpc_put(rpc);
+	} else {
+		/* This is a response message. */
+		struct in6_addr canonical_dest;
+
+		if (args.completion_cookie != 0) {
+			hsk->error_msg = "completion_cookie must be zero when sending responses";
+			result = -EINVAL;
+			goto error;
+		}
+		canonical_dest = canonical_ipv6_addr(addr);
+
+		rpc = homa_rpc_find_server(hsk, &canonical_dest, args.id);
+		if (!rpc) {
+			/* Return without an error if the RPC doesn't exist;
+			 * this could be totally valid (e.g. client is
+			 * no longer interested in it).
+			 */
+			return 0;
+		}
+		homa_rpc_hold(rpc);
+		if (rpc->error) {
+			hsk->error_msg = "RPC has failed, so can't send response";
+			result = rpc->error;
+			goto error;
+		}
+		if (rpc->state != RPC_IN_SERVICE) {
+			hsk->error_msg = "RPC is not in a state where a response can be sent";
+			result = -EINVAL;
+			goto error_dont_end_rpc;
+		}
+		rpc->state = RPC_OUTGOING;
+
+		result = homa_message_out_fill(rpc, &msg->msg_iter, 1);
+		if (result && rpc->state != RPC_DEAD)
+			goto error;
+		homa_rpc_put(rpc);
+		homa_rpc_unlock(rpc); /* Locked by homa_rpc_find_server. */
+	}
+	return 0;
+
+error:
+	if (rpc)
+		homa_rpc_end(rpc);
+
+error_dont_end_rpc:
+	if (rpc) {
+		homa_rpc_put(rpc);
+
+		/* Locked by homa_rpc_find_server or homa_rpc_alloc_client. */
+		homa_rpc_unlock(rpc);
+	}
+	return result;
+}
+
+/**
+ * homa_recvmsg() - Receive a message from a Homa socket.
+ * @sk:          Socket on which the system call was invoked.
+ * @msg:         Controlling information for the receive.
+ * @len:         Total bytes of space available in msg->msg_iov; not used.
+ * @flags:       Flags from system call; only MSG_DONTWAIT is used.
+ * Return:       The length of the message on success, otherwise a negative
+ *               errno. Sets hsk->error_msg on errors.
+ */
+int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags)
+{
+	struct homa_sock *hsk = homa_sk(sk);
+	struct homa_recvmsg_args control;
+	struct homa_rpc *rpc = NULL;
+	int nonblocking;
+	int result;
+
+	if (unlikely(!msg->msg_control)) {
+		/* This test isn't strictly necessary, but it provides a
+		 * hook for testing kernel call times.
+		 */
+		hsk->error_msg = "no msg_control passed to recvmsg";
+		return -EINVAL;
+	}
+	if (msg->msg_controllen != sizeof(control)) {
+		hsk->error_msg = "invalid msg_controllen in recvmsg";
+		return -EINVAL;
+	}
+	if (unlikely(copy_from_user(&control, (void __user *)msg->msg_control,
+				    sizeof(control)))) {
+		hsk->error_msg = "invalid address for msg_control argument to recvmsg";
+		return -EFAULT;
+	}
+	control.completion_cookie = 0;
+
+	if (control.num_bpages > HOMA_MAX_BPAGES) {
+		hsk->error_msg = "num_pages exceeds HOMA_MAX_BPAGES";
+		result = -EINVAL;
+		goto done;
+	}
+	if (control.reserved != 0) {
+		hsk->error_msg = "reserved fields in homa_recvmsg_args must be zero";
+		result = -EINVAL;
+		goto done;
+	}
+	if (!hsk->buffer_pool) {
+		hsk->error_msg = "SO_HOMA_RECVBUF socket option has not been set";
+		result = -EINVAL;
+		goto done;
+	}
+	result = homa_pool_free_bufs(hsk->buffer_pool, control.num_bpages,
+				     control.bpage_offsets);
+	control.num_bpages = 0;
+	if (result != 0) {
+		hsk->error_msg = "error while releasing buffer pages";
+		goto done;
+	}
+
+	nonblocking = flags & MSG_DONTWAIT;
+	if (control.id != 0) {
+		rpc = homa_rpc_find_client(hsk, control.id); /* Locks RPC. */
+		if (!rpc) {
+			hsk->error_msg = "invalid RPC id passed to recvmsg";
+			result = -EINVAL;
+			goto done;
+		}
+		homa_rpc_hold(rpc);
+		result = homa_wait_private(rpc, nonblocking);
+		if (result != 0) {
+			hsk->error_msg = "error while waiting for private RPC to complete";
+			control.id = 0;
+			goto done;
+		}
+	} else {
+		rpc = homa_wait_shared(hsk, nonblocking);
+		if (IS_ERR(rpc)) {
+			/* If we get here, it means there was an error that
+			 * prevented us from finding an RPC to return. Errors
+			 * in the RPC itself are handled below.
+			 */
+			hsk->error_msg = "error while waiting for shared RPC to complete";
+			result = PTR_ERR(rpc);
+			rpc = NULL;
+			goto done;
+		}
+	}
+	if (rpc->error) {
+		hsk->error_msg = "RPC failed";
+		result = rpc->error;
+	} else {
+		result = rpc->msgin.length;
+	}
+
+	/* Collect result information. */
+	control.id = rpc->id;
+	control.completion_cookie = rpc->completion_cookie;
+	if (likely(rpc->msgin.length >= 0)) {
+		control.num_bpages = rpc->msgin.num_bpages;
+		memcpy(control.bpage_offsets, rpc->msgin.bpage_offsets,
+		       sizeof(rpc->msgin.bpage_offsets));
+	}
+	if (msg->msg_name) {
+		if (sk->sk_family == AF_INET6) {
+			DECLARE_SOCKADDR(struct sockaddr_in6 *, in6,
+					 msg->msg_name);
+
+			in6->sin6_family = AF_INET6;
+			in6->sin6_port = htons(rpc->dport);
+			in6->sin6_addr = rpc->peer->addr;
+			msg->msg_namelen = sizeof(*in6);
+		} else {
+			DECLARE_SOCKADDR(struct sockaddr_in *, in4,
+					 msg->msg_name);
+
+			in4->sin_family = AF_INET;
+			in4->sin_port = htons(rpc->dport);
+			in4->sin_addr.s_addr = ipv6_to_ipv4(rpc->peer->addr);
+			msg->msg_namelen = sizeof(*in4);
+		}
+	}
+
+	/* This indicates that the application now owns the buffers, so
+	 * we won't free them in homa_rpc_end.
+	 */
+	rpc->msgin.num_bpages = 0;
+
+	if (homa_is_client(rpc->id)) {
+		homa_peer_add_ack(rpc);
+		homa_rpc_end(rpc);
+	} else {
+		if (result < 0)
+			homa_rpc_end(rpc);
+		else
+			rpc->state = RPC_IN_SERVICE;
+	}
+
+done:
+	/* Note: must release the RPC lock before calling homa_rpc_reap
+	 * or copying results to user space.
+	 */
+	if (rpc) {
+		homa_rpc_put(rpc);
+
+		/* Locked by homa_rpc_find_client or homa_wait_shared. */
+		homa_rpc_unlock(rpc);
+	}
+
+	if (test_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags)) {
+		/* There are tasks waiting for tx memory, so reap
+		 * immediately.
+		 */
+		homa_rpc_reap(hsk, true);
+	}
+
+	if (unlikely(copy_to_user((__force void __user *)msg->msg_control,
+				  &control, sizeof(control)))) {
+		hsk->error_msg = "couldn't update homa_recvmsg_args argument to recvmsg: read-only?";
+		result = -EFAULT;
+	}
+
+	return result;
+}
+
+/**
+ * homa_hash() - Not needed for Homa.
+ * @sk:    Socket for the operation
+ * Return: Always 0.
+ */
+int homa_hash(struct sock *sk)
+{
+	return 0;
+}
+
+/**
+ * homa_unhash() - Not needed for Homa.
+ * @sk:    Socket for the operation
+ */
+void homa_unhash(struct sock *sk)
+{
+}
+
+/**
+ * homa_softirq() - This function is invoked at SoftIRQ level to handle
+ * incoming packets.
+ * @skb:   The incoming packet.
+ * Return: Always 0
+ */
+int homa_softirq(struct sk_buff *skb)
+{
+	struct sk_buff *packets, *other_pkts, *next;
+	struct sk_buff **prev_link, **other_link;
+	enum skb_drop_reason reason;
+	struct homa_common_hdr *h;
+	int header_offset;
+
+	/* skb may actually contain many distinct packets, linked through
+	 * skb_shinfo(skb)->frag_list by the Homa GRO mechanism. Make a
+	 * pass through the list to process all of the short packets,
+	 * leaving the longer packets in the list. Also, perform various
+	 * prep/cleanup/error checking functions.
+	 */
+	skb->next = skb_shinfo(skb)->frag_list;
+	skb_shinfo(skb)->frag_list = NULL;
+	packets = skb;
+	prev_link = &packets;
+	for (skb = packets; skb; skb = next) {
+		next = skb->next;
+
+		/* Make the header available at skb->data, even if the packet
+		 * is fragmented. One complication: it's possible that the IP
+		 * header hasn't yet been removed (this happens for GRO packets
+		 * on the frag_list, since IP doesn't handle them explicitly).
+		 */
+		if (!homa_make_header_avl(skb)) {
+			reason = SKB_DROP_REASON_HDR_TRUNC;
+			goto discard;
+		}
+		header_offset = skb_transport_header(skb) - skb->data;
+		if (header_offset)
+			__skb_pull(skb, header_offset);
+
+		/* Reject packets that are too short or have bogus types. */
+		h = (struct homa_common_hdr *)skb->data;
+		if (unlikely(skb->len < sizeof(struct homa_common_hdr) ||
+			     h->type < DATA || h->type > MAX_OP ||
+			     skb->len < header_lengths[h->type - DATA])) {
+			reason = SKB_DROP_REASON_PKT_TOO_SMALL;
+			goto discard;
+		}
+
+		/* Process the packet now if it is a control packet or
+		 * if it contains an entire short message.
+		 */
+		if (h->type != DATA || ntohl(((struct homa_data_hdr *)h)
+				->message_length) < 1400) {
+			*prev_link = skb->next;
+			skb->next = NULL;
+			homa_dispatch_pkts(skb);
+		} else {
+			prev_link = &skb->next;
+		}
+		continue;
+
+discard:
+		*prev_link = skb->next;
+		kfree_skb_reason(skb, reason);
+	}
+
+	/* Now process the longer packets. Each iteration of this loop
+	 * collects all of the packets for a particular RPC and dispatches
+	 * them (batching the packets for an RPC allows more efficient
+	 * generation of grants).
+	 */
+	while (packets) {
+		struct in6_addr saddr, saddr2;
+		struct homa_common_hdr *h2;
+		struct sk_buff *skb2;
+
+		skb = packets;
+		prev_link = &skb->next;
+		saddr = skb_canonical_ipv6_saddr(skb);
+		other_pkts = NULL;
+		other_link = &other_pkts;
+		h = (struct homa_common_hdr *)skb->data;
+		for (skb2 = skb->next; skb2; skb2 = next) {
+			next = skb2->next;
+			h2 = (struct homa_common_hdr *)skb2->data;
+			if (h2->sender_id == h->sender_id) {
+				saddr2 = skb_canonical_ipv6_saddr(skb2);
+				if (ipv6_addr_equal(&saddr, &saddr2)) {
+					*prev_link = skb2;
+					prev_link = &skb2->next;
+					continue;
+				}
+			}
+			*other_link = skb2;
+			other_link = &skb2->next;
+		}
+		*prev_link = NULL;
+		*other_link = NULL;
+		homa_dispatch_pkts(packets);
+		packets = other_pkts;
+	}
+
+	return 0;
+}
+
+/**
+ * homa_err_handler_v4() - Invoked by IP to handle an incoming error
+ * packet, such as ICMP UNREACHABLE.
+ * @skb:    The incoming packet; skb->data points to the byte just
+ *          after the ICMP header (the first byte of the embedded
+ *          packet IP header).
+ * @info:   Information about the error; used here only for logging.
+ *
+ * Return: Always 0.
+ */
+int homa_err_handler_v4(struct sk_buff *skb, u32 info)
+{
+	struct homa *homa = homa_net(dev_net(skb->dev))->homa;
+	const struct icmphdr *icmp = icmp_hdr(skb);
+	struct in6_addr daddr;
+	int type = icmp->type;
+	int code = icmp->code;
+	struct iphdr *iph;
+	int error = 0;
+	int port = 0;
+
+	iph = (struct iphdr *)(skb->data);
+	ipv6_addr_set_v4mapped(iph->daddr, &daddr);
+	if (type == ICMP_DEST_UNREACH && code == ICMP_PORT_UNREACH) {
+		struct homa_common_hdr *h = (struct homa_common_hdr *)(skb->data
+				+ iph->ihl * 4);
+
+		port = ntohs(h->dport);
+		error = -ENOTCONN;
+	} else if (type == ICMP_DEST_UNREACH) {
+		if (code == ICMP_PROT_UNREACH)
+			error = -EPROTONOSUPPORT;
+		else
+			error = -EHOSTUNREACH;
+	} else {
+		pr_notice("%s invoked with info %x, ICMP type %d, ICMP code %d\n",
+			  __func__, info, type, code);
+	}
+	if (error != 0)
+		homa_abort_rpcs(homa, &daddr, port, error);
+	return 0;
+}
+
+/**
+ * homa_err_handler_v6() - Invoked by IP to handle an incoming error
+ * packet, such as ICMP UNREACHABLE.
+ * @skb:    The incoming packet; skb->data points to the byte just after
+ *          the ICMP header (the first byte of the embedded packet IP header).
+ * @opt:    Not used.
+ * @type:   Type of ICMP packet.
+ * @code:   Additional information about the error.
+ * @offset: Not used.
+ * @info:   Not used.
+ *
+ * Return: Always 0.
+ */
+int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *opt,
+			u8 type,  u8 code,  int offset,  __be32 info)
+{
+	const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
+	struct homa *homa = homa_net(dev_net(skb->dev))->homa;
+	int error = 0;
+	int port = 0;
+
+	if (type == ICMPV6_DEST_UNREACH && code == ICMPV6_PORT_UNREACH) {
+		const struct homa_common_hdr *h;
+
+		h = (struct homa_common_hdr *)(skb->data + sizeof(*iph));
+		port = ntohs(h->dport);
+		error = -ENOTCONN;
+	} else if (type == ICMPV6_DEST_UNREACH && code == ICMPV6_ADDR_UNREACH) {
+		error = -EHOSTUNREACH;
+	} else if (type == ICMPV6_PARAMPROB && code == ICMPV6_UNK_NEXTHDR) {
+		error = -EPROTONOSUPPORT;
+	}
+	if (error != 0)
+		homa_abort_rpcs(homa, &iph->daddr, port, error);
+	return 0;
+}
+
+/**
+ * homa_poll() - Invoked by Linux as part of implementing select, poll,
+ * epoll, etc.
+ * @file:  Open file that is participating in a poll, select, etc.
+ * @sock:  A Homa socket, associated with @file.
+ * @wait:  This table will be registered with the socket, so that it
+ *         is notified when the socket's ready state changes.
+ *
+ * Return: A mask of bits such as EPOLLIN, which indicate the current
+ *         state of the socket.
+ */
+__poll_t homa_poll(struct file *file, struct socket *sock,
+		   struct poll_table_struct *wait)
+{
+	struct homa_sock *hsk = homa_sk(sock->sk);
+	__poll_t mask;
+
+	mask = 0;
+	sock_poll_wait(file, sock, wait);
+	if (homa_sock_wmem_avl(hsk))
+		mask |= EPOLLOUT | EPOLLWRNORM;
+	else
+		set_bit(SOCK_NOSPACE, &hsk->sock.sk_socket->flags);
+
+	if (hsk->shutdown)
+		mask |= EPOLLIN;
+
+	if (!list_empty(&hsk->ready_rpcs))
+		mask |= EPOLLIN | EPOLLRDNORM;
+	return mask;
+}
+
+/**
+ * homa_hrtimer() - This function is invoked by the hrtimer mechanism to
+ * wake up the timer thread. Runs at IRQ level.
+ * @timer:   The timer that triggered; not used.
+ *
+ * Return:   Always HRTIMER_NORESTART.
+ */
+enum hrtimer_restart homa_hrtimer(struct hrtimer *timer)
+{
+	wake_up_process(timer_kthread);
+	return HRTIMER_NORESTART;
+}
+
+/**
+ * homa_timer_main() - Top-level function for the timer thread.
+ * @transport:  Pointer to struct homa.
+ *
+ * Return:         Always 0.
+ */
+int homa_timer_main(void *transport)
+{
+	struct homa *homa = (struct homa *)transport;
+	ktime_t tick_interval;
+	u64 nsec;
+
+	hrtimer_setup(&hrtimer, homa_hrtimer, CLOCK_MONOTONIC,
+		      HRTIMER_MODE_REL);
+	nsec = 1000000;                   /* 1 ms */
+	tick_interval = ns_to_ktime(nsec);
+	while (1) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!timer_thread_exit) {
+			hrtimer_start(&hrtimer, tick_interval,
+				      HRTIMER_MODE_REL);
+			schedule();
+		}
+		__set_current_state(TASK_RUNNING);
+		if (timer_thread_exit)
+			break;
+		homa_timer(homa);
+	}
+	hrtimer_cancel(&hrtimer);
+	kthread_complete_and_exit(&timer_thread_done, 0);
+	return 0;
+}
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("John Ousterhout <ouster@cs.stanford.edu>");
+MODULE_DESCRIPTION("Homa transport protocol");
+MODULE_VERSION("1.0");
+
+/* Arrange for this module to be loaded automatically when a Homa socket is
+ * opened. Apparently symbols don't work in the macros below, so must use
+ * numeric values for IPPROTO_HOMA (146) and SOCK_DGRAM(2).
+ */
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 146, 2);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 146, 2);
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v18 13/15] net: homa: create homa_timer.c
From: John Ousterhout @ 2026-04-10 20:03 UTC (permalink / raw)
  To: netdev; +Cc: pabeni, edumazet, horms, kuba, John Ousterhout
In-Reply-To: <20260410200310.1915-1-ouster@cs.stanford.edu>

This file contains code that wakes up periodically to check for
missing data, initiate retransmissions, and declare peer nodes
"dead".

Signed-off-by: John Ousterhout <ouster@cs.stanford.edu>

---
Changes for v14:
* Use new homa_rpc_tx_end function

Changes for v11:
* Cleanup sparse annotations.

Changes for v10:
* Refactor resend mechanism

Changes for v9:
* Reflect changes in socket and peer management
* Minor name changes for clarity

Changes for v7:
* Interface changes to homa_sock_start_scan etc.
* Remove locker argument from locking functions
* Use u64 and __u64 properly
---
 net/homa/homa_impl.h  |   3 +
 net/homa/homa_timer.c | 136 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 139 insertions(+)
 create mode 100644 net/homa/homa_timer.c

diff --git a/net/homa/homa_impl.h b/net/homa/homa_impl.h
index b1ca8b25fb9a..4a8df43003de 100644
--- a/net/homa/homa_impl.h
+++ b/net/homa/homa_impl.h
@@ -385,6 +385,9 @@ void     homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc,
 void     homa_rpc_handoff(struct homa_rpc *rpc);
 int      homa_rpc_tx_end(struct homa_rpc *rpc);
 void     homa_spin(int ns);
+void     homa_timer(struct homa *homa);
+void     homa_timer_check_rpc(struct homa_rpc *rpc);
+int      homa_timer_main(void *transport);
 struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc,
 				       struct iov_iter *iter, int offset,
 				       int length, int max_seg_data);
diff --git a/net/homa/homa_timer.c b/net/homa/homa_timer.c
new file mode 100644
index 000000000000..dcfdcc06c8ab
--- /dev/null
+++ b/net/homa/homa_timer.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
+
+/* This file handles timing-related functions for Homa, such as retries
+ * and timeouts.
+ */
+
+#include "homa_impl.h"
+#include "homa_peer.h"
+#include "homa_rpc.h"
+#include "homa_stub.h"
+
+/**
+ * homa_timer_check_rpc() -  Invoked for each RPC during each timer pass; does
+ * most of the work of checking for time-related actions such as sending
+ * resends, aborting RPCs for which there is no response, and sending
+ * requests for acks. It is separate from homa_timer because homa_timer
+ * got too long and deeply indented.
+ * @rpc:     RPC to check; must be locked by the caller.
+ */
+void homa_timer_check_rpc(struct homa_rpc *rpc)
+	__must_hold(rpc->bucket->lock)
+{
+	struct homa *homa = rpc->hsk->homa;
+	int tx_end = homa_rpc_tx_end(rpc);
+
+	/* See if we need to request an ack for this RPC. */
+	if (!homa_is_client(rpc->id) && rpc->state == RPC_OUTGOING &&
+	    tx_end == rpc->msgout.length) {
+		if (rpc->done_timer_ticks == 0) {
+			rpc->done_timer_ticks = homa->timer_ticks;
+		} else {
+			/* >= comparison that handles tick wrap-around. */
+			if ((rpc->done_timer_ticks + homa->request_ack_ticks
+					- 1 - homa->timer_ticks) & 1 << 31) {
+				struct homa_need_ack_hdr h;
+
+				homa_xmit_control(NEED_ACK, &h, sizeof(h), rpc);
+			}
+		}
+	}
+
+	if (rpc->state == RPC_INCOMING) {
+		if (rpc->msgin.num_bpages == 0) {
+			/* Waiting for buffer space, so no problem. */
+			rpc->silent_ticks = 0;
+			return;
+		}
+	} else if (!homa_is_client(rpc->id)) {
+		/* We're the server and we've received the input message;
+		 * no need to worry about retries.
+		 */
+		rpc->silent_ticks = 0;
+		return;
+	}
+
+	if (rpc->state == RPC_OUTGOING) {
+		if (tx_end < rpc->msgout.length) {
+			/* There are granted bytes that we haven't transmitted,
+			 * so no need to be concerned; the ball is in our court.
+			 */
+			rpc->silent_ticks = 0;
+			return;
+		}
+	}
+
+	if (rpc->silent_ticks < homa->resend_ticks)
+		return;
+	if (rpc->silent_ticks >= homa->timeout_ticks) {
+		homa_rpc_abort(rpc, -ETIMEDOUT);
+		return;
+	}
+	if (((rpc->silent_ticks - homa->resend_ticks) % homa->resend_interval)
+			== 0)
+		homa_request_retrans(rpc);
+}
+
+/**
+ * homa_timer() - This function is invoked at regular intervals ("ticks")
+ * to implement retries and aborts for Homa.
+ * @homa:    Overall data about the Homa protocol implementation.
+ */
+void homa_timer(struct homa *homa)
+{
+	struct homa_socktab_scan scan;
+	struct homa_sock *hsk;
+	struct homa_rpc *rpc;
+	int rpc_count = 0;
+
+	homa->timer_ticks++;
+
+	/* Scan all existing RPCs in all sockets. */
+	for (hsk = homa_socktab_start_scan(homa->socktab, &scan);
+			hsk; hsk = homa_socktab_next(&scan)) {
+		while (hsk->dead_skbs >= homa->dead_buffs_limit)
+			/* If we get here, it means that Homa isn't keeping
+			 * up with RPC reaping, so we'll help out.  See
+			 * "RPC Reaping Strategy" in homa_rpc_reap code for
+			 * details.
+			 */
+			if (homa_rpc_reap(hsk, false) == 0)
+				break;
+
+		if (list_empty(&hsk->active_rpcs) || hsk->shutdown)
+			continue;
+
+		if (!homa_protect_rpcs(hsk))
+			continue;
+		rcu_read_lock();
+		list_for_each_entry_rcu(rpc, &hsk->active_rpcs, active_links) {
+			homa_rpc_lock(rpc);
+			if (rpc->state == RPC_IN_SERVICE) {
+				rpc->silent_ticks = 0;
+				homa_rpc_unlock(rpc);
+				continue;
+			}
+			rpc->silent_ticks++;
+			homa_timer_check_rpc(rpc);
+			homa_rpc_unlock(rpc);
+			rpc_count++;
+			if (rpc_count >= 10) {
+				/* Give other kernel threads a chance to run
+				 * on this core.
+				 */
+				rcu_read_unlock();
+				schedule();
+				rcu_read_lock();
+				rpc_count = 0;
+			}
+		}
+		rcu_read_unlock();
+		homa_unprotect_rpcs(hsk);
+	}
+	homa_socktab_end_scan(&scan);
+	homa_skb_release_pages(homa);
+	homa_peer_gc(homa->peertab);
+}
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v18 12/15] net: homa: create homa_incoming.c
From: John Ousterhout @ 2026-04-10 20:03 UTC (permalink / raw)
  To: netdev; +Cc: pabeni, edumazet, horms, kuba, John Ousterhout
In-Reply-To: <20260410200310.1915-1-ouster@cs.stanford.edu>

This file contains most of the code for handling incoming packets,
including top-level dispatching code plus specific handlers for each
packet type. It also contains code for dispatching fully-received
messages to waiting application threads.

Signed-off-by: John Ousterhout <ouster@cs.stanford.edu>

Changes for v18:
* Create new function homa_consume_rx_skb (which uses skb_attempt_defer_free)
  and use that function instead of consume_skb.
* Make sure unused fields of outgoing packets are zeroes.
* Improve error handling in homa_ack_pkt.

Changes for v16:
* Use kfree_skb_reason and consume_skb instead of kfree_skb
* Use set_bit/clear_bit/test_bit
* Simplify handling of acks; eliminate acks and num_acks variables
  in homa_dispatch_pkts
* Check for wmem in homa_dispatch_pkts (fixes deadlock over wmem)
* Use new homa_lock_preempt() function

Changes for v14:
* Use new homa_rpc_tx_end function
* Fix race in homa_wait_shared (an RPC could get lost if it became
  ready at the same time that homa_interest_wait returned with an error)
* Handle nonblocking behavior here, rather than in homa_interest.c
* Change API for homa_wait_private to distinguish errors in an RPC from
  errors that prevented the wait operation from completing.

Changes for v11:
* Cleanup and simplify use of RPC reference counts.
* Cleanup sparse annotations.
* Rework the mechanism for waking up RPCs that stalled waiting for
  buffer pool space.

Changes for v10:
* Revise sparse annotations to eliminate __context__ definition
* Refactor resend mechanism (new function homa_request_retrans replaces
  homa_gap_retry)
* Remove log messages after alloc errors
* Fix socket cleanup race

Changes for v9:
* Add support for homa_net objects
* Use new homa_clock abstraction layer
* Various name improvements (e.g. use "alloc" instead of "new" for functions
  that allocate memory)

Changes for v7:
* API change for homa_rpc_handoff
* Refactor waiting mechanism for incoming packets: simplify wait
  criteria and use standard Linux mechanisms for waiting, use
  new homa_interest struct
* Reject unauthorized incoming request messages
* Improve documentation for code that spins (and reduce spin length)
* Use RPC reference counts, eliminate RPC_HANDING_OFF flag
* Replace erroneous use of "safe" list iteration with "rcu" version
* Remove locker argument from locking functions
* Check incoming messages against HOMA_MAX_MESSAGE_LENGTH
* Use u64 and __u64 properly
---
 net/homa/homa_impl.h     |  16 +
 net/homa/homa_incoming.c | 906 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 922 insertions(+)
 create mode 100644 net/homa/homa_incoming.c

diff --git a/net/homa/homa_impl.h b/net/homa/homa_impl.h
index e997f1f6cb3b..b1ca8b25fb9a 100644
--- a/net/homa/homa_impl.h
+++ b/net/homa/homa_impl.h
@@ -359,22 +359,38 @@ static inline bool homa_make_header_avl(struct sk_buff *skb)
 
 extern unsigned int homa_net_id;
 
+void     homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
+		      struct homa_rpc *rpc);
+void     homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb);
+void     homa_consume_rx_skb(struct sk_buff *skb);
+int      homa_copy_to_user(struct homa_rpc *rpc);
+void     homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc);
 void     homa_destroy(struct homa *homa);
+void     homa_dispatch_pkts(struct sk_buff *skb);
 int      homa_fill_data_interleaved(struct homa_rpc *rpc,
 				    struct sk_buff *skb, struct iov_iter *iter);
+struct homa_gap *homa_gap_alloc(struct list_head *next, int start, int end);
 int      homa_init(struct homa *homa);
 int      homa_message_out_fill(struct homa_rpc *rpc,
 			       struct iov_iter *iter, int xmit);
 void     homa_message_out_init(struct homa_rpc *rpc, int length);
+void     homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
+			   struct homa_rpc *rpc);
 void     homa_net_destroy(struct homa_net *hnet);
 int      homa_net_init(struct homa_net *hnet, struct net *net,
 		       struct homa *homa);
+void     homa_request_retrans(struct homa_rpc *rpc);
+void     homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc,
+			 struct homa_sock *hsk);
 void     homa_rpc_handoff(struct homa_rpc *rpc);
 int      homa_rpc_tx_end(struct homa_rpc *rpc);
 void     homa_spin(int ns);
 struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc,
 				       struct iov_iter *iter, int offset,
 				       int length, int max_seg_data);
+void     homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc);
+int      homa_wait_private(struct homa_rpc *rpc, int nonblocking);
+struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking);
 int      homa_xmit_control(enum homa_packet_type type, void *contents,
 			   size_t length, struct homa_rpc *rpc);
 int      __homa_xmit_control(void *contents, size_t length,
diff --git a/net/homa/homa_incoming.c b/net/homa/homa_incoming.c
new file mode 100644
index 000000000000..7cb9a096cbec
--- /dev/null
+++ b/net/homa/homa_incoming.c
@@ -0,0 +1,906 @@
+// SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+
+
+/* This file contains functions that handle incoming Homa messages. */
+
+#include "homa_impl.h"
+#include "homa_interest.h"
+#include "homa_peer.h"
+#include "homa_pool.h"
+
+/**
+ * homa_message_in_init() - Constructor for homa_message_in.
+ * @rpc:          RPC whose msgin structure should be initialized. The
+ *                msgin struct is assumed to be zeroes.
+ * @length:       Total number of bytes in message.
+ * Return:        Zero for successful initialization, or a negative errno
+ *                if rpc->msgin could not be initialized.
+ */
+int homa_message_in_init(struct homa_rpc *rpc, int length)
+	__must_hold(rpc->bucket->lock)
+{
+	int err;
+
+	if (length > HOMA_MAX_MESSAGE_LENGTH)
+		return -EINVAL;
+
+	rpc->msgin.length = length;
+	__skb_queue_head_init(&rpc->msgin.packets);
+	INIT_LIST_HEAD(&rpc->msgin.gaps);
+	rpc->msgin.bytes_remaining = length;
+	err = homa_pool_alloc_msg(rpc);
+	if (err != 0) {
+		rpc->msgin.length = -1;
+		return err;
+	}
+	return 0;
+}
+
+/**
+ * homa_gap_alloc() - Allocate a new gap and add it to a gap list.
+ * @next:   Add the new gap just before this list element.
+ * @start:  Offset of first byte covered by the gap.
+ * @end:    Offset of byte just after the last one covered by the gap.
+ * Return:  Pointer to the new gap, or NULL if memory couldn't be allocated
+ *          for the gap object.
+ */
+struct homa_gap *homa_gap_alloc(struct list_head *next, int start, int end)
+{
+	struct homa_gap *gap;
+
+	gap = kmalloc_obj(*gap, GFP_ATOMIC);
+	if (!gap)
+		return NULL;
+	gap->start = start;
+	gap->end = end;
+	gap->time = homa_clock();
+	list_add_tail(&gap->links, next);
+	return gap;
+}
+
+/**
+ * homa_request_retrans() - The function is invoked when it appears that
+ * data packets for a message have been lost. It issues RESEND requests
+ * as appropriate and may modify the state of the RPC.
+ * @rpc:     RPC for which incoming data is delinquent; must be locked by
+ *           caller.
+ */
+void homa_request_retrans(struct homa_rpc *rpc)
+	__must_hold(rpc->bucket->lock)
+{
+	struct homa_resend_hdr resend;
+	struct homa_gap *gap;
+	int offset, length;
+
+	if (rpc->msgin.length >= 0) {
+		/* Issue RESENDS for any gaps in incoming data. */
+		list_for_each_entry(gap, &rpc->msgin.gaps, links) {
+			resend.offset = htonl(gap->start);
+			resend.length = htonl(gap->end - gap->start);
+			homa_xmit_control(RESEND, &resend, sizeof(resend), rpc);
+		}
+
+		/* Issue a RESEND for any granted data after the last gap. */
+		offset = rpc->msgin.recv_end;
+		length = rpc->msgin.length - rpc->msgin.recv_end;
+		if (length <= 0)
+			return;
+	} else {
+		/* No data has been received for the RPC. Ask the sender to
+		 * resend everything it has sent so far.
+		 */
+		offset = 0;
+		length = -1;
+	}
+
+	resend.offset = htonl(offset);
+	resend.length = htonl(length);
+	homa_xmit_control(RESEND, &resend, sizeof(resend), rpc);
+}
+
+/**
+ * homa_add_packet() - Add an incoming packet to the contents of a
+ * partially received message.
+ * @rpc:   Add the packet to the msgin for this RPC.
+ * @skb:   The new packet. This function takes ownership of the packet
+ *         (the packet will either be freed or added to rpc->msgin.packets).
+ */
+void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb)
+	__must_hold(rpc->bucket->lock)
+{
+	struct homa_data_hdr *h = (struct homa_data_hdr *)skb->data;
+	struct homa_gap *gap, *dummy, *gap2;
+	int start = ntohl(h->seg.offset);
+	int length = homa_data_len(skb);
+	enum skb_drop_reason reason;
+	int end = start + length;
+
+	if ((start + length) > rpc->msgin.length) {
+		reason = SKB_DROP_REASON_PKT_TOO_BIG;
+		goto discard;
+	}
+
+	if (start == rpc->msgin.recv_end) {
+		/* Common case: packet is sequential. */
+		rpc->msgin.recv_end += length;
+		goto keep;
+	}
+
+	if (start > rpc->msgin.recv_end) {
+		/* Packet creates a new gap. */
+		if (!homa_gap_alloc(&rpc->msgin.gaps,
+				    rpc->msgin.recv_end, start)) {
+			reason = SKB_DROP_REASON_NOMEM;
+			goto discard;
+		}
+		rpc->msgin.recv_end = end;
+		goto keep;
+	}
+
+	/* Must now check to see if the packet fills in part or all of
+	 * an existing gap.
+	 */
+	list_for_each_entry_safe(gap, dummy, &rpc->msgin.gaps, links) {
+		/* Is packet at the start of this gap? */
+		if (start <= gap->start) {
+			if (end <= gap->start)
+				continue;
+			if (start < gap->start) {
+				reason = SKB_DROP_REASON_DUP_FRAG;
+				goto discard;
+			}
+			if (end > gap->end) {
+				reason = SKB_DROP_REASON_DUP_FRAG;
+				goto discard;
+			}
+			gap->start = end;
+			if (gap->start >= gap->end) {
+				list_del(&gap->links);
+				kfree(gap);
+			}
+			goto keep;
+		}
+
+		/* Is packet at the end of this gap? BTW, at this point we know
+		 * the packet can't cover the entire gap.
+		 */
+		if (end >= gap->end) {
+			if (start >= gap->end)
+				continue;
+			if (end > gap->end) {
+				reason = SKB_DROP_REASON_DUP_FRAG;
+				goto discard;
+			}
+			gap->end = start;
+			goto keep;
+		}
+
+		/* Packet is in the middle of the gap; must split the gap. */
+		gap2 = homa_gap_alloc(&gap->links, gap->start, start);
+		if (!gap2) {
+			reason = SKB_DROP_REASON_NOMEM;
+			goto discard;
+		}
+		gap2->time = gap->time;
+		gap->start = end;
+		goto keep;
+	}
+	/* Packet doesn't overlap any gap, so it is a duplicate. */
+	reason = SKB_DROP_REASON_DUP_FRAG;
+
+discard:
+	kfree_skb_reason(skb, reason);
+	return;
+
+keep:
+	__skb_queue_tail(&rpc->msgin.packets, skb);
+	rpc->msgin.bytes_remaining -= length;
+}
+
+/**
+ * homa_consume_rx_skb() - Invoked to free an incoming skb that has been
+ * processed normally. Contains optimizations to minimize overhead during
+ * the execution of this function.
+ * @skb:    Buffer to free. Should be for an incoming skb, which was
+ *          processed normally.
+ */
+void homa_consume_rx_skb(struct sk_buff *skb)
+{
+	skb_orphan(skb);
+	skb_attempt_defer_free(skb);
+}
+
+/**
+ * homa_copy_to_user() - Copy as much data as possible from incoming
+ * packet buffers to buffers in user space. Packets are removed from
+ * @rpc->msgin.packets in batches of up to MAX_SKBS; every packet in a
+ * batch is freed once the batch has been processed, even if an error
+ * cut the copy short.
+ * @rpc:     RPC for which data should be copied. Must be locked by caller.
+ * Return:   Zero for success or a negative errno if there is an error
+ *           (a failure from import_ubuf() or skb_copy_datagram_iter()
+ *           is returned as is).
+ *           It is possible for the RPC to be freed while this function
+ *           executes (it releases and reacquires the RPC lock). If that
+ *           happens, -EINVAL will be returned and the state of @rpc
+ *           will be RPC_DEAD. Clears the RPC_PKTS_READY bit in @rpc->flags
+ *           if all available packets have been copied out.
+ */
+int homa_copy_to_user(struct homa_rpc *rpc)
+	__must_hold(rpc->bucket->lock)
+{
+#define MAX_SKBS 20
+	struct sk_buff *skbs[MAX_SKBS];
+	int error = 0;
+	int n = 0;             /* Number of filled entries in skbs. */
+	int i;
+
+	/* Tricky note: we can't hold the RPC lock while we're actually
+	 * copying to user space, because (a) it's illegal to hold a spinlock
+	 * while copying to user space and (b) we'd like for homa_softirq
+	 * to add more packets to the RPC while we're copying these out.
+	 * So, collect a bunch of packets to copy, then release the lock,
+	 * copy them, and reacquire the lock. Each iteration of the loop
+	 * below either collects one more packet or copies out the batch
+	 * collected so far.
+	 */
+	while (true) {
+		struct sk_buff *skb;
+
+		if (rpc->state == RPC_DEAD) {
+			error = -EINVAL;
+			break;
+		}
+
+		skb = __skb_dequeue(&rpc->msgin.packets);
+		if (skb) {
+			skbs[n] = skb;
+			n++;
+			if (n < MAX_SKBS)
+				continue;
+		}
+		if (n == 0) {
+			clear_bit(RPC_PKTS_READY, &rpc->flags);
+			break;
+		}
+
+		/* At this point we've collected a batch of packets (or
+		 * run out of packets); copy any available packets out to
+		 * user space.
+		 */
+		homa_rpc_unlock(rpc);
+
+		/* Each iteration of this loop copies out one skb. */
+		for (i = 0; i < n; i++) {
+			struct homa_data_hdr *h = (struct homa_data_hdr *)
+					skbs[i]->data;
+			int pkt_length = homa_data_len(skbs[i]);
+			int offset = ntohl(h->seg.offset);
+			int buf_bytes, chunk_size;
+			struct iov_iter iter;
+			int copied = 0;
+			char __user *dst;
+
+			/* Each iteration of this loop copies to one
+			 * user buffer. If the buffer returned by
+			 * homa_pool_get_buffer has less space than the
+			 * data remaining in the skb, only that much is
+			 * copied and the loop asks for another buffer.
+			 */
+			while (copied < pkt_length) {
+				chunk_size = pkt_length - copied;
+				dst = homa_pool_get_buffer(rpc, offset + copied,
+							   &buf_bytes);
+				if (buf_bytes < chunk_size) {
+					if (buf_bytes == 0) {
+						/* skb has data beyond message
+						 * end? Skip the rest of this
+						 * skb and go on to the next
+						 * one (no error is reported).
+						 */
+						break;
+					}
+					chunk_size = buf_bytes;
+				}
+				error = import_ubuf(READ, dst, chunk_size,
+						    &iter);
+				if (error)
+					goto free_skbs;
+				error = skb_copy_datagram_iter(skbs[i],
+							       sizeof(*h) +
+							       copied,  &iter,
+							       chunk_size);
+				if (error)
+					goto free_skbs;
+				copied += chunk_size;
+			}
+		}
+
+free_skbs:
+		for (i = 0; i < n; i++)
+			homa_consume_rx_skb(skbs[i]);
+		n = 0;
+		homa_rpc_lock_preempt(rpc);
+		if (error)
+			break;
+	}
+	return error;
+}
+
+/**
+ * homa_dispatch_pkts() - Top-level function that processes a batch of packets,
+ * all related to the same RPC. The destination port and RPC id are taken
+ * from the first packet only and are used to locate the socket and the
+ * RPC for the whole batch. Every packet in the batch is consumed: it is
+ * either passed to a type-specific handler or freed here. Before
+ * returning, this function may also reap dead RPCs for the socket.
+ * @skb:       First packet in the batch, linked through skb->next.
+ */
+void homa_dispatch_pkts(struct sk_buff *skb)
+{
+#define MAX_ACKS 10
+	const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb);
+	struct homa_data_hdr *h = (struct homa_data_hdr *)skb->data;
+	u64 id = homa_local_id(h->common.sender_id);
+	int dport = ntohs(h->common.dport);
+	struct homa_rpc *rpc = NULL;
+	struct homa_sock *hsk;
+	struct homa_net *hnet;
+	struct sk_buff *next;
+
+	/* Find the appropriate socket. If there is none, or if we would
+	 * be the server for this RPC but the socket is not a server,
+	 * report the port as unreachable (using the first packet) and
+	 * drop the entire batch.
+	 */
+	hnet = homa_net(dev_net(skb->dev));
+	hsk = homa_sock_find(hnet, dport);
+	if (!hsk || (!homa_is_client(id) && !hsk->is_server)) {
+		if (skb_is_ipv6(skb))
+			icmp6_send(skb, ICMPV6_DEST_UNREACH,
+				   ICMPV6_PORT_UNREACH, 0, NULL, IP6CB(skb));
+		else
+			icmp_send(skb, ICMP_DEST_UNREACH,
+				  ICMP_PORT_UNREACH, 0);
+		while (skb) {
+			next = skb->next;
+			kfree_skb(skb);
+			skb = next;
+		}
+		if (hsk)
+			sock_put(&hsk->sock);
+		return;
+	}
+
+	/* Each iteration through the following loop processes one packet.
+	 * Once an RPC has been found it stays locked and referenced
+	 * across iterations; it is released after the loop.
+	 */
+	for (; skb; skb = next) {
+		h = (struct homa_data_hdr *)skb->data;
+		next = skb->next;
+
+		/* Relinquish the RPC lock temporarily if it's needed
+		 * elsewhere.
+		 */
+		if (rpc) {
+			if (test_bit(APP_NEEDS_LOCK, &rpc->flags)) {
+				homa_rpc_unlock(rpc);
+
+				/* This short spin is needed to ensure that the
+				 * other thread gets the lock before this thread
+				 * grabs it again below (the need for this
+				 * was confirmed experimentally in 2/2025;
+				 * without it, the handoff fails 20-25% of the
+				 * time). Furthermore, the call to homa_spin
+				 * seems to allow the other thread to acquire
+				 * the lock more quickly.
+				 */
+				homa_spin(100);
+				homa_rpc_lock(rpc);
+			}
+		}
+
+		/* If we don't already have an RPC, find it, lock it,
+		 * and create a reference on it.
+		 */
+		if (!rpc) {
+			if (!homa_is_client(id)) {
+				/* We are the server for this RPC. */
+				if (h->common.type == DATA) {
+					int created;
+
+					/* Create a new RPC if one doesn't
+					 * already exist.
+					 */
+					rpc = homa_rpc_alloc_server(hsk, &saddr,
+								    h,
+								    &created);
+					if (IS_ERR(rpc)) {
+						rpc = NULL;
+						goto discard;
+					}
+				} else {
+					rpc = homa_rpc_find_server(hsk, &saddr,
+								   id);
+				}
+			} else {
+				rpc = homa_rpc_find_client(hsk, id);
+			}
+			if (rpc)
+				homa_rpc_hold(rpc);
+		}
+		if (unlikely(!rpc)) {
+			if (h->common.type != NEED_ACK &&
+			    h->common.type != ACK &&
+			    h->common.type != RESEND)
+				goto discard;
+		} else {
+			if (h->common.type == DATA ||
+			    h->common.type == BUSY)
+				rpc->silent_ticks = 0;
+			rpc->peer->outstanding_resends = 0;
+		}
+
+		switch (h->common.type) {
+		case DATA:
+			homa_data_pkt(skb, rpc);
+			break;
+		case RESEND:
+			homa_resend_pkt(skb, rpc, hsk);
+			break;
+		case RPC_UNKNOWN:
+			homa_rpc_unknown_pkt(skb, rpc);
+			break;
+		case BUSY:
+			/* Nothing to do for these packets except reset
+			 * silent_ticks, which happened above.
+			 */
+			goto discard;
+		case NEED_ACK:
+			homa_need_ack_pkt(skb, hsk, rpc);
+			break;
+		case ACK:
+			homa_ack_pkt(skb, hsk, rpc);
+			break;
+		default:
+			goto discard;
+		}
+		continue;
+
+discard:
+		kfree_skb(skb);
+	}
+	if (rpc) {
+		homa_rpc_put(rpc);
+		homa_rpc_unlock(rpc);
+	}
+
+	/* We need to reap dead RPCs here under two conditions:
+	 * 1. The socket has hit its limit on tx buffer space and threads are
+	 *    blocked waiting for skbs to be released.
+	 * 2. A large number of dead RPCs have accumulated, and it seems
+	 *    that the reaper isn't keeping up when invoked only at
+	 *    "convenient" times (see "RPC Reaping Strategy" in homa_rpc_reap
+	 *    code for details).
+	 */
+	if (hsk->dead_skbs > 0) {
+		int waiting_for_wmem = test_bit(SOCK_NOSPACE,
+						&hsk->sock.sk_socket->flags);
+		if (waiting_for_wmem ||
+		    hsk->dead_skbs >= 2 * hsk->homa->dead_buffs_limit)
+			homa_rpc_reap(hsk, waiting_for_wmem);
+	}
+	sock_put(&hsk->sock);
+}
+
+/**
+ * homa_data_pkt() - Handler for incoming DATA packets. Processes any ack
+ * piggybacked on the packet, then either adds the packet to the RPC's
+ * incoming message or drops it. If packets are queued for the RPC
+ * afterwards and RPC_PKTS_READY was not already set, sets that flag and
+ * invokes homa_rpc_handoff().
+ * @skb:     Incoming packet; size known to be large enough for the header.
+ *           This function now owns the packet.
+ * @rpc:     Information about the RPC corresponding to this packet.
+ *           Must be locked by the caller. The lock is released and
+ *           reacquired if the packet carries a piggybacked ack, so the
+ *           state of the RPC may change (e.g. to RPC_DEAD) during
+ *           the call.
+ */
+void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc)
+	__must_hold(rpc->bucket->lock)
+{
+	struct homa_data_hdr *h = (struct homa_data_hdr *)skb->data;
+
+	if (h->ack.client_id) {
+		const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb);
+
+		homa_rpc_unlock(rpc);
+		homa_rpc_acked(rpc->hsk, &saddr, &h->ack);
+		homa_rpc_lock(rpc);
+		if (rpc->state == RPC_DEAD)
+			goto discard;
+	}
+
+	if (rpc->state != RPC_INCOMING && homa_is_client(rpc->id)) {
+		if (unlikely(rpc->state != RPC_OUTGOING))
+			goto discard;
+		rpc->state = RPC_INCOMING;
+		if (homa_message_in_init(rpc, ntohl(h->message_length)) != 0)
+			goto discard;
+	} else if (rpc->state != RPC_INCOMING) {
+		/* Must be server; note that homa_rpc_alloc_server already
+		 * initialized msgin and allocated buffers.
+		 */
+		if (unlikely(rpc->msgin.length >= 0))
+			goto discard;
+	}
+
+	if (rpc->msgin.num_bpages == 0)
+		/* Drop packets that arrive when we can't allocate buffer
+		 * space. If we keep them around, packet buffer usage can
+		 * exceed available cache space, resulting in poor
+		 * performance.
+		 */
+		goto discard;
+
+	homa_add_packet(rpc, skb);
+
+	if (skb_queue_len(&rpc->msgin.packets) != 0 &&
+	    !test_bit(RPC_PKTS_READY, &rpc->flags)) {
+		set_bit(RPC_PKTS_READY, &rpc->flags);
+		homa_rpc_handoff(rpc);
+	}
+
+	return;
+
+discard:
+	kfree_skb(skb);
+}
+
+/**
+ * homa_resend_pkt() - Handler for incoming RESEND packets. If there is
+ * no matching RPC, homa_xmit_unknown() is invoked. If we are the server
+ * and the RPC is not in state RPC_OUTGOING, a BUSY packet is sent.
+ * Otherwise the requested range is passed to homa_resend_data(), with
+ * its end limited to homa_rpc_tx_end(@rpc); a length of -1 in the packet
+ * selects everything from the offset up to that limit. A BUSY packet is
+ * also sent if the requested offset is at or beyond that limit.
+ * @skb:     Incoming packet; size already verified large enough for header.
+ *           This function now owns the packet.
+ * @rpc:     Information about the RPC corresponding to this packet; must
+ *           be locked by caller, but may be NULL if there is no RPC matching
+ *           this packet.
+ * @hsk:     Socket on which the packet was received.
+ */
+void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc,
+		     struct homa_sock *hsk)
+	__must_hold(rpc->bucket->lock)
+{
+	struct homa_resend_hdr *h = (struct homa_resend_hdr *)skb->data;
+	int offset = ntohl(h->offset);
+	int length = ntohl(h->length);
+	int end = offset + length;
+	struct homa_busy_hdr busy;
+	int tx_end;
+
+	if (!rpc) {
+		homa_xmit_unknown(skb, hsk);
+		goto done;
+	}
+
+	tx_end = homa_rpc_tx_end(rpc);
+	if (!homa_is_client(rpc->id) && rpc->state != RPC_OUTGOING) {
+		/* We are the server for this RPC and don't yet have a
+		 * response message, so send BUSY to keep the client
+		 * waiting.
+		 */
+		homa_xmit_control(BUSY, &busy, sizeof(busy), rpc);
+		goto done;
+	}
+
+	if (length == -1)
+		end = tx_end;
+
+	homa_resend_data(rpc, offset, (end > tx_end) ? tx_end : end);
+
+	if (offset >= tx_end) {
+		/* We have chosen not to transmit any of the requested data;
+		 * send BUSY so the receiver knows we are alive.
+		 */
+		homa_xmit_control(BUSY, &busy, sizeof(busy), rpc);
+		goto done;
+	}
+
+done:
+	homa_consume_rx_skb(skb);
+}
+
+/**
+ * homa_rpc_unknown_pkt() - Handler for incoming RPC_UNKNOWN packets.
+ * On a client, if the RPC is in state RPC_OUTGOING, everything from
+ * offset 0 up to homa_rpc_tx_end(@rpc) is passed to homa_resend_data();
+ * in any other client state the packet is simply freed. On a server,
+ * the RPC is ended with homa_rpc_end().
+ * @skb:     Incoming packet; size known to be large enough for the header.
+ *           This function now owns the packet.
+ * @rpc:     Information about the RPC corresponding to this packet. Must
+ *           be locked by caller. Must not be NULL (it is dereferenced
+ *           without a check).
+ */
+void homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc)
+	__must_hold(rpc->bucket->lock)
+{
+	if (homa_is_client(rpc->id)) {
+		if (rpc->state == RPC_OUTGOING) {
+			int tx_end = homa_rpc_tx_end(rpc);
+
+			/* It appears that everything we've already transmitted
+			 * has been lost; retransmit it.
+			 */
+			homa_resend_data(rpc, 0, tx_end);
+			goto done;
+		}
+	} else {
+		homa_rpc_end(rpc);
+	}
+done:
+	homa_consume_rx_skb(skb);
+}
+
+/**
+ * homa_need_ack_pkt() - Handler for incoming NEED_ACK packets. If the
+ * RPC still exists and its response has not been completely received,
+ * no ack is sent and homa_request_retrans() is invoked instead.
+ * Otherwise an ACK packet is sent back to the peer, carrying any acks
+ * obtained from homa_peer_get_acks().
+ * @skb:     Incoming packet; size already verified large enough for header.
+ *           This function now owns the packet.
+ * @hsk:     Socket on which the packet was received.
+ * @rpc:     The RPC named in the packet header, or NULL if no such
+ *           RPC exists. The RPC has been locked by the caller.
+ */
+void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
+		       struct homa_rpc *rpc)
+	__must_hold(rpc->bucket->lock)
+{
+	struct homa_common_hdr *h = (struct homa_common_hdr *)skb->data;
+	const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb);
+	u64 id = homa_local_id(h->sender_id);
+	struct homa_ack_hdr ack;
+	struct homa_peer *peer;
+
+	/* Don't ack if it's not safe for the peer to purge its state
+	 * for this RPC (the RPC still exists and we haven't received
+	 * the entire response), or if we can't find peer info.
+	 */
+	if (rpc && (rpc->state != RPC_INCOMING ||
+		    rpc->msgin.bytes_remaining)) {
+		homa_request_retrans(rpc);
+		goto done;
+	} else {
+		peer = homa_peer_get(hsk, &saddr);
+		if (IS_ERR(peer))
+			goto done;
+	}
+
+	/* Send an ACK for this RPC. At the same time, include all of the
+	 * other acks available for the peer. Note: can't use rpc below,
+	 * since it may be NULL. The source and destination ports of the
+	 * incoming packet are swapped in the reply.
+	 */
+	memset(&ack, 0, sizeof(ack));
+	ack.common.type = ACK;
+	ack.common.sport = h->dport;
+	ack.common.dport = h->sport;
+	ack.common.sender_id = cpu_to_be64(id);
+	ack.num_acks = htons(homa_peer_get_acks(peer,
+						HOMA_MAX_ACKS_PER_PKT,
+						ack.acks));
+	__homa_xmit_control(&ack, sizeof(ack), peer, hsk);
+	homa_peer_release(peer);
+
+done:
+	homa_consume_rx_skb(skb);
+}
+
+/**
+ * homa_ack_pkt() - Handler for incoming ACK packets. Ends the RPC named
+ * in the packet header (if it exists), then invokes homa_rpc_acked() for
+ * each of the additional acks carried in the packet. The ack count from
+ * the header is capped at HOMA_MAX_ACKS_PER_PKT.
+ * @skb:     Incoming packet; size already verified large enough for header.
+ *           This function now owns the packet.
+ * @hsk:     Socket on which the packet was received.
+ * @rpc:     The RPC named in the packet header, or NULL if no such
+ *           RPC exists. If non-NULL, it must be locked by the caller
+ *           and homa_rpc_end() will have been invoked on it by the
+ *           time this function returns. Its lock is released and
+ *           reacquired while additional acks are processed, so it is
+ *           still locked on return.
+ */
+void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
+		  struct homa_rpc *rpc)
+	__must_hold(rpc->bucket->lock)
+{
+	const struct in6_addr saddr = skb_canonical_ipv6_saddr(skb);
+	struct homa_ack_hdr *h = (struct homa_ack_hdr *)skb->data;
+	int i, count;
+
+	if (rpc)
+		homa_rpc_end(rpc);
+
+	count = ntohs(h->num_acks);
+	if (count > HOMA_MAX_ACKS_PER_PKT)
+		count = HOMA_MAX_ACKS_PER_PKT;
+	if (count > 0) {
+		if (rpc) {
+			/* Must temporarily release rpc's lock because
+			 * homa_rpc_acked needs to acquire RPC locks.
+			 */
+			homa_rpc_unlock(rpc);
+			for (i = 0; i < count; i++)
+				homa_rpc_acked(hsk, &saddr, &h->acks[i]);
+			homa_rpc_lock(rpc);
+		} else {
+			for (i = 0; i < count; i++)
+				homa_rpc_acked(hsk, &saddr, &h->acks[i]);
+		}
+	}
+	homa_consume_rx_skb(skb);
+}
+
+/**
+ * homa_wait_private() - Waits until the response has been received for
+ * a specific RPC or the RPC has failed with an error. The RPC lock is
+ * released while this function sleeps and is held again on return.
+ * @rpc:          RPC to wait for; an error will be returned if the RPC is
+ *                not a client RPC or not private (note: the only check
+ *                made in this function is for the RPC_PRIVATE flag).
+ *                Must be locked by caller.
+ * @nonblocking:  Nonzero means return immediately if @rpc not ready.
+ * Return:        0 means that @rpc is ready for attention: either its response
+ *                has been received or it has an unrecoverable error such as
+ *                ETIMEDOUT (in rpc->error). Nonzero means some other error
+ *                (such as EINTR or EINVAL) occurred before @rpc became ready
+ *                for attention; in this case the return value is a negative
+ *                errno. -EAGAIN is returned if @nonblocking is set and
+ *                @rpc is not ready.
+ */
+int homa_wait_private(struct homa_rpc *rpc, int nonblocking)
+	__must_hold(rpc->bucket->lock)
+{
+	struct homa_interest interest;
+	int result;
+
+	if (!test_bit(RPC_PRIVATE, &rpc->flags))
+		return -EINVAL;
+
+	/* Each iteration through this loop waits until rpc needs attention
+	 * in some way (e.g. packets have arrived), then deals with that need
+	 * (e.g. copy to user space). It may take many iterations until the
+	 * RPC is ready for the application.
+	 */
+	while (1) {
+		result = 0;
+		if (!rpc->error)
+			rpc->error = homa_copy_to_user(rpc);
+		if (rpc->error)
+			break;
+		if (rpc->msgin.length >= 0 &&
+		    rpc->msgin.bytes_remaining == 0 &&
+		    skb_queue_len(&rpc->msgin.packets) == 0)
+			break;
+
+		if (nonblocking) {
+			result = -EAGAIN;
+			break;
+		}
+
+		result = homa_interest_init_private(&interest, rpc);
+		if (result != 0)
+			break;
+
+		homa_rpc_unlock(rpc);
+		result = homa_interest_wait(&interest);
+
+		homa_rpc_lock_preempt(rpc);
+		homa_interest_unlink_private(&interest);
+
+		/* Abort on error, but if the interest actually got ready
+		 * in the meantime then ignore the error (loop back around
+		 * to process the RPC).
+		 */
+		if (result != 0 && atomic_read(&interest.ready) == 0)
+			break;
+	}
+
+	return result;
+}
+
+/**
+ * homa_wait_shared() - Wait for the completion of any non-private
+ * incoming message on a socket.
+ * @hsk:          Socket on which to wait. Must not be locked.
+ * @nonblocking:  Nonzero means return immediately if no RPC is ready.
+ *
+ * Return:    Pointer to an RPC with a complete incoming message or nonzero
+ *            error field, or a negative errno (usually -EINTR) encoded
+ *            with ERR_PTR. -ESHUTDOWN is returned if the socket has been
+ *            shut down, and -EAGAIN if @nonblocking is set and no RPC
+ *            is ready. If an RPC is returned it will be locked and
+ *            referenced; the caller must release the lock and the
+ *            reference.
+ */
+struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking)
+	__cond_acquires(nonnull, rpc->bucket->lock)
+{
+	struct homa_interest interest;
+	struct homa_rpc *rpc;
+	int result;
+
+	INIT_LIST_HEAD(&interest.links);
+	init_waitqueue_head(&interest.wait_queue);
+	/* Each iteration through this loop waits until an RPC needs attention
+	 * in some way (e.g. packets have arrived), then deals with that need
+	 * (e.g. copy to user space). It may take many iterations until an
+	 * RPC is ready for the application.
+	 */
+	while (1) {
+		homa_sock_lock(hsk);
+		if (hsk->shutdown) {
+			rpc = ERR_PTR(-ESHUTDOWN);
+			homa_sock_unlock(hsk);
+			goto done;
+		}
+		if (!list_empty(&hsk->ready_rpcs)) {
+			rpc = list_first_entry(&hsk->ready_rpcs,
+					       struct homa_rpc,
+					       ready_links);
+			homa_rpc_hold(rpc);
+			list_del_init(&rpc->ready_links);
+			if (!list_empty(&hsk->ready_rpcs)) {
+				/* There are still more RPCs available, so
+				 * let Linux know.
+				 */
+				hsk->sock.sk_data_ready(&hsk->sock);
+			}
+			homa_sock_unlock(hsk);
+		} else if (nonblocking) {
+			rpc = ERR_PTR(-EAGAIN);
+			homa_sock_unlock(hsk);
+
+			/* This is a good time to clean up dead RPCs. */
+			homa_rpc_reap(hsk, false);
+			goto done;
+		} else {
+			homa_interest_init_shared(&interest, hsk);
+			homa_sock_unlock(hsk);
+			result = homa_interest_wait(&interest);
+
+			if (result != 0) {
+				int ready;
+
+				/* homa_interest_wait returned an error, so we
+				 * have to do two things. First, unlink the
+				 * interest from the socket. Second, check to
+				 * see if in the meantime the interest received
+				 * a handoff. If so, ignore the error. Very
+				 * important to hold the socket lock while
+				 * checking, in order to eliminate races with
+				 * homa_rpc_handoff.
+				 */
+				homa_sock_lock(hsk);
+				homa_interest_unlink_shared(&interest);
+				ready = atomic_read(&interest.ready);
+				homa_sock_unlock(hsk);
+				if (ready == 0) {
+					rpc = ERR_PTR(result);
+					goto done;
+				}
+			}
+
+			rpc = interest.rpc;
+			if (!rpc) {
+				rpc = ERR_PTR(-ESHUTDOWN);
+				goto done;
+			}
+		}
+
+		homa_rpc_lock_preempt(rpc);
+		if (!rpc->error)
+			rpc->error = homa_copy_to_user(rpc);
+		if (rpc->error) {
+			if (rpc->state != RPC_DEAD)
+				break;
+		} else if (rpc->msgin.bytes_remaining == 0 &&
+		    skb_queue_len(&rpc->msgin.packets) == 0)
+			break;
+		homa_rpc_put(rpc);
+		homa_rpc_unlock(rpc);
+	}
+
+done:
+	return rpc;
+}
+
+/**
+ * homa_rpc_handoff() - This function is called when the input message for
+ * an RPC is ready for attention from a user thread. It notifies a waiting
+ * reader and/or queues the RPC, as appropriate. For a private RPC this
+ * just invokes homa_interest_notify_private(). For a shared RPC nothing
+ * is done if the socket has been shut down; otherwise the RPC is given
+ * to the first interest waiting on the socket (a reference is taken on
+ * the RPC for it) or, if there is none and the RPC is not already
+ * queued, appended to the socket's list of ready RPCs.
+ * @rpc:                RPC to handoff; must be locked.
+ */
+void homa_rpc_handoff(struct homa_rpc *rpc)
+	__must_hold(rpc->bucket->lock)
+{
+	struct homa_sock *hsk = rpc->hsk;
+	struct homa_interest *interest;
+
+	if (test_bit(RPC_PRIVATE, &rpc->flags)) {
+		homa_interest_notify_private(rpc);
+		return;
+	}
+
+	/* Shared RPC; if there is a waiting thread, hand off the RPC;
+	 * otherwise enqueue it. The socket lock is held while the
+	 * socket's interest and ready lists are examined and modified.
+	 */
+	homa_sock_lock(hsk);
+	if (hsk->shutdown) {
+		homa_sock_unlock(hsk);
+		return;
+	}
+	if (!list_empty(&hsk->interests)) {
+		interest = list_first_entry(&hsk->interests,
+					    struct homa_interest, links);
+		list_del_init(&interest->links);
+		interest->rpc = rpc;
+		homa_rpc_hold(rpc);
+		atomic_set_release(&interest->ready, 1);
+		wake_up(&interest->wait_queue);
+	} else if (list_empty(&rpc->ready_links)) {
+		list_add_tail(&rpc->ready_links, &hsk->ready_rpcs);
+		hsk->sock.sk_data_ready(&hsk->sock);
+	}
+	homa_sock_unlock(hsk);
+}
+
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v18 11/15] net: homa: export skb_attempt_defer_free
From: John Ousterhout @ 2026-04-10 20:03 UTC (permalink / raw)
  To: netdev; +Cc: pabeni, edumazet, horms, kuba, John Ousterhout
In-Reply-To: <20260410200310.1915-1-ouster@cs.stanford.edu>

This function is now used by Homa.

Signed-off-by: John Ousterhout <ouster@cs.stanford.edu>
---
 net/core/skbuff.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4657d0245a84..a66209647732 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -7314,6 +7314,7 @@ nodefer:	kfree_skb_napi_cache(skb);
 	if (unlikely(kick))
 		kick_defer_list_purge(cpu);
 }
+EXPORT_SYMBOL(skb_attempt_defer_free);
 
 static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
 				 size_t offset, size_t len)
-- 
2.43.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox