netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net 4/4] netfilter: nft_inner: incorrect percpu area handling under softirq
  2024-11-28 12:23 [PATCH net " Pablo Neira Ayuso
@ 2024-11-28 12:23 ` Pablo Neira Ayuso
  0 siblings, 0 replies; 13+ messages in thread
From: Pablo Neira Ayuso @ 2024-11-28 12:23 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw

Softirq can interrupt packet from process context which walks over the
percpu area.

Add routines to disable bh while restoring and saving the tunnel parser
context from percpu area to stack. Add a skbuff owner for this percpu
area to catch softirq interference to exercise the packet tunnel parser
again in such case.

Reported-by: syzbot+84d0441b9860f0d63285@syzkaller.appspotmail.com
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_core.h |  1 +
 net/netfilter/nft_inner.c              | 56 ++++++++++++++++++++------
 2 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index ff27cb2e1662..dae0e7592934 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -161,6 +161,7 @@ enum {
 };
 
 struct nft_inner_tun_ctx {
+	struct sk_buff *skb;	/* percpu area owner */
 	u16	type;
 	u16	inner_tunoff;
 	u16	inner_lloff;
diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
index 928312d01eb1..fcaa126ac8da 100644
--- a/net/netfilter/nft_inner.c
+++ b/net/netfilter/nft_inner.c
@@ -210,35 +210,65 @@ static int nft_inner_parse(const struct nft_inner *priv,
 			   struct nft_pktinfo *pkt,
 			   struct nft_inner_tun_ctx *tun_ctx)
 {
-	struct nft_inner_tun_ctx ctx = {};
 	u32 off = pkt->inneroff;
 
 	if (priv->flags & NFT_INNER_HDRSIZE &&
-	    nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0)
+	    nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0)
 		return -1;
 
 	if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
-		if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0)
+		if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0)
 			return -1;
 	} else if (priv->flags & NFT_INNER_TH) {
-		ctx.inner_thoff = off;
-		ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH;
+		tun_ctx->inner_thoff = off;
+		tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
 	}
 
-	*tun_ctx = ctx;
 	tun_ctx->type = priv->type;
+	tun_ctx->skb = pkt->skb;
 	pkt->flags |= NFT_PKTINFO_INNER_FULL;
 
 	return 0;
 }
 
+static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
+				      struct nft_inner_tun_ctx *tun_ctx)
+{
+	struct nft_inner_tun_ctx *this_cpu_tun_ctx;
+
+	local_bh_disable();
+	this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
+	if (this_cpu_tun_ctx->skb != pkt->skb) {
+		local_bh_enable();
+		return false;
+	}
+	*tun_ctx = *this_cpu_tun_ctx;
+	local_bh_enable();
+
+	return true;
+}
+
+static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt,
+				   const struct nft_inner_tun_ctx *tun_ctx)
+{
+	struct nft_inner_tun_ctx *this_cpu_tun_ctx;
+
+	local_bh_disable();
+	this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
+	*this_cpu_tun_ctx = *tun_ctx;
+	local_bh_enable();
+}
+
 static bool nft_inner_parse_needed(const struct nft_inner *priv,
 				   const struct nft_pktinfo *pkt,
-				   const struct nft_inner_tun_ctx *tun_ctx)
+				   struct nft_inner_tun_ctx *tun_ctx)
 {
 	if (!(pkt->flags & NFT_PKTINFO_INNER_FULL))
 		return true;
 
+	if (!nft_inner_restore_tun_ctx(pkt, tun_ctx))
+		return true;
+
 	if (priv->type != tun_ctx->type)
 		return true;
 
@@ -248,27 +278,29 @@ static bool nft_inner_parse_needed(const struct nft_inner *priv,
 static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs,
 			   const struct nft_pktinfo *pkt)
 {
-	struct nft_inner_tun_ctx *tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
 	const struct nft_inner *priv = nft_expr_priv(expr);
+	struct nft_inner_tun_ctx tun_ctx = {};
 
 	if (nft_payload_inner_offset(pkt) < 0)
 		goto err;
 
-	if (nft_inner_parse_needed(priv, pkt, tun_ctx) &&
-	    nft_inner_parse(priv, (struct nft_pktinfo *)pkt, tun_ctx) < 0)
+	if (nft_inner_parse_needed(priv, pkt, &tun_ctx) &&
+	    nft_inner_parse(priv, (struct nft_pktinfo *)pkt, &tun_ctx) < 0)
 		goto err;
 
 	switch (priv->expr_type) {
 	case NFT_INNER_EXPR_PAYLOAD:
-		nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx);
+		nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx);
 		break;
 	case NFT_INNER_EXPR_META:
-		nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx);
+		nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx);
 		break;
 	default:
 		WARN_ON_ONCE(1);
 		goto err;
 	}
+	nft_inner_save_tun_ctx(pkt, &tun_ctx);
+
 	return;
 err:
 	regs->verdict.code = NFT_BREAK;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH net,v2 0/4] Netfilter fixes for net
@ 2024-11-28 12:38 Pablo Neira Ayuso
  2024-11-28 12:38 ` [PATCH net 1/4] ipvs: fix UB due to uninitialized stack access in ip_vs_protocol_init() Pablo Neira Ayuso
                   ` (4 more replies)
  0 siblings, 5 replies; 13+ messages in thread
From: Pablo Neira Ayuso @ 2024-11-28 12:38 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw

v2: Amended missing Fixes: tag in patch #4.

-o-

Hi,

The following patchset contains Netfilter fixes for net:

1) Fix esoteric UB due to uninitialized stack access in ip_vs_protocol_init(),
   from Jinghao Jia.

2) Fix iptables xt_LED slab-out-of-bounds, reported by syzbot,
   patch from Dmitry Antipov.

3) Remove WARN_ON_ONCE reachable from userspace to cap maximum cgroup
   levels to 255, reported by syzbot.

4) Fix nft_inner incorrect use of percpu area to store tunnel parser
   context with softirqs, reported by syzbot.

Please, pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git nf-24-11-28

Thanks.

----------------------------------------------------------------

The following changes since commit 04f5cb48995d51deed0af71aaba1b8699511313f:

  Documentation: tls_offload: fix typos and grammar (2024-11-28 12:09:06 +0100)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git tags/nf-24-11-28

for you to fetch changes up to e4e12f81c14c8c0c5a2920587ad2619abf1b8e30:

  netfilter: nft_inner: incorrect percpu area handling under softirq (2024-11-28 13:32:17 +0100)

----------------------------------------------------------------
netfilter pull request 24-11-28

----------------------------------------------------------------
Dmitry Antipov (1):
      netfilter: x_tables: fix LED ID check in led_tg_check()

Jinghao Jia (1):
      ipvs: fix UB due to uninitialized stack access in ip_vs_protocol_init()

Pablo Neira Ayuso (2):
      netfilter: nft_socket: remove WARN_ON_ONCE on maximum cgroup level
      netfilter: nft_inner: incorrect percpu area handling under softirq

 include/net/netfilter/nf_tables_core.h |  1 +
 net/netfilter/ipvs/ip_vs_proto.c       |  4 +--
 net/netfilter/nft_inner.c              | 56 ++++++++++++++++++++++++++--------
 net/netfilter/nft_socket.c             |  2 +-
 net/netfilter/xt_LED.c                 |  4 ++-
 5 files changed, 50 insertions(+), 17 deletions(-)

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH net 1/4] ipvs: fix UB due to uninitialized stack access in ip_vs_protocol_init()
  2024-11-28 12:38 [PATCH net,v2 0/4] Netfilter fixes for net Pablo Neira Ayuso
@ 2024-11-28 12:38 ` Pablo Neira Ayuso
  2024-11-28 12:38 ` [PATCH net 2/4] netfilter: x_tables: fix LED ID check in led_tg_check() Pablo Neira Ayuso
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 13+ messages in thread
From: Pablo Neira Ayuso @ 2024-11-28 12:38 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw

From: Jinghao Jia <jinghao7@illinois.edu>

Under certain kernel configurations when building with Clang/LLVM, the
compiler does not generate a return or jump as the terminator
instruction for ip_vs_protocol_init(), triggering the following objtool
warning during build time:

  vmlinux.o: warning: objtool: ip_vs_protocol_init() falls through to next function __initstub__kmod_ip_vs_rr__935_123_ip_vs_rr_init6()

At runtime, this either causes an oops when trying to load the ipvs
module or a boot-time panic if ipvs is built-in. This same issue has
been reported by the Intel kernel test robot previously.

Digging deeper into both LLVM and the kernel code reveals this to be a
undefined behavior problem. ip_vs_protocol_init() uses a on-stack buffer
of 64 chars to store the registered protocol names and leaves it
uninitialized after definition. The function calls strnlen() when
concatenating protocol names into the buffer. With CONFIG_FORTIFY_SOURCE
strnlen() performs an extra step to check whether the last byte of the
input char buffer is a null character (commit 3009f891bb9f ("fortify:
Allow strlen() and strnlen() to pass compile-time known lengths")).
This, together with possibly other configurations, cause the following
IR to be generated:

  define hidden i32 @ip_vs_protocol_init() local_unnamed_addr #5 section ".init.text" align 16 !kcfi_type !29 {
    %1 = alloca [64 x i8], align 16
    ...

  14:                                               ; preds = %11
    %15 = getelementptr inbounds i8, ptr %1, i64 63
    %16 = load i8, ptr %15, align 1
    %17 = tail call i1 @llvm.is.constant.i8(i8 %16)
    %18 = icmp eq i8 %16, 0
    %19 = select i1 %17, i1 %18, i1 false
    br i1 %19, label %20, label %23

  20:                                               ; preds = %14
    %21 = call i64 @strlen(ptr noundef nonnull dereferenceable(1) %1) #23
    ...

  23:                                               ; preds = %14, %11, %20
    %24 = call i64 @strnlen(ptr noundef nonnull dereferenceable(1) %1, i64 noundef 64) #24
    ...
  }

The above code calculates the address of the last char in the buffer
(value %15) and then loads from it (value %16). Because the buffer is
never initialized, the LLVM GVN pass marks value %16 as undefined:

  %13 = getelementptr inbounds i8, ptr %1, i64 63
  br i1 undef, label %14, label %17

This gives later passes (SCCP, in particular) more DCE opportunities by
propagating the undef value further, and eventually removes everything
after the load on the uninitialized stack location:

  define hidden i32 @ip_vs_protocol_init() local_unnamed_addr #0 section ".init.text" align 16 !kcfi_type !11 {
    %1 = alloca [64 x i8], align 16
    ...

  12:                                               ; preds = %11
    %13 = getelementptr inbounds i8, ptr %1, i64 63
    unreachable
  }

In this way, the generated native code will just fall through to the
next function, as LLVM does not generate any code for the unreachable IR
instruction and leaves the function without a terminator.

Zero the on-stack buffer to avoid this possible UB.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202402100205.PWXIz1ZK-lkp@intel.com/
Co-developed-by: Ruowen Qin <ruqin@redhat.com>
Signed-off-by: Ruowen Qin <ruqin@redhat.com>
Signed-off-by: Jinghao Jia <jinghao7@illinois.edu>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_proto.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index f100da4ba3bc..a9fd1d3fc2cb 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -340,7 +340,7 @@ void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs)
 
 int __init ip_vs_protocol_init(void)
 {
-	char protocols[64];
+	char protocols[64] = { 0 };
 #define REGISTER_PROTOCOL(p)			\
 	do {					\
 		register_ip_vs_protocol(p);	\
@@ -348,8 +348,6 @@ int __init ip_vs_protocol_init(void)
 		strcat(protocols, (p)->name);	\
 	} while (0)
 
-	protocols[0] = '\0';
-	protocols[2] = '\0';
 #ifdef CONFIG_IP_VS_PROTO_TCP
 	REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
 #endif
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH net 2/4] netfilter: x_tables: fix LED ID check in led_tg_check()
  2024-11-28 12:38 [PATCH net,v2 0/4] Netfilter fixes for net Pablo Neira Ayuso
  2024-11-28 12:38 ` [PATCH net 1/4] ipvs: fix UB due to uninitialized stack access in ip_vs_protocol_init() Pablo Neira Ayuso
@ 2024-11-28 12:38 ` Pablo Neira Ayuso
  2024-11-28 12:38 ` [PATCH net 3/4] netfilter: nft_socket: remove WARN_ON_ONCE on maximum cgroup level Pablo Neira Ayuso
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 13+ messages in thread
From: Pablo Neira Ayuso @ 2024-11-28 12:38 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw

From: Dmitry Antipov <dmantipov@yandex.ru>

Syzbot has reported the following BUG detected by KASAN:

BUG: KASAN: slab-out-of-bounds in strlen+0x58/0x70
Read of size 1 at addr ffff8881022da0c8 by task repro/5879
...
Call Trace:
 <TASK>
 dump_stack_lvl+0x241/0x360
 ? __pfx_dump_stack_lvl+0x10/0x10
 ? __pfx__printk+0x10/0x10
 ? _printk+0xd5/0x120
 ? __virt_addr_valid+0x183/0x530
 ? __virt_addr_valid+0x183/0x530
 print_report+0x169/0x550
 ? __virt_addr_valid+0x183/0x530
 ? __virt_addr_valid+0x183/0x530
 ? __virt_addr_valid+0x45f/0x530
 ? __phys_addr+0xba/0x170
 ? strlen+0x58/0x70
 kasan_report+0x143/0x180
 ? strlen+0x58/0x70
 strlen+0x58/0x70
 kstrdup+0x20/0x80
 led_tg_check+0x18b/0x3c0
 xt_check_target+0x3bb/0xa40
 ? __pfx_xt_check_target+0x10/0x10
 ? stack_depot_save_flags+0x6e4/0x830
 ? nft_target_init+0x174/0xc30
 nft_target_init+0x82d/0xc30
 ? __pfx_nft_target_init+0x10/0x10
 ? nf_tables_newrule+0x1609/0x2980
 ? nf_tables_newrule+0x1609/0x2980
 ? rcu_is_watching+0x15/0xb0
 ? nf_tables_newrule+0x1609/0x2980
 ? nf_tables_newrule+0x1609/0x2980
 ? __kmalloc_noprof+0x21a/0x400
 nf_tables_newrule+0x1860/0x2980
 ? __pfx_nf_tables_newrule+0x10/0x10
 ? __nla_parse+0x40/0x60
 nfnetlink_rcv+0x14e5/0x2ab0
 ? __pfx_validate_chain+0x10/0x10
 ? __pfx_nfnetlink_rcv+0x10/0x10
 ? __lock_acquire+0x1384/0x2050
 ? netlink_deliver_tap+0x2e/0x1b0
 ? __pfx_lock_release+0x10/0x10
 ? netlink_deliver_tap+0x2e/0x1b0
 netlink_unicast+0x7f8/0x990
 ? __pfx_netlink_unicast+0x10/0x10
 ? __virt_addr_valid+0x183/0x530
 ? __check_object_size+0x48e/0x900
 netlink_sendmsg+0x8e4/0xcb0
 ? __pfx_netlink_sendmsg+0x10/0x10
 ? aa_sock_msg_perm+0x91/0x160
 ? __pfx_netlink_sendmsg+0x10/0x10
 __sock_sendmsg+0x223/0x270
 ____sys_sendmsg+0x52a/0x7e0
 ? __pfx_____sys_sendmsg+0x10/0x10
 __sys_sendmsg+0x292/0x380
 ? __pfx___sys_sendmsg+0x10/0x10
 ? lockdep_hardirqs_on_prepare+0x43d/0x780
 ? __pfx_lockdep_hardirqs_on_prepare+0x10/0x10
 ? exc_page_fault+0x590/0x8c0
 ? do_syscall_64+0xb6/0x230
 do_syscall_64+0xf3/0x230
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
...
 </TASK>

Since an invalid (without '\0' byte at all) byte sequence may be passed
from userspace, add an extra check to ensure that such a sequence is
rejected as possible ID and so never passed to 'kstrdup()' and further.

Reported-by: syzbot+6c8215822f35fdb35667@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=6c8215822f35fdb35667
Fixes: 268cb38e1802 ("netfilter: x_tables: add LED trigger target")
Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_LED.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c
index f7b0286d106a..8a80fd76fe45 100644
--- a/net/netfilter/xt_LED.c
+++ b/net/netfilter/xt_LED.c
@@ -96,7 +96,9 @@ static int led_tg_check(const struct xt_tgchk_param *par)
 	struct xt_led_info_internal *ledinternal;
 	int err;
 
-	if (ledinfo->id[0] == '\0')
+	/* Bail out if empty string or not a string at all. */
+	if (ledinfo->id[0] == '\0' ||
+	    !memchr(ledinfo->id, '\0', sizeof(ledinfo->id)))
 		return -EINVAL;
 
 	mutex_lock(&xt_led_mutex);
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH net 3/4] netfilter: nft_socket: remove WARN_ON_ONCE on maximum cgroup level
  2024-11-28 12:38 [PATCH net,v2 0/4] Netfilter fixes for net Pablo Neira Ayuso
  2024-11-28 12:38 ` [PATCH net 1/4] ipvs: fix UB due to uninitialized stack access in ip_vs_protocol_init() Pablo Neira Ayuso
  2024-11-28 12:38 ` [PATCH net 2/4] netfilter: x_tables: fix LED ID check in led_tg_check() Pablo Neira Ayuso
@ 2024-11-28 12:38 ` Pablo Neira Ayuso
  2024-11-28 12:38 ` [PATCH net 4/4] netfilter: nft_inner: incorrect percpu area handling under softirq Pablo Neira Ayuso
  2024-11-28 14:33 ` [PATCH net,v2 0/4] Netfilter fixes for net Paolo Abeni
  4 siblings, 0 replies; 13+ messages in thread
From: Pablo Neira Ayuso @ 2024-11-28 12:38 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw

cgroup maximum depth is INT_MAX by default, there is a cgroup toggle to
restrict this maximum depth to a more reasonable value not to harm
performance. Remove unnecessary WARN_ON_ONCE which is reachable from
userspace.

Fixes: 7f3287db6543 ("netfilter: nft_socket: make cgroupsv2 matching work with namespaces")
Reported-by: syzbot+57bac0866ddd99fe47c0@syzkaller.appspotmail.com
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_socket.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index f5da0c1775f2..35d0409b0095 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -68,7 +68,7 @@ static noinline int nft_socket_cgroup_subtree_level(void)
 
 	cgroup_put(cgrp);
 
-	if (WARN_ON_ONCE(level > 255))
+	if (level > 255)
 		return -ERANGE;
 
 	if (WARN_ON_ONCE(level < 0))
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH net 4/4] netfilter: nft_inner: incorrect percpu area handling under softirq
  2024-11-28 12:38 [PATCH net,v2 0/4] Netfilter fixes for net Pablo Neira Ayuso
                   ` (2 preceding siblings ...)
  2024-11-28 12:38 ` [PATCH net 3/4] netfilter: nft_socket: remove WARN_ON_ONCE on maximum cgroup level Pablo Neira Ayuso
@ 2024-11-28 12:38 ` Pablo Neira Ayuso
  2024-11-29  9:14   ` Eric Dumazet
  2024-11-28 14:33 ` [PATCH net,v2 0/4] Netfilter fixes for net Paolo Abeni
  4 siblings, 1 reply; 13+ messages in thread
From: Pablo Neira Ayuso @ 2024-11-28 12:38 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw

Softirq can interrupt packet from process context which walks over the
percpu area.

Add routines to disable bh while restoring and saving the tunnel parser
context from percpu area to stack. Add a skbuff owner for this percpu
area to catch softirq interference to exercise the packet tunnel parser
again in such case.

Reported-by: syzbot+84d0441b9860f0d63285@syzkaller.appspotmail.com
Fixes: 3a07327d10a0 ("netfilter: nft_inner: support for inner tunnel header matching")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_core.h |  1 +
 net/netfilter/nft_inner.c              | 56 ++++++++++++++++++++------
 2 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index ff27cb2e1662..dae0e7592934 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -161,6 +161,7 @@ enum {
 };
 
 struct nft_inner_tun_ctx {
+	struct sk_buff *skb;	/* percpu area owner */
 	u16	type;
 	u16	inner_tunoff;
 	u16	inner_lloff;
diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
index 928312d01eb1..fcaa126ac8da 100644
--- a/net/netfilter/nft_inner.c
+++ b/net/netfilter/nft_inner.c
@@ -210,35 +210,65 @@ static int nft_inner_parse(const struct nft_inner *priv,
 			   struct nft_pktinfo *pkt,
 			   struct nft_inner_tun_ctx *tun_ctx)
 {
-	struct nft_inner_tun_ctx ctx = {};
 	u32 off = pkt->inneroff;
 
 	if (priv->flags & NFT_INNER_HDRSIZE &&
-	    nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0)
+	    nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0)
 		return -1;
 
 	if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
-		if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0)
+		if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0)
 			return -1;
 	} else if (priv->flags & NFT_INNER_TH) {
-		ctx.inner_thoff = off;
-		ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH;
+		tun_ctx->inner_thoff = off;
+		tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
 	}
 
-	*tun_ctx = ctx;
 	tun_ctx->type = priv->type;
+	tun_ctx->skb = pkt->skb;
 	pkt->flags |= NFT_PKTINFO_INNER_FULL;
 
 	return 0;
 }
 
+static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
+				      struct nft_inner_tun_ctx *tun_ctx)
+{
+	struct nft_inner_tun_ctx *this_cpu_tun_ctx;
+
+	local_bh_disable();
+	this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
+	if (this_cpu_tun_ctx->skb != pkt->skb) {
+		local_bh_enable();
+		return false;
+	}
+	*tun_ctx = *this_cpu_tun_ctx;
+	local_bh_enable();
+
+	return true;
+}
+
+static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt,
+				   const struct nft_inner_tun_ctx *tun_ctx)
+{
+	struct nft_inner_tun_ctx *this_cpu_tun_ctx;
+
+	local_bh_disable();
+	this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
+	*this_cpu_tun_ctx = *tun_ctx;
+	local_bh_enable();
+}
+
 static bool nft_inner_parse_needed(const struct nft_inner *priv,
 				   const struct nft_pktinfo *pkt,
-				   const struct nft_inner_tun_ctx *tun_ctx)
+				   struct nft_inner_tun_ctx *tun_ctx)
 {
 	if (!(pkt->flags & NFT_PKTINFO_INNER_FULL))
 		return true;
 
+	if (!nft_inner_restore_tun_ctx(pkt, tun_ctx))
+		return true;
+
 	if (priv->type != tun_ctx->type)
 		return true;
 
@@ -248,27 +278,29 @@ static bool nft_inner_parse_needed(const struct nft_inner *priv,
 static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs,
 			   const struct nft_pktinfo *pkt)
 {
-	struct nft_inner_tun_ctx *tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
 	const struct nft_inner *priv = nft_expr_priv(expr);
+	struct nft_inner_tun_ctx tun_ctx = {};
 
 	if (nft_payload_inner_offset(pkt) < 0)
 		goto err;
 
-	if (nft_inner_parse_needed(priv, pkt, tun_ctx) &&
-	    nft_inner_parse(priv, (struct nft_pktinfo *)pkt, tun_ctx) < 0)
+	if (nft_inner_parse_needed(priv, pkt, &tun_ctx) &&
+	    nft_inner_parse(priv, (struct nft_pktinfo *)pkt, &tun_ctx) < 0)
 		goto err;
 
 	switch (priv->expr_type) {
 	case NFT_INNER_EXPR_PAYLOAD:
-		nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx);
+		nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx);
 		break;
 	case NFT_INNER_EXPR_META:
-		nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx);
+		nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx);
 		break;
 	default:
 		WARN_ON_ONCE(1);
 		goto err;
 	}
+	nft_inner_save_tun_ctx(pkt, &tun_ctx);
+
 	return;
 err:
 	regs->verdict.code = NFT_BREAK;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH net,v2 0/4] Netfilter fixes for net
  2024-11-28 12:38 [PATCH net,v2 0/4] Netfilter fixes for net Pablo Neira Ayuso
                   ` (3 preceding siblings ...)
  2024-11-28 12:38 ` [PATCH net 4/4] netfilter: nft_inner: incorrect percpu area handling under softirq Pablo Neira Ayuso
@ 2024-11-28 14:33 ` Paolo Abeni
  2024-11-28 14:41   ` Pablo Neira Ayuso
  4 siblings, 1 reply; 13+ messages in thread
From: Paolo Abeni @ 2024-11-28 14:33 UTC (permalink / raw)
  To: Pablo Neira Ayuso, netfilter-devel; +Cc: davem, netdev, kuba, edumazet, fw

On 11/28/24 13:38, Pablo Neira Ayuso wrote:
> v2: Amended missing Fixes: tag in patch #4.
> 
> -o-
> 
> Hi,
> 
> The following patchset contains Netfilter fixes for net:
> 
> 1) Fix esoteric UB due to uninitialized stack access in ip_vs_protocol_init(),
>    from Jinghao Jia.
> 
> 2) Fix iptables xt_LED slab-out-of-bounds, reported by syzbot,
>    patch from Dmitry Antipov.
> 
> 3) Remove WARN_ON_ONCE reachable from userspace to cap maximum cgroup
>    levels to 255, reported by syzbot.
> 
> 4) Fix nft_inner incorrect use of percpu area to store tunnel parser
>    context with softirqs, reported by syzbot.
> 
> Please, pull these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git nf-24-11-28
> 
> Thanks.

Oops... I completed the net PR a little earlier than this message, I was
testing it up 2 now, and I just sent it to Linus. Is there anything
above that can't wait until next week?

Thanks,

Paolo


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net,v2 0/4] Netfilter fixes for net
  2024-11-28 14:33 ` [PATCH net,v2 0/4] Netfilter fixes for net Paolo Abeni
@ 2024-11-28 14:41   ` Pablo Neira Ayuso
  0 siblings, 0 replies; 13+ messages in thread
From: Pablo Neira Ayuso @ 2024-11-28 14:41 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: netfilter-devel, davem, netdev, kuba, edumazet, fw

On Thu, Nov 28, 2024 at 03:33:59PM +0100, Paolo Abeni wrote:
> On 11/28/24 13:38, Pablo Neira Ayuso wrote:
> > v2: Amended missing Fixes: tag in patch #4.
> > 
> > -o-
> > 
> > Hi,
> > 
> > The following patchset contains Netfilter fixes for net:
> > 
> > 1) Fix esoteric UB due to uninitialized stack access in ip_vs_protocol_init(),
> >    from Jinghao Jia.
> > 
> > 2) Fix iptables xt_LED slab-out-of-bounds, reported by syzbot,
> >    patch from Dmitry Antipov.
> > 
> > 3) Remove WARN_ON_ONCE reachable from userspace to cap maximum cgroup
> >    levels to 255, reported by syzbot.
> > 
> > 4) Fix nft_inner incorrect use of percpu area to store tunnel parser
> >    context with softirqs, reported by syzbot.
> > 
> > Please, pull these changes from:
> > 
> >   git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git nf-24-11-28
> > 
> > Thanks.
> 
> Oops... I completed the net PR a little earlier than this message, I was
> testing it up 2 now, and I just sent it to Linus. Is there anything
> above that can't wait until next week?

This can wait. I will try to post PR late wednesday moving forward.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net 4/4] netfilter: nft_inner: incorrect percpu area handling under softirq
  2024-11-28 12:38 ` [PATCH net 4/4] netfilter: nft_inner: incorrect percpu area handling under softirq Pablo Neira Ayuso
@ 2024-11-29  9:14   ` Eric Dumazet
  2024-12-02  1:24     ` Pablo Neira Ayuso
  0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2024-11-29  9:14 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, davem, netdev, kuba, pabeni, fw

On Thu, Nov 28, 2024 at 1:38 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>
> Softirq can interrupt packet from process context which walks over the
> percpu area.
>
> Add routines to disable bh while restoring and saving the tunnel parser
> context from percpu area to stack. Add a skbuff owner for this percpu
> area to catch softirq interference to exercise the packet tunnel parser
> again in such case.
>
> Reported-by: syzbot+84d0441b9860f0d63285@syzkaller.appspotmail.com
> Fixes: 3a07327d10a0 ("netfilter: nft_inner: support for inner tunnel header matching")
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> ---
>  include/net/netfilter/nf_tables_core.h |  1 +
>  net/netfilter/nft_inner.c              | 56 ++++++++++++++++++++------
>  2 files changed, 45 insertions(+), 12 deletions(-)
>
> diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
> index ff27cb2e1662..dae0e7592934 100644
> --- a/include/net/netfilter/nf_tables_core.h
> +++ b/include/net/netfilter/nf_tables_core.h
> @@ -161,6 +161,7 @@ enum {
>  };
>
>  struct nft_inner_tun_ctx {
> +       struct sk_buff *skb;    /* percpu area owner */
>         u16     type;
>         u16     inner_tunoff;
>         u16     inner_lloff;
> diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
> index 928312d01eb1..fcaa126ac8da 100644
> --- a/net/netfilter/nft_inner.c
> +++ b/net/netfilter/nft_inner.c
> @@ -210,35 +210,65 @@ static int nft_inner_parse(const struct nft_inner *priv,
>                            struct nft_pktinfo *pkt,
>                            struct nft_inner_tun_ctx *tun_ctx)
>  {
> -       struct nft_inner_tun_ctx ctx = {};
>         u32 off = pkt->inneroff;
>
>         if (priv->flags & NFT_INNER_HDRSIZE &&
> -           nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0)
> +           nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0)
>                 return -1;
>
>         if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
> -               if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0)
> +               if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0)
>                         return -1;
>         } else if (priv->flags & NFT_INNER_TH) {
> -               ctx.inner_thoff = off;
> -               ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH;
> +               tun_ctx->inner_thoff = off;
> +               tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
>         }
>
> -       *tun_ctx = ctx;
>         tun_ctx->type = priv->type;
> +       tun_ctx->skb = pkt->skb;
>         pkt->flags |= NFT_PKTINFO_INNER_FULL;
>
>         return 0;
>  }
>
> +static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
> +                                     struct nft_inner_tun_ctx *tun_ctx)
> +{
> +       struct nft_inner_tun_ctx *this_cpu_tun_ctx;
> +
> +       local_bh_disable();
> +       this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
> +       if (this_cpu_tun_ctx->skb != pkt->skb) {

I must say I do not understand this patch.

If a context is used by a save/restore more than one time per packet
traversal, then this means we can not use per-cpu storage,
or risk flakes.

Also, skb could be freed and re-allocated ?

Perhaps describe a bit more what is going on in the changelog.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net 4/4] netfilter: nft_inner: incorrect percpu area handling under softirq
  2024-11-29  9:14   ` Eric Dumazet
@ 2024-12-02  1:24     ` Pablo Neira Ayuso
  2024-12-02  9:17       ` Eric Dumazet
  0 siblings, 1 reply; 13+ messages in thread
From: Pablo Neira Ayuso @ 2024-12-02  1:24 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netfilter-devel, davem, netdev, kuba, pabeni, fw

Hi Eric,

On Fri, Nov 29, 2024 at 10:14:34AM +0100, Eric Dumazet wrote:
> On Thu, Nov 28, 2024 at 1:38 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> >
> > Softirq can interrupt packet from process context which walks over the
> > percpu area.
> >
> > Add routines to disable bh while restoring and saving the tunnel parser
> > context from percpu area to stack. Add a skbuff owner for this percpu
> > area to catch softirq interference to exercise the packet tunnel parser
> > again in such case.
> >
> > Reported-by: syzbot+84d0441b9860f0d63285@syzkaller.appspotmail.com
> > Fixes: 3a07327d10a0 ("netfilter: nft_inner: support for inner tunnel header matching")
> > Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> > ---
> >  include/net/netfilter/nf_tables_core.h |  1 +
> >  net/netfilter/nft_inner.c              | 56 ++++++++++++++++++++------
> >  2 files changed, 45 insertions(+), 12 deletions(-)
> >
> > diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
> > index ff27cb2e1662..dae0e7592934 100644
> > --- a/include/net/netfilter/nf_tables_core.h
> > +++ b/include/net/netfilter/nf_tables_core.h
> > @@ -161,6 +161,7 @@ enum {
> >  };
> >
> >  struct nft_inner_tun_ctx {
> > +       struct sk_buff *skb;    /* percpu area owner */
> >         u16     type;
> >         u16     inner_tunoff;
> >         u16     inner_lloff;
> > diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
> > index 928312d01eb1..fcaa126ac8da 100644
> > --- a/net/netfilter/nft_inner.c
> > +++ b/net/netfilter/nft_inner.c
> > @@ -210,35 +210,65 @@ static int nft_inner_parse(const struct nft_inner *priv,
> >                            struct nft_pktinfo *pkt,
> >                            struct nft_inner_tun_ctx *tun_ctx)
> >  {
> > -       struct nft_inner_tun_ctx ctx = {};
> >         u32 off = pkt->inneroff;
> >
> >         if (priv->flags & NFT_INNER_HDRSIZE &&
> > -           nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0)
> > +           nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0)
> >                 return -1;
> >
> >         if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
> > -               if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0)
> > +               if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0)
> >                         return -1;
> >         } else if (priv->flags & NFT_INNER_TH) {
> > -               ctx.inner_thoff = off;
> > -               ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH;
> > +               tun_ctx->inner_thoff = off;
> > +               tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
> >         }
> >
> > -       *tun_ctx = ctx;
> >         tun_ctx->type = priv->type;
> > +       tun_ctx->skb = pkt->skb;
> >         pkt->flags |= NFT_PKTINFO_INNER_FULL;
> >
> >         return 0;
> >  }
> >
> > +static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
> > +                                     struct nft_inner_tun_ctx *tun_ctx)
> > +{
> > +       struct nft_inner_tun_ctx *this_cpu_tun_ctx;
> > +
> > +       local_bh_disable();
> > +       this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
> > +       if (this_cpu_tun_ctx->skb != pkt->skb) {
> 
> I must say I do not understand this patch.
> 
> If a context is used by a save/restore more than one time per packet
> traversal, then this means we can not use per-cpu storage,
> or risk flakes.
> 
> Also, skb could be freed and re-allocated ?
> 
> Perhaps describe a bit more what is going on in the changelog.

There is an on-stack nft_pktinfo structure with a flags field. This
nft_pktinfo is a wrapper for the sk_buff, containing header offsets
and metainformation. This is initialize when entering this chain.

If the NFT_PKTINFO_INNER_FULL flag is set on, then the percpu area
could contain information on the inner header offsets (ie. packet was
already parsed in this chain).

There is a check to validate that the percpu area refers to this
skbuff. If there is a mismatch, then this needs to parse the inner
headers because the data in the percpu area is stale. Otherwise, if
there is a match, the percpu area is copied on-stack.

After processing (payload/meta fetching), the on-stack copy is stored
back to the percpu area. I can make an improvement on this patch to
check if this skbuff still owns the percpu area in the store/exit
section of this inner evaluation routine, to avoid a unnecessary update.

So, it is basically the NFT_PKTINFO_INNER_FULL flag that handles the
skbuff reallocation scenario. This is not blindly trusting this percpu
area per-se.

One comestic change I can apply to this: I can also turn the struct
sk_buff into unsigned long so it clear to the reader this pointer to
skbuff is not meant to be used for being dereferenced.

If the explaination above is sufficient, I can revamp to extend the
changelog as you suggest and post a new version of this patch.

Thanks.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net 4/4] netfilter: nft_inner: incorrect percpu area handling under softirq
  2024-12-02  1:24     ` Pablo Neira Ayuso
@ 2024-12-02  9:17       ` Eric Dumazet
  2024-12-02  9:28         ` Pablo Neira Ayuso
  2024-12-03 20:22         ` Pablo Neira Ayuso
  0 siblings, 2 replies; 13+ messages in thread
From: Eric Dumazet @ 2024-12-02  9:17 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, davem, netdev, kuba, pabeni, fw

On Mon, Dec 2, 2024 at 2:24 AM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>
> Hi Eric,
>
> On Fri, Nov 29, 2024 at 10:14:34AM +0100, Eric Dumazet wrote:
> > On Thu, Nov 28, 2024 at 1:38 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > >
> > > Softirq can interrupt packet from process context which walks over the
> > > percpu area.
> > >
> > > Add routines to disable bh while restoring and saving the tunnel parser
> > > context from percpu area to stack. Add a skbuff owner for this percpu
> > > area to catch softirq interference to exercise the packet tunnel parser
> > > again in such case.
> > >
> > > Reported-by: syzbot+84d0441b9860f0d63285@syzkaller.appspotmail.com
> > > Fixes: 3a07327d10a0 ("netfilter: nft_inner: support for inner tunnel header matching")
> > > Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> > > ---
> > >  include/net/netfilter/nf_tables_core.h |  1 +
> > >  net/netfilter/nft_inner.c              | 56 ++++++++++++++++++++------
> > >  2 files changed, 45 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
> > > index ff27cb2e1662..dae0e7592934 100644
> > > --- a/include/net/netfilter/nf_tables_core.h
> > > +++ b/include/net/netfilter/nf_tables_core.h
> > > @@ -161,6 +161,7 @@ enum {
> > >  };
> > >
> > >  struct nft_inner_tun_ctx {
> > > +       struct sk_buff *skb;    /* percpu area owner */
> > >         u16     type;
> > >         u16     inner_tunoff;
> > >         u16     inner_lloff;
> > > diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
> > > index 928312d01eb1..fcaa126ac8da 100644
> > > --- a/net/netfilter/nft_inner.c
> > > +++ b/net/netfilter/nft_inner.c
> > > @@ -210,35 +210,65 @@ static int nft_inner_parse(const struct nft_inner *priv,
> > >                            struct nft_pktinfo *pkt,
> > >                            struct nft_inner_tun_ctx *tun_ctx)
> > >  {
> > > -       struct nft_inner_tun_ctx ctx = {};
> > >         u32 off = pkt->inneroff;
> > >
> > >         if (priv->flags & NFT_INNER_HDRSIZE &&
> > > -           nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0)
> > > +           nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0)
> > >                 return -1;
> > >
> > >         if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
> > > -               if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0)
> > > +               if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0)
> > >                         return -1;
> > >         } else if (priv->flags & NFT_INNER_TH) {
> > > -               ctx.inner_thoff = off;
> > > -               ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH;
> > > +               tun_ctx->inner_thoff = off;
> > > +               tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
> > >         }
> > >
> > > -       *tun_ctx = ctx;
> > >         tun_ctx->type = priv->type;
> > > +       tun_ctx->skb = pkt->skb;
> > >         pkt->flags |= NFT_PKTINFO_INNER_FULL;
> > >
> > >         return 0;
> > >  }
> > >
> > > +static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
> > > +                                     struct nft_inner_tun_ctx *tun_ctx)
> > > +{
> > > +       struct nft_inner_tun_ctx *this_cpu_tun_ctx;
> > > +
> > > +       local_bh_disable();
> > > +       this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
> > > +       if (this_cpu_tun_ctx->skb != pkt->skb) {
> >
> > I must say I do not understand this patch.
> >
> > If a context is used by a save/restore more than one time per packet
> > traversal, then this means we can not use per-cpu storage,
> > or risk flakes.
> >
> > Also, skb could be freed and re-allocated ?
> >
> > Perhaps describe a bit more what is going on in the changelog.
>
> There is an on-stack nft_pktinfo structure with a flags field. This
> nft_pktinfo is a wrapper for the sk_buff, containing header offsets
> and metainformation. This is initialize when entering this chain.
>
> If the NFT_PKTINFO_INNER_FULL flag is set on, then the percpu area
> could contain information on the inner header offsets (ie. packet was
> already parsed in this chain).
>
> There is a check to validate that the percpu area refers to this
> skbuff. If there is a mismatch, then this needs to parse the inner
> headers because the data in the percpu area is stale. Otherwise, if
> there is a match, the percpu area is copied on-stack.
>
> After processing (payload/meta fetching), the on-stack copy is stored
> back to the percpu area. I can make an improvement on this patch to
> check if this skbuff still owns the percpu area in the store/exit
> section of this inner evaluation routine, to avoid a unnecessary update.
>
> So, it is basically the NFT_PKTINFO_INNER_FULL flag that handles the
> skbuff reallocation scenario. This is not blindly trusting this percpu
> area per-se.
>
> One comestic change I can apply to this: I can also turn the struct
> sk_buff into unsigned long so it clear to the reader this pointer to
> skbuff is not meant to be used for being dereferenced.
>
> If the explaination above is sufficient, I can revamp to extend the
> changelog as you suggest and post a new version of this patch.
>
> Thanks.

The part I do not understand is that tun_ctx->skb is not cleared at
the end of processing (or at some point)

That would make clear that a future (tun_ctx->skb == skb) test is not
confused by skb reuse (free/alloc).

If you use it as a cookie, then we need something else than a pointer.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net 4/4] netfilter: nft_inner: incorrect percpu area handling under softirq
  2024-12-02  9:17       ` Eric Dumazet
@ 2024-12-02  9:28         ` Pablo Neira Ayuso
  2024-12-03 20:22         ` Pablo Neira Ayuso
  1 sibling, 0 replies; 13+ messages in thread
From: Pablo Neira Ayuso @ 2024-12-02  9:28 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netfilter-devel, davem, netdev, kuba, pabeni, fw

On Mon, Dec 02, 2024 at 10:17:10AM +0100, Eric Dumazet wrote:
> On Mon, Dec 2, 2024 at 2:24 AM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> >
> > Hi Eric,
> >
> > On Fri, Nov 29, 2024 at 10:14:34AM +0100, Eric Dumazet wrote:
> > > On Thu, Nov 28, 2024 at 1:38 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > >
> > > > Softirq can interrupt packet from process context which walks over the
> > > > percpu area.
> > > >
> > > > Add routines to disable bh while restoring and saving the tunnel parser
> > > > context from percpu area to stack. Add a skbuff owner for this percpu
> > > > area to catch softirq interference to exercise the packet tunnel parser
> > > > again in such case.
> > > >
> > > > Reported-by: syzbot+84d0441b9860f0d63285@syzkaller.appspotmail.com
> > > > Fixes: 3a07327d10a0 ("netfilter: nft_inner: support for inner tunnel header matching")
> > > > Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> > > > ---
> > > >  include/net/netfilter/nf_tables_core.h |  1 +
> > > >  net/netfilter/nft_inner.c              | 56 ++++++++++++++++++++------
> > > >  2 files changed, 45 insertions(+), 12 deletions(-)
> > > >
> > > > diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
> > > > index ff27cb2e1662..dae0e7592934 100644
> > > > --- a/include/net/netfilter/nf_tables_core.h
> > > > +++ b/include/net/netfilter/nf_tables_core.h
> > > > @@ -161,6 +161,7 @@ enum {
> > > >  };
> > > >
> > > >  struct nft_inner_tun_ctx {
> > > > +       struct sk_buff *skb;    /* percpu area owner */
> > > >         u16     type;
> > > >         u16     inner_tunoff;
> > > >         u16     inner_lloff;
> > > > diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
> > > > index 928312d01eb1..fcaa126ac8da 100644
> > > > --- a/net/netfilter/nft_inner.c
> > > > +++ b/net/netfilter/nft_inner.c
> > > > @@ -210,35 +210,65 @@ static int nft_inner_parse(const struct nft_inner *priv,
> > > >                            struct nft_pktinfo *pkt,
> > > >                            struct nft_inner_tun_ctx *tun_ctx)
> > > >  {
> > > > -       struct nft_inner_tun_ctx ctx = {};
> > > >         u32 off = pkt->inneroff;
> > > >
> > > >         if (priv->flags & NFT_INNER_HDRSIZE &&
> > > > -           nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0)
> > > > +           nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0)
> > > >                 return -1;
> > > >
> > > >         if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
> > > > -               if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0)
> > > > +               if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0)
> > > >                         return -1;
> > > >         } else if (priv->flags & NFT_INNER_TH) {
> > > > -               ctx.inner_thoff = off;
> > > > -               ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH;
> > > > +               tun_ctx->inner_thoff = off;
> > > > +               tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
> > > >         }
> > > >
> > > > -       *tun_ctx = ctx;
> > > >         tun_ctx->type = priv->type;
> > > > +       tun_ctx->skb = pkt->skb;
> > > >         pkt->flags |= NFT_PKTINFO_INNER_FULL;
> > > >
> > > >         return 0;
> > > >  }
> > > >
> > > > +static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
> > > > +                                     struct nft_inner_tun_ctx *tun_ctx)
> > > > +{
> > > > +       struct nft_inner_tun_ctx *this_cpu_tun_ctx;
> > > > +
> > > > +       local_bh_disable();
> > > > +       this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
> > > > +       if (this_cpu_tun_ctx->skb != pkt->skb) {
> > >
> > > I must say I do not understand this patch.
> > >
> > > If a context is used by a save/restore more than one time per packet
> > > traversal, then this means we can not use per-cpu storage,
> > > or risk flakes.
> > >
> > > Also, skb could be freed and re-allocated ?
> > >
> > > Perhaps describe a bit more what is going on in the changelog.
> >
> > There is an on-stack nft_pktinfo structure with a flags field. This
> > nft_pktinfo is a wrapper for the sk_buff, containing header offsets
> > and metainformation. This is initialize when entering this chain.
> >
> > If the NFT_PKTINFO_INNER_FULL flag is set on, then the percpu area
> > could contain information on the inner header offsets (ie. packet was
> > already parsed in this chain).
> >
> > There is a check to validate that the percpu area refers to this
> > skbuff. If there is a mismatch, then this needs to parse the inner
> > headers because the data in the percpu area is stale. Otherwise, if
> > there is a match, the percpu area is copied on-stack.
> >
> > After processing (payload/meta fetching), the on-stack copy is stored
> > back to the percpu area. I can make an improvement on this patch to
> > check if this skbuff still owns the percpu area in the store/exit
> > section of this inner evaluation routine, to avoid a unnecessary update.
> >
> > So, it is basically the NFT_PKTINFO_INNER_FULL flag that handles the
> > skbuff reallocation scenario. This is not blindly trusting this percpu
> > area per-se.
> >
> > One comestic change I can apply to this: I can also turn the struct
> > sk_buff into unsigned long so it clear to the reader this pointer to
> > skbuff is not meant to be used for being dereferenced.
> >
> > If the explaination above is sufficient, I can revamp to extend the
> > changelog as you suggest and post a new version of this patch.
> >
> > Thanks.
> 
> The part I do not understand is that tun_ctx->skb is not cleared at
> the end of processing (or at some point)

I believe on-stack NFT_PKTINFO_INNER_FULL flag is sufficient, but
I will clear it as you suggest to make this more robust.

> That would make clear that a future (tun_ctx->skb == skb) test is not
> confused by skb reuse (free/alloc).
> 
> If you use it as a cookie, then we need something else than a pointer.

Yes, this is a cookie, I can turn this field into unsigned long
instead.

Thanks.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net 4/4] netfilter: nft_inner: incorrect percpu area handling under softirq
  2024-12-02  9:17       ` Eric Dumazet
  2024-12-02  9:28         ` Pablo Neira Ayuso
@ 2024-12-03 20:22         ` Pablo Neira Ayuso
  1 sibling, 0 replies; 13+ messages in thread
From: Pablo Neira Ayuso @ 2024-12-03 20:22 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netfilter-devel, davem, netdev, kuba, pabeni, fw

Hi Eric,

On Mon, Dec 02, 2024 at 10:17:10AM +0100, Eric Dumazet wrote:
> On Mon, Dec 2, 2024 at 2:24 AM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> >
> > Hi Eric,
> >
> > On Fri, Nov 29, 2024 at 10:14:34AM +0100, Eric Dumazet wrote:
> > > On Thu, Nov 28, 2024 at 1:38 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > >
> > > > Softirq can interrupt packet from process context which walks over the
> > > > percpu area.
> > > >
> > > > Add routines to disable bh while restoring and saving the tunnel parser
> > > > context from percpu area to stack. Add a skbuff owner for this percpu
> > > > area to catch softirq interference to exercise the packet tunnel parser
> > > > again in such case.
> > > >
> > > > Reported-by: syzbot+84d0441b9860f0d63285@syzkaller.appspotmail.com
> > > > Fixes: 3a07327d10a0 ("netfilter: nft_inner: support for inner tunnel header matching")
> > > > Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> > > > ---
> > > >  include/net/netfilter/nf_tables_core.h |  1 +
> > > >  net/netfilter/nft_inner.c              | 56 ++++++++++++++++++++------
> > > >  2 files changed, 45 insertions(+), 12 deletions(-)
> > > >
> > > > diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
> > > > index ff27cb2e1662..dae0e7592934 100644
> > > > --- a/include/net/netfilter/nf_tables_core.h
> > > > +++ b/include/net/netfilter/nf_tables_core.h
> > > > @@ -161,6 +161,7 @@ enum {
> > > >  };
> > > >
> > > >  struct nft_inner_tun_ctx {
> > > > +       struct sk_buff *skb;    /* percpu area owner */
> > > >         u16     type;
> > > >         u16     inner_tunoff;
> > > >         u16     inner_lloff;
> > > > diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
> > > > index 928312d01eb1..fcaa126ac8da 100644
> > > > --- a/net/netfilter/nft_inner.c
> > > > +++ b/net/netfilter/nft_inner.c
> > > > @@ -210,35 +210,65 @@ static int nft_inner_parse(const struct nft_inner *priv,
> > > >                            struct nft_pktinfo *pkt,
> > > >                            struct nft_inner_tun_ctx *tun_ctx)
> > > >  {
> > > > -       struct nft_inner_tun_ctx ctx = {};
> > > >         u32 off = pkt->inneroff;
> > > >
> > > >         if (priv->flags & NFT_INNER_HDRSIZE &&
> > > > -           nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0)
> > > > +           nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0)
> > > >                 return -1;
> > > >
> > > >         if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
> > > > -               if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0)
> > > > +               if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0)
> > > >                         return -1;
> > > >         } else if (priv->flags & NFT_INNER_TH) {
> > > > -               ctx.inner_thoff = off;
> > > > -               ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH;
> > > > +               tun_ctx->inner_thoff = off;
> > > > +               tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
> > > >         }
> > > >
> > > > -       *tun_ctx = ctx;
> > > >         tun_ctx->type = priv->type;
> > > > +       tun_ctx->skb = pkt->skb;
> > > >         pkt->flags |= NFT_PKTINFO_INNER_FULL;
> > > >
> > > >         return 0;
> > > >  }
> > > >
> > > > +static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
> > > > +                                     struct nft_inner_tun_ctx *tun_ctx)
> > > > +{
> > > > +       struct nft_inner_tun_ctx *this_cpu_tun_ctx;
> > > > +
> > > > +       local_bh_disable();
> > > > +       this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
> > > > +       if (this_cpu_tun_ctx->skb != pkt->skb) {
> > >
> > > I must say I do not understand this patch.
> > >
> > > If a context is used by a save/restore more than one time per packet
> > > traversal, then this means we can not use per-cpu storage,
> > > or risk flakes.
> > >
> > > Also, skb could be freed and re-allocated ?
> > >
> > > Perhaps describe a bit more what is going on in the changelog.
> >
> > There is an on-stack nft_pktinfo structure with a flags field. This
> > nft_pktinfo is a wrapper for the sk_buff, containing header offsets
> > and metainformation. This is initialize when entering this chain.
> >
> > If the NFT_PKTINFO_INNER_FULL flag is set on, then the percpu area
> > could contain information on the inner header offsets (ie. packet was
> > already parsed in this chain).
> >
> > There is a check to validate that the percpu area refers to this
> > skbuff. If there is a mismatch, then this needs to parse the inner
> > headers because the data in the percpu area is stale. Otherwise, if
> > there is a match, the percpu area is copied on-stack.
> >
> > After processing (payload/meta fetching), the on-stack copy is stored
> > back to the percpu area. I can make an improvement on this patch to
> > check if this skbuff still owns the percpu area in the store/exit
> > section of this inner evaluation routine, to avoid a unnecessary update.
> >
> > So, it is basically the NFT_PKTINFO_INNER_FULL flag that handles the
> > skbuff reallocation scenario. This is not blindly trusting this percpu
> > area per-se.
> >
> > One comestic change I can apply to this: I can also turn the struct
> > sk_buff into unsigned long so it clear to the reader this pointer to
> > skbuff is not meant to be used for being dereferenced.
> >
> > If the explaination above is sufficient, I can revamp to extend the
> > changelog as you suggest and post a new version of this patch.
> >
> > Thanks.
> 
> The part I do not understand is that tun_ctx->skb is not cleared at
> the end of processing (or at some point)
> 
> That would make clear that a future (tun_ctx->skb == skb) test is not
> confused by skb reuse (free/alloc).
>
> If you use it as a cookie, then we need something else than a pointer.

Revisiting this...

This is performing _three checks_ to validate that the percpu area is
valid for this skbuff:

static bool nft_inner_parse_needed(const struct nft_inner *priv,
                                   const struct nft_pktinfo *pkt,
                                   struct nft_inner_tun_ctx *tun_ctx)
{
        if (!(pkt->flags & NFT_PKTINFO_INNER_FULL))
                return true;

1) pkt->flags & NFT_PKTINFO_INNER_FULL tells us this percpu area could
   contain information for this skbuff already _in this chain_.

        if (!nft_inner_restore_tun_ctx(pkt, tun_ctx))
                return true;

2) this above checks if (tun_ctx->skb == skb)

        if (priv->type != tun_ctx->type)
                return true;

3) this also checks if inner header type in percpu area is the same
   as the type of this match.

But NFT_PKTINFO_INNER_FULL is only set for this packet in the context
of the chain.

I don't have a way to clear the percpu area, because I don't know what
would be the last rule that will try to match on inner headers.

As far as I understand, the problematic scenario is this:

   (process context)
        skb A Rule 1, no NFT_PKTINFO_INNER_FULL flag, initialize percpu area
        skb A Rule 2, NFT_PKTINFO_INNER_FULL is set, use percpu area ... but softirq comes while evaluating
        (softirq)
             skb B Rule 1, no NFT_PKTINFO_INNER_FULL flag, initialize percpu area
             (this is overriding the percpu and updating the cookie).
             skb B Rule 2, NFT_PKTINFO_INNER_FULL flag is set, use percpu area
             skb B Rule 3, NFT_PKTINFO_INNER_FULL flag is set, use percpu area
             skb B end of processing
   (resume to process context)
        skb A Rule 2, override percpu area
        skb A Rule 3, NFT_PKTINFO_INNER_FULL is set, use percpu area

I think this approach in this patch is sufficient to deal with this.

I can expand commit description and rename variable to _cookie_ to
make it more obvious to the reader.

Thanks.

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2024-12-03 20:22 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-11-28 12:38 [PATCH net,v2 0/4] Netfilter fixes for net Pablo Neira Ayuso
2024-11-28 12:38 ` [PATCH net 1/4] ipvs: fix UB due to uninitialized stack access in ip_vs_protocol_init() Pablo Neira Ayuso
2024-11-28 12:38 ` [PATCH net 2/4] netfilter: x_tables: fix LED ID check in led_tg_check() Pablo Neira Ayuso
2024-11-28 12:38 ` [PATCH net 3/4] netfilter: nft_socket: remove WARN_ON_ONCE on maximum cgroup level Pablo Neira Ayuso
2024-11-28 12:38 ` [PATCH net 4/4] netfilter: nft_inner: incorrect percpu area handling under softirq Pablo Neira Ayuso
2024-11-29  9:14   ` Eric Dumazet
2024-12-02  1:24     ` Pablo Neira Ayuso
2024-12-02  9:17       ` Eric Dumazet
2024-12-02  9:28         ` Pablo Neira Ayuso
2024-12-03 20:22         ` Pablo Neira Ayuso
2024-11-28 14:33 ` [PATCH net,v2 0/4] Netfilter fixes for net Paolo Abeni
2024-11-28 14:41   ` Pablo Neira Ayuso
  -- strict thread matches above, loose matches on Subject: below --
2024-11-28 12:23 [PATCH net " Pablo Neira Ayuso
2024-11-28 12:23 ` [PATCH net 4/4] netfilter: nft_inner: incorrect percpu area handling under softirq Pablo Neira Ayuso

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).