* [RFC nf-next v3 0/2] netfilter: bpf: support prog update
@ 2023-12-20 14:09 D. Wythe
  2023-12-20 14:09 ` [RFC nf-next v3 1/2] " D. Wythe
  2023-12-20 14:09 ` [RFC nf-next v3 2/2] selftests/bpf: Add netfilter link prog update test D. Wythe
  0 siblings, 2 replies; 9+ messages in thread
From: D. Wythe @ 2023-12-20 14:09 UTC (permalink / raw)
  To: pablo, kadlec, fw
  Cc: bpf, linux-kernel, netdev, coreteam, netfilter-devel, davem,
	edumazet, kuba, pabeni, ast

From: "D. Wythe" <alibuda@linux.alibaba.com>

This patch set implements updating of the prog attached to a bpf
netfilter link, allowing users to hot-update their eBPF netfilter
prog.

A corresponding test case is also added to verify that the update
works.
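
For context, a minimal sketch of the intended userspace flow
(illustrative only; the selftest below uses the higher-level
bpf_link__update_program(), this shows the underlying bpf_link_update()
wrapper, assuming an already-attached netfilter link fd and a freshly
loaded replacement prog fd; the helper name is made up):

	#include <bpf/bpf.h>
	#include <bpf/libbpf.h>

	static int hot_update(int link_fd, int new_prog_fd, int old_prog_fd)
	{
		/* only verify the currently attached prog when the caller
		 * passes an old prog fd (BPF_F_REPLACE semantics)
		 */
		LIBBPF_OPTS(bpf_link_update_opts, opts,
			.old_prog_fd = old_prog_fd,
			.flags = old_prog_fd ? BPF_F_REPLACE : 0);

		/* swap the prog on the existing netfilter link; packets keep
		 * running the old prog until the kernel publishes the new one
		 */
		return bpf_link_update(link_fd, new_prog_fd, &opts);
	}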

--
v1:
1. remove unnecessary context, access the prog directly via rcu.
2. remove synchronize_rcu(), dealloc the nf_link via kfree_rcu.
3. check the dead flag during the update.
--
v1->v2:
1. remove unnecessary nf_prog, accessing nf_link->link.prog in direct.
--
v2->v3:
1. access nf_link->link.prog via rcu_dereference_raw to avoid warning.


D. Wythe (2):
  netfilter: bpf: support prog update
  selftests/bpf: Add netfilter link prog update test

 net/netfilter/nf_bpf_link.c                        | 63 ++++++++++++----
 .../bpf/prog_tests/netfilter_link_update_prog.c    | 83 ++++++++++++++++++++++
 .../bpf/progs/test_netfilter_link_update_prog.c    | 24 +++++++
 3 files changed, 155 insertions(+), 15 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/netfilter_link_update_prog.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_netfilter_link_update_prog.c

-- 
1.8.3.1



* [RFC nf-next v3 1/2] netfilter: bpf: support prog update
  2023-12-20 14:09 [RFC nf-next v3 0/2] netfilter: bpf: support prog update D. Wythe
@ 2023-12-20 14:09 ` D. Wythe
  2023-12-20 21:11   ` Alexei Starovoitov
  2023-12-20 14:09 ` [RFC nf-next v3 2/2] selftests/bpf: Add netfilter link prog update test D. Wythe
  1 sibling, 1 reply; 9+ messages in thread
From: D. Wythe @ 2023-12-20 14:09 UTC (permalink / raw)
  To: pablo, kadlec, fw
  Cc: bpf, linux-kernel, netdev, coreteam, netfilter-devel, davem,
	edumazet, kuba, pabeni, ast

From: "D. Wythe" <alibuda@linux.alibaba.com>

To support prog updates, we need to ensure that the prog seen
within the hook is always valid. Since hooks are always protected
by rcu_read_lock(), we can access the prog under RCU.

Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
---
 net/netfilter/nf_bpf_link.c | 63 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 48 insertions(+), 15 deletions(-)

diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
index e502ec0..9bc91d1 100644
--- a/net/netfilter/nf_bpf_link.c
+++ b/net/netfilter/nf_bpf_link.c
@@ -8,17 +8,8 @@
 #include <net/netfilter/nf_bpf_link.h>
 #include <uapi/linux/netfilter_ipv4.h>
 
-static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,
-				    const struct nf_hook_state *s)
-{
-	const struct bpf_prog *prog = bpf_prog;
-	struct bpf_nf_ctx ctx = {
-		.state = s,
-		.skb = skb,
-	};
-
-	return bpf_prog_run(prog, &ctx);
-}
+/* serialize link updates */
+static DEFINE_MUTEX(bpf_nf_mutex);
 
 struct bpf_nf_link {
 	struct bpf_link link;
@@ -26,8 +17,20 @@ struct bpf_nf_link {
 	struct net *net;
 	u32 dead;
 	const struct nf_defrag_hook *defrag_hook;
+	struct rcu_head head;
 };
 
+static unsigned int nf_hook_run_bpf(void *bpf_link, struct sk_buff *skb,
+				    const struct nf_hook_state *s)
+{
+	const struct bpf_nf_link *nf_link = bpf_link;
+	struct bpf_nf_ctx ctx = {
+		.state = s,
+		.skb = skb,
+	};
+	return bpf_prog_run(rcu_dereference_raw(nf_link->link.prog), &ctx);
+}
+
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 static const struct nf_defrag_hook *
 get_proto_defrag_hook(struct bpf_nf_link *link,
@@ -126,8 +129,7 @@ static void bpf_nf_link_release(struct bpf_link *link)
 static void bpf_nf_link_dealloc(struct bpf_link *link)
 {
 	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
-
-	kfree(nf_link);
+	kfree_rcu(nf_link, head);
 }
 
 static int bpf_nf_link_detach(struct bpf_link *link)
@@ -162,7 +164,34 @@ static int bpf_nf_link_fill_link_info(const struct bpf_link *link,
 static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
 			      struct bpf_prog *old_prog)
 {
-	return -EOPNOTSUPP;
+	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+	int err = 0;
+
+	mutex_lock(&bpf_nf_mutex);
+
+	if (nf_link->dead) {
+		err = -EPERM;
+		goto out;
+	}
+
+	/* target old_prog mismatch */
+	if (old_prog && link->prog != old_prog) {
+		err = -EPERM;
+		goto out;
+	}
+
+	old_prog = link->prog;
+	if (old_prog == new_prog) {
+		/* don't need update */
+		bpf_prog_put(new_prog);
+		goto out;
+	}
+
+	old_prog = xchg(&link->prog, new_prog);
+	bpf_prog_put(old_prog);
+out:
+	mutex_unlock(&bpf_nf_mutex);
+	return err;
 }
 
 static const struct bpf_link_ops bpf_nf_link_lops = {
@@ -226,7 +255,11 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 
 	link->hook_ops.hook = nf_hook_run_bpf;
 	link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF;
-	link->hook_ops.priv = prog;
+
+	/* bpf_nf_link_release() & bpf_nf_link_dealloc() ensure that the link remains
+	 * valid at all times within nf_hook_run_bpf().
+	 */
+	link->hook_ops.priv = link;
 
 	link->hook_ops.pf = attr->link_create.netfilter.pf;
 	link->hook_ops.priority = attr->link_create.netfilter.priority;
-- 
1.8.3.1



* [RFC nf-next v3 2/2] selftests/bpf: Add netfilter link prog update test
  2023-12-20 14:09 [RFC nf-next v3 0/2] netfilter: bpf: support prog update D. Wythe
  2023-12-20 14:09 ` [RFC nf-next v3 1/2] " D. Wythe
@ 2023-12-20 14:09 ` D. Wythe
  1 sibling, 0 replies; 9+ messages in thread
From: D. Wythe @ 2023-12-20 14:09 UTC (permalink / raw)
  To: pablo, kadlec, fw
  Cc: bpf, linux-kernel, netdev, coreteam, netfilter-devel, davem,
	edumazet, kuba, pabeni, ast

From: "D. Wythe" <alibuda@linux.alibaba.com>

Update the prog on an active link and verify that the prog has been
successfully replaced.

Expected output:

./test_progs -t netfilter_link_update_prog
Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
---
 .../bpf/prog_tests/netfilter_link_update_prog.c    | 83 ++++++++++++++++++++++
 .../bpf/progs/test_netfilter_link_update_prog.c    | 24 +++++++
 2 files changed, 107 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/netfilter_link_update_prog.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_netfilter_link_update_prog.c

diff --git a/tools/testing/selftests/bpf/prog_tests/netfilter_link_update_prog.c b/tools/testing/selftests/bpf/prog_tests/netfilter_link_update_prog.c
new file mode 100644
index 00000000..d23b544
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/netfilter_link_update_prog.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <test_progs.h>
+#include <linux/netfilter.h>
+#include <network_helpers.h>
+#include "test_netfilter_link_update_prog.skel.h"
+
+#define SERVER_ADDR "127.0.0.1"
+#define SERVER_PORT 12345
+
+static const char dummy_message[] = "A dummy message";
+
+static int send_dummy(int client_fd)
+{
+	struct sockaddr_storage saddr;
+	struct sockaddr *saddr_p;
+	socklen_t saddr_len;
+	int err;
+
+	saddr_p = (struct sockaddr *)&saddr;
+	err = make_sockaddr(AF_INET, SERVER_ADDR, SERVER_PORT, &saddr, &saddr_len);
+	if (!ASSERT_OK(err, "make_sockaddr"))
+		return -1;
+
+	err = sendto(client_fd, dummy_message, sizeof(dummy_message) - 1, 0, saddr_p, saddr_len);
+	if (!ASSERT_GE(err, 0, "sendto"))
+		return -1;
+
+	return 0;
+}
+
+void test_netfilter_link_update_prog(void)
+{
+	LIBBPF_OPTS(bpf_netfilter_opts, opts,
+		.pf = NFPROTO_IPV4,
+		.hooknum = NF_INET_LOCAL_OUT,
+		.priority = 100);
+	struct test_netfilter_link_update_prog *skel;
+	struct bpf_program *prog;
+	int server_fd = -1, client_fd = -1;
+	int err;
+
+	skel = test_netfilter_link_update_prog__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "test_netfilter_link_update_prog__open_and_load"))
+		goto out;
+
+	prog = skel->progs.nf_link_prog;
+
+	if (!ASSERT_OK_PTR(prog, "load program"))
+		goto out;
+
+	skel->links.nf_link_prog = bpf_program__attach_netfilter(prog, &opts);
+	if (!ASSERT_OK_PTR(skel->links.nf_link_prog, "attach netfilter program"))
+		goto out;
+
+	server_fd = start_server(AF_INET, SOCK_DGRAM, SERVER_ADDR, SERVER_PORT, 0);
+	if (!ASSERT_GE(server_fd, 0, "start_server"))
+		goto out;
+
+	client_fd = connect_to_fd(server_fd, 0);
+	if (!ASSERT_GE(client_fd, 0, "connect_to_fd"))
+		goto out;
+
+	send_dummy(client_fd);
+
+	ASSERT_EQ(skel->bss->counter, 0, "counter should be zero");
+
+	err = bpf_link__update_program(skel->links.nf_link_prog, skel->progs.nf_link_prog_new);
+	if (!ASSERT_OK(err, "bpf_link__update_program"))
+		goto out;
+
+	send_dummy(client_fd);
+	ASSERT_GT(skel->bss->counter, 0, "counter should be greater than zero");
+out:
+	if (client_fd > 0)
+		close(client_fd);
+	if (server_fd > 0)
+		close(server_fd);
+
+	test_netfilter_link_update_prog__destroy(skel);
+}
+
+
diff --git a/tools/testing/selftests/bpf/progs/test_netfilter_link_update_prog.c b/tools/testing/selftests/bpf/progs/test_netfilter_link_update_prog.c
new file mode 100644
index 00000000..42ae332
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_netfilter_link_update_prog.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+#define NF_ACCEPT 1
+
+SEC("netfilter")
+int nf_link_prog(struct bpf_nf_ctx *ctx)
+{
+	return NF_ACCEPT;
+}
+
+u64 counter = 0;
+
+SEC("netfilter")
+int nf_link_prog_new(struct bpf_nf_ctx *ctx)
+{
+	counter++;
+	return NF_ACCEPT;
+}
+
+char _license[] SEC("license") = "GPL";
+
-- 
1.8.3.1



* Re: [RFC nf-next v3 1/2] netfilter: bpf: support prog update
  2023-12-20 14:09 ` [RFC nf-next v3 1/2] " D. Wythe
@ 2023-12-20 21:11   ` Alexei Starovoitov
  2023-12-22  7:06     ` D. Wythe
  0 siblings, 1 reply; 9+ messages in thread
From: Alexei Starovoitov @ 2023-12-20 21:11 UTC (permalink / raw)
  To: D. Wythe
  Cc: Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal, bpf, LKML,
	Network Development, coreteam, netfilter-devel, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov

On Wed, Dec 20, 2023 at 6:09 AM D. Wythe <alibuda@linux.alibaba.com> wrote:
>
> From: "D. Wythe" <alibuda@linux.alibaba.com>
>
> To support the prog update, we need to ensure that the prog seen
> within the hook is always valid. Considering that hooks are always
> protected by rcu_read_lock(), which provide us the ability to
> access the prog under rcu.
>
> Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
> ---
>  net/netfilter/nf_bpf_link.c | 63 ++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 48 insertions(+), 15 deletions(-)
>
> diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
> index e502ec0..9bc91d1 100644
> --- a/net/netfilter/nf_bpf_link.c
> +++ b/net/netfilter/nf_bpf_link.c
> @@ -8,17 +8,8 @@
>  #include <net/netfilter/nf_bpf_link.h>
>  #include <uapi/linux/netfilter_ipv4.h>
>
> -static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,
> -                                   const struct nf_hook_state *s)
> -{
> -       const struct bpf_prog *prog = bpf_prog;
> -       struct bpf_nf_ctx ctx = {
> -               .state = s,
> -               .skb = skb,
> -       };
> -
> -       return bpf_prog_run(prog, &ctx);
> -}
> +/* protect link update in parallel */
> +static DEFINE_MUTEX(bpf_nf_mutex);
>
>  struct bpf_nf_link {
>         struct bpf_link link;
> @@ -26,8 +17,20 @@ struct bpf_nf_link {
>         struct net *net;
>         u32 dead;
>         const struct nf_defrag_hook *defrag_hook;
> +       struct rcu_head head;

I have to point out the same issues as before, but
will ask them differently...

Why do you think above rcu_head is necessary?

>  };
>
> +static unsigned int nf_hook_run_bpf(void *bpf_link, struct sk_buff *skb,
> +                                   const struct nf_hook_state *s)
> +{
> +       const struct bpf_nf_link *nf_link = bpf_link;
> +       struct bpf_nf_ctx ctx = {
> +               .state = s,
> +               .skb = skb,
> +       };
> +       return bpf_prog_run(rcu_dereference_raw(nf_link->link.prog), &ctx);
> +}
> +
>  #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
>  static const struct nf_defrag_hook *
>  get_proto_defrag_hook(struct bpf_nf_link *link,
> @@ -126,8 +129,7 @@ static void bpf_nf_link_release(struct bpf_link *link)
>  static void bpf_nf_link_dealloc(struct bpf_link *link)
>  {
>         struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
> -
> -       kfree(nf_link);
> +       kfree_rcu(nf_link, head);

Why is this needed ?
Have you looked at tcx_link_lops ?

>  }
>
>  static int bpf_nf_link_detach(struct bpf_link *link)
> @@ -162,7 +164,34 @@ static int bpf_nf_link_fill_link_info(const struct bpf_link *link,
>  static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
>                               struct bpf_prog *old_prog)
>  {
> -       return -EOPNOTSUPP;
> +       struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
> +       int err = 0;
> +
> +       mutex_lock(&bpf_nf_mutex);

Why do you need this mutex?
What race does it solve?

> +
> +       if (nf_link->dead) {
> +               err = -EPERM;
> +               goto out;
> +       }
> +
> +       /* target old_prog mismatch */
> +       if (old_prog && link->prog != old_prog) {
> +               err = -EPERM;
> +               goto out;
> +       }
> +
> +       old_prog = link->prog;
> +       if (old_prog == new_prog) {
> +               /* don't need update */
> +               bpf_prog_put(new_prog);
> +               goto out;
> +       }
> +
> +       old_prog = xchg(&link->prog, new_prog);
> +       bpf_prog_put(old_prog);
> +out:
> +       mutex_unlock(&bpf_nf_mutex);
> +       return err;
>  }
>
>  static const struct bpf_link_ops bpf_nf_link_lops = {
> @@ -226,7 +255,11 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
>
>         link->hook_ops.hook = nf_hook_run_bpf;
>         link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF;
> -       link->hook_ops.priv = prog;
> +
> +       /* bpf_nf_link_release & bpf_nf_link_dealloc() can ensures that link remains
> +        * valid at all times within nf_hook_run_bpf().
> +        */
> +       link->hook_ops.priv = link;
>
>         link->hook_ops.pf = attr->link_create.netfilter.pf;
>         link->hook_ops.priority = attr->link_create.netfilter.priority;
> --
> 1.8.3.1
>


* Re: [RFC nf-next v3 1/2] netfilter: bpf: support prog update
  2023-12-20 21:11   ` Alexei Starovoitov
@ 2023-12-22  7:06     ` D. Wythe
  2023-12-22 22:23       ` Alexei Starovoitov
  0 siblings, 1 reply; 9+ messages in thread
From: D. Wythe @ 2023-12-22  7:06 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal, bpf, LKML,
	Network Development, coreteam, netfilter-devel, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov



On 12/21/23 5:11 AM, Alexei Starovoitov wrote:
> On Wed, Dec 20, 2023 at 6:09 AM D. Wythe <alibuda@linux.alibaba.com> wrote:
>> From: "D. Wythe" <alibuda@linux.alibaba.com>
>>
>> To support the prog update, we need to ensure that the prog seen
>> within the hook is always valid. Considering that hooks are always
>> protected by rcu_read_lock(), which provide us the ability to
>> access the prog under rcu.
>>
>> Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
>> ---
>>   net/netfilter/nf_bpf_link.c | 63 ++++++++++++++++++++++++++++++++++-----------
>>   1 file changed, 48 insertions(+), 15 deletions(-)
>>
>> diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
>> index e502ec0..9bc91d1 100644
>> --- a/net/netfilter/nf_bpf_link.c
>> +++ b/net/netfilter/nf_bpf_link.c
>> @@ -8,17 +8,8 @@
>>   #include <net/netfilter/nf_bpf_link.h>
>>   #include <uapi/linux/netfilter_ipv4.h>
>>
>> -static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,
>> -                                   const struct nf_hook_state *s)
>> -{
>> -       const struct bpf_prog *prog = bpf_prog;
>> -       struct bpf_nf_ctx ctx = {
>> -               .state = s,
>> -               .skb = skb,
>> -       };
>> -
>> -       return bpf_prog_run(prog, &ctx);
>> -}
>> +/* protect link update in parallel */
>> +static DEFINE_MUTEX(bpf_nf_mutex);
>>
>>   struct bpf_nf_link {
>>          struct bpf_link link;
>> @@ -26,8 +17,20 @@ struct bpf_nf_link {
>>          struct net *net;
>>          u32 dead;
>>          const struct nf_defrag_hook *defrag_hook;
>> +       struct rcu_head head;
> I have to point out the same issues as before, but
> will ask them differently...
>
> Why do you think above rcu_head is necessary?
>
>>   };
>>
>> +static unsigned int nf_hook_run_bpf(void *bpf_link, struct sk_buff *skb,
>> +                                   const struct nf_hook_state *s)
>> +{
>> +       const struct bpf_nf_link *nf_link = bpf_link;
>> +       struct bpf_nf_ctx ctx = {
>> +               .state = s,
>> +               .skb = skb,
>> +       };
>> +       return bpf_prog_run(rcu_dereference_raw(nf_link->link.prog), &ctx);
>> +}
>> +
>>   #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
>>   static const struct nf_defrag_hook *
>>   get_proto_defrag_hook(struct bpf_nf_link *link,
>> @@ -126,8 +129,7 @@ static void bpf_nf_link_release(struct bpf_link *link)
>>   static void bpf_nf_link_dealloc(struct bpf_link *link)
>>   {
>>          struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
>> -
>> -       kfree(nf_link);
>> +       kfree_rcu(nf_link, head);
> Why is this needed ?
> Have you looked at tcx_link_lops ?

Introducing rcu_head/kfree_rcu addresses the situation where the
netfilter hooks might still access the link after bpf_nf_link_dealloc():

CPU 0 (link teardown)                            CPU 1 (packet path)
                                                 nf_hook_run_bpf()
                                                   const struct bpf_nf_link *nf_link = bpf_link;

bpf_nf_link_release()
  nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);

bpf_nf_link_dealloc()
  free(link)
                                                   bpf_prog_run(link->prog);   /* use-after-free */


I had checked tcx_link_lops; it seems it uses synchronize_rcu() to
solve the same problem, which is also the way we used in the first
version.

https://lore.kernel.org/bpf/1702467945-38866-1-git-send-email-alibuda@linux.alibaba.com/

However, we have received some opposing views arguing that this is a
bit overkill, so we decided to use kfree_rcu.

https://lore.kernel.org/bpf/20231213222415.GA13818@breakpoint.cc/
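
For comparison, a condensed sketch of the two teardown options discussed
here (simplified; unrelated fields, the dead-flag handling and error
paths are omitted, so this is not the literal code of either revision):

	/* first version: wait for in-flight hooks synchronously */
	static void bpf_nf_link_release(struct bpf_link *link)
	{
		struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);

		nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
		synchronize_rcu();	/* hooks that started before unregister have finished */
	}

	/* this version: unregister, then defer the free instead of waiting */
	static void bpf_nf_link_dealloc(struct bpf_link *link)
	{
		struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);

		kfree_rcu(nf_link, head);	/* freed only after a grace period */
	}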

>>   }
>>
>>   static int bpf_nf_link_detach(struct bpf_link *link)
>> @@ -162,7 +164,34 @@ static int bpf_nf_link_fill_link_info(const struct bpf_link *link,
>>   static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
>>                                struct bpf_prog *old_prog)
>>   {
>> -       return -EOPNOTSUPP;
>> +       struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
>> +       int err = 0;
>> +
>> +       mutex_lock(&bpf_nf_mutex);
> Why do you need this mutex?
> What race does it solve?

To avoid users updating a link with different progs at the same time. I
noticed that sys_bpf() doesn't seem to prevent being invoked by multiple
users at the same time. Have I missed something?

Best wishes,
D. Wythe
>> +
>> +       if (nf_link->dead) {
>> +               err = -EPERM;
>> +               goto out;
>> +       }
>> +
>> +       /* target old_prog mismatch */
>> +       if (old_prog && link->prog != old_prog) {
>> +               err = -EPERM;
>> +               goto out;
>> +       }
>> +
>> +       old_prog = link->prog;
>> +       if (old_prog == new_prog) {
>> +               /* don't need update */
>> +               bpf_prog_put(new_prog);
>> +               goto out;
>> +       }
>> +
>> +       old_prog = xchg(&link->prog, new_prog);
>> +       bpf_prog_put(old_prog);
>> +out:
>> +       mutex_unlock(&bpf_nf_mutex);
>> +       return err;
>>   }
>>
>>   static const struct bpf_link_ops bpf_nf_link_lops = {
>> @@ -226,7 +255,11 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
>>
>>          link->hook_ops.hook = nf_hook_run_bpf;
>>          link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF;
>> -       link->hook_ops.priv = prog;
>> +
>> +       /* bpf_nf_link_release & bpf_nf_link_dealloc() can ensures that link remains
>> +        * valid at all times within nf_hook_run_bpf().
>> +        */
>> +       link->hook_ops.priv = link;
>>
>>          link->hook_ops.pf = attr->link_create.netfilter.pf;
>>          link->hook_ops.priority = attr->link_create.netfilter.priority;
>> --
>> 1.8.3.1
>>



* Re: [RFC nf-next v3 1/2] netfilter: bpf: support prog update
  2023-12-22  7:06     ` D. Wythe
@ 2023-12-22 22:23       ` Alexei Starovoitov
  2023-12-27  8:20         ` D. Wythe
  0 siblings, 1 reply; 9+ messages in thread
From: Alexei Starovoitov @ 2023-12-22 22:23 UTC (permalink / raw)
  To: D. Wythe
  Cc: Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal, bpf, LKML,
	Network Development, coreteam, netfilter-devel, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov

On Thu, Dec 21, 2023 at 11:06 PM D. Wythe <alibuda@linux.alibaba.com> wrote:
>
>
>
> On 12/21/23 5:11 AM, Alexei Starovoitov wrote:
> > On Wed, Dec 20, 2023 at 6:09 AM D. Wythe <alibuda@linux.alibaba.com> wrote:
> >> From: "D. Wythe" <alibuda@linux.alibaba.com>
> >>
> >> To support the prog update, we need to ensure that the prog seen
> >> within the hook is always valid. Considering that hooks are always
> >> protected by rcu_read_lock(), which provide us the ability to
> >> access the prog under rcu.
> >>
> >> Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
> >> ---
> >>   net/netfilter/nf_bpf_link.c | 63 ++++++++++++++++++++++++++++++++++-----------
> >>   1 file changed, 48 insertions(+), 15 deletions(-)
> >>
> >> diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
> >> index e502ec0..9bc91d1 100644
> >> --- a/net/netfilter/nf_bpf_link.c
> >> +++ b/net/netfilter/nf_bpf_link.c
> >> @@ -8,17 +8,8 @@
> >>   #include <net/netfilter/nf_bpf_link.h>
> >>   #include <uapi/linux/netfilter_ipv4.h>
> >>
> >> -static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,
> >> -                                   const struct nf_hook_state *s)
> >> -{
> >> -       const struct bpf_prog *prog = bpf_prog;
> >> -       struct bpf_nf_ctx ctx = {
> >> -               .state = s,
> >> -               .skb = skb,
> >> -       };
> >> -
> >> -       return bpf_prog_run(prog, &ctx);
> >> -}
> >> +/* protect link update in parallel */
> >> +static DEFINE_MUTEX(bpf_nf_mutex);
> >>
> >>   struct bpf_nf_link {
> >>          struct bpf_link link;
> >> @@ -26,8 +17,20 @@ struct bpf_nf_link {
> >>          struct net *net;
> >>          u32 dead;
> >>          const struct nf_defrag_hook *defrag_hook;
> >> +       struct rcu_head head;
> > I have to point out the same issues as before, but
> > will ask them differently...
> >
> > Why do you think above rcu_head is necessary?
> >
> >>   };
> >>
> >> +static unsigned int nf_hook_run_bpf(void *bpf_link, struct sk_buff *skb,
> >> +                                   const struct nf_hook_state *s)
> >> +{
> >> +       const struct bpf_nf_link *nf_link = bpf_link;
> >> +       struct bpf_nf_ctx ctx = {
> >> +               .state = s,
> >> +               .skb = skb,
> >> +       };
> >> +       return bpf_prog_run(rcu_dereference_raw(nf_link->link.prog), &ctx);
> >> +}
> >> +
> >>   #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
> >>   static const struct nf_defrag_hook *
> >>   get_proto_defrag_hook(struct bpf_nf_link *link,
> >> @@ -126,8 +129,7 @@ static void bpf_nf_link_release(struct bpf_link *link)
> >>   static void bpf_nf_link_dealloc(struct bpf_link *link)
> >>   {
> >>          struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
> >> -
> >> -       kfree(nf_link);
> >> +       kfree_rcu(nf_link, head);
> > Why is this needed ?
> > Have you looked at tcx_link_lops ?
>
> Introducing rcu_head/kfree_rcu is to address the situation where the
> netfilter hooks might
> still access the link after bpf_nf_link_dealloc.

Why do you think so?

>
>                                                       nf_hook_run_bpf
>                                                       const struct
> bpf_nf_link *nf_link = bpf_link;
>
> bpf_nf_link_release
>      nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
>
> bpf_nf_link_dealloc
>      free(link)
> bpf_prog_run(link->prog);
>
>
> I had checked the tcx_link_lops ,it's seems it use the synchronize_rcu()
> to solve the

Where do you see such code in tcx_link_lops ?

> same problem, which is also the way we used in the first version.
>
> https://lore.kernel.org/bpf/1702467945-38866-1-git-send-email-alibuda@linux.alibaba.com/
>
> However, we have received some opposing views, believing that this is a
> bit overkill,
> so we decided to use kfree_rcu.
>
> https://lore.kernel.org/bpf/20231213222415.GA13818@breakpoint.cc/
>
> >>   }
> >>
> >>   static int bpf_nf_link_detach(struct bpf_link *link)
> >> @@ -162,7 +164,34 @@ static int bpf_nf_link_fill_link_info(const struct bpf_link *link,
> >>   static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
> >>                                struct bpf_prog *old_prog)
> >>   {
> >> -       return -EOPNOTSUPP;
> >> +       struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
> >> +       int err = 0;
> >> +
> >> +       mutex_lock(&bpf_nf_mutex);
> > Why do you need this mutex?
> > What race does it solve?
>
> To avoid user update a link with differ prog at the same time. I noticed
> that sys_bpf()
> doesn't seem to prevent being invoked by user at the same time. Have I
> missed something?

You're correct that sys_bpf() doesn't lock anything.
But what are you serializing in this bpf_nf_link_update() ?
What will happen if multiple bpf_nf_link_update()
without mutex run on different CPUs in parallel ?


* Re: [RFC nf-next v3 1/2] netfilter: bpf: support prog update
  2023-12-22 22:23       ` Alexei Starovoitov
@ 2023-12-27  8:20         ` D. Wythe
  2023-12-27 19:00           ` Alexei Starovoitov
  0 siblings, 1 reply; 9+ messages in thread
From: D. Wythe @ 2023-12-27  8:20 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal, bpf, LKML,
	Network Development, coreteam, netfilter-devel, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov



On 12/23/23 6:23 AM, Alexei Starovoitov wrote:
> On Thu, Dec 21, 2023 at 11:06 PM D. Wythe <alibuda@linux.alibaba.com> wrote:
>>
>>
>> On 12/21/23 5:11 AM, Alexei Starovoitov wrote:
>>> On Wed, Dec 20, 2023 at 6:09 AM D. Wythe <alibuda@linux.alibaba.com> wrote:
>>>> From: "D. Wythe" <alibuda@linux.alibaba.com>
>>>>
>>>> To support the prog update, we need to ensure that the prog seen
>>>> within the hook is always valid. Considering that hooks are always
>>>> protected by rcu_read_lock(), which provide us the ability to
>>>> access the prog under rcu.
>>>>
>>>> Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
>>>> ---
>>>>    net/netfilter/nf_bpf_link.c | 63 ++++++++++++++++++++++++++++++++++-----------
>>>>    1 file changed, 48 insertions(+), 15 deletions(-)
>>>>
>>>> diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
>>>> index e502ec0..9bc91d1 100644
>>>> --- a/net/netfilter/nf_bpf_link.c
>>>> +++ b/net/netfilter/nf_bpf_link.c
>>>> @@ -8,17 +8,8 @@
>>>>    #include <net/netfilter/nf_bpf_link.h>
>>>>    #include <uapi/linux/netfilter_ipv4.h>
>>>>
>>>> -static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,
>>>> -                                   const struct nf_hook_state *s)
>>>> -{
>>>> -       const struct bpf_prog *prog = bpf_prog;
>>>> -       struct bpf_nf_ctx ctx = {
>>>> -               .state = s,
>>>> -               .skb = skb,
>>>> -       };
>>>> -
>>>> -       return bpf_prog_run(prog, &ctx);
>>>> -}
>>>> +/* protect link update in parallel */
>>>> +static DEFINE_MUTEX(bpf_nf_mutex);
>>>>
>>>>    struct bpf_nf_link {
>>>>           struct bpf_link link;
>>>> @@ -26,8 +17,20 @@ struct bpf_nf_link {
>>>>           struct net *net;
>>>>           u32 dead;
>>>>           const struct nf_defrag_hook *defrag_hook;
>>>> +       struct rcu_head head;
>>> I have to point out the same issues as before, but
>>> will ask them differently...
>>>
>>> Why do you think above rcu_head is necessary?
>>>
>>>>    };
>>>>
>>>> +static unsigned int nf_hook_run_bpf(void *bpf_link, struct sk_buff *skb,
>>>> +                                   const struct nf_hook_state *s)
>>>> +{
>>>> +       const struct bpf_nf_link *nf_link = bpf_link;
>>>> +       struct bpf_nf_ctx ctx = {
>>>> +               .state = s,
>>>> +               .skb = skb,
>>>> +       };
>>>> +       return bpf_prog_run(rcu_dereference_raw(nf_link->link.prog), &ctx);
>>>> +}
>>>> +
>>>>    #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
>>>>    static const struct nf_defrag_hook *
>>>>    get_proto_defrag_hook(struct bpf_nf_link *link,
>>>> @@ -126,8 +129,7 @@ static void bpf_nf_link_release(struct bpf_link *link)
>>>>    static void bpf_nf_link_dealloc(struct bpf_link *link)
>>>>    {
>>>>           struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
>>>> -
>>>> -       kfree(nf_link);
>>>> +       kfree_rcu(nf_link, head);
>>> Why is this needed ?
>>> Have you looked at tcx_link_lops ?
>> Introducing rcu_head/kfree_rcu is to address the situation where the
>> netfilter hooks might
>> still access the link after bpf_nf_link_dealloc.
> Why do you think so?
>

Hi Alexei,


IMHO, nf_unregister_net_hook() does not wait for the completion of the
hook that is being removed; instead, it allocates a new array without
that hook to replace the old array via rcu_assign_pointer() (in
__nf_hook_entries_try_shrink), then uses call_rcu() to release the old
one.

You can find more details in commit 
8c873e2199700c2de7dbd5eedb9d90d5f109462b.

In other words, when nf_unregister_net_hook returns, there may still be 
contexts executing hooks on the
old array, which means that the `link` may still be accessed after 
nf_unregister_net_hook returns.

And that's the reason why we use kfree_rcu() to release the `link`.
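
In pattern form, a generic illustration of what that unregister/shrink
path does (hypothetical helper and lock names, not the exact code in
net/netfilter/core.c):

	/* removal side, with the hook list mutex held */
	old = rcu_dereference_protected(*pp, lockdep_is_held(&hook_mutex));
	new = copy_entries_without(old, removed_hook);	/* smaller array */
	rcu_assign_pointer(*pp, new);		/* new lookups now see 'new' */
	call_rcu(&old->rcu, free_entries_rcu);	/* 'old' freed only after a grace period */

	/* packet path (reader side) */
	rcu_read_lock();
	e = rcu_dereference(*pp);	/* may still be the old array */
	/* ... iterate 'e' and run its hooks, possibly the one just removed ... */
	rcu_read_unlock();
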
>>                                                        nf_hook_run_bpf
>>                                                        const struct
>> bpf_nf_link *nf_link = bpf_link;
>>
>> bpf_nf_link_release
>>       nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
>>
>> bpf_nf_link_dealloc
>>       free(link)
>> bpf_prog_run(link->prog);
>>
>>
>> I had checked the tcx_link_lops ,it's seems it use the synchronize_rcu()
>> to solve the
> Where do you see such code in tcx_link_lops ?

I'm not certain if the reason it chooses to use synchronize_rcu() is
the same as mine, but I did see it here:


tcx_link_release() -> tcx_entry_sync()


static inline void tcx_entry_sync(void)
{
     /* bpf_mprog_entry got a/b swapped, therefore ensure that
      * there are no inflight users on the old one anymore.
      */
     synchronize_rcu();
}

>> same problem, which is also the way we used in the first version.
>>
>> https://lore.kernel.org/bpf/1702467945-38866-1-git-send-email-alibuda@linux.alibaba.com/
>>
>> However, we have received some opposing views, believing that this is a
>> bit overkill,
>> so we decided to use kfree_rcu.
>>
>> https://lore.kernel.org/bpf/20231213222415.GA13818@breakpoint.cc/
>>
>>>>    }
>>>>
>>>>    static int bpf_nf_link_detach(struct bpf_link *link)
>>>> @@ -162,7 +164,34 @@ static int bpf_nf_link_fill_link_info(const struct bpf_link *link,
>>>>    static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
>>>>                                 struct bpf_prog *old_prog)
>>>>    {
>>>> -       return -EOPNOTSUPP;
>>>> +       struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
>>>> +       int err = 0;
>>>> +
>>>> +       mutex_lock(&bpf_nf_mutex);
>>> Why do you need this mutex?
>>> What race does it solve?
>> To avoid user update a link with differ prog at the same time. I noticed
>> that sys_bpf()
>> doesn't seem to prevent being invoked by user at the same time. Have I
>> missed something?
> You're correct that sys_bpf() doesn't lock anything.
> But what are you serializing in this bpf_nf_link_update() ?
> What will happen if multiple bpf_nf_link_update()
> without mutex run on different CPUs in parallel ?

I must admit that it is indeed feasible if we eliminate the mutex and 
use cmpxchg to swap the prog (we need to ensure that there is only one 
bpf_prog_put() on the old prog).
However, when cmpxchg fails, it means that this context has not 
outcompeted the other one, and we have to return a failure. Maybe 
something like this:

if (cmpxchg(&link->prog, old_prog, new_prog) != old_prog) {
     /* already replaced by another link_update */
     return -xxx;
}

As a comparison, the version with the mutex wouldn't encounter this
error; every update would succeed. I think that it's too harsh for the
user to receive a failure in that case since they haven't done anything
wrong.

Best wishes,
D. Wythe




* Re: [RFC nf-next v3 1/2] netfilter: bpf: support prog update
  2023-12-27  8:20         ` D. Wythe
@ 2023-12-27 19:00           ` Alexei Starovoitov
  2023-12-28 11:06             ` D. Wythe
  0 siblings, 1 reply; 9+ messages in thread
From: Alexei Starovoitov @ 2023-12-27 19:00 UTC (permalink / raw)
  To: D. Wythe
  Cc: Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal, bpf, LKML,
	Network Development, coreteam, netfilter-devel, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov

On Wed, Dec 27, 2023 at 12:20 AM D. Wythe <alibuda@linux.alibaba.com> wrote:
>
>
> Hi Alexei,
>
>
> IMMO, nf_unregister_net_hook does not wait for the completion of the
> execution of the hook that is being removed,
> instead, it allocates a new array without the very hook to replace the
> old arrayvia rcu_assign_pointer() (in __nf_hook_entries_try_shrink),
> then it use call_rcu() to release the old one.
>
> You can find more details in commit
> 8c873e2199700c2de7dbd5eedb9d90d5f109462b.
>
> In other words, when nf_unregister_net_hook returns, there may still be
> contexts executing hooks on the
> old array, which means that the `link` may still be accessed after
> nf_unregister_net_hook returns.
>
> And that's the reason why we use kfree_rcu() to release the `link`.
> >>                                                        nf_hook_run_bpf
> >>                                                        const struct
> >> bpf_nf_link *nf_link = bpf_link;
> >>
> >> bpf_nf_link_release
> >>       nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
> >>
> >> bpf_nf_link_dealloc
> >>       free(link)
> >> bpf_prog_run(link->prog);

Got it.
Sounds like it's an existing bug. If so it should be an independent
patch with Fixes tag.

Also please craft a test case to demonstrate UAF.

>
> I must admit that it is indeed feasible if we eliminate the mutex and
> use cmpxchg to swap the prog (we need to ensure that there is only one
> bpf_prog_put() on the old prog).
> However, when cmpxchg fails, it means that this context has not
> outcompeted the other one, and we have to return a failure. Maybe
> something like this:
>
> if (!cmpxchg(&link->prog, old_prog, new_prog)) {
>      /* already replaced by another link_update */
>      return -xxx;
> }
>
> As a comparison, The version with the mutex wouldn't encounter this
> error, every update would succeed. I think that it's too harsh for the
> user to receive a failure
> in that case since they haven't done anything wrong.

Disagree. The mutex doesn't prevent this issue.
There is always a race.
It happens when link_update.old_prog_fd and BPF_F_REPLACE
were specified.
One user space passes an FD of the old prog and
another user space does the same. They both race and one of them
gets
if (old_prog && link->prog != old_prog) {
               err = -EPERM;

It's no different from dropping the mutex and doing:
if (old_prog) {
    if (cmpxchg(&link->prog, old_prog, new_prog) != old_prog)
      -EPERM
} else {
   old_prog = xchg(&link->prog, new_prog);
}


* Re: [RFC nf-next v3 1/2] netfilter: bpf: support prog update
  2023-12-27 19:00           ` Alexei Starovoitov
@ 2023-12-28 11:06             ` D. Wythe
  0 siblings, 0 replies; 9+ messages in thread
From: D. Wythe @ 2023-12-28 11:06 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal, bpf, LKML,
	Network Development, coreteam, netfilter-devel, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov



On 12/28/23 3:00 AM, Alexei Starovoitov wrote:
> On Wed, Dec 27, 2023 at 12:20 AM D. Wythe <alibuda@linux.alibaba.com> wrote:
>>
>> Hi Alexei,
>>
>>
>> IMMO, nf_unregister_net_hook does not wait for the completion of the
>> execution of the hook that is being removed,
>> instead, it allocates a new array without the very hook to replace the
>> old arrayvia rcu_assign_pointer() (in __nf_hook_entries_try_shrink),
>> then it use call_rcu() to release the old one.
>>
>> You can find more details in commit
>> 8c873e2199700c2de7dbd5eedb9d90d5f109462b.
>>
>> In other words, when nf_unregister_net_hook returns, there may still be
>> contexts executing hooks on the
>> old array, which means that the `link` may still be accessed after
>> nf_unregister_net_hook returns.
>>
>> And that's the reason why we use kfree_rcu() to release the `link`.
>>>>                                                         nf_hook_run_bpf
>>>>                                                         const struct
>>>> bpf_nf_link *nf_link = bpf_link;
>>>>
>>>> bpf_nf_link_release
>>>>        nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
>>>>
>>>> bpf_nf_link_dealloc
>>>>        free(link)
>>>> bpf_prog_run(link->prog);
> Got it.
> Sounds like it's an existing bug. If so it should be an independent
> patch with Fixes tag.
>
> Also please craft a test case to demonstrate UAF.
>

It is not an existing bug... Accessing the link within the hook was
something I introduced here to support updates 😉, as previously there
was no access to the link within the hook.
>> I must admit that it is indeed feasible if we eliminate the mutex and
>> use cmpxchg to swap the prog (we need to ensure that there is only one
>> bpf_prog_put() on the old prog).
>> However, when cmpxchg fails, it means that this context has not
>> outcompeted the other one, and we have to return a failure. Maybe
>> something like this:
>>
>> if (!cmpxchg(&link->prog, old_prog, new_prog)) {
>>       /* already replaced by another link_update */
>>       return -xxx;
>> }
>>
>> As a comparison, The version with the mutex wouldn't encounter this
>> error, every update would succeed. I think that it's too harsh for the
>> user to receive a failure
>> in that case since they haven't done anything wrong.
> Disagree. The mutex doesn't prevent this issue.
> There is always a race.
> It happens when link_update.old_prog_fd and BPF_F_REPLACE
> were specified.
> One user space passes an FD of the old prog and
> another user space doing the same. They both race and one of them
> gets
> if (old_prog && link->prog != old_prog) {
>                 err = -EPERM;
>
> it's no different with dropping the mutex and doing:
> if (old_prog) {
>      if (!cmpxchg(&link->prog, old_prog, new_prog))
>        -EPERM
> } else {
>     old_prog = xchg(&link->prog, new_prog);
> }

Got it! It's very helpful, thanks very much! I will modify my patch
accordingly.
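
For the record, a rough sketch of what a lock-free bpf_nf_link_update()
along the lines suggested above might look like (illustrative only, not
the actual follow-up patch):

	static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
				      struct bpf_prog *old_prog)
	{
		struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);

		if (nf_link->dead)
			return -EPERM;

		if (old_prog) {
			/* BPF_F_REPLACE: fail if someone else swapped the prog first */
			if (cmpxchg(&link->prog, old_prog, new_prog) != old_prog)
				return -EPERM;
		} else {
			old_prog = xchg(&link->prog, new_prog);
		}

		bpf_prog_put(old_prog);
		return 0;
	}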


Best wishes,
D. Wythe






