Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf 3/4] bpf_trace: handle compat in kprobe_multi_resolve_syms
From: Eugene Syromiatnikov @ 2022-05-16 18:27 UTC (permalink / raw)
  To: Jiri Olsa, Masami Hiramatsu, Steven Rostedt, Ingo Molnar,
	Alexei Starovoitov, Daniel Borkmann
  Cc: Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, netdev, bpf, linux-kernel, Shuah Khan,
	linux-kselftest

For compat processes, userspace pointer size is different.  Since the
copied array is iterated anyway, the simplest fix seems to be copy the
user-supplied array as-is and the iterate as an array of native or
compat pointers, depending on the in_compat_syscall() value.

Fixes: 0dcac272540613d4 ("bpf: Add multi kprobe link")
Signed-off-by: Eugene Syromiatnikov <esyr@redhat.com>
---
 kernel/trace/bpf_trace.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d228440..5b0cf54 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2352,16 +2352,21 @@ static int
 kprobe_multi_resolve_syms(const void __user *usyms, u32 cnt,
 			  unsigned long *addrs)
 {
-	unsigned long addr, size;
+	unsigned long addr;
+	size_t sym_size;
+	u32 size, elem_size;
 	const char __user **syms;
+	compat_uptr_t __user *compat_syms;
 	int err = -ENOMEM;
 	unsigned int i;
 	char *func;
 
-	if (check_mul_overflow(cnt, sizeof(*syms), &size))
+	elem_size = in_compat_syscall() ? sizeof(*compat_syms) : sizeof(*syms);
+	if (check_mul_overflow(cnt, elem_size, &size))
 		return -EOVERFLOW;
-	size = cnt * sizeof(*syms);
+	size = cnt * elem_size;
 	syms = kvzalloc(size, GFP_KERNEL);
+	compat_syms = (void *)syms;
 	if (!syms)
 		return -ENOMEM;
 
@@ -2375,7 +2380,10 @@ kprobe_multi_resolve_syms(const void __user *usyms, u32 cnt,
 	}
 
 	for (i = 0; i < cnt; i++) {
-		err = strncpy_from_user(func, syms[i], KSYM_NAME_LEN);
+		const char __user *ufunc = in_compat_syscall()
+					? (char __user *)(uintptr_t)compat_syms[i]
+					: syms[i];
+		err = strncpy_from_user(func, ufunc, KSYM_NAME_LEN);
 		if (err == KSYM_NAME_LEN)
 			err = -E2BIG;
 		if (err < 0)
@@ -2384,9 +2392,9 @@ kprobe_multi_resolve_syms(const void __user *usyms, u32 cnt,
 		addr = kallsyms_lookup_name(func);
 		if (!addr)
 			goto error;
-		if (!kallsyms_lookup_size_offset(addr, &size, NULL))
+		if (!kallsyms_lookup_size_offset(addr, &sym_size, NULL))
 			goto error;
-		addr = ftrace_location_range(addr, addr + size - 1);
+		addr = ftrace_location_range(addr, addr + sym_size - 1);
 		if (!addr)
 			goto error;
 		addrs[i] = addr;
-- 
2.1.4


^ permalink raw reply related

* [PATCH bpf 2/4] bpf_trace: support 32-bit kernels in bpf_kprobe_multi_link_attach
From: Eugene Syromiatnikov @ 2022-05-16 18:27 UTC (permalink / raw)
  To: Jiri Olsa, Masami Hiramatsu, Steven Rostedt, Ingo Molnar,
	Alexei Starovoitov, Daniel Borkmann
  Cc: Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, netdev, bpf, linux-kernel, Shuah Khan,
	linux-kselftest

It seems that there is no reason not to support 32-bit architectures;
doing so requires a bit of rework with respect to cookies handling,
however, as the current code implicitly assumes
that sizeof(long) == sizeof(u64).

Signed-off-by: Eugene Syromiatnikov <esyr@redhat.com>
---
 kernel/trace/bpf_trace.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e90c4ce7..d228440 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2405,16 +2405,12 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	struct bpf_link_primer link_primer;
 	void __user *ucookies;
 	unsigned long *addrs;
-	u32 flags, cnt, size;
+	u32 flags, cnt, size, cookies_size;
 	void __user *uaddrs;
 	u64 *cookies = NULL;
 	void __user *usyms;
 	int err;
 
-	/* no support for 32bit archs yet */
-	if (sizeof(u64) != sizeof(void *))
-		return -EOPNOTSUPP;
-
 	if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI)
 		return -EINVAL;
 
@@ -2424,6 +2420,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 
 	uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs);
 	usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms);
+	ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies);
 	if (!!uaddrs == !!usyms)
 		return -EINVAL;
 
@@ -2431,8 +2428,11 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	if (!cnt)
 		return -EINVAL;
 
-	if (check_mul_overflow(cnt, (u32)sizeof(*addrs), &size))
+	if (check_mul_overflow(cnt, (u32)sizeof(*addrs), &size) ||
+	    (ucookies &&
+	     check_mul_overflow(cnt, (u32)sizeof(*cookies), &cookies_size))) {
 		return -EOVERFLOW;
+	}
 	size = cnt * sizeof(*addrs);
 	addrs = kvmalloc(size, GFP_KERNEL);
 	if (!addrs)
@@ -2449,14 +2449,14 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 			goto error;
 	}
 
-	ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies);
 	if (ucookies) {
-		cookies = kvmalloc(size, GFP_KERNEL);
+		cookies_size = cnt * sizeof(*cookies);
+		cookies = kvmalloc(cookies_size, GFP_KERNEL);
 		if (!cookies) {
 			err = -ENOMEM;
 			goto error;
 		}
-		if (copy_from_user(cookies, ucookies, size)) {
+		if (copy_from_user(cookies, ucookies, cookies_size)) {
 			err = -EFAULT;
 			goto error;
 		}
-- 
2.1.4


^ permalink raw reply related

* [PATCH bpf 1/4] bpf_trace: check size for overflow in bpf_kprobe_multi_link_attach
From: Eugene Syromiatnikov @ 2022-05-16 18:27 UTC (permalink / raw)
  To: Jiri Olsa, Masami Hiramatsu, Steven Rostedt, Ingo Molnar,
	Alexei Starovoitov, Daniel Borkmann
  Cc: Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, netdev, bpf, linux-kernel, Shuah Khan,
	linux-kselftest

Check that size would not overflow before calculation (and return
-EOVERFLOW if it will), to prevent potential out-of-bounds write
with the following copy_from_user.  Add the same check
to kprobe_multi_resolve_syms in case it will be called from elsewhere
in the future.

Fixes: 0dcac272540613d4 ("bpf: Add multi kprobe link")
Signed-off-by: Eugene Syromiatnikov <esyr@redhat.com>
---
 kernel/trace/bpf_trace.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d8553f4..e90c4ce7 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2358,6 +2358,8 @@ kprobe_multi_resolve_syms(const void __user *usyms, u32 cnt,
 	unsigned int i;
 	char *func;
 
+	if (check_mul_overflow(cnt, sizeof(*syms), &size))
+		return -EOVERFLOW;
 	size = cnt * sizeof(*syms);
 	syms = kvzalloc(size, GFP_KERNEL);
 	if (!syms)
@@ -2429,6 +2431,8 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	if (!cnt)
 		return -EINVAL;
 
+	if (check_mul_overflow(cnt, (u32)sizeof(*addrs), &size))
+		return -EOVERFLOW;
 	size = cnt * sizeof(*addrs);
 	addrs = kvmalloc(size, GFP_KERNEL);
 	if (!addrs)
-- 
2.1.4


^ permalink raw reply related

* [PATCH bpf 0/4] Fix 32-bit arch and compat support for the kprobe_multi attach type
From: Eugene Syromiatnikov @ 2022-05-16 18:26 UTC (permalink / raw)
  To: Jiri Olsa, Masami Hiramatsu, Steven Rostedt, Ingo Molnar,
	Alexei Starovoitov, Daniel Borkmann
  Cc: Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, netdev, bpf, linux-kernel, Shuah Khan,
	linux-kselftest

As suggested in [1], the kprobe_multi interface is to be fixed for 32-bit
architectures and compat, rather then disabled.  As it turned out,
there are a couple of additional problems that are to be addressed:
 - the absence of size overflow checks, leading to possible
   out-of-bounds writes (addressed by the first patch);
 - the assumption that long has the same size as u64, which would make
   cookies arrays size calculation incorrect on 32-bit architectures
   (addressed by the second patch);
 - the addrs array passing API, that is incompatible with compat and has
   to be changed (addressed in the fourth patch): those are kernel
   addresses and not user ones (as was incorrectly stated in [2]);
   this change is only semantical for 64-bit user/kernelspace,
   so it shouldn't impact ABI there, at least.

[1] https://lore.kernel.org/lkml/CAADnVQ+2gwhcMht4PuDnDOFKY68Wsq8QFz4Y69NBX_TLaSexQQ@mail.gmail.com/
[2] https://lore.kernel.org/lkml/20220510184155.GA8295@asgard.redhat.com/

Eugene Syromiatnikov (4):
  bpf_trace: check size for overflow in bpf_kprobe_multi_link_attach
  bpf_trace: support 32-bit kernels in bpf_kprobe_multi_link_attach
  bpf_trace: handle compat in kprobe_multi_resolve_syms
  bpf_trace: pass array of u64 values in kprobe_multi.addrs

 kernel/trace/bpf_trace.c                           | 63 ++++++++++++++++------
 tools/lib/bpf/bpf.h                                |  2 +-
 tools/lib/bpf/libbpf.c                             |  8 +--
 tools/lib/bpf/libbpf.h                             |  2 +-
 .../testing/selftests/bpf/prog_tests/bpf_cookie.c  |  2 +-
 .../selftests/bpf/prog_tests/kprobe_multi_test.c   |  2 +-
 6 files changed, 54 insertions(+), 25 deletions(-)

-- 
2.1.4


^ permalink raw reply

* Re: [PATCH net-next 4/4] net: call skb_defer_free_flush() before each napi_poll()
From: Eric Dumazet @ 2022-05-16 18:26 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: Eric Dumazet, David S . Miller, Paolo Abeni, netdev
In-Reply-To: <20220516112140.0f088427@kernel.org>

On Mon, May 16, 2022 at 11:21 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Sun, 15 May 2022 21:24:56 -0700 Eric Dumazet wrote:
> > -end:
> > -     skb_defer_free_flush(sd);
> > +end:;
>
> Sorry for the nit pick but can I remove this and just return like we
> did before f3412b3879b4? Is there a reason such "label:;}" is good?

I thought that having a return in the middle of this function would
hurt us at some point.

^ permalink raw reply

* Re: [PATCH net-next 1/4] net: fix possible race in skb_attempt_defer_free()
From: Eric Dumazet @ 2022-05-16 18:24 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: Eric Dumazet, David S . Miller, Paolo Abeni, netdev
In-Reply-To: <20220516111554.5585a6b5@kernel.org>

On Mon, May 16, 2022 at 11:16 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Sun, 15 May 2022 21:24:53 -0700 Eric Dumazet wrote:
> > A cpu can observe sd->defer_count reaching 128,
> > and call smp_call_function_single_async()
> >
> > Problem is that the remote CPU can clear sd->defer_count
> > before the IPI is run/acknowledged.
> >
> > Other cpus can queue more packets and also decide
> > to call smp_call_function_single_async() while the pending
> > IPI was not yet delivered.
> >
> > This is a common issue with smp_call_function_single_async().
> > Callers must ensure correct synchronization and serialization.
> >
> > I triggered this issue while experimenting smaller threshold.
> > Performing the call to smp_call_function_single_async()
> > under sd->defer_lock protection did not solve the problem.
> >
> > Commit 5a18ceca6350 ("smp: Allow smp_call_function_single_async()
> > to insert locked csd") replaced an informative WARN_ON_ONCE()
> > with a return of -EBUSY, which is often ignored.
> > Test of CSD_FLAG_LOCK presence is racy anyway.
>
> If I'm reading this right this is useful for backports but in net-next
> it really is a noop? The -EBUSY would be perfectly safe to ignore?
> Just checking.

Not sure I understand the question.

trigger_rx_softirq() and friends were only in net-next, so there is no
backport needed.

Are you talking of calls from net_rps_send_ipi() ?
These are fine, because we own an atomic bit (NAPI_STATE_SCHED).

^ permalink raw reply

* [PATCH bpf 0/4] Fix 32-bit arch and compat support for the kprobe_multi attach type
From: Eugene Syromiatnikov @ 2022-05-16 18:24 UTC (permalink / raw)
  To: Jiri Olsa, Masami Hiramatsu, Steven Rostedt, Ingo Molnar,
	Alexei Starovoitov, Daniel Borkmann
  Cc: Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, netdev, bpf, linux-kernel,
	linux-kselftest

As suggested in [1], the kprobe_multi interface is to be fixed for 32-bit
architectures and compat, rather then disabled.  As it turned out,
there are a couple of additional problems that are to be addressed:
 - the absence of size overflow checks, leading to possible
   out-of-bounds writes (addressed by the first patch);
 - the assumption that long has the same size as u64, which would make
   cookies arrays size calculation incorrect on 32-bit architectures
   (addressed by the second patch);
 - the addrs array passing API, that is incompatible with compat and has
   to be changed (addressed in the fourth patch): those are kernel
   addresses and not user ones (as was incorrectly stated in [2]);
   this change is only semantical for 64-bit user/kernelspace,
   so it shouldn't impact ABI there, at least.

[1] https://lore.kernel.org/lkml/CAADnVQ+2gwhcMht4PuDnDOFKY68Wsq8QFz4Y69NBX_TLaSexQQ@mail.gmail.com/
[2] https://lore.kernel.org/lkml/20220510184155.GA8295@asgard.redhat.com/

Eugene Syromiatnikov (4):
  bpf_trace: check size for overflow in bpf_kprobe_multi_link_attach
  bpf_trace: support 32-bit kernels in bpf_kprobe_multi_link_attach
  bpf_trace: handle compat in kprobe_multi_resolve_syms
  bpf_trace: pass array of u64 values in kprobe_multi.addrs

 kernel/trace/bpf_trace.c                           | 63 ++++++++++++++++------
 tools/lib/bpf/bpf.h                                |  2 +-
 tools/lib/bpf/libbpf.c                             |  8 +--
 tools/lib/bpf/libbpf.h                             |  2 +-
 .../testing/selftests/bpf/prog_tests/bpf_cookie.c  |  2 +-
 .../selftests/bpf/prog_tests/kprobe_multi_test.c   |  2 +-
 6 files changed, 54 insertions(+), 25 deletions(-)

-- 
2.1.4


^ permalink raw reply

* Re: [PATCH net v2] netfilter: nf_flow_table: fix teardown flow timeout
From: Sven Auhagen @ 2022-05-16 18:23 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Oz Shlomo, Felix Fietkau, netdev, netfilter-devel,
	Florian Westphal, Paul Blakey
In-Reply-To: <YoKO0dPJs+VjbTXP@salvia>

On Mon, May 16, 2022 at 07:50:09PM +0200, Pablo Neira Ayuso wrote:
> On Mon, May 16, 2022 at 03:02:13PM +0200, Sven Auhagen wrote:
> > On Mon, May 16, 2022 at 02:43:06PM +0200, Pablo Neira Ayuso wrote:
> > > On Mon, May 16, 2022 at 02:23:00PM +0200, Sven Auhagen wrote:
> > > > On Mon, May 16, 2022 at 02:13:03PM +0200, Pablo Neira Ayuso wrote:
> > > > > On Mon, May 16, 2022 at 12:56:41PM +0200, Pablo Neira Ayuso wrote:
> > > > > > On Thu, May 12, 2022 at 09:28:03PM +0300, Oz Shlomo wrote:
> [...]
> > > > > > [...]
> > > > > > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > > > > > > index 0164e5f522e8..324fdb62c08b 100644
> > > > > > > --- a/net/netfilter/nf_conntrack_core.c
> > > > > > > +++ b/net/netfilter/nf_conntrack_core.c
> > > > > > > @@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
> > > > > > >  			tmp = nf_ct_tuplehash_to_ctrack(h);
> > > > > > >  
> > > > > > >  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> > > > > > > -				nf_ct_offload_timeout(tmp);
> > > > > > 
> > > > > > Hm, it is the trick to avoid checking for IPS_OFFLOAD from the packet
> > > > > > path that triggers the race, ie. nf_ct_is_expired()
> > > > > > 
> > > > > > The flowtable ct fixup races with conntrack gc collector.
> > > > > > 
> > > > > > Clearing IPS_OFFLOAD might result in offloading the entry again for
> > > > > > the closing packets.
> > > > > > 
> > > > > > Probably clear IPS_OFFLOAD from teardown, and skip offload if flow is
> > > > > > in a TCP state that represent closure?
> > > > > > 
> > > > > >   		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > > > > >   			goto out;
> > > > > > 
> > > > > > this is already the intention in the existing code.
> > > > > 
> > > > > I'm attaching an incomplete sketch patch. My goal is to avoid the
> > > > > extra IPS_ bit.
> > > > 
> > > > You might create a race with ct gc that will remove the ct
> > > > if it is in close or end of close and before flow offload teardown is running
> > > > so flow offload teardown might access memory that was freed.
> > > 
> > > flow object holds a reference to the ct object until it is released,
> > > no use-after-free can happen.
> > > 
> > 
> > Also if nf_ct_delete is called before flowtable delete?
> > Can you let me know why?
> 
> nf_ct_delete() removes the conntrack object from lists and it
> decrements the reference counter by one.
> 
> flow_offload_free() also calls nf_ct_put(). flow_offload_alloc() bumps
> the reference count on the conntrack object before creating the flow.
> 
> > > > It is not a very likely scenario but never the less it might happen now
> > > > since the IPS_OFFLOAD_BIT is not set and the state might just time out.
> > > > 
> > > > If someone sets a very small TCP CLOSE timeout it gets more likely.
> > > > 
> > > > So Oz and myself were debatting about three possible cases/problems:
> > > > 
> > > > 1. ct gc sets timeout even though the state is in CLOSE/FIN because the
> > > > IPS_OFFLOAD is still set but the flow is in teardown
> > > > 2. ct gc removes the ct because the IPS_OFFLOAD is not set and
> > > > the CLOSE timeout is reached before the flow offload del
> > > 
> > > OK.
> > > 
> > > > 3. tcp ct is always set to ESTABLISHED with a very long timeout
> > > > in flow offload teardown/delete even though the state is already
> > > > CLOSED.
> > > >
> > > > Also as a remark we can not assume that the FIN or RST packet is hitting
> > > > flow table teardown as the packet might get bumped to the slow path in
> > > > nftables.
> > > 
> > > I assume this remark is related to 3.?
> > 
> > Yes, exactly.
> > 
> > > if IPS_OFFLOAD is unset, then conntrack would update the state
> > > according to this FIN or RST.
> > 
> > It will move to a different TCP state anyways only the ct state
> > will be at IPS_OFFLOAD_BIT and prevent it from beeing garbage collected.
> > The timeout will be bumped back up as long as IPS_OFFLOAD_BIT is set
> > even though TCP might already be CLOSED.

I see what you are trying to do here, I have some remarks:

> 
> If teardown fixes the ct state and timeout to established, and IPS_OFFLOAD is
> unset, then the packet is passed up in a consistent state.
> 
> I made a patch, it is based on yours, it's attached:
> 
> - If flow timeout expires or rst/fin is seen, ct state and timeout is
>   fixed up (to established state) and IPS_OFFLOAD is unset.
> 
> - If rst/fin packet is seen, ct state and timeout is fixed up (to
>   established state) and IPS_OFFLOAD is unset. The packet continues
>   its travel up to the classic path, so conntrack triggers the
>   transition from established to one of the close states.
> 
> For the case 1., IPS_OFFLOAD is not set anymore, so conntrack gc
> cannot race to reset the ct timeout anymore.
> 
> For the case 2., if gc conntrack ever removes the ct entry, then the
> IPS_DYING bit is set, which implicitly triggers the teardown state
> from the flowtable gc. The flowtable still holds a reference to the
> ct object, so no UAF can happen.
> 
> For the case 3. the conntrack is set to ESTABLISHED with a long
> timeout, yes. This is to deal with the two possible cases:
> 
> a) flowtable timeout expired, so conntrack recovers control on the
>    flow.
> b) tcp rst/fin will take back the packet to slow path. The ct has been
>    fixed up to established state so it will trasition to one of the
>    close states.
> 
> Am I missing anything?

You should not fixup the tcp state back to established.
If flow_offload_teardown is not called because a packet got bumped up to the slow path
and you call flow_offload_teardown from nf_flow_offload_gc_step, the tcp state might already
be in CLOSE state and you just moved it back to established.
The entire function flow_offload_fixup_tcp can go away if we only allow established tcp states
in the flowtable.

Same goes for the timeout. The timeout should really be set to the current tcp state
ct->proto.tcp->state which might not be established anymore.

For me the question remains, why can the ct gc not remove the ct when nf_ct_delete
is called before flow_offload_del is called?

Also you probably want to move the IPS_OFFLOAD_BIT to the beginning of
flow_offload_teardown just to make sure that the ct gc is not bumping up the ct timeout
while it is changed in flow_offload_fixup_ct.



^ permalink raw reply

* Re: [net-next PATCH V2] octeontx2-pf: Add support for adaptive interrupt coalescing
From: Jakub Kicinski @ 2022-05-16 18:22 UTC (permalink / raw)
  To: Suman Ghosh
  Cc: davem, edumazet, pabeni, sgoutham, sbhatta, gakula, Sunil.Goutham,
	hkelam, colin.king, netdev
In-Reply-To: <20220516044614.731395-1-sumang@marvell.com>

On Mon, 16 May 2022 10:16:14 +0530 Suman Ghosh wrote:
> Added support for adaptive IRQ coalescing. It uses net_dim
> algorithm to find the suitable delay/IRQ count based on the
> current packet rate.
> 
> Signed-off-by: Suman Ghosh <sumang@marvell.com>
> Reviewed-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>

Can you share uname -r of the kernel you're testing this on?

^ permalink raw reply

* Re: [PATCH net-next 4/4] net: call skb_defer_free_flush() before each napi_poll()
From: Jakub Kicinski @ 2022-05-16 18:21 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S . Miller, Paolo Abeni, netdev, Eric Dumazet
In-Reply-To: <20220516042456.3014395-5-eric.dumazet@gmail.com>

On Sun, 15 May 2022 21:24:56 -0700 Eric Dumazet wrote:
> -end:
> -	skb_defer_free_flush(sd);
> +end:;

Sorry for the nit pick but can I remove this and just return like we
did before f3412b3879b4? Is there a reason such "label:;}" is good?

^ permalink raw reply

* Re: [PATCH iproute2] iplink: remove GSO_MAX_SIZE definition
From: patchwork-bot+netdevbpf @ 2022-05-16 18:20 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: dsahern, stephen, netdev, edumazet
In-Reply-To: <20220516153457.3086137-1-eric.dumazet@gmail.com>

Hello:

This patch was applied to iproute2/iproute2.git (main)
by Stephen Hemminger <stephen@networkplumber.org>:

On Mon, 16 May 2022 08:34:57 -0700 you wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> David removed the check using GSO_MAX_SIZE
> in commit f1d18e2e6ec5 ("Update kernel headers").
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> 
> [...]

Here is the summary with links:
  - [iproute2] iplink: remove GSO_MAX_SIZE definition
    https://git.kernel.org/pub/scm/network/iproute2/iproute2.git/commit/?id=6b6979b9d443

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH v2] dt-bindings: net: orion-mdio: Convert to JSON schema
From: Rob Herring @ 2022-05-16 18:16 UTC (permalink / raw)
  To: Chris Packham
  Cc: davem, edumazet, kuba, pabeni, krzysztof.kozlowski+dt, andrew,
	netdev, devicetree, linux-kernel
In-Reply-To: <20220505210621.3637268-1-chris.packham@alliedtelesis.co.nz>

On Fri, May 06, 2022 at 09:06:20AM +1200, Chris Packham wrote:
> Convert the marvell,orion-mdio binding to JSON schema.
> 
> Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
> ---
> 
> Notes:
>     This does throw up the following dtbs_check warnings for turris-mox:
>     
>     arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dtb: mdio@32004: switch0@10:reg: [[16], [0]] is too long
>             From schema: Documentation/devicetree/bindings/net/marvell,orion-mdio.yaml
>     arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dtb: mdio@32004: switch0@2:reg: [[2], [0]] is too long
>             From schema: Documentation/devicetree/bindings/net/marvell,orion-mdio.yaml
>     arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dtb: mdio@32004: switch1@11:reg: [[17], [0]] is too long
>             From schema: Documentation/devicetree/bindings/net/marvell,orion-mdio.yaml
>     arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dtb: mdio@32004: switch1@2:reg: [[2], [0]] is too long
>             From schema: Documentation/devicetree/bindings/net/marvell,orion-mdio.yaml
>     arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dtb: mdio@32004: switch2@12:reg: [[18], [0]] is too long
>             From schema: Documentation/devicetree/bindings/net/marvell,orion-mdio.yaml
>     arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dtb: mdio@32004: switch2@2:reg: [[2], [0]] is too long
>             From schema: Documentation/devicetree/bindings/net/marvell,orion-mdio.yaml
>     
>     I think they're all genuine but I'm hesitant to leap in and fix them
>     without being able to test them.
>     
>     I also need to set unevaluatedProperties: true to cater for the L2
>     switch on turris-mox (and probably others). That might be better tackled
>     in the core mdio.yaml schema but I wasn't planning on touching that.
>     
>     Changes in v2:
>     - Add Andrew as maintainer (thanks for volunteering)
> 
>  .../bindings/net/marvell,orion-mdio.yaml      | 60 +++++++++++++++++++
>  .../bindings/net/marvell-orion-mdio.txt       | 54 -----------------
>  2 files changed, 60 insertions(+), 54 deletions(-)
>  create mode 100644 Documentation/devicetree/bindings/net/marvell,orion-mdio.yaml
>  delete mode 100644 Documentation/devicetree/bindings/net/marvell-orion-mdio.txt
> 
> diff --git a/Documentation/devicetree/bindings/net/marvell,orion-mdio.yaml b/Documentation/devicetree/bindings/net/marvell,orion-mdio.yaml
> new file mode 100644
> index 000000000000..fe3a3412f093
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/marvell,orion-mdio.yaml
> @@ -0,0 +1,60 @@
> +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> +%YAML 1.2
> +---
> +$id: http://devicetree.org/schemas/net/marvell,orion-mdio.yaml#
> +$schema: http://devicetree.org/meta-schemas/core.yaml#
> +
> +title: Marvell MDIO Ethernet Controller interface
> +
> +maintainers:
> +  - Andrew Lunn <andrew@lunn.ch>
> +
> +description: |
> +  The Ethernet controllers of the Marvel Kirkwood, Dove, Orion5x, MV78xx0,
> +  Armada 370, Armada XP, Armada 7k and Armada 8k have an identical unit that
> +  provides an interface with the MDIO bus. Additionally, Armada 7k and Armada
> +  8k has a second unit which provides an interface with the xMDIO bus. This
> +  driver handles these interfaces.
> +
> +allOf:
> +  - $ref: "mdio.yaml#"
> +
> +properties:
> +  compatible:
> +    enum:
> +      - marvell,orion-mdio
> +      - marvell,xmdio
> +
> +  reg:
> +    maxItems: 1
> +
> +  interrupts:
> +    maxItems: 1
> +
> +  clocks:
> +    minItems: 1
> +    maxItems: 4

Really this should be better defined, but the original was not.

> +
> +required:
> +  - compatible
> +  - reg
> +
> +unevaluatedProperties: true

This must be false.

> +
> +examples:
> +  - |
> +    mdio@d0072004 {
> +      compatible = "marvell,orion-mdio";
> +      reg = <0xd0072004 0x4>;
> +      #address-cells = <1>;
> +      #size-cells = <0>;
> +      interrupts = <30>;
> +
> +      phy0: ethernet-phy@0 {
> +        reg = <0>;
> +      };
> +
> +      phy1: ethernet-phy@1 {
> +        reg = <1>;
> +      };
> +    };
> diff --git a/Documentation/devicetree/bindings/net/marvell-orion-mdio.txt b/Documentation/devicetree/bindings/net/marvell-orion-mdio.txt
> deleted file mode 100644
> index 3f3cfc1d8d4d..000000000000
> --- a/Documentation/devicetree/bindings/net/marvell-orion-mdio.txt
> +++ /dev/null
> @@ -1,54 +0,0 @@
> -* Marvell MDIO Ethernet Controller interface
> -
> -The Ethernet controllers of the Marvel Kirkwood, Dove, Orion5x,
> -MV78xx0, Armada 370, Armada XP, Armada 7k and Armada 8k have an
> -identical unit that provides an interface with the MDIO bus.
> -Additionally, Armada 7k and Armada 8k has a second unit which
> -provides an interface with the xMDIO bus. This driver handles
> -these interfaces.
> -
> -Required properties:
> -- compatible: "marvell,orion-mdio" or "marvell,xmdio"
> -- reg: address and length of the MDIO registers.  When an interrupt is
> -  not present, the length is the size of the SMI register (4 bytes)
> -  otherwise it must be 0x84 bytes to cover the interrupt control
> -  registers.
> -
> -Optional properties:
> -- interrupts: interrupt line number for the SMI error/done interrupt
> -- clocks: phandle for up to four required clocks for the MDIO instance
> -
> -The child nodes of the MDIO driver are the individual PHY devices
> -connected to this MDIO bus. They must have a "reg" property given the
> -PHY address on the MDIO bus.
> -
> -Example at the SoC level without an interrupt property:
> -
> -mdio {
> -	#address-cells = <1>;
> -	#size-cells = <0>;
> -	compatible = "marvell,orion-mdio";
> -	reg = <0xd0072004 0x4>;
> -};
> -
> -Example with an interrupt property:
> -
> -mdio {
> -	#address-cells = <1>;
> -	#size-cells = <0>;
> -	compatible = "marvell,orion-mdio";
> -	reg = <0xd0072004 0x84>;
> -	interrupts = <30>;
> -};
> -
> -And at the board level:
> -
> -mdio {
> -	phy0: ethernet-phy@0 {
> -		reg = <0>;
> -	};
> -
> -	phy1: ethernet-phy@1 {
> -		reg = <1>;
> -	};
> -}
> -- 
> 2.36.0
> 
> 

^ permalink raw reply

* Re: [PATCH net-next 1/4] net: fix possible race in skb_attempt_defer_free()
From: Jakub Kicinski @ 2022-05-16 18:15 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S . Miller, Paolo Abeni, netdev, Eric Dumazet
In-Reply-To: <20220516042456.3014395-2-eric.dumazet@gmail.com>

On Sun, 15 May 2022 21:24:53 -0700 Eric Dumazet wrote:
> A cpu can observe sd->defer_count reaching 128,
> and call smp_call_function_single_async()
> 
> Problem is that the remote CPU can clear sd->defer_count
> before the IPI is run/acknowledged.
> 
> Other cpus can queue more packets and also decide
> to call smp_call_function_single_async() while the pending
> IPI was not yet delivered.
> 
> This is a common issue with smp_call_function_single_async().
> Callers must ensure correct synchronization and serialization.
> 
> I triggered this issue while experimenting smaller threshold.
> Performing the call to smp_call_function_single_async()
> under sd->defer_lock protection did not solve the problem.
> 
> Commit 5a18ceca6350 ("smp: Allow smp_call_function_single_async()
> to insert locked csd") replaced an informative WARN_ON_ONCE()
> with a return of -EBUSY, which is often ignored.
> Test of CSD_FLAG_LOCK presence is racy anyway.

If I'm reading this right this is useful for backports but in net-next
it really is a noop? The -EBUSY would be perfectly safe to ignore?
Just checking.

^ permalink raw reply

* Re: [PATCH v2] dt-bindings: net: orion-mdio: Convert to JSON schema
From: Rob Herring @ 2022-05-16 18:14 UTC (permalink / raw)
  To: Andrew Lunn, Jakub Kicinski, Chris Packham
  Cc: davem, edumazet, pabeni, krzysztof.kozlowski+dt, netdev,
	devicetree, linux-kernel
In-Reply-To: <Ynm0tm7/05ye9z6v@lunn.ch>

On Tue, May 10, 2022 at 02:41:26AM +0200, Andrew Lunn wrote:
> On Mon, May 09, 2022 at 05:28:14PM -0700, Jakub Kicinski wrote:
> > On Fri,  6 May 2022 09:06:20 +1200 Chris Packham wrote:
> > > Subject: [PATCH v2] dt-bindings: net: orion-mdio: Convert to JSON schema
> > 
> > JSON or YAML?

I actually prefer 'DT schema format' which implicitly means json-schema 
vocabulary in YAML formatted files (among other constraints we place on 
them).

> 
> JAML is a superset of JSON. So it is not completely wrong. I've no
> idea if this particular binding sticks to the subset which is JSON.

I think you just invented a new term for 'JSON compatible YAML subset'.

Rob

^ permalink raw reply

* Re: [PATCH net v2] netfilter: nf_flow_table: fix teardown flow timeout
From: Pablo Neira Ayuso @ 2022-05-16 17:54 UTC (permalink / raw)
  To: Sven Auhagen
  Cc: Oz Shlomo, Felix Fietkau, netdev, netfilter-devel,
	Florian Westphal, Paul Blakey
In-Reply-To: <20220516121700.ug2fwqyqgsd3rmsz@SvensMacbookPro.hq.voleatech.com>

On Mon, May 16, 2022 at 02:17:00PM +0200, Sven Auhagen wrote:
> On Mon, May 16, 2022 at 02:06:28PM +0200, Pablo Neira Ayuso wrote:
> > On Mon, May 16, 2022 at 01:37:40PM +0200, Pablo Neira Ayuso wrote:
> > > On Mon, May 16, 2022 at 01:18:17PM +0200, Sven Auhagen wrote:
> > > > On Mon, May 16, 2022 at 12:56:38PM +0200, Pablo Neira Ayuso wrote:
> > > > > On Thu, May 12, 2022 at 09:28:03PM +0300, Oz Shlomo wrote:
> > > > > > Connections leaving the established state (due to RST / FIN TCP packets)
> > > > > > set the flow table teardown flag. The packet path continues to set lower
> > > > > > timeout value as per the new TCP state but the offload flag remains set.
> > > > > >
> > > > > > Hence, the conntrack garbage collector may race to undo the timeout
> > > > > > adjustment of the packet path, leaving the conntrack entry in place with
> > > > > > the internal offload timeout (one day).
> > > > > >
> > > > > > Avoid ct gc timeout overwrite by flagging teared down flowtable
> > > > > > connections.
> > > > > >
> > > > > > On the nftables side we only need to allow established TCP connections to
> > > > > > create a flow offload entry. Since we can not guaruantee that
> > > > > > flow_offload_teardown is called by a TCP FIN packet we also need to make
> > > > > > sure that flow_offload_fixup_ct is also called in flow_offload_del
> > > > > > and only fixes up established TCP connections.
> > > > > [...]
> > > > > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > > > > > index 0164e5f522e8..324fdb62c08b 100644
> > > > > > --- a/net/netfilter/nf_conntrack_core.c
> > > > > > +++ b/net/netfilter/nf_conntrack_core.c
> > > > > > @@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
> > > > > >  			tmp = nf_ct_tuplehash_to_ctrack(h);
> > > > > >  
> > > > > >  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> > > > > > -				nf_ct_offload_timeout(tmp);
> > > > > 
> > > > > Hm, it is the trick to avoid checking for IPS_OFFLOAD from the packet
> > > > > path that triggers the race, ie. nf_ct_is_expired()
> > > > > 
> > > > > The flowtable ct fixup races with conntrack gc collector.
> > > > > 
> > > > > Clearing IPS_OFFLOAD might result in offloading the entry again for
> > > > > the closing packets.
> > > > > 
> > > > > Probably clear IPS_OFFLOAD from teardown, and skip offload if flow is
> > > > > in a TCP state that represent closure?
> > > >
> > > > >   		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > > > >   			goto out;
> > > > > 
> > > > > this is already the intention in the existing code.
> > > > > 
> > > > > If this does work, could you keep IPS_OFFLOAD_TEARDOWN_BIT internal,
> > > > > ie. no in uapi? Define it at include/net/netfilter/nf_conntrack.h and
> > > > > add a comment regarding this to avoid an overlap in the future.
> > > > > 
> > > > > > +				if (!test_bit(IPS_OFFLOAD_TEARDOWN_BIT, &tmp->status))
> > > > > > +					nf_ct_offload_timeout(tmp);
> > > > > >  				continue;
> > > > > >  			}
> > > > > >  
> > > > > > diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
> > > > > > index 3db256da919b..aaed1a244013 100644
> > > > > > --- a/net/netfilter/nf_flow_table_core.c
> > > > > > +++ b/net/netfilter/nf_flow_table_core.c
> > > > > > @@ -177,14 +177,8 @@ int flow_offload_route_init(struct flow_offload *flow,
> > > > > >  }
> > > > > >  EXPORT_SYMBOL_GPL(flow_offload_route_init);
> > > > > >  
> > > > > > -static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
> > > > > > -{
> > > > > > -	tcp->state = TCP_CONNTRACK_ESTABLISHED;
> > > > > > -	tcp->seen[0].td_maxwin = 0;
> > > > > > -	tcp->seen[1].td_maxwin = 0;
> > > > > > -}
> > > > > >  
> > > > > > -static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > > > > > +static void flow_offload_fixup_ct(struct nf_conn *ct)
> > > > > >  {
> > > > > >  	struct net *net = nf_ct_net(ct);
> > > > > >  	int l4num = nf_ct_protonum(ct);
> > > > > > @@ -192,8 +186,12 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > > > > >  
> > > > > >  	if (l4num == IPPROTO_TCP) {
> > > > > >  		struct nf_tcp_net *tn = nf_tcp_pernet(net);
> > > > > > +		struct ip_ct_tcp *tcp = &ct->proto.tcp;
> > > > > > +
> > > > > > +		tcp->seen[0].td_maxwin = 0;
> > > > > > +		tcp->seen[1].td_maxwin = 0;
> > > > > >  
> > > > > > -		timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
> > > > > > +		timeout = tn->timeouts[ct->proto.tcp.state];
> > > > > >  		timeout -= tn->offload_timeout;
> > > > > >  	} else if (l4num == IPPROTO_UDP) {
> > > > > >  		struct nf_udp_net *tn = nf_udp_pernet(net);
> > > > > > @@ -211,18 +209,6 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
> > > > > >  		WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
> > > > > >  }
> > > > > >  
> > > > > > -static void flow_offload_fixup_ct_state(struct nf_conn *ct)
> > > > > > -{
> > > > > > -	if (nf_ct_protonum(ct) == IPPROTO_TCP)
> > > > > > -		flow_offload_fixup_tcp(&ct->proto.tcp);
> > > > > > -}
> > > > > > -
> > > > > > -static void flow_offload_fixup_ct(struct nf_conn *ct)
> > > > > > -{
> > > > > > -	flow_offload_fixup_ct_state(ct);
> > > > > > -	flow_offload_fixup_ct_timeout(ct);
> > > > > > -}
> > > > > > -
> > > > > >  static void flow_offload_route_release(struct flow_offload *flow)
> > > > > >  {
> > > > > >  	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
> > > > > > @@ -353,6 +339,10 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
> > > > > >  static void flow_offload_del(struct nf_flowtable *flow_table,
> > > > > >  			     struct flow_offload *flow)
> > > > > >  {
> > > > > > +	struct nf_conn *ct = flow->ct;
> > > > > > +
> > > > > > +	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
> > > > > > +
> > > > > >  	rhashtable_remove_fast(&flow_table->rhashtable,
> > > > > >  			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
> > > > > >  			       nf_flow_offload_rhash_params);
> > > > > > @@ -360,12 +350,11 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
> > > > > >  			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
> > > > > >  			       nf_flow_offload_rhash_params);
> > > > > >  
> > > > > > -	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
> > > > > > -
> > > > > >  	if (nf_flow_has_expired(flow))
> > > > > > -		flow_offload_fixup_ct(flow->ct);
> > > > > > -	else
> > > > > > -		flow_offload_fixup_ct_timeout(flow->ct);
> > > > > > +		flow_offload_fixup_ct(ct);
> > > > > 
> > > > > Very unlikely, but race might still happen between fixup and
> > > > > clear IPS_OFFLOAD_BIT with gc below?
> > > > > 
> > > > > Without checking from the packet path, the conntrack gc might race to
> > > > > refresh the timeout, I don't see a 100% race free solution.
> > > > > 
> > > > > Probably update the nf_ct_offload_timeout to a shorter value than a
> > > > > day would mitigate this issue too.
> > > > 
> > > > This section of the code is now protected by IPS_OFFLOAD_TEARDOWN_BIT
> > > > which will prevent the update via nf_ct_offload_timeout.
> > > > We set it at the beginning of flow_offload_del and flow_offload_teardown.
> > > > 
> > > > Since flow_offload_teardown is only called on TCP packets
> > > > we also need to set it at flow_offload_del to prevent the race.
> > > > 
> > > > This should prevent the race at this point.
> > > 
> > > OK.
> > > 
> > > > > > +	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
> > > > > > +	clear_bit(IPS_OFFLOAD_TEARDOWN_BIT, &ct->status);
> > > > > >  
> > > > > >  	flow_offload_free(flow);
> > > > > >  }
> > > > > > @@ -373,8 +362,9 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
> > > > > >  void flow_offload_teardown(struct flow_offload *flow)
> > > > > >  {
> > > > > >  	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
> > > > > > +	set_bit(IPS_OFFLOAD_TEARDOWN_BIT, &flow->ct->status);
> > > > > >  
> > > > > > -	flow_offload_fixup_ct_state(flow->ct);
> > > > > > +	flow_offload_fixup_ct(flow->ct);
> > > > > >  }
> > > > > >  EXPORT_SYMBOL_GPL(flow_offload_teardown);
> > > > > >  
> > > > > > diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
> > > > > > index 900d48c810a1..9cc3ea08eb3a 100644
> > > > > > --- a/net/netfilter/nft_flow_offload.c
> > > > > > +++ b/net/netfilter/nft_flow_offload.c
> > > > > > @@ -295,6 +295,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
> > > > > >  					  sizeof(_tcph), &_tcph);
> > > > > >  		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > > > > >  			goto out;
> > > > > > +		if (unlikely(!nf_conntrack_tcp_established(ct)))
> > > > > > +			goto out;
> > > > > 
> > > > > This chunk is not required, from ruleset users can do
> > > > > 
> > > > >         ... ct status assured ...
> > > > > 
> > > > > instead.
> > > > 
> > > > Maybe this should be mentioned in the manual or wiki if
> > > > it is not necessary in the flow offload code.
> > > 
> > > Yes, documentation and wiki can be updated.
> > > 
> > > Users might want to offload the flow at a later stage in the TCP
> > > connection.
> > 
> > Well, actually there is not later stage than established, anything
> > after established are TCP teardown states.
> > 
> > What's the issue with allowing to offload from SYN_RECV state?
> 
> There were multiple problem in general with the code.
> flow_offload_fixup_tcp always moves a TCP connection
> to established even if it is in FIN or CLOSE.
> 
> The flowoffload_del function was always setting the TCP timeout
> to ESTABLISHED timeout even when the state was in CLOSE and therefore
> creating a very long lasting dead state.
> 
> Since we might miss or bump packets to slow path, we do not know
> what will happen there when we are still in SYN_RECV.

Right.

> We will have a better knowledge of the TCP state when we are in 
> established first and we know that we are either still in it or
> we have moved past it to a closing state.

It makes sense to restrict this to TCP established only.

Oz and Paul already do this for tc-ct.

Thanks for explaining.

^ permalink raw reply

* Re: [PATCH net v2] netfilter: nf_flow_table: fix teardown flow timeout
From: Pablo Neira Ayuso @ 2022-05-16 17:50 UTC (permalink / raw)
  To: Sven Auhagen
  Cc: Oz Shlomo, Felix Fietkau, netdev, netfilter-devel,
	Florian Westphal, Paul Blakey
In-Reply-To: <20220516130213.bedrzjmvgvdzuzdc@SvensMacbookPro.hq.voleatech.com>

[-- Attachment #1: Type: text/plain, Size: 5071 bytes --]

On Mon, May 16, 2022 at 03:02:13PM +0200, Sven Auhagen wrote:
> On Mon, May 16, 2022 at 02:43:06PM +0200, Pablo Neira Ayuso wrote:
> > On Mon, May 16, 2022 at 02:23:00PM +0200, Sven Auhagen wrote:
> > > On Mon, May 16, 2022 at 02:13:03PM +0200, Pablo Neira Ayuso wrote:
> > > > On Mon, May 16, 2022 at 12:56:41PM +0200, Pablo Neira Ayuso wrote:
> > > > > On Thu, May 12, 2022 at 09:28:03PM +0300, Oz Shlomo wrote:
[...]
> > > > > [...]
> > > > > > diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> > > > > > index 0164e5f522e8..324fdb62c08b 100644
> > > > > > --- a/net/netfilter/nf_conntrack_core.c
> > > > > > +++ b/net/netfilter/nf_conntrack_core.c
> > > > > > @@ -1477,7 +1477,8 @@ static void gc_worker(struct work_struct *work)
> > > > > >  			tmp = nf_ct_tuplehash_to_ctrack(h);
> > > > > >  
> > > > > >  			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
> > > > > > -				nf_ct_offload_timeout(tmp);
> > > > > 
> > > > > Hm, it is the trick to avoid checking for IPS_OFFLOAD from the packet
> > > > > path that triggers the race, ie. nf_ct_is_expired()
> > > > > 
> > > > > The flowtable ct fixup races with conntrack gc collector.
> > > > > 
> > > > > Clearing IPS_OFFLOAD might result in offloading the entry again for
> > > > > the closing packets.
> > > > > 
> > > > > Probably clear IPS_OFFLOAD from teardown, and skip offload if flow is
> > > > > in a TCP state that represent closure?
> > > > > 
> > > > >   		if (unlikely(!tcph || tcph->fin || tcph->rst))
> > > > >   			goto out;
> > > > > 
> > > > > this is already the intention in the existing code.
> > > > 
> > > > I'm attaching an incomplete sketch patch. My goal is to avoid the
> > > > extra IPS_ bit.
> > > 
> > > You might create a race with ct gc that will remove the ct
> > > if it is in close or end of close and before flow offload teardown is running
> > > so flow offload teardown might access memory that was freed.
> > 
> > flow object holds a reference to the ct object until it is released,
> > no use-after-free can happen.
> > 
> 
> Also if nf_ct_delete is called before flowtable delete?
> Can you let me know why?

nf_ct_delete() removes the conntrack object from lists and it
decrements the reference counter by one.

flow_offload_free() also calls nf_ct_put(). flow_offload_alloc() bumps
the reference count on the conntrack object before creating the flow.

> > > It is not a very likely scenario but never the less it might happen now
> > > since the IPS_OFFLOAD_BIT is not set and the state might just time out.
> > > 
> > > If someone sets a very small TCP CLOSE timeout it gets more likely.
> > > 
> > > So Oz and myself were debatting about three possible cases/problems:
> > > 
> > > 1. ct gc sets timeout even though the state is in CLOSE/FIN because the
> > > IPS_OFFLOAD is still set but the flow is in teardown
> > > 2. ct gc removes the ct because the IPS_OFFLOAD is not set and
> > > the CLOSE timeout is reached before the flow offload del
> > 
> > OK.
> > 
> > > 3. tcp ct is always set to ESTABLISHED with a very long timeout
> > > in flow offload teardown/delete even though the state is already
> > > CLOSED.
> > >
> > > Also as a remark we can not assume that the FIN or RST packet is hitting
> > > flow table teardown as the packet might get bumped to the slow path in
> > > nftables.
> > 
> > I assume this remark is related to 3.?
> 
> Yes, exactly.
> 
> > if IPS_OFFLOAD is unset, then conntrack would update the state
> > according to this FIN or RST.
> 
> It will move to a different TCP state anyways only the ct state
> will be at IPS_OFFLOAD_BIT and prevent it from beeing garbage collected.
> The timeout will be bumped back up as long as IPS_OFFLOAD_BIT is set
> even though TCP might already be CLOSED.

If teardown fixes the ct state and timeout to established, and IPS_OFFLOAD is
unset, then the packet is passed up in a consistent state.

I made a patch, it is based on yours, it's attached:

- If flow timeout expires or rst/fin is seen, ct state and timeout is
  fixed up (to established state) and IPS_OFFLOAD is unset.

- If rst/fin packet is seen, ct state and timeout is fixed up (to
  established state) and IPS_OFFLOAD is unset. The packet continues
  its travel up to the classic path, so conntrack triggers the
  transition from established to one of the close states.

For the case 1., IPS_OFFLOAD is not set anymore, so conntrack gc
cannot race to reset the ct timeout anymore.

For the case 2., if gc conntrack ever removes the ct entry, then the
IPS_DYING bit is set, which implicitly triggers the teardown state
from the flowtable gc. The flowtable still holds a reference to the
ct object, so no UAF can happen.

For the case 3. the conntrack is set to ESTABLISHED with a long
timeout, yes. This is to deal with the two possible cases:

a) flowtable timeout expired, so conntrack recovers control on the
   flow.
b) tcp rst/fin will take back the packet to slow path. The ct has been
   fixed up to established state so it will trasition to one of the
   close states.

Am I missing anything?

[-- Attachment #2: fix-pickup.patch --]
[-- Type: text/x-diff, Size: 3655 bytes --]

commit f11b613b1f63a70e4dacae7ff73ce1895c96c6d1
Author: Pablo Neira Ayuso <pablo@netfilter.org>
Date:   Mon May 16 18:44:19 2022 +0200

    x

diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 20b4a14e5d4e..85796082a4ac 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -184,7 +184,7 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
 	tcp->seen[1].td_maxwin = 0;
 }
 
-static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
+static void flow_offload_fixup_ct(struct nf_conn *ct)
 {
 	struct net *net = nf_ct_net(ct);
 	int l4num = nf_ct_protonum(ct);
@@ -193,6 +193,8 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
 	if (l4num == IPPROTO_TCP) {
 		struct nf_tcp_net *tn = nf_tcp_pernet(net);
 
+		flow_offload_fixup_tcp(&ct->proto.tcp);
+
 		timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
 		timeout -= tn->offload_timeout;
 	} else if (l4num == IPPROTO_UDP) {
@@ -211,18 +213,6 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
 		WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
 }
 
-static void flow_offload_fixup_ct_state(struct nf_conn *ct)
-{
-	if (nf_ct_protonum(ct) == IPPROTO_TCP)
-		flow_offload_fixup_tcp(&ct->proto.tcp);
-}
-
-static void flow_offload_fixup_ct(struct nf_conn *ct)
-{
-	flow_offload_fixup_ct_state(ct);
-	flow_offload_fixup_ct_timeout(ct);
-}
-
 static void flow_offload_route_release(struct flow_offload *flow)
 {
 	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
@@ -361,14 +351,6 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
 	rhashtable_remove_fast(&flow_table->rhashtable,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
 			       nf_flow_offload_rhash_params);
-
-	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
-
-	if (nf_flow_has_expired(flow))
-		flow_offload_fixup_ct(flow->ct);
-	else
-		flow_offload_fixup_ct_timeout(flow->ct);
-
 	flow_offload_free(flow);
 }
 
@@ -376,7 +358,9 @@ void flow_offload_teardown(struct flow_offload *flow)
 {
 	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
 
-	flow_offload_fixup_ct_state(flow->ct);
+	flow_offload_fixup_ct(flow->ct);
+
+	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
 }
 EXPORT_SYMBOL_GPL(flow_offload_teardown);
 
@@ -466,7 +450,7 @@ static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
 	if (nf_flow_has_expired(flow) ||
 	    nf_ct_is_dying(flow->ct) ||
 	    nf_flow_has_stale_dst(flow))
-		set_bit(NF_FLOW_TEARDOWN, &flow->flags);
+		flow_offload_teardown(flow);
 
 	if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
 		if (test_bit(NF_FLOW_HW, &flow->flags)) {
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 187b8cb9a510..a22bb238926d 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -273,6 +273,12 @@ static bool nft_flow_offload_skip(struct sk_buff *skb, int family)
 	return false;
 }
 
+static bool flow_offload_teardown_state(const struct ip_ct_tcp *state)
+{
+	return state->state > TCP_CONNTRACK_ESTABLISHED &&
+	       state->state <= TCP_CONNTRACK_CLOSE;
+}
+
 static void nft_flow_offload_eval(const struct nft_expr *expr,
 				  struct nft_regs *regs,
 				  const struct nft_pktinfo *pkt)
@@ -298,7 +304,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
 	case IPPROTO_TCP:
 		tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt),
 					  sizeof(_tcph), &_tcph);
-		if (unlikely(!tcph || tcph->fin || tcph->rst))
+		if (unlikely(!tcph || tcph->fin || tcph->rst ||
+			     flow_offload_teardown_state(&ct->proto.tcp)))
 			goto out;
 		break;
 	case IPPROTO_UDP:

^ permalink raw reply related

* Re: [PATCH v5 04/15] landlock: helper functions refactoring
From: Konstantin Meskhidze @ 2022-05-16 17:43 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: willemdebruijn.kernel, linux-security-module, netdev,
	netfilter-devel, yusongping, anton.sirazetdinov
In-Reply-To: <ce1201e9-8493-8387-9df4-f0f8b75011c9@digikod.net>



5/16/2022 8:14 PM, Mickaël Salaün пишет:
> 
> On 16/05/2022 17:20, Konstantin Meskhidze wrote:
>> Unmask_layers(), init_layer_masks() and
>> get_handled_accesses() helper functions move to
>> ruleset.c and rule_type argument is added.
>> This modification supports implementing new rule
>> types into next landlock versions.
>>
>> Signed-off-by: Konstantin Meskhidze <konstantin.meskhidze@huawei.com>
>> ---
>>
>> Changes since v3:
>> * Splits commit.
>> * Refactoring landlock_unmask_layers functions.
>>
> 
> Please sort changes in antichronological order. It is easier to look at 
> the first lines to get the last changes.

  Ok. I will. Thanks
> 
>> Changes since v4:
>> * Refactoring init_layer_masks(), get_handled_accesses()
>> and unmask_layers() functions to support multiple rule types.
>> * Refactoring landlock_get_fs_access_mask() function with
>> LANDLOCK_MASK_ACCESS_FS mask. >
>> ---
>>   security/landlock/fs.c      | 158 ++++++++----------------------------
>>   security/landlock/ruleset.c | 152 +++++++++++++++++++++++++++++++---
>>   security/landlock/ruleset.h |  17 +++-
>>   3 files changed, 192 insertions(+), 135 deletions(-)
>>
>> diff --git a/security/landlock/fs.c b/security/landlock/fs.c
>> index 5de24d4dd74c..3506e182b23e 100644
>> --- a/security/landlock/fs.c
>> +++ b/security/landlock/fs.c
>> @@ -211,60 +211,6 @@ find_rule(const struct landlock_ruleset *const 
>> domain,
>>       return rule;
>>   }
>>
>> -/*
>> - * @layer_masks is read and may be updated according to the access 
>> request and
>> - * the matching rule.
>> - *
>> - * Returns true if the request is allowed (i.e. relevant layer masks 
>> for the
>> - * request are empty).
>> - */
>> -static inline bool
>> -unmask_layers(const struct landlock_rule *const rule,
>> -          const access_mask_t access_request,
>> -          layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS])
> 
> Moving these entire blocks of code make the review/diff impossible. Why 
> moving these helpers?

   Cause these helpers are going to be used both for filesystem and 
network. I moved them into ruleset.c/h
> 
>> -{
>> -    size_t layer_level;
>> -
>> -    if (!access_request || !layer_masks)
>> -        return true;
>> -    if (!rule)
>> -        return false;
>> -
>> -    /*
>> -     * An access is granted if, for each policy layer, at least one rule
>> -     * encountered on the pathwalk grants the requested access,
>> -     * regardless of its position in the layer stack.  We must then 
>> check
>> -     * the remaining layers for each inode, from the first added 
>> layer to
>> -     * the last one.  When there is multiple requested accesses, for 
>> each
>> -     * policy layer, the full set of requested accesses may not be 
>> granted
>> -     * by only one rule, but by the union (binary OR) of multiple rules.
>> -     * E.g. /a/b <execute> + /a <read> => /a/b <execute + read>
>> -     */
>> -    for (layer_level = 0; layer_level < rule->num_layers; 
>> layer_level++) {
>> -        const struct landlock_layer *const layer =
>> -            &rule->layers[layer_level];
>> -        const layer_mask_t layer_bit = BIT_ULL(layer->level - 1);
>> -        const unsigned long access_req = access_request;
>> -        unsigned long access_bit;
>> -        bool is_empty;
>> -
>> -        /*
>> -         * Records in @layer_masks which layer grants access to each
>> -         * requested access.
>> -         */
>> -        is_empty = true;
>> -        for_each_set_bit(access_bit, &access_req,
>> -                 ARRAY_SIZE(*layer_masks)) {
>> -            if (layer->access & BIT_ULL(access_bit))
>> -                (*layer_masks)[access_bit] &= ~layer_bit;
>> -            is_empty = is_empty && !(*layer_masks)[access_bit];
>> -        }
>> -        if (is_empty)
>> -            return true;
>> -    }
>> -    return false;
>> -}
>> -
>>   /*
>>    * Allows access to pseudo filesystems that will never be mountable 
>> (e.g.
>>    * sockfs, pipefs), but can still be reachable through
>> @@ -277,59 +223,6 @@ static inline bool is_nouser_or_private(const 
>> struct dentry *dentry)
>>           unlikely(IS_PRIVATE(d_backing_inode(dentry))));
>>   }
>>
>> -static inline access_mask_t
>> -get_handled_accesses(const struct landlock_ruleset *const domain)
>> -{
>> -    access_mask_t access_dom = 0;
>> -    unsigned long access_bit;
>> -
>> -    for (access_bit = 0; access_bit < LANDLOCK_NUM_ACCESS_FS;
>> -         access_bit++) {
>> -        size_t layer_level;
>> -
>> -        for (layer_level = 0; layer_level < domain->num_layers;
>> -                layer_level++) {
>> -            if (landlock_get_fs_access_mask(domain, layer_level) &
>> -                    BIT_ULL(access_bit)) {
>> -                access_dom |= BIT_ULL(access_bit);
>> -                break;
>> -            }
>> -        }
>> -    }
>> -    return access_dom;
>> -}
>> -
>> -static inline access_mask_t
>> -init_layer_masks(const struct landlock_ruleset *const domain,
>> -         const access_mask_t access_request,
>> -         layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS])
>> -{
>> -    access_mask_t handled_accesses = 0;
>> -    size_t layer_level;
>> -
>> -    memset(layer_masks, 0, sizeof(*layer_masks));
>> -    /* An empty access request can happen because of O_WRONLY | 
>> O_RDWR. */
>> -    if (!access_request)
>> -        return 0;
>> -
>> -    /* Saves all handled accesses per layer. */
>> -    for (layer_level = 0; layer_level < domain->num_layers; 
>> layer_level++) {
>> -        const unsigned long access_req = access_request;
>> -        unsigned long access_bit;
>> -
>> -        for_each_set_bit(access_bit, &access_req,
>> -                ARRAY_SIZE(*layer_masks)) {
>> -            if (landlock_get_fs_access_mask(domain, layer_level) &
>> -                    BIT_ULL(access_bit)) {
>> -                (*layer_masks)[access_bit] |=
>> -                    BIT_ULL(layer_level);
>> -                handled_accesses |= BIT_ULL(access_bit);
>> -            }
>> -        }
>> -    }
>> -    return handled_accesses;
>> -}
>> -
>>   /*
>>    * Check that a destination file hierarchy has more restrictions 
>> than a source
>>    * file hierarchy.  This is only used for link and rename actions.
>> @@ -506,7 +399,8 @@ static int check_access_path_dual(
>>            * a superset of the meaningful requested accesses).
>>            */
>>           access_masked_parent1 = access_masked_parent2 =
>> -            get_handled_accesses(domain);
>> +            get_handled_accesses(domain, LANDLOCK_RULE_PATH_BENEATH,
>> +                         LANDLOCK_NUM_ACCESS_FS);
>>           is_dom_check = true;
>>       } else {
>>           if (WARN_ON_ONCE(dentry_child1 || dentry_child2))
>> @@ -519,17 +413,25 @@ static int check_access_path_dual(
>>
>>       if (unlikely(dentry_child1)) {
>>           unmask_layers(find_rule(domain, dentry_child1),
>> -                  init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS,
>> -                           &_layer_masks_child1),
>> -                  &_layer_masks_child1);
>> +                init_layer_masks(domain,
>> +                    LANDLOCK_MASK_ACCESS_FS,
>> +                    &_layer_masks_child1,
>> +                    sizeof(_layer_masks_child1),
>> +                    LANDLOCK_RULE_PATH_BENEATH),
>> +                &_layer_masks_child1,
>> +                ARRAY_SIZE(_layer_masks_child1));
> 
> There is a lot of formatting diff and that makes the review difficult. 
> Please format everything with clang-format-14.

   Ok. Do you have some tool that helps you with editing code with clang 
format?
> 
>>           layer_masks_child1 = &_layer_masks_child1;
>>           child1_is_directory = d_is_dir(dentry_child1);
>>       }
>>       if (unlikely(dentry_child2)) {
>>           unmask_layers(find_rule(domain, dentry_child2),
>> -                  init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS,
>> -                           &_layer_masks_child2),
>> -                  &_layer_masks_child2);
>> +                init_layer_masks(domain,
>> +                    LANDLOCK_MASK_ACCESS_FS,
>> +                    &_layer_masks_child2,
>> +                    sizeof(_layer_masks_child2),
>> +                    LANDLOCK_RULE_PATH_BENEATH),
>> +                &_layer_masks_child2,
>> +                ARRAY_SIZE(_layer_masks_child2));
>>           layer_masks_child2 = &_layer_masks_child2;
>>           child2_is_directory = d_is_dir(dentry_child2);
>>       }
>> @@ -582,14 +484,15 @@ static int check_access_path_dual(
>>
>>           rule = find_rule(domain, walker_path.dentry);
>>           allowed_parent1 = unmask_layers(rule, access_masked_parent1,
>> -                        layer_masks_parent1);
>> +                layer_masks_parent1,
>> +                ARRAY_SIZE(*layer_masks_parent1));
>>           allowed_parent2 = unmask_layers(rule, access_masked_parent2,
>> -                        layer_masks_parent2);
>> +                layer_masks_parent2,
>> +                ARRAY_SIZE(*layer_masks_parent2));
>>
>>           /* Stops when a rule from each layer grants access. */
>>           if (allowed_parent1 && allowed_parent2)
>>               break;
>> -
> 
> There is no place for such formatting/whitespace patches.
> 
   I missed that. scripts/checkpatch.pl did not show any problem here.
   I will fix it. Thanks.
> 
>>   jump_up:
>>           if (walker_path.dentry == walker_path.mnt->mnt_root) {
>>               if (follow_up(&walker_path)) {
>> @@ -645,7 +548,9 @@ static inline int check_access_path(const struct 
>> landlock_ruleset *const domain,
>>   {
>>       layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};
>>
>> -    access_request = init_layer_masks(domain, access_request, 
>> &layer_masks);
>> +    access_request = init_layer_masks(domain, access_request,
>> +            &layer_masks, sizeof(layer_masks),
>> +            LANDLOCK_RULE_PATH_BENEATH);
>>       return check_access_path_dual(domain, path, access_request,
>>                         &layer_masks, NULL, 0, NULL, NULL);
>>   }
>> @@ -729,7 +634,8 @@ static bool collect_domain_accesses(
>>           return true;
>>
>>       access_dom = init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS,
>> -                      layer_masks_dom);
>> +            layer_masks_dom, sizeof(*layer_masks_dom),
>> +            LANDLOCK_RULE_PATH_BENEATH);
>>
>>       dget(dir);
>>       while (true) {
>> @@ -737,7 +643,8 @@ static bool collect_domain_accesses(
>>
>>           /* Gets all layers allowing all domain accesses. */
>>           if (unmask_layers(find_rule(domain, dir), access_dom,
>> -                  layer_masks_dom)) {
>> +                    layer_masks_dom,
>> +                    ARRAY_SIZE(*layer_masks_dom))) {
>>               /*
>>                * Stops when all handled accesses are allowed by at
>>                * least one rule in each layer.
>> @@ -851,9 +758,10 @@ static int current_check_refer_path(struct dentry 
>> *const old_dentry,
>>            * The LANDLOCK_ACCESS_FS_REFER access right is not required
>>            * for same-directory referer (i.e. no reparenting).
>>            */
>> -        access_request_parent1 = init_layer_masks(
>> -            dom, access_request_parent1 | access_request_parent2,
>> -            &layer_masks_parent1);
>> +        access_request_parent1 = init_layer_masks(dom,
>> +                access_request_parent1 | access_request_parent2,
>> +                &layer_masks_parent1, sizeof(layer_masks_parent1),
>> +                LANDLOCK_RULE_PATH_BENEATH);
>>           return check_access_path_dual(dom, new_dir,
>>                             access_request_parent1,
>>                             &layer_masks_parent1, NULL, 0,
>> @@ -861,7 +769,9 @@ static int current_check_refer_path(struct dentry 
>> *const old_dentry,
>>       }
>>
>>       /* Backward compatibility: no reparenting support. */
>> -    if (!(get_handled_accesses(dom) & LANDLOCK_ACCESS_FS_REFER))
>> +    if (!(get_handled_accesses(dom, LANDLOCK_RULE_PATH_BENEATH,
>> +                   LANDLOCK_NUM_ACCESS_FS) &
>> +                        LANDLOCK_ACCESS_FS_REFER))
>>           return -EXDEV;
>>
>>       access_request_parent1 |= LANDLOCK_ACCESS_FS_REFER;
>> diff --git a/security/landlock/ruleset.c b/security/landlock/ruleset.c
>> index 4b4c9953bb32..c4ed783d655b 100644
>> --- a/security/landlock/ruleset.c
>> +++ b/security/landlock/ruleset.c
>> @@ -233,7 +233,8 @@ static int insert_rule(struct landlock_ruleset 
>> *const ruleset,
>>                              &(*layers)[0]);
>>               if (IS_ERR(new_rule))
>>                   return PTR_ERR(new_rule);
>> -            rb_replace_node(&this->node, &new_rule->node, 
>> &ruleset->root_inode);
>> +            rb_replace_node(&this->node, &new_rule->node,
>> +                    &ruleset->root_inode);
> 
> This is a pure formatting hunk. :/
> 
   Thats strange, cause in my editor I have normal aligment of arguments.
   Could please share clang-format-14 tab size and other format 	parameters?
> 
>>               free_rule(this, rule_type);
>>               break;
>>           }
>> @@ -246,7 +247,8 @@ static int insert_rule(struct landlock_ruleset 
>> *const ruleset,
>>           return -E2BIG;
>>       switch (rule_type) {
>>       case LANDLOCK_RULE_PATH_BENEATH:
>> -        new_rule = create_rule(object_ptr, 0, layers, num_layers, NULL);
>> +        new_rule = create_rule(object_ptr, 0, layers,
>> +                       num_layers, NULL);
>>           if (IS_ERR(new_rule))
>>               return PTR_ERR(new_rule);
>>           rb_link_node(&new_rule->node, parent_node, walker_node);
>> @@ -281,8 +283,8 @@ int landlock_insert_rule(struct landlock_ruleset 
>> *const ruleset,
>>       } };
>>
>>       build_check_layer();
>> -    return insert_rule(ruleset, object_ptr, object_data, rule_type, 
>> &layers,
>> -               ARRAY_SIZE(layers));
>> +    return insert_rule(ruleset, object_ptr, object_data, rule_type,
>> +               &layers, ARRAY_SIZE(layers));
>>   }
>>
>>   static inline void get_hierarchy(struct landlock_hierarchy *const 
>> hierarchy)
>> @@ -335,8 +337,9 @@ static int tree_merge(struct landlock_ruleset 
>> *const src,
>>
>>           switch (rule_type) {
>>           case LANDLOCK_RULE_PATH_BENEATH:
>> -            err = insert_rule(dst, walker_rule->object.ptr, 0, 
>> rule_type,
>> -                      &layers, ARRAY_SIZE(layers));
>> +            err = insert_rule(dst, walker_rule->object.ptr, 0,
>> +                      rule_type, &layers,
>> +                      ARRAY_SIZE(layers));
>>               break;
>>           }
>>           if (err)
>> @@ -433,9 +436,13 @@ static int inherit_ruleset(struct 
>> landlock_ruleset *const parent,
>>           err = -EINVAL;
>>           goto out_unlock;
>>       }
>> -    /* Copies the parent layer stack and leaves a space for the new 
>> layer. */
>> +    /*
>> +     * Copies the parent layer stack and leaves a space
>> +     * for the new layer.
>> +     */
>>       memcpy(child->access_masks, parent->access_masks,
>> -            flex_array_size(parent, access_masks, parent->num_layers));
>> +            flex_array_size(parent, access_masks,
>> +                    parent->num_layers));
>>
>>       if (WARN_ON_ONCE(!parent->hierarchy)) {
>>           err = -EINVAL;
>> @@ -455,8 +462,9 @@ static void free_ruleset(struct landlock_ruleset 
>> *const ruleset)
>>       struct landlock_rule *freeme, *next;
>>
>>       might_sleep();
>> -    rbtree_postorder_for_each_entry_safe(freeme, next, 
>> &ruleset->root_inode,
>> -            node)
>> +    rbtree_postorder_for_each_entry_safe(freeme, next,
>> +                         &ruleset->root_inode,
>> +                         node)
>>           free_rule(freeme, LANDLOCK_RULE_PATH_BENEATH);
>>       put_hierarchy(ruleset->hierarchy);
>>       kfree(ruleset);
>> @@ -577,3 +585,127 @@ const struct landlock_rule *landlock_find_rule(
>>       }
>>       return NULL;
>>   }
>> +
>> +access_mask_t get_handled_accesses(
>> +        const struct landlock_ruleset *const domain,
>> +        u16 rule_type, u16 num_access)
>> +{
>> +    access_mask_t access_dom = 0;
>> +    unsigned long access_bit;
>> +
>> +    switch (rule_type) {
>> +    case LANDLOCK_RULE_PATH_BENEATH:
>> +        for (access_bit = 0; access_bit < LANDLOCK_NUM_ACCESS_FS;
>> +            access_bit++) {
>> +            size_t layer_level;
>> +
>> +            for (layer_level = 0; layer_level < domain->num_layers;
>> +                    layer_level++) {
>> +                if (landlock_get_fs_access_mask(domain,
>> +                                layer_level) &
>> +                        BIT_ULL(access_bit)) {
>> +                    access_dom |= BIT_ULL(access_bit);
>> +                    break;
>> +                }
>> +            }
>> +        }
>> +        break;
>> +    default:
>> +        break;
>> +    }
>> +    return access_dom;
>> +}
>> +
>> +/*
>> + * @layer_masks is read and may be updated according to the access 
>> request and
>> + * the matching rule.
>> + *
>> + * Returns true if the request is allowed (i.e. relevant layer masks 
>> for the
>> + * request are empty).
>> + */
>> +bool unmask_layers(const struct landlock_rule *const rule,
>> +        const access_mask_t access_request,
>> +        layer_mask_t (*const layer_masks)[], size_t masks_array_size)
>> +{
>> +    size_t layer_level;
>> +
>> +    if (!access_request || !layer_masks)
>> +        return true;
>> +    if (!rule)
>> +        return false;
>> +
>> +    /*
>> +     * An access is granted if, for each policy layer, at least one rule
>> +     * encountered on the pathwalk grants the requested access,
>> +     * regardless of its position in the layer stack.  We must then 
>> check
>> +     * the remaining layers for each inode, from the first added 
>> layer to
>> +     * the last one.  When there is multiple requested accesses, for 
>> each
>> +     * policy layer, the full set of requested accesses may not be 
>> granted
>> +     * by only one rule, but by the union (binary OR) of multiple rules.
>> +     * E.g. /a/b <execute> + /a <read> => /a/b <execute + read>
>> +     */
>> +    for (layer_level = 0; layer_level < rule->num_layers; 
>> layer_level++) {
>> +        const struct landlock_layer *const layer =
>> +            &rule->layers[layer_level];
>> +        const layer_mask_t layer_bit = BIT_ULL(layer->level - 1);
>> +        const unsigned long access_req = access_request;
>> +        unsigned long access_bit;
>> +        bool is_empty;
>> +
>> +        /*
>> +         * Records in @layer_masks which layer grants access to each
>> +         * requested access.
>> +         */
>> +        is_empty = true;
>> +        for_each_set_bit(access_bit, &access_req, masks_array_size) {
>> +            if (layer->access & BIT_ULL(access_bit))
>> +                (*layer_masks)[access_bit] &= ~layer_bit;
>> +            is_empty = is_empty && !(*layer_masks)[access_bit];
>> +        }
>> +        if (is_empty)
>> +            return true;
>> +    }
>> +    return false;
>> +}
>> +
>> +access_mask_t init_layer_masks(const struct landlock_ruleset *const 
>> domain,
>> +                   const access_mask_t access_request,
>> +                   layer_mask_t (*const layer_masks)[],
>> +                   size_t masks_size,
>> +                   u16 rule_type)
>> +{
>> +    access_mask_t handled_accesses = 0;
>> +    size_t layer_level;
>> +
>> +    memset(layer_masks, 0, masks_size);
>> +
>> +    /* An empty access request can happen because of O_WRONLY | 
>> O_RDWR. */
>> +    if (!access_request)
>> +        return 0;
>> +
>> +    /* Saves all handled accesses per layer. */
>> +    for (layer_level = 0; layer_level < domain->num_layers;
>> +            layer_level++) {
>> +        const unsigned long access_req = access_request;
>> +        unsigned long access_bit;
>> +
>> +        switch (rule_type) {
>> +        case LANDLOCK_RULE_PATH_BENEATH:
>> +            for_each_set_bit(access_bit, &access_req,
>> +                    LANDLOCK_NUM_ACCESS_FS) {
>> +                if (landlock_get_fs_access_mask(domain,
>> +                                layer_level) &
>> +                        BIT_ULL(access_bit)) {
>> +                    (*layer_masks)[access_bit] |=
>> +                        BIT_ULL(layer_level);
>> +                    handled_accesses |=
>> +                               BIT_ULL(access_bit);
>> +                }
>> +            }
>> +            break;
>> +        default:
>> +            return 0;
>> +        }
>> +    }
>> +    return handled_accesses;
>> +}
>> diff --git a/security/landlock/ruleset.h b/security/landlock/ruleset.h
>> index 3066e5d7180c..f3cd890d0348 100644
>> --- a/security/landlock/ruleset.h
>> +++ b/security/landlock/ruleset.h
>> @@ -195,7 +195,22 @@ static inline u32 landlock_get_fs_access_mask(
>>                       const struct landlock_ruleset *ruleset,
>>                       u16 mask_level)
>>   {
>> -    return ruleset->access_masks[mask_level];
>> +    return (ruleset->access_masks[mask_level] & 
>> LANDLOCK_MASK_ACCESS_FS);
>>   }
>>
>> +access_mask_t get_handled_accesses(
>> +        const struct landlock_ruleset *const domain,
>> +        u16 rule_type, u16 num_access);
>> +
>> +bool unmask_layers(const struct landlock_rule *const rule,
>> +           const access_mask_t access_request,
>> +           layer_mask_t (*const layer_masks)[],
>> +           size_t masks_array_size);
>> +
>> +access_mask_t init_layer_masks(const struct landlock_ruleset *const 
>> domain,
>> +                   const access_mask_t access_request,
>> +                   layer_mask_t (*const layer_masks)[],
>> +                   size_t masks_size,
>> +                   u16 rule_type);
> 
> These declarations are useless.

   I moved helpers in rulset.c/h to use them both for filesystem
and network.
> 
>> +
>>   #endif /* _SECURITY_LANDLOCK_RULESET_H */
>> -- 
>> 2.25.1
>>
> .

^ permalink raw reply

* Re: [PATCH v4 1/3] dt-bindings: net: adin: document phy clock
From: Jakub Kicinski @ 2022-05-16 17:43 UTC (permalink / raw)
  To: Josua Mayer
  Cc: Michael Walle, alexandru.ardelean, alvaro.karsz, davem, edumazet,
	krzysztof.kozlowski+dt, michael.hennerich, netdev, pabeni,
	robh+dt
In-Reply-To: <072a773c-2e42-1b82-9fe7-63c9a3dc9c7d@solid-run.com>

On Sun, 15 May 2022 10:16:47 +0300 Josua Mayer wrote:
> Am 13.05.22 um 01:44 schrieb Jakub Kicinski:
> > On Thu, 12 May 2022 23:20:18 +0200 Michael Walle wrote:  
> >>> It's pure speculation on my side. I don't even know if PHYs use
> >>> the recovered clock to clock its output towards the MAC or that's
> >>> a different clock domain.
> >>>
> >>> My concern is that people will start to use DT to configure SyncE which
> >>> is entirely a runtime-controllable thing, and doesn't belong.  
> Okay.
> However phy drivers do not seem to implement runtime control of those 
> clock output pins currently, so they are configured once in DT.

To me that means nobody needs the recovered clock.

> >>> Hence
> >>> my preference to hide the recovered vs free-running detail if we can
> >>> pick one that makes most sense for now.  
> I am not a fan of hiding information. The clock configuration register 
> clearly supports this distinction.

Unless you expose all registers as a direct API to the user you'll be
"hiding information". I don't think we are exposing all possible
registers for this PHY, the two bits in question are no different.

> Is this a political stance to say users may not "accidentally" enable 
> SyncE by patching DT?
> If so we should print a warning message when someone selects it?

Why would we add a feature and then print a warning? We can always add 
the support later, once we have a use case for it.

> >> I see. That makes sense, but then wouldn't it make more sense to pick
> >> the (simple) free-running one? As for SyncE you'd need the recovered
> >> clock.  
> > 
> > Sounds good.  
> 
> Yep, it seems recovered clock is only for SyncE - and only if there is a 
> master clock on the network. Sadly however documentation is sparse and I 
> do not know if the adi phys would fall back to using their internal 
> clock, or just refuse to operate at all.

^ permalink raw reply

* Re: [PATCH v5 12/15] seltests/landlock: rules overlapping test
From: Mickaël Salaün @ 2022-05-16 17:41 UTC (permalink / raw)
  To: Konstantin Meskhidze
  Cc: willemdebruijn.kernel, linux-security-module, netdev,
	netfilter-devel, yusongping, anton.sirazetdinov
In-Reply-To: <20220516152038.39594-13-konstantin.meskhidze@huawei.com>

Please fix these kind of subjects (selftests). I'd also like the subject 
description to (quickly) describe what is done (with a verb), to start 
with a capital (like a title), and to contain "network", something like 
this:
selftests/landlock: Add test for overlapping network rules

This is a good test though.


On 16/05/2022 17:20, Konstantin Meskhidze wrote:
> This patch adds overlapping rules for one port.
> First rule adds just bind() access right for a port.
> The second one adds both bind() and connect()
> access rights for the same port.
> 
> Signed-off-by: Konstantin Meskhidze <konstantin.meskhidze@huawei.com>
> ---
> 
> Changes since v3:
> * Add ruleset_overlap test.
> 
> Changes since v4:
> * Refactoring code with self->port, self->addr4 variables.
> 
> ---
>   tools/testing/selftests/landlock/net_test.c | 51 +++++++++++++++++++++
>   1 file changed, 51 insertions(+)
> 
> diff --git a/tools/testing/selftests/landlock/net_test.c b/tools/testing/selftests/landlock/net_test.c
> index bf8e49466d1d..1d8c9dfdbd48 100644
> --- a/tools/testing/selftests/landlock/net_test.c
> +++ b/tools/testing/selftests/landlock/net_test.c
> @@ -677,4 +677,55 @@ TEST_F_FORK(socket_test, connect_afunspec_with_restictions) {
>   	ASSERT_EQ(1, WIFEXITED(status));
>   	ASSERT_EQ(EXIT_SUCCESS, WEXITSTATUS(status));
>   }
> +
> +TEST_F_FORK(socket_test, ruleset_overlap) {

Please run clang-format-14 on all files (and all commits).

> +
> +	int sockfd;
> +
> +	struct landlock_ruleset_attr ruleset_attr = {
> +		.handled_access_net = LANDLOCK_ACCESS_NET_BIND_TCP |
> +				      LANDLOCK_ACCESS_NET_CONNECT_TCP,
> +	};
> +	struct landlock_net_service_attr net_service_1 = {
> +		.allowed_access = LANDLOCK_ACCESS_NET_BIND_TCP,
> +
> +		.port = self->port[0],
> +	};
> +
> +		struct landlock_net_service_attr net_service_2 = {
> +		.allowed_access = LANDLOCK_ACCESS_NET_BIND_TCP |
> +				  LANDLOCK_ACCESS_NET_CONNECT_TCP,
> +
> +		.port = self->port[0],
> +	};
> +
> +	const int ruleset_fd = landlock_create_ruleset(&ruleset_attr,
> +					sizeof(ruleset_attr), 0);
> +	ASSERT_LE(0, ruleset_fd);
> +
> +	/* Allows bind operations to the port[0] socket */

Please ends this kind of comments with a final dot (all files/commits).

> +	ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_SERVICE,
> +				       &net_service_1, 0));
> +	/* Allows connect and bind operations to the port[0] socket */
> +	ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_SERVICE,
> +				       &net_service_2, 0));
> +
> +	/* Enforces the ruleset. */
> +	enforce_ruleset(_metadata, ruleset_fd);
> +
> +	/* Creates a server socket */
> +	sockfd = create_socket(_metadata, false, false);
> +	ASSERT_LE(0, sockfd);
> +
> +	/* Binds the socket to address with port[0] */
> +	ASSERT_EQ(0, bind(sockfd, (struct sockaddr *)&self->addr4[0], sizeof(self->addr4[0])));
> +
> +	/* Makes connection to socket with port[0] */
> +	ASSERT_EQ(0, connect(sockfd, (struct sockaddr *)&self->addr4[0],

Can you please get rid of this (struct sockaddr *) type casting please 
(without compiler warning)?

> +						   sizeof(self->addr4[0])));

Here, you can enforce a new ruleset with net_service_1 and check that 
bind() is still allowed but not connect().

> +
> +	/* Closes socket */
> +	ASSERT_EQ(0, close(sockfd));
> +}
> +
>   TEST_HARNESS_MAIN
> --
> 2.25.1
> 

^ permalink raw reply

* Re: [PATCH net-next v3] bond: add mac filter option for balance-xor
From: Nikolay Aleksandrov @ 2022-05-16 17:24 UTC (permalink / raw)
  To: Jonathan Toppins, netdev
  Cc: toke, Long Xin, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Jonathan Corbet, Jay Vosburgh, Veaceslav Falico,
	Andy Gospodarek, linux-doc, linux-kernel
In-Reply-To: <53357187-aedf-20dd-a331-bc501aa0484e@blackwall.org>

On 16/05/2022 20:22, Nikolay Aleksandrov wrote:
> On 16/05/2022 17:06, Jonathan Toppins wrote:
>> On 5/15/22 02:32, Nikolay Aleksandrov wrote:
>>> On 15/05/2022 00:41, Nikolay Aleksandrov wrote:
>>>> On 13/05/2022 20:43, Jonathan Toppins wrote:
>>>>> Implement a MAC filter that prevents duplicate frame delivery when
>>>>> handling BUM traffic. This attempts to partially replicate OvS SLB
>>>>> Bonding[1] like functionality without requiring significant change
>>>>> in the Linux bridging code.
>>>>>
>>>>> A typical network setup for this feature would be:
>>>>>
>>>>>              .--------------------------------------------.
>>>>>              |         .--------------------.             |
>>>>>              |         |                    |             |
>>>>>         .-------------------.               |             |
>>>>>         |    | Bond 0  |    |               |             |
>>>>>         | .--'---. .---'--. |               |             |
>>>>>    .----|-| eth0 |-| eth1 |-|----.    .-----+----.   .----+------.
>>>>>    |    | '------' '------' |    |    | Switch 1 |   | Switch 2  |
>>>>>    |    '---,---------------'    |    |          +---+           |
>>>>>    |       /                     |    '----+-----'   '----+------'
>>>>>    |  .---'---.    .------.      |         |              |
>>>>>    |  |  br0  |----| VM 1 |      |      ~~~~~~~~~~~~~~~~~~~~~
>>>>>    |  '-------'    '------'      |     (                     )
>>>>>    |      |        .------.      |     ( Rest of Network     )
>>>>>    |      '--------| VM # |      |     (_____________________)
>>>>>    |               '------'      |
>>>>>    |  Host 1                     |
>>>>>    '-----------------------------'
>>>>>
>>>>> Where 'VM1' and 'VM#' are hosts connected to a Linux bridge, br0, with
>>>>> bond0 and its associated links, eth0 & eth1, provide ingress/egress. One
>>>>> can assume bond0, br1, and hosts VM1 to VM# are all contained in a
>>>>> single box, as depicted. Interfaces eth0 and eth1 provide redundant
>>>>> connections to the data center with the requirement to use all bandwidth
>>>>> when the system is functioning normally. Switch 1 and Switch 2 are
>>>>> physical switches that do not implement any advanced L2 management
>>>>> features such as MLAG, Cisco's VPC, or LACP.
>>>>>
>>>>> Combining this feature with vlan+srcmac hash policy allows a user to
>>>>> create an access network without the need to use expensive switches that
>>>>> support features like Cisco's VCP.
>>>>>
>>>>> [1] https://docs.openvswitch.org/en/latest/topics/bonding/#slb-bonding
>>>>>
>>>>> Co-developed-by: Long Xin <lxin@redhat.com>
>>>>> Signed-off-by: Long Xin <lxin@redhat.com>
>>>>> Signed-off-by: Jonathan Toppins <jtoppins@redhat.com>
>>>>> ---
>>>>>
>>>>> Notes:
>>>>>      v2:
>>>>>       * dropped needless abstraction functions and put code in module init
>>>>>       * renamed variable "rc" to "ret" to stay consistent with most of the
>>>>>         code
>>>>>       * fixed parameter setting management, when arp-monitor is turned on
>>>>>         this feature will be turned off similar to how miimon and arp-monitor
>>>>>         interact
>>>>>       * renamed bond_xor_recv to bond_mac_filter_recv for a little more
>>>>>         clarity
>>>>>       * it appears the implied default return code for any bonding recv probe
>>>>>         must be `RX_HANDLER_ANOTHER`. Changed the default return code of
>>>>>         bond_mac_filter_recv to use this return value to not break skb
>>>>>         processing when the skb dev is switched to the bond dev:
>>>>>           `skb->dev = bond->dev`
>>>>>           v3: Nik's comments
>>>>>       * clarified documentation
>>>>>       * fixed inline and basic reverse Christmas tree formatting
>>>>>       * zero'ed entry in mac_create
>>>>>       * removed read_lock taking in bond_mac_filter_recv
>>>>>       * made has_expired() atomic and removed critical sections
>>>>>         surrounding calls to has_expired(), this also removed the
>>>>>         use-after-free that would have occurred:
>>>>>             spin_lock_irqsave(&entry->lock, flags);
>>>>>                 if (has_expired(bond, entry))
>>>>>                     mac_delete(bond, entry);
>>>>>             spin_unlock_irqrestore(&entry->lock, flags); <---
>>>>>       * moved init/destroy of mac_filter_tbl to bond_open/bond_close
>>>>>         this removed the complex option dependencies, the only behavioural
>>>>>         change the user will see is if the bond is up and mac_filter is
>>>>>         enabled if they try and set arp_interval they will receive -EBUSY
>>>>>       * in bond_changelink moved processing of mac_filter option just below
>>>>>         mode processing
>>>>>
>>>>>   Documentation/networking/bonding.rst  |  20 +++
>>>>>   drivers/net/bonding/Makefile          |   2 +-
>>>>>   drivers/net/bonding/bond_mac_filter.c | 201 ++++++++++++++++++++++++++
>>>>>   drivers/net/bonding/bond_mac_filter.h |  37 +++++
>>>>>   drivers/net/bonding/bond_main.c       |  30 ++++
>>>>>   drivers/net/bonding/bond_netlink.c    |  13 ++
>>>>>   drivers/net/bonding/bond_options.c    |  81 +++++++++--
>>>>>   drivers/net/bonding/bonding_priv.h    |   1 +
>>>>>   include/net/bond_options.h            |   1 +
>>>>>   include/net/bonding.h                 |   3 +
>>>>>   include/uapi/linux/if_link.h          |   1 +
>>>>>   11 files changed, 373 insertions(+), 17 deletions(-)
>>>>>   create mode 100644 drivers/net/bonding/bond_mac_filter.c
>>>>>   create mode 100644 drivers/net/bonding/bond_mac_filter.h
>>>>>
>>>>
>>> [snip]
>>>
>>> The same problem solved using a few nftables rules (in case you don't want to load eBPF):
>>> $ nft 'add table netdev nt'
>>> $ nft 'add chain netdev nt bond0EgressFilter { type filter hook egress device bond0 priority 0; }'
>>> $ nft 'add chain netdev nt bond0IngressFilter { type filter hook ingress device bond0 priority 0; }'
>>> $ nft 'add set netdev nt macset { type ether_addr; flags timeout; }'
>>> $ nft 'add rule netdev nt bond0EgressFilter set update ether saddr timeout 5s @macset'
>>> $ nft 'add rule netdev nt bond0IngressFilter ether saddr @macset counter drop'
>>>
>>
>> I get the following when trying to apply this on a fedora 35 install.
>>
>> root@fedora ~]# ip link add bond0 type bond mode balance-xor xmit_hash_policy vlan+srcmac
>> [root@fedora ~]# nft 'add table netdev nt'
>> [root@fedora ~]# nft 'add chain netdev nt bond0EgressFilter { type filter hook egress device bond0 priority 0; }'
>> Error: unknown chain hook
>> add chain netdev nt bond0EgressFilter { type filter hook egress device bond0 priority 0; }
>>                                                          ^^^^^^
>> [root@fedora ~]# uname -a
>> Linux fedora 5.17.5-200.fc35.x86_64 #1 SMP PREEMPT Thu Apr 28 15:41:41 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
>>
> 
> Well, take it up with the Fedora nftables package maintainer. :) 
> 
> Your nftables version is old (I'd guess <1.0.1):
>  commit 510c4fad7e78
>  Author: Lukas Wunner <lukas@wunner.de>
>  Date:   Wed Mar 11 13:20:06 2020 +0100
> 
>      src: Support netdev egress hook
> 
> $ git tag --contains 510c4fad7e78f
> v1.0.1
> v1.0.2
> 
> I just tested it[1] on Linux 5.16.18-200.fc35.x86_64 #1 SMP PREEMPT Mon Mar 28 14:10:07 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
> 
> Cheers,
>  Nik
> 
> [1] You can clearly see the dynamically learned mac on egress (52:54:00:23:5f:13) and traffic
>     with that source is now blocked on ingress.
> 
> $ nft -a list table netdev nt
> 	set macset { # handle 10
> 		type ether_addr
> 		size 65535
> 		flags timeout
> 		elements = { 52:54:00:23:5f:13 timeout 5s expires 4s192ms }
> 	}
> 
> 	chain bond0EgressFilter { # handle 8
> 		type filter hook egress device "bond0" priority filter; policy accept;
> 		update @macset { ether saddr timeout 5s } # handle 11
> 	}
> 
> 	chain bond0IngressFilter { # handle 9
> 		type filter hook ingress device "bond0" priority filter; policy accept;
> 	}
> 

Oops, pasted an old list before adding the ingress rule:
	chain bond0IngressFilter { # handle 9
		type filter hook ingress device "bond0" priority filter; policy accept;
		ether saddr @macset counter packets 120 bytes 7680 drop # handle 15
	}


^ permalink raw reply

* Re: [PATCH net-next v3] bond: add mac filter option for balance-xor
From: Nikolay Aleksandrov @ 2022-05-16 17:22 UTC (permalink / raw)
  To: Jonathan Toppins, netdev
  Cc: toke, Long Xin, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Jonathan Corbet, Jay Vosburgh, Veaceslav Falico,
	Andy Gospodarek, linux-doc, linux-kernel
In-Reply-To: <6431569f-fb09-096e-7a89-284a71aa5c0f@redhat.com>

On 16/05/2022 17:06, Jonathan Toppins wrote:
> On 5/15/22 02:32, Nikolay Aleksandrov wrote:
>> On 15/05/2022 00:41, Nikolay Aleksandrov wrote:
>>> On 13/05/2022 20:43, Jonathan Toppins wrote:
>>>> Implement a MAC filter that prevents duplicate frame delivery when
>>>> handling BUM traffic. This attempts to partially replicate OvS SLB
>>>> Bonding[1] like functionality without requiring significant change
>>>> in the Linux bridging code.
>>>>
>>>> A typical network setup for this feature would be:
>>>>
>>>>              .--------------------------------------------.
>>>>              |         .--------------------.             |
>>>>              |         |                    |             |
>>>>         .-------------------.               |             |
>>>>         |    | Bond 0  |    |               |             |
>>>>         | .--'---. .---'--. |               |             |
>>>>    .----|-| eth0 |-| eth1 |-|----.    .-----+----.   .----+------.
>>>>    |    | '------' '------' |    |    | Switch 1 |   | Switch 2  |
>>>>    |    '---,---------------'    |    |          +---+           |
>>>>    |       /                     |    '----+-----'   '----+------'
>>>>    |  .---'---.    .------.      |         |              |
>>>>    |  |  br0  |----| VM 1 |      |      ~~~~~~~~~~~~~~~~~~~~~
>>>>    |  '-------'    '------'      |     (                     )
>>>>    |      |        .------.      |     ( Rest of Network     )
>>>>    |      '--------| VM # |      |     (_____________________)
>>>>    |               '------'      |
>>>>    |  Host 1                     |
>>>>    '-----------------------------'
>>>>
>>>> Where 'VM1' and 'VM#' are hosts connected to a Linux bridge, br0, with
>>>> bond0 and its associated links, eth0 & eth1, provide ingress/egress. One
>>>> can assume bond0, br1, and hosts VM1 to VM# are all contained in a
>>>> single box, as depicted. Interfaces eth0 and eth1 provide redundant
>>>> connections to the data center with the requirement to use all bandwidth
>>>> when the system is functioning normally. Switch 1 and Switch 2 are
>>>> physical switches that do not implement any advanced L2 management
>>>> features such as MLAG, Cisco's VPC, or LACP.
>>>>
>>>> Combining this feature with vlan+srcmac hash policy allows a user to
>>>> create an access network without the need to use expensive switches that
>>>> support features like Cisco's VCP.
>>>>
>>>> [1] https://docs.openvswitch.org/en/latest/topics/bonding/#slb-bonding
>>>>
>>>> Co-developed-by: Long Xin <lxin@redhat.com>
>>>> Signed-off-by: Long Xin <lxin@redhat.com>
>>>> Signed-off-by: Jonathan Toppins <jtoppins@redhat.com>
>>>> ---
>>>>
>>>> Notes:
>>>>      v2:
>>>>       * dropped needless abstraction functions and put code in module init
>>>>       * renamed variable "rc" to "ret" to stay consistent with most of the
>>>>         code
>>>>       * fixed parameter setting management, when arp-monitor is turned on
>>>>         this feature will be turned off similar to how miimon and arp-monitor
>>>>         interact
>>>>       * renamed bond_xor_recv to bond_mac_filter_recv for a little more
>>>>         clarity
>>>>       * it appears the implied default return code for any bonding recv probe
>>>>         must be `RX_HANDLER_ANOTHER`. Changed the default return code of
>>>>         bond_mac_filter_recv to use this return value to not break skb
>>>>         processing when the skb dev is switched to the bond dev:
>>>>           `skb->dev = bond->dev`
>>>>           v3: Nik's comments
>>>>       * clarified documentation
>>>>       * fixed inline and basic reverse Christmas tree formatting
>>>>       * zero'ed entry in mac_create
>>>>       * removed read_lock taking in bond_mac_filter_recv
>>>>       * made has_expired() atomic and removed critical sections
>>>>         surrounding calls to has_expired(), this also removed the
>>>>         use-after-free that would have occurred:
>>>>             spin_lock_irqsave(&entry->lock, flags);
>>>>                 if (has_expired(bond, entry))
>>>>                     mac_delete(bond, entry);
>>>>             spin_unlock_irqrestore(&entry->lock, flags); <---
>>>>       * moved init/destroy of mac_filter_tbl to bond_open/bond_close
>>>>         this removed the complex option dependencies, the only behavioural
>>>>         change the user will see is if the bond is up and mac_filter is
>>>>         enabled if they try and set arp_interval they will receive -EBUSY
>>>>       * in bond_changelink moved processing of mac_filter option just below
>>>>         mode processing
>>>>
>>>>   Documentation/networking/bonding.rst  |  20 +++
>>>>   drivers/net/bonding/Makefile          |   2 +-
>>>>   drivers/net/bonding/bond_mac_filter.c | 201 ++++++++++++++++++++++++++
>>>>   drivers/net/bonding/bond_mac_filter.h |  37 +++++
>>>>   drivers/net/bonding/bond_main.c       |  30 ++++
>>>>   drivers/net/bonding/bond_netlink.c    |  13 ++
>>>>   drivers/net/bonding/bond_options.c    |  81 +++++++++--
>>>>   drivers/net/bonding/bonding_priv.h    |   1 +
>>>>   include/net/bond_options.h            |   1 +
>>>>   include/net/bonding.h                 |   3 +
>>>>   include/uapi/linux/if_link.h          |   1 +
>>>>   11 files changed, 373 insertions(+), 17 deletions(-)
>>>>   create mode 100644 drivers/net/bonding/bond_mac_filter.c
>>>>   create mode 100644 drivers/net/bonding/bond_mac_filter.h
>>>>
>>>
>> [snip]
>>
>> The same problem solved using a few nftables rules (in case you don't want to load eBPF):
>> $ nft 'add table netdev nt'
>> $ nft 'add chain netdev nt bond0EgressFilter { type filter hook egress device bond0 priority 0; }'
>> $ nft 'add chain netdev nt bond0IngressFilter { type filter hook ingress device bond0 priority 0; }'
>> $ nft 'add set netdev nt macset { type ether_addr; flags timeout; }'
>> $ nft 'add rule netdev nt bond0EgressFilter set update ether saddr timeout 5s @macset'
>> $ nft 'add rule netdev nt bond0IngressFilter ether saddr @macset counter drop'
>>
> 
> I get the following when trying to apply this on a fedora 35 install.
> 
> root@fedora ~]# ip link add bond0 type bond mode balance-xor xmit_hash_policy vlan+srcmac
> [root@fedora ~]# nft 'add table netdev nt'
> [root@fedora ~]# nft 'add chain netdev nt bond0EgressFilter { type filter hook egress device bond0 priority 0; }'
> Error: unknown chain hook
> add chain netdev nt bond0EgressFilter { type filter hook egress device bond0 priority 0; }
>                                                          ^^^^^^
> [root@fedora ~]# uname -a
> Linux fedora 5.17.5-200.fc35.x86_64 #1 SMP PREEMPT Thu Apr 28 15:41:41 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
> 

Well, take it up with the Fedora nftables package maintainer. :) 

Your nftables version is old (I'd guess <1.0.1):
 commit 510c4fad7e78
 Author: Lukas Wunner <lukas@wunner.de>
 Date:   Wed Mar 11 13:20:06 2020 +0100

     src: Support netdev egress hook

$ git tag --contains 510c4fad7e78f
v1.0.1
v1.0.2

I just tested it[1] on Linux 5.16.18-200.fc35.x86_64 #1 SMP PREEMPT Mon Mar 28 14:10:07 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux

Cheers,
 Nik

[1] You can clearly see the dynamically learned mac on egress (52:54:00:23:5f:13) and traffic
    with that source is now blocked on ingress.

$ nft -a list table netdev nt
	set macset { # handle 10
		type ether_addr
		size 65535
		flags timeout
		elements = { 52:54:00:23:5f:13 timeout 5s expires 4s192ms }
	}

	chain bond0EgressFilter { # handle 8
		type filter hook egress device "bond0" priority filter; policy accept;
		update @macset { ether saddr timeout 5s } # handle 11
	}

	chain bond0IngressFilter { # handle 9
		type filter hook ingress device "bond0" priority filter; policy accept;
	}


^ permalink raw reply

* Re: [PATCH bpf-next v9 0/5] New BPF helpers to accelerate synproxy
From: Maxim Mikityanskiy @ 2022-05-16 17:17 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: bpf, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Networking, Tariq Toukan, Martin KaFai Lau, Song Liu,
	Yonghong Song, John Fastabend, KP Singh, David S. Miller,
	Jakub Kicinski, Eric Dumazet, Hideaki YOSHIFUJI, David Ahern,
	Shuah Khan, Jesper Dangaard Brouer, Nathan Chancellor,
	Nick Desaulniers, Joe Stringer, Florent Revest,
	open list:KERNEL SELFTEST FRAMEWORK,
	Toke Høiland-Jørgensen, Kumar Kartikeya Dwivedi,
	Florian Westphal, pabeni
In-Reply-To: <13051d07-babc-1991-104b-f4969ac24b9b@nvidia.com>

On 2022-05-11 14:48, Maxim Mikityanskiy wrote:
> On 2022-05-11 02:59, Andrii Nakryiko wrote:
>> On Tue, May 10, 2022 at 12:21 PM Maxim Mikityanskiy 
>> <maximmi@nvidia.com> wrote:
>>>
>>> On 2022-05-07 00:51, Andrii Nakryiko wrote:
>>>>
>>>> Is it expected that your selftests will fail on s390x? Please check [0]
>>>
>>> I see it fails with:
>>>
>>> test_synproxy:FAIL:ethtool -K tmp0 tx off unexpected error: 32512 
>>> (errno 2)
>>>
>>> errno 2 is ENOENT, probably the ethtool binary is missing from the s390x
>>> image? When reviewing v6, you said you added ethtool to the CI image.
>>> Maybe it was added to x86_64 only? Could you add it to s390x?
>>>
>>
>> Could be that it was outdated in s390x, but with [0] just merged in it
>> should have pretty recent one.
> 
> Do you mean the image was outdated and didn't contain ethtool? Or 
> ethtool was in the image, but was outdated? If the latter, I would 
> expect it to work, this specific ethtool command has worked for ages.

Hi Andrii,

Could you reply this question? I need to understand whether I need to 
make any changes to the CI before resubmitting.

Thanks,
Max

^ permalink raw reply

* Re: [PATCH v5 04/15] landlock: helper functions refactoring
From: Mickaël Salaün @ 2022-05-16 17:14 UTC (permalink / raw)
  To: Konstantin Meskhidze
  Cc: willemdebruijn.kernel, linux-security-module, netdev,
	netfilter-devel, yusongping, anton.sirazetdinov
In-Reply-To: <20220516152038.39594-5-konstantin.meskhidze@huawei.com>


On 16/05/2022 17:20, Konstantin Meskhidze wrote:
> Unmask_layers(), init_layer_masks() and
> get_handled_accesses() helper functions move to
> ruleset.c and rule_type argument is added.
> This modification supports implementing new rule
> types into next landlock versions.
> 
> Signed-off-by: Konstantin Meskhidze <konstantin.meskhidze@huawei.com>
> ---
> 
> Changes since v3:
> * Splits commit.
> * Refactoring landlock_unmask_layers functions.
> 

Please sort changes in antichronological order. It is easier to look at 
the first lines to get the last changes.

> Changes since v4:
> * Refactoring init_layer_masks(), get_handled_accesses()
> and unmask_layers() functions to support multiple rule types.
> * Refactoring landlock_get_fs_access_mask() function with
> LANDLOCK_MASK_ACCESS_FS mask. >
> ---
>   security/landlock/fs.c      | 158 ++++++++----------------------------
>   security/landlock/ruleset.c | 152 +++++++++++++++++++++++++++++++---
>   security/landlock/ruleset.h |  17 +++-
>   3 files changed, 192 insertions(+), 135 deletions(-)
> 
> diff --git a/security/landlock/fs.c b/security/landlock/fs.c
> index 5de24d4dd74c..3506e182b23e 100644
> --- a/security/landlock/fs.c
> +++ b/security/landlock/fs.c
> @@ -211,60 +211,6 @@ find_rule(const struct landlock_ruleset *const domain,
>   	return rule;
>   }
> 
> -/*
> - * @layer_masks is read and may be updated according to the access request and
> - * the matching rule.
> - *
> - * Returns true if the request is allowed (i.e. relevant layer masks for the
> - * request are empty).
> - */
> -static inline bool
> -unmask_layers(const struct landlock_rule *const rule,
> -	      const access_mask_t access_request,
> -	      layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS])

Moving these entire blocks of code make the review/diff impossible. Why 
moving these helpers?

> -{
> -	size_t layer_level;
> -
> -	if (!access_request || !layer_masks)
> -		return true;
> -	if (!rule)
> -		return false;
> -
> -	/*
> -	 * An access is granted if, for each policy layer, at least one rule
> -	 * encountered on the pathwalk grants the requested access,
> -	 * regardless of its position in the layer stack.  We must then check
> -	 * the remaining layers for each inode, from the first added layer to
> -	 * the last one.  When there is multiple requested accesses, for each
> -	 * policy layer, the full set of requested accesses may not be granted
> -	 * by only one rule, but by the union (binary OR) of multiple rules.
> -	 * E.g. /a/b <execute> + /a <read> => /a/b <execute + read>
> -	 */
> -	for (layer_level = 0; layer_level < rule->num_layers; layer_level++) {
> -		const struct landlock_layer *const layer =
> -			&rule->layers[layer_level];
> -		const layer_mask_t layer_bit = BIT_ULL(layer->level - 1);
> -		const unsigned long access_req = access_request;
> -		unsigned long access_bit;
> -		bool is_empty;
> -
> -		/*
> -		 * Records in @layer_masks which layer grants access to each
> -		 * requested access.
> -		 */
> -		is_empty = true;
> -		for_each_set_bit(access_bit, &access_req,
> -				 ARRAY_SIZE(*layer_masks)) {
> -			if (layer->access & BIT_ULL(access_bit))
> -				(*layer_masks)[access_bit] &= ~layer_bit;
> -			is_empty = is_empty && !(*layer_masks)[access_bit];
> -		}
> -		if (is_empty)
> -			return true;
> -	}
> -	return false;
> -}
> -
>   /*
>    * Allows access to pseudo filesystems that will never be mountable (e.g.
>    * sockfs, pipefs), but can still be reachable through
> @@ -277,59 +223,6 @@ static inline bool is_nouser_or_private(const struct dentry *dentry)
>   		unlikely(IS_PRIVATE(d_backing_inode(dentry))));
>   }
> 
> -static inline access_mask_t
> -get_handled_accesses(const struct landlock_ruleset *const domain)
> -{
> -	access_mask_t access_dom = 0;
> -	unsigned long access_bit;
> -
> -	for (access_bit = 0; access_bit < LANDLOCK_NUM_ACCESS_FS;
> -	     access_bit++) {
> -		size_t layer_level;
> -
> -		for (layer_level = 0; layer_level < domain->num_layers;
> -				layer_level++) {
> -			if (landlock_get_fs_access_mask(domain, layer_level) &
> -					BIT_ULL(access_bit)) {
> -				access_dom |= BIT_ULL(access_bit);
> -				break;
> -			}
> -		}
> -	}
> -	return access_dom;
> -}
> -
> -static inline access_mask_t
> -init_layer_masks(const struct landlock_ruleset *const domain,
> -		 const access_mask_t access_request,
> -		 layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS])
> -{
> -	access_mask_t handled_accesses = 0;
> -	size_t layer_level;
> -
> -	memset(layer_masks, 0, sizeof(*layer_masks));
> -	/* An empty access request can happen because of O_WRONLY | O_RDWR. */
> -	if (!access_request)
> -		return 0;
> -
> -	/* Saves all handled accesses per layer. */
> -	for (layer_level = 0; layer_level < domain->num_layers; layer_level++) {
> -		const unsigned long access_req = access_request;
> -		unsigned long access_bit;
> -
> -		for_each_set_bit(access_bit, &access_req,
> -				ARRAY_SIZE(*layer_masks)) {
> -			if (landlock_get_fs_access_mask(domain, layer_level) &
> -					BIT_ULL(access_bit)) {
> -				(*layer_masks)[access_bit] |=
> -					BIT_ULL(layer_level);
> -				handled_accesses |= BIT_ULL(access_bit);
> -			}
> -		}
> -	}
> -	return handled_accesses;
> -}
> -
>   /*
>    * Check that a destination file hierarchy has more restrictions than a source
>    * file hierarchy.  This is only used for link and rename actions.
> @@ -506,7 +399,8 @@ static int check_access_path_dual(
>   		 * a superset of the meaningful requested accesses).
>   		 */
>   		access_masked_parent1 = access_masked_parent2 =
> -			get_handled_accesses(domain);
> +			get_handled_accesses(domain, LANDLOCK_RULE_PATH_BENEATH,
> +					     LANDLOCK_NUM_ACCESS_FS);
>   		is_dom_check = true;
>   	} else {
>   		if (WARN_ON_ONCE(dentry_child1 || dentry_child2))
> @@ -519,17 +413,25 @@ static int check_access_path_dual(
> 
>   	if (unlikely(dentry_child1)) {
>   		unmask_layers(find_rule(domain, dentry_child1),
> -			      init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS,
> -					       &_layer_masks_child1),
> -			      &_layer_masks_child1);
> +				init_layer_masks(domain,
> +					LANDLOCK_MASK_ACCESS_FS,
> +					&_layer_masks_child1,
> +					sizeof(_layer_masks_child1),
> +					LANDLOCK_RULE_PATH_BENEATH),
> +				&_layer_masks_child1,
> +				ARRAY_SIZE(_layer_masks_child1));

There is a lot of formatting diff and that makes the review difficult. 
Please format everything with clang-format-14.

>   		layer_masks_child1 = &_layer_masks_child1;
>   		child1_is_directory = d_is_dir(dentry_child1);
>   	}
>   	if (unlikely(dentry_child2)) {
>   		unmask_layers(find_rule(domain, dentry_child2),
> -			      init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS,
> -					       &_layer_masks_child2),
> -			      &_layer_masks_child2);
> +				init_layer_masks(domain,
> +					LANDLOCK_MASK_ACCESS_FS,
> +					&_layer_masks_child2,
> +					sizeof(_layer_masks_child2),
> +					LANDLOCK_RULE_PATH_BENEATH),
> +				&_layer_masks_child2,
> +				ARRAY_SIZE(_layer_masks_child2));
>   		layer_masks_child2 = &_layer_masks_child2;
>   		child2_is_directory = d_is_dir(dentry_child2);
>   	}
> @@ -582,14 +484,15 @@ static int check_access_path_dual(
> 
>   		rule = find_rule(domain, walker_path.dentry);
>   		allowed_parent1 = unmask_layers(rule, access_masked_parent1,
> -						layer_masks_parent1);
> +				layer_masks_parent1,
> +				ARRAY_SIZE(*layer_masks_parent1));
>   		allowed_parent2 = unmask_layers(rule, access_masked_parent2,
> -						layer_masks_parent2);
> +				layer_masks_parent2,
> +				ARRAY_SIZE(*layer_masks_parent2));
> 
>   		/* Stops when a rule from each layer grants access. */
>   		if (allowed_parent1 && allowed_parent2)
>   			break;
> -

There is no place for such formatting/whitespace patches.


>   jump_up:
>   		if (walker_path.dentry == walker_path.mnt->mnt_root) {
>   			if (follow_up(&walker_path)) {
> @@ -645,7 +548,9 @@ static inline int check_access_path(const struct landlock_ruleset *const domain,
>   {
>   	layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};
> 
> -	access_request = init_layer_masks(domain, access_request, &layer_masks);
> +	access_request = init_layer_masks(domain, access_request,
> +			&layer_masks, sizeof(layer_masks),
> +			LANDLOCK_RULE_PATH_BENEATH);
>   	return check_access_path_dual(domain, path, access_request,
>   				      &layer_masks, NULL, 0, NULL, NULL);
>   }
> @@ -729,7 +634,8 @@ static bool collect_domain_accesses(
>   		return true;
> 
>   	access_dom = init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS,
> -				      layer_masks_dom);
> +			layer_masks_dom, sizeof(*layer_masks_dom),
> +			LANDLOCK_RULE_PATH_BENEATH);
> 
>   	dget(dir);
>   	while (true) {
> @@ -737,7 +643,8 @@ static bool collect_domain_accesses(
> 
>   		/* Gets all layers allowing all domain accesses. */
>   		if (unmask_layers(find_rule(domain, dir), access_dom,
> -				  layer_masks_dom)) {
> +					layer_masks_dom,
> +					ARRAY_SIZE(*layer_masks_dom))) {
>   			/*
>   			 * Stops when all handled accesses are allowed by at
>   			 * least one rule in each layer.
> @@ -851,9 +758,10 @@ static int current_check_refer_path(struct dentry *const old_dentry,
>   		 * The LANDLOCK_ACCESS_FS_REFER access right is not required
>   		 * for same-directory referer (i.e. no reparenting).
>   		 */
> -		access_request_parent1 = init_layer_masks(
> -			dom, access_request_parent1 | access_request_parent2,
> -			&layer_masks_parent1);
> +		access_request_parent1 = init_layer_masks(dom,
> +				access_request_parent1 | access_request_parent2,
> +				&layer_masks_parent1, sizeof(layer_masks_parent1),
> +				LANDLOCK_RULE_PATH_BENEATH);
>   		return check_access_path_dual(dom, new_dir,
>   					      access_request_parent1,
>   					      &layer_masks_parent1, NULL, 0,
> @@ -861,7 +769,9 @@ static int current_check_refer_path(struct dentry *const old_dentry,
>   	}
> 
>   	/* Backward compatibility: no reparenting support. */
> -	if (!(get_handled_accesses(dom) & LANDLOCK_ACCESS_FS_REFER))
> +	if (!(get_handled_accesses(dom, LANDLOCK_RULE_PATH_BENEATH,
> +				   LANDLOCK_NUM_ACCESS_FS) &
> +						LANDLOCK_ACCESS_FS_REFER))
>   		return -EXDEV;
> 
>   	access_request_parent1 |= LANDLOCK_ACCESS_FS_REFER;
> diff --git a/security/landlock/ruleset.c b/security/landlock/ruleset.c
> index 4b4c9953bb32..c4ed783d655b 100644
> --- a/security/landlock/ruleset.c
> +++ b/security/landlock/ruleset.c
> @@ -233,7 +233,8 @@ static int insert_rule(struct landlock_ruleset *const ruleset,
>   					       &(*layers)[0]);
>   			if (IS_ERR(new_rule))
>   				return PTR_ERR(new_rule);
> -			rb_replace_node(&this->node, &new_rule->node, &ruleset->root_inode);
> +			rb_replace_node(&this->node, &new_rule->node,
> +					&ruleset->root_inode);

This is a pure formatting hunk. :/


>   			free_rule(this, rule_type);
>   			break;
>   		}
> @@ -246,7 +247,8 @@ static int insert_rule(struct landlock_ruleset *const ruleset,
>   		return -E2BIG;
>   	switch (rule_type) {
>   	case LANDLOCK_RULE_PATH_BENEATH:
> -		new_rule = create_rule(object_ptr, 0, layers, num_layers, NULL);
> +		new_rule = create_rule(object_ptr, 0, layers,
> +				       num_layers, NULL);
>   		if (IS_ERR(new_rule))
>   			return PTR_ERR(new_rule);
>   		rb_link_node(&new_rule->node, parent_node, walker_node);
> @@ -281,8 +283,8 @@ int landlock_insert_rule(struct landlock_ruleset *const ruleset,
>   	} };
> 
>   	build_check_layer();
> -	return insert_rule(ruleset, object_ptr, object_data, rule_type, &layers,
> -			   ARRAY_SIZE(layers));
> +	return insert_rule(ruleset, object_ptr, object_data, rule_type,
> +			   &layers, ARRAY_SIZE(layers));
>   }
> 
>   static inline void get_hierarchy(struct landlock_hierarchy *const hierarchy)
> @@ -335,8 +337,9 @@ static int tree_merge(struct landlock_ruleset *const src,
> 
>   		switch (rule_type) {
>   		case LANDLOCK_RULE_PATH_BENEATH:
> -			err = insert_rule(dst, walker_rule->object.ptr, 0, rule_type,
> -					  &layers, ARRAY_SIZE(layers));
> +			err = insert_rule(dst, walker_rule->object.ptr, 0,
> +					  rule_type, &layers,
> +					  ARRAY_SIZE(layers));
>   			break;
>   		}
>   		if (err)
> @@ -433,9 +436,13 @@ static int inherit_ruleset(struct landlock_ruleset *const parent,
>   		err = -EINVAL;
>   		goto out_unlock;
>   	}
> -	/* Copies the parent layer stack and leaves a space for the new layer. */
> +	/*
> +	 * Copies the parent layer stack and leaves a space
> +	 * for the new layer.
> +	 */
>   	memcpy(child->access_masks, parent->access_masks,
> -			flex_array_size(parent, access_masks, parent->num_layers));
> +			flex_array_size(parent, access_masks,
> +					parent->num_layers));
> 
>   	if (WARN_ON_ONCE(!parent->hierarchy)) {
>   		err = -EINVAL;
> @@ -455,8 +462,9 @@ static void free_ruleset(struct landlock_ruleset *const ruleset)
>   	struct landlock_rule *freeme, *next;
> 
>   	might_sleep();
> -	rbtree_postorder_for_each_entry_safe(freeme, next, &ruleset->root_inode,
> -			node)
> +	rbtree_postorder_for_each_entry_safe(freeme, next,
> +					     &ruleset->root_inode,
> +					     node)
>   		free_rule(freeme, LANDLOCK_RULE_PATH_BENEATH);
>   	put_hierarchy(ruleset->hierarchy);
>   	kfree(ruleset);
> @@ -577,3 +585,127 @@ const struct landlock_rule *landlock_find_rule(
>   	}
>   	return NULL;
>   }
> +
> +access_mask_t get_handled_accesses(
> +		const struct landlock_ruleset *const domain,
> +		u16 rule_type, u16 num_access)
> +{
> +	access_mask_t access_dom = 0;
> +	unsigned long access_bit;
> +
> +	switch (rule_type) {
> +	case LANDLOCK_RULE_PATH_BENEATH:
> +		for (access_bit = 0; access_bit < LANDLOCK_NUM_ACCESS_FS;
> +			access_bit++) {
> +			size_t layer_level;
> +
> +			for (layer_level = 0; layer_level < domain->num_layers;
> +					layer_level++) {
> +				if (landlock_get_fs_access_mask(domain,
> +								layer_level) &
> +						BIT_ULL(access_bit)) {
> +					access_dom |= BIT_ULL(access_bit);
> +					break;
> +				}
> +			}
> +		}
> +		break;
> +	default:
> +		break;
> +	}
> +	return access_dom;
> +}
> +
> +/*
> + * @layer_masks is read and may be updated according to the access request and
> + * the matching rule.
> + *
> + * Returns true if the request is allowed (i.e. relevant layer masks for the
> + * request are empty).
> + */
> +bool unmask_layers(const struct landlock_rule *const rule,
> +		const access_mask_t access_request,
> +		layer_mask_t (*const layer_masks)[], size_t masks_array_size)
> +{
> +	size_t layer_level;
> +
> +	if (!access_request || !layer_masks)
> +		return true;
> +	if (!rule)
> +		return false;
> +
> +	/*
> +	 * An access is granted if, for each policy layer, at least one rule
> +	 * encountered on the pathwalk grants the requested access,
> +	 * regardless of its position in the layer stack.  We must then check
> +	 * the remaining layers for each inode, from the first added layer to
> +	 * the last one.  When there is multiple requested accesses, for each
> +	 * policy layer, the full set of requested accesses may not be granted
> +	 * by only one rule, but by the union (binary OR) of multiple rules.
> +	 * E.g. /a/b <execute> + /a <read> => /a/b <execute + read>
> +	 */
> +	for (layer_level = 0; layer_level < rule->num_layers; layer_level++) {
> +		const struct landlock_layer *const layer =
> +			&rule->layers[layer_level];
> +		const layer_mask_t layer_bit = BIT_ULL(layer->level - 1);
> +		const unsigned long access_req = access_request;
> +		unsigned long access_bit;
> +		bool is_empty;
> +
> +		/*
> +		 * Records in @layer_masks which layer grants access to each
> +		 * requested access.
> +		 */
> +		is_empty = true;
> +		for_each_set_bit(access_bit, &access_req, masks_array_size) {
> +			if (layer->access & BIT_ULL(access_bit))
> +				(*layer_masks)[access_bit] &= ~layer_bit;
> +			is_empty = is_empty && !(*layer_masks)[access_bit];
> +		}
> +		if (is_empty)
> +			return true;
> +	}
> +	return false;
> +}
> +
> +access_mask_t init_layer_masks(const struct landlock_ruleset *const domain,
> +			       const access_mask_t access_request,
> +			       layer_mask_t (*const layer_masks)[],
> +			       size_t masks_size,
> +			       u16 rule_type)
> +{
> +	access_mask_t handled_accesses = 0;
> +	size_t layer_level;
> +
> +	memset(layer_masks, 0, masks_size);
> +
> +	/* An empty access request can happen because of O_WRONLY | O_RDWR. */
> +	if (!access_request)
> +		return 0;
> +
> +	/* Saves all handled accesses per layer. */
> +	for (layer_level = 0; layer_level < domain->num_layers;
> +			layer_level++) {
> +		const unsigned long access_req = access_request;
> +		unsigned long access_bit;
> +
> +		switch (rule_type) {
> +		case LANDLOCK_RULE_PATH_BENEATH:
> +			for_each_set_bit(access_bit, &access_req,
> +					LANDLOCK_NUM_ACCESS_FS) {
> +				if (landlock_get_fs_access_mask(domain,
> +								layer_level) &
> +						BIT_ULL(access_bit)) {
> +					(*layer_masks)[access_bit] |=
> +						BIT_ULL(layer_level);
> +					handled_accesses |=
> +							   BIT_ULL(access_bit);
> +				}
> +			}
> +			break;
> +		default:
> +			return 0;
> +		}
> +	}
> +	return handled_accesses;
> +}
> diff --git a/security/landlock/ruleset.h b/security/landlock/ruleset.h
> index 3066e5d7180c..f3cd890d0348 100644
> --- a/security/landlock/ruleset.h
> +++ b/security/landlock/ruleset.h
> @@ -195,7 +195,22 @@ static inline u32 landlock_get_fs_access_mask(
>   					const struct landlock_ruleset *ruleset,
>   					u16 mask_level)
>   {
> -	return ruleset->access_masks[mask_level];
> +	return (ruleset->access_masks[mask_level] & LANDLOCK_MASK_ACCESS_FS);
>   }
> 
> +access_mask_t get_handled_accesses(
> +		const struct landlock_ruleset *const domain,
> +		u16 rule_type, u16 num_access);
> +
> +bool unmask_layers(const struct landlock_rule *const rule,
> +		   const access_mask_t access_request,
> +		   layer_mask_t (*const layer_masks)[],
> +		   size_t masks_array_size);
> +
> +access_mask_t init_layer_masks(const struct landlock_ruleset *const domain,
> +			       const access_mask_t access_request,
> +			       layer_mask_t (*const layer_masks)[],
> +			       size_t masks_size,
> +			       u16 rule_type);

These declarations are useless.

> +
>   #endif /* _SECURITY_LANDLOCK_RULESET_H */
> --
> 2.25.1
> 

^ permalink raw reply

* Re: [PATCH linux-next] net: ath: fix minmax.cocci warnings
From: Kalle Valo @ 2022-05-16 17:04 UTC (permalink / raw)
  To: Guo Zhengkui
  Cc: Jiri Slaby, Nick Kossifidis, Luis Chamberlain, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Toke Høiland-Jørgensen,
	open list:ATHEROS ATH5K WIRELESS DRIVER,
	open list:NETWORKING DRIVERS, open list, zhengkui_guo
In-Reply-To: <20220516134057.72365-1-guozhengkui@vivo.com>

Guo Zhengkui <guozhengkui@vivo.com> writes:

> Fix the following coccicheck warnings:
>
> drivers/net/wireless/ath/ath5k/phy.c:3139:62-63: WARNING
> opportunity for min()
> drivers/net/wireless/ath/ath9k/dfs.c:249:28-30: WARNING
> opportunity for max()
>
> Signed-off-by: Guo Zhengkui <guozhengkui@vivo.com>
> ---
>  drivers/net/wireless/ath/ath5k/phy.c | 2 +-
>  drivers/net/wireless/ath/ath9k/dfs.c | 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)

Please split this into two patches, one for ath5k and one for ath9k.

-- 
https://patchwork.kernel.org/project/linux-wireless/list/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* Re: [PATCH net-next v3 0/2] net/smc: send and write inline optimization for smc
From: Karsten Graul @ 2022-05-16 16:58 UTC (permalink / raw)
  To: Guangguan Wang, davem, edumazet, kuba, pabeni, leon, tonylu
  Cc: linux-s390, netdev, linux-kernel
In-Reply-To: <20220516055137.51873-1-guangguan.wang@linux.alibaba.com>

On 16/05/2022 07:51, Guangguan Wang wrote:
> Send cdc msgs and write data inline if qp has sufficent inline
> space, helps latency reducing. 
> 
> In my test environment, which are 2 VMs running on the same
> physical host and whose NICs(ConnectX-4Lx) are working on
> SR-IOV mode, qperf shows 0.4us-1.3us improvement in latency.
> 
> Test command:
> server: smc_run taskset -c 1 qperf
> client: smc_run taskset -c 1 qperf <server ip> -oo \
> 		msg_size:1:2K:*2 -t 30 -vu tcp_lat
> 
> The results shown below:
> msgsize     before       after
> 1B          11.9 us      10.6 us (-1.3 us)
> 2B          11.7 us      10.7 us (-1.0 us)
> 4B          11.7 us      10.7 us (-1.0 us)
> 8B          11.6 us      10.6 us (-1.0 us)
> 16B         11.7 us      10.7 us (-1.0 us)
> 32B         11.7 us      10.6 us (-1.1 us)
> 64B         11.7 us      11.2 us (-0.5 us)
> 128B        11.6 us      11.2 us (-0.4 us)
> 256B        11.8 us      11.2 us (-0.6 us)
> 512B        11.8 us      11.3 us (-0.5 us)
> 1KB         11.9 us      11.5 us (-0.4 us)
> 2KB         12.1 us      11.5 us (-0.6 us)
> 
> Guangguan Wang (2):
>   net/smc: send cdc msg inline if qp has sufficient inline space
>   net/smc: rdma write inline if qp has sufficient inline space
> 
>  net/smc/smc_ib.c |  1 +
>  net/smc/smc_tx.c | 17 ++++++++++++-----
>  net/smc/smc_wr.c |  5 ++++-
>  3 files changed, 17 insertions(+), 6 deletions(-)
> 

I like that change, thank you!

For the series:

Acked-by: Karsten Graul <kgraul@linux.ibm.com>


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox